Parser
Posted on March 1, 2020
Tags: typetheory
1 The steps before parsing
- Input String -> Lexer/Tokenizer creates array of tokens -> Array of tokens gets parsed into AST
2+4 -> ['2','+','4'] -> AST
Think of a lexer/tokenizer as just splitting a string by spaces. A lexer does not care about things like matching parentheses or brackets; balancing ( and ) is the parser's job. The lexer is basically labelling which word is a keyword and which isn't.
The quick brown fox jumps over the lazy dog
==lexer/Tokenizer=> ["The","quick","brown","fox","jumps","over","the","lazy","dog"]
let x = 5
==lexer/Tokenizer=> [("let",KEYWORD),("x",VARIABLE),("=",OPERATOR),("5",LITERAL)]
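As a rough sketch of this split-then-label idea (KEYWORDS, naiveTokenize, and the token type names here are illustrative choices, not a fixed API):

// Minimal split-by-space tokenizer sketch; assumes tokens are space-separated
const KEYWORDS = new Set(['let', 'if', 'then']);
function naiveTokenize(src) {
  return src.split(/\s+/).filter(w => w.length > 0).map(w => ({
    value: w,
    type: KEYWORDS.has(w) ? 'KEYWORD'
        : /^[0-9]+$/.test(w) ? 'LITERAL'
        : /^[=+\-*\/]+$/.test(w) ? 'OPERATOR'
        : 'VARIABLE'
  }));
}
console.log(naiveTokenize('let x = 5'));
// [ { value: 'let', type: 'KEYWORD' }, { value: 'x', type: 'VARIABLE' },
//   { value: '=', type: 'OPERATOR' }, { value: '5', type: 'LITERAL' } ]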
Tokenizer implementation intuition: a tokenizer generally determines the token type from the first character of a token, even when reading the input character by character.
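A minimal sketch of that first-character dispatch, with token class names of my own choosing:

// Sketch: the first character alone usually decides the token class
function tokenClass(firstChar) {
  if (/[0-9]/.test(firstChar)) return 'NUMBER';
  if (/[a-zA-Z_]/.test(firstChar)) return 'IDENTIFIER';
  if ('+-*/=<>!'.includes(firstChar)) return 'OPERATOR';
  if ('()[]'.includes(firstChar)) return 'PAREN';
  if (firstChar === '"') return 'STRING';
  throw new Error('Unknown character: ' + firstChar);
}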
2 How to read grammar rules
S -> aSa
S -> e

The above grammar rules can generate the derivation below:

      S
    / | \
   a  S  a
    / | \
   a  S  a
      |
      e

The resulting generation is aaeaa (read the leaves left to right; if e denotes the empty string, the generated string is aaaa).
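To connect grammar reading to parsing: a recursive-descent recognizer can mirror the two rules directly, one function per nonterminal. This sketch treats e as a literal character, matching the derivation above:

// S -> aSa | e
// Returns the position after a successful parse of S, or null on failure.
function parseS(input, pos = 0) {
  if (input[pos] === 'e') return pos + 1;      // S -> e
  if (input[pos] === 'a') {                    // S -> aSa
    const mid = parseS(input, pos + 1);
    if (mid !== null && input[mid] === 'a') return mid + 1;
  }
  return null;
}
console.log(parseS('aaeaa') === 'aaeaa'.length);   // true: aaeaa is derivable from S
console.log(parseS('aaeaaa') === 'aaeaaa'.length); // false: trailing input left over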
3 Overview
// Define token types
const TokenType = {
  IF: 'IF',
  THEN: 'THEN',
  IDENTIFIER: 'IDENTIFIER',
  EOF: 'EOF'
};

// Token class
class Token {
  constructor(type, value) {
    this.type = type;
    this.value = value;
  }
}

// Lexer class
class Lexer {
  constructor(input) {
    this.input = input;
    this.position = 0;
  }

  getNextToken() {
    if (this.position >= this.input.length) {
      return new Token(TokenType.EOF, null);
    }
    const currentChar = this.input[this.position];
    // 'if' keyword: only when followed by '(' or ' '
    if (currentChar === 'i') {
      if (this.input[this.position + 1] === 'f' &&
          (this.input[this.position + 2] === '(' || this.input[this.position + 2] === ' ')) {
        this.position = this.position + 2;
        return new Token(TokenType.IF, 'if');
      }
    }
    // 'then' keyword
    if (currentChar === 't') {
      if (this.input.substring(this.position + 1, this.position + 4) === 'hen') {
        this.position = this.position + 4;
        return new Token(TokenType.THEN, 'then');
      }
    }
    // Identifier: read until whitespace or end of input
    if (/[a-zA-Z]/.test(currentChar)) {
      let identifier = '';
      while (this.input[this.position] !== ' ' && this.input[this.position] !== undefined) {
        identifier += this.input[this.position];
        this.position++;
      }
      return new Token(TokenType.IDENTIFIER, identifier);
    }
    // Ignore whitespace
    if (/\s/.test(currentChar)) {
      this.position++;
      return this.getNextToken();
    }
    throw new Error('Invalid character');
  }
}

// Parser class
class Parser {
  constructor(lexer) {
    this.lexer = lexer;
    this.currentToken = this.lexer.getNextToken();
  }

  // Consume the current token if it has the expected type, else fail
  match(expectedType) {
    if (this.currentToken.type === expectedType) {
      this.currentToken = this.lexer.getNextToken();
    } else {
      throw new Error(`Syntax error: Expected ${expectedType}, but got ${this.currentToken.type}`);
    }
  }

  // Grammar: IF IDENTIFIER THEN IDENTIFIER
  parse() {
    this.match(TokenType.IF);
    this.match(TokenType.IDENTIFIER);
    this.match(TokenType.THEN);
    this.match(TokenType.IDENTIFIER);
    this.match(TokenType.EOF);
    console.log('Parsing successful');
  }
}

// Test the parser
const input = 'if x then y';
const lexer = new Lexer(input);
const parser = new Parser(lexer);
parser.parse();
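To see the failure path, input that violates the grammar makes match throw:

// Negative test: 'then' is missing, so match(TokenType.THEN) fails
try {
  new Parser(new Lexer('if x y')).parse();
} catch (e) {
  console.log(e.message); // Syntax error: Expected THEN, but got IDENTIFIER
}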
4 Lisp example
- Tokenizer intuition (the Tokenizer class below follows this recipe):
  - read char by char
  - skip whitespace
  - the first character (and/or the second) determines the token type
  - keep reading char by char until the token is finished
// Convert a nested-array parse tree into a dict-based AST
function tree2dict(tree) {
  let dict = {};
  if (Array.isArray(tree)) {
    dict['type'] = 'Sexp';
    if (tree.length == 0) {
      dict['name'] = 'unit';
      return dict;
    } else {
      dict['children'] = [];
      let head = tree[0];
      if (Array.isArray(head)) {
        dict['name'] = '*';
        dict['children'].push(tree2dict(head));
      } else {
        dict['name'] = head;
      }
      let tail = tree.slice(1);
      for (let i = 0; i < tail.length; i++) {
        let IH = tree2dict(tail[i]);
        dict['children'].push(IH);
      }
      return dict;
    }
  } else {
    dict['type'] = 'atom';
    dict['name'] = tree;
    return dict;
  }
}
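For example, on a small hand-written tree (illustrative input, not produced by the code in this post):

console.log(tree2dict(['define', 'x', ['add', '1', '2']]))
// -> { type: 'Sexp',
//      children: [ { type: 'atom', name: 'x' },
//                  { type: 'Sexp',
//                    children: [ { type: 'atom', name: '1' },
//                                { type: 'atom', name: '2' } ],
//                    name: 'add' } ],
//      name: 'define' }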
// Input stream with position, line, and column tracking
class InputSt {
  constructor(input) {
    this.input = input;
    this.pos = 0;
    this.line = 1;
    this.col = 0;
  }
  // Consume and return one character, updating line/col
  next() {
    let ch = this.input.charAt(this.pos++);
    if (ch == '\n') {
      this.line++;
      this.col = 0;
    } else {
      this.col++;
    }
    return ch;
  }
  peek() {
    return this.input.charAt(this.pos);
  }
  peekn(n) {
    // WARN: does not catch out of bounds
    return this.input.charAt(this.pos + n);
  }
  eof() {
    return this.input.charAt(this.pos) == '';
  }
  croak(msg) {
    throw new Error(msg + ' (' + this.line + ':' + this.col + ')');
  }
}
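A quick sanity check of the stream primitives (a usage sketch, not part of the original driver):

// line/col tracking across a newline
const demoSt = new InputSt('ab\nc')
demoSt.next(); demoSt.next(); demoSt.next() // consume 'a', 'b', '\n'
console.log(demoSt.line, demoSt.col, demoSt.peek()) // 2 0 'c'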
// The Tokenizer checks the first character of the string, then process()es the
// entire token including that first char.
// This is awkward when the first character is uniquely different from the other
// characters (e.g. the opening quote of a string literal).
class Tokenizer {
  constructor(inputSt) {
    this.limCount = 0;
    this.inputSt = inputSt;
    this.partialSol = [];
  }
  // Check whether the upcoming input matches instring exactly
  windowChk(instring) {
    const len = instring.length;
    if (this.inputSt.pos + len <= this.inputSt.input.length) {
      return this.inputSt.input.slice(this.inputSt.pos, this.inputSt.pos + len) === instring;
    }
    return false;
  }
  OutOfBound(n) {
    return n >= this.inputSt.input.length;
  }
  satisfy(predicate) {
    // pure: test the current character without consuming it
    return !this.inputSt.eof() && predicate(this.inputSt.peek());
  }
  satisfy1(predicate) {
    // test the character one position ahead
    if (this.inputSt.pos + 1 >= this.inputSt.input.length) {
      return false;
    }
    return !this.inputSt.eof() && predicate(this.inputSt.peekn(1));
  }
  readWhile(predicate) {
    // impure: consume characters as long as predicate holds
    let complete = '';
    while (this.satisfy(predicate)) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  readWhileNextLine() {
    // consume to the end of the current line (used for ; comments);
    // the eof() check prevents looping when the last line has no newline
    const oldline = this.inputSt.line;
    let complete = '';
    while (!this.inputSt.eof() && this.inputSt.line == oldline) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  readSingle(predicate) {
    // impure: consume a single character if predicate holds
    let complete = '';
    if (this.satisfy(predicate)) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  // Character predicates
  condParen(inchar) {
    return inchar == '(' || inchar == ')' || inchar == '[' || inchar == ']';
  }
  condQuote(inchar) {
    return inchar == '`';
  }
  condQuote2(inchar) {
    return inchar == "'";
  }
  condComma(inchar) {
    return inchar == ',';
  }
  condComment(inchar) {
    return inchar == ';';
  }
  condTupleHash(inchar) {
    return inchar == '#';
  }
  condNum(inchar) {
    return /[0-9]/i.test(inchar);
  }
  condOp(inchar) {
    return '+-*/%=&|<>!'.indexOf(inchar) >= 0;
  }
  condNotWhitespace(inchar) {
    return !(/\s/.test(inchar));
  }
  condOp1(inchar1) {
    return /\s/.test(inchar1);
  }
  condId(inchar) {
    return /^[\\?a-zA-Z0-9_':-]*$/i.test(inchar);
  }
  condIdInside(inchar) {
    // '-' is placed last so it is literal: identifiers may contain '-' and '*'
    return /^[\\?a-zA-Z0-9_':*\\-]*$/i.test(inchar);
  }
  condStr(inchar) {
    return /["]/i.test(inchar);
  }
  readStr() {
    // consume a string literal including both quotes
    let complete = '';
    complete += this.inputSt.peek();
    this.inputSt.next(); // skip beginning quote "
    const tokpredicate = (mystr) => /[^"]/i.test(mystr);
    while (!this.inputSt.eof() && tokpredicate(this.inputSt.peek())) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    complete += this.inputSt.peek();
    this.inputSt.next(); // skip end quote "
    return complete;
  }
  // Read one token (or skip one whitespace char) and push it onto partialSol
  next() {
    if (this.inputSt.peek() == ' ' || this.inputSt.peek() == '\n') {
      this.inputSt.next();
      return null;
    }
    let tokenVal = null;
    if (this.satisfy(this.condNum)) {
      tokenVal = this.readWhile(this.condNum);
    } else if (this.satisfy(this.condComment)) {
      tokenVal = this.readWhileNextLine();
    } else if (this.satisfy(this.condQuote)) {
      tokenVal = this.readSingle(this.condQuote);
    } else if (this.satisfy(this.condQuote2) && this.satisfy1(this.condParen)) {
      tokenVal = this.readSingle(this.condQuote2);
    } else if (this.satisfy(this.condTupleHash) && this.satisfy1(this.condParen)) {
      tokenVal = this.readSingle(this.condTupleHash);
    } else if (this.satisfy(this.condComma)) {
      tokenVal = this.readSingle(this.condComma);
    } else if (this.satisfy(this.condParen)) {
      tokenVal = this.readSingle(this.condParen);
    } else if (this.satisfy(this.condOp)) {
      tokenVal = this.readWhile(this.condNotWhitespace);
    } else if (this.satisfy(this.condStr)) {
      tokenVal = this.readStr();
    } else if (this.satisfy(this.condId)) {
      tokenVal = this.readWhile(this.condIdInside);
    } else {
      console.log('uncaught: ', this.inputSt.peek());
    }
    this.partialSol.push(tokenVal);
  }
  // Tokenize the entire input, with a loop guard against getting stuck
  process() {
    while (!this.inputSt.eof()) {
      this.limCount += 1;
      if (this.limCount >= 3000) {
        console.log('the problem:', this.inputSt.peek(), this.inputSt.line, this.inputSt.col, this.inputSt.pos);
        throw new Error('Stack blown');
      }
      this.next();
    }
  }
  hint() {
    console.log(this.partialSol);
  }
}
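For example, tokenizing a small s-expression with the classes above should give:

const tz = new Tokenizer(new InputSt('(+ 1 (* 2 3))'))
tz.process()
console.log(tz.partialSol) // [ '(', '+', '1', '(', '*', '2', '3', ')', ')' ]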
// Build a nested-array tree from the flat token list
function mktree(tokens) {
  let partialSol = [];
  let head = tokens.shift();
  // skip a comment token
  if (head != undefined && head[0] == ';') {
    head = tokens.shift();
  }
  if (head == ',') {
    let IH = mktree(tokens);
    partialSol.push(IH);
    partialSol.unshift('comma');
    return partialSol;
  } else if (head == '`') {
    let IH = mktree(tokens);
    // IH will always be a list, since in Lisp quote tokens always precede list tokens
    partialSol.push(...IH);
    partialSol.unshift('quote');
    return partialSol;
  } else if (head == `'`) {
    let IH = mktree(tokens);
    partialSol.push(...IH);
    partialSol = ['quote', partialSol];
    return partialSol;
  } else if (head == `#`) {
    let IH = mktree(tokens);
    partialSol.push(...IH);
    partialSol.unshift('tuple');
    return partialSol;
  } else if (head == '(') {
    while (tokens[0] != ')') {
      let IH = mktree(tokens);
      partialSol.push(IH);
    }
    tokens.shift(); // drop the closing ')'
    return partialSol;
  } else if (head == '[') {
    while (tokens[0] != ']') {
      let IH = mktree(tokens);
      partialSol.push(IH);
    }
    tokens.shift(); // drop the closing ']'
    return partialSol;
  } else {
    return head; // atom
  }
}
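Note that mktree consumes its token array destructively via shift(), so pass a copy if the tokens are still needed. For example:

console.log(mktree(['(', '+', '1', '(', '*', '2', '3', ')', ')']))
// -> [ '+', '1', [ '*', '2', '3' ] ]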
// End-to-end test: string -> tokens -> tree -> dict AST
let thestr = `
( '[#(port 8080)])
`
let z = new InputSt(thestr)
let z1 = new Tokenizer(z)
z1.process()
let t1 = z1.partialSol
console.log("tokenized", t1)
let b1 = mktree(t1)
console.log("the tree", b1)
let b2 = tree2dict(b1)
console.log("tree2dict", b2)
export { tree2dict, InputSt, Tokenizer, mktree }
4.1 More advanced tokenizer and parser
- Include the token's start position and end position with the token
- By doing so you can build an AST with an easy mapping from each AST node back to its position in the original input string (see the sketch below)
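A minimal sketch of position-carrying tokens, assuming the InputSt and Tokenizer above; PosToken and readWhilePos are illustrative names, not part of the code in this post:

// Token that remembers where it came from in the source string
class PosToken {
  constructor(type, value, start, end) {
    this.type = type    // token class
    this.value = value  // the lexeme itself
    this.start = start  // offset of the first character
    this.end = end      // offset one past the last character
  }
}

// Wrapping readWhile to record the span it consumed
function readWhilePos(tokenizer, predicate, type) {
  const start = tokenizer.inputSt.pos
  const value = tokenizer.readWhile(predicate)
  return new PosToken(type, value, start, tokenizer.inputSt.pos)
}
// An AST node can then store the spans of the tokens it was built from,
// mapping each node back to the original input string.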