Parser
Posted on March 1, 2020
Tags: typetheory
1 The steps before parsing
- Input string -> lexer/tokenizer produces an array of tokens -> the array of tokens gets parsed into an AST
- `2+4` -> `['2','+','4']` -> AST
Think of a lexer/tokenizer as, to a first approximation, just splitting a string by spaces.
A lexer does not care about things like matching parentheses or brackets `(`, `)`.
The lexer basically labels which word is a keyword and which isn't.
* `The quick brown fox jumps over the lazy dog` ==lexer/tokenizer=> ["The","quick","brown","fox","jumps","over","the","lazy","dog"]
* `let x = 5` ==lexer/tokenizer=> [("let",KEYWORD),("x",VARIABLE),("=",OPERATOR),("5",LITERAL)]
Tokenizer implementation intuition: a tokenizer generally determines the token type from the first character of a token, even when reading the input char by char.
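For example, a minimal sketch (not from the original code) of first-character dispatch might look like this:

// Hypothetical sketch: decide the token type from the first character alone.
function tokenKind(ch) {
  if (/[0-9]/.test(ch)) return 'NUMBER';         // digits start numbers
  if (/[a-zA-Z_]/.test(ch)) return 'IDENTIFIER'; // letters start identifiers/keywords
  if ('+-*/%='.includes(ch)) return 'OPERATOR';
  if ('()[]'.includes(ch)) return 'PAREN';
  return 'UNKNOWN';
}
console.log(tokenKind('5')); // NUMBER
console.log(tokenKind('l')); // IDENTIFIER ('let' starts here)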
2 How to read grammar rules
S -> aSa
S -> e

The above grammar rules can generate the derivation below:

        S
      / | \
     a  S  a
       / | \
      a  S  a
         |
         e

The resulting generated string is aaeaa.
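To connect the grammar to code, here is a minimal sketch (not from the original post) of a recursive-descent recognizer that mirrors the two productions directly:

// Recognizer for S -> aSa | e. Returns the position after a valid S, or null.
function parseS(input, pos = 0) {
  if (input[pos] === 'a') {                 // try S -> aSa
    const after = parseS(input, pos + 1);   // recurse into the inner S
    if (after !== null && input[after] === 'a') return after + 1;
    return null;
  }
  if (input[pos] === 'e') return pos + 1;   // S -> e
  return null;                              // neither production matches
}
console.log(parseS('aaeaa') === 5); // true: the derivation shown above
console.log(parseS('aae') === 3);   // false: the trailing 'a's are missing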
3 Overview
// Define token types
const TokenType = {
  IF: 'IF',
  THEN: 'THEN',
  IDENTIFIER: 'IDENTIFIER',
  EOF: 'EOF'
};
// Token class
class Token {
  constructor(type, value) {
    this.type = type;
    this.value = value;
  }
}
// Lexer class
class Lexer {
  constructor(input) {
    this.input = input;
    this.position = 0;
  }
  getNextToken() {
    if (this.position >= this.input.length) {
      return new Token(TokenType.EOF, null);
    }
    const currentChar = this.input[this.position];
    // 'if' only counts as a keyword when followed by '(' or a space
    if (currentChar === 'i') {
      if (this.input[this.position + 1] === 'f' &&
          (this.input[this.position + 2] === '(' || this.input[this.position + 2] === ' ')) {
        this.position = this.position + 2;
        return new Token(TokenType.IF, 'if');
      }
    }
    if (currentChar === 't') {
      if (this.input.substring(this.position + 1, this.position + 4) === 'hen') {
        this.position = this.position + 4;
        return new Token(TokenType.THEN, 'then');
      }
    }
    // Not a keyword: fall through to the identifier rule
    if (/[a-zA-Z]/.test(currentChar)) {
      let identifier = '';
      while (this.input[this.position] !== ' ' && this.input[this.position] !== undefined) {
        identifier += this.input[this.position];
        this.position++;
      }
      return new Token(TokenType.IDENTIFIER, identifier);
    }
    // Ignore whitespace
    if (/\s/.test(currentChar)) {
      this.position++;
      return this.getNextToken();
    }
    throw new Error('Invalid character');
  }
}
// Parser class
class Parser {
  constructor(lexer) {
    this.lexer = lexer;
    this.currentToken = this.lexer.getNextToken();
  }
  match(expectedType) {
    if (this.currentToken.type === expectedType) {
      this.currentToken = this.lexer.getNextToken();
    } else {
      throw new Error(`Syntax error: Expected ${expectedType}, but got ${this.currentToken.type}`);
    }
  }
  parse() {
    // Grammar: 'if' IDENTIFIER 'then' IDENTIFIER EOF
    this.match(TokenType.IF);
    this.match(TokenType.IDENTIFIER);
    this.match(TokenType.THEN);
    this.match(TokenType.IDENTIFIER);
    this.match(TokenType.EOF);
    console.log('Parsing successful');
  }
}
// Test the parser
const input = 'if x then y';
const lexer = new Lexer(input);
const parser = new Parser(lexer);
parser.parse();

4 Lisp example
- Tokenizer intuition:
  - read char by char
  - skip whitespace
  - the first character (and/or the second) determines the token type
  - read char by char until the token is finished
// Convert a nested-array parse tree into a dictionary-based AST.
// Arrays become 'Sexp' nodes; everything else becomes an 'atom' leaf.
function tree2dict(tree) {
  let dict = {};
  if (Array.isArray(tree)) {
    dict['type'] = 'Sexp';
    if (tree.length == 0) {
      // The empty list () is the unit value
      dict['name'] = 'unit';
      return dict;
    } else {
      dict['children'] = [];
      let head = tree[0];
      if (Array.isArray(head)) {
        // The head is itself a list, e.g. ((f x) y): name the node '*'
        dict['name'] = '*';
        dict['children'].push(tree2dict(head));
      } else {
        dict['name'] = head;
      }
      let tail = tree.slice(1);
      for (let i = 0; i < tail.length; i++) {
        let IH = tree2dict(tail[i]);
        dict['children'].push(IH);
      }
      return dict;
    }
  } else {
    dict['type'] = 'atom';
    dict['name'] = tree;
    return dict;
  }
}
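As a quick sanity check, here is what tree2dict produces for a small tree:

console.log(JSON.stringify(tree2dict(['define', 'x', '5'])));
// {"type":"Sexp","children":[{"type":"atom","name":"x"},
//  {"type":"atom","name":"5"}],"name":"define"}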
class InputSt {
  constructor(input) {
    this.input = input;
    this.pos = 0;
    this.line = 1;
    this.col = 0;
  }
  next() {
    let ch = this.input.charAt(this.pos++);
    if (ch == '\n') {
      this.line++;
      this.col = 0;
    } else {
      this.col++;
    }
    return ch;
  }
  peek() {
    return this.input.charAt(this.pos);
  }
  peekn(n) {
    // WARN: does not catch out-of-bounds positions
    return this.input.charAt(this.pos + n);
  }
  eof() {
    return this.input.charAt(this.pos) == '';
  }
  croak(msg) {
    throw new Error(msg + ' (' + this.line + ':' + this.col + ')');
  }
}
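A small usage example of InputSt, just to illustrate the line/column bookkeeping:

const st = new InputSt('ab\ncd');
st.next(); // 'a'  -> line 1, col 1
st.next(); // 'b'  -> line 1, col 2
st.next(); // '\n' -> line 2, col 0
console.log(st.peek(), st.line, st.col); // 'c' 2 0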
// The tokenizer checks the first character of the string, then consumes the
// entire token including that first char.
// This is awkward when the first character is uniquely different from the other characters.
class Tokenizer {
  constructor(inputSt) {
    this.limCount = 0;    // guard counter against infinite loops
    this.inputSt = inputSt;
    this.partialSol = []; // the token list built so far
  }
  // Check whether the upcoming input starts with instring
  windowChk(instring) {
    const len = instring.length;
    if (this.inputSt.pos + len <= this.inputSt.input.length) {
      return this.inputSt.input.slice(this.inputSt.pos, this.inputSt.pos + len) === instring;
    }
    return false;
  }
  OutOfBound(n) {
    return n >= this.inputSt.input.length;
  }
  satisfy(predicate) {
    // pure: test the current character without consuming it
    return !this.inputSt.eof() && predicate(this.inputSt.peek());
  }
  satisfy1(predicate) {
    // pure: test the character one past the current one
    if (this.inputSt.pos + 1 >= this.inputSt.input.length) {
      return false;
    }
    return !this.inputSt.eof() && predicate(this.inputSt.peekn(1));
  }
  readWhile(predicate) {
    // impure: consume characters while the predicate holds
    let complete = '';
    while (this.satisfy(predicate)) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  readWhileNextLine() {
    // impure: consume the rest of the current line (used for ; comments)
    const oldline = this.inputSt.line;
    let complete = '';
    while (!this.inputSt.eof() && this.inputSt.line == oldline) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  readSingle(predicate) {
    // impure: consume one character if the predicate holds
    let complete = '';
    if (this.satisfy(predicate)) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  condParen(inchar) {
    return inchar == '(' || inchar == ')' || inchar == '[' || inchar == ']';
  }
  condQuote(inchar) {
    return inchar == '`';
  }
  condQuote2(inchar) {
    return inchar == `'`;
  }
  condComma(inchar) {
    return inchar == ',';
  }
  condComment(inchar) {
    return inchar == ';';
  }
  condTupleHash(inchar) {
    return inchar == '#';
  }
  condNum(inchar) {
    return /[0-9]/i.test(inchar);
  }
  condOp(inchar) {
    return '+-*/%=&|<>!'.indexOf(inchar) >= 0;
  }
  condNotWhitespace(inchar) {
    return !(/\s/.test(inchar));
  }
  condOp1(inchar1) {
    // is the character whitespace?
    return /\s/.test(inchar1);
  }
  condId(inchar) {
    // characters an identifier may start with
    return /^[\\?a-zA-Z0-9_':-]*$/i.test(inchar);
  }
  condIdInside(inchar) {
    // characters an identifier may contain
    return /^[\\?a-zA-Z0-9_':*-]*$/i.test(inchar);
  }
  condStr(inchar) {
    return /["]/.test(inchar);
  }
  readStr() {
    let complete = '';
    complete += this.inputSt.peek();
    this.inputSt.next(); // consume the opening quote "
    const tokpredicate = (mystr) => /[^"]/.test(mystr);
    while (!this.inputSt.eof() && tokpredicate(this.inputSt.peek())) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    complete += this.inputSt.peek();
    this.inputSt.next(); // consume the closing quote "
    return complete;
  }
  next() {
    // skip a single whitespace character
    if (/\s/.test(this.inputSt.peek())) {
      this.inputSt.next();
      return null;
    }
    let tokenVal = null;
    if (this.satisfy(this.condNum)) {
      tokenVal = this.readWhile(this.condNum);
    } else if (this.satisfy(this.condComment)) {
      tokenVal = this.readWhileNextLine();
    } else if (this.satisfy(this.condQuote)) {
      tokenVal = this.readSingle(this.condQuote);
    } else if (this.satisfy(this.condQuote2) && this.satisfy1(this.condParen)) {
      // ' only acts as quote when directly followed by a paren
      tokenVal = this.readSingle(this.condQuote2);
    } else if (this.satisfy(this.condTupleHash) && this.satisfy1(this.condParen)) {
      // #( opens a tuple
      tokenVal = this.readSingle(this.condTupleHash);
    } else if (this.satisfy(this.condComma)) {
      tokenVal = this.readSingle(this.condComma);
    } else if (this.satisfy(this.condParen)) {
      tokenVal = this.readSingle(this.condParen);
    } else if (this.satisfy(this.condOp)) {
      // operators are read until the next whitespace
      tokenVal = this.readWhile(this.condNotWhitespace);
    } else if (this.satisfy(this.condStr)) {
      tokenVal = this.readStr();
    } else if (this.satisfy(this.condId)) {
      tokenVal = this.readWhile(this.condIdInside);
    } else {
      this.inputSt.croak('uncaught character: ' + this.inputSt.peek());
    }
    this.partialSol.push(tokenVal);
  }
  process() {
    while (!this.inputSt.eof()) {
      this.limCount += 1;
      if (this.limCount >= 3000) {
        console.log('the problem:', this.inputSt.peek(), this.inputSt.line, this.inputSt.col, this.inputSt.pos);
        throw new Error('Stack blown');
      }
      this.next();
    }
  }
  hint() {
    console.log(this.partialSol);
  }
}
// Build a nested-array tree from a flat token list.
// Note: mktree consumes (mutates) the tokens array it is given.
function mktree(tokens) {
  let partialSol = [];
  let head = tokens.shift();
  // Skip a comment token (they start with ';')
  if (head != undefined && head[0] == ';') {
    head = tokens.shift();
  }
  if (head == ',') {
    let IH = mktree(tokens);
    partialSol.push(IH);
    partialSol.unshift('comma');
    return partialSol;
  } else if (head == '`') {
    let IH = mktree(tokens);
    // IH is assumed to be a list, since in Lisp quote tokens always precede list tokens
    partialSol.push(...IH);
    partialSol.unshift('quote');
    return partialSol;
  } else if (head == `'`) {
    let IH = mktree(tokens);
    partialSol.push(...IH);
    partialSol = ['quote', partialSol];
    return partialSol;
  } else if (head == `#`) {
    let IH = mktree(tokens);
    partialSol.push(...IH);
    partialSol.unshift('tuple');
    return partialSol;
  } else if (head == '(') {
    while (tokens[0] != ')') {
      let IH = mktree(tokens);
      partialSol.push(IH);
    }
    tokens.shift(); // consume the closing ')'
    return partialSol;
  } else if (head == '[') {
    while (tokens[0] != ']') {
      let IH = mktree(tokens);
      partialSol.push(IH);
    }
    tokens.shift(); // consume the closing ']'
    return partialSol;
  } else {
    // An atom: return the token itself as a leaf
    return head;
  }
}
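For instance, mktree nests a flat token array by its parentheses (remember that it consumes the array it is given):

console.log(mktree(['(', '+', '1', '2', ')']));           // [ '+', '1', '2' ]
console.log(mktree(['(', 'f', '(', 'g', 'x', ')', ')'])); // [ 'f', [ 'g', 'x' ] ]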
// Test the whole pipeline: string -> tokens -> tree -> dict
let thestr = `
( '[#(port 8080)])
`
let z = new InputSt(thestr)
let z1 = new Tokenizer(z)
z1.process()
let t1 = z1.partialSol
console.log("tokenized", t1)
let b1 = mktree(t1)
console.log("the tree", b1)
let b2 = tree2dict(b1)
console.log("tree2dict", b2)
export { tree2dict, InputSt, Tokenizer, mktree }

4.1 More advanced tokenizer and parser
- Include the token's start position and end position with the token.
- By doing so you can build an AST with an easy-to-build mapping from AST nodes back to positions in the original input string.
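A hypothetical sketch of such a position-carrying token (the names here are illustrative, not from the code above):

// Hypothetical token shape carrying source positions.
class SpannedToken {
  constructor(type, value, start, end) {
    this.type = type;   // e.g. 'IDENTIFIER'
    this.value = value; // the matched text
    this.start = start; // offset of the first character in the input
    this.end = end;     // offset one past the last character
  }
}
// A tokenizer would record inputSt.pos before and after reading each token, e.g.:
//   const start = this.inputSt.pos;
//   const value = this.readWhile(this.condIdInside);
//   return new SpannedToken('IDENTIFIER', value, start, this.inputSt.pos);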