Parser
Posted on March 1, 2020
Tags: typetheory
1 The steps before parsing
- Input String -> Lexer/Tokenizer creates array of tokens -> Array of tokens gets parsed into AST
2+4 -> ['2','+','4'] -> AST
Think of a lexer/tokenizer as just splitting a string by spaces. A lexer does not care about things like matching parentheses or brackets; balancing ( and ) is the parser's job. The lexer is basically labelling which word is a keyword and which isn't.
The quick brown fox jumps over the lazy dog
==lexer/Tokenizer=> ["The","quick","brown","fox","jumps","over","the","lazy","dog"]
let x = 5
==lexer/Tokenizer=> [("let",KEYWORD),("x",VARIABLE),("=",OPERATOR),("5",LITERAL)]
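As a rough sketch of this split-then-label idea (KEYWORDS, naiveTokenize, and the token type names here are illustrative choices, not a fixed API):

// Minimal split-by-space tokenizer sketch; assumes tokens are space-separated
const KEYWORDS = new Set(['let', 'if', 'then']);
function naiveTokenize(src) {
  return src.split(/\s+/).filter(w => w.length > 0).map(w => ({
    value: w,
    type: KEYWORDS.has(w) ? 'KEYWORD'
        : /^[0-9]+$/.test(w) ? 'LITERAL'
        : /^[=+\-*\/]+$/.test(w) ? 'OPERATOR'
        : 'VARIABLE'
  }));
}
console.log(naiveTokenize('let x = 5'));
// [ { value: 'let', type: 'KEYWORD' }, { value: 'x', type: 'VARIABLE' },
//   { value: '=', type: 'OPERATOR' }, { value: '5', type: 'LITERAL' } ]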
Tokenizer implementation intuition: a tokenizer generally determines the token type from the first character of a token, even when reading the input character by character.
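A minimal sketch of that first-character dispatch, with token class names of my own choosing:

// Sketch: the first character alone usually decides the token class
function tokenClass(firstChar) {
  if (/[0-9]/.test(firstChar)) return 'NUMBER';
  if (/[a-zA-Z_]/.test(firstChar)) return 'IDENTIFIER';
  if ('+-*/=<>!'.includes(firstChar)) return 'OPERATOR';
  if ('()[]'.includes(firstChar)) return 'PAREN';
  if (firstChar === '"') return 'STRING';
  throw new Error('Unknown character: ' + firstChar);
}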
2 How to read grammar rules
S -> aSa
S -> e

The above grammar rules can generate the derivation below:

      S
    / | \
   a  S  a
    / | \
   a  S  a
      |
      e

The resulting generation is aaeaa (read the leaves left to right; if e denotes the empty string, the generated string is aaaa).
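To connect grammar reading to parsing: a recursive-descent recognizer can mirror the two rules directly, one function per nonterminal. This sketch treats e as a literal character, matching the derivation above:

// S -> aSa | e
// Returns the position after a successful parse of S, or null on failure.
function parseS(input, pos = 0) {
  if (input[pos] === 'e') return pos + 1;      // S -> e
  if (input[pos] === 'a') {                    // S -> aSa
    const mid = parseS(input, pos + 1);
    if (mid !== null && input[mid] === 'a') return mid + 1;
  }
  return null;
}
console.log(parseS('aaeaa') === 'aaeaa'.length);   // true: aaeaa is derivable from S
console.log(parseS('aaeaaa') === 'aaeaaa'.length); // false: trailing input left over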
3 Overview
// Define token types
const TokenType = {
  IF: 'IF',
  THEN: 'THEN',
  IDENTIFIER: 'IDENTIFIER',
  EOF: 'EOF'
};

// Token class
class Token {
  constructor(type, value) {
    this.type = type;
    this.value = value;
  }
}

// Lexer class
class Lexer {
  constructor(input) {
    this.input = input;
    this.position = 0;
  }

  getNextToken() {
    if (this.position >= this.input.length) {
      return new Token(TokenType.EOF, null);
    }
    const currentChar = this.input[this.position];
    // 'if' keyword: only when followed by '(' or ' '
    if (currentChar === 'i') {
      if (this.input[this.position + 1] === 'f' &&
          (this.input[this.position + 2] === '(' || this.input[this.position + 2] === ' ')) {
        this.position = this.position + 2;
        return new Token(TokenType.IF, 'if');
      }
    }
    // 'then' keyword
    if (currentChar === 't') {
      if (this.input.substring(this.position + 1, this.position + 4) === 'hen') {
        this.position = this.position + 4;
        return new Token(TokenType.THEN, 'then');
      }
    }
    // Identifier: read until whitespace or end of input
    if (/[a-zA-Z]/.test(currentChar)) {
      let identifier = '';
      while (this.input[this.position] !== ' ' && this.input[this.position] !== undefined) {
        identifier += this.input[this.position];
        this.position++;
      }
      return new Token(TokenType.IDENTIFIER, identifier);
    }
    // Ignore whitespace
    if (/\s/.test(currentChar)) {
      this.position++;
      return this.getNextToken();
    }
    throw new Error('Invalid character');
  }
}

// Parser class
class Parser {
  constructor(lexer) {
    this.lexer = lexer;
    this.currentToken = this.lexer.getNextToken();
  }

  // Consume the current token if it has the expected type, else fail
  match(expectedType) {
    if (this.currentToken.type === expectedType) {
      this.currentToken = this.lexer.getNextToken();
    } else {
      throw new Error(`Syntax error: Expected ${expectedType}, but got ${this.currentToken.type}`);
    }
  }

  // Grammar: IF IDENTIFIER THEN IDENTIFIER
  parse() {
    this.match(TokenType.IF);
    this.match(TokenType.IDENTIFIER);
    this.match(TokenType.THEN);
    this.match(TokenType.IDENTIFIER);
    this.match(TokenType.EOF);
    console.log('Parsing successful');
  }
}

// Test the parser
const input = 'if x then y';
const lexer = new Lexer(input);
const parser = new Parser(lexer);
parser.parse();
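To see the failure path, input that violates the grammar makes match throw:

// Negative test: 'then' is missing, so match(TokenType.THEN) fails
try {
  new Parser(new Lexer('if x y')).parse();
} catch (e) {
  console.log(e.message); // Syntax error: Expected THEN, but got IDENTIFIER
}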
4 Lisp example
- Tokenizer intuition (the Tokenizer class below follows this recipe):
  - read char by char
  - skip whitespace
  - the first character (and/or the second) determines the token type
  - keep reading char by char until the token is finished
// Convert a nested-array parse tree into a dict-based AST
function tree2dict(tree) {
  let dict = {};
  if (Array.isArray(tree)) {
    dict['type'] = 'Sexp';
    if (tree.length == 0) {
      dict['name'] = 'unit';
      return dict;
    } else {
      dict['children'] = [];
      let head = tree[0];
      if (Array.isArray(head)) {
        dict['name'] = '*';
        dict['children'].push(tree2dict(head));
      } else {
        dict['name'] = head;
      }
      let tail = tree.slice(1);
      for (let i = 0; i < tail.length; i++) {
        let IH = tree2dict(tail[i]);
        dict['children'].push(IH);
      }
      return dict;
    }
  } else {
    dict['type'] = 'atom';
    dict['name'] = tree;
    return dict;
  }
}
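For example, on a small hand-written tree (illustrative input, not produced by the code in this post):

console.log(tree2dict(['define', 'x', ['add', '1', '2']]))
// -> { type: 'Sexp',
//      children: [ { type: 'atom', name: 'x' },
//                  { type: 'Sexp',
//                    children: [ { type: 'atom', name: '1' },
//                                { type: 'atom', name: '2' } ],
//                    name: 'add' } ],
//      name: 'define' }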
// Input stream with position, line, and column tracking
class InputSt {
  constructor(input) {
    this.input = input;
    this.pos = 0;
    this.line = 1;
    this.col = 0;
  }
  // Consume and return one character, updating line/col
  next() {
    let ch = this.input.charAt(this.pos++);
    if (ch == '\n') {
      this.line++;
      this.col = 0;
    } else {
      this.col++;
    }
    return ch;
  }
  peek() {
    return this.input.charAt(this.pos);
  }
  peekn(n) {
    // WARN: does not catch out of bounds
    return this.input.charAt(this.pos + n);
  }
  eof() {
    return this.input.charAt(this.pos) == '';
  }
  croak(msg) {
    throw new Error(msg + ' (' + this.line + ':' + this.col + ')');
  }
}
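A quick sanity check of the stream primitives (a usage sketch, not part of the original driver):

// line/col tracking across a newline
const demoSt = new InputSt('ab\nc')
demoSt.next(); demoSt.next(); demoSt.next() // consume 'a', 'b', '\n'
console.log(demoSt.line, demoSt.col, demoSt.peek()) // 2 0 'c'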
// The Tokenizer checks the first character of the string, then process()es the
// entire token including that first char.
// This is awkward when the first character is uniquely different from the other
// characters (e.g. the opening quote of a string literal).
class Tokenizer {
  constructor(inputSt) {
    this.limCount = 0;
    this.inputSt = inputSt;
    this.partialSol = [];
  }
  // Check whether the upcoming input matches instring exactly
  windowChk(instring) {
    const len = instring.length;
    if (this.inputSt.pos + len <= this.inputSt.input.length) {
      return this.inputSt.input.slice(this.inputSt.pos, this.inputSt.pos + len) === instring;
    }
    return false;
  }
  OutOfBound(n) {
    return n >= this.inputSt.input.length;
  }
  satisfy(predicate) {
    // pure: test the current character without consuming it
    return !this.inputSt.eof() && predicate(this.inputSt.peek());
  }
  satisfy1(predicate) {
    // test the character one position ahead
    if (this.inputSt.pos + 1 >= this.inputSt.input.length) {
      return false;
    }
    return !this.inputSt.eof() && predicate(this.inputSt.peekn(1));
  }
  readWhile(predicate) {
    // impure: consume characters as long as predicate holds
    let complete = '';
    while (this.satisfy(predicate)) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  readWhileNextLine() {
    // consume to the end of the current line (used for ; comments);
    // the eof() check prevents looping when the last line has no newline
    const oldline = this.inputSt.line;
    let complete = '';
    while (!this.inputSt.eof() && this.inputSt.line == oldline) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  readSingle(predicate) {
    // impure: consume a single character if predicate holds
    let complete = '';
    if (this.satisfy(predicate)) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    return complete;
  }
  // Character predicates
  condParen(inchar) {
    return inchar == '(' || inchar == ')' || inchar == '[' || inchar == ']';
  }
  condQuote(inchar) {
    return inchar == '`';
  }
  condQuote2(inchar) {
    return inchar == "'";
  }
  condComma(inchar) {
    return inchar == ',';
  }
  condComment(inchar) {
    return inchar == ';';
  }
  condTupleHash(inchar) {
    return inchar == '#';
  }
  condNum(inchar) {
    return /[0-9]/i.test(inchar);
  }
  condOp(inchar) {
    return '+-*/%=&|<>!'.indexOf(inchar) >= 0;
  }
  condNotWhitespace(inchar) {
    return !(/\s/.test(inchar));
  }
  condOp1(inchar1) {
    return /\s/.test(inchar1);
  }
  condId(inchar) {
    return /^[\\?a-zA-Z0-9_':-]*$/i.test(inchar);
  }
  condIdInside(inchar) {
    // '-' is placed last so it is literal: identifiers may contain '-' and '*'
    return /^[\\?a-zA-Z0-9_':*\\-]*$/i.test(inchar);
  }
  condStr(inchar) {
    return /["]/i.test(inchar);
  }
  readStr() {
    // consume a string literal including both quotes
    let complete = '';
    complete += this.inputSt.peek();
    this.inputSt.next(); // skip beginning quote "
    const tokpredicate = (mystr) => /[^"]/i.test(mystr);
    while (!this.inputSt.eof() && tokpredicate(this.inputSt.peek())) {
      complete = complete + this.inputSt.peek();
      this.inputSt.next();
    }
    complete += this.inputSt.peek();
    this.inputSt.next(); // skip end quote "
    return complete;
  }
  // Read one token (or skip one whitespace char) and push it onto partialSol
  next() {
    if (this.inputSt.peek() == ' ' || this.inputSt.peek() == '\n') {
      this.inputSt.next();
      return null;
    }
    let tokenVal = null;
    if (this.satisfy(this.condNum)) {
      tokenVal = this.readWhile(this.condNum);
    } else if (this.satisfy(this.condComment)) {
      tokenVal = this.readWhileNextLine();
    } else if (this.satisfy(this.condQuote)) {
      tokenVal = this.readSingle(this.condQuote);
    } else if (this.satisfy(this.condQuote2) && this.satisfy1(this.condParen)) {
      tokenVal = this.readSingle(this.condQuote2);
    } else if (this.satisfy(this.condTupleHash) && this.satisfy1(this.condParen)) {
      tokenVal = this.readSingle(this.condTupleHash);
    } else if (this.satisfy(this.condComma)) {
      tokenVal = this.readSingle(this.condComma);
    } else if (this.satisfy(this.condParen)) {
      tokenVal = this.readSingle(this.condParen);
    } else if (this.satisfy(this.condOp)) {
      tokenVal = this.readWhile(this.condNotWhitespace);
    } else if (this.satisfy(this.condStr)) {
      tokenVal = this.readStr();
    } else if (this.satisfy(this.condId)) {
      tokenVal = this.readWhile(this.condIdInside);
    } else {
      console.log('uncaught: ', this.inputSt.peek());
    }
    this.partialSol.push(tokenVal);
  }
  // Tokenize the entire input, with a loop guard against getting stuck
  process() {
    while (!this.inputSt.eof()) {
      this.limCount += 1;
      if (this.limCount >= 3000) {
        console.log('the problem:', this.inputSt.peek(), this.inputSt.line, this.inputSt.col, this.inputSt.pos);
        throw new Error('Stack blown');
      }
      this.next();
    }
  }
  hint() {
    console.log(this.partialSol);
  }
}
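For example, tokenizing a small s-expression with the classes above should give:

const tz = new Tokenizer(new InputSt('(+ 1 (* 2 3))'))
tz.process()
console.log(tz.partialSol) // [ '(', '+', '1', '(', '*', '2', '3', ')', ')' ]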
// Build a nested-array tree from the flat token list
function mktree(tokens) {
  let partialSol = [];
  let head = tokens.shift();
  // skip a comment token
  if (head != undefined && head[0] == ';') {
    head = tokens.shift();
  }
  if (head == ',') {
    let IH = mktree(tokens);
    partialSol.push(IH);
    partialSol.unshift('comma');
    return partialSol;
  } else if (head == '`') {
    let IH = mktree(tokens);
    // IH will always be a list, since in Lisp quote tokens always precede list tokens
    partialSol.push(...IH);
    partialSol.unshift('quote');
    return partialSol;
  } else if (head == `'`) {
    let IH = mktree(tokens);
    partialSol.push(...IH);
    partialSol = ['quote', partialSol];
    return partialSol;
  } else if (head == `#`) {
    let IH = mktree(tokens);
    partialSol.push(...IH);
    partialSol.unshift('tuple');
    return partialSol;
  } else if (head == '(') {
    while (tokens[0] != ')') {
      let IH = mktree(tokens);
      partialSol.push(IH);
    }
    tokens.shift(); // drop the closing ')'
    return partialSol;
  } else if (head == '[') {
    while (tokens[0] != ']') {
      let IH = mktree(tokens);
      partialSol.push(IH);
    }
    tokens.shift(); // drop the closing ']'
    return partialSol;
  } else {
    return head; // atom
  }
}
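Note that mktree consumes its token array destructively via shift(), so pass a copy if the tokens are still needed. For example:

console.log(mktree(['(', '+', '1', '(', '*', '2', '3', ')', ')']))
// -> [ '+', '1', [ '*', '2', '3' ] ]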
// End-to-end test: string -> tokens -> tree -> dict AST
let thestr = `
( '[#(port 8080)])
`
let z = new InputSt(thestr)
let z1 = new Tokenizer(z)
z1.process()
let t1 = z1.partialSol
console.log("tokenized", t1)
let b1 = mktree(t1)
console.log("the tree", b1)
let b2 = tree2dict(b1)
console.log("tree2dict", b2)
export { tree2dict, InputSt, Tokenizer, mktree }
4.1 More advanced tokenizer and parser
- Include the token's start position and end position with the token
- By doing so you can build an AST with an easy mapping from each AST node back to its position in the original input string (see the sketch below)
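A minimal sketch of position-carrying tokens, assuming the InputSt and Tokenizer above; PosToken and readWhilePos are illustrative names, not part of the code in this post:

// Token that remembers where it came from in the source string
class PosToken {
  constructor(type, value, start, end) {
    this.type = type    // token class
    this.value = value  // the lexeme itself
    this.start = start  // offset of the first character
    this.end = end      // offset one past the last character
  }
}

// Wrapping readWhile to record the span it consumed
function readWhilePos(tokenizer, predicate, type) {
  const start = tokenizer.inputSt.pos
  const value = tokenizer.readWhile(predicate)
  return new PosToken(type, value, start, tokenizer.inputSt.pos)
}
// An AST node can then store the spans of the tokens it was built from,
// mapping each node back to the original input string.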