mighty

The mighty programming language, compiler and tools (WIP)
Log | Files | Refs

lexer.mty (3541B)


      1 ns lexer
      2 
      3 // This file is a sketch of how a lexer could look in Mighty.
      4 // The important idea is `generate`: it hides the loop, while `next_token`
      5 // advances an explicit immutable lexer state.
      6 
      7 pub enum TokenKind ::
      8     ident
      9     int_lit
     10     plus
     11     lparen
     12     rparen
     13     eof
     14     bad
     15 
     16 pub struct Token ::
     17     TokenKind kind
     18     str text
     19     int line
     20     int col
     21 
     22 struct Lexer ::
     23     str src
     24     int i
     25     int line
     26     int col
     27 
     28 // A generation step returns one value and the next state.
     29 struct Step[T, S] ::
     30     T value
     31     S state
     32 
     33 // Public entry point.
     34 // generate(seed, step) repeatedly calls step(state), collects each value,
     35 // and stops when step returns nil.
     36 pub Token[] lex(str src) =
     37     generate(Lexer(src: src, i: 0, line: 1, col: 1), next_token)
     38 
     39 // Produce one token, plus the lexer state after that token.
     40 // Returning nil means there is nothing else to produce.
     41 Step[Token, Lexer]? next_token(Lexer s) =
     42     nil if done(s)
     43     else next_token(skip_space(s)) if space(peek(s))
     44     else read_ident(s) if ident_head(peek(s))
     45     else read_int(s) if digit(peek(s))
     46     else read_one(s, TokenKind.plus) if peek(s) == '+'
     47     else read_one(s, TokenKind.lparen) if peek(s) == '('
     48     else read_one(s, TokenKind.rparen) if peek(s) == ')'
     49     else read_one(s, TokenKind.bad)
     50 
     51 // End of input. The index is the only thing that moves through src.
     52 bool done(Lexer s) =
     53     s.i >= #s.src
     54 
     55 // Safe current character. Only call this after checking not done(s).
     56 char peek(Lexer s) =
     57     s.src[s.i]
     58 
     59 // Advance by one character, updating line/column.
     60 Lexer advance(Lexer s) =
     61     Lexer(src: s.src, i: s.i + 1, line: s.line + 1, col: 1)
     62         if peek(s) == '\n'
     63         else Lexer(src: s.src, i: s.i + 1, line: s.line, col: s.col + 1)
     64 
     65 // Skip spaces and newlines. This returns only the updated state;
     66 // no token is emitted for whitespace in this tiny lexer.
     67 Lexer skip_space(Lexer s) =
     68     s if done(s) or not space(peek(s))
     69     else skip_space(advance(s))
     70 
     71 // Read a single-character token like '+', '(', or ')'.
     72 Step[Token, Lexer] read_one(Lexer s, TokenKind kind) =
     73     Step(
     74         value: Token(kind: kind, text: s.src[s.i..s.i + 1], line: s.line, col: s.col),
     75         state: advance(s)
     76     )
     77 
     78 // Read an identifier by consuming the longest identifier tail.
     79 Step[Token, Lexer] read_ident(Lexer s) ::
     80     int start = s.i
     81     int line = s.line
     82     int col = s.col
     83     Lexer end = skip_while(s, ident_tail)
     84 
     85     ret Step(
     86         value: Token(kind: TokenKind.ident, text: s.src[start..end.i], line: line, col: col),
     87         state: end
     88     )
     89 
     90 // Read an integer literal by consuming consecutive digits.
     91 Step[Token, Lexer] read_int(Lexer s) ::
     92     int start = s.i
     93     int line = s.line
     94     int col = s.col
     95     Lexer end = skip_while(s, digit)
     96 
     97     ret Step(
     98         value: Token(kind: TokenKind.int_lit, text: s.src[start..end.i], line: line, col: col),
     99         state: end
    100     )
    101 
    102 // Move forward while pred(current_char) is true.
    103 Lexer skip_while(Lexer s, bool pred(char)) =
    104     s if done(s) or not pred(peek(s))
    105     else skip_while(advance(s), pred)
    106 
    107 bool space(char c) =
    108     c == ' ' or c == '\t' or c == '\r' or c == '\n'
    109 
    110 bool digit(char c) =
    111     c >= '0' and c <= '9'
    112 
    113 bool alpha(char c) =
    114     c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'
    115 
    116 bool ident_head(char c) =
    117     alpha(c) or c == '_'
    118 
    119 bool ident_tail(char c) =
    120     ident_head(c) or digit(c)
    121 
    122 // Builtin shape, shown here only to make the control flow explicit.
    123 // A compiler/runtime would implement this as a loop.
    124 T[] generate[T, S](S state, Step[T, S]? step(S)) = builtin