lexer.mty (3541B)
ns lexer

// This file is a sketch of how a lexer could look in Mighty.
// The important idea is `generate`: it hides the loop, while `next_token`
// advances an explicit immutable lexer state.

// The category of a token.
// NOTE(review): `eof` is declared but never produced in this file —
// `next_token` signals end of input by returning nil rather than emitting
// an eof token. Confirm whether consumers expect a trailing eof.
pub enum TokenKind ::
    ident
    int_lit
    plus
    lparen
    rparen
    eof
    bad

// One lexed token: its kind, the exact source text it covers, and the
// 1-based line/column where it starts.
pub struct Token ::
    TokenKind kind
    str text
    int line
    int col

// Immutable lexer state: the full source plus the current index `i` and
// the 1-based line/column of the character at that index.
struct Lexer ::
    str src
    int i
    int line
    int col

// A generation step returns one value and the next state.
struct Step[T, S] ::
    T value
    S state

// Public entry point.
// generate(seed, step) repeatedly calls step(state), collects each value,
// and stops when step returns nil.
pub Token[] lex(str src) =
    generate(Lexer(src: src, i: 0, line: 1, col: 1), next_token)

// Produce one token, plus the lexer state after that token.
// Returning nil means there is nothing else to produce.
// Dispatch order matters: whitespace is handled by recursing on the state
// past the run of spaces, and the final `bad` arm is the catch-all for any
// character no earlier arm claimed — it still consumes one character via
// read_one, so lexing always makes progress.
Step[Token, Lexer]? next_token(Lexer s) =
    nil if done(s)
    else next_token(skip_space(s)) if space(peek(s))
    else read_ident(s) if ident_head(peek(s))
    else read_int(s) if digit(peek(s))
    else read_one(s, TokenKind.plus) if peek(s) == '+'
    else read_one(s, TokenKind.lparen) if peek(s) == '('
    else read_one(s, TokenKind.rparen) if peek(s) == ')'
    else read_one(s, TokenKind.bad)

// End of input. The index is the only thing that moves through src.
bool done(Lexer s) =
    s.i >= #s.src

// Safe current character. Only call this after checking not done(s).
char peek(Lexer s) =
    s.src[s.i]

// Advance by one character, updating line/column: a newline bumps the
// line and resets the column to 1; any other character bumps the column.
Lexer advance(Lexer s) =
    Lexer(src: s.src, i: s.i + 1, line: s.line + 1, col: 1)
        if peek(s) == '\n'
    else Lexer(src: s.src, i: s.i + 1, line: s.line, col: s.col + 1)

// Skip spaces and newlines. This returns only the updated state;
// no token is emitted for whitespace in this tiny lexer.
Lexer skip_space(Lexer s) =
    s if done(s) or not space(peek(s))
    else skip_space(advance(s))

// Read a single-character token like '+', '(', or ')'.
// The token text is the one-character slice at the current index, and the
// returned state has consumed exactly that character.
Step[Token, Lexer] read_one(Lexer s, TokenKind kind) =
    Step(
        value: Token(kind: kind, text: s.src[s.i..s.i + 1], line: s.line, col: s.col),
        state: advance(s)
    )

// Shared scanner for multi-character tokens: remember where the token
// starts, consume the longest run of characters satisfying pred, and emit
// one token of the given kind covering exactly that run.
// (Extracted because read_ident and read_int were duplicates differing
// only in the kind and predicate.)
Step[Token, Lexer] read_span(Lexer s, TokenKind kind, bool pred(char)) ::
    int start = s.i
    int line = s.line
    int col = s.col
    Lexer end = skip_while(s, pred)

    ret Step(
        value: Token(kind: kind, text: s.src[start..end.i], line: line, col: col),
        state: end
    )

// Read an identifier by consuming the longest identifier tail.
// next_token has already checked ident_head, and every ident_head
// character also satisfies ident_tail, so the first character is consumed.
Step[Token, Lexer] read_ident(Lexer s) =
    read_span(s, TokenKind.ident, ident_tail)

// Read an integer literal by consuming consecutive digits.
Step[Token, Lexer] read_int(Lexer s) =
    read_span(s, TokenKind.int_lit, digit)

// Move forward while pred(current_char) is true.
Lexer skip_while(Lexer s, bool pred(char)) =
    s if done(s) or not pred(peek(s))
    else skip_while(advance(s), pred)

// ---- Character classes ----

bool space(char c) =
    c == ' ' or c == '\t' or c == '\r' or c == '\n'

bool digit(char c) =
    c >= '0' and c <= '9'

// Relies on `and` binding tighter than `or`, as elsewhere in this file.
bool alpha(char c) =
    c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'

bool ident_head(char c) =
    alpha(c) or c == '_'

bool ident_tail(char c) =
    ident_head(c) or digit(c)

// Builtin shape, shown here only to make the control flow explicit.
// A compiler/runtime would implement this as a loop.
T[] generate[T, S](S state, Step[T, S]? step(S)) = builtin