mighty

The mighty programming language, compiler and tools (WIP)
Log | Files | Refs

lexer.mty (3541B)



ns lexer

// This file is a sketch of how a lexer could look in Mighty.
// The important idea is `generate`: it hides the loop, while `next_token`
// advances an explicit immutable lexer state.

pub enum TokenKind ::
    ident
    int_lit
    plus
    lparen
    rparen
    eof
    bad

pub struct Token ::
    TokenKind kind
    str text
    int line
    int col

struct Lexer ::
    str src
    int i
    int line
    int col

// A generation step returns one value and the next state.
struct Step[T, S] ::
    T value
    S state

// Public entry point.
// generate(seed, step) repeatedly calls step(state), collects each value,
// and stops when step returns nil.
pub Token[] lex(str src) =
    generate(Lexer(src: src, i: 0, line: 1, col: 1), next_token)

// Produce one token, plus the lexer state after that token.
// Returning nil means there is nothing else to produce.
Step[Token, Lexer]? next_token(Lexer s) =
    nil if done(s)
    else next_token(skip_space(s)) if space(peek(s))
    else read_ident(s) if ident_head(peek(s))
    else read_int(s) if digit(peek(s))
    else read_one(s, TokenKind.plus) if peek(s) == '+'
    else read_one(s, TokenKind.lparen) if peek(s) == '('
    else read_one(s, TokenKind.rparen) if peek(s) == ')'
    else read_one(s, TokenKind.bad)

// End of input. The index is the only thing that moves through src.
bool done(Lexer s) =
    s.i >= #s.src

// Safe current character. Only call this after checking not done(s).
char peek(Lexer s) =
    s.src[s.i]

// Advance by one character, updating line/column.
Lexer advance(Lexer s) =
    Lexer(src: s.src, i: s.i + 1, line: s.line + 1, col: 1)
        if peek(s) == '\n'
        else Lexer(src: s.src, i: s.i + 1, line: s.line, col: s.col + 1)

// Skip spaces and newlines. This returns only the updated state;
// no token is emitted for whitespace in this tiny lexer.
Lexer skip_space(Lexer s) =
    s if done(s) or not space(peek(s))
    else skip_space(advance(s))

// Read a single-character token like '+', '(', or ')'.
Step[Token, Lexer] read_one(Lexer s, TokenKind kind) =
    Step(
        value: Token(kind: kind, text: s.src[s.i..s.i + 1], line: s.line, col: s.col),
        state: advance(s)
    )

// Read an identifier by consuming the longest identifier tail.
Step[Token, Lexer] read_ident(Lexer s) ::
    int start = s.i
    int line = s.line
    int col = s.col
    Lexer end = skip_while(s, ident_tail)

    ret Step(
        value: Token(kind: TokenKind.ident, text: s.src[start..end.i], line: line, col: col),
        state: end
    )

// Read an integer literal by consuming consecutive digits.
Step[Token, Lexer] read_int(Lexer s) ::
    int start = s.i
    int line = s.line
    int col = s.col
    Lexer end = skip_while(s, digit)

    ret Step(
        value: Token(kind: TokenKind.int_lit, text: s.src[start..end.i], line: line, col: col),
        state: end
    )

// Move forward while pred(current_char) is true.
Lexer skip_while(Lexer s, bool pred(char)) =
    s if done(s) or not pred(peek(s))
    else skip_while(advance(s), pred)

bool space(char c) =
    c == ' ' or c == '\t' or c == '\r' or c == '\n'

bool digit(char c) =
    c >= '0' and c <= '9'

bool alpha(char c) =
    c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'

bool ident_head(char c) =
    alpha(c) or c == '_'

bool ident_tail(char c) =
    ident_head(c) or digit(c)

// Builtin shape, shown here only to make the control flow explicit.
// A compiler/runtime would implement this as a loop.
T[] generate[T, S](S state, Step[T, S]? step(S)) = builtin