mighty

The mighty programming language, compiler and tools (WIP)
Log | Files | Refs

commit 791321af39c8203340815bd2747ddc10163b3f2b
parent 308881ef7bfcfa3943a70e24f1a1e5e0a005e1d9
Author: keyle <keyle@capsule.org>
Date:   Sat, 16 May 2026 16:40:02 +1000

syntax test

Diffstat:
Alexer.mty | 124+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 124 insertions(+), 0 deletions(-)

diff --git a/lexer.mty b/lexer.mty @@ -0,0 +1,124 @@ +ns lexer + +// This file is a sketch of how a lexer could look in Mighty. +// The important idea is `generate`: it hides the loop, while `next_token` +// advances an explicit immutable lexer state. + +pub enum TokenKind :: + ident + int_lit + plus + lparen + rparen + eof + bad + +pub struct Token :: + TokenKind kind + str text + int line + int col + +struct Lexer :: + str src + int i + int line + int col + +// A generation step returns one value and the next state. +struct Step[T, S] :: + T value + S state + +// Public entry point. +// generate(seed, step) repeatedly calls step(state), collects each value, +// and stops when step returns nil. +pub Token[] lex(str src) = + generate(Lexer(src: src, i: 0, line: 1, col: 1), next_token) + +// Produce one token, plus the lexer state after that token. +// Returning nil means there is nothing else to produce. +Step[Token, Lexer]? next_token(Lexer s) = + nil if done(s) + else next_token(skip_space(s)) if space(peek(s)) + else read_ident(s) if ident_head(peek(s)) + else read_int(s) if digit(peek(s)) + else read_one(s, TokenKind.plus) if peek(s) == '+' + else read_one(s, TokenKind.lparen) if peek(s) == '(' + else read_one(s, TokenKind.rparen) if peek(s) == ')' + else read_one(s, TokenKind.bad) + +// End of input. The index is the only thing that moves through src. +bool done(Lexer s) = + s.i >= #s.src + +// Safe current character. Only call this after checking not done(s). +char peek(Lexer s) = + s.src[s.i] + +// Advance by one character, updating line/column. +Lexer advance(Lexer s) = + Lexer(src: s.src, i: s.i + 1, line: s.line + 1, col: 1) + if peek(s) == '\n' + else Lexer(src: s.src, i: s.i + 1, line: s.line, col: s.col + 1) + +// Skip spaces and newlines. This returns only the updated state; +// no token is emitted for whitespace in this tiny lexer. +Lexer skip_space(Lexer s) = + s if done(s) or not space(peek(s)) + else skip_space(advance(s)) + +// Read a single-character token like '+', '(', or ')'. +Step[Token, Lexer] read_one(Lexer s, TokenKind kind) = + Step( + value: Token(kind: kind, text: s.src[s.i..s.i + 1], line: s.line, col: s.col), + state: advance(s) + ) + +// Read an identifier by consuming the longest identifier tail. +Step[Token, Lexer] read_ident(Lexer s) :: + int start = s.i + int line = s.line + int col = s.col + Lexer end = skip_while(s, ident_tail) + + ret Step( + value: Token(kind: TokenKind.ident, text: s.src[start..end.i], line: line, col: col), + state: end + ) + +// Read an integer literal by consuming consecutive digits. +Step[Token, Lexer] read_int(Lexer s) :: + int start = s.i + int line = s.line + int col = s.col + Lexer end = skip_while(s, digit) + + ret Step( + value: Token(kind: TokenKind.int_lit, text: s.src[start..end.i], line: line, col: col), + state: end + ) + +// Move forward while pred(current_char) is true. +Lexer skip_while(Lexer s, bool pred(char)) = + s if done(s) or not pred(peek(s)) + else skip_while(advance(s), pred) + +bool space(char c) = + c == ' ' or c == '\t' or c == '\r' or c == '\n' + +bool digit(char c) = + c >= '0' and c <= '9' + +bool alpha(char c) = + c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z' + +bool ident_head(char c) = + alpha(c) or c == '_' + +bool ident_tail(char c) = + ident_head(c) or digit(c) + +// Builtin shape, shown here only to make the control flow explicit. +// A compiler/runtime would implement this as a loop. +T[] generate[T, S](S state, Step[T, S]? step(S)) = builtin