mighty

The mighty programming language, compiler and tools (WIP)
Log | Files | Refs

commit 908cf8e1d938159a8f0c4b5e01213d797f5cfbd2
parent b57370c7e257f6f5c2988a8aca2f6836c6a5552f
Author: citbl <citbl@citbl.org>
Date:   Thu, 21 May 2026 20:45:49 +1000

lexing part 1

Diffstat:
M mtcc/src/lexer.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M mtcc/src/token.h | 7 +------
2 files changed, 112 insertions(+), 34 deletions(-)

diff --git a/mtcc/src/lexer.c b/mtcc/src/lexer.c @@ -10,7 +10,7 @@ static bool is_digit(const char c); static bool is_space(const char c); static bool is_alpha_numeric(const char c); static bool is_dot(const char c); -static enum token_type compare_span_to_token(struct lexer *lexer, struct span ident); +static enum token_type compare_span_to_token(struct lexer *lexer, struct span ident, size_t); static void add_token(struct lexer *, struct token); static void print_tokens(struct lexer *); @@ -24,7 +24,7 @@ lexer_lex(struct lexer *lexer) { size_t line = 1; size_t start = 0; size_t start_col = 0; - struct span ident; + struct span span; struct token tok; enum token_type type; @@ -45,15 +45,32 @@ lexer_lex(struct lexer *lexer) { start = i; start_col = col; + if (is_digit(c)) { + type = TOKEN_LITERAL_INT; + while (i < len && (is_digit(src[i]) || is_dot(src[i]))) { + if (is_dot(src[i])) { + type = TOKEN_LITERAL_FLOAT; + } + i++; + col++; + } + span = (struct span){ + .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i}; + tok = (struct token){.span = span, .token_type = type}; + add_token(lexer, tok); + continue; + } + if (is_alpha(c) || is__(c)) { + type = TOKEN_IDENT; while (i < len && (is_alpha_numeric(src[i]) || is__(src[i]))) { i++; col++; } - ident = (struct span){ + span = (struct span){ .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i}; - type = compare_span_to_token(lexer, ident); - tok = (struct token){.span = ident, .token_type = type}; + type = compare_span_to_token(lexer, span, i - start); + tok = (struct token){.span = span, .token_type = type}; add_token(lexer, tok); continue; } @@ -68,11 +85,60 @@ lexer_lex(struct lexer *lexer) { continue; } + switch (c) { + case '(': + type = TOKEN_L_PAREN; + break; + case ')': + type = TOKEN_R_PAREN; + break; + case '[': + type = TOKEN_L_BRACKET; + break; + case ']': + type = TOKEN_R_BRACKET; + break; + case '{': + type = TOKEN_L_BRACE; + break; + case 
'}': + type = TOKEN_R_BRACE; + break; + case ',': + type = TOKEN_COMMA; + break; + case '=': + type = TOKEN_EQ; + break; + case ':': + type = TOKEN_COLON; + break; + case '+': + type = TOKEN_PLUS; + break; + case '-': + type = TOKEN_MINUS; + break; + case '*': + type = TOKEN_STAR; + break; + case '/': + type = TOKEN_SLASH; + break; + default: + type = TOKEN_BAD_TOKEN; + break; + } i++; col++; + span = (struct span){ + .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i}; + tok = (struct token){.span = span, .token_type = type}; + add_token(lexer, tok); } print_tokens(lexer); + printf("----------------------------\n"); } static bool @@ -106,11 +172,25 @@ is_dot(const char c) { } static const char *NAMES_TOKEN[] = { - [TOKEN_IDENT] = "ident/type", + [TOKEN_IDENT] = "ident / type", [TOKEN_L_PAREN] = "open paren", [TOKEN_R_PAREN] = "close paren", [TOKEN_L_BRACE] = "open brace", [TOKEN_R_BRACE] = "close brace", + [TOKEN_COMMA] = "comma", + [TOKEN_COLON] = "colon", + [TOKEN_EQ] = "equal / assign", + [TOKEN_PLUS] = "plus", + [TOKEN_MINUS] = "minus", + [TOKEN_STAR] = "star / mult", + [TOKEN_SLASH] = "slash / div", + [TOKEN_KEYWORD_PUB] = "keyword public", + [TOKEN_LITERAL_INT] = "literal INT", + [TOKEN_LITERAL_FLOAT] = "literal FLOAT", + [TOKEN_LITERAL_CHAR] = "literal CHAR", + [TOKEN_LITERAL_STR] = "literal STRING", + [TOKEN_LITERAL_BOOL] = "literal BOOL", + [TOKEN_BAD_TOKEN] = "BAD TOKEN", }; static void @@ -120,6 +200,9 @@ print_tokens(struct lexer *lexer) { struct token t; for (i = 0; i < lexer->tok_len; i++) { t = lexer->tokens[i]; + if (NAMES_TOKEN[t.token_type] == NULL) { + printf("null token: %d:\n", t.token_type); + } printf("L%zu:%zu \t%-14s '", t.span.line, t.span.col, NAMES_TOKEN[t.token_type]); fwrite(src + t.span.start, 1, t.span.stop - t.span.start, stdout); printf("'\n"); @@ -127,52 +210,52 @@ print_tokens(struct lexer *lexer) { } static enum token_type -compare_span_to_token(struct lexer *lexer, struct span ident) { 
+compare_span_to_token(struct lexer *lexer, struct span ident, size_t len) { enum token_type t = TOKEN_IDENT; char c; size_t i; - if (ident.start >= lexer->src_len || ident.stop >= lexer->src_len) return false; + if (ident.start >= lexer->src_len || ident.stop >= lexer->src_len) return t; - if (strncmp(lexer->src + ident.start, "ns", 2) == 0) + if (strncmp(lexer->src + ident.start, "ns", 2) == 0 && len == 2) t = TOKEN_KEYWORD_NS; - else if (strncmp(lexer->src + ident.start, "in", 2) == 0) + else if (strncmp(lexer->src + ident.start, "in", 2) == 0 && len == 2) t = TOKEN_KEYWORD_IN; - else if (strncmp(lexer->src + ident.start, "from", 4) == 0) + else if (strncmp(lexer->src + ident.start, "from", 4) == 0 && len == 4) t = TOKEN_KEYWORD_FROM; - else if (strncmp(lexer->src + ident.start, "use", 3) == 0) + else if (strncmp(lexer->src + ident.start, "use", 3) == 0 && len == 3) t = TOKEN_KEYWORD_USE; - else if (strncmp(lexer->src + ident.start, "ffi", 3) == 0) + else if (strncmp(lexer->src + ident.start, "ffi", 3) == 0 && len == 3) t = TOKEN_KEYWORD_FFI; - else if (strncmp(lexer->src + ident.start, "drop", 4) == 0) + else if (strncmp(lexer->src + ident.start, "drop", 4) == 0 && len == 4) t = TOKEN_KEYWORD_DROP; - else if (strncmp(lexer->src + ident.start, "as", 2) == 0) + else if (strncmp(lexer->src + ident.start, "as", 2) == 0 && len == 2) t = TOKEN_KEYWORD_AS; - else if (strncmp(lexer->src + ident.start, "of", 2) == 0) + else if (strncmp(lexer->src + ident.start, "of", 2) == 0 && len == 2) t = TOKEN_KEYWORD_OF; - else if (strncmp(lexer->src + ident.start, "and", 3) == 0) + else if (strncmp(lexer->src + ident.start, "and", 3) == 0 && len == 3) t = TOKEN_KEYWORD_AND; - else if (strncmp(lexer->src + ident.start, "or", 2) == 0) + else if (strncmp(lexer->src + ident.start, "or", 2) == 0 && len == 2) t = TOKEN_KEYWORD_OR; - else if (strncmp(lexer->src + ident.start, "ref", 3) == 0) + else if (strncmp(lexer->src + ident.start, "ref", 3) == 0 && len == 3) t = TOKEN_KEYWORD_REF; - 
else if (strncmp(lexer->src + ident.start, "struct", 6) == 0) + else if (strncmp(lexer->src + ident.start, "struct", 6) == 0 && len == 6) t = TOKEN_KEYWORD_STRUCT; - else if (strncmp(lexer->src + ident.start, "enum", 4) == 0) + else if (strncmp(lexer->src + ident.start, "enum", 4) == 0 && len == 4) t = TOKEN_KEYWORD_ENUM; - else if (strncmp(lexer->src + ident.start, "pre", 3) == 0) + else if (strncmp(lexer->src + ident.start, "pre", 3) == 0 && len == 3) t = TOKEN_KEYWORD_PRE; - else if (strncmp(lexer->src + ident.start, "post", 4) == 0) + else if (strncmp(lexer->src + ident.start, "post", 4) == 0 && len == 4) t = TOKEN_KEYWORD_POST; - else if (strncmp(lexer->src + ident.start, "inv", 3) == 0) + else if (strncmp(lexer->src + ident.start, "inv", 3) == 0 && len == 3) t = TOKEN_KEYWORD_INV; - else if (strncmp(lexer->src + ident.start, "if", 2) == 0) + else if (strncmp(lexer->src + ident.start, "if", 2) == 0 && len == 2) t = TOKEN_KEYWORD_IF; - else if (strncmp(lexer->src + ident.start, "else", 4) == 0) + else if (strncmp(lexer->src + ident.start, "else", 4) == 0 && len == 4) t = TOKEN_KEYWORD_ELSE; - else if (strncmp(lexer->src + ident.start, "where", 5) == 0) + else if (strncmp(lexer->src + ident.start, "where", 5) == 0 && len == 5) t = TOKEN_KEYWORD_WHERE; - else if (strncmp(lexer->src + ident.start, "pub", 3) == 0) + else if (strncmp(lexer->src + ident.start, "pub", 3) == 0 && len == 3) t = TOKEN_KEYWORD_PUB; return t; } diff --git a/mtcc/src/token.h b/mtcc/src/token.h @@ -3,8 +3,7 @@ #include <stddef.h> enum token_type { - TOKEN_IDENT, - + TOKEN_IDENT = 6, TOKEN_KEYWORD_NS, TOKEN_KEYWORD_IN, TOKEN_KEYWORD_FROM, @@ -25,20 +24,17 @@ enum token_type { TOKEN_KEYWORD_ELSE, TOKEN_KEYWORD_WHERE, TOKEN_KEYWORD_PUB, - TOKEN_DOT, TOKEN_COLON, TOKEN_COLON_COLON, TOKEN_COMMA, TOKEN_EOF, TOKEN_EQ, - TOKEN_LITERAL_BOOL, TOKEN_LITERAL_INT, TOKEN_LITERAL_FLOAT, TOKEN_LITERAL_CHAR, TOKEN_LITERAL_STR, - TOKEN_MINUS, TOKEN_PLUS, TOKEN_STAR, @@ -48,7 +44,6 @@ enum token_type { 
TOKEN_PLUS_EQ, TOKEN_MINUS_MINUS, TOKEN_PLUS_PLUS, - TOKEN_L_PAREN, TOKEN_R_PAREN, TOKEN_L_BRACKET,