lexing part 1 - mighty - The mighty programming language, compiler and tools (WIP)

commit 908cf8e1d938159a8f0c4b5e01213d797f5cfbd2
parent b57370c7e257f6f5c2988a8aca2f6836c6a5552f
Author: citbl <citbl@citbl.org>
Date:   Thu, 21 May 2026 20:45:49 +1000

lexing part 1

Diffstat:
M mtcc/src/lexer.c  | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M mtcc/src/token.h  | 7 +------

2 files changed, 112 insertions(+), 34 deletions(-)
diff --git a/mtcc/src/lexer.c b/mtcc/src/lexer.c
@@ -10,7 +10,7 @@ static bool is_digit(const char c);
 static bool is_space(const char c);
 static bool is_alpha_numeric(const char c);
 static bool is_dot(const char c);
-static enum token_type compare_span_to_token(struct lexer *lexer, struct span ident);
+static enum token_type compare_span_to_token(struct lexer *lexer, struct span ident, size_t);
 static void add_token(struct lexer *, struct token);
 static void print_tokens(struct lexer *);
 
@@ -24,7 +24,7 @@ lexer_lex(struct lexer *lexer) {
     size_t line = 1;
     size_t start = 0;
     size_t start_col = 0;
-    struct span ident;
+    struct span span;
     struct token tok;
     enum token_type type;
 
@@ -45,15 +45,32 @@ lexer_lex(struct lexer *lexer) {
         start = i;
         start_col = col;
 
+        if (is_digit(c)) {
+            type = TOKEN_LITERAL_INT;
+            while (i < len && (is_digit(src[i]) || is_dot(src[i]))) {
+                if (is_dot(src[i])) {
+                    type = TOKEN_LITERAL_FLOAT;
+                }
+                i++;
+                col++;
+            }
+            span = (struct span){
+                .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i};
+            tok = (struct token){.span = span, .token_type = type};
+            add_token(lexer, tok);
+            continue;
+        }
+
         if (is_alpha(c) || is__(c)) {
+            type = TOKEN_IDENT;
             while (i < len && (is_alpha_numeric(src[i]) || is__(src[i]))) {
                 i++;
                 col++;
             }
-            ident = (struct span){
+            span = (struct span){
                 .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i};
-            type = compare_span_to_token(lexer, ident);
-            tok = (struct token){.span = ident, .token_type = type};
+            type = compare_span_to_token(lexer, span, i - start);
+            tok = (struct token){.span = span, .token_type = type};
             add_token(lexer, tok);
             continue;
         }
@@ -68,11 +85,60 @@ lexer_lex(struct lexer *lexer) {
             continue;
         }
 
+        switch (c) {
+            case '(':
+                type = TOKEN_L_PAREN;
+                break;
+            case ')':
+                type = TOKEN_R_PAREN;
+                break;
+            case '[':
+                type = TOKEN_L_BRACKET;
+                break;
+            case ']':
+                type = TOKEN_R_BRACKET;
+                break;
+            case '{':
+                type = TOKEN_L_BRACE;
+                break;
+            case '}':
+                type = TOKEN_R_BRACE;
+                break;
+            case ',':
+                type = TOKEN_COMMA;
+                break;
+            case '=':
+                type = TOKEN_EQ;
+                break;
+            case ':':
+                type = TOKEN_COLON;
+                break;
+            case '+':
+                type = TOKEN_PLUS;
+                break;
+            case '-':
+                type = TOKEN_MINUS;
+                break;
+            case '*':
+                type = TOKEN_STAR;
+                break;
+            case '/':
+                type = TOKEN_SLASH;
+                break;
+            default:
+                type = TOKEN_BAD_TOKEN;
+                break;
+        }
         i++;
         col++;
+        span = (struct span){
+            .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i};
+        tok = (struct token){.span = span, .token_type = type};
+        add_token(lexer, tok);
     }
 
     print_tokens(lexer);
+    printf("----------------------------\n");
 }
 
 static bool
@@ -106,11 +172,25 @@ is_dot(const char c) {
 }
 
 static const char *NAMES_TOKEN[] = {
-    [TOKEN_IDENT] = "ident/type",
+    [TOKEN_IDENT] = "ident / type",
     [TOKEN_L_PAREN] = "open paren",
     [TOKEN_R_PAREN] = "close paren",
     [TOKEN_L_BRACE] = "open brace",
     [TOKEN_R_BRACE] = "close brace",
+    [TOKEN_COMMA] = "comma",
+    [TOKEN_COLON] = "colon",
+    [TOKEN_EQ] = "equal / assign",
+    [TOKEN_PLUS] = "plus",
+    [TOKEN_MINUS] = "minus",
+    [TOKEN_STAR] = "star / mult",
+    [TOKEN_SLASH] = "slash / div",
+    [TOKEN_KEYWORD_PUB] = "keyword public",
+    [TOKEN_LITERAL_INT] = "literal INT",
+    [TOKEN_LITERAL_FLOAT] = "literal FLOAT",
+    [TOKEN_LITERAL_CHAR] = "literal CHAR",
+    [TOKEN_LITERAL_STR] = "literal STRING",
+    [TOKEN_LITERAL_BOOL] = "literal BOOL",
+    [TOKEN_BAD_TOKEN] = "BAD TOKEN",
 };
 
 static void
@@ -120,6 +200,9 @@ print_tokens(struct lexer *lexer) {
     struct token t;
     for (i = 0; i < lexer->tok_len; i++) {
         t = lexer->tokens[i];
+        if (NAMES_TOKEN[t.token_type] == NULL) {
+            printf("null token: %d:\n", t.token_type);
+        }
         printf("L%zu:%zu \t%-14s '", t.span.line, t.span.col, NAMES_TOKEN[t.token_type]);
         fwrite(src + t.span.start, 1, t.span.stop - t.span.start, stdout);
         printf("'\n");
@@ -127,52 +210,52 @@ print_tokens(struct lexer *lexer) {
 }
 
 static enum token_type
-compare_span_to_token(struct lexer *lexer, struct span ident) {
+compare_span_to_token(struct lexer *lexer, struct span ident, size_t len) {
     enum token_type t = TOKEN_IDENT;
     char c;
     size_t i;
 
-    if (ident.start >= lexer->src_len || ident.stop >= lexer->src_len) return false;
+    if (ident.start >= lexer->src_len || ident.stop >= lexer->src_len) return t;
 
-    if (strncmp(lexer->src + ident.start, "ns", 2) == 0)
+    if (strncmp(lexer->src + ident.start, "ns", 2) == 0 && len == 2)
         t = TOKEN_KEYWORD_NS;
-    else if (strncmp(lexer->src + ident.start, "in", 2) == 0)
+    else if (strncmp(lexer->src + ident.start, "in", 2) == 0 && len == 2)
         t = TOKEN_KEYWORD_IN;
-    else if (strncmp(lexer->src + ident.start, "from", 4) == 0)
+    else if (strncmp(lexer->src + ident.start, "from", 4) == 0 && len == 4)
         t = TOKEN_KEYWORD_FROM;
-    else if (strncmp(lexer->src + ident.start, "use", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "use", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_USE;
-    else if (strncmp(lexer->src + ident.start, "ffi", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "ffi", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_FFI;
-    else if (strncmp(lexer->src + ident.start, "drop", 4) == 0)
+    else if (strncmp(lexer->src + ident.start, "drop", 4) == 0 && len == 4)
         t = TOKEN_KEYWORD_DROP;
-    else if (strncmp(lexer->src + ident.start, "as", 2) == 0)
+    else if (strncmp(lexer->src + ident.start, "as", 2) == 0 && len == 2)
         t = TOKEN_KEYWORD_AS;
-    else if (strncmp(lexer->src + ident.start, "of", 2) == 0)
+    else if (strncmp(lexer->src + ident.start, "of", 2) == 0 && len == 2)
         t = TOKEN_KEYWORD_OF;
-    else if (strncmp(lexer->src + ident.start, "and", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "and", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_AND;
-    else if (strncmp(lexer->src + ident.start, "or", 2) == 0)
+    else if (strncmp(lexer->src + ident.start, "or", 2) == 0 && len == 2)
         t = TOKEN_KEYWORD_OR;
-    else if (strncmp(lexer->src + ident.start, "ref", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "ref", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_REF;
-    else if (strncmp(lexer->src + ident.start, "struct", 6) == 0)
+    else if (strncmp(lexer->src + ident.start, "struct", 6) == 0 && len == 6)
         t = TOKEN_KEYWORD_STRUCT;
-    else if (strncmp(lexer->src + ident.start, "enum", 4) == 0)
+    else if (strncmp(lexer->src + ident.start, "enum", 4) == 0 && len == 4)
         t = TOKEN_KEYWORD_ENUM;
-    else if (strncmp(lexer->src + ident.start, "pre", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "pre", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_PRE;
-    else if (strncmp(lexer->src + ident.start, "post", 4) == 0)
+    else if (strncmp(lexer->src + ident.start, "post", 4) == 0 && len == 4)
         t = TOKEN_KEYWORD_POST;
-    else if (strncmp(lexer->src + ident.start, "inv", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "inv", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_INV;
-    else if (strncmp(lexer->src + ident.start, "if", 2) == 0)
+    else if (strncmp(lexer->src + ident.start, "if", 2) == 0 && len == 2)
         t = TOKEN_KEYWORD_IF;
-    else if (strncmp(lexer->src + ident.start, "else", 4) == 0)
+    else if (strncmp(lexer->src + ident.start, "else", 4) == 0 && len == 4)
         t = TOKEN_KEYWORD_ELSE;
-    else if (strncmp(lexer->src + ident.start, "where", 5) == 0)
+    else if (strncmp(lexer->src + ident.start, "where", 5) == 0 && len == 5)
         t = TOKEN_KEYWORD_WHERE;
-    else if (strncmp(lexer->src + ident.start, "pub", 3) == 0)
+    else if (strncmp(lexer->src + ident.start, "pub", 3) == 0 && len == 3)
         t = TOKEN_KEYWORD_PUB;
     return t;
 }
diff --git a/mtcc/src/token.h b/mtcc/src/token.h
@@ -3,8 +3,7 @@
 #include <stddef.h>
 
 enum token_type {
-    TOKEN_IDENT,
-
+    TOKEN_IDENT = 6,
     TOKEN_KEYWORD_NS,
     TOKEN_KEYWORD_IN,
     TOKEN_KEYWORD_FROM,
@@ -25,20 +24,17 @@ enum token_type {
     TOKEN_KEYWORD_ELSE,
     TOKEN_KEYWORD_WHERE,
     TOKEN_KEYWORD_PUB,
-
     TOKEN_DOT,
     TOKEN_COLON,
     TOKEN_COLON_COLON,
     TOKEN_COMMA,
     TOKEN_EOF,
     TOKEN_EQ,
-
     TOKEN_LITERAL_BOOL,
     TOKEN_LITERAL_INT,
     TOKEN_LITERAL_FLOAT,
     TOKEN_LITERAL_CHAR,
     TOKEN_LITERAL_STR,
-
     TOKEN_MINUS,
     TOKEN_PLUS,
     TOKEN_STAR,
@@ -48,7 +44,6 @@ enum token_type {
     TOKEN_PLUS_EQ,
     TOKEN_MINUS_MINUS,
     TOKEN_PLUS_PLUS,
-
     TOKEN_L_PAREN,
     TOKEN_R_PAREN,
     TOKEN_L_BRACKET,

	mighty The mighty programming language, compiler and tools (WIP)
	Log | Files | Refs

M	mtcc/src/lexer.c	|	139	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	mtcc/src/token.h	|	7	+------