commit 908cf8e1d938159a8f0c4b5e01213d797f5cfbd2
parent b57370c7e257f6f5c2988a8aca2f6836c6a5552f
Author: citbl <citbl@citbl.org>
Date: Thu, 21 May 2026 20:45:49 +1000
lexing part 1
Diffstat:
2 files changed, 112 insertions(+), 34 deletions(-)
diff --git a/mtcc/src/lexer.c b/mtcc/src/lexer.c
@@ -10,7 +10,7 @@ static bool is_digit(const char c);
static bool is_space(const char c);
static bool is_alpha_numeric(const char c);
static bool is_dot(const char c);
-static enum token_type compare_span_to_token(struct lexer *lexer, struct span ident);
+static enum token_type compare_span_to_token(struct lexer *lexer, struct span ident, size_t);
static void add_token(struct lexer *, struct token);
static void print_tokens(struct lexer *);
@@ -24,7 +24,7 @@ lexer_lex(struct lexer *lexer) {
size_t line = 1;
size_t start = 0;
size_t start_col = 0;
- struct span ident;
+ struct span span;
struct token tok;
enum token_type type;
@@ -45,15 +45,32 @@ lexer_lex(struct lexer *lexer) {
start = i;
start_col = col;
+ if (is_digit(c)) {
+ type = TOKEN_LITERAL_INT;
+ while (i < len && (is_digit(src[i]) || (is_dot(src[i]) && type == TOKEN_LITERAL_INT))) { /* at most one '.', so "1.2.3" is not lexed as a single FLOAT */
+ if (is_dot(src[i])) {
+ type = TOKEN_LITERAL_FLOAT;
+ }
+ i++;
+ col++;
+ }
+ span = (struct span){
+ .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i};
+ tok = (struct token){.span = span, .token_type = type};
+ add_token(lexer, tok);
+ continue;
+ }
+
if (is_alpha(c) || is__(c)) {
+ type = TOKEN_IDENT;
while (i < len && (is_alpha_numeric(src[i]) || is__(src[i]))) {
i++;
col++;
}
- ident = (struct span){
+ span = (struct span){
.filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i};
- type = compare_span_to_token(lexer, ident);
- tok = (struct token){.span = ident, .token_type = type};
+ type = compare_span_to_token(lexer, span, i - start);
+ tok = (struct token){.span = span, .token_type = type};
add_token(lexer, tok);
continue;
}
@@ -68,11 +85,60 @@ lexer_lex(struct lexer *lexer) {
continue;
}
+ switch (c) {
+ case '(':
+ type = TOKEN_L_PAREN;
+ break;
+ case ')':
+ type = TOKEN_R_PAREN;
+ break;
+ case '[':
+ type = TOKEN_L_BRACKET;
+ break;
+ case ']':
+ type = TOKEN_R_BRACKET;
+ break;
+ case '{':
+ type = TOKEN_L_BRACE;
+ break;
+ case '}':
+ type = TOKEN_R_BRACE;
+ break;
+ case ',':
+ type = TOKEN_COMMA;
+ break;
+ case '=':
+ type = TOKEN_EQ;
+ break;
+ case ':':
+ type = TOKEN_COLON;
+ break;
+ case '+':
+ type = TOKEN_PLUS;
+ break;
+ case '-':
+ type = TOKEN_MINUS;
+ break;
+ case '*':
+ type = TOKEN_STAR;
+ break;
+ case '/':
+ type = TOKEN_SLASH;
+ break;
+ default:
+ type = TOKEN_BAD_TOKEN;
+ break;
+ }
i++;
col++;
+ span = (struct span){
+ .filename = lexer->file, .col = start_col, .line = line, .start = start, .stop = i};
+ tok = (struct token){.span = span, .token_type = type};
+ add_token(lexer, tok);
}
print_tokens(lexer);
+ printf("----------------------------\n");
}
static bool
@@ -106,11 +172,47 @@ is_dot(const char c) {
}
static const char *NAMES_TOKEN[] = {
- [TOKEN_IDENT] = "ident/type",
+ /* Every kind the lexer can emit needs an entry: print_tokens formats the name with %s. */
+ [TOKEN_IDENT] = "ident / type",
[TOKEN_L_PAREN] = "open paren",
[TOKEN_R_PAREN] = "close paren",
+ [TOKEN_L_BRACKET] = "open bracket",
+ [TOKEN_R_BRACKET] = "close bracket",
[TOKEN_L_BRACE] = "open brace",
[TOKEN_R_BRACE] = "close brace",
+ [TOKEN_COMMA] = "comma",
+ [TOKEN_COLON] = "colon",
+ [TOKEN_EQ] = "equal / assign",
+ [TOKEN_PLUS] = "plus",
+ [TOKEN_MINUS] = "minus",
+ [TOKEN_STAR] = "star / mult",
+ [TOKEN_SLASH] = "slash / div",
+ [TOKEN_KEYWORD_NS] = "keyword ns",
+ [TOKEN_KEYWORD_IN] = "keyword in",
+ [TOKEN_KEYWORD_FROM] = "keyword from",
+ [TOKEN_KEYWORD_USE] = "keyword use",
+ [TOKEN_KEYWORD_FFI] = "keyword ffi",
+ [TOKEN_KEYWORD_DROP] = "keyword drop",
+ [TOKEN_KEYWORD_AS] = "keyword as",
+ [TOKEN_KEYWORD_OF] = "keyword of",
+ [TOKEN_KEYWORD_AND] = "keyword and",
+ [TOKEN_KEYWORD_OR] = "keyword or",
+ [TOKEN_KEYWORD_REF] = "keyword ref",
+ [TOKEN_KEYWORD_STRUCT] = "keyword struct",
+ [TOKEN_KEYWORD_ENUM] = "keyword enum",
+ [TOKEN_KEYWORD_PRE] = "keyword pre",
+ [TOKEN_KEYWORD_POST] = "keyword post",
+ [TOKEN_KEYWORD_INV] = "keyword inv",
+ [TOKEN_KEYWORD_IF] = "keyword if",
+ [TOKEN_KEYWORD_ELSE] = "keyword else",
+ [TOKEN_KEYWORD_WHERE] = "keyword where",
+ [TOKEN_KEYWORD_PUB] = "keyword public",
+ [TOKEN_LITERAL_INT] = "literal INT",
+ [TOKEN_LITERAL_FLOAT] = "literal FLOAT",
+ [TOKEN_LITERAL_CHAR] = "literal CHAR",
+ [TOKEN_LITERAL_STR] = "literal STRING",
+ [TOKEN_LITERAL_BOOL] = "literal BOOL",
+ [TOKEN_BAD_TOKEN] = "BAD TOKEN",
};
static void
@@ -120,6 +200,13 @@ print_tokens(struct lexer *lexer) {
struct token t;
for (i = 0; i < lexer->tok_len; i++) {
t = lexer->tokens[i];
+ /* Kinds with no NAMES_TOKEN entry (or past the end of the table) print a placeholder: passing NULL to %s is undefined behaviour. */
+ const char *name = "<unnamed>";
+ if ((size_t)t.token_type < sizeof NAMES_TOKEN / sizeof NAMES_TOKEN[0] && NAMES_TOKEN[t.token_type] != NULL) {
+ name = NAMES_TOKEN[t.token_type];
+ } else {
+ printf("null token: %d:\n", t.token_type);
+ }
- printf("L%zu:%zu \t%-14s '", t.span.line, t.span.col, NAMES_TOKEN[t.token_type]);
+ printf("L%zu:%zu \t%-14s '", t.span.line, t.span.col, name);
fwrite(src + t.span.start, 1, t.span.stop - t.span.start, stdout);
printf("'\n");
@@ -127,52 +210,52 @@ print_tokens(struct lexer *lexer) {
}
static enum token_type
-compare_span_to_token(struct lexer *lexer, struct span ident) {
+compare_span_to_token(struct lexer *lexer, struct span ident, size_t len) {
enum token_type t = TOKEN_IDENT;
char c;
size_t i;
- if (ident.start >= lexer->src_len || ident.stop >= lexer->src_len) return false;
+ if (ident.start >= lexer->src_len || ident.stop > lexer->src_len) return t; /* stop is exclusive: stop == src_len is an identifier ending at end of input */
- if (strncmp(lexer->src + ident.start, "ns", 2) == 0)
+ if (len == 2 && strncmp(lexer->src + ident.start, "ns", 2) == 0) /* length first, so strncmp never reads past the span */
t = TOKEN_KEYWORD_NS;
- else if (strncmp(lexer->src + ident.start, "in", 2) == 0)
+ else if (len == 2 && strncmp(lexer->src + ident.start, "in", 2) == 0)
t = TOKEN_KEYWORD_IN;
- else if (strncmp(lexer->src + ident.start, "from", 4) == 0)
+ else if (len == 4 && strncmp(lexer->src + ident.start, "from", 4) == 0)
t = TOKEN_KEYWORD_FROM;
- else if (strncmp(lexer->src + ident.start, "use", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "use", 3) == 0)
t = TOKEN_KEYWORD_USE;
- else if (strncmp(lexer->src + ident.start, "ffi", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "ffi", 3) == 0)
t = TOKEN_KEYWORD_FFI;
- else if (strncmp(lexer->src + ident.start, "drop", 4) == 0)
+ else if (len == 4 && strncmp(lexer->src + ident.start, "drop", 4) == 0)
t = TOKEN_KEYWORD_DROP;
- else if (strncmp(lexer->src + ident.start, "as", 2) == 0)
+ else if (len == 2 && strncmp(lexer->src + ident.start, "as", 2) == 0)
t = TOKEN_KEYWORD_AS;
- else if (strncmp(lexer->src + ident.start, "of", 2) == 0)
+ else if (len == 2 && strncmp(lexer->src + ident.start, "of", 2) == 0)
t = TOKEN_KEYWORD_OF;
- else if (strncmp(lexer->src + ident.start, "and", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "and", 3) == 0)
t = TOKEN_KEYWORD_AND;
- else if (strncmp(lexer->src + ident.start, "or", 2) == 0)
+ else if (len == 2 && strncmp(lexer->src + ident.start, "or", 2) == 0)
t = TOKEN_KEYWORD_OR;
- else if (strncmp(lexer->src + ident.start, "ref", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "ref", 3) == 0)
t = TOKEN_KEYWORD_REF;
- else if (strncmp(lexer->src + ident.start, "struct", 6) == 0)
+ else if (len == 6 && strncmp(lexer->src + ident.start, "struct", 6) == 0)
t = TOKEN_KEYWORD_STRUCT;
- else if (strncmp(lexer->src + ident.start, "enum", 4) == 0)
+ else if (len == 4 && strncmp(lexer->src + ident.start, "enum", 4) == 0)
t = TOKEN_KEYWORD_ENUM;
- else if (strncmp(lexer->src + ident.start, "pre", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "pre", 3) == 0)
t = TOKEN_KEYWORD_PRE;
- else if (strncmp(lexer->src + ident.start, "post", 4) == 0)
+ else if (len == 4 && strncmp(lexer->src + ident.start, "post", 4) == 0)
t = TOKEN_KEYWORD_POST;
- else if (strncmp(lexer->src + ident.start, "inv", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "inv", 3) == 0)
t = TOKEN_KEYWORD_INV;
- else if (strncmp(lexer->src + ident.start, "if", 2) == 0)
+ else if (len == 2 && strncmp(lexer->src + ident.start, "if", 2) == 0)
t = TOKEN_KEYWORD_IF;
- else if (strncmp(lexer->src + ident.start, "else", 4) == 0)
+ else if (len == 4 && strncmp(lexer->src + ident.start, "else", 4) == 0)
t = TOKEN_KEYWORD_ELSE;
- else if (strncmp(lexer->src + ident.start, "where", 5) == 0)
+ else if (len == 5 && strncmp(lexer->src + ident.start, "where", 5) == 0)
t = TOKEN_KEYWORD_WHERE;
- else if (strncmp(lexer->src + ident.start, "pub", 3) == 0)
+ else if (len == 3 && strncmp(lexer->src + ident.start, "pub", 3) == 0)
t = TOKEN_KEYWORD_PUB;
return t;
}
diff --git a/mtcc/src/token.h b/mtcc/src/token.h
@@ -3,8 +3,7 @@
#include <stddef.h>
enum token_type {
- TOKEN_IDENT,
-
+ TOKEN_IDENT = 6,
TOKEN_KEYWORD_NS,
TOKEN_KEYWORD_IN,
TOKEN_KEYWORD_FROM,
@@ -25,20 +24,17 @@ enum token_type {
TOKEN_KEYWORD_ELSE,
TOKEN_KEYWORD_WHERE,
TOKEN_KEYWORD_PUB,
-
TOKEN_DOT,
TOKEN_COLON,
TOKEN_COLON_COLON,
TOKEN_COMMA,
TOKEN_EOF,
TOKEN_EQ,
-
TOKEN_LITERAL_BOOL,
TOKEN_LITERAL_INT,
TOKEN_LITERAL_FLOAT,
TOKEN_LITERAL_CHAR,
TOKEN_LITERAL_STR,
-
TOKEN_MINUS,
TOKEN_PLUS,
TOKEN_STAR,
@@ -48,7 +44,6 @@ enum token_type {
TOKEN_PLUS_EQ,
TOKEN_MINUS_MINUS,
TOKEN_PLUS_PLUS,
-
TOKEN_L_PAREN,
TOKEN_R_PAREN,
TOKEN_L_BRACKET,