sic

The sic programming language, compiler and tools (WIP)
Log | Files | Refs

commit 05c4135547a761744cd1553980fed2b9f853cade
parent 20834b8524da952999b7611c212dc3a3fae92e15
Author: citbl <citbl@citbl.org>
Date:   Sun, 10 May 2026 20:02:55 +1000

better lexing

Diffstat:
M src/common.h | 2 ++
M src/lexer.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M test.sic | 1 +
3 files changed, 64 insertions(+), 42 deletions(-)

diff --git a/src/common.h b/src/common.h
@@ -78,6 +78,8 @@ typedef struct Token {

 typedef struct Lexer_State {
     size_t pos;
+    size_t line;
+    size_t col;
     bool in_string;
     bool in_comment;
 } Lexer_State;
diff --git a/src/lexer.c b/src/lexer.c
@@ -1,18 +1,19 @@
 #include "lexer.h"
 #include <stdio.h>
 #include <string.h>
+#include <ctype.h>

-static void add_token(Lexer *l, Token t)
+static void add_token(Lexer *lex, Token t)
 {
-    if (l->len >= l->cap) {
-        l->cap = l->cap == 0 ? 256 : l->cap * 2;
-        l->tokens = realloc(l->tokens, l->cap * sizeof(Token));
+    if (lex->len >= lex->cap) {
+        lex->cap = lex->cap == 0 ? 256 : lex->cap * 2;
+        lex->tokens = realloc(lex->tokens, lex->cap * sizeof(Token));
     }

-    l->tokens[l->len++] = t;
+    lex->tokens[lex->len++] = t;
 }

-void print_tokens(Lexer *l)
+void print_tokens(Lexer *lex)
 {
     Token t;
     Token_Type typ;
@@ -20,13 +21,13 @@ void print_tokens(Lexer *l)

     printf("------- print tokens --------\n");

-    for (i = 0; i < l->len; i++) {
-        t = l->tokens[i];
+    for (i = 0; i < lex->len; i++) {
+        t = lex->tokens[i];
         typ = t.type;

         switch (typ) {
         case LIT_STRING:
-            printf("STRING LITERAL: %s\n", l->tokens[i].value.as_string.value);
+            printf("STRING LITERAL: %s\n", lex->tokens[i].value.as_string.value);
             break;
         default:
             printf("print_tokens: unhandled token %i", typ);
@@ -35,9 +36,9 @@ void print_tokens(Lexer *l)
     }
 }

-static void add_to_string(Token *t, char c)
+static void add_to_string(Token *tok, char c)
 {
-    String *str = &t->value.as_string;
+    String *str = &tok->value.as_string;

     if (str->len >= str->cap) {
         str->cap = str->cap == 0 ? 256 : str->cap * 2;
@@ -48,72 +49,90 @@ static void add_to_string(Token *t, char c)
     str->value[str->len] = '\0';
 }

-static char peek(Lexer *l)
+static char peek(Lexer *lex)
 {
-    size_t next = l->state.pos + 1;
+    size_t next = lex->state.pos + 1;

-    if (next >= l->code_len) {
+    if (next >= lex->code_len) {
         return '\0';
     }

-    return l->code[next];
+    return lex->code[next];
 }

-static void run_until_char(Lexer *l, char c)
+static char advance(Lexer *lex)
+{
+    const char c = peek(lex);
+    // if (!c) return c;
+    if (c == '\n') {
+        lex->state.line++;
+        lex->state.col = 1;
+    }
+    else {
+        lex->state.col++;
+    }
+    lex->state.pos++;
+    return c;
+}
+
+static void run_until_char(Lexer *lex, char c)
 {
     do {
-        l->state.pos++;
-    } while (peek(l) != c);
-    l->state.pos++;
+        advance(lex);
+    } while (peek(lex) != c);
+    advance(lex);
 }

-Lexer *lexer_lex(Lexer *l)
+Lexer *lexer_lex(Lexer *lex)
 {
     char c = '\0';
-    size_t len = strlen(l->code);
+    size_t len = strlen(lex->code);
     Token t = {
-        .filename = l->filename, .path = l->path, .col = -1, .line = -1, .type = NOTYETSET, .value = {0}};
+        .filename = lex->filename, .path = lex->path, .col = -1, .line = -1, .type = NOTYETSET, .value = {0}};

-    l->tokens = calloc(250, sizeof(Token));
-    l->state.pos = 0;
-    l->state.in_string = false;
+    lex->tokens = calloc(250, sizeof(Token));
+    lex->state.pos = 0;
+    lex->state.in_string = false;

     // longest valid token first
-    while (l->state.pos <= len) {
-        c = l->code[l->state.pos];
+    while (lex->state.pos <= len) {
+        c = lex->code[lex->state.pos];

-        if (c == '/' && peek(l) == '/') {
-            run_until_char(l, '\n');
+        if (c == '/' && peek(lex) == '/') {
+            run_until_char(lex, '\n');
             continue;
         }

+        if (isdigit(c)) {
+        }
+
         switch (c) {
         case '\"':
-            l->state.in_string = true;
+            lex->state.in_string = true;
             t.type = LIT_STRING;
-            c = l->code[++l->state.pos];
+            advance(lex);
+            c = lex->code[lex->state.pos];

-            while (l->code[l->state.pos] != '\"') {
+            while (lex->code[lex->state.pos] != '\"') {
                 add_to_string(&t, c);
-                c = l->code[++l->state.pos];
+                c = lex->code[++lex->state.pos];
             }
-
-            l->state.pos++;
-            l->state.in_string = false;
-            add_token(l, t);
+            advance(lex);
+            lex->state.in_string = false;
+            add_token(lex, t);
             continue;
         case EOF:
-            return l;
+            return lex;
         case '\n':
         case '\r':
-            l->state.pos++;
+            advance(lex);
             continue;
             break;
         }

-        printf("unhandled: %zu: %c\n", l->state.pos, c);
-        l->state.pos++;
+        printf("unhandled: %zu: %c\n", lex->state.pos, c);
+        advance(lex);
     }

-    return l;
+    return lex;
 }
diff --git a/test.sic b/test.sic
@@ -3,3 +3,4 @@
 void main() {
     str name = "Johnny Mnemonic";
 }
+