sic

The sic programming language, compiler and tools (WIP)
Log | Files | Refs

commit dd0da306d6a210eb93c26af69cf9e484125b0c9c
parent e96299d559a92fb9d377da134e5bd502e8dad946
Author: citbl <citbl@citbl.org>
Date:   Wed, 13 May 2026 23:07:48 +1000

lexer progress

Diffstat:
M src/lexer.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
M src/token.h | 1 +
2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/src/lexer.c b/src/lexer.c @@ -23,6 +23,8 @@ void print_tokens(Lexer* lex) t = lex->tokens[i]; typ = t.type; + printf("%s: %zu:%zu ", t.filename, t.line, t.col); + switch (typ) { case LIT_STRING: printf("STRING LITERAL: %s\n", lex->tokens[i].lexeme.value); @@ -33,8 +35,17 @@ void print_tokens(Lexer* lex) case LIT_INT: printf("DECIMAL LITERAL: %s\n", lex->tokens[i].lexeme.value); break; + case EQ: + printf("EQ =\n"); + break; + case SEMICOL: + printf("SEMICOL ;\n"); + break; + case IDENT: + printf("IDENT: %s\n", lex->tokens[i].lexeme.value); + break; default: - printf("print_tokens: unhandled token %i", typ); + printf("print_tokens: unhandled token %i\n", typ); break; } } @@ -60,9 +71,10 @@ static char peek(Lexer* lex) static char advance(Lexer* lex) { const char c = peek(lex); + if (c == '\r') advance(lex); if (c == '\n') { lex->state.line++; - lex->state.col = 1; + lex->state.col = 0; } else { lex->state.col++; @@ -89,19 +101,23 @@ static void lex_number(Lexer* lex, Token* tok) { char c = lex->code[lex->state.pos]; Str* str = &tok->lexeme; - str_append(str, c); tok->type = LIT_INT; + str_append(str, c); while (lex->state.pos < lex->code_len) { - advance(lex); c = peek(lex); + if (c == '_' && tok->type == LIT_INT) { + advance(lex); // allow _ in large integers + continue; + } + if (c != '.' && !isdigit((unsigned char)c)) break; if (c == '.' && tok->type == LIT_DECIMAL) { err(lex, "parsing number failed with more than one decimal point '.'\n"); } if (c == '.' && tok->type == LIT_INT) tok->type = LIT_DECIMAL; - if (c == '_' && tok->type == LIT_INT) continue; // allow _ in large integers - if (c != '.' 
&& !isdigit((unsigned char)c)) break; str_append(str, c); + advance(lex); } + advance(lex); add_token(lex, *tok); } @@ -109,13 +125,13 @@ static void lex_ident(Lexer* lex, Token* tok) { char c = lex->code[lex->state.pos]; Str* str = &tok->lexeme; - str_append(str, c); tok->type = IDENT; while (lex->state.pos < lex->code_len) { - advance(lex); + str_append(str, c); + // printf("char: %c\n", c); c = peek(lex); if (!isalnum((unsigned char)c)) break; - str_append(str, c); + advance(lex); } add_token(lex, *tok); } @@ -140,7 +156,7 @@ Lexer* lexer_lex(Lexer* lex) lex->tokens = calloc(250, sizeof(Token)); lex->state.pos = 0; lex->state.line = 1; - lex->state.col = 1; + lex->state.col = 0; // longest valid token first while (lex->state.pos < lex->code_len) { @@ -153,9 +169,12 @@ Lexer* lexer_lex(Lexer* lex) } if (isalpha((unsigned char)c)) { lex_ident(lex, &t); + advance(lex); + continue; } if (isdigit((unsigned char)c)) { lex_number(lex, &t); + continue; } switch (c) { @@ -166,13 +185,25 @@ Lexer* lexer_lex(Lexer* lex) while (lex->code[lex->state.pos] != '\"' || lex->state.pos > lex->code_len) { add_to_string(&t, c); + lex->state.col++; c = lex->code[++lex->state.pos]; } advance(lex); add_token(lex, t); continue; + case '=': + t.type = EQ; + add_token(lex, t); + advance(lex); + continue; + case ';': + t.type = SEMICOL; + add_token(lex, t); + advance(lex); + continue; case '\n': case '\r': + case ' ': advance(lex); continue; break; diff --git a/src/token.h b/src/token.h @@ -24,6 +24,7 @@ typedef enum Token_Type { DASH_GT, EQ, BANG, + SEMICOL, LIT_STRING, LIT_DECIMAL, LIT_INT,