sic

The sic programming language, compiler and tools (WIP)
Log | Files | Refs

commit 772c7870a795bda9fe1654a98576db9bcc7e47d1
parent b4580c3da03225f57a867d02f01f4d3ce4ab6fac
Author: citbl <citbl@citbl.org>
Date:   Sun, 10 May 2026 14:57:33 +1000

wip lexer

Diffstat:
M .clang-format | 6 +++---
M .clangd | 6 +++---
M language.sic | 5 +++--
M src/common.h | 55 +++++++++++++++++++++++++++++++++++++++++--------------
M src/lexer.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M src/main.c | 11 ++++++-----
M src/utils.c | 1 +
A test.sic | 9 +++++++++
8 files changed, 137 insertions(+), 33 deletions(-)

diff --git a/.clang-format b/.clang-format @@ -8,7 +8,7 @@ PointerAlignment: Right AlignTrailingComments: true SpacesBeforeTrailingComments: 1 KeepEmptyLinesAtTheStartOfBlocks: false -AllowShortBlocksOnASingleLine: true +AllowShortBlocksOnASingleLine: false AllowShortIfStatementsOnASingleLine: true AllowShortCaseLabelsOnASingleLine: false AllowShortEnumsOnASingleLine: true @@ -19,10 +19,10 @@ AlignConsecutiveMacros: false SortIncludes: false IndentCaseLabels: false -ColumnLimit: 135 +ColumnLimit: 80 PenaltyBreakBeforeFirstCallParameter: 1 AlignAfterOpenBracket: DontAlign BinPackArguments: false BinPackParameters: false -BreakAfterReturnType: TopLevelDefinitions +#BreakAfterReturnType: TopLevelDefinitions diff --git a/.clangd b/.clangd @@ -1,8 +1,8 @@ CompileFlags: Add: [ - -Wall, - -Wextra, - -Wpedantic, + #-Wall, + #-Wextra, + #-Wpedantic, -Wshadow, -Wpointer-arith, -Wcast-qual, diff --git a/language.sic b/language.sic @@ -2,12 +2,15 @@ int ~age = 25 dec height = 87.5 str name = "George" +// some comment here +// another one here int main { str name = arg[0] print(name) return 0 } +// definition of human struct def Human { str name dec height @@ -58,5 +61,3 @@ int y = 12 if cast(int, x) == y { print("yay") } - - diff --git a/src/common.h b/src/common.h @@ -3,8 +3,8 @@ #include <stdbool.h> #include <stddef.h> -typedef enum TokenType { - TERMINUS = 4, +typedef enum Token_Type { + NOTYETSET = 7, IDENT, KEYWORD, SYMBOL, @@ -26,31 +26,58 @@ typedef enum TokenType { DASH_GT, EQ, BANG, - NAMESPACE -} TokenType; + LIT_STRING, + LIT_DECIMAL, + LIT_INT, + LIT_BOOL, + LIT_VOID, +} Token_Type; -typedef enum Keyword { IF = 130, ELSE, WHILE, OPT, LAZY, MATCH, DEF, FOR, EACH, SOME, NONE, OK, ERR, IN, IS, CAST } Keyword; +typedef enum Keyword { + IF = 137, + ELSE, + WHILE, + OPT, + LAZY, + MATCH, + DEF, + FOR, + EACH, + SOME, + NONE, + OK, + ERR, + IN, + IS, + CAST +} Keyword; typedef struct Token { - TokenType type; + Token_Type type; union Value { - char* as_string; + char 
*as_string; char as_char; size_t as_int; bool as_bool; } value; - const char* path; - const char* filename; + const char *path; + const char *filename; size_t line; size_t col; } Token; -typedef struct Lexer { - const char* code; - const char* path; - const char* filename; +typedef struct Lexer_State { size_t pos; - Token* tokens; + bool in_string; + bool in_comment; +} Lexer_State; + +typedef struct Lexer { + const char *code; + const char *path; + const char *filename; + Lexer_State state; + Token *tokens; size_t count; size_t cap; } Lexer; diff --git a/src/lexer.c b/src/lexer.c @@ -1,9 +1,9 @@ #include "lexer.h" #include <stdio.h> #include <stdlib.h> +#include <string.h> -static void -add_token(Lexer *l, Token t) +static void emit_token(Lexer *l, Token t) { if (l->count >= l->cap) { l->cap *= 2; @@ -13,11 +13,76 @@ add_token(Lexer *l, Token t) l->tokens[l->count++] = t; } -Lexer * -lexer_lex(Lexer *l) +static void add_to_string(char *str, char c) { - Token tok = { 0 }; +} + +static char peek(Lexer *l) +{ + char c = l->code[l->state.pos + 1]; + // printf("PEEK: %c\n", c); + return c; +} + +static char consume(Lexer *l) +{ + return l->code[++l->state.pos]; +} + +static void run_until_char(Lexer *l, char c) +{ + do { + // printf("%zu", l->state.pos); + l->state.pos++; + } while (peek(l) != c); +} + +Lexer *lexer_lex(Lexer *l) +{ + char c = '\0'; + size_t len = strlen(l->code); + Token t = { .filename = l->filename, + .path = l->path, + .col = -1, + .line = -1, + .type = NOTYETSET }; + l->tokens = calloc(250, sizeof(Token)); - add_token(l, tok); + l->state.pos = 0; + l->state.in_string = false; + + // longest valid token first + while (l->state.pos <= len) { + c = l->code[l->state.pos]; + + if (c == '/' && peek(l) == '/') { + // continue until the end of the line + run_until_char(l, '\n'); + l->state.pos++; + continue; + } + + switch (c) { + case '\"': + l->state.in_string = true; + run_until_char(l, '\"'); // TODO buffer up the string + t.type = LIT_STRING; + 
emit_token(l, t); + + break; + case EOF: + return l; + case '\n': + case '\r': + l->state.pos++; + continue; + break; + } + + // printf("%zu", l->state.pos); + printf("unhandled: %c\n", c); + l->state.pos++; + } + return l; } diff --git a/src/main.c b/src/main.c @@ -1,9 +1,9 @@ +#include <stdio.h> + #include "lexer.h" #include "utils.h" -#include <stdio.h> -int -main(int argc, char **args) +int main(int argc, char **args) { char *filename; char *contents; @@ -16,9 +16,10 @@ main(int argc, char **args) contents = read_file(filename); if (contents == NULL) return 1; - - printf("%s\n", contents); + lexer.code = contents; lexer = *lexer_lex(&lexer); + + printf("\n"); return 0; } diff --git a/src/utils.c b/src/utils.c @@ -1,6 +1,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include "utils.h" char * read_file(const char *filename) diff --git a/test.sic b/test.sic @@ -0,0 +1,9 @@ +// this is a comment + +int jack = 5; + +// another one + +void main() { + +}