sic

The sic programming language, compiler and tools (WIP)
Log | Files | Refs

lexer.c (10267B)




#include <string.h>
#include <ctype.h>

#include "lexer.h"
#include "array.h"
#include "str.h"

static void add_token(Lexer* lex, Token t);
static void add_to_string(Token* tok, char c);
static char peek(Lexer* lex);
static char advance(Lexer* lex);
static void run_until_char(Lexer* lex, char c);
static void lex_number(Lexer* lex, Token* tok);
static void lex_ident(Lexer* lex, Token* tok);
static Token new_token(Lexer* lex);

// Tokenise the buffer lex->code (lex->code_len bytes) into lex->tokens.
//
// The scan state is reset to position 0, line 1, column 0 and a fresh token
// array is allocated. Identifiers, numbers, string literals, operators and
// punctuation are appended through add_token(); whitespace and `//` comments
// are skipped. Characters that match no rule are reported on stdout and
// skipped so lexing can carry on.
//
// Returns lex.
Lexer* lexer_lex(Lexer* lex)
{
    // NOTE(review): lex->len and lex->cap are not initialised here; whether
    // ARRAY_PUSH treats these 250 slots as the starting capacity depends on
    // the caller / array.h -- confirm.
    lex->tokens = calloc(250, sizeof(Token));
    lex->state.pos = 0;
    lex->state.line = 1;
    lex->state.col = 0;

    while (lex->state.pos < lex->code_len) {
        const char c = lex->code[lex->state.pos];
        const char next = peek(lex);  // '\0' when c is the last character
        Token t = new_token(lex);
        int width = 1;  // characters consumed by an operator/punctuation token

        if (c == '/' && next == '/') {
            run_until_char(lex, '\n');
            continue;
        }
        if (isalpha((unsigned char)c)) {
            // lex_ident() stops on the identifier's last character.
            lex_ident(lex, &t);
            advance(lex);
            continue;
        }
        if (isdigit((unsigned char)c)) {
            lex_number(lex, &t);
            continue;
        }

        // longest valid token first
        switch (c) {
        case '"':
            t.type = LIT_STRING;
            advance(lex);  // step past the opening quote

            // Bounded scan: an unterminated literal must stop at the end of
            // the buffer instead of reading past it.
            while (lex->state.pos < lex->code_len && lex->code[lex->state.pos] != '"') {
                add_to_string(&t, lex->code[lex->state.pos]);
                lex->state.col++;
                lex->state.pos++;
            }
            if (lex->state.pos < lex->code_len) {
                advance(lex);  // step past the closing quote
            }
            else {
                printf("unterminated string: %s: %zu:%zu\n", lex->filename, (size_t)t.line, (size_t)t.col);
            }
            add_token(lex, t);
            continue;

        case '\n':
        case '\r':
        case '\t':
        case ' ':
            advance(lex);
            continue;

        case '=':
            if (next == '=') { t.type = EQ_EQ; width = 2; }
            else if (next == '>') { t.type = EQ_GT; width = 2; }
            else { t.type = EQ; }
            break;
        case '!':
            if (next == '=') { t.type = BANG_EQ; width = 2; }
            else { t.type = BANG; }
            break;
        case '*':
            if (next == '=') { t.type = STAR_EQ; width = 2; }
            else { t.type = STAR; }
            break;
        case '/':
            if (next == '=') { t.type = SLASH_EQ; width = 2; }
            else { t.type = SLASH; }
            break;
        case '%':
            if (next == '=') { t.type = PERCENT_EQ; width = 2; }
            else { t.type = PERCENT; }
            break;
        case '&':
            if (next == '=') { t.type = AMPERSAND_EQ; width = 2; }
            else if (next == '&') { t.type = AMPERSAND_AMPERSAND; width = 2; }
            else { t.type = AMPERSAND; }
            break;
        case '|':
            if (next == '=') { t.type = PIPE_EQ; width = 2; }
            else if (next == '|') { t.type = PIPE_PIPE; width = 2; }
            else { t.type = PIPE; }
            break;
        case '+':
            if (next == '+') { t.type = PLUS_PLUS; width = 2; }
            else if (next == '=') { t.type = PLUS_EQ; width = 2; }
            else { t.type = PLUS; }
            break;
        case '-':
            if (next == '>') { t.type = MINUS_GT; width = 2; }
            else if (next == '-') { t.type = MINUS_MINUS; width = 2; }
            else if (next == '=') { t.type = MINUS_EQ; width = 2; }
            else { t.type = MINUS; }  // was EQ: a lone '-' is not an assignment
            break;
        case '<':
            if (next == '=') { t.type = LT_EQ; width = 2; }
            else if (next == '<') { t.type = LT_LT; width = 2; }
            else if (next == '-') { t.type = LT_MINUS; width = 2; }
            else { t.type = LT; }
            break;
        case '>':
            if (next == '=') { t.type = GT_EQ; width = 2; }
            else if (next == '>') { t.type = GT_GT; width = 2; }
            else { t.type = GT; }
            break;
        case ';':
            t.type = SEMICOL;
            break;
        case '(':
            t.type = LPAREN;
            break;
        case ')':
            t.type = RPAREN;
            break;
        case '{':
            t.type = LBRACE;
            break;
        case '}':
            t.type = RBRACE;
            break;

        default:
            printf("unhandled: %s: %zu:%zu %c\n", lex->filename, lex->state.line, lex->state.col, c);
            advance(lex);
            continue;
        }

        // Shared emit path for every operator/punctuation case above.
        add_token(lex, t);
        for (int i = 0; i < width; i++) {
            advance(lex);
        }
    }

    return lex;
}

// Append token `t` (passed by value) to the lexer's token array.
// The array pointer, length and capacity fields of `lex` are handed to the
// ARRAY_PUSH macro; presumably it grows the array when full -- confirm
// against array.h.
static void add_token(Lexer* lex, Token t)
{
    ARRAY_PUSH(lex->tokens, lex->len, lex->cap, t);
}

// Append the character `c` to the lexeme of `tok`.
static void add_to_string(Token* tok, char c)
{
    str_append(&tok->lexeme, c);
}

// Look at the character one past the current scan position without
// consuming anything. Yields '\0' when that position is outside the buffer.
static char peek(Lexer* lex)
{
    const size_t ahead = lex->state.pos + 1;

    return (ahead < lex->code_len) ? lex->code[ahead] : '\0';
}

// Move the scan position forward by one character and update line/column.
//
// Bookkeeping is driven by the character being moved onto (the one peek()
// sees): moving onto '\n' starts a new line with column 0, anything else
// bumps the column. Carriage returns ahead of the position are stepped over
// first without touching line or column.
//
// Returns the character the position was moved onto, or '\0' past the end.
static char advance(Lexer* lex)
{
    char c = peek(lex);

    // The previous version recursed here before `pos` had changed, so the
    // recursive call saw the same '\r' again and never terminated. Skip CRs
    // iteratively; peek() returns '\0' past the end, which bounds the loop.
    while (c == '\r') {
        lex->state.pos++;
        c = peek(lex);
    }

    if (c == '\n') {
        lex->state.line++;
        lex->state.col = 0;
    }
    else {
        lex->state.col++;
    }
    lex->state.pos++;
    return c;
}

// Skip forward until the scan position rests on the next occurrence of `c`.
// If `c` does not occur in the rest of the input, the position ends up at or
// past the end of the buffer instead.
static void run_until_char(Lexer* lex, char c)
{
    do {
        advance(lex);
        // peek() keeps returning '\0' past the end of input, so without the
        // explicit bound a missing `c` (e.g. a comment on the last line with
        // no trailing newline) would loop forever.
    } while (lex->state.pos + 1 < lex->code_len && peek(lex) != c);
    advance(lex);
}

// Print "<filename> <line>:<col> <message>" to stderr for the lexer's
// current position, then terminate the process with exit status 1.
// No newline is appended; `message` is written exactly as given.
static void err(Lexer* lex, const char* message)
{
    fprintf(stderr,
            "%s %zu:%zu %s",
            lex->filename,
            lex->state.line,
            lex->state.col,
            message);
    exit(1);
}

// Lex a numeric literal starting at the current position and append it to
// the token array.
//
// The token starts as LIT_INT and becomes LIT_DECIMAL on the first '.'.
// A second '.' is fatal (err() exits). While the token is still an integer,
// '_' characters are skipped and not stored in the lexeme. On return the
// scan position is one past the literal's last character.
static void lex_number(Lexer* lex, Token* tok)
{
    Str* digits = &tok->lexeme;

    tok->type = LIT_INT;
    str_append(digits, lex->code[lex->state.pos]);

    // Each completed iteration consumes one character via the loop's advance().
    for (; lex->state.pos < lex->code_len; advance(lex)) {
        const char next = peek(lex);

        if (next == '_' && tok->type == LIT_INT) {
            continue;  // allow _ in large integers
        }

        if (next == '.') {
            if (tok->type == LIT_DECIMAL) {
                err(lex, "parsing number failed with more than one decimal point '.'\n");
            }
            tok->type = LIT_DECIMAL;
        }
        else if (!isdigit((unsigned char)next)) {
            break;
        }

        str_append(digits, next);
    }

    advance(lex);
    add_token(lex, *tok);
}

// Lex an identifier starting at the current position and append it to the
// token array as an IDENT token.
//
// Characters are collected for as long as the following one is alphanumeric.
// On return the scan position still rests on the identifier's last
// character; stepping past it is left to the caller.
static void lex_ident(Lexer* lex, Token* tok)
{
    Str* name = &tok->lexeme;
    char cur = lex->code[lex->state.pos];

    tok->type = IDENT;
    for (; lex->state.pos < lex->code_len; advance(lex)) {
        str_append(name, cur);
        cur = peek(lex);
        if (!isalnum((unsigned char)cur)) {
            break;
        }
    }
    add_token(lex, *tok);
}

// Build a blank token stamped with the lexer's file name, path and current
// line/column. The type is NOTYETSET and the lexeme is zero-initialised.
static Token new_token(Lexer* lex)
{
    // The designated initialiser zeroes every member not named here,
    // including the lexeme.
    Token fresh = { .type = NOTYETSET };

    fresh.filename = lex->filename;
    fresh.path = lex->path;
    fresh.line = lex->state.line;
    fresh.col = lex->state.col;
    return fresh;
}