lexer.c (10267B)
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lexer.h"
#include "array.h"
#include "str.h"
// Forward declarations of the file-local helpers defined after lexer_lex().

// Pushes a token onto the lexer's token array.
static void add_token(Lexer* lex, Token t);
// Appends one character to a token's lexeme.
static void add_to_string(Token* tok, char c);
// Returns the character after the cursor, or '\0' at the end of the input.
static char peek(Lexer* lex);
// Moves the cursor forward by one character, updating line/column counters.
static char advance(Lexer* lex);
// Advances the cursor until it sits on the given character.
static void run_until_char(Lexer* lex, char c);
// Lexes an integer or decimal literal starting at the cursor.
static void lex_number(Lexer* lex, Token* tok);
// Lexes an identifier starting at the cursor.
static void lex_ident(Lexer* lex, Token* tok);
// Creates an empty token stamped with the lexer's current position.
static Token new_token(Lexer* lex);
Lexer* lexer_lex(Lexer* lex)
{
Token t;
char c = '\0';
lex->tokens = calloc(250, sizeof(Token));
lex->state.pos = 0;
lex->state.line = 1;
lex->state.col = 0;
// longest valid token first
while (lex->state.pos < lex->code_len) {
c = lex->code[lex->state.pos];
t = new_token(lex);
if (c == '/' && peek(lex) == '/') {
run_until_char(lex, '\n');
continue;
}
if (isalpha((unsigned char)c)) {
lex_ident(lex, &t);
advance(lex);
continue;
}
if (isdigit((unsigned char)c)) {
lex_number(lex, &t);
continue;
}
switch (c) {
case '\"':
t.type = LIT_STRING;
advance(lex);
c = lex->code[lex->state.pos];
while (lex->code[lex->state.pos] != '\"' || lex->state.pos > lex->code_len) {
add_to_string(&t, c);
lex->state.col++;
c = lex->code[++lex->state.pos];
}
advance(lex);
add_token(lex, t);
continue;
case '=':
if (peek(lex) == '=') {
t.type = EQ_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '>') {
t.type = EQ_GT;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = EQ;
add_token(lex, t);
advance(lex);
continue;
case '!':
if (peek(lex) == '=') {
t.type = BANG_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = BANG;
add_token(lex, t);
advance(lex);
continue;
case '*':
if (peek(lex) == '=') {
t.type = STAR_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = STAR;
add_token(lex, t);
advance(lex);
continue;
case '/':
if (peek(lex) == '=') {
t.type = SLASH_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = SLASH;
add_token(lex, t);
advance(lex);
continue;
case '%':
if (peek(lex) == '=') {
t.type = PERCENT_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = PERCENT;
add_token(lex, t);
advance(lex);
continue;
case '&':
if (peek(lex) == '=') {
t.type = AMPERSAND_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '&') {
t.type = AMPERSAND_AMPERSAND;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = AMPERSAND;
add_token(lex, t);
advance(lex);
continue;
case '|':
if (peek(lex) == '=') {
t.type = PIPE_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '|') {
t.type = PIPE_PIPE;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = PIPE;
add_token(lex, t);
advance(lex);
continue;
case '+':
if (peek(lex) == '+') {
t.type = PLUS_PLUS;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '=') {
t.type = PLUS_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = PLUS;
add_token(lex, t);
advance(lex);
continue;
case '-':
if (peek(lex) == '>') {
t.type = MINUS_GT;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '-') {
t.type = MINUS_MINUS;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '=') {
t.type = MINUS_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = EQ;
add_token(lex, t);
advance(lex);
continue;
case '<':
if (peek(lex) == '=') {
t.type = LT_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '<') {
t.type = LT_LT;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '-') {
t.type = LT_MINUS;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = LT;
add_token(lex, t);
advance(lex);
continue;
case '>':
if (peek(lex) == '=') {
t.type = GT_EQ;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
if (peek(lex) == '>') {
t.type = GT_GT;
add_token(lex, t);
advance(lex);
advance(lex);
continue;
}
t.type = GT;
add_token(lex, t);
advance(lex);
continue;
case ';':
t.type = SEMICOL;
add_token(lex, t);
advance(lex);
continue;
case '(':
t.type = LPAREN;
add_token(lex, t);
advance(lex);
continue;
case ')':
t.type = RPAREN;
add_token(lex, t);
advance(lex);
continue;
case '{':
t.type = LBRACE;
add_token(lex, t);
advance(lex);
continue;
case '}':
t.type = RBRACE;
add_token(lex, t);
advance(lex);
continue;
case '\n':
case '\r':
case '\t':
case ' ':
advance(lex);
continue;
break;
}
printf("unhandled: %s: %zu:%zu %c\n", lex->filename, lex->state.line, lex->state.col, c);
advance(lex);
}
return lex;
}
// Appends token t to the lexer's token array by handing lex->tokens,
// lex->len and lex->cap to the ARRAY_PUSH macro.
// NOTE(review): ARRAY_PUSH is defined outside this file (presumably array.h);
// its growth and failure behaviour is not visible here -- confirm.
static void add_token(Lexer* lex, Token t)
{
ARRAY_PUSH(lex->tokens, lex->len, lex->cap, t);
}
// Appends the single character c to the lexeme of tok.
static void add_to_string(Token* tok, char c)
{
    str_append(&tok->lexeme, c);
}
// Looks one character past the cursor without moving it.
// Returns '\0' when that position is at or beyond the end of the input.
static char peek(Lexer* lex)
{
    const size_t ahead = lex->state.pos + 1;

    return (ahead < lex->code_len) ? lex->code[ahead] : '\0';
}
// Moves the cursor forward by exactly one character and updates the
// line/column counters.
//
// The counters follow the character being stepped ONTO (the one peek()
// reports): stepping onto '\n' increments the line and resets the column to 0;
// any other character increments the column. Returns that character, which is
// '\0' when the step lands at or past the end of the input.
//
// Earlier, when the next character was '\r', this function called itself
// before moving pos, so peek() kept reporting the same '\r' and the recursion
// never ended. A '\r' is now stepped onto like any other character; lexer_lex
// skips it as whitespace, and the '\n' of a CRLF pair still bumps the line.
static char advance(Lexer* lex)
{
    const char c = peek(lex);

    if (c == '\n') {
        lex->state.line++;
        lex->state.col = 0;
    } else {
        lex->state.col++;
    }
    lex->state.pos++;
    return c;
}
// Advances the cursor (at least one step) until it sits on the next occurrence
// of c, or at the end of the input if c does not occur again.
//
// Without the end-of-input bound, a target that never appears (e.g. a "//"
// comment on the last line with no trailing newline) made peek() return '\0'
// forever and the loop never terminated.
static void run_until_char(Lexer* lex, char c)
{
    do {
        advance(lex);
    } while (lex->state.pos + 1 < lex->code_len && peek(lex) != c);

    // Step onto the target character, or onto the end of the input.
    advance(lex);
}
// Reports a fatal lexing error and terminates the process.
// Writes "<filename> <line>:<col> <message>" to stderr (no newline is added;
// the message is expected to carry its own) and exits with status 1.
static void err(Lexer* lex, const char* message)
{
    const size_t line = lex->state.line;
    const size_t column = lex->state.col;

    fprintf(stderr, "%s %zu:%zu %s", lex->filename, line, column, message);
    exit(1);
}
// Lexes a numeric literal starting at the cursor and pushes it as a token.
//
// The token starts as LIT_INT and becomes LIT_DECIMAL at the first '.'.
// A second '.' is a fatal error (err() exits). While the literal is still an
// integer, '_' separators are consumed but not stored in the lexeme.
// On return the cursor is one past the last character of the literal.
static void lex_number(Lexer* lex, Token* tok)
{
    Str* digits = &tok->lexeme;

    tok->type = LIT_INT;
    str_append(digits, lex->code[lex->state.pos]);

    while (lex->state.pos < lex->code_len) {
        const char next = peek(lex);

        if (next == '_' && tok->type == LIT_INT) {
            advance(lex); // allow _ in large integers
            continue;
        }

        if (next == '.') {
            if (tok->type == LIT_DECIMAL) {
                err(lex, "parsing number failed with more than one decimal point '.'\n");
            } else if (tok->type == LIT_INT) {
                tok->type = LIT_DECIMAL;
            }
        } else if (!isdigit((unsigned char)next)) {
            break;
        }

        str_append(digits, next);
        advance(lex);
    }

    // The loop leaves the cursor ON the final character; move past it.
    advance(lex);
    add_token(lex, *tok);
}
// Lexes an identifier starting at the cursor and pushes it as an IDENT token.
//
// Characters are appended to the lexeme for as long as the following character
// is alphanumeric. On return the cursor is still ON the last character of the
// identifier; the caller is responsible for stepping past it.
static void lex_ident(Lexer* lex, Token* tok)
{
    Str* name = &tok->lexeme;
    char ch = lex->code[lex->state.pos];

    tok->type = IDENT;
    for (; lex->state.pos < lex->code_len; advance(lex)) {
        str_append(name, ch);
        ch = peek(lex);
        if (!isalnum((unsigned char)ch)) {
            break;
        }
    }
    add_token(lex, *tok);
}
// Builds a blank token stamped with the lexer's file name, path and current
// line/column. The type is NOTYETSET and every other field, including the
// lexeme, is zero-initialized.
static Token new_token(Lexer* lex)
{
    Token fresh = {0};

    fresh.filename = lex->filename;
    fresh.path = lex->path;
    fresh.col = lex->state.col;
    fresh.line = lex->state.line;
    fresh.type = NOTYETSET;
    return fresh;
}