// lexer.c (10198B)
#include "lexer.h"
#include "types.h"
#include "utils.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <assert.h>
// Look at the byte under the cursor without consuming it.
// The result is 0 exactly when the cursor sits on the source's NUL terminator.
static char
peek(Lexer* lex)
{
    return lex->src[lex->pos];
}
// Look one byte past the cursor without consuming anything.
// Returns 0 when the cursor is already on the NUL terminator (so the byte
// after the terminator is never read), otherwise the following byte, which
// may itself be the terminating 0.
static char
peek2(Lexer* lex)
{
    const char* cursor = lex->src + lex->pos;
    return (cursor[0] != 0) ? cursor[1] : 0;
}
// Consume one byte and keep the line/column bookkeeping in sync.
// Returns the consumed byte, or 0 (consuming nothing) at end of input.
static char
nudge(Lexer* lex)
{
    const char current = peek(lex);
    if (current == 0) return 0;

    lex->pos++;
    if (current != '\n') {
        lex->col++;
        return current;
    }

    // A line break moves to the first column of the next line.
    lex->line++;
    lex->col = 1;
    return current;
}
// Advance past everything that separates tokens: spaces, tabs, carriage
// returns, line feeds and `//` comments. A comment runs up to (but not
// including) the next '\n' or the end of input; the newline itself is then
// consumed as ordinary whitespace on the next pass.
static void
skip_space_and_comments(Lexer* lex)
{
    for (;;) {
        switch (peek(lex)) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            nudge(lex);
            continue; // keep eating blanks
        default:
            break;
        }

        // Not a blank: either a line comment starts here, or we are done.
        if (peek(lex) != '/' || peek2(lex) != '/') return;

        nudge(lex);
        nudge(lex);
        for (char c = peek(lex); c != '\n' && c != 0; c = peek(lex)) {
            nudge(lex);
        }
    }
}
// Lex an identifier or keyword.
//
// Consumes the run of '_' and alphanumeric bytes at the cursor, then
// classifies the spelling: an exact match (same length, same bytes) against
// the keyword table yields that keyword's token type; anything else is
// TOKEN_IDENT.
//
// pos/line/col are the token's start offset and position as captured by the
// caller; they are stored in the returned Token together with the end offset
// (lex->pos once scanning stops).
//
// Built-in type names (int, float, void, string) and Capitalised custom type
// names are deliberately NOT classified here and stay TOKEN_IDENT.
// -- @later do it in the parser, keep types as unprotected names
static Token
make_ident(Lexer* lex, size_t pos, size_t line, size_t col)
{
#define KEYWORD(text, token) { text, sizeof(text) - 1, token }
    static const struct {
        const char* text;
        size_t      len;
        TokenType   type;
    } KEYWORDS[] = {
        KEYWORD("return", TOKEN_RETURN),
        KEYWORD("for", TOKEN_FOR),
        KEYWORD("if", TOKEN_IF),
        KEYWORD("else", TOKEN_ELSE),
        KEYWORD("continue", TOKEN_CONTINUE),
        KEYWORD("break", TOKEN_BREAK),
        KEYWORD("while", TOKEN_WHILE),
        KEYWORD("struct", TOKEN_STRUCT),
        KEYWORD("extend", TOKEN_EXTEND),
        KEYWORD("true", TOKEN_BOOL_TRUE_LITERAL),
        KEYWORD("false", TOKEN_BOOL_FALSE_LITERAL),
        KEYWORD("fx", TOKEN_FX),
        KEYWORD("fn", TOKEN_FN),
    };
#undef KEYWORD

    for (;;) {
        const char c = peek(lex);
        // <ctype.h> classifiers require a value representable as unsigned
        // char; a plain (possibly negative) char is undefined behaviour.
        if (c != '_' && !isalnum((unsigned char)c)) {
            break;
        }
        nudge(lex);
    }

    const size_t len = lex->pos - pos;
    TokenType type = TOKEN_IDENT;
    for (size_t i = 0; i < sizeof KEYWORDS / sizeof KEYWORDS[0]; i++) {
        if (KEYWORDS[i].len == len && strncmp(lex->src + pos, KEYWORDS[i].text, len) == 0) {
            type = KEYWORDS[i].type;
            break;
        }
    }

    return (Token) { .type = type, .start = pos, .line = line, .col = col, .end = lex->pos };
}
// Lex an integer or float literal.
//
// Accepts an optional leading '-', a run of decimal digits and, only when a
// '.' is immediately followed by a digit, a fractional part. A trailing '.'
// with no digit after it is left unconsumed for the next token.
//
// pos/line/col are the token's start offset and position as captured by the
// caller. The result is TOKEN_FLOAT_LITERAL when a fractional part was
// consumed and TOKEN_INT_LITERAL otherwise.
static Token
make_number(Lexer* lex, size_t pos, size_t line, size_t col)
{
    bool is_float = false;

    if (peek(lex) == '-') {
        nudge(lex); // pass through negative values
    }

    // <ctype.h> classifiers require a value representable as unsigned char;
    // a plain (possibly negative) char is undefined behaviour.
    while (isdigit((unsigned char)peek(lex))) {
        nudge(lex);
    }

    if (peek(lex) == '.' && isdigit((unsigned char)peek2(lex))) {
        is_float = true;
        nudge(lex); // the '.'
        while (isdigit((unsigned char)peek(lex))) {
            nudge(lex);
        }
    }

    return (Token) {
        .type = is_float ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col
    };
}
// Lex a double-quoted string literal.
//
// The token spans from the opening quote through the closing quote. When the
// input ends before a closing quote is found, the token simply stops at end
// of input. No escape sequences are interpreted: the first '"' after the
// opening one always terminates the literal.
static Token
make_string(Lexer* lex, size_t pos, size_t line, size_t col)
{
    nudge(lex); // opening quote

    // nudge() hands back the byte it consumed, or 0 at end of input, so one
    // loop covers both the body and the closing quote.
    char consumed;
    do {
        consumed = nudge(lex);
    } while (consumed != '"' && consumed != 0);

    Token tok = { .type = TOKEN_STRING_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col };
    return tok;
}
// Consume the next byte only when it equals `expected`; report whether it did.
static bool
accept_next(Lexer* lex, char expected)
{
    if (peek(lex) != expected) {
        return false;
    }
    nudge(lex);
    return true;
}

// Lex exactly one token starting at the cursor.
//
// Leading whitespace and `//` comments are skipped first. The start offset,
// line and column recorded in the token are taken after that skipping.
// Dispatch order:
//   - end of input            -> TOKEN_EOF (zero-length)
//   - letter or '_'           -> identifier / keyword (make_ident)
//   - digit, or '-' + digit   -> number literal (make_number)
//   - '"'                     -> string literal (make_string)
//   - anything else           -> punctuation / operator, or TOKEN_UNKNOWN
//
// NOTE: because '-' directly followed by a digit is always lexed as part of a
// number, input such as `a-1` produces an identifier followed by the literal
// `-1`, not a TOKEN_MINUS.
//
// An unrecognised byte is consumed and returned as a one-byte TOKEN_UNKNOWN,
// so the lexer always makes progress.
static Token
next_token(Lexer* lex)
{
    skip_space_and_comments(lex);

    const size_t start = lex->pos;
    const size_t line = lex->line;
    const size_t col = lex->col;
    const char c = peek(lex);
    // <ctype.h> classifiers require a value representable as unsigned char;
    // a plain (possibly negative) char is undefined behaviour.
    const unsigned char uc = (unsigned char)c;

    if (c == 0) {
        return (Token) { .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line };
    }
    if (isalpha(uc) || c == '_') {
        return make_ident(lex, start, line, col);
    }
    if (isdigit(uc)) {
        return make_number(lex, start, line, col);
    }
    if (c == '-' && isdigit((unsigned char)peek2(lex))) {
        return make_number(lex, start, line, col);
    }
    if (c == '"') {
        return make_string(lex, start, line, col);
    }

    // Every remaining token consumes at least this one byte.
    nudge(lex);

    TokenType type = TOKEN_UNKNOWN;
    switch (c) {
    case '(': type = TOKEN_LPAREN; break;
    case ')': type = TOKEN_RPAREN; break;
    case '{': type = TOKEN_LBRACE; break;
    case '}': type = TOKEN_RBRACE; break;
    case '[': type = TOKEN_LBRACKET; break;
    case ']': type = TOKEN_RBRACKET; break;
    case ';': type = TOKEN_SEMICOLON; break;
    case ',': type = TOKEN_COMMA; break;
    case '%': type = TOKEN_PERCENT; break;
    case '#': type = TOKEN_COMP_TIME; break;
    case '~': type = TOKEN_VARIADIC; break;
    case '/': type = TOKEN_SLASH; break; // "//" was already eaten as a comment
    case '*': type = TOKEN_STAR; break;
    // One- or two-byte operators: the second byte is consumed only on a match.
    case '+': type = accept_next(lex, '+') ? TOKEN_PLUSPLUS : TOKEN_PLUS; break;
    case '-': type = accept_next(lex, '-') ? TOKEN_MINUSMINUS : TOKEN_MINUS; break;
    case '=': type = accept_next(lex, '=') ? TOKEN_EQUALITY : TOKEN_EQUAL; break;
    case '!': type = accept_next(lex, '=') ? TOKEN_INEQUALITY : TOKEN_BANG; break;
    case '>': type = accept_next(lex, '=') ? TOKEN_GT_EQ : TOKEN_GT; break;
    case '<': type = accept_next(lex, '=') ? TOKEN_LT_EQ : TOKEN_LT; break;
    default: type = TOKEN_UNKNOWN; break;
    }

    return (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line };
}
static void
print_token(const Token* t, const char* contents)
{
static const char* TYPES[] = { [TOKEN_IDENT] = "ident/type",
[TOKEN_LPAREN] = "open paren",
[TOKEN_RPAREN] = "close paren",
[TOKEN_LBRACE] = "open brace",
[TOKEN_RBRACE] = "close brace",
[TOKEN_LBRACKET] = "open bracket",
[TOKEN_RBRACKET] = "close bracket",
[TOKEN_EQUAL] = "equal",
[TOKEN_SEMICOLON] = "semicol",
[TOKEN_COMMA] = "comma",
[TOKEN_INT_LITERAL] = "integer literal",
[TOKEN_FLOAT_LITERAL] = "float literal",
[TOKEN_STRING_LITERAL] = "string literal",
[TOKEN_BOOL_TRUE_LITERAL] = "bool TRUE literal",
[TOKEN_BOOL_FALSE_LITERAL] = "bool FALSE literal",
[TOKEN_SLASH] = "slash",
[TOKEN_STAR] = "star",
[TOKEN_PLUS] = "plus",
[TOKEN_PLUSPLUS] = "++",
[TOKEN_MINUS] = "minus",
[TOKEN_MINUSMINUS] = "--",
[TOKEN_EQUALITY] = "equality ==",
[TOKEN_INEQUALITY] = "inequality !=",
[TOKEN_BANG] = "bang !",
[TOKEN_LT] = "lower than",
[TOKEN_GT] = "greater than",
[TOKEN_LT_EQ] = "lt or = than",
[TOKEN_GT_EQ] = "gt or = than",
[TOKEN_IF] = "if",
[TOKEN_ELSE] = "else",
[TOKEN_WHILE] = "while",
[TOKEN_FOR] = "for",
[TOKEN_RETURN] = "return",
[TOKEN_UNKNOWN] = "< UNKNOWN >",
[TOKEN_EOF] = "~EOF~" };
printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]);
fwrite(contents + t->start, 1, t->end - t->start, stdout);
printf("'\n");
}
// Append `tok` to the lexer's token array, growing the array when it is full.
//
// Growth doubles the capacity (or starts at 128 when the capacity is 0, so a
// zero capacity cannot get stuck at 0). The realloc result is checked before
// it replaces lex->tokens; on overflow of the size computation or allocation
// failure the process reports the error on stderr and aborts, since the lexer
// cannot continue without storage for its tokens.
static void
add_token(Lexer* lex, Token tok)
{
    if (lex->token_count >= lex->token_cap) {
        const size_t old_cap = lex->token_cap;
        const size_t new_cap = old_cap ? old_cap * 2 : 128;

        // Reject wrap-around in either the doubling or the byte-size product.
        if (new_cap <= old_cap || new_cap > (size_t)-1 / sizeof *lex->tokens) {
            fprintf(stderr, "add_token: token buffer size overflow\n");
            abort();
        }

        Token* grown = realloc(lex->tokens, new_cap * sizeof *grown);
        if (grown == NULL) {
            fprintf(stderr, "add_token: could not grow token buffer\n");
            abort();
        }

        lex->tokens = grown;
        lex->token_cap = new_cap;
    }
    lex->tokens[lex->token_count++] = tok;
}
// Dump every token currently stored in the lexer to stdout, in the order
// they were lexed, one line per token.
void
lexer_print(Lexer* lex)
{
    const size_t total = lex->token_count;
    size_t idx = 0;

    while (idx < total) {
        const Token* tok = &lex->tokens[idx++];
        print_token(tok, lex->src);
    }
}
// Tokenise `contents` into lex->tokens.
//
// lex      - lexer state; every field used by the lexer is (re)initialised
//            here, including a freshly allocated token array of 128 entries.
// filename - stored in the lexer as-is (the pointer is kept, not copied).
// contents - NUL-terminated source text; the pointer is kept, not copied.
//
// Tokens are appended until, and including, the TOKEN_EOF token. Positions
// start at line 1, column 1, byte offset 0.
void
lexer_lex(Lexer* lex, const char* filename, const char* contents)
{
    // Cursor and token-array bookkeeping.
    lex->pos = 0;
    lex->line = 1;
    lex->col = 1;
    lex->token_count = 0;
    lex->token_cap = 128;

    lex->tokens = calloc(lex->token_cap, sizeof(Token));
    if (!lex->tokens) panic("lexer_lex: could not alloc");

    // Input being lexed.
    lex->filename = filename;
    lex->src = contents;
    lex->src_len = strlen(contents);

    // The EOF token is stored too, so the array always ends with it.
    Token tok;
    do {
        tok = next_token(lex);
        add_token(lex, tok);
    } while (tok.type != TOKEN_EOF);
}
// Map a token type to the spelling of its enum constant (e.g. "TOKEN_IDENT").
//
// Returns a pointer to a static string; the caller must not free it.
// Any value that has no entry in the table - including values outside the
// table's bounds - yields "UNKNOWN_TOKEN_TYPE", so the result is never NULL.
const char*
token_type_str(TokenType t)
{
    static const char* type_strings[] = { [TOKEN_IDENT] = "TOKEN_IDENT",
                                          [TOKEN_LPAREN] = "TOKEN_LPAREN",
                                          [TOKEN_RPAREN] = "TOKEN_RPAREN",
                                          [TOKEN_LBRACE] = "TOKEN_LBRACE",
                                          [TOKEN_RBRACE] = "TOKEN_RBRACE",
                                          [TOKEN_LBRACKET] = "TOKEN_LBRACKET",
                                          [TOKEN_RBRACKET] = "TOKEN_RBRACKET",
                                          [TOKEN_EQUAL] = "TOKEN_EQUAL",
                                          [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
                                          [TOKEN_COMMA] = "TOKEN_COMMA",
                                          [TOKEN_INT_LITERAL] = "TOKEN_INT_LITERAL",
                                          [TOKEN_FLOAT_LITERAL] = "TOKEN_FLOAT_LITERAL",
                                          [TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL",
                                          [TOKEN_BOOL_TRUE_LITERAL] = "TOKEN_BOOL_TRUE_LITERAL",
                                          [TOKEN_BOOL_FALSE_LITERAL] = "TOKEN_BOOL_FALSE_LITERAL",
                                          [TOKEN_SLASH] = "TOKEN_SLASH",
                                          [TOKEN_STAR] = "TOKEN_STAR",
                                          [TOKEN_PLUS] = "TOKEN_PLUS",
                                          [TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS",
                                          [TOKEN_MINUS] = "TOKEN_MINUS",
                                          [TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS",
                                          [TOKEN_EQUALITY] = "TOKEN_EQUALITY",
                                          [TOKEN_INEQUALITY] = "TOKEN_INEQUALITY",
                                          [TOKEN_BANG] = "TOKEN_BANG",
                                          [TOKEN_LT] = "TOKEN_LT",
                                          [TOKEN_GT] = "TOKEN_GT",
                                          [TOKEN_LT_EQ] = "TOKEN_LT_EQ",
                                          [TOKEN_GT_EQ] = "TOKEN_GT_EQ",
                                          [TOKEN_IF] = "TOKEN_IF",
                                          [TOKEN_ELSE] = "TOKEN_ELSE",
                                          [TOKEN_WHILE] = "TOKEN_WHILE",
                                          [TOKEN_FOR] = "TOKEN_FOR",
                                          [TOKEN_RETURN] = "TOKEN_RETURN",
                                          [TOKEN_CONTINUE] = "TOKEN_CONTINUE",
                                          [TOKEN_BREAK] = "TOKEN_BREAK",
                                          [TOKEN_STRUCT] = "TOKEN_STRUCT",
                                          [TOKEN_EXTEND] = "TOKEN_EXTEND",
                                          [TOKEN_FX] = "TOKEN_FX",
                                          [TOKEN_FN] = "TOKEN_FN",
                                          [TOKEN_PERCENT] = "TOKEN_PERCENT",
                                          [TOKEN_COMP_TIME] = "TOKEN_COMP_TIME",
                                          [TOKEN_VARIADIC] = "TOKEN_VARIADIC",
                                          [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
                                          [TOKEN_EOF] = "TOKEN_EOF" };

    // Bounds are taken from the table itself rather than from the enum's
    // declaration order. The size_t cast turns negative values into
    // out-of-range ones.
    const size_t idx = (size_t)t;
    if (idx < sizeof type_strings / sizeof type_strings[0] && type_strings[idx] != NULL) {
        return type_strings[idx];
    }
    return "UNKNOWN_TOKEN_TYPE";
}