ox

The Ox programming language, compiler and tools (WIP)
Log | Files | Refs | README | LICENSE

lexer.c (10198B)



#include "lexer.h"
#include "types.h"
#include "utils.h"

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <assert.h>

// Return the character at the current read position without consuming it.
// The result is 0 when the position sits on the source's NUL terminator.
static char
peek(Lexer* lex)
{
	return lex->src[lex->pos];
}

// Return the character one past the current read position without
// consuming anything. Yields 0 when the current character is already the
// NUL terminator (so the read never goes past the end of the string) or
// when the following character is NUL.
static char
peek2(Lexer* lex)
{
	const char* cursor = lex->src + lex->pos;
	return cursor[0] ? cursor[1] : 0;
}

// Consume one character and return it, keeping line/column bookkeeping in
// sync: a newline bumps the line counter and resets the column to 1, any
// other character advances the column. At end of input nothing is consumed
// and 0 is returned.
static char
nudge(Lexer* lex)
{
	const char current = peek(lex);
	if (current == 0) return 0;

	lex->pos++;
	if (current != '\n') {
		lex->col++;
		return current;
	}

	lex->line++;
	lex->col = 1;
	return current;
}

// Advance past any run of whitespace (space, tab, CR, LF) and `//` line
// comments. On return the lexer sits on the first character that is
// neither, or on the end of input. A lone '/' is left untouched.
static void
skip_space_and_comments(Lexer* lex)
{
	for (;;) {
		switch (peek(lex)) {
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			nudge(lex);
			continue;
		case '/':
			if (peek2(lex) != '/') return;
			// Eat the two slashes, then everything up to (not including)
			// the newline; the newline is consumed as whitespace next round.
			nudge(lex);
			nudge(lex);
			while (peek(lex) != '\n' && peek(lex) != 0)
				nudge(lex);
			continue;
		default:
			return;
		}
	}
}

// Scan an identifier or keyword.
//
// Consumes characters while they are '_' or alphanumeric, then classifies
// the consumed text: an exact match against the keyword table yields the
// keyword's token type, anything else is TOKEN_IDENT.
//
// `pos`, `line` and `col` are the start offset and source position recorded
// by the caller; they are copied into the returned token, whose `end` is
// the lexer position after the last consumed character.
//
// @later: type names (int, float, Capitalised custom types, ...) are
// deliberately not keywords here; they stay plain identifiers and are meant
// to be resolved in the parser.
static Token
make_ident(Lexer* lex, size_t pos, size_t line, size_t col)
{
	static const struct {
		const char* text;
		TokenType type;
	} KEYWORDS[] = {
		{ "return", TOKEN_RETURN },
		{ "for", TOKEN_FOR },
		{ "if", TOKEN_IF },
		{ "else", TOKEN_ELSE },
		{ "continue", TOKEN_CONTINUE },
		{ "break", TOKEN_BREAK },
		{ "while", TOKEN_WHILE },
		{ "struct", TOKEN_STRUCT },
		{ "extend", TOKEN_EXTEND },
		{ "true", TOKEN_BOOL_TRUE_LITERAL },
		{ "false", TOKEN_BOOL_FALSE_LITERAL },
		{ "fx", TOKEN_FX },
		{ "fn", TOKEN_FN },
	};

	for (;;) {
		const char c = peek(lex);
		// <ctype.h> functions need a value representable as unsigned char;
		// passing a negative plain char (non-ASCII byte) is undefined.
		if (c != '_' && !isalnum((unsigned char)c)) break;
		nudge(lex);
	}

	const size_t len = lex->pos - pos;
	TokenType type = TOKEN_IDENT;

	for (size_t i = 0; i < sizeof KEYWORDS / sizeof KEYWORDS[0]; i++) {
		// Length must match too, otherwise "iffy" would match "if".
		if (strlen(KEYWORDS[i].text) == len && strncmp(lex->src + pos, KEYWORDS[i].text, len) == 0) {
			type = KEYWORDS[i].type;
			break;
		}
	}

	return (Token) { .type = type, .start = pos, .line = line, .col = col, .end = lex->pos };
}
// Scan an integer or float literal.
//
// Accepts an optional leading '-', a run of decimal digits, and optionally
// a '.' followed by at least one digit (which makes the token a float
// literal). A '.' not followed by a digit is left unconsumed.
//
// `pos`, `line` and `col` are the start offset and source position recorded
// by the caller; they are copied into the returned token.
static Token
make_number(Lexer* lex, size_t pos, size_t line, size_t col)
{
	bool is_float = false;

	if (peek(lex) == '-') {
		nudge(lex); // pass through negative values
	}

	// The unsigned char casts keep the <ctype.h> calls defined for
	// non-ASCII bytes, which are negative when plain char is signed.
	while (isdigit((unsigned char)peek(lex)))
		nudge(lex);
	if (peek(lex) == '.' && isdigit((unsigned char)peek2(lex))) {
		is_float = true;
		nudge(lex); // the '.'
		while (isdigit((unsigned char)peek(lex)))
			nudge(lex);
	}
	return (Token) {
		.type = is_float ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col
	};
}

// Scan a double-quoted string literal. The token spans both quotes.
// There is no escape handling: the first '"' after the opening one ends
// the literal. If the input ends before a closing quote, the token simply
// runs to the end of input and is still reported as a string literal.
static Token
make_string(Lexer* lex, size_t pos, size_t line, size_t col)
{
	nudge(lex); // opening quote

	for (;;) {
		const char c = peek(lex);
		if (c == 0) break; // unterminated: stop at end of input
		nudge(lex);
		if (c == '"') break; // closing quote consumed
	}

	Token tok = { .type = TOKEN_STRING_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col };
	return tok;
}

// Produce the next token from the source.
//
// Skips whitespace and `//` comments, records the start offset/line/column,
// then dispatches on the first character:
//   - NUL                      -> zero-length TOKEN_EOF
//   - letter or '_'            -> make_ident (identifier or keyword)
//   - digit, or '-' + digit    -> make_number
//   - '"'                      -> make_string
//   - anything else            -> one- or two-character punctuation token,
//                                 or TOKEN_UNKNOWN for unrecognised bytes
//
// NOTE(review): because '-' directly followed by a digit always starts a
// number, input such as `a-1` lexes as an identifier followed by the
// literal -1 rather than a MINUS token; confirm the parser expects this.
static Token
next_token(Lexer* lex)
{
	skip_space_and_comments(lex);
	const size_t start = lex->pos;
	const size_t line = lex->line;
	const size_t col = lex->col;
	const char c = peek(lex);

	if (c == 0) return (Token) { .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line };

	// <ctype.h> functions need a value representable as unsigned char;
	// a negative plain char (non-ASCII byte) would be undefined behaviour.
	if (isalpha((unsigned char)c) || c == '_') return make_ident(lex, start, line, col);

	if (isdigit((unsigned char)c)) return make_number(lex, start, line, col);
	if (c == '-' && isdigit((unsigned char)peek2(lex))) return make_number(lex, start, line, col);

	if (c == '"') return make_string(lex, start, line, col);

	// Every remaining token consumes at least its first character.
	nudge(lex);

	TokenType type = TOKEN_UNKNOWN;

	switch (c) {
	case '(':
		type = TOKEN_LPAREN;
		break;
	case ')':
		type = TOKEN_RPAREN;
		break;
	case '{':
		type = TOKEN_LBRACE;
		break;
	case '}':
		type = TOKEN_RBRACE;
		break;
	case '[':
		type = TOKEN_LBRACKET;
		break;
	case ']':
		type = TOKEN_RBRACKET;
		break;
	case ';':
		type = TOKEN_SEMICOLON;
		break;
	case '%':
		type = TOKEN_PERCENT;
		break;
	case '#':
		type = TOKEN_COMP_TIME;
		break;
	case '~':
		type = TOKEN_VARIADIC;
		break;
	case '/':
		type = TOKEN_SLASH;
		break;
	case '*':
		type = TOKEN_STAR;
		break;
	case ',':
		type = TOKEN_COMMA;
		break;
	case '+':
		if (peek(lex) == '+') {
			nudge(lex);
			type = TOKEN_PLUSPLUS;
		} else {
			type = TOKEN_PLUS;
		}
		break;
	case '-':
		if (peek(lex) == '-') {
			nudge(lex);
			type = TOKEN_MINUSMINUS;
		} else {
			type = TOKEN_MINUS;
		}
		break;
	case '=':
		if (peek(lex) == '=') {
			nudge(lex);
			type = TOKEN_EQUALITY;
		} else {
			type = TOKEN_EQUAL;
		}
		break;
	case '!':
		if (peek(lex) == '=') {
			nudge(lex);
			type = TOKEN_INEQUALITY;
		} else {
			type = TOKEN_BANG;
		}
		break;
	case '>':
		if (peek(lex) == '=') {
			nudge(lex);
			type = TOKEN_GT_EQ;
		} else {
			type = TOKEN_GT;
		}
		break;
	case '<':
		if (peek(lex) == '=') {
			nudge(lex);
			type = TOKEN_LT_EQ;
		} else {
			type = TOKEN_LT;
		}
		break;
	default:
		type = TOKEN_UNKNOWN;
		break;
	}

	return (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line };
}

static void
print_token(const Token* t, const char* contents)
{
	static const char* TYPES[] = { [TOKEN_IDENT] = "ident/type",
		[TOKEN_LPAREN] = "open paren",
		[TOKEN_RPAREN] = "close paren",
		[TOKEN_LBRACE] = "open brace",
		[TOKEN_RBRACE] = "close brace",
		[TOKEN_LBRACKET] = "open bracket",
		[TOKEN_RBRACKET] = "close bracket",
		[TOKEN_EQUAL] = "equal",
		[TOKEN_SEMICOLON] = "semicol",
		[TOKEN_COMMA] = "comma",
		[TOKEN_INT_LITERAL] = "integer literal",
		[TOKEN_FLOAT_LITERAL] = "float literal",
		[TOKEN_STRING_LITERAL] = "string literal",
		[TOKEN_BOOL_TRUE_LITERAL] = "bool TRUE literal",
		[TOKEN_BOOL_FALSE_LITERAL] = "bool FALSE literal",
		[TOKEN_SLASH] = "slash",
		[TOKEN_STAR] = "star",
		[TOKEN_PLUS] = "plus",
		[TOKEN_PLUSPLUS] = "++",
		[TOKEN_MINUS] = "minus",
		[TOKEN_MINUSMINUS] = "--",
		[TOKEN_EQUALITY] = "equality ==",
		[TOKEN_INEQUALITY] = "inequality !=",
		[TOKEN_BANG] = "bang !",
		[TOKEN_LT] = "lower than",
		[TOKEN_GT] = "greater than",
		[TOKEN_LT_EQ] = "lt or = than",
		[TOKEN_GT_EQ] = "gt or = than",
		[TOKEN_IF] = "if",
		[TOKEN_ELSE] = "else",
		[TOKEN_WHILE] = "while",
		[TOKEN_FOR] = "for",
		[TOKEN_RETURN] = "return",
		[TOKEN_UNKNOWN] = "< UNKNOWN >",
		[TOKEN_EOF] = "~EOF~" };

	printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]);
	fwrite(contents + t->start, 1, t->end - t->start, stdout);
	printf("'\n");
}

// Append `tok` to the lexer's token array, doubling the capacity when the
// array is full. Allocation failure is reported through panic(), matching
// the allocation check in lexer_lex.
static void
add_token(Lexer* lex, Token tok)
{
	if (lex->token_count >= lex->token_cap) {
		// A zero capacity would stay zero when doubled; fall back to the
		// same initial size lexer_lex uses.
		const size_t new_cap = lex->token_cap ? lex->token_cap * 2 : 128;

		// Keep the old pointer until realloc is known to have succeeded,
		// so a failure does not overwrite it with NULL.
		Token* grown = realloc(lex->tokens, sizeof *grown * new_cap);
		if (grown == NULL) panic("add_token: could not realloc");

		lex->tokens = grown;
		lex->token_cap = new_cap;
	}
	lex->tokens[lex->token_count++] = tok;
}

// Print every token currently stored in `lex`, one per line, in the order
// they were produced. Token text is taken from the lexer's source buffer.
void
lexer_print(Lexer* lex)
{
	size_t idx = 0;
	while (idx < lex->token_count) {
		print_token(lex->tokens + idx, lex->src);
		idx++;
	}
}

// Tokenise `contents` into `lex`.
//
// Resets the lexer's position to the start of the source (line 1, column
// 1), allocates a fresh token array of 128 zeroed entries, stores
// `filename` and `contents` (pointers are kept, not copied), and then
// collects tokens until TOKEN_EOF. The EOF token itself is stored as the
// last element. panic() is called if the initial allocation fails.
void
lexer_lex(Lexer* lex, const char* filename, const char* contents)
{
	// Cursor state.
	lex->line = 1;
	lex->col = 1;
	lex->pos = 0;

	// Token storage.
	lex->token_cap = 128;
	lex->token_count = 0;
	lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token));
	if (!lex->tokens) panic("lexer_lex: could not alloc");

	// Source being lexed.
	lex->filename = filename;
	lex->src = contents;
	lex->src_len = strlen(contents);

	Token tok;
	do {
		tok = next_token(lex);
		add_token(lex, tok);
	} while (tok.type != TOKEN_EOF);
}

// Return the enumerator name of token type `t` as a static string, e.g.
// "TOKEN_IDENT". Values that fall outside the table, or that have no entry
// in it, yield "UNKNOWN_TOKEN_TYPE"; the function never returns NULL.
const char*
token_type_str(TokenType t)
{
	static const char* type_strings[] = { [TOKEN_IDENT] = "TOKEN_IDENT",
		[TOKEN_LPAREN] = "TOKEN_LPAREN",
		[TOKEN_RPAREN] = "TOKEN_RPAREN",
		[TOKEN_LBRACE] = "TOKEN_LBRACE",
		[TOKEN_RBRACE] = "TOKEN_RBRACE",
		[TOKEN_LBRACKET] = "TOKEN_LBRACKET",
		[TOKEN_RBRACKET] = "TOKEN_RBRACKET",
		[TOKEN_EQUAL] = "TOKEN_EQUAL",
		[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
		[TOKEN_COMMA] = "TOKEN_COMMA",
		[TOKEN_INT_LITERAL] = "TOKEN_INT_LITERAL",
		[TOKEN_FLOAT_LITERAL] = "TOKEN_FLOAT_LITERAL",
		[TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL",
		[TOKEN_BOOL_TRUE_LITERAL] = "TOKEN_BOOL_TRUE_LITERAL",
		[TOKEN_BOOL_FALSE_LITERAL] = "TOKEN_BOOL_FALSE_LITERAL",
		[TOKEN_SLASH] = "TOKEN_SLASH",
		[TOKEN_STAR] = "TOKEN_STAR",
		[TOKEN_PLUS] = "TOKEN_PLUS",
		[TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS",
		[TOKEN_MINUS] = "TOKEN_MINUS",
		[TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS",
		[TOKEN_EQUALITY] = "TOKEN_EQUALITY",
		[TOKEN_INEQUALITY] = "TOKEN_INEQUALITY",
		[TOKEN_BANG] = "TOKEN_BANG",
		[TOKEN_LT] = "TOKEN_LT",
		[TOKEN_GT] = "TOKEN_GT",
		[TOKEN_LT_EQ] = "TOKEN_LT_EQ",
		[TOKEN_GT_EQ] = "TOKEN_GT_EQ",
		[TOKEN_IF] = "TOKEN_IF",
		[TOKEN_ELSE] = "TOKEN_ELSE",
		[TOKEN_WHILE] = "TOKEN_WHILE",
		[TOKEN_FOR] = "TOKEN_FOR",
		[TOKEN_RETURN] = "TOKEN_RETURN",
		[TOKEN_CONTINUE] = "TOKEN_CONTINUE",
		[TOKEN_BREAK] = "TOKEN_BREAK",
		[TOKEN_STRUCT] = "TOKEN_STRUCT",
		[TOKEN_EXTEND] = "TOKEN_EXTEND",
		[TOKEN_FX] = "TOKEN_FX",
		[TOKEN_FN] = "TOKEN_FN",
		[TOKEN_PERCENT] = "TOKEN_PERCENT",
		[TOKEN_COMP_TIME] = "TOKEN_COMP_TIME",
		[TOKEN_VARIADIC] = "TOKEN_VARIADIC",
		[TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
		[TOKEN_EOF] = "TOKEN_EOF" };

	// Bound the lookup by the real array size rather than by a particular
	// enumerator, and treat gaps left by the designated initialisers (NULL
	// slots) as unknown so callers can always print the result.
	const size_t count = sizeof type_strings / sizeof type_strings[0];
	if ((size_t)t < count && type_strings[t] != NULL) {
		return type_strings[t];
	}
	return "UNKNOWN_TOKEN_TYPE";
}