ox

The Ox programming language, compiler and tools (WIP)
Log | Files | Refs | README | LICENSE

lexer.c (10198B)


      1 #include "lexer.h"
      2 #include "types.h"
      3 #include "utils.h"
      4 
      5 #include <string.h>
      6 #include <stdlib.h>
      7 #include <stdio.h>
      8 #include <ctype.h>
      9 #include <assert.h>
     10 
     11 static char
     12 peek(Lexer* lex)
     13 {
     14 	char c = lex->src[lex->pos];
     15 	return c ? c : 0;
     16 }
     17 
     18 static char
     19 peek2(Lexer* lex)
     20 {
     21 	char c = lex->src[lex->pos];
     22 	if (!c) return 0;
     23 	c = lex->src[lex->pos + 1];
     24 	return c ? c : 0;
     25 }
     26 
     27 static char
     28 nudge(Lexer* lex)
     29 {
     30 	const char c = peek(lex);
     31 	if (!c) return 0;
     32 	if (c == '\n') {
     33 		lex->line++;
     34 		lex->col = 1;
     35 	} else {
     36 		lex->col++;
     37 	}
     38 	lex->pos++;
     39 	return c;
     40 }
     41 
     42 static void
     43 skip_space_and_comments(Lexer* lex)
     44 {
     45 	for (;;) {
     46 		for (;;) {
     47 			char c = peek(lex);
     48 			if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
     49 				nudge(lex);
     50 			else
     51 				break;
     52 		}
     53 		if (peek(lex) == '/' && peek2(lex) == '/') {
     54 			nudge(lex);
     55 			nudge(lex);
     56 			while (peek(lex) != '\n' && peek(lex) != 0)
     57 				nudge(lex);
     58 			continue;
     59 		}
     60 		break;
     61 	}
     62 }
     63 
     64 static Token
     65 make_ident(Lexer* lex, size_t pos, size_t line, size_t col)
     66 {
     67 	for (;;) {
     68 		char c = peek(lex);
     69 		if (c == '_' || isalnum(c))
     70 			nudge(lex);
     71 		else
     72 			break;
     73 	}
     74 
     75 	TokenType type = TOKEN_IDENT;
     76 
     77 	if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "return", 6) == 0)
     78 		type = TOKEN_RETURN;
     79 	else if ((lex->pos - pos) == 3 && strncmp(lex->src + pos, "for", 3) == 0)
     80 		type = TOKEN_FOR;
     81 	else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "if", 2) == 0)
     82 		type = TOKEN_IF;
     83 	else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "else", 4) == 0)
     84 		type = TOKEN_ELSE;
     85 	else if ((lex->pos - pos) == 8 && strncmp(lex->src + pos, "continue", 8) == 0)
     86 		type = TOKEN_CONTINUE;
     87 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "break", 5) == 0)
     88 		type = TOKEN_BREAK;
     89 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "while", 5) == 0)
     90 		type = TOKEN_WHILE;
     91 
     92 	else if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "struct", 6) == 0)
     93 		type = TOKEN_STRUCT;
     94 	else if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "extend", 6) == 0)
     95 		type = TOKEN_EXTEND;
     96 
     97 	else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "true", 4) == 0)
     98 		type = TOKEN_BOOL_TRUE_LITERAL;
     99 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "false", 5) == 0)
    100 		type = TOKEN_BOOL_FALSE_LITERAL;
    101 
    102 	else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "fx", 2) == 0)
    103 		type = TOKEN_FX;
    104 	else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "fn", 2) == 0)
    105 		type = TOKEN_FN;
    106 	// Check for keywords, or a Capitalised CustomType
    107 	// -- @later do it in the parser, keep types as unprotected names
    108 	// if (strncmp(lex->src + pos, "int", lex->pos - pos) == 0)
    109 	//     type = TOKEN_INT;
    110 	// else if (strncmp(lex->src + pos, "float", lex->pos - pos) == 0)
    111 	//     type = TOKEN_FLOAT;
    112 	// else if (strncmp(lex->src + pos, "void", lex->pos - pos) == 0)
    113 	//     type = TOKEN_VOID;
    114 	// else if (strncmp(lex->src + pos, "string", lex->pos - pos) == 0)
    115 	//     type = TOKEN_STRING;
    116 	// else {
    117 	//     // If identifier starts with a capital letter, treat as TOKEN_TYPE
    118 	//     char first = lex->src[pos];
    119 	//     if (first >= 'A' && first <= 'Z') {
    120 	//         type = TOKEN_TYPE;
    121 	//     }
    122 	// }
    123 
    124 	return (Token) { .type = type, .start = pos, .line = line, .col = col, .end = lex->pos };
    125 }
    126 static Token
    127 make_number(Lexer* lex, size_t pos, size_t line, size_t col)
    128 {
    129 	bool is_float = false;
    130 
    131 	if (peek(lex) == '-') {
    132 		nudge(lex); // pass through negative values
    133 	}
    134 
    135 	while (isdigit(peek(lex)))
    136 		nudge(lex);
    137 	if (peek(lex) == '.' && isdigit(peek2(lex))) {
    138 		is_float = true;
    139 		nudge(lex);
    140 		while (isdigit(peek(lex)))
    141 			nudge(lex);
    142 	}
    143 	return (Token) {
    144 		.type = is_float ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col
    145 	};
    146 }
    147 
    148 static Token
    149 make_string(Lexer* lex, size_t pos, size_t line, size_t col)
    150 {
    151 	nudge(lex); // " start
    152 	while (peek(lex) != '"' && peek(lex) != 0)
    153 		nudge(lex);
    154 	if (peek(lex) == '"') nudge(lex); // " end
    155 	return (Token) { .type = TOKEN_STRING_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col };
    156 }
    157 
    158 static Token
    159 next_token(Lexer* lex)
    160 {
    161 	skip_space_and_comments(lex);
    162 	size_t start = lex->pos;
    163 	size_t line = lex->line;
    164 	size_t col = lex->col;
    165 	const char c = peek(lex);
    166 
    167 	if (c == 0) return (Token) { .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line };
    168 
    169 	if (isalpha(c) || c == '_') return make_ident(lex, start, line, col);
    170 
    171 	if (isdigit(c)) return make_number(lex, start, line, col);
    172 	if (c == '-' && isdigit(peek2(lex))) { return make_number(lex, start, line, col); }
    173 
    174 	if (c == '"') return make_string(lex, start, line, col);
    175 
    176 	TokenType type = TOKEN_UNKNOWN;
    177 
    178 	switch (c) {
    179 	case '(':
    180 		nudge(lex);
    181 		type = TOKEN_LPAREN;
    182 		break;
    183 	case ')':
    184 		nudge(lex);
    185 		type = TOKEN_RPAREN;
    186 		break;
    187 	case '{':
    188 		nudge(lex);
    189 		type = TOKEN_LBRACE;
    190 		break;
    191 	case '}':
    192 		nudge(lex);
    193 		type = TOKEN_RBRACE;
    194 		break;
    195 	case '[':
    196 		nudge(lex);
    197 		type = TOKEN_LBRACKET;
    198 		break;
    199 	case ']':
    200 		nudge(lex);
    201 		type = TOKEN_RBRACKET;
    202 		break;
    203 	case ';':
    204 		nudge(lex);
    205 		type = TOKEN_SEMICOLON;
    206 		break;
    207 	case '%':
    208 		nudge(lex);
    209 		type = TOKEN_PERCENT;
    210 		break;
    211 	case '#':
    212 		nudge(lex);
    213 		type = TOKEN_COMP_TIME;
    214 		break;
    215 	case '~':
    216 		nudge(lex);
    217 		type = TOKEN_VARIADIC;
    218 		break;
    219 	case '/':
    220 		nudge(lex);
    221 		type = TOKEN_SLASH;
    222 		break;
    223 	case '*':
    224 		nudge(lex);
    225 		type = TOKEN_STAR;
    226 		break;
    227 	case '+':
    228 		nudge(lex);
    229 		if (peek(lex) == '+') {
    230 			nudge(lex);
    231 			type = TOKEN_PLUSPLUS;
    232 		} else {
    233 			type = TOKEN_PLUS;
    234 		}
    235 		break;
    236 	case '-':
    237 		nudge(lex);
    238 		if (peek(lex) == '-') {
    239 			nudge(lex);
    240 			type = TOKEN_MINUSMINUS;
    241 		} else {
    242 			type = TOKEN_MINUS;
    243 		}
    244 		break;
    245 	case ',':
    246 		nudge(lex);
    247 		type = TOKEN_COMMA;
    248 		break;
    249 	case '=':
    250 		nudge(lex);
    251 		if (peek(lex) == '=') {
    252 			nudge(lex);
    253 			type = TOKEN_EQUALITY;
    254 		} else {
    255 			type = TOKEN_EQUAL;
    256 		}
    257 		break;
    258 	case '!':
    259 		nudge(lex);
    260 		if (peek(lex) == '=') {
    261 			nudge(lex);
    262 			type = TOKEN_INEQUALITY;
    263 		} else {
    264 			type = TOKEN_BANG;
    265 		}
    266 		break;
    267 	case '>':
    268 		nudge(lex);
    269 		if (peek(lex) == '=') {
    270 			nudge(lex);
    271 			type = TOKEN_GT_EQ;
    272 		} else {
    273 			type = TOKEN_GT;
    274 		}
    275 		break;
    276 	case '<':
    277 		nudge(lex);
    278 		if (peek(lex) == '=') {
    279 			nudge(lex);
    280 			type = TOKEN_LT_EQ;
    281 		} else {
    282 			type = TOKEN_LT;
    283 		}
    284 		break;
    285 	default:
    286 		nudge(lex);
    287 		type = TOKEN_UNKNOWN;
    288 		break;
    289 	}
    290 	Token t = (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line };
    291 	return t;
    292 }
    293 
    294 static void
    295 print_token(const Token* t, const char* contents)
    296 {
    297 	static const char* TYPES[] = { [TOKEN_IDENT] = "ident/type",
    298 		[TOKEN_LPAREN] = "open paren",
    299 		[TOKEN_RPAREN] = "close paren",
    300 		[TOKEN_LBRACE] = "open brace",
    301 		[TOKEN_RBRACE] = "close brace",
    302 		[TOKEN_LBRACKET] = "open bracket",
    303 		[TOKEN_RBRACKET] = "close bracket",
    304 		[TOKEN_EQUAL] = "equal",
    305 		[TOKEN_SEMICOLON] = "semicol",
    306 		[TOKEN_COMMA] = "comma",
    307 		[TOKEN_INT_LITERAL] = "integer literal",
    308 		[TOKEN_FLOAT_LITERAL] = "float literal",
    309 		[TOKEN_STRING_LITERAL] = "string literal",
    310 		[TOKEN_BOOL_TRUE_LITERAL] = "bool TRUE literal",
    311 		[TOKEN_BOOL_FALSE_LITERAL] = "bool FALSE literal",
    312 		[TOKEN_SLASH] = "slash",
    313 		[TOKEN_STAR] = "star",
    314 		[TOKEN_PLUS] = "plus",
    315 		[TOKEN_PLUSPLUS] = "++",
    316 		[TOKEN_MINUS] = "minus",
    317 		[TOKEN_MINUSMINUS] = "--",
    318 		[TOKEN_EQUALITY] = "equality ==",
    319 		[TOKEN_INEQUALITY] = "inequality !=",
    320 		[TOKEN_BANG] = "bang !",
    321 		[TOKEN_LT] = "lower than",
    322 		[TOKEN_GT] = "greater than",
    323 		[TOKEN_LT_EQ] = "lt or = than",
    324 		[TOKEN_GT_EQ] = "gt or = than",
    325 		[TOKEN_IF] = "if",
    326 		[TOKEN_ELSE] = "else",
    327 		[TOKEN_WHILE] = "while",
    328 		[TOKEN_FOR] = "for",
    329 		[TOKEN_RETURN] = "return",
    330 		[TOKEN_UNKNOWN] = "< UNKNOWN >",
    331 		[TOKEN_EOF] = "~EOF~" };
    332 
    333 	printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]);
    334 	fwrite(contents + t->start, 1, t->end - t->start, stdout);
    335 	printf("'\n");
    336 }
    337 
    338 static void
    339 add_token(Lexer* lex, Token tok)
    340 {
    341 	if (lex->token_count >= lex->token_cap) {
    342 		lex->token_cap *= 2;
    343 		lex->tokens = (Token*)realloc(lex->tokens, sizeof(Token) * lex->token_cap);
    344 	}
    345 	lex->tokens[lex->token_count++] = tok;
    346 }
    347 
    348 void
    349 lexer_print(Lexer* lex)
    350 {
    351 	for (size_t i = 0; i < lex->token_count; i++) {
    352 		print_token(&lex->tokens[i], lex->src);
    353 	}
    354 }
    355 
    356 void
    357 lexer_lex(Lexer* lex, const char* filename, const char* contents)
    358 {
    359 	lex->line = 1;
    360 	lex->col = 1;
    361 	lex->pos = 0;
    362 	lex->token_cap = 128;
    363 	lex->token_count = 0;
    364 	lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token));
    365 	if (lex->tokens == NULL) panic("lexer_lex: could not alloc");
    366 	lex->filename = filename;
    367 	lex->src = contents;
    368 	lex->src_len = strlen(contents);
    369 	for (;;) {
    370 		Token tok = next_token(lex);
    371 		add_token(lex, tok);
    372 		if (tok.type == TOKEN_EOF) break;
    373 	}
    374 }
    375 
    376 const char*
    377 token_type_str(TokenType t)
    378 {
    379 	static const char* type_strings[] = { [TOKEN_IDENT] = "TOKEN_IDENT",
    380 		[TOKEN_LPAREN] = "TOKEN_LPAREN",
    381 		[TOKEN_RPAREN] = "TOKEN_RPAREN",
    382 		[TOKEN_LBRACE] = "TOKEN_LBRACE",
    383 		[TOKEN_RBRACE] = "TOKEN_RBRACE",
    384 		[TOKEN_LBRACKET] = "TOKEN_LBRACKET",
    385 		[TOKEN_RBRACKET] = "TOKEN_RBRACKET",
    386 		[TOKEN_EQUAL] = "TOKEN_EQUAL",
    387 		[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
    388 		[TOKEN_COMMA] = "TOKEN_COMMA",
    389 		[TOKEN_INT_LITERAL] = "TOKEN_INT_LITERAL",
    390 		[TOKEN_FLOAT_LITERAL] = "TOKEN_FLOAT_LITERAL",
    391 		[TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL",
    392 		[TOKEN_BOOL_TRUE_LITERAL] = "TOKEN_BOOL_TRUE_LITERAL",
    393 		[TOKEN_BOOL_FALSE_LITERAL] = "TOKEN_BOOL_FALSE_LITERAL",
    394 		[TOKEN_SLASH] = "TOKEN_SLASH",
    395 		[TOKEN_STAR] = "TOKEN_STAR",
    396 		[TOKEN_PLUS] = "TOKEN_PLUS",
    397 		[TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS",
    398 		[TOKEN_MINUS] = "TOKEN_MINUS",
    399 		[TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS",
    400 		[TOKEN_EQUALITY] = "TOKEN_EQUALITY",
    401 		[TOKEN_INEQUALITY] = "TOKEN_INEQUALITY",
    402 		[TOKEN_BANG] = "TOKEN_BANG",
    403 		[TOKEN_LT] = "TOKEN_LT",
    404 		[TOKEN_GT] = "TOKEN_GT",
    405 		[TOKEN_LT_EQ] = "TOKEN_LT_EQ",
    406 		[TOKEN_GT_EQ] = "TOKEN_GT_EQ",
    407 		[TOKEN_IF] = "TOKEN_IF",
    408 		[TOKEN_ELSE] = "TOKEN_ELSE",
    409 		[TOKEN_WHILE] = "TOKEN_WHILE",
    410 		[TOKEN_FOR] = "TOKEN_FOR",
    411 		[TOKEN_RETURN] = "TOKEN_RETURN",
    412 		[TOKEN_CONTINUE] = "TOKEN_CONTINUE",
    413 		[TOKEN_BREAK] = "TOKEN_BREAK",
    414 		[TOKEN_PERCENT] = "TOKEN_PERCENT",
    415 		[TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
    416 		[TOKEN_EOF] = "TOKEN_EOF" };
    417 	if (t >= TOKEN_IDENT && t <= TOKEN_EOF) {
    418 		return type_strings[t];
    419 	} else {
    420 		return "UNKNOWN_TOKEN_TYPE";
    421 	}
    422 }