lexer.c - ox - The Ox programming language, compiler and tools (WIP)

lexer.c (9247B)
      1 #include "lexer.h"
      2 #include "types.h"
      3 #include "utils.h"
      4 
      5 #include <string.h>
      6 #include <stdlib.h>
      7 #include <stdio.h>
      8 #include <ctype.h>
      9 #include <assert.h>
     10 
     11 static char
     12 peek(Lexer* lex)
     13 {
     14 	char c = lex->src[lex->pos];
     15 	return c ? c : 0;
     16 }
     17 
     18 static char
     19 peek2(Lexer* lex)
     20 {
     21 	char c = lex->src[lex->pos];
     22 	if (!c) return 0;
     23 	c = lex->src[lex->pos + 1];
     24 	return c ? c : 0;
     25 }
     26 
     27 static char
     28 nudge(Lexer* lex)
     29 {
     30 	const char c = peek(lex);
     31 	if (!c) return 0;
     32 	if (c == '\n') {
     33 		lex->line++;
     34 		lex->col = 1;
     35 	} else {
     36 		lex->col++;
     37 	}
     38 	lex->pos++;
     39 	return c;
     40 }
     41 
     42 static void
     43 skip_space_and_comments(Lexer* lex)
     44 {
     45 	for (;;) {
     46 		for (;;) {
     47 			char c = peek(lex);
     48 			if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
     49 				nudge(lex);
     50 			else
     51 				break;
     52 		}
     53 		if (peek(lex) == '/' && peek2(lex) == '/') {
     54 			nudge(lex);
     55 			nudge(lex);
     56 			while (peek(lex) != '\n' && peek(lex) != 0)
     57 				nudge(lex);
     58 			continue;
     59 		}
     60 		break;
     61 	}
     62 }
     63 
     64 static Token
     65 make_ident(Lexer* lex, size_t pos, size_t line, size_t col)
     66 {
     67 	for (;;) {
     68 		char c = peek(lex);
     69 		if (c == '_' || isalnum(c))
     70 			nudge(lex);
     71 		else
     72 			break;
     73 	}
     74 
     75 	TokenType type = TOKEN_IDENT;
     76 
     77 	if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "return", 6) == 0)
     78 		type = TOKEN_RETURN;
     79 	else if ((lex->pos - pos) == 3 && strncmp(lex->src + pos, "for", 3) == 0)
     80 		type = TOKEN_FOR;
     81 	else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "if", 2) == 0)
     82 		type = TOKEN_IF;
     83 	else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "else", 4) == 0)
     84 		type = TOKEN_ELSE;
     85 	else if ((lex->pos - pos) == 8 && strncmp(lex->src + pos, "continue", 8) == 0)
     86 		type = TOKEN_CONTINUE;
     87 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "break", 5) == 0)
     88 		type = TOKEN_BREAK;
     89 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "while", 5) == 0)
     90 		type = TOKEN_WHILE;
     91 
     92 	// Check for keywords, or a Capitalised CustomType
     93 	// -- @later do it in the parser, keep types as unprotected names
     94 	// if (strncmp(lex->src + pos, "int", lex->pos - pos) == 0)
     95 	//     type = TOKEN_INT;
     96 	// else if (strncmp(lex->src + pos, "float", lex->pos - pos) == 0)
     97 	//     type = TOKEN_FLOAT;
     98 	// else if (strncmp(lex->src + pos, "void", lex->pos - pos) == 0)
     99 	//     type = TOKEN_VOID;
    100 	// else if (strncmp(lex->src + pos, "string", lex->pos - pos) == 0)
    101 	//     type = TOKEN_STRING;
    102 	// else {
    103 	//     // If identifier starts with a capital letter, treat as TOKEN_TYPE
    104 	//     char first = lex->src[pos];
    105 	//     if (first >= 'A' && first <= 'Z') {
    106 	//         type = TOKEN_TYPE;
    107 	//     }
    108 	// }
    109 
    110 	return (Token) { .type = type, .start = pos, .line = line, .col = col, .end = lex->pos };
    111 }
    112 static Token
    113 make_number(Lexer* lex, size_t pos, size_t line, size_t col)
    114 {
    115 	bool is_float = false;
    116 
    117 	if (peek(lex) == '-') {
    118 		nudge(lex); // pass through negative values
    119 	}
    120 
    121 	while (isdigit(peek(lex)))
    122 		nudge(lex);
    123 	if (peek(lex) == '.' && isdigit(peek2(lex))) {
    124 		is_float = true;
    125 		nudge(lex);
    126 		while (isdigit(peek(lex)))
    127 			nudge(lex);
    128 	}
    129 	return (Token) {
    130 		.type = is_float ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col
    131 	};
    132 }
    133 
    134 static Token
    135 make_string(Lexer* lex, size_t pos, size_t line, size_t col)
    136 {
    137 	nudge(lex); // " start
    138 	while (peek(lex) != '"' && peek(lex) != 0)
    139 		nudge(lex);
    140 	if (peek(lex) == '"') nudge(lex); // " end
    141 	return (Token) { .type = TOKEN_STRING_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col };
    142 }
    143 
    144 static Token
    145 next_token(Lexer* lex)
    146 {
    147 	skip_space_and_comments(lex);
    148 	size_t start = lex->pos;
    149 	size_t line = lex->line;
    150 	size_t col = lex->col;
    151 	const char c = peek(lex);
    152 
    153 	if (c == 0) return (Token) { .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line };
    154 
    155 	if (isalpha(c) || c == '_') return make_ident(lex, start, line, col);
    156 
    157 	if (isdigit(c)) return make_number(lex, start, line, col);
    158 	if (c == '-' && isdigit(peek2(lex))) { return make_number(lex, start, line, col); }
    159 
    160 	if (c == '"') return make_string(lex, start, line, col);
    161 
    162 	TokenType type = TOKEN_UNKNOWN;
    163 
    164 	switch (c) {
    165 	case '(':
    166 		nudge(lex);
    167 		type = TOKEN_LPAREN;
    168 		break;
    169 	case ')':
    170 		nudge(lex);
    171 		type = TOKEN_RPAREN;
    172 		break;
    173 	case '{':
    174 		nudge(lex);
    175 		type = TOKEN_LBRACE;
    176 		break;
    177 	case '}':
    178 		nudge(lex);
    179 		type = TOKEN_RBRACE;
    180 		break;
    181 	case '[':
    182 		nudge(lex);
    183 		type = TOKEN_LBRACKET;
    184 		break;
    185 	case ']':
    186 		nudge(lex);
    187 		type = TOKEN_RBRACKET;
    188 		break;
    189 	case ';':
    190 		nudge(lex);
    191 		type = TOKEN_SEMICOLON;
    192 		break;
    193 	case '%':
    194 		nudge(lex);
    195 		type = TOKEN_PERCENT;
    196 		break;
    197 	case '/':
    198 		nudge(lex);
    199 		type = TOKEN_SLASH;
    200 		break;
    201 	case '*':
    202 		nudge(lex);
    203 		type = TOKEN_STAR;
    204 		break;
    205 	case '+':
    206 		nudge(lex);
    207 		if (peek(lex) == '+') {
    208 			nudge(lex);
    209 			type = TOKEN_PLUSPLUS;
    210 		} else {
    211 			type = TOKEN_PLUS;
    212 		}
    213 		break;
    214 	case '-':
    215 		nudge(lex);
    216 		if (peek(lex) == '-') {
    217 			nudge(lex);
    218 			type = TOKEN_MINUSMINUS;
    219 		} else {
    220 			type = TOKEN_MINUS;
    221 		}
    222 		break;
    223 	case ',':
    224 		nudge(lex);
    225 		type = TOKEN_COMMA;
    226 		break;
    227 	case '=':
    228 		nudge(lex);
    229 		if (peek(lex) == '=') {
    230 			nudge(lex);
    231 			type = TOKEN_EQUALITY;
    232 		} else {
    233 			type = TOKEN_EQUAL;
    234 		}
    235 		break;
    236 	case '!':
    237 		nudge(lex);
    238 		if (peek(lex) == '=') {
    239 			nudge(lex);
    240 			type = TOKEN_INEQUALITY;
    241 		} else {
    242 			type = TOKEN_BANG;
    243 		}
    244 		break;
    245 	case '>':
    246 		nudge(lex);
    247 		if (peek(lex) == '=') {
    248 			nudge(lex);
    249 			type = TOKEN_GT_EQ;
    250 		} else {
    251 			type = TOKEN_GT;
    252 		}
    253 		break;
    254 	case '<':
    255 		nudge(lex);
    256 		if (peek(lex) == '=') {
    257 			nudge(lex);
    258 			type = TOKEN_LT_EQ;
    259 		} else {
    260 			type = TOKEN_LT;
    261 		}
    262 		break;
    263 	default:
    264 		nudge(lex);
    265 		type = TOKEN_UNKNOWN;
    266 		break;
    267 	}
    268 	Token t = (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line };
    269 	return t;
    270 }
    271 
    272 static void
    273 print_token(const Token* t, const char* contents)
    274 {
    275 	static const char* TYPES[] = { [TOKEN_IDENT] = "ident/type",
    276 		[TOKEN_LPAREN] = "open paren",
    277 		[TOKEN_RPAREN] = "close paren",
    278 		[TOKEN_LBRACE] = "open brace",
    279 		[TOKEN_RBRACE] = "close brace",
    280 		[TOKEN_LBRACKET] = "open bracket",
    281 		[TOKEN_RBRACKET] = "close bracket",
    282 		[TOKEN_EQUAL] = "equal",
    283 		[TOKEN_SEMICOLON] = "semicol",
    284 		[TOKEN_COMMA] = "comma",
    285 		[TOKEN_INT_LITERAL] = "integer literal",
    286 		[TOKEN_FLOAT_LITERAL] = "float literal",
    287 		[TOKEN_STRING_LITERAL] = "string literal",
    288 		[TOKEN_SLASH] = "slash",
    289 		[TOKEN_STAR] = "star",
    290 		[TOKEN_PLUS] = "plus",
    291 		[TOKEN_PLUSPLUS] = "++",
    292 		[TOKEN_MINUS] = "minus",
    293 		[TOKEN_MINUSMINUS] = "--",
    294 		[TOKEN_EQUALITY] = "equality ==",
    295 		[TOKEN_INEQUALITY] = "inequality !=",
    296 		[TOKEN_BANG] = "bang !",
    297 		[TOKEN_LT] = "lower than",
    298 		[TOKEN_GT] = "greater than",
    299 		[TOKEN_LT_EQ] = "lt or = than",
    300 		[TOKEN_GT_EQ] = "gt or = than",
    301 		[TOKEN_IF] = "if",
    302 		[TOKEN_ELSE] = "else",
    303 		[TOKEN_WHILE] = "while",
    304 		[TOKEN_FOR] = "for",
    305 		[TOKEN_RETURN] = "return",
    306 		[TOKEN_UNKNOWN] = "< UNKNOWN >",
    307 		[TOKEN_EOF] = "~EOF~" };
    308 
    309 	printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]);
    310 	fwrite(contents + t->start, 1, t->end - t->start, stdout);
    311 	printf("'\n");
    312 }
    313 
    314 static void
    315 add_token(Lexer* lex, Token tok)
    316 {
    317 	if (lex->token_count >= lex->token_cap) {
    318 		lex->token_cap *= 2;
    319 		lex->tokens = (Token*)realloc(lex->tokens, sizeof(Token) * lex->token_cap);
    320 	}
    321 	lex->tokens[lex->token_count++] = tok;
    322 }
    323 
    324 void
    325 lexer_print(Lexer* lex)
    326 {
    327 	for (size_t i = 0; i < lex->token_count; i++) {
    328 		print_token(&lex->tokens[i], lex->src);
    329 	}
    330 }
    331 
    332 void
    333 lexer_lex(Lexer* lex, const char* filename, const char* contents)
    334 {
    335 	lex->line = 1;
    336 	lex->col = 1;
    337 	lex->pos = 0;
    338 	lex->token_cap = 128;
    339 	lex->token_count = 0;
    340 	lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token));
    341 	if (lex->tokens == NULL) panic("lexer_lex: could not alloc");
    342 	lex->filename = filename;
    343 	lex->src = contents;
    344 	lex->src_len = strlen(contents);
    345 	for (;;) {
    346 		Token tok = next_token(lex);
    347 		add_token(lex, tok);
    348 		if (tok.type == TOKEN_EOF) break;
    349 	}
    350 }
    351 
    352 const char*
    353 token_type_str(TokenType t)
    354 {
    355 	static const char* type_strings[] = { [TOKEN_IDENT] = "TOKEN_IDENT",
    356 		[TOKEN_LPAREN] = "TOKEN_LPAREN",
    357 		[TOKEN_RPAREN] = "TOKEN_RPAREN",
    358 		[TOKEN_LBRACE] = "TOKEN_LBRACE",
    359 		[TOKEN_RBRACE] = "TOKEN_RBRACE",
    360 		[TOKEN_LBRACKET] = "TOKEN_LBRACKET",
    361 		[TOKEN_RBRACKET] = "TOKEN_RBRACKET",
    362 		[TOKEN_EQUAL] = "TOKEN_EQUAL",
    363 		[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
    364 		[TOKEN_COMMA] = "TOKEN_COMMA",
    365 		[TOKEN_INT_LITERAL] = "TOKEN_INT_LITERAL",
    366 		[TOKEN_FLOAT_LITERAL] = "TOKEN_FLOAT_LITERAL",
    367 		[TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL",
    368 		[TOKEN_SLASH] = "TOKEN_SLASH",
    369 		[TOKEN_STAR] = "TOKEN_STAR",
    370 		[TOKEN_PLUS] = "TOKEN_PLUS",
    371 		[TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS",
    372 		[TOKEN_MINUS] = "TOKEN_MINUS",
    373 		[TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS",
    374 		[TOKEN_EQUALITY] = "TOKEN_EQUALITY",
    375 		[TOKEN_INEQUALITY] = "TOKEN_INEQUALITY",
    376 		[TOKEN_BANG] = "TOKEN_BANG",
    377 		[TOKEN_LT] = "TOKEN_LT",
    378 		[TOKEN_GT] = "TOKEN_GT",
    379 		[TOKEN_LT_EQ] = "TOKEN_LT_EQ",
    380 		[TOKEN_GT_EQ] = "TOKEN_GT_EQ",
    381 		[TOKEN_IF] = "TOKEN_IF",
    382 		[TOKEN_ELSE] = "TOKEN_ELSE",
    383 		[TOKEN_WHILE] = "TOKEN_WHILE",
    384 		[TOKEN_FOR] = "TOKEN_FOR",
    385 		[TOKEN_RETURN] = "TOKEN_RETURN",
    386 		[TOKEN_CONTINUE] = "TOKEN_CONTINUE",
    387 		[TOKEN_BREAK] = "TOKEN_BREAK",
    388 		[TOKEN_PERCENT] = "TOKEN_PERCENT",
    389 		[TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
    390 		[TOKEN_EOF] = "TOKEN_EOF" };
    391 	if (t >= TOKEN_IDENT && t <= TOKEN_EOF) {
    392 		return type_strings[t];
    393 	} else {
    394 		return "UNKNOWN_TOKEN_TYPE";
    395 	}
    396 }
	ox The Ox programming language, compiler and tools (WIP)
	Log \| Files \| Refs \| README \| LICENSE