ox

The Ox programming language, compiler and tools (WIP)
Log | Files | Refs | README | LICENSE

lexer.c (8957B)


      1 #include "lexer.h"
      2 #include "utils.h"
      3 
      4 #include <string.h>
      5 #include <stdlib.h>
      6 #include <stdio.h>
      7 #include <ctype.h>
      8 #include <assert.h>
      9 
     10 static char peek(Lexer* lex)
     11 {
     12 	char c = lex->src[lex->pos];
     13 	return c ? c : 0;
     14 }
     15 
     16 static char peek2(Lexer* lex)
     17 {
     18 	char c = lex->src[lex->pos];
     19 	if (!c)
     20 		return 0;
     21 	c = lex->src[lex->pos + 1];
     22 	return c ? c : 0;
     23 }
     24 
     25 static char nudge(Lexer* lex)
     26 {
     27 	const char c = peek(lex);
     28 	if (!c)
     29 		return 0;
     30 	if (c == '\n') {
     31 		lex->line++;
     32 		lex->col = 1;
     33 	} else {
     34 		lex->col++;
     35 	}
     36 	lex->pos++;
     37 	return c;
     38 }
     39 
     40 static void skip_space_and_comments(Lexer* lex)
     41 {
     42 	for (;;) {
     43 		for (;;) {
     44 			char c = peek(lex);
     45 			if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
     46 				nudge(lex);
     47 			else
     48 				break;
     49 		}
     50 		if (peek(lex) == '/' && peek2(lex) == '/') {
     51 			nudge(lex);
     52 			nudge(lex);
     53 			while (peek(lex) != '\n' && peek(lex) != 0)
     54 				nudge(lex);
     55 			continue;
     56 		}
     57 		break;
     58 	}
     59 }
     60 
     61 static Token make_ident(Lexer* lex, size_t pos, size_t line, size_t col)
     62 {
     63 	for (;;) {
     64 		char c = peek(lex);
     65 		if (c == '_' || isalnum(c))
     66 			nudge(lex);
     67 		else
     68 			break;
     69 	}
     70 
     71 	TokenType type = TOKEN_IDENT;
     72 
     73 	if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "return", 6) == 0)
     74 		type = TOKEN_RETURN;
     75 	else if ((lex->pos - pos) == 3 && strncmp(lex->src + pos, "for", 3) == 0)
     76 		type = TOKEN_FOR;
     77 	else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "if", 2) == 0)
     78 		type = TOKEN_IF;
     79 	else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "else", 4) == 0)
     80 		type = TOKEN_ELSE;
     81 	else if ((lex->pos - pos) == 8 && strncmp(lex->src + pos, "continue", 8) == 0)
     82 		type = TOKEN_CONTINUE;
     83 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "break", 5) == 0)
     84 		type = TOKEN_BREAK;
     85 	else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "while", 5) == 0)
     86 		type = TOKEN_WHILE;
     87 
     88 	// Check for keywords, or a Capitalised CustomType
     89 	// -- @later do it in the parser, keep types as unprotected names
     90 	// if (strncmp(lex->src + pos, "int", lex->pos - pos) == 0)
     91 	//     type = TOKEN_INT;
     92 	// else if (strncmp(lex->src + pos, "float", lex->pos - pos) == 0)
     93 	//     type = TOKEN_FLOAT;
     94 	// else if (strncmp(lex->src + pos, "void", lex->pos - pos) == 0)
     95 	//     type = TOKEN_VOID;
     96 	// else if (strncmp(lex->src + pos, "string", lex->pos - pos) == 0)
     97 	//     type = TOKEN_STRING;
     98 	// else {
     99 	//     // If identifier starts with a capital letter, treat as TOKEN_TYPE
    100 	//     char first = lex->src[pos];
    101 	//     if (first >= 'A' && first <= 'Z') {
    102 	//         type = TOKEN_TYPE;
    103 	//     }
    104 	// }
    105 
    106 	return (Token) { .type = type,
    107 		.start = pos,
    108 		.line = line,
    109 		.col = col,
    110 		.end = lex->pos };
    111 }
    112 static Token make_number(Lexer* lex, size_t pos, size_t line, size_t col)
    113 {
    114 	while (isdigit(peek(lex)))
    115 		nudge(lex);
    116 	if (peek(lex) == '.' && isdigit(peek2(lex))) {
    117 		nudge(lex);
    118 		while (isdigit(peek(lex)))
    119 			nudge(lex);
    120 	}
    121 	return (Token) {
    122 		.type = TOKEN_NUMBER_LITERAL,
    123 		.start = pos,
    124 		.end = lex->pos,
    125 		.line = line,
    126 		.col = col
    127 	};
    128 }
    129 
    130 static Token make_string(Lexer* lex, size_t pos, size_t line, size_t col)
    131 {
    132 	nudge(lex); // " start
    133 	while (peek(lex) != '"' && peek(lex) != 0)
    134 		nudge(lex);
    135 	if (peek(lex) == '"')
    136 		nudge(lex); // " end
    137 	return (Token) {
    138 		.type = TOKEN_STRING_LITERAL,
    139 		.start = pos,
    140 		.end = lex->pos,
    141 		.line = line,
    142 		.col = col
    143 	};
    144 }
    145 
    146 static Token next_token(Lexer* lex)
    147 {
    148 	skip_space_and_comments(lex);
    149 	size_t start = lex->pos;
    150 	size_t line = lex->line;
    151 	size_t col = lex->col;
    152 	const char c = peek(lex);
    153 
    154 	if (c == 0)
    155 		return (Token) {
    156 			.type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line
    157 		};
    158 
    159 	if (isalpha(c) || c == '_')
    160 		return make_ident(lex, start, line, col);
    161 	if (isdigit(c))
    162 		return make_number(lex, start, line, col);
    163 	if (c == '"')
    164 		return make_string(lex, start, line, col);
    165 
    166 	TokenType type = TOKEN_UNKNOWN;
    167 
    168 	switch (c) {
    169 	case '(':
    170 		nudge(lex);
    171 		type = TOKEN_LPAREN;
    172 		break;
    173 	case ')':
    174 		nudge(lex);
    175 		type = TOKEN_RPAREN;
    176 		break;
    177 	case '{':
    178 		nudge(lex);
    179 		type = TOKEN_LBRACE;
    180 		break;
    181 	case '}':
    182 		nudge(lex);
    183 		type = TOKEN_RBRACE;
    184 		break;
    185 	case '[':
    186 		nudge(lex);
    187 		type = TOKEN_LBRACKET;
    188 		break;
    189 	case ']':
    190 		nudge(lex);
    191 		type = TOKEN_RBRACKET;
    192 		break;
    193 	case ';':
    194 		nudge(lex);
    195 		type = TOKEN_SEMICOLON;
    196 		break;
    197 	case '%':
    198 		nudge(lex);
    199 		type = TOKEN_PERCENT;
    200 		break;
    201 	case '/':
    202 		nudge(lex);
    203 		type = TOKEN_SLASH;
    204 		break;
    205 	case '*':
    206 		nudge(lex);
    207 		type = TOKEN_UNKNOWN;
    208 		break;
    209 	case '+':
    210 		nudge(lex);
    211 		if (peek(lex) == '+') {
    212 			nudge(lex);
    213 			type = TOKEN_PLUSPLUS;
    214 		} else {
    215 			type = TOKEN_PLUS;
    216 		}
    217 		break;
    218 	case '-':
    219 		nudge(lex);
    220 		if (peek(lex) == '-') {
    221 			nudge(lex);
    222 			type = TOKEN_MINUSMINUS;
    223 		} else {
    224 			type = TOKEN_MINUS;
    225 		}
    226 		break;
    227 	case ',':
    228 		nudge(lex);
    229 		type = TOKEN_COMMA;
    230 		break;
    231 	case '=':
    232 		nudge(lex);
    233 		if (peek(lex) == '=') {
    234 			nudge(lex);
    235 			type = TOKEN_EQUALITY;
    236 		} else {
    237 			type = TOKEN_EQUAL;
    238 		}
    239 		break;
    240 	case '!':
    241 		nudge(lex);
    242 		if (peek(lex) == '=') {
    243 			nudge(lex);
    244 			type = TOKEN_INEQUALITY;
    245 		} else {
    246 			type = TOKEN_BANG;
    247 		}
    248 		break;
    249 	case '>':
    250 		nudge(lex);
    251 		if (peek(lex) == '=') {
    252 			nudge(lex);
    253 			type = TOKEN_GT_EQ;
    254 		} else {
    255 			type = TOKEN_GT;
    256 		}
    257 		break;
    258 	case '<':
    259 		nudge(lex);
    260 		if (peek(lex) == '=') {
    261 			nudge(lex);
    262 			type = TOKEN_LT_EQ;
    263 		} else {
    264 			type = TOKEN_LT;
    265 		}
    266 		break;
    267 	default:
    268 		nudge(lex);
    269 		type = TOKEN_UNKNOWN;
    270 		break;
    271 	}
    272 	Token t = (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line };
    273 	return t;
    274 }
    275 
    276 static void print_token(const Token* t, const char* contents)
    277 {
    278 	static const char* TYPES[] = {
    279 		[TOKEN_IDENT] = "ident/type",
    280 		[TOKEN_LPAREN] = "open paren",
    281 		[TOKEN_RPAREN] = "close paren",
    282 		[TOKEN_LBRACE] = "open brace",
    283 		[TOKEN_RBRACE] = "close brace",
    284 		[TOKEN_LBRACKET] = "open bracket",
    285 		[TOKEN_RBRACKET] = "close bracket",
    286 		[TOKEN_EQUAL] = "equal",
    287 		[TOKEN_SEMICOLON] = "semicol",
    288 		[TOKEN_COMMA] = "comma",
    289 		[TOKEN_NUMBER_LITERAL] = "number",
    290 		[TOKEN_STRING_LITERAL] = "string literal",
    291 		[TOKEN_SLASH] = "slash",
    292 		[TOKEN_STAR] = "star",
    293 		[TOKEN_PLUS] = "plus",
    294 		[TOKEN_PLUSPLUS] = "++",
    295 		[TOKEN_MINUS] = "minus",
    296 		[TOKEN_MINUSMINUS] = "--",
    297 		[TOKEN_EQUALITY] = "equality ==",
    298 		[TOKEN_INEQUALITY] = "inequality !=",
    299 		[TOKEN_BANG] = "bang !",
    300 		[TOKEN_LT] = "lower than",
    301 		[TOKEN_GT] = "greater than",
    302 		[TOKEN_LT_EQ] = "lt or = than",
    303 		[TOKEN_GT_EQ] = "gt or = than",
    304 		[TOKEN_IF] = "if",
    305 		[TOKEN_ELSE] = "else",
    306 		[TOKEN_WHILE] = "while",
    307 		[TOKEN_FOR] = "for",
    308 		[TOKEN_RETURN] = "return",
    309 		[TOKEN_UNKNOWN] = "< UNKNOWN >",
    310 		[TOKEN_EOF] = "~EOF~"
    311 	};
    312 
    313 	printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]);
    314 	fwrite(contents + t->start, 1, t->end - t->start, stdout);
    315 	printf("'\n");
    316 }
    317 
    318 static void add_token(Lexer* lex, Token tok)
    319 {
    320 	if (lex->token_count >= lex->token_cap) {
    321 		lex->token_cap *= 2;
    322 		lex->tokens = (Token*)realloc(lex->tokens, sizeof(Token) * lex->token_cap);
    323 	}
    324 	lex->tokens[lex->token_count++] = tok;
    325 }
    326 
    327 void lexer_print(Lexer* lex)
    328 {
    329 	for (size_t i = 0; i < lex->token_count; i++) {
    330 		print_token(&lex->tokens[i], lex->src);
    331 	}
    332 }
    333 
    334 void lexer_lex(Lexer* lex, const char* filename, const char* contents)
    335 {
    336 	lex->line = 1;
    337 	lex->col = 1;
    338 	lex->pos = 0;
    339 	lex->token_cap = 128;
    340 	lex->token_count = 0;
    341 	lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token));
    342 	if(lex->tokens == NULL) panic("lexer_lex: could not alloc");
    343 	lex->filename = filename;
    344 	lex->src = contents;
    345 	lex->src_len = strlen(contents);
    346 	for (;;) {
    347 		Token tok = next_token(lex);
    348 		add_token(lex, tok);
    349 		if (tok.type == TOKEN_EOF)
    350 			break;
    351 	}
    352 }
    353 
    354 const char* token_type_str(TokenType t)
    355 {
    356 	static const char* type_strings[] = {
    357 		[TOKEN_IDENT] = "TOKEN_IDENT",
    358 		[TOKEN_LPAREN] = "TOKEN_LPAREN",
    359 		[TOKEN_RPAREN] = "TOKEN_RPAREN",
    360 		[TOKEN_LBRACE] = "TOKEN_LBRACE",
    361 		[TOKEN_RBRACE] = "TOKEN_RBRACE",
    362 		[TOKEN_LBRACKET] = "TOKEN_LBRACKET",
    363 		[TOKEN_RBRACKET] = "TOKEN_RBRACKET",
    364 		[TOKEN_EQUAL] = "TOKEN_EQUAL",
    365 		[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
    366 		[TOKEN_COMMA] = "TOKEN_COMMA",
    367 		[TOKEN_NUMBER_LITERAL] = "TOKEN_NUMBER_LITERAL",
    368 		[TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL",
    369 		[TOKEN_SLASH] = "TOKEN_SLASH",
    370 		[TOKEN_STAR] = "TOKEN_STAR",
    371 		[TOKEN_PLUS] = "TOKEN_PLUS",
    372 		[TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS",
    373 		[TOKEN_MINUS] = "TOKEN_MINUS",
    374 		[TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS",
    375 		[TOKEN_EQUALITY] = "TOKEN_EQUALITY",
    376 		[TOKEN_INEQUALITY] = "TOKEN_INEQUALITY",
    377 		[TOKEN_BANG] = "TOKEN_BANG",
    378 		[TOKEN_LT] = "TOKEN_LT",
    379 		[TOKEN_GT] = "TOKEN_GT",
    380 		[TOKEN_LT_EQ] = "TOKEN_LT_EQ",
    381 		[TOKEN_GT_EQ] = "TOKEN_GT_EQ",
    382 		[TOKEN_IF] = "TOKEN_IF",
    383 		[TOKEN_ELSE] = "TOKEN_ELSE",
    384 		[TOKEN_WHILE] = "TOKEN_WHILE",
    385 		[TOKEN_FOR] = "TOKEN_FOR",
    386 		[TOKEN_RETURN] = "TOKEN_RETURN",
    387 		[TOKEN_CONTINUE] = "TOKEN_CONTINUE",
    388 		[TOKEN_BREAK] = "TOKEN_BREAK",
    389 		[TOKEN_PERCENT] = "TOKEN_PERCENT",
    390 		[TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
    391 		[TOKEN_EOF] = "TOKEN_EOF"
    392 	};
    393 	if (t >= TOKEN_IDENT && t <= TOKEN_EOF) {
    394 		return type_strings[t];
    395 	} else {
    396 		return "UNKNOWN_TOKEN_TYPE";
    397 	}
    398 }