lexer.c (10198B)
1 #include "lexer.h" 2 #include "types.h" 3 #include "utils.h" 4 5 #include <string.h> 6 #include <stdlib.h> 7 #include <stdio.h> 8 #include <ctype.h> 9 #include <assert.h> 10 11 static char 12 peek(Lexer* lex) 13 { 14 char c = lex->src[lex->pos]; 15 return c ? c : 0; 16 } 17 18 static char 19 peek2(Lexer* lex) 20 { 21 char c = lex->src[lex->pos]; 22 if (!c) return 0; 23 c = lex->src[lex->pos + 1]; 24 return c ? c : 0; 25 } 26 27 static char 28 nudge(Lexer* lex) 29 { 30 const char c = peek(lex); 31 if (!c) return 0; 32 if (c == '\n') { 33 lex->line++; 34 lex->col = 1; 35 } else { 36 lex->col++; 37 } 38 lex->pos++; 39 return c; 40 } 41 42 static void 43 skip_space_and_comments(Lexer* lex) 44 { 45 for (;;) { 46 for (;;) { 47 char c = peek(lex); 48 if (c == ' ' || c == '\t' || c == '\r' || c == '\n') 49 nudge(lex); 50 else 51 break; 52 } 53 if (peek(lex) == '/' && peek2(lex) == '/') { 54 nudge(lex); 55 nudge(lex); 56 while (peek(lex) != '\n' && peek(lex) != 0) 57 nudge(lex); 58 continue; 59 } 60 break; 61 } 62 } 63 64 static Token 65 make_ident(Lexer* lex, size_t pos, size_t line, size_t col) 66 { 67 for (;;) { 68 char c = peek(lex); 69 if (c == '_' || isalnum(c)) 70 nudge(lex); 71 else 72 break; 73 } 74 75 TokenType type = TOKEN_IDENT; 76 77 if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "return", 6) == 0) 78 type = TOKEN_RETURN; 79 else if ((lex->pos - pos) == 3 && strncmp(lex->src + pos, "for", 3) == 0) 80 type = TOKEN_FOR; 81 else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "if", 2) == 0) 82 type = TOKEN_IF; 83 else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "else", 4) == 0) 84 type = TOKEN_ELSE; 85 else if ((lex->pos - pos) == 8 && strncmp(lex->src + pos, "continue", 8) == 0) 86 type = TOKEN_CONTINUE; 87 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "break", 5) == 0) 88 type = TOKEN_BREAK; 89 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "while", 5) == 0) 90 type = TOKEN_WHILE; 91 92 else if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "struct", 6) == 0) 93 type = TOKEN_STRUCT; 94 else if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "extend", 6) == 0) 95 type = TOKEN_EXTEND; 96 97 else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "true", 4) == 0) 98 type = TOKEN_BOOL_TRUE_LITERAL; 99 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "false", 5) == 0) 100 type = TOKEN_BOOL_FALSE_LITERAL; 101 102 else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "fx", 2) == 0) 103 type = TOKEN_FX; 104 else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "fn", 2) == 0) 105 type = TOKEN_FN; 106 // Check for keywords, or a Capitalised CustomType 107 // -- @later do it in the parser, keep types as unprotected names 108 // if (strncmp(lex->src + pos, "int", lex->pos - pos) == 0) 109 // type = TOKEN_INT; 110 // else if (strncmp(lex->src + pos, "float", lex->pos - pos) == 0) 111 // type = TOKEN_FLOAT; 112 // else if (strncmp(lex->src + pos, "void", lex->pos - pos) == 0) 113 // type = TOKEN_VOID; 114 // else if (strncmp(lex->src + pos, "string", lex->pos - pos) == 0) 115 // type = TOKEN_STRING; 116 // else { 117 // // If identifier starts with a capital letter, treat as TOKEN_TYPE 118 // char first = lex->src[pos]; 119 // if (first >= 'A' && first <= 'Z') { 120 // type = TOKEN_TYPE; 121 // } 122 // } 123 124 return (Token) { .type = type, .start = pos, .line = line, .col = col, .end = lex->pos }; 125 } 126 static Token 127 make_number(Lexer* lex, size_t pos, size_t line, size_t col) 128 { 129 bool is_float = false; 130 131 if (peek(lex) == '-') { 132 nudge(lex); // pass through negative values 133 } 134 135 while (isdigit(peek(lex))) 136 nudge(lex); 137 if (peek(lex) == '.' && isdigit(peek2(lex))) { 138 is_float = true; 139 nudge(lex); 140 while (isdigit(peek(lex))) 141 nudge(lex); 142 } 143 return (Token) { 144 .type = is_float ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col 145 }; 146 } 147 148 static Token 149 make_string(Lexer* lex, size_t pos, size_t line, size_t col) 150 { 151 nudge(lex); // " start 152 while (peek(lex) != '"' && peek(lex) != 0) 153 nudge(lex); 154 if (peek(lex) == '"') nudge(lex); // " end 155 return (Token) { .type = TOKEN_STRING_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col }; 156 } 157 158 static Token 159 next_token(Lexer* lex) 160 { 161 skip_space_and_comments(lex); 162 size_t start = lex->pos; 163 size_t line = lex->line; 164 size_t col = lex->col; 165 const char c = peek(lex); 166 167 if (c == 0) return (Token) { .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line }; 168 169 if (isalpha(c) || c == '_') return make_ident(lex, start, line, col); 170 171 if (isdigit(c)) return make_number(lex, start, line, col); 172 if (c == '-' && isdigit(peek2(lex))) { return make_number(lex, start, line, col); } 173 174 if (c == '"') return make_string(lex, start, line, col); 175 176 TokenType type = TOKEN_UNKNOWN; 177 178 switch (c) { 179 case '(': 180 nudge(lex); 181 type = TOKEN_LPAREN; 182 break; 183 case ')': 184 nudge(lex); 185 type = TOKEN_RPAREN; 186 break; 187 case '{': 188 nudge(lex); 189 type = TOKEN_LBRACE; 190 break; 191 case '}': 192 nudge(lex); 193 type = TOKEN_RBRACE; 194 break; 195 case '[': 196 nudge(lex); 197 type = TOKEN_LBRACKET; 198 break; 199 case ']': 200 nudge(lex); 201 type = TOKEN_RBRACKET; 202 break; 203 case ';': 204 nudge(lex); 205 type = TOKEN_SEMICOLON; 206 break; 207 case '%': 208 nudge(lex); 209 type = TOKEN_PERCENT; 210 break; 211 case '#': 212 nudge(lex); 213 type = TOKEN_COMP_TIME; 214 break; 215 case '~': 216 nudge(lex); 217 type = TOKEN_VARIADIC; 218 break; 219 case '/': 220 nudge(lex); 221 type = TOKEN_SLASH; 222 break; 223 case '*': 224 nudge(lex); 225 type = TOKEN_STAR; 226 break; 227 case '+': 228 nudge(lex); 229 if (peek(lex) == '+') { 230 nudge(lex); 231 type = TOKEN_PLUSPLUS; 232 } else { 233 type = TOKEN_PLUS; 234 } 235 break; 236 case '-': 237 nudge(lex); 238 if (peek(lex) == '-') { 239 nudge(lex); 240 type = TOKEN_MINUSMINUS; 241 } else { 242 type = TOKEN_MINUS; 243 } 244 break; 245 case ',': 246 nudge(lex); 247 type = TOKEN_COMMA; 248 break; 249 case '=': 250 nudge(lex); 251 if (peek(lex) == '=') { 252 nudge(lex); 253 type = TOKEN_EQUALITY; 254 } else { 255 type = TOKEN_EQUAL; 256 } 257 break; 258 case '!': 259 nudge(lex); 260 if (peek(lex) == '=') { 261 nudge(lex); 262 type = TOKEN_INEQUALITY; 263 } else { 264 type = TOKEN_BANG; 265 } 266 break; 267 case '>': 268 nudge(lex); 269 if (peek(lex) == '=') { 270 nudge(lex); 271 type = TOKEN_GT_EQ; 272 } else { 273 type = TOKEN_GT; 274 } 275 break; 276 case '<': 277 nudge(lex); 278 if (peek(lex) == '=') { 279 nudge(lex); 280 type = TOKEN_LT_EQ; 281 } else { 282 type = TOKEN_LT; 283 } 284 break; 285 default: 286 nudge(lex); 287 type = TOKEN_UNKNOWN; 288 break; 289 } 290 Token t = (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line }; 291 return t; 292 } 293 294 static void 295 print_token(const Token* t, const char* contents) 296 { 297 static const char* TYPES[] = { [TOKEN_IDENT] = "ident/type", 298 [TOKEN_LPAREN] = "open paren", 299 [TOKEN_RPAREN] = "close paren", 300 [TOKEN_LBRACE] = "open brace", 301 [TOKEN_RBRACE] = "close brace", 302 [TOKEN_LBRACKET] = "open bracket", 303 [TOKEN_RBRACKET] = "close bracket", 304 [TOKEN_EQUAL] = "equal", 305 [TOKEN_SEMICOLON] = "semicol", 306 [TOKEN_COMMA] = "comma", 307 [TOKEN_INT_LITERAL] = "integer literal", 308 [TOKEN_FLOAT_LITERAL] = "float literal", 309 [TOKEN_STRING_LITERAL] = "string literal", 310 [TOKEN_BOOL_TRUE_LITERAL] = "bool TRUE literal", 311 [TOKEN_BOOL_FALSE_LITERAL] = "bool FALSE literal", 312 [TOKEN_SLASH] = "slash", 313 [TOKEN_STAR] = "star", 314 [TOKEN_PLUS] = "plus", 315 [TOKEN_PLUSPLUS] = "++", 316 [TOKEN_MINUS] = "minus", 317 [TOKEN_MINUSMINUS] = "--", 318 [TOKEN_EQUALITY] = "equality ==", 319 [TOKEN_INEQUALITY] = "inequality !=", 320 [TOKEN_BANG] = "bang !", 321 [TOKEN_LT] = "lower than", 322 [TOKEN_GT] = "greater than", 323 [TOKEN_LT_EQ] = "lt or = than", 324 [TOKEN_GT_EQ] = "gt or = than", 325 [TOKEN_IF] = "if", 326 [TOKEN_ELSE] = "else", 327 [TOKEN_WHILE] = "while", 328 [TOKEN_FOR] = "for", 329 [TOKEN_RETURN] = "return", 330 [TOKEN_UNKNOWN] = "< UNKNOWN >", 331 [TOKEN_EOF] = "~EOF~" }; 332 333 printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]); 334 fwrite(contents + t->start, 1, t->end - t->start, stdout); 335 printf("'\n"); 336 } 337 338 static void 339 add_token(Lexer* lex, Token tok) 340 { 341 if (lex->token_count >= lex->token_cap) { 342 lex->token_cap *= 2; 343 lex->tokens = (Token*)realloc(lex->tokens, sizeof(Token) * lex->token_cap); 344 } 345 lex->tokens[lex->token_count++] = tok; 346 } 347 348 void 349 lexer_print(Lexer* lex) 350 { 351 for (size_t i = 0; i < lex->token_count; i++) { 352 print_token(&lex->tokens[i], lex->src); 353 } 354 } 355 356 void 357 lexer_lex(Lexer* lex, const char* filename, const char* contents) 358 { 359 lex->line = 1; 360 lex->col = 1; 361 lex->pos = 0; 362 lex->token_cap = 128; 363 lex->token_count = 0; 364 lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token)); 365 if (lex->tokens == NULL) panic("lexer_lex: could not alloc"); 366 lex->filename = filename; 367 lex->src = contents; 368 lex->src_len = strlen(contents); 369 for (;;) { 370 Token tok = next_token(lex); 371 add_token(lex, tok); 372 if (tok.type == TOKEN_EOF) break; 373 } 374 } 375 376 const char* 377 token_type_str(TokenType t) 378 { 379 static const char* type_strings[] = { [TOKEN_IDENT] = "TOKEN_IDENT", 380 [TOKEN_LPAREN] = "TOKEN_LPAREN", 381 [TOKEN_RPAREN] = "TOKEN_RPAREN", 382 [TOKEN_LBRACE] = "TOKEN_LBRACE", 383 [TOKEN_RBRACE] = "TOKEN_RBRACE", 384 [TOKEN_LBRACKET] = "TOKEN_LBRACKET", 385 [TOKEN_RBRACKET] = "TOKEN_RBRACKET", 386 [TOKEN_EQUAL] = "TOKEN_EQUAL", 387 [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON", 388 [TOKEN_COMMA] = "TOKEN_COMMA", 389 [TOKEN_INT_LITERAL] = "TOKEN_INT_LITERAL", 390 [TOKEN_FLOAT_LITERAL] = "TOKEN_FLOAT_LITERAL", 391 [TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL", 392 [TOKEN_BOOL_TRUE_LITERAL] = "TOKEN_BOOL_TRUE_LITERAL", 393 [TOKEN_BOOL_FALSE_LITERAL] = "TOKEN_BOOL_FALSE_LITERAL", 394 [TOKEN_SLASH] = "TOKEN_SLASH", 395 [TOKEN_STAR] = "TOKEN_STAR", 396 [TOKEN_PLUS] = "TOKEN_PLUS", 397 [TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS", 398 [TOKEN_MINUS] = "TOKEN_MINUS", 399 [TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS", 400 [TOKEN_EQUALITY] = "TOKEN_EQUALITY", 401 [TOKEN_INEQUALITY] = "TOKEN_INEQUALITY", 402 [TOKEN_BANG] = "TOKEN_BANG", 403 [TOKEN_LT] = "TOKEN_LT", 404 [TOKEN_GT] = "TOKEN_GT", 405 [TOKEN_LT_EQ] = "TOKEN_LT_EQ", 406 [TOKEN_GT_EQ] = "TOKEN_GT_EQ", 407 [TOKEN_IF] = "TOKEN_IF", 408 [TOKEN_ELSE] = "TOKEN_ELSE", 409 [TOKEN_WHILE] = "TOKEN_WHILE", 410 [TOKEN_FOR] = "TOKEN_FOR", 411 [TOKEN_RETURN] = "TOKEN_RETURN", 412 [TOKEN_CONTINUE] = "TOKEN_CONTINUE", 413 [TOKEN_BREAK] = "TOKEN_BREAK", 414 [TOKEN_PERCENT] = "TOKEN_PERCENT", 415 [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", 416 [TOKEN_EOF] = "TOKEN_EOF" }; 417 if (t >= TOKEN_IDENT && t <= TOKEN_EOF) { 418 return type_strings[t]; 419 } else { 420 return "UNKNOWN_TOKEN_TYPE"; 421 } 422 }