lexer.c (8957B)
1 #include "lexer.h" 2 #include "utils.h" 3 4 #include <string.h> 5 #include <stdlib.h> 6 #include <stdio.h> 7 #include <ctype.h> 8 #include <assert.h> 9 10 static char peek(Lexer* lex) 11 { 12 char c = lex->src[lex->pos]; 13 return c ? c : 0; 14 } 15 16 static char peek2(Lexer* lex) 17 { 18 char c = lex->src[lex->pos]; 19 if (!c) 20 return 0; 21 c = lex->src[lex->pos + 1]; 22 return c ? c : 0; 23 } 24 25 static char nudge(Lexer* lex) 26 { 27 const char c = peek(lex); 28 if (!c) 29 return 0; 30 if (c == '\n') { 31 lex->line++; 32 lex->col = 1; 33 } else { 34 lex->col++; 35 } 36 lex->pos++; 37 return c; 38 } 39 40 static void skip_space_and_comments(Lexer* lex) 41 { 42 for (;;) { 43 for (;;) { 44 char c = peek(lex); 45 if (c == ' ' || c == '\t' || c == '\r' || c == '\n') 46 nudge(lex); 47 else 48 break; 49 } 50 if (peek(lex) == '/' && peek2(lex) == '/') { 51 nudge(lex); 52 nudge(lex); 53 while (peek(lex) != '\n' && peek(lex) != 0) 54 nudge(lex); 55 continue; 56 } 57 break; 58 } 59 } 60 61 static Token make_ident(Lexer* lex, size_t pos, size_t line, size_t col) 62 { 63 for (;;) { 64 char c = peek(lex); 65 if (c == '_' || isalnum(c)) 66 nudge(lex); 67 else 68 break; 69 } 70 71 TokenType type = TOKEN_IDENT; 72 73 if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "return", 6) == 0) 74 type = TOKEN_RETURN; 75 else if ((lex->pos - pos) == 3 && strncmp(lex->src + pos, "for", 3) == 0) 76 type = TOKEN_FOR; 77 else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "if", 2) == 0) 78 type = TOKEN_IF; 79 else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "else", 4) == 0) 80 type = TOKEN_ELSE; 81 else if ((lex->pos - pos) == 8 && strncmp(lex->src + pos, "continue", 8) == 0) 82 type = TOKEN_CONTINUE; 83 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "break", 5) == 0) 84 type = TOKEN_BREAK; 85 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "while", 5) == 0) 86 type = TOKEN_WHILE; 87 88 // Check for keywords, or a Capitalised CustomType 89 // -- @later do it in the parser, keep types as unprotected names 90 // if (strncmp(lex->src + pos, "int", lex->pos - pos) == 0) 91 // type = TOKEN_INT; 92 // else if (strncmp(lex->src + pos, "float", lex->pos - pos) == 0) 93 // type = TOKEN_FLOAT; 94 // else if (strncmp(lex->src + pos, "void", lex->pos - pos) == 0) 95 // type = TOKEN_VOID; 96 // else if (strncmp(lex->src + pos, "string", lex->pos - pos) == 0) 97 // type = TOKEN_STRING; 98 // else { 99 // // If identifier starts with a capital letter, treat as TOKEN_TYPE 100 // char first = lex->src[pos]; 101 // if (first >= 'A' && first <= 'Z') { 102 // type = TOKEN_TYPE; 103 // } 104 // } 105 106 return (Token) { .type = type, 107 .start = pos, 108 .line = line, 109 .col = col, 110 .end = lex->pos }; 111 } 112 static Token make_number(Lexer* lex, size_t pos, size_t line, size_t col) 113 { 114 while (isdigit(peek(lex))) 115 nudge(lex); 116 if (peek(lex) == '.' && isdigit(peek2(lex))) { 117 nudge(lex); 118 while (isdigit(peek(lex))) 119 nudge(lex); 120 } 121 return (Token) { 122 .type = TOKEN_NUMBER_LITERAL, 123 .start = pos, 124 .end = lex->pos, 125 .line = line, 126 .col = col 127 }; 128 } 129 130 static Token make_string(Lexer* lex, size_t pos, size_t line, size_t col) 131 { 132 nudge(lex); // " start 133 while (peek(lex) != '"' && peek(lex) != 0) 134 nudge(lex); 135 if (peek(lex) == '"') 136 nudge(lex); // " end 137 return (Token) { 138 .type = TOKEN_STRING_LITERAL, 139 .start = pos, 140 .end = lex->pos, 141 .line = line, 142 .col = col 143 }; 144 } 145 146 static Token next_token(Lexer* lex) 147 { 148 skip_space_and_comments(lex); 149 size_t start = lex->pos; 150 size_t line = lex->line; 151 size_t col = lex->col; 152 const char c = peek(lex); 153 154 if (c == 0) 155 return (Token) { 156 .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line 157 }; 158 159 if (isalpha(c) || c == '_') 160 return make_ident(lex, start, line, col); 161 if (isdigit(c)) 162 return make_number(lex, start, line, col); 163 if (c == '"') 164 return make_string(lex, start, line, col); 165 166 TokenType type = TOKEN_UNKNOWN; 167 168 switch (c) { 169 case '(': 170 nudge(lex); 171 type = TOKEN_LPAREN; 172 break; 173 case ')': 174 nudge(lex); 175 type = TOKEN_RPAREN; 176 break; 177 case '{': 178 nudge(lex); 179 type = TOKEN_LBRACE; 180 break; 181 case '}': 182 nudge(lex); 183 type = TOKEN_RBRACE; 184 break; 185 case '[': 186 nudge(lex); 187 type = TOKEN_LBRACKET; 188 break; 189 case ']': 190 nudge(lex); 191 type = TOKEN_RBRACKET; 192 break; 193 case ';': 194 nudge(lex); 195 type = TOKEN_SEMICOLON; 196 break; 197 case '%': 198 nudge(lex); 199 type = TOKEN_PERCENT; 200 break; 201 case '/': 202 nudge(lex); 203 type = TOKEN_SLASH; 204 break; 205 case '*': 206 nudge(lex); 207 type = TOKEN_UNKNOWN; 208 break; 209 case '+': 210 nudge(lex); 211 if (peek(lex) == '+') { 212 nudge(lex); 213 type = TOKEN_PLUSPLUS; 214 } else { 215 type = TOKEN_PLUS; 216 } 217 break; 218 case '-': 219 nudge(lex); 220 if (peek(lex) == '-') { 221 nudge(lex); 222 type = TOKEN_MINUSMINUS; 223 } else { 224 type = TOKEN_MINUS; 225 } 226 break; 227 case ',': 228 nudge(lex); 229 type = TOKEN_COMMA; 230 break; 231 case '=': 232 nudge(lex); 233 if (peek(lex) == '=') { 234 nudge(lex); 235 type = TOKEN_EQUALITY; 236 } else { 237 type = TOKEN_EQUAL; 238 } 239 break; 240 case '!': 241 nudge(lex); 242 if (peek(lex) == '=') { 243 nudge(lex); 244 type = TOKEN_INEQUALITY; 245 } else { 246 type = TOKEN_BANG; 247 } 248 break; 249 case '>': 250 nudge(lex); 251 if (peek(lex) == '=') { 252 nudge(lex); 253 type = TOKEN_GT_EQ; 254 } else { 255 type = TOKEN_GT; 256 } 257 break; 258 case '<': 259 nudge(lex); 260 if (peek(lex) == '=') { 261 nudge(lex); 262 type = TOKEN_LT_EQ; 263 } else { 264 type = TOKEN_LT; 265 } 266 break; 267 default: 268 nudge(lex); 269 type = TOKEN_UNKNOWN; 270 break; 271 } 272 Token t = (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line }; 273 return t; 274 } 275 276 static void print_token(const Token* t, const char* contents) 277 { 278 static const char* TYPES[] = { 279 [TOKEN_IDENT] = "ident/type", 280 [TOKEN_LPAREN] = "open paren", 281 [TOKEN_RPAREN] = "close paren", 282 [TOKEN_LBRACE] = "open brace", 283 [TOKEN_RBRACE] = "close brace", 284 [TOKEN_LBRACKET] = "open bracket", 285 [TOKEN_RBRACKET] = "close bracket", 286 [TOKEN_EQUAL] = "equal", 287 [TOKEN_SEMICOLON] = "semicol", 288 [TOKEN_COMMA] = "comma", 289 [TOKEN_NUMBER_LITERAL] = "number", 290 [TOKEN_STRING_LITERAL] = "string literal", 291 [TOKEN_SLASH] = "slash", 292 [TOKEN_STAR] = "star", 293 [TOKEN_PLUS] = "plus", 294 [TOKEN_PLUSPLUS] = "++", 295 [TOKEN_MINUS] = "minus", 296 [TOKEN_MINUSMINUS] = "--", 297 [TOKEN_EQUALITY] = "equality ==", 298 [TOKEN_INEQUALITY] = "inequality !=", 299 [TOKEN_BANG] = "bang !", 300 [TOKEN_LT] = "lower than", 301 [TOKEN_GT] = "greater than", 302 [TOKEN_LT_EQ] = "lt or = than", 303 [TOKEN_GT_EQ] = "gt or = than", 304 [TOKEN_IF] = "if", 305 [TOKEN_ELSE] = "else", 306 [TOKEN_WHILE] = "while", 307 [TOKEN_FOR] = "for", 308 [TOKEN_RETURN] = "return", 309 [TOKEN_UNKNOWN] = "< UNKNOWN >", 310 [TOKEN_EOF] = "~EOF~" 311 }; 312 313 printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]); 314 fwrite(contents + t->start, 1, t->end - t->start, stdout); 315 printf("'\n"); 316 } 317 318 static void add_token(Lexer* lex, Token tok) 319 { 320 if (lex->token_count >= lex->token_cap) { 321 lex->token_cap *= 2; 322 lex->tokens = (Token*)realloc(lex->tokens, sizeof(Token) * lex->token_cap); 323 } 324 lex->tokens[lex->token_count++] = tok; 325 } 326 327 void lexer_print(Lexer* lex) 328 { 329 for (size_t i = 0; i < lex->token_count; i++) { 330 print_token(&lex->tokens[i], lex->src); 331 } 332 } 333 334 void lexer_lex(Lexer* lex, const char* filename, const char* contents) 335 { 336 lex->line = 1; 337 lex->col = 1; 338 lex->pos = 0; 339 lex->token_cap = 128; 340 lex->token_count = 0; 341 lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token)); 342 if(lex->tokens == NULL) panic("lexer_lex: could not alloc"); 343 lex->filename = filename; 344 lex->src = contents; 345 lex->src_len = strlen(contents); 346 for (;;) { 347 Token tok = next_token(lex); 348 add_token(lex, tok); 349 if (tok.type == TOKEN_EOF) 350 break; 351 } 352 } 353 354 const char* token_type_str(TokenType t) 355 { 356 static const char* type_strings[] = { 357 [TOKEN_IDENT] = "TOKEN_IDENT", 358 [TOKEN_LPAREN] = "TOKEN_LPAREN", 359 [TOKEN_RPAREN] = "TOKEN_RPAREN", 360 [TOKEN_LBRACE] = "TOKEN_LBRACE", 361 [TOKEN_RBRACE] = "TOKEN_RBRACE", 362 [TOKEN_LBRACKET] = "TOKEN_LBRACKET", 363 [TOKEN_RBRACKET] = "TOKEN_RBRACKET", 364 [TOKEN_EQUAL] = "TOKEN_EQUAL", 365 [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON", 366 [TOKEN_COMMA] = "TOKEN_COMMA", 367 [TOKEN_NUMBER_LITERAL] = "TOKEN_NUMBER_LITERAL", 368 [TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL", 369 [TOKEN_SLASH] = "TOKEN_SLASH", 370 [TOKEN_STAR] = "TOKEN_STAR", 371 [TOKEN_PLUS] = "TOKEN_PLUS", 372 [TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS", 373 [TOKEN_MINUS] = "TOKEN_MINUS", 374 [TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS", 375 [TOKEN_EQUALITY] = "TOKEN_EQUALITY", 376 [TOKEN_INEQUALITY] = "TOKEN_INEQUALITY", 377 [TOKEN_BANG] = "TOKEN_BANG", 378 [TOKEN_LT] = "TOKEN_LT", 379 [TOKEN_GT] = "TOKEN_GT", 380 [TOKEN_LT_EQ] = "TOKEN_LT_EQ", 381 [TOKEN_GT_EQ] = "TOKEN_GT_EQ", 382 [TOKEN_IF] = "TOKEN_IF", 383 [TOKEN_ELSE] = "TOKEN_ELSE", 384 [TOKEN_WHILE] = "TOKEN_WHILE", 385 [TOKEN_FOR] = "TOKEN_FOR", 386 [TOKEN_RETURN] = "TOKEN_RETURN", 387 [TOKEN_CONTINUE] = "TOKEN_CONTINUE", 388 [TOKEN_BREAK] = "TOKEN_BREAK", 389 [TOKEN_PERCENT] = "TOKEN_PERCENT", 390 [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", 391 [TOKEN_EOF] = "TOKEN_EOF" 392 }; 393 if (t >= TOKEN_IDENT && t <= TOKEN_EOF) { 394 return type_strings[t]; 395 } else { 396 return "UNKNOWN_TOKEN_TYPE"; 397 } 398 }