lexer.c (9247B)
1 #include "lexer.h" 2 #include "types.h" 3 #include "utils.h" 4 5 #include <string.h> 6 #include <stdlib.h> 7 #include <stdio.h> 8 #include <ctype.h> 9 #include <assert.h> 10 11 static char 12 peek(Lexer* lex) 13 { 14 char c = lex->src[lex->pos]; 15 return c ? c : 0; 16 } 17 18 static char 19 peek2(Lexer* lex) 20 { 21 char c = lex->src[lex->pos]; 22 if (!c) return 0; 23 c = lex->src[lex->pos + 1]; 24 return c ? c : 0; 25 } 26 27 static char 28 nudge(Lexer* lex) 29 { 30 const char c = peek(lex); 31 if (!c) return 0; 32 if (c == '\n') { 33 lex->line++; 34 lex->col = 1; 35 } else { 36 lex->col++; 37 } 38 lex->pos++; 39 return c; 40 } 41 42 static void 43 skip_space_and_comments(Lexer* lex) 44 { 45 for (;;) { 46 for (;;) { 47 char c = peek(lex); 48 if (c == ' ' || c == '\t' || c == '\r' || c == '\n') 49 nudge(lex); 50 else 51 break; 52 } 53 if (peek(lex) == '/' && peek2(lex) == '/') { 54 nudge(lex); 55 nudge(lex); 56 while (peek(lex) != '\n' && peek(lex) != 0) 57 nudge(lex); 58 continue; 59 } 60 break; 61 } 62 } 63 64 static Token 65 make_ident(Lexer* lex, size_t pos, size_t line, size_t col) 66 { 67 for (;;) { 68 char c = peek(lex); 69 if (c == '_' || isalnum(c)) 70 nudge(lex); 71 else 72 break; 73 } 74 75 TokenType type = TOKEN_IDENT; 76 77 if ((lex->pos - pos) == 6 && strncmp(lex->src + pos, "return", 6) == 0) 78 type = TOKEN_RETURN; 79 else if ((lex->pos - pos) == 3 && strncmp(lex->src + pos, "for", 3) == 0) 80 type = TOKEN_FOR; 81 else if ((lex->pos - pos) == 2 && strncmp(lex->src + pos, "if", 2) == 0) 82 type = TOKEN_IF; 83 else if ((lex->pos - pos) == 4 && strncmp(lex->src + pos, "else", 4) == 0) 84 type = TOKEN_ELSE; 85 else if ((lex->pos - pos) == 8 && strncmp(lex->src + pos, "continue", 8) == 0) 86 type = TOKEN_CONTINUE; 87 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "break", 5) == 0) 88 type = TOKEN_BREAK; 89 else if ((lex->pos - pos) == 5 && strncmp(lex->src + pos, "while", 5) == 0) 90 type = TOKEN_WHILE; 91 92 // Check for keywords, or a Capitalised CustomType 93 // -- @later do it in the parser, keep types as unprotected names 94 // if (strncmp(lex->src + pos, "int", lex->pos - pos) == 0) 95 // type = TOKEN_INT; 96 // else if (strncmp(lex->src + pos, "float", lex->pos - pos) == 0) 97 // type = TOKEN_FLOAT; 98 // else if (strncmp(lex->src + pos, "void", lex->pos - pos) == 0) 99 // type = TOKEN_VOID; 100 // else if (strncmp(lex->src + pos, "string", lex->pos - pos) == 0) 101 // type = TOKEN_STRING; 102 // else { 103 // // If identifier starts with a capital letter, treat as TOKEN_TYPE 104 // char first = lex->src[pos]; 105 // if (first >= 'A' && first <= 'Z') { 106 // type = TOKEN_TYPE; 107 // } 108 // } 109 110 return (Token) { .type = type, .start = pos, .line = line, .col = col, .end = lex->pos }; 111 } 112 static Token 113 make_number(Lexer* lex, size_t pos, size_t line, size_t col) 114 { 115 bool is_float = false; 116 117 if (peek(lex) == '-') { 118 nudge(lex); // pass through negative values 119 } 120 121 while (isdigit(peek(lex))) 122 nudge(lex); 123 if (peek(lex) == '.' && isdigit(peek2(lex))) { 124 is_float = true; 125 nudge(lex); 126 while (isdigit(peek(lex))) 127 nudge(lex); 128 } 129 return (Token) { 130 .type = is_float ? TOKEN_FLOAT_LITERAL : TOKEN_INT_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col 131 }; 132 } 133 134 static Token 135 make_string(Lexer* lex, size_t pos, size_t line, size_t col) 136 { 137 nudge(lex); // " start 138 while (peek(lex) != '"' && peek(lex) != 0) 139 nudge(lex); 140 if (peek(lex) == '"') nudge(lex); // " end 141 return (Token) { .type = TOKEN_STRING_LITERAL, .start = pos, .end = lex->pos, .line = line, .col = col }; 142 } 143 144 static Token 145 next_token(Lexer* lex) 146 { 147 skip_space_and_comments(lex); 148 size_t start = lex->pos; 149 size_t line = lex->line; 150 size_t col = lex->col; 151 const char c = peek(lex); 152 153 if (c == 0) return (Token) { .type = TOKEN_EOF, .start = start, .end = lex->pos, .col = col, .line = line }; 154 155 if (isalpha(c) || c == '_') return make_ident(lex, start, line, col); 156 157 if (isdigit(c)) return make_number(lex, start, line, col); 158 if (c == '-' && isdigit(peek2(lex))) { return make_number(lex, start, line, col); } 159 160 if (c == '"') return make_string(lex, start, line, col); 161 162 TokenType type = TOKEN_UNKNOWN; 163 164 switch (c) { 165 case '(': 166 nudge(lex); 167 type = TOKEN_LPAREN; 168 break; 169 case ')': 170 nudge(lex); 171 type = TOKEN_RPAREN; 172 break; 173 case '{': 174 nudge(lex); 175 type = TOKEN_LBRACE; 176 break; 177 case '}': 178 nudge(lex); 179 type = TOKEN_RBRACE; 180 break; 181 case '[': 182 nudge(lex); 183 type = TOKEN_LBRACKET; 184 break; 185 case ']': 186 nudge(lex); 187 type = TOKEN_RBRACKET; 188 break; 189 case ';': 190 nudge(lex); 191 type = TOKEN_SEMICOLON; 192 break; 193 case '%': 194 nudge(lex); 195 type = TOKEN_PERCENT; 196 break; 197 case '/': 198 nudge(lex); 199 type = TOKEN_SLASH; 200 break; 201 case '*': 202 nudge(lex); 203 type = TOKEN_STAR; 204 break; 205 case '+': 206 nudge(lex); 207 if (peek(lex) == '+') { 208 nudge(lex); 209 type = TOKEN_PLUSPLUS; 210 } else { 211 type = TOKEN_PLUS; 212 } 213 break; 214 case '-': 215 nudge(lex); 216 if (peek(lex) == '-') { 217 nudge(lex); 218 type = TOKEN_MINUSMINUS; 219 } else { 220 type = TOKEN_MINUS; 221 } 222 break; 223 case ',': 224 nudge(lex); 225 type = TOKEN_COMMA; 226 break; 227 case '=': 228 nudge(lex); 229 if (peek(lex) == '=') { 230 nudge(lex); 231 type = TOKEN_EQUALITY; 232 } else { 233 type = TOKEN_EQUAL; 234 } 235 break; 236 case '!': 237 nudge(lex); 238 if (peek(lex) == '=') { 239 nudge(lex); 240 type = TOKEN_INEQUALITY; 241 } else { 242 type = TOKEN_BANG; 243 } 244 break; 245 case '>': 246 nudge(lex); 247 if (peek(lex) == '=') { 248 nudge(lex); 249 type = TOKEN_GT_EQ; 250 } else { 251 type = TOKEN_GT; 252 } 253 break; 254 case '<': 255 nudge(lex); 256 if (peek(lex) == '=') { 257 nudge(lex); 258 type = TOKEN_LT_EQ; 259 } else { 260 type = TOKEN_LT; 261 } 262 break; 263 default: 264 nudge(lex); 265 type = TOKEN_UNKNOWN; 266 break; 267 } 268 Token t = (Token) { .type = type, .start = start, .end = lex->pos, .col = col, .line = line }; 269 return t; 270 } 271 272 static void 273 print_token(const Token* t, const char* contents) 274 { 275 static const char* TYPES[] = { [TOKEN_IDENT] = "ident/type", 276 [TOKEN_LPAREN] = "open paren", 277 [TOKEN_RPAREN] = "close paren", 278 [TOKEN_LBRACE] = "open brace", 279 [TOKEN_RBRACE] = "close brace", 280 [TOKEN_LBRACKET] = "open bracket", 281 [TOKEN_RBRACKET] = "close bracket", 282 [TOKEN_EQUAL] = "equal", 283 [TOKEN_SEMICOLON] = "semicol", 284 [TOKEN_COMMA] = "comma", 285 [TOKEN_INT_LITERAL] = "integer literal", 286 [TOKEN_FLOAT_LITERAL] = "float literal", 287 [TOKEN_STRING_LITERAL] = "string literal", 288 [TOKEN_SLASH] = "slash", 289 [TOKEN_STAR] = "star", 290 [TOKEN_PLUS] = "plus", 291 [TOKEN_PLUSPLUS] = "++", 292 [TOKEN_MINUS] = "minus", 293 [TOKEN_MINUSMINUS] = "--", 294 [TOKEN_EQUALITY] = "equality ==", 295 [TOKEN_INEQUALITY] = "inequality !=", 296 [TOKEN_BANG] = "bang !", 297 [TOKEN_LT] = "lower than", 298 [TOKEN_GT] = "greater than", 299 [TOKEN_LT_EQ] = "lt or = than", 300 [TOKEN_GT_EQ] = "gt or = than", 301 [TOKEN_IF] = "if", 302 [TOKEN_ELSE] = "else", 303 [TOKEN_WHILE] = "while", 304 [TOKEN_FOR] = "for", 305 [TOKEN_RETURN] = "return", 306 [TOKEN_UNKNOWN] = "< UNKNOWN >", 307 [TOKEN_EOF] = "~EOF~" }; 308 309 printf("L%zu:%zu \t%-14s '", t->line + 1, t->col + 1, TYPES[t->type]); 310 fwrite(contents + t->start, 1, t->end - t->start, stdout); 311 printf("'\n"); 312 } 313 314 static void 315 add_token(Lexer* lex, Token tok) 316 { 317 if (lex->token_count >= lex->token_cap) { 318 lex->token_cap *= 2; 319 lex->tokens = (Token*)realloc(lex->tokens, sizeof(Token) * lex->token_cap); 320 } 321 lex->tokens[lex->token_count++] = tok; 322 } 323 324 void 325 lexer_print(Lexer* lex) 326 { 327 for (size_t i = 0; i < lex->token_count; i++) { 328 print_token(&lex->tokens[i], lex->src); 329 } 330 } 331 332 void 333 lexer_lex(Lexer* lex, const char* filename, const char* contents) 334 { 335 lex->line = 1; 336 lex->col = 1; 337 lex->pos = 0; 338 lex->token_cap = 128; 339 lex->token_count = 0; 340 lex->tokens = (Token*)calloc(lex->token_cap, sizeof(Token)); 341 if (lex->tokens == NULL) panic("lexer_lex: could not alloc"); 342 lex->filename = filename; 343 lex->src = contents; 344 lex->src_len = strlen(contents); 345 for (;;) { 346 Token tok = next_token(lex); 347 add_token(lex, tok); 348 if (tok.type == TOKEN_EOF) break; 349 } 350 } 351 352 const char* 353 token_type_str(TokenType t) 354 { 355 static const char* type_strings[] = { [TOKEN_IDENT] = "TOKEN_IDENT", 356 [TOKEN_LPAREN] = "TOKEN_LPAREN", 357 [TOKEN_RPAREN] = "TOKEN_RPAREN", 358 [TOKEN_LBRACE] = "TOKEN_LBRACE", 359 [TOKEN_RBRACE] = "TOKEN_RBRACE", 360 [TOKEN_LBRACKET] = "TOKEN_LBRACKET", 361 [TOKEN_RBRACKET] = "TOKEN_RBRACKET", 362 [TOKEN_EQUAL] = "TOKEN_EQUAL", 363 [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON", 364 [TOKEN_COMMA] = "TOKEN_COMMA", 365 [TOKEN_INT_LITERAL] = "TOKEN_INT_LITERAL", 366 [TOKEN_FLOAT_LITERAL] = "TOKEN_FLOAT_LITERAL", 367 [TOKEN_STRING_LITERAL] = "TOKEN_STRING_LITERAL", 368 [TOKEN_SLASH] = "TOKEN_SLASH", 369 [TOKEN_STAR] = "TOKEN_STAR", 370 [TOKEN_PLUS] = "TOKEN_PLUS", 371 [TOKEN_PLUSPLUS] = "TOKEN_PLUSPLUS", 372 [TOKEN_MINUS] = "TOKEN_MINUS", 373 [TOKEN_MINUSMINUS] = "TOKEN_MINUSMINUS", 374 [TOKEN_EQUALITY] = "TOKEN_EQUALITY", 375 [TOKEN_INEQUALITY] = "TOKEN_INEQUALITY", 376 [TOKEN_BANG] = "TOKEN_BANG", 377 [TOKEN_LT] = "TOKEN_LT", 378 [TOKEN_GT] = "TOKEN_GT", 379 [TOKEN_LT_EQ] = "TOKEN_LT_EQ", 380 [TOKEN_GT_EQ] = "TOKEN_GT_EQ", 381 [TOKEN_IF] = "TOKEN_IF", 382 [TOKEN_ELSE] = "TOKEN_ELSE", 383 [TOKEN_WHILE] = "TOKEN_WHILE", 384 [TOKEN_FOR] = "TOKEN_FOR", 385 [TOKEN_RETURN] = "TOKEN_RETURN", 386 [TOKEN_CONTINUE] = "TOKEN_CONTINUE", 387 [TOKEN_BREAK] = "TOKEN_BREAK", 388 [TOKEN_PERCENT] = "TOKEN_PERCENT", 389 [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN", 390 [TOKEN_EOF] = "TOKEN_EOF" }; 391 if (t >= TOKEN_IDENT && t <= TOKEN_EOF) { 392 return type_strings[t]; 393 } else { 394 return "UNKNOWN_TOKEN_TYPE"; 395 } 396 }