mighty

The mighty programming language, compiler and tools (WIP)
Log | Files | Refs

lexer.lua (5001B)



--- Lexer module table; also holds the mutable scanning state.
local lexer = {
	-- Source position attached to the next token that gets added.
	file = "---", -- placeholder until lex() stores the real file name
	line = 1,
	col = 1,

	-- Tokens produced so far, in the order they were added.
	tokens = {},
}

-- Byte values bounding the ASCII ranges used by the character
-- classifiers below: lowercase letters, uppercase letters, digits.
local a, z = string.byte("az", 1, 2)
local A, Z = string.byte("AZ", 1, 2)
local zero, nine = string.byte("09", 1, 2)

--- Tell whether a byte value is an ASCII letter.
-- @param c numeric byte value (not a string)
-- @return true for a-z or A-Z, false otherwise
function is_alpha(c)
	local lower = c >= a and c <= z
	local upper = c >= A and c <= Z
	return lower or upper
end

--- Tell whether a byte value is an ASCII decimal digit.
-- @param c numeric byte value (not a string)
-- @return true for 0-9, false otherwise
function is_digit(c)
	if c < zero then
		return false
	end
	return c <= nine
end

--- Tell whether a byte value is an ASCII letter or decimal digit.
-- @param c numeric byte value (not a string)
-- @return true when is_alpha(c) or is_digit(c) holds
function alpha_num(c)
	if is_alpha(c) then
		return true
	end
	return is_digit(c)
end

--- Tell whether a one-character string is whitespace.
-- Unlike the byte-based classifiers, this one compares against strings.
-- @param c value compared against " ", tab, CR and LF
-- @return true for space, tab, carriage return or newline
function is_space(c)
	if c == " " or c == "\t" then
		return true
	end
	return c == "\r" or c == "\n"
end

--- Scan a run of ASCII letters/digits beginning at `start`.
-- The returned index is always that of the last character consumed, so the
-- caller can continue at `index + 1`. This also holds when the run extends
-- to the end of `src`.
-- @param start 1-based index in `src` where scanning begins
-- @param src   the source text
-- @return the matched text (possibly empty)
-- @return index of the last consumed character (`start - 1` if none)
function read_ident(start, src)
	local last = start - 1
	local len = #src
	-- Peek at the next byte and only advance when it belongs to the run,
	-- so no rewind step is needed once the run ends.
	while last < len and alpha_num(src:byte(last + 1)) do
		last = last + 1
	end
	return src:sub(start, last), last
end

--- Scan a numeric literal beginning at `start`.
-- Accepts decimal digits with at most ONE decimal point. Scanning stops in
-- front of a second '.', so "1.2.3" yields "1.2" rather than one malformed
-- float. A trailing or leading dot ("1." / ".5") is still consumed.
-- The returned index is always that of the last character consumed, also
-- when the literal runs to the end of `src`.
-- @param start 1-based index in `src` where scanning begins
-- @param src   the source text
-- @return the literal text (possibly empty)
-- @return index of the last consumed character (`start - 1` if none)
-- @return true when the literal contains a decimal point
function read_number(start, src)
	-- '^' anchors the match at `start`; %d matches the ASCII digits 0-9.
	local _, last = src:find("^%d*%.?%d*", start)
	if not last then
		-- `start` lies beyond the end of `src`: nothing to consume.
		return "", start - 1, false
	end
	local num = src:sub(start, last)
	-- Plain (non-pattern) search, so "." is taken literally.
	local is_float = num:find(".", 1, true) ~= nil
	return num, last, is_float
end

--- Scan a double-quoted string literal whose opening quote is at `start`.
-- Escape sequences (\") are not interpreted yet, and the literal may span
-- several lines; both are known limitations.
-- @param start 1-based index of the opening '"' in `src`
-- @param src   the source text
-- @return the text between the quotes (quotes excluded)
-- @return index of the closing '"'; when the literal is unterminated the
--         rest of `src` is returned together with `#src`
function read_string_literal(start, src)
	local first = start + 1 -- skip the opening quote
	-- Plain search: the quote is matched literally, no pattern magic.
	local close = src:find('"', first, true)
	if not close then
		-- Unterminated literal: consume everything that is left.
		return src:sub(first), #src
	end
	return src:sub(first, close - 1), close
end

--- Token kinds. Each value is the label shown for that kind when a token
--- is printed.
TK = {
	-- fallback for characters the lexer does not recognise
	IDK = "UNKNOWN",

	-- statement terminator
	EOS = "EOS end stmt",

	-- words
	IDENT = "ident",
	KEYWORD = "keyword",
	DO = "DO KEYWORD",

	-- literals
	LIT_INT = "a int",
	LIT_FLOAT = "a float",
	LIT_STRING = "a string",

	-- punctuation and operators
	COLON = "colon",
	COLONCOLON = "colcol",
	DOT = "dot",
	COMMA = "comma",
	EQ = "assign",
	EQEQ = "equality",

	-- brackets
	L_PAREN = "l paren",
	R_PAREN = "r paren",
	L_BRACE = "l brace",
	R_BRACE = "r brace",

	-- quotes
	DBL_QUOTE = "dbl quo",
	SGL_QUOTE = "sgl quo",
}

-- Set of reserved words: keywords[word] is true for every reserved word
-- and nil for anything else.
local keywords = {}
for _, reserved in ipairs({
	"ns", "use", "from", "ffi", "as",
	"and", "or",
	"struct", "enum",
	"if", "else", "end",
	"pub", "fx", "fn", "return",
}) do
	keywords[reserved] = true
end

--- Default-valued token record listing the fields every token carries.
-- NOTE(review): nothing in this file reads `Token`; lexer:add builds plain
-- tables with the same field names. Presumably kept as a reference shape.
Token = {
	kind = TK.IDK,
	lexeme = "",
	file = "",
	line = 0,
	col = 0,
}

--- Print one token on a single line: position first, then kind and lexeme.
-- @param t token table with file, line, col, kind and lexeme fields
local function print_token(t)
	local position = t.file .. " L" .. t.line .. ":" .. t.col
	local details = " \t type: " .. t.kind .. "\t value: " .. t.lexeme
	print(position .. details)
end

--- Classify a scanned word as a keyword or a plain identifier.
-- Prints a trace line whenever a word is recognised as a keyword.
-- @param word the scanned identifier text
-- @return TK.KEYWORD if `word` is in the keyword set, TK.IDENT otherwise
local function upgrade(word)
	if not keywords[word] then
		return TK.IDENT
	end
	print("upgraded " .. word)
	return TK.KEYWORD
end

--- Append a token stamped with the lexer's current file, line and column.
-- @param kind   one of the TK values
-- @param lexeme the text recorded for the token
function lexer:add(kind, lexeme)
	-- `local` keeps the record out of the global environment; a bare
	-- assignment here would overwrite a global named `token` on each call.
	local token = {
		kind = kind,
		lexeme = lexeme,
		file = self.file,
		line = self.line,
		col = self.col,
	}
	self.tokens[#self.tokens + 1] = token
end

-- Punctuation that always lexes as a one-character token.
local single_char_tokens = {
	["("] = TK.L_PAREN,
	[")"] = TK.R_PAREN,
	["{"] = TK.L_BRACE,
	["}"] = TK.R_BRACE,
	[";"] = TK.EOS,
	[","] = TK.COMMA,
	["'"] = TK.SGL_QUOTE,
}

-- Characters that form a different token when written twice in a row.
local doubled_tokens = {
	[":"] = { single = TK.COLON, double = TK.COLONCOLON },
	["="] = { single = TK.EQ, double = TK.EQEQ },
}

--- Tokenize `src`, appending the tokens to `self.tokens`, then print every
-- token collected so far.
-- Line and column tracking restarts at 1:1 on every call so that positions
-- are relative to `file`.
-- NOTE(review): `self.tokens` is NOT cleared, so tokens from earlier calls
-- remain in the list and are printed again; confirm whether callers rely on
-- that accumulation.
-- @param file name recorded on every token produced by this call
-- @param src  the source text to tokenize
function lexer:lex(file, src)
	self.file = file
	self.line = 1
	self.col = 1

	local i = 1
	local len = #src

	-- Record a token at the current position, then advance the column.
	local function emit(kind, lexeme, width)
		self:add(kind, lexeme)
		self.col = self.col + width
	end

	while i <= len do
		local c = src:sub(i, i)
		local byte = c:byte()

		if c == "\n" then
			-- A newline ends the statement and starts a new line.
			self:add(TK.EOS, "CR")
			self.col = 1
			self.line = self.line + 1
			i = i + 1
		elseif is_space(c) then
			self.col = self.col + 1
			i = i + 1
		elseif is_alpha(byte) then
			local word, last = read_ident(i, src)
			emit(upgrade(word), word, #word)
			i = last + 1
		elseif is_digit(byte) then
			local number, last, is_float = read_number(i, src)
			emit(is_float and TK.LIT_FLOAT or TK.LIT_INT, number, #number)
			i = last + 1
		elseif c == '"' then
			local str, last = read_string_literal(i, src)
			-- +2 accounts for the two quote characters.
			emit(TK.LIT_STRING, str, #str + 2)
			i = last + 1
		elseif doubled_tokens[c] then
			local pair = doubled_tokens[c]
			-- sub() yields "" past the end of src, which never equals c.
			if src:sub(i + 1, i + 1) == c then
				emit(pair.double, c .. c, 2)
				i = i + 2
			else
				emit(pair.single, c, 1)
				i = i + 1
			end
		elseif single_char_tokens[c] then
			emit(single_char_tokens[c], c, 1)
			i = i + 1
		else
			emit(TK.IDK, "", 1)
			i = i + 1
		end
	end

	for t = 1, #self.tokens do
		print_token(self.tokens[t])
	end
end

return lexer