mighty

The mighty programming language, compiler and tools (WIP)
Log | Files | Refs

lexer.go (6001B)



package lexer

import (
	"fmt"
)

// Kind identifies the category of a Token.
type Kind int

// Token kinds. The numeric values are assigned sequentially by iota, so they
// follow the declaration order below.
const (
	// Ident is an identifier that is not one of the keywords.
	Ident Kind = iota

	// Keyword kinds. Their source spellings are listed in kindsKeywords and
	// keywordKinds.
	KeywordNs
	KeywordIn
	KeywordFrom
	KeywordUse
	KeywordFfi
	KeywordDrop
	KeywordAs
	KeywordOf
	KeywordAnd
	KeywordOr
	KeywordRef
	KeywordStruct
	KeywordEnum
	KeywordPre
	KeywordPost
	KeywordInv
	KeywordIf
	KeywordElse
	KeywordWhere
	KeywordPub

	// Punctuation, operators, literals and the special EOF and BadToken
	// markers. Lex appends one EOF token after all other tokens and emits
	// BadToken for a byte it does not recognise.
	Dot
	Colon
	ColonColon
	Comma
	EOF
	Eq
	LiteralBool
	LiteralChar
	LiteralFloat
	LiteralInt
	LiteralString
	Minus
	MinusEq
	MinusMinus
	LParen
	RParen
	LBrace
	RBrace
	LBracket
	RBracket
	Plus
	PlusEq
	PlusPlus
	Slash
	Star
	BadToken
)

// kindsKeywords maps every keyword kind to its source spelling. Entries are
// listed alphabetically by spelling.
var kindsKeywords = map[Kind]string{
	KeywordAnd:    "and",
	KeywordAs:     "as",
	KeywordDrop:   "drop",
	KeywordElse:   "else",
	KeywordEnum:   "enum",
	KeywordFfi:    "ffi",
	KeywordFrom:   "from",
	KeywordIf:     "if",
	KeywordIn:     "in",
	KeywordInv:    "inv",
	KeywordNs:     "ns",
	KeywordOf:     "of",
	KeywordOr:     "or",
	KeywordPost:   "post",
	KeywordPre:    "pre",
	KeywordPub:    "pub",
	KeywordRef:    "ref",
	KeywordStruct: "struct",
	KeywordUse:    "use",
	KeywordWhere:  "where",
}
// keywordKinds maps the source spelling of every keyword to its kind. Lex
// consults it to decide whether an identifier is a keyword. Entries are
// listed alphabetically by spelling.
var keywordKinds = map[string]Kind{
	"and":    KeywordAnd,
	"as":     KeywordAs,
	"drop":   KeywordDrop,
	"else":   KeywordElse,
	"enum":   KeywordEnum,
	"ffi":    KeywordFfi,
	"from":   KeywordFrom,
	"if":     KeywordIf,
	"in":     KeywordIn,
	"inv":    KeywordInv,
	"ns":     KeywordNs,
	"of":     KeywordOf,
	"or":     KeywordOr,
	"post":   KeywordPost,
	"pre":    KeywordPre,
	"pub":    KeywordPub,
	"ref":    KeywordRef,
	"struct": KeywordStruct,
	"use":    KeywordUse,
	"where":  KeywordWhere,
}

// Token is a single lexical unit produced by Lex. The field order is relied
// on by the unkeyed Token literals in Lex.
type Token struct {
	Kind  Kind   // category of the token
	Value string // text of the token, sliced from the source
	Line  int    // 1-based line on which the token starts
	Col   int    // 1-based byte column at which the token starts
}

// Lex splits src into a flat slice of tokens. The result always ends with a
// single EOF token.
//
// Whitespace and `//` line comments are skipped. An identifier whose text is
// a key of keywordKinds is emitted with the matching keyword kind, otherwise
// as Ident. Numeric literals start with a digit and may contain '_'
// separators; one '.' that is directly followed by a digit turns the literal
// into a LiteralFloat. String literals are delimited by '"', receive no
// escape processing and may span several lines; their Value excludes the
// quotes. A string left unterminated at the end of input is still emitted as
// a LiteralString. Any byte that starts no known token becomes a one-byte
// BadToken.
//
// Line and Col are 1-based and count bytes, not runes. filename is currently
// unused.
func Lex(filename string, src string) []Token {
	var res []Token
	i := 0
	line := 1
	col := 1

	for i < len(src) {
		c := src[i]

		if is_space(c) {
			if c == '\n' {
				line++
				col = 1
			} else {
				col++
			}
			i++
			continue
		}

		start := i
		startCol := col

		// Identifiers and keywords.
		if is_alpha(c) || is__(c) {
			for i < len(src) && (is_alphanum(src[i]) || is__(src[i])) {
				i++
				col++
			}

			ident := src[start:i]
			kind := Ident
			if kw, ok := keywordKinds[ident]; ok {
				kind = kw
			}
			res = append(res, Token{kind, ident, line, startCol})
			continue
		}

		// Integer and float literals.
		if is_digit(c) {
			numeric := LiteralInt
			for i < len(src) {
				d := src[i]
				if is_dot(d) {
					// Accept a single '.', and only when a digit follows, so
					// that inputs such as `1.foo` or `1..2` still produce
					// Dot tokens instead of swallowing the dot.
					if numeric == LiteralFloat || i+1 >= len(src) || !is_digit(src[i+1]) {
						break
					}
					numeric = LiteralFloat
				} else if !is_digit(d) && !is__(d) {
					break
				}
				i++
				col++
			}

			res = append(res, Token{numeric, src[start:i], line, startCol})
			continue
		}

		// String literals.
		if c == '"' {
			startLine := line
			i++ // consume opening dbquote
			col++
			for i < len(src) && src[i] != '"' {
				// Keep the position counters correct across embedded
				// newlines so later tokens report the right line/column.
				if src[i] == '\n' {
					line++
					col = 1
				} else {
					col++
				}
				i++
			}

			res = append(res, Token{LiteralString, src[start+1 : i], startLine, startCol})
			if i < len(src) {
				i++ // consume closing dbquote
				col++
			}
			continue
		}

		// One byte of lookahead; stays 0 at the end of input.
		var cx byte
		if i+1 < len(src) {
			cx = src[i+1]
		}

		// Line comments run up to, but not including, the newline, which is
		// then handled by the whitespace branch above.
		if c == '/' && cx == '/' {
			for i < len(src) && src[i] != '\n' {
				i++
				col++
			}
			continue
		}

		// Punctuation and operators: pick the kind and how many bytes it
		// spans, then emit the token once below.
		kind, width := BadToken, 1
		switch c {
		case '.':
			kind = Dot
		case ':':
			if cx == ':' {
				kind, width = ColonColon, 2
			} else {
				kind = Colon
			}
		case '+':
			switch cx {
			case '=':
				kind, width = PlusEq, 2
			case '+':
				kind, width = PlusPlus, 2
			default:
				kind = Plus
			}
		case '-':
			switch cx {
			case '=':
				kind, width = MinusEq, 2
			case '-':
				kind, width = MinusMinus, 2
			default:
				kind = Minus
			}
		case '=':
			kind = Eq
		case '/':
			kind = Slash
		case '*':
			kind = Star
		case ',':
			kind = Comma
		case '(':
			kind = LParen
		case ')':
			kind = RParen
		case '[':
			kind = LBracket
		case ']':
			kind = RBracket
		case '{':
			kind = LBrace
		case '}':
			kind = RBrace
		}

		res = append(res, Token{kind, src[i : i+width], line, col})
		i += width
		col += width
	}
	res = append(res, Token{EOF, "", line, col})
	return res
}

// is__ reports whether c is the ASCII underscore.
func is__(c byte) bool {
	return '_' == c
}

// is_alpha reports whether c is an ASCII letter, lower or upper case.
func is_alpha(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}

// is_alphanum reports whether c is an ASCII letter or an ASCII digit.
func is_alphanum(c byte) bool {
	if is_alpha(c) {
		return true
	}
	return is_digit(c)
}

// is_digit reports whether c is an ASCII decimal digit.
func is_digit(c byte) bool {
	return '0' <= c && c <= '9'
}

// is_dot reports whether c is the ASCII full stop.
func is_dot(c byte) bool {
	return '.' == c
}

// is_space reports whether c is a space, tab, carriage return or newline.
func is_space(c byte) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}

// String returns a printable name for k: the source spelling from
// kindsKeywords for keyword kinds, a fixed label for every other kind handled
// below, and "Print:Unknown" for any remaining value.
func (k Kind) String() string {
	if spelling, found := kindsKeywords[k]; found {
		return spelling
	}

	// Start from the fallback and overwrite it when k is a known kind.
	name := "Print:Unknown"
	switch k {
	case Ident:
		name = "Ident"
	case Dot:
		name = "Dot"
	case Colon:
		name = "Colon"
	case ColonColon:
		name = "ColonColon"
	case Comma:
		name = "Comma"
	case EOF:
		name = "EOF"
	case Eq:
		name = "Eq"
	case LiteralBool:
		name = "BoolLiteral"
	case LiteralChar:
		name = "CharLiteral"
	case LiteralFloat:
		name = "FloatLiteral"
	case LiteralInt:
		name = "IntLiteral"
	case LiteralString:
		name = "StringLiteral"
	case Minus:
		name = "Minus"
	case MinusEq:
		name = "MinusEq"
	case MinusMinus:
		name = "MinusMinus"
	case LParen:
		name = "LParen"
	case RParen:
		name = "RParen"
	case LBrace:
		name = "LBrace"
	case RBrace:
		name = "RBrace"
	case LBracket:
		name = "LBracket"
	case RBracket:
		name = "RBracket"
	case Plus:
		name = "Plus"
	case PlusEq:
		name = "PlusEq"
	case PlusPlus:
		name = "PlusPlus"
	case Slash:
		name = "Slash"
	case Star:
		name = "Star"
	case BadToken:
		name = "BAD~~~TOKEN"
	}
	return name
}

// Print_tokens writes one line per token to standard output, showing its
// line and column, its kind and its quoted value.
func Print_tokens(tokens []Token) {
	const rowFormat = "%-d:%-2d   %-16s %-16q\n"

	for idx := range tokens {
		t := tokens[idx]
		fmt.Printf(rowFormat, t.Line, t.Col, t.Kind, t.Value)
	}
}