package raku import "strings" import "unicode" import "fmt" import "reflect" import "runtime" import "gitlab.com/beoran/woe/monolog" const tokenDelimiter = " \t\r\n'({[]}),;.:" const operator_chars = "&|@'^-*%/+=<>~" type TokenText string type TokenType int64 type Position struct { Index int Row int Column int } const ( TokenPeriod TokenType = TokenType('.') TokenComma TokenType = TokenType(',') TokenSemicolon TokenType = TokenType(';') TokenColon TokenType = TokenType(':') TokenOpenParen TokenType = TokenType('(') TokenCloseParen TokenType = TokenType(')') TokenOpenBrace TokenType = TokenType('{') TokenCloseBrace TokenType = TokenType('}') TokenOpenBracket TokenType = TokenType('[') TokenCloseBracket TokenType = TokenType(']') TokenNone TokenType = 0 TokenError TokenType = -1 TokenWord TokenType = -2 TokenEOL TokenType = -3 TokenEOF TokenType = -4 TokenNumber TokenType = -5 TokenOperator TokenType = -6 TokenString TokenType = -7 TokenSymbol TokenType = -8 TokenFirstKeyword TokenType = -9 TokenArticle TokenType = -10 TokenDo TokenType = -11 TokenEnd TokenType = -12 TokenDef TokenType = -13 TokenPreposition TokenType = -14 TokenVerb TokenType = -15 TokenNoun TokenType = -16 TokenAdverb TokenType = -17 TokenAdjective TokenType = -18 TokenLastKeyword TokenType = -19 TokenLast TokenType = -19 ) type Token struct { TokenType TokenText Position } var tokenTypeMap map[TokenType]string = map[TokenType]string{ TokenNone: "None", TokenError: "Error", TokenWord: "Word", TokenEOL: "EOL", TokenEOF: "EOF", TokenNumber: "Number", TokenOperator: "Operator", TokenString: "String", TokenSymbol: "Symbol", TokenArticle: "Article", TokenPreposition:"Preposition", TokenDo: "Do", TokenEnd: "End", TokenDef: "KeywordDef", TokenVerb: "Verb", TokenAdjective: "Adjective", TokenAdverb: "Adverb", TokenNoun: "Noun", } var keywordMap map[string]TokenType = map[string]TokenType{ "a" : TokenArticle, "an" : TokenArticle, "the" : TokenArticle, "do" : TokenDo, "begin" : TokenDo, "then" : TokenDo, "has" : TokenDo, "end" : TokenEnd, "done" : TokenEnd, "endif" : TokenEnd, "def" : TokenDef, "define" : TokenDef, "aboard" : TokenPreposition, "about" : TokenPreposition, "above" : TokenPreposition, "absent" : TokenPreposition, "across" : TokenPreposition, "after" : TokenPreposition, "against" : TokenPreposition, "along" : TokenPreposition, "alongside" : TokenPreposition, "amid" : TokenPreposition, "amidst" : TokenPreposition, "among" : TokenPreposition, "apropos" : TokenPreposition, "apud" : TokenPreposition, "around" : TokenPreposition, "as" : TokenPreposition, "astride" : TokenPreposition, "at" : TokenPreposition, "atop" : TokenPreposition, "ontop" : TokenPreposition, "bar" : TokenPreposition, "before" : TokenPreposition, "behind" : TokenPreposition, "below" : TokenPreposition, "beneath" : TokenPreposition, "beside" : TokenPreposition, "besides" : TokenPreposition, "between" : TokenPreposition, "beyond" : TokenPreposition, "but" : TokenPreposition, "by" : TokenPreposition, "chez" : TokenPreposition, "circa" : TokenPreposition, "come" : TokenPreposition, "dehors" : TokenPreposition, "despite" : TokenPreposition, "down" : TokenPreposition, "during" : TokenPreposition, "except" : TokenPreposition, "for" : TokenPreposition, "from" : TokenPreposition, "in" : TokenPreposition, "inside" : TokenPreposition, "into" : TokenPreposition, "less" : TokenPreposition, "like" : TokenPreposition, "minus" : TokenPreposition, "near" : TokenPreposition, "nearer" : TokenPreposition, "nearest" : TokenPreposition, "notwithstanding" : TokenPreposition, "of" : TokenPreposition, "off" : TokenPreposition, "on" : TokenPreposition, "onto" : TokenPreposition, "opposite" : TokenPreposition, "out" : TokenPreposition, "outside" : TokenPreposition, "over" : TokenPreposition, "pace" : TokenPreposition, "past" : TokenPreposition, "per" : TokenPreposition, "post" : TokenPreposition, "pre" : TokenPreposition, "pro" : TokenPreposition, "qua" : TokenPreposition, "re" : TokenPreposition, "sans" : TokenPreposition, "save" : TokenPreposition, "short" : TokenPreposition, "since" : TokenPreposition, "than" : TokenPreposition, "through" : TokenPreposition, "thru" : TokenPreposition, "throughout" : TokenPreposition, "to" : TokenPreposition, "toward" : TokenPreposition, "towards" : TokenPreposition, "under" : TokenPreposition, "underneath" : TokenPreposition, "unlike" : TokenPreposition, "until" : TokenPreposition, "up" : TokenPreposition, "upon" : TokenPreposition, "upside" : TokenPreposition, "versus" : TokenPreposition, "via" : TokenPreposition, "vice" : TokenPreposition, "vis-à-vis" : TokenPreposition, "with" : TokenPreposition, "within" : TokenPreposition, "without" : TokenPreposition, "worth" : TokenPreposition, } var sigilMap map[string]TokenType = map[string]TokenType{ "[": TokenOpenBracket, "{": TokenOpenBrace, "(": TokenOpenParen, "]": TokenCloseBracket, "}": TokenCloseBrace, ")": TokenCloseParen, } func (me TokenType) String() string { name, found := tokenTypeMap[me] if found { return name } else { if (me > 0) && (me < 256) { return fmt.Sprintf("Char<%c>", byte(me)) } return fmt.Sprintf("Unknown Token %d", int(me)) } } func (me Token) String() string { return fmt.Sprintf("Token: %s >%s< %d %d %d.", me.TokenType, string(me.TokenText), me.Index, me.Row, me.Column) } func (me Token) ShortString() string { return fmt.Sprintf("T: %s >%s<", me.TokenType, string(me.TokenText)) } /* The tokenizer splits up text in tokens without classifying Word tokens. */ type Tokenizer struct { CurrentPosition Position LastPosition Position Input []rune Output []*Token rule TokenizerRule } type TokenizerRule func(tkz *Tokenizer) TokenizerRule func NewTokenizer(input string) * Tokenizer { tokenizer := &Tokenizer{} tokenizer.Input = []rune(input); tokenizer.rule = TokenizeStart return tokenizer; } func (tkz *Tokenizer) Last() Position { return tkz.LastPosition } func (tkz *Tokenizer) Current() Position { return tkz.CurrentPosition } func (tkz *Tokenizer) LastPtr() * Position { return &tkz.LastPosition } func (tkz *Tokenizer) CurrentPtr() * Position { return &tkz.CurrentPosition } func (tkz *Tokenizer) Emit(t TokenType, v TokenText) { tok := &Token{t, v, tkz.Current()} tkz.EmitToken(tok) } func (tkz *Tokenizer) Error(message string, args ...interface{}) { value := fmt.Sprintf(message, args...) monolog.Error("Tokenize Error: %s", value) tkz.Emit(TokenError, TokenText(value)) } func TokenizeError(tkz *Tokenizer) TokenizerRule { tkz.Error("Error") return nil } func (tkz *Tokenizer) SkipComment() bool { if tkz.Peek() == '#' { if tkz.Next() == '(' { return tkz.SkipNotIn(")") } else { return tkz.SkipNotIn("\r\n") } } return true } func TokenizeSigil(tkz *Tokenizer) TokenizerRule { tkz.Found(TokenType(tkz.Peek())) _ = tkz.Next() tkz.Advance() return TokenizeNormal } func TokenizeWord(tkz *Tokenizer) TokenizerRule { tkz.SkipNotIn(tokenDelimiter) tkz.Found(TokenWord) return TokenizeNormal } func TokenizeSymbol(tkz *Tokenizer) TokenizerRule { tkz.SkipNotIn(tokenDelimiter) tkz.Found(TokenSymbol) return TokenizeNormal } func TokenizeNumber(tkz *Tokenizer) TokenizerRule { tkz.SkipNotIn(tokenDelimiter) tkz.Found(TokenNumber) return TokenizeNormal } func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule { tkz.SkipWhitespace() tkz.Advance() return TokenizeNormal } func TokenizeComment(tkz *Tokenizer) TokenizerRule { if !tkz.SkipComment() { tkz.Error("Unterminated comment") return TokenizeError } tkz.Advance() return TokenizeNormal } func TokenizePunctuator(tkz *Tokenizer) TokenizerRule { tkz.Found(TokenType(tkz.Peek())) _ = tkz.Next() tkz.Advance() return TokenizeNormal } func TokenizeEOL(tkz *Tokenizer) TokenizerRule { tkz.SkipIn("\r\n") tkz.Found(TokenEOL) return TokenizeNormal } func TokenizeOperator(tkz *Tokenizer) TokenizerRule { tkz.SkipIn(operator_chars) tkz.SkipCurrentNotIn(tokenDelimiter) tkz.Found(TokenOperator) return TokenizeNormal } func tokenizeEscape(tkz *Tokenizer) error { _ = tkz.Next() return nil } func TokenizeString(tkz *Tokenizer) TokenizerRule { open := tkz.Peek() do_escape := open == '"' peek := tkz.Next() tkz.Advance() for ; peek != '\000'; peek = tkz.Next() { if do_escape && peek == '\\' { if err := tokenizeEscape(tkz); err != nil { return TokenizeError } } else if peek == open { tkz.Found(TokenString) _ = tkz.Next() tkz.Advance() return TokenizeNormal } } tkz.Error("Unexpected EOF in string.") return nil } func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule { if unicode.IsDigit(tkz.Next()) { return TokenizeNumber } else { _ = tkz.Previous() return TokenizeOperator } } func TokenizeEscapedNewline(tkz * Tokenizer) TokenizerRule { tkz.SkipWhitespace() peek := tkz.Peek() if strings.ContainsRune("\n\r", peek) { tkz.SkipIn("\n\r") } else { tkz.Error("Stray backslash character.") } return TokenizeNormal } func TokenizeNormal(tkz *Tokenizer) TokenizerRule { peek := tkz.Peek() if peek == '#' { return TokenizeComment } else if strings.ContainsRune(" \t", peek) { return TokenizeWhitespace } else if strings.ContainsRune(".,;:", peek) { return TokenizePunctuator } else if strings.ContainsRune("([{}])", peek) { return TokenizeSigil } else if strings.ContainsRune("$", peek) { return TokenizeSymbol } else if strings.ContainsRune("\r\n", peek) { return TokenizeEOL } else if strings.ContainsRune("+-", peek) { return TokenizeNumberOrOperator } else if strings.ContainsRune("\"`", peek) { return TokenizeString } else if strings.ContainsRune("\\", peek) { return TokenizeEscapedNewline } else if peek == '\000' { tkz.Emit(TokenEOF, "") return nil } else if unicode.IsLetter(peek) { return TokenizeWord } else if unicode.IsDigit(peek) { return TokenizeNumber } else if strings.ContainsRune(operator_chars, peek) { return TokenizeOperator } else { return TokenizeError } } func (tkz *Tokenizer) Peek() rune { if (tkz.Current().Index) >= len(tkz.Input) { return '\000' } return tkz.Input[tkz.Current().Index] } func (tkz *Tokenizer) PeekNext() rune { if (tkz.Current().Index + 1) >= len(tkz.Input) { return '\000' } return tkz.Input[tkz.Current().Index+1] } func (tkz *Tokenizer) Next() rune { monolog.Debug("Next: %c %s", tkz.Peek(), tkz.Current()) if tkz.Peek() == '\n' { tkz.CurrentPtr().Column = 0 tkz.CurrentPtr().Row++ } tkz.CurrentPtr().Index++ return tkz.Peek() } func (tkz *Tokenizer) Previous() rune { if tkz.Current().Index > 0 { tkz.CurrentPtr().Index-- if tkz.Peek() == '\n' { tkz.CurrentPtr().Column = 0 tkz.CurrentPtr().Row++ } } return tkz.Peek() } func (tkz *Tokenizer) SkipRune() { _ = tkz.Next() } func (tkz *Tokenizer) SkipIn(set string) bool { for strings.ContainsRune(set, tkz.Next()) { monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek()) if tkz.Peek() == '\000' { return false } } return true } func (tkz *Tokenizer) SkipCurrentNotIn(set string) bool { for c := tkz.Peek() ; !strings.ContainsRune(set,c) ; c = tkz.Next() { monolog.Debug("SkipNotIn: %c %s", c, tkz.Current()) if c == '\000' { return false } } return true } func (tkz *Tokenizer) SkipNotIn(set string) bool { _ = tkz.Next() return tkz.SkipCurrentNotIn(set) } func (tkz *Tokenizer) SkipWhile(should_skip func(r rune) bool) bool { for should_skip(tkz.Peek()) { if tkz.Next() == '\000' { return false } } return true } func (tkz *Tokenizer) SkipWhitespace() { tkz.SkipIn(" \t") } func (tkz *Tokenizer) Advance() { (*tkz.LastPtr()) = tkz.Current() } func (tkz *Tokenizer) Rewind() { (*tkz.CurrentPtr()) = tkz.Last() } func (tkz *Tokenizer) CurrentRuneValue() []rune { return tkz.Input[tkz.Last().Index:tkz.Current().Index] } func (tkz *Tokenizer) CurrentStringValue() string { return string(tkz.CurrentRuneValue()) } func (tkz *Tokenizer) Found(kind TokenType) { tkz.Emit(kind, TokenText(tkz.CurrentStringValue())) tkz.Advance() } func getFunctionName(fun interface{}) string { return runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name() } func (tkz *Tokenizer) Start() []*Token { rule := TokenizeNormal for rule != nil { monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule)) rule = rule(tkz) } return tkz.Output } func (tkz *Tokenizer) TryTokenizing() { tokens := tkz.Start() for token := range tokens { monolog.Info("Token %s", token) } } func (tkz * Tokenizer) EmitToken(token * Token) * Token { tkz.Output = append(tkz.Output, token) return token } func TokenizeStart(tkz *Tokenizer) TokenizerRule { return nil } func Tokenize(input string) []*Token { tokenizer := NewTokenizer(input) return tokenizer.Start() } func TryTokenizingString(input string) { tokens := Tokenize(input) for token := range tokens { monolog.Info("Token %s", token) } }