- package raku
- import "strings"
- import "unicode"
- import "fmt"
- import "reflect"
- import "runtime"
- import "gitlab.com/beoran/woe/monolog"
- const tokenDelimiter = " \t\r\n'({[]}),;.:"
- const operator_chars = "&|@'^-*%/+=<>~"
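- // TokenText holds the literal source text covered by a token.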
- type TokenText string
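- // TokenType identifies the kind of a token. Positive values stand for the
- // literal rune itself; negative values name the kinds declared below.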
- type TokenType int64
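- // Position is a location in the input as a rune index plus row and column.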
- type Position struct {
- Index int
- Row int
- Column int
- }
- const (
- TokenPeriod TokenType = TokenType('.')
- TokenComma TokenType = TokenType(',')
- TokenSemicolon TokenType = TokenType(';')
- TokenColon TokenType = TokenType(':')
- TokenOpenParen TokenType = TokenType('(')
- TokenCloseParen TokenType = TokenType(')')
- TokenOpenBrace TokenType = TokenType('{')
- TokenCloseBrace TokenType = TokenType('}')
- TokenOpenBracket TokenType = TokenType('[')
- TokenCloseBracket TokenType = TokenType(']')
- TokenNone TokenType = 0
- TokenError TokenType = -1
- TokenWord TokenType = -2
- TokenEOL TokenType = -3
- TokenEOF TokenType = -4
- TokenNumber TokenType = -5
- TokenOperator TokenType = -6
- TokenString TokenType = -7
- TokenSymbol TokenType = -8
- TokenFirstKeyword TokenType = -9
- TokenArticle TokenType = -10
- TokenDo TokenType = -11
- TokenEnd TokenType = -12
- TokenDef TokenType = -13
- TokenPreposition TokenType = -14
- TokenVerb TokenType = -15
- TokenNoun TokenType = -16
- TokenAdverb TokenType = -17
- TokenAdjective TokenType = -18
- TokenLastKeyword TokenType = -19
- TokenLast TokenType = -19
- )
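- // Token is a single lexeme: its type, its text and the position at which it was emitted.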
- type Token struct {
- TokenType
- TokenText
- Position
- }
- var tokenTypeMap map[TokenType]string = map[TokenType]string{
- TokenNone: "None",
- TokenError: "Error",
- TokenWord: "Word",
- TokenEOL: "EOL",
- TokenEOF: "EOF",
- TokenNumber: "Number",
- TokenOperator: "Operator",
- TokenString: "String",
- TokenSymbol: "Symbol",
- TokenArticle: "Article",
- TokenPreposition:"Preposition",
- TokenDo: "Do",
- TokenEnd: "End",
- TokenDef: "KeywordDef",
- TokenVerb: "Verb",
- TokenAdjective: "Adjective",
- TokenAdverb: "Adverb",
- TokenNoun: "Noun",
- }
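- // keywordMap maps reserved English words (articles, block markers and
- // prepositions) to their token types. The tokenizer itself emits them as
- // plain Word tokens; classification against this map happens later.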
- var keywordMap map[string]TokenType = map[string]TokenType{
- "a" : TokenArticle,
- "an" : TokenArticle,
- "the" : TokenArticle,
- "do" : TokenDo,
- "begin" : TokenDo,
- "then" : TokenDo,
- "has" : TokenDo,
- "end" : TokenEnd,
- "done" : TokenEnd,
- "endif" : TokenEnd,
- "def" : TokenDef,
- "define" : TokenDef,
- "aboard" : TokenPreposition,
- "about" : TokenPreposition,
- "above" : TokenPreposition,
- "absent" : TokenPreposition,
- "across" : TokenPreposition,
- "after" : TokenPreposition,
- "against" : TokenPreposition,
- "along" : TokenPreposition,
- "alongside" : TokenPreposition,
- "amid" : TokenPreposition,
- "amidst" : TokenPreposition,
- "among" : TokenPreposition,
- "apropos" : TokenPreposition,
- "apud" : TokenPreposition,
- "around" : TokenPreposition,
- "as" : TokenPreposition,
- "astride" : TokenPreposition,
- "at" : TokenPreposition,
- "atop" : TokenPreposition,
- "ontop" : TokenPreposition,
- "bar" : TokenPreposition,
- "before" : TokenPreposition,
- "behind" : TokenPreposition,
- "below" : TokenPreposition,
- "beneath" : TokenPreposition,
- "beside" : TokenPreposition,
- "besides" : TokenPreposition,
- "between" : TokenPreposition,
- "beyond" : TokenPreposition,
- "but" : TokenPreposition,
- "by" : TokenPreposition,
- "chez" : TokenPreposition,
- "circa" : TokenPreposition,
- "come" : TokenPreposition,
- "dehors" : TokenPreposition,
- "despite" : TokenPreposition,
- "down" : TokenPreposition,
- "during" : TokenPreposition,
- "except" : TokenPreposition,
- "for" : TokenPreposition,
- "from" : TokenPreposition,
- "in" : TokenPreposition,
- "inside" : TokenPreposition,
- "into" : TokenPreposition,
- "less" : TokenPreposition,
- "like" : TokenPreposition,
- "minus" : TokenPreposition,
- "near" : TokenPreposition,
- "nearer" : TokenPreposition,
- "nearest" : TokenPreposition,
- "notwithstanding" : TokenPreposition,
- "of" : TokenPreposition,
- "off" : TokenPreposition,
- "on" : TokenPreposition,
- "onto" : TokenPreposition,
- "opposite" : TokenPreposition,
- "out" : TokenPreposition,
- "outside" : TokenPreposition,
- "over" : TokenPreposition,
- "pace" : TokenPreposition,
- "past" : TokenPreposition,
- "per" : TokenPreposition,
- "post" : TokenPreposition,
- "pre" : TokenPreposition,
- "pro" : TokenPreposition,
- "qua" : TokenPreposition,
- "re" : TokenPreposition,
- "sans" : TokenPreposition,
- "save" : TokenPreposition,
- "short" : TokenPreposition,
- "since" : TokenPreposition,
- "than" : TokenPreposition,
- "through" : TokenPreposition,
- "thru" : TokenPreposition,
- "throughout" : TokenPreposition,
- "to" : TokenPreposition,
- "toward" : TokenPreposition,
- "towards" : TokenPreposition,
- "under" : TokenPreposition,
- "underneath" : TokenPreposition,
- "unlike" : TokenPreposition,
- "until" : TokenPreposition,
- "up" : TokenPreposition,
- "upon" : TokenPreposition,
- "upside" : TokenPreposition,
- "versus" : TokenPreposition,
- "via" : TokenPreposition,
- "vice" : TokenPreposition,
- "vis-à-vis" : TokenPreposition,
- "with" : TokenPreposition,
- "within" : TokenPreposition,
- "without" : TokenPreposition,
- "worth" : TokenPreposition,
- }
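- // sigilMap maps bracket runes to their corresponding token types.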
- var sigilMap map[string]TokenType = map[string]TokenType{
- "[": TokenOpenBracket,
- "{": TokenOpenBrace,
- "(": TokenOpenParen,
- "]": TokenCloseBracket,
- "}": TokenCloseBrace,
- ")": TokenCloseParen,
- }
- func (me TokenType) String() string {
- name, found := tokenTypeMap[me]
- if found {
- return name
- } else {
- if (me > 0) && (me < 256) {
- return fmt.Sprintf("Char<%c>", byte(me))
- }
- return fmt.Sprintf("Unknown Token %d", int(me))
- }
- }
- func (me Token) String() string {
- return fmt.Sprintf("Token: %s >%s< %d %d %d.", me.TokenType, string(me.TokenText), me.Index, me.Row, me.Column)
- }
- func (me Token) ShortString() string {
- return fmt.Sprintf("T: %s >%s<", me.TokenType, string(me.TokenText))
- }
- /* The tokenizer splits the text up into tokens, without classifying Word tokens any further. */
- type Tokenizer struct {
- CurrentPosition Position
- LastPosition Position
- Input []rune
- Output []*Token
- rule TokenizerRule
- }
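- // A TokenizerRule consumes some input and returns the next rule to apply,
- // or nil to stop tokenizing.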
- type TokenizerRule func(tkz *Tokenizer) TokenizerRule
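- // NewTokenizer returns a Tokenizer for the given input, ready to Start.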
- func NewTokenizer(input string) *Tokenizer {
- tokenizer := &Tokenizer{}
- tokenizer.Input = []rune(input)
- tokenizer.rule = TokenizeStart
- return tokenizer
- }
- func (tkz *Tokenizer) Last() Position {
- return tkz.LastPosition
- }
- func (tkz *Tokenizer) Current() Position {
- return tkz.CurrentPosition
- }
- func (tkz *Tokenizer) LastPtr() *Position {
- return &tkz.LastPosition
- }
- func (tkz *Tokenizer) CurrentPtr() *Position {
- return &tkz.CurrentPosition
- }
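- // Emit appends a token with the given type and text at the current position.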
- func (tkz *Tokenizer) Emit(t TokenType, v TokenText) {
- tok := &Token{t, v, tkz.Current()}
- tkz.EmitToken(tok)
- }
- func (tkz *Tokenizer) Error(message string, args ...interface{}) {
- value := fmt.Sprintf(message, args...)
- monolog.Error("Tokenize Error: %s", value)
- tkz.Emit(TokenError, TokenText(value))
- }
- func TokenizeError(tkz *Tokenizer) TokenizerRule {
- tkz.Error("Error")
- return nil
- }
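- // SkipComment skips a # line comment or a #( ... ) block comment.
- // It returns false if the input runs out before the comment ends.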
- func (tkz *Tokenizer) SkipComment() bool {
- if tkz.Peek() == '#' {
- if tkz.Next() == '(' {
- // Block comment: #( ... ). Skip up to and over the closing parenthesis.
- if !tkz.SkipNotIn(")") {
- return false
- }
- _ = tkz.Next()
- return true
- }
- // Line comment: skip the rest of the line, leaving the newline itself.
- return tkz.SkipCurrentNotIn("\r\n")
- }
- return true
- }
- func TokenizeSigil(tkz *Tokenizer) TokenizerRule {
- // Consume the sigil first, so that Found captures it as the token text.
- kind := TokenType(tkz.Peek())
- _ = tkz.Next()
- tkz.Found(kind)
- return TokenizeNormal
- }
- func TokenizeWord(tkz *Tokenizer) TokenizerRule {
- tkz.SkipNotIn(tokenDelimiter)
- tkz.Found(TokenWord)
- return TokenizeNormal
- }
- func TokenizeSymbol(tkz *Tokenizer) TokenizerRule {
- tkz.SkipNotIn(tokenDelimiter)
- tkz.Found(TokenSymbol)
- return TokenizeNormal
- }
- func TokenizeNumber(tkz *Tokenizer) TokenizerRule {
- tkz.SkipNotIn(tokenDelimiter)
- tkz.Found(TokenNumber)
- return TokenizeNormal
- }
- func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule {
- tkz.SkipWhitespace()
- tkz.Advance()
- return TokenizeNormal
- }
- func TokenizeComment(tkz *Tokenizer) TokenizerRule {
- if !tkz.SkipComment() {
- tkz.Error("Unterminated comment")
- return TokenizeError
- }
- tkz.Advance()
- return TokenizeNormal
- }
- func TokenizePunctuator(tkz *Tokenizer) TokenizerRule {
- // Consume the punctuation rune first, so that Found captures it as the token text.
- kind := TokenType(tkz.Peek())
- _ = tkz.Next()
- tkz.Found(kind)
- return TokenizeNormal
- }
- func TokenizeEOL(tkz *Tokenizer) TokenizerRule {
- tkz.SkipIn("\r\n")
- tkz.Found(TokenEOL)
- return TokenizeNormal
- }
- func TokenizeOperator(tkz *Tokenizer) TokenizerRule {
- tkz.SkipIn(operator_chars)
- tkz.SkipCurrentNotIn(tokenDelimiter)
- tkz.Found(TokenOperator)
- return TokenizeNormal
- }
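- // tokenizeEscape consumes the rune following a backslash inside a string;
- // the escaped rune is kept as-is, no substitution is performed.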
- func tokenizeEscape(tkz *Tokenizer) error {
- _ = tkz.Next()
- return nil
- }
- func TokenizeString(tkz *Tokenizer) TokenizerRule {
- open := tkz.Peek()
- do_escape := open == '"'
- peek := tkz.Next()
- tkz.Advance()
- for ; peek != '\000'; peek = tkz.Next() {
- if do_escape && peek == '\\' {
- if err := tokenizeEscape(tkz); err != nil {
- return TokenizeError
- }
- } else if peek == open {
- tkz.Found(TokenString)
- _ = tkz.Next()
- tkz.Advance()
- return TokenizeNormal
- }
- }
- tkz.Error("Unexpected EOF in string.")
- return nil
- }
- func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule {
- if unicode.IsDigit(tkz.Next()) {
- return TokenizeNumber
- } else {
- _ = tkz.Previous()
- return TokenizeOperator
- }
- }
- func TokenizeEscapedNewline(tkz *Tokenizer) TokenizerRule {
- tkz.SkipWhitespace()
- peek := tkz.Peek()
- if strings.ContainsRune("\n\r", peek) {
- tkz.SkipIn("\n\r")
- } else {
- tkz.Error("Stray backslash character.")
- }
- // Discard what was skipped so it is not prepended to the next token.
- tkz.Advance()
- return TokenizeNormal
- }
- func TokenizeNormal(tkz *Tokenizer) TokenizerRule {
- peek := tkz.Peek()
- if peek == '#' {
- return TokenizeComment
- } else if strings.ContainsRune(" \t", peek) {
- return TokenizeWhitespace
- } else if strings.ContainsRune(".,;:", peek) {
- return TokenizePunctuator
- } else if strings.ContainsRune("([{}])", peek) {
- return TokenizeSigil
- } else if strings.ContainsRune("$", peek) {
- return TokenizeSymbol
- } else if strings.ContainsRune("\r\n", peek) {
- return TokenizeEOL
- } else if strings.ContainsRune("+-", peek) {
- return TokenizeNumberOrOperator
- } else if strings.ContainsRune("\"`", peek) {
- return TokenizeString
- } else if strings.ContainsRune("\\", peek) {
- return TokenizeEscapedNewline
- } else if peek == '\000' {
- tkz.Emit(TokenEOF, "")
- return nil
- } else if unicode.IsLetter(peek) {
- return TokenizeWord
- } else if unicode.IsDigit(peek) {
- return TokenizeNumber
- } else if strings.ContainsRune(operator_chars, peek) {
- return TokenizeOperator
- } else {
- return TokenizeError
- }
- }
- func (tkz *Tokenizer) Peek() rune {
- if (tkz.Current().Index) >= len(tkz.Input) {
- return '\000'
- }
- return tkz.Input[tkz.Current().Index]
- }
- func (tkz *Tokenizer) PeekNext() rune {
- if (tkz.Current().Index + 1) >= len(tkz.Input) {
- return '\000'
- }
- return tkz.Input[tkz.Current().Index+1]
- }
- func (tkz *Tokenizer) Next() rune {
- monolog.Debug("Next: %c %v", tkz.Peek(), tkz.Current())
- if tkz.Peek() == '\n' {
- tkz.CurrentPtr().Column = 0
- tkz.CurrentPtr().Row++
- } else {
- // Track the column as well, so token positions stay meaningful.
- tkz.CurrentPtr().Column++
- }
- tkz.CurrentPtr().Index++
- return tkz.Peek()
- }
- func (tkz *Tokenizer) Previous() rune {
- if tkz.Current().Index > 0 {
- tkz.CurrentPtr().Index--
- if tkz.Peek() == '\n' {
- // Stepping back over a newline returns to the previous row.
- tkz.CurrentPtr().Column = 0
- tkz.CurrentPtr().Row--
- } else if tkz.CurrentPtr().Column > 0 {
- tkz.CurrentPtr().Column--
- }
- }
- return tkz.Peek()
- }
- func (tkz *Tokenizer) SkipRune() {
- _ = tkz.Next()
- }
- func (tkz *Tokenizer) SkipIn(set string) bool {
- for strings.ContainsRune(set, tkz.Next()) {
- monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek())
- if tkz.Peek() == '\000' {
- return false
- }
- }
- return true
- }
- func (tkz *Tokenizer) SkipCurrentNotIn(set string) bool {
- for c := tkz.Peek(); !strings.ContainsRune(set, c); c = tkz.Next() {
- monolog.Debug("SkipNotIn: %c %v", c, tkz.Current())
- if c == '\000' {
- return false
- }
- }
- return true
- }
- func (tkz *Tokenizer) SkipNotIn(set string) bool {
- _ = tkz.Next()
- return tkz.SkipCurrentNotIn(set)
- }
- func (tkz *Tokenizer) SkipWhile(should_skip func(r rune) bool) bool {
- for should_skip(tkz.Peek()) {
- if tkz.Next() == '\000' {
- return false
- }
- }
- return true
- }
- func (tkz *Tokenizer) SkipWhitespace() {
- tkz.SkipIn(" \t")
- }
- func (tkz *Tokenizer) Advance() {
- (*tkz.LastPtr()) = tkz.Current()
- }
- func (tkz *Tokenizer) Rewind() {
- (*tkz.CurrentPtr()) = tkz.Last()
- }
- func (tkz *Tokenizer) CurrentRuneValue() []rune {
- return tkz.Input[tkz.Last().Index:tkz.Current().Index]
- }
- func (tkz *Tokenizer) CurrentStringValue() string {
- return string(tkz.CurrentRuneValue())
- }
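- // Found emits a token whose text spans from the last marked position up to
- // the current one, then advances the mark.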
- func (tkz *Tokenizer) Found(kind TokenType) {
- tkz.Emit(kind, TokenText(tkz.CurrentStringValue()))
- tkz.Advance()
- }
- func getFunctionName(fun interface{}) string {
- return runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name()
- }
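- // Start runs the tokenizer rules until one returns nil and returns all
- // tokens emitted along the way.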
- func (tkz *Tokenizer) Start() []*Token {
- rule := tkz.rule
- for rule != nil {
- monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule))
- rule = rule(tkz)
- }
- return tkz.Output
- }
- func (tkz *Tokenizer) TryTokenizing() {
- tokens := tkz.Start()
- for _, token := range tokens {
- monolog.Info("Token %s", token)
- }
- }
- func (tkz *Tokenizer) EmitToken(token *Token) *Token {
- tkz.Output = append(tkz.Output, token)
- return token
- }
- // TokenizeStart is the initial rule; it simply hands over to TokenizeNormal.
- func TokenizeStart(tkz *Tokenizer) TokenizerRule {
- return TokenizeNormal
- }
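- // Tokenize splits input into tokens. Word tokens are not classified as
- // keywords here; that is left to later stages.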
- func Tokenize(input string) []*Token {
- tokenizer := NewTokenizer(input)
- return tokenizer.Start()
- }
- func TryTokenizingString(input string) {
- tokens := Tokenize(input)
- for _, token := range tokens {
- monolog.Info("Token %s", token)
- }
- }
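- // A minimal usage sketch (not part of the original file): feeding a line of
- // input through Tokenize and printing each token. The sample input is made
- // up for illustration only.
- func ExampleTokenize() {
- for _, token := range Tokenize("put the lamp on the table\n") {
- fmt.Println(token.ShortString())
- }
- }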