@@ -0,0 +1,347 @@
+package raku
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+	"strings"
+	"unicode"
+
+	"gitlab.com/beoran/woe/monolog"
+)
+
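+// Tokenizer scans an input rune slice and accumulates Tokens in Output.
+// LastPosition marks where the token currently being scanned started,
+// CurrentPosition is the scanning head, and rule holds the initial state.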
+type Tokenizer struct {
+	CurrentPosition Position
+	LastPosition    Position
+	Input           []rune
+	Output          []*Token
+	rule            TokenizerRule
+}
+
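+// A TokenizerRule is one state of the tokenizer's state machine: it
+// consumes input and returns the next state, or nil to stop tokenizing.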
+type TokenizerRule func(tkz *Tokenizer) TokenizerRule
+
+// NewTokenizer sets up a Tokenizer over the given input, starting in the
+// TokenizeStart rule.
+func NewTokenizer(input string) *Tokenizer {
+	tokenizer := &Tokenizer{}
+	tokenizer.Input = []rune(input)
+	tokenizer.rule = TokenizeStart
+	return tokenizer
+}
+
+// Last returns the position where the token being scanned started.
+func (tkz *Tokenizer) Last() Position {
+	return tkz.LastPosition
+}
+
+// Current returns the position the tokenizer is currently scanning.
+func (tkz *Tokenizer) Current() Position {
+	return tkz.CurrentPosition
+}
+
+// LastPtr returns a pointer to the last-marked position.
+func (tkz *Tokenizer) LastPtr() *Position {
+	return &tkz.LastPosition
+}
+
+// CurrentPtr returns a pointer to the current position.
+func (tkz *Tokenizer) CurrentPtr() *Position {
+	return &tkz.CurrentPosition
+}
+
+// Emit appends a token of the given type and text at the current position.
+func (tkz *Tokenizer) Emit(t TokenType, v TokenText) {
+	tok := &Token{t, v, tkz.Current()}
+	tkz.EmitToken(tok)
+}
+
+// Error logs a tokenizer error and emits a TokenError token carrying the
+// formatted error message.
+func (tkz *Tokenizer) Error(message string, args ...interface{}) {
+	value := fmt.Sprintf(message, args...)
+	monolog.Error("Tokenize Error: %s", value)
+	tkz.Emit(TokenError, TokenText(value))
+}
+
+// TokenizeError is the terminal error state: it emits a generic error
+// token and stops the tokenizer by returning nil.
+func TokenizeError(tkz *Tokenizer) TokenizerRule {
+	tkz.Error("Error")
+	return nil
+}
+
+// SkipComment skips a comment: either a block comment #( ... ) or a line
+// comment # ... that runs to the end of the line. It reports false if the
+// input ends before the comment is terminated.
+func (tkz *Tokenizer) SkipComment() bool {
+	if tkz.Peek() == '#' {
+		if tkz.Next() == '(' {
+			if !tkz.SkipNotIn(")") {
+				return false
+			}
+			tkz.SkipRune() // Consume the closing ')'.
+			return true
+		}
+		return tkz.SkipNotIn("\r\n")
+	}
+	return true
+}
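+
+// For example, `#( a block comment )` is skipped including the closing
+// ')', while `# a line comment` is skipped up to (but not including) the
+// newline, which then becomes an EOL token.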
+
+// TokenizeSigil emits a single bracket rune ( ( [ { } ] ) ) as a token
+// whose type is the rune itself, then consumes it.
+func TokenizeSigil(tkz *Tokenizer) TokenizerRule {
+	tkz.Found(TokenType(tkz.Peek()))
+	_ = tkz.Next()
+	tkz.Advance()
+	return TokenizeNormal
+}
+
+// tokenDelimiter lists the runes that end a word, symbol, or number token.
+const tokenDelimiter = " \t\r\n'({[]})"
+
+// TokenizeWord scans up to the next delimiter and emits a word token.
+func TokenizeWord(tkz *Tokenizer) TokenizerRule {
+	tkz.SkipNotIn(tokenDelimiter)
+	tkz.Found(TokenWord)
+	return TokenizeNormal
+}
+
+// TokenizeSymbol scans a $-prefixed symbol up to the next delimiter.
+func TokenizeSymbol(tkz *Tokenizer) TokenizerRule {
+	tkz.SkipNotIn(tokenDelimiter)
+	tkz.Found(TokenSymbol)
+	return TokenizeNormal
+}
+
+// TokenizeNumber scans a number up to the next delimiter.
+func TokenizeNumber(tkz *Tokenizer) TokenizerRule {
+	tkz.SkipNotIn(tokenDelimiter)
+	tkz.Found(TokenNumber)
+	return TokenizeNormal
+}
+
+// TokenizeWhitespace skips spaces and tabs without emitting a token.
+func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule {
+	tkz.SkipWhitespace()
+	tkz.Advance()
+	return TokenizeNormal
+}
+
+// TokenizeComment skips a comment without emitting a token, erroring out
+// on an unterminated block comment.
+func TokenizeComment(tkz *Tokenizer) TokenizerRule {
+	if !tkz.SkipComment() {
+		tkz.Error("Unterminated comment")
+		return TokenizeError
+	}
+	tkz.Advance()
+	return TokenizeNormal
+}
+
+// TokenizePunctuator emits a single punctuation rune (. , ; :) as a token
+// whose type is the rune itself, then consumes it.
+func TokenizePunctuator(tkz *Tokenizer) TokenizerRule {
+	tkz.Found(TokenType(tkz.Peek()))
+	_ = tkz.Next()
+	tkz.Advance()
+	return TokenizeNormal
+}
+
+// TokenizeEOL consumes a run of line endings and emits an EOL token.
+func TokenizeEOL(tkz *Tokenizer) TokenizerRule {
+	tkz.SkipIn("\r\n")
+	tkz.Found(TokenEOL)
+	return TokenizeNormal
+}
+
+// TokenizeOperator consumes a run of operator characters (operator_chars
+// is defined elsewhere in this package) and emits an operator token.
+func TokenizeOperator(tkz *Tokenizer) TokenizerRule {
+	tkz.SkipIn(operator_chars)
+	tkz.Found(TokenOperator)
+	return TokenizeNormal
+}
+
+// tokenizeEscape handles a backslash escape inside a double-quoted string
+// by simply skipping the escaped rune; the backslash stays in the token
+// text, so unescaping is left to later stages.
+func tokenizeEscape(tkz *Tokenizer) error {
+	_ = tkz.Next()
+	return nil
+}
+
+// TokenizeString scans a string delimited by the rune that opened it.
+// Backslash escapes are only honored inside double-quoted strings.
+func TokenizeString(tkz *Tokenizer) TokenizerRule {
+	open := tkz.Peek()
+	doEscape := open == '"'
+	peek := tkz.Next()
+	tkz.Advance() // Don't include the opening quote in the token text.
+	for ; peek != '\000'; peek = tkz.Next() {
+		if doEscape && peek == '\\' {
+			if err := tokenizeEscape(tkz); err != nil {
+				return TokenizeError
+			}
+		} else if peek == open {
+			tkz.Found(TokenString)
+			_ = tkz.Next()
+			tkz.Advance() // Skip past the closing quote.
+			return TokenizeNormal
+		}
+	}
+	tkz.Error("Unexpected EOF in string.")
+	return nil
+}
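+
+// For example, tokenizing the input `"a\"b"` yields a TokenString whose
+// text is `a\"b`: the quotes are stripped, but the backslash is kept.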
+
+// TokenizeNumberOrOperator disambiguates a leading + or -: it is a number
+// if a digit follows, and an operator otherwise.
+func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule {
+	if unicode.IsDigit(tkz.Next()) {
+		return TokenizeNumber
+	}
+	_ = tkz.Previous() // Put the sign back for the operator rule.
+	return TokenizeOperator
+}
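+
+// For example, `+5` scans as a single number token, while `+` followed by
+// a non-digit falls back to the operator rule.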
+
+// TokenizeNormal is the dispatching state: it looks at the current rune
+// and selects the rule for the token that starts there.
+func TokenizeNormal(tkz *Tokenizer) TokenizerRule {
+	peek := tkz.Peek()
+	if peek == '#' {
+		return TokenizeComment
+	} else if strings.ContainsRune(" \t", peek) {
+		return TokenizeWhitespace
+	} else if strings.ContainsRune(".,;:", peek) {
+		return TokenizePunctuator
+	} else if strings.ContainsRune("([{}])", peek) {
+		return TokenizeSigil
+	} else if peek == '$' {
+		return TokenizeSymbol
+	} else if strings.ContainsRune("\r\n", peek) {
+		return TokenizeEOL
+	} else if strings.ContainsRune("+-", peek) {
+		return TokenizeNumberOrOperator
+	} else if strings.ContainsRune("\"`", peek) {
+		return TokenizeString
+	} else if peek == '\000' {
+		tkz.Emit(TokenEOF, "")
+		return nil
+	} else if unicode.IsLetter(peek) {
+		return TokenizeWord
+	} else if unicode.IsDigit(peek) {
+		return TokenizeNumber
+	} else if strings.ContainsRune(operator_chars, peek) {
+		return TokenizeOperator
+	}
+	return TokenizeError
+}
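+
+// As a sketch of the dispatch, scanning `say "hi" +3` goes through
+// TokenizeWord for `say`, TokenizeString for `"hi"`, and
+// TokenizeNumberOrOperator (then TokenizeNumber) for `+3`.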
+
+// Peek returns the rune at the current position, or NUL at end of input.
+func (tkz *Tokenizer) Peek() rune {
+	if tkz.Current().Index >= len(tkz.Input) {
+		return '\000'
+	}
+	return tkz.Input[tkz.Current().Index]
+}
+
+// PeekNext returns the rune after the current one, or NUL at end of input.
+func (tkz *Tokenizer) PeekNext() rune {
+	if (tkz.Current().Index + 1) >= len(tkz.Input) {
+		return '\000'
+	}
+	return tkz.Input[tkz.Current().Index+1]
+}
+
+// Next advances the current position by one rune, tracking row and column
+// across newlines, and returns the rune at the new position.
+func (tkz *Tokenizer) Next() rune {
+	monolog.Debug("Next: %c %v", tkz.Peek(), tkz.Current())
+	if tkz.Peek() == '\n' {
+		tkz.CurrentPtr().Column = 0
+		tkz.CurrentPtr().Row++
+	} else {
+		tkz.CurrentPtr().Column++
+	}
+	tkz.CurrentPtr().Index++
+	return tkz.Peek()
+}
+
+// Previous steps the current position back by one rune, undoing the row
+// bookkeeping when it steps back over a newline.
+func (tkz *Tokenizer) Previous() rune {
+	if tkz.Current().Index > 0 {
+		tkz.CurrentPtr().Index--
+		if tkz.Peek() == '\n' {
+			// The exact column cannot be recovered here.
+			tkz.CurrentPtr().Column = 0
+			tkz.CurrentPtr().Row--
+		}
+	}
+	return tkz.Peek()
+}
+
+// SkipRune consumes a single rune without emitting anything.
+func (tkz *Tokenizer) SkipRune() {
+	_ = tkz.Next()
+}
+
+// SkipIn consumes runes as long as they are in the given set. It reports
+// false if the end of input is reached.
+func (tkz *Tokenizer) SkipIn(set string) bool {
+	for strings.ContainsRune(set, tkz.Next()) {
+		monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek())
+		if tkz.Peek() == '\000' {
+			return false
+		}
+	}
+	return true
+}
+
+// SkipNotIn consumes the current rune and then keeps going until it sees a
+// rune from the given set, which is left unconsumed. It reports false if
+// the end of input is reached first.
+func (tkz *Tokenizer) SkipNotIn(set string) bool {
+	_ = tkz.Next()
+	for c := tkz.Peek(); !strings.ContainsRune(set, c); c = tkz.Next() {
+		monolog.Debug("SkipNotIn: %c %v", c, tkz.Current())
+		if c == '\000' {
+			return false
+		}
+	}
+	return true
+}
+
+// SkipWhile consumes runes as long as shouldSkip returns true for the
+// current rune, reporting false if the end of input is reached.
+func (tkz *Tokenizer) SkipWhile(shouldSkip func(r rune) bool) bool {
+	for shouldSkip(tkz.Peek()) {
+		if tkz.Next() == '\000' {
+			return false
+		}
+	}
+	return true
+}
+
+// SkipWhitespace consumes spaces and tabs, but not line endings.
+func (tkz *Tokenizer) SkipWhitespace() {
+	tkz.SkipIn(" \t")
+}
+
+// Advance marks the current position as the start of the next token.
+func (tkz *Tokenizer) Advance() {
+	tkz.LastPosition = tkz.CurrentPosition
+}
+
+// Rewind moves the current position back to the last marked position.
+func (tkz *Tokenizer) Rewind() {
+	tkz.CurrentPosition = tkz.LastPosition
+}
+
+// CurrentRuneValue returns the runes scanned since the last Advance.
+func (tkz *Tokenizer) CurrentRuneValue() []rune {
+	return tkz.Input[tkz.Last().Index:tkz.Current().Index]
+}
+
+// CurrentStringValue returns the text scanned since the last Advance.
+func (tkz *Tokenizer) CurrentStringValue() string {
+	return string(tkz.CurrentRuneValue())
+}
+
+// Found emits the text scanned so far as a token of the given kind and
+// marks the start of the next token.
+func (tkz *Tokenizer) Found(kind TokenType) {
+	tkz.Emit(kind, TokenText(tkz.CurrentStringValue()))
+	tkz.Advance()
+}
+
+// getFunctionName reports the name of a function value, for debug logging
+// of the active tokenizer rule.
+func getFunctionName(fun interface{}) string {
+	return runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name()
+}
+
+// Start runs the tokenizer state machine from its initial rule until a
+// rule returns nil, then returns the tokens that were emitted. Each rule
+// returns the next rule, so this loop is the whole driver.
+func (tkz *Tokenizer) Start() []*Token {
+	rule := tkz.rule
+	for rule != nil {
+		monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule))
+		rule = rule(tkz)
+	}
+	return tkz.Output
+}
+
+// TryTokenizing runs the tokenizer and logs every token it produced.
+func (tkz *Tokenizer) TryTokenizing() {
+	tokens := tkz.Start()
+	for _, token := range tokens {
+		monolog.Info("Token %v", token)
+	}
+}
+
+// EmitToken appends a token to the output and returns it.
+func (tkz *Tokenizer) EmitToken(token *Token) *Token {
+	tkz.Output = append(tkz.Output, token)
+	return token
+}
+
+// TokenizeStart is the initial rule; it simply hands off to the normal
+// dispatching rule.
+func TokenizeStart(tkz *Tokenizer) TokenizerRule {
+	return TokenizeNormal
+}
+
+// Tokenize is the convenience entry point: it tokenizes an input string
+// in a single call.
+func Tokenize(input string) []*Token {
+	tokenizer := NewTokenizer(input)
+	return tokenizer.Start()
+}
+
+// TryTokenizingString tokenizes a string and logs every resulting token.
+func TryTokenizingString(input string) {
+	tokens := Tokenize(input)
+	for _, token := range tokens {
+		monolog.Info("Token %v", token)
+	}
+}
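+
+// A rough usage sketch, using only names defined in this package:
+//
+//	tokens := Tokenize("say $foo 1 +2\n")
+//	// tokens: TokenWord "say", TokenSymbol "$foo", TokenNumber "1",
+//	// TokenNumber "+2", TokenEOL "\n", and finally TokenEOF.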