
Attempt a three-step tokenizer/classifier/parser approach for Raku.

Beoran, 6 years ago
commit 2a51c41e8e
2 changed files with 406 additions and 0 deletions
  1. raku/tokenizer.go  +347  -0
  2. raku/tokenizer_test.go  +59  -0

+ 347 - 0
raku/tokenizer.go

@@ -0,0 +1,347 @@
+package raku
+
+import "strings"
+import "unicode"
+import "fmt"
+import "reflect"
+import "runtime"
+import  "gitlab.com/beoran/woe/monolog"
+
+
+/* The tokenizer splits up text in tokens without classifying Word tokens. */
+type Tokenizer struct {
+    CurrentPosition     Position
+    LastPosition        Position
+    Input               []rune
+    Output              []*Token
+    rule                TokenizerRule
+}
+
+
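+// A TokenizerRule is a state function: it consumes part of the input and
+// returns the next rule to apply, or nil when tokenizing should stop.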
+type TokenizerRule func(tkz *Tokenizer) TokenizerRule
+
+
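+// NewTokenizer prepares a Tokenizer for the given input string.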
+func NewTokenizer(input string) *Tokenizer {
+    tokenizer := &Tokenizer{}
+    tokenizer.Input = []rune(input)
+    tokenizer.rule = TokenizeStart
+    return tokenizer
+}
+
+func (tkz *Tokenizer) Last() Position {
+    return tkz.LastPosition
+}
+
+func (tkz *Tokenizer) Current() Position {
+    return tkz.CurrentPosition
+}
+
+func (tkz *Tokenizer) LastPtr() *Position {
+    return &tkz.LastPosition
+}
+
+func (tkz *Tokenizer) CurrentPtr() *Position {
+    return &tkz.CurrentPosition
+}
+
+
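+// Emit appends a token with the given type and text, at the current
+// position, to the tokenizer's output.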
+func (tkz *Tokenizer) Emit(t TokenType, v TokenText) {
+    tok := &Token{t, v, tkz.Current()}
+    tkz.EmitToken(tok)
+}
+
+func (tkz *Tokenizer) Error(message string, args ...interface{}) {
+    value := fmt.Sprintf(message, args...)
+    monolog.Error("Tokenize Error: %s", value)
+    tkz.Emit(TokenError, TokenText(value))
+}
+
+func TokenizeError(tkz *Tokenizer) TokenizerRule {
+    tkz.Error("Error")
+    return nil
+}
+
+func (tkz *Tokenizer) SkipComment() bool {
+    if tkz.Peek() == '#' {
+        if tkz.Next() == '(' {
+            return tkz.SkipNotIn(")")
+        } else {
+            return tkz.SkipNotIn("\r\n")
+        }
+    }
+    return true
+}
+
+
+func TokenizeSigil(tkz *Tokenizer) TokenizerRule {
+    tkz.Found(TokenType(tkz.Peek()))
+    _ = tkz.Next()
+    tkz.Advance()
+    return TokenizeNormal
+}
+
+const tokenDelimiter = " \t\r\n'({[]})"
+
+
+func TokenizeWord(tkz *Tokenizer) TokenizerRule {
+    tkz.SkipNotIn(tokenDelimiter)
+    tkz.Found(TokenWord)
+    return TokenizeNormal
+}
+
+func TokenizeSymbol(tkz *Tokenizer) TokenizerRule {
+    tkz.SkipNotIn(tokenDelimiter)
+    tkz.Found(TokenSymbol)
+    return TokenizeNormal
+}
+
+func TokenizeNumber(tkz *Tokenizer) TokenizerRule {
+    tkz.SkipNotIn(tokenDelimiter)
+    tkz.Found(TokenNumber)
+    return TokenizeNormal
+}
+
+func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule {
+    tkz.SkipWhitespace()
+    tkz.Advance()
+    return TokenizeNormal
+}
+
+func TokenizeComment(tkz *Tokenizer) TokenizerRule {
+    if !tkz.SkipComment() {
+        tkz.Error("Unterminated comment")
+        return TokenizeError
+    }
+    tkz.Advance()
+    return TokenizeNormal
+}
+
+func TokenizePunctuator(tkz *Tokenizer) TokenizerRule {
+    tkz.Found(TokenType(tkz.Peek()))
+    _ = tkz.Next()
+    tkz.Advance()
+    return TokenizeNormal
+}
+
+func TokenizeEOL(tkz *Tokenizer) TokenizerRule {
+    tkz.SkipIn("\r\n")
+    tkz.Found(TokenEOL)
+    return TokenizeNormal
+}
+
+func TokenizeOperator(tkz *Tokenizer) TokenizerRule {
+    tkz.SkipIn(operator_chars)
+    tkz.Found(TokenOperator)
+    return TokenizeNormal
+}
+
+func tokenizeEscape(tkz *Tokenizer) error {
+    _ = tkz.Next()
+    return nil
+}
+
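+// TokenizeString lexes a quoted string. The opening rune also closes the
+// string, and backslash escapes are only honoured inside double quotes.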
+func TokenizeString(tkz *Tokenizer) TokenizerRule {
+    open := tkz.Peek()
+    do_escape := open == '"'
+    peek := tkz.Next()
+    tkz.Advance()
+    for ; peek != '\000'; peek = tkz.Next() {
+        if do_escape && peek == '\\' {
+            if err := tokenizeEscape(tkz); err != nil {
+                return TokenizeError
+            }
+        } else if peek == open {
+            tkz.Found(TokenString)
+            _ = tkz.Next()
+            tkz.Advance()
+            return TokenizeNormal
+        }
+    }
+    tkz.Error("Unexpected EOF in string.")
+    return nil
+}
+
+func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule {
+    if unicode.IsDigit(tkz.Next()) {
+        return TokenizeNumber
+    } else {
+        _ = tkz.Previous()
+        return TokenizeOperator
+    }
+}
+
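+// TokenizeNormal dispatches on the next rune to the rule for comments,
+// whitespace, punctuation, sigils, symbols, line ends, numbers, operators,
+// strings or words, and emits TokenEOF at the end of the input.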
+func TokenizeNormal(tkz *Tokenizer) TokenizerRule {
+    peek := tkz.Peek()
+    if peek == '#' {
+        return TokenizeComment
+    } else if strings.ContainsRune(" \t", peek) {
+        return TokenizeWhitespace
+    } else if strings.ContainsRune(".,;:", peek) {
+        return TokenizePunctuator
+    } else if strings.ContainsRune("([{}])", peek) {
+        return TokenizeSigil
+    } else if strings.ContainsRune("$", peek) {
+        return TokenizeSymbol
+    } else if strings.ContainsRune("\r\n", peek) {
+        return TokenizeEOL
+    } else if strings.ContainsRune("+-", peek) {
+        return TokenizeNumberOrOperator
+    } else if strings.ContainsRune("\"`", peek) {
+        return TokenizeString
+    } else if peek == '\000' {
+        tkz.Emit(TokenEOF, "")
+        return nil
+    } else if unicode.IsLetter(peek) {
+        return TokenizeWord
+    } else if unicode.IsDigit(peek) {
+        return TokenizeNumber
+    } else if strings.ContainsRune(operator_chars, peek) {
+        return TokenizeOperator
+    } else {
+        return TokenizeError
+    }
+}
+
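+// Peek returns the rune at the current position, or '\000' at end of input.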
+func (tkz *Tokenizer) Peek() rune {
+    if tkz.Current().Index >= len(tkz.Input) {
+        return '\000'
+    }
+    return tkz.Input[tkz.Current().Index]
+}
+
+func (tkz *Tokenizer) PeekNext() rune {
+    if (tkz.Current().Index + 1) >= len(tkz.Input) {
+        return '\000'
+    }
+    return tkz.Input[tkz.Current().Index+1]
+}
+
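+// Next advances the current position by one rune, updating the row and
+// column when a newline is crossed, and returns the rune now under the cursor.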
+func (tkz *Tokenizer) Next() rune {
+    monolog.Debug("Next: %c %s", tkz.Peek(), tkz.Current())
+    if tkz.Peek() == '\n' {
+        tkz.CurrentPtr().Column = 0
+        tkz.CurrentPtr().Row++
+    }
+    tkz.CurrentPtr().Index++
+    return tkz.Peek()
+}
+
+func (tkz *Tokenizer) Previous() rune {
+    if tkz.Current().Index > 0 {
+        tkz.CurrentPtr().Index--
+
+        if tkz.Peek() == '\n' {
+            // Stepping back over a newline moves up one row; the exact
+            // column of that line is not tracked, so reset it to 0.
+            tkz.CurrentPtr().Column = 0
+            tkz.CurrentPtr().Row--
+        }
+    }
+    return tkz.Peek()
+}
+
+func (tkz *Tokenizer) SkipRune() {
+    _ = tkz.Next()
+}
+
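+// SkipIn advances past the current rune and any following runes that are in
+// set, leaving the cursor on the first rune that is not in set.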
+func (tkz *Tokenizer) SkipIn(set string) bool {
+    for strings.ContainsRune(set, tkz.Next()) {
+        monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek())
+        if tkz.Peek() == '\000' {
+            return false
+        }
+    }
+    return true
+}
+
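+// SkipNotIn consumes the current rune and then advances until a rune in set
+// is under the cursor. It returns false if the end of the input is reached
+// before a rune from set is found.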
+func (tkz *Tokenizer) SkipNotIn(set string) bool {
+    _ = tkz.Next()
+    for c := tkz.Peek(); !strings.ContainsRune(set, c); c = tkz.Next() {
+        monolog.Debug("SkipNotIn: %c %s", c, tkz.Current())
+        if c == '\000' {
+            return false
+        }
+    }
+    return true
+}
+
+func (tkz *Tokenizer) SkipWhile(should_skip func(r rune) bool) bool {
+    for should_skip(tkz.Peek()) {
+        if tkz.Next() == '\000' {
+            return false
+        }
+    }
+    return true
+}
+
+func (tkz *Tokenizer) SkipWhitespace() {
+    tkz.SkipIn(" \t")
+}
+
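+// Advance records the current position as the start of the next token.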
+func (tkz *Tokenizer) Advance() {
+    (*tkz.LastPtr()) = tkz.Current()
+}
+
+func (tkz *Tokenizer) Rewind() {
+    (*tkz.CurrentPtr()) = tkz.Last()
+}
+
+func (tkz *Tokenizer) CurrentRuneValue() []rune {
+    return tkz.Input[tkz.Last().Index:tkz.Current().Index]
+}
+
+func (tkz *Tokenizer) CurrentStringValue() string {
+    return string(tkz.CurrentRuneValue())
+}
+
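+// Found emits a token of the given kind whose text is the input between the
+// last recorded position and the current position, then advances the marker.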
+func (tkz *Tokenizer) Found(kind TokenType) {
+    tkz.Emit(kind, TokenText(tkz.CurrentStringValue()))
+    tkz.Advance()
+}
+
+func getFunctionName(fun interface{}) string {
+    return runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name()
+}
+
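+// Start runs the tokenizer as a state machine: each rule consumes input and
+// returns the next rule, until a rule returns nil. The collected tokens are
+// returned.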
+func (tkz *Tokenizer) Start() []*Token {
+    rule := TokenizeNormal
+    for rule != nil {
+        monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule))
+        rule = rule(tkz)
+    }
+    return tkz.Output
+}
+
+func (tkz *Tokenizer) TryTokenizing() {
+    tokens := tkz.Start()
+
+    for _, token := range tokens {
+        monolog.Info("Token %s", token)
+    }
+}
+
+
+func (tkz *Tokenizer) EmitToken(token *Token) *Token {
+    tkz.Output = append(tkz.Output, token)
+    return token
+}
+
+
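+// TokenizeStart is a placeholder initial rule; Start currently begins with
+// TokenizeNormal, and the rule field set in NewTokenizer is not used yet.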
+func TokenizeStart(tkz *Tokenizer) TokenizerRule {
+    return nil
+}
+
+
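+// Tokenize runs a fresh Tokenizer over input and returns the resulting tokens.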
+func Tokenize(input string) []*Token {
+    tokenizer := NewTokenizer(input)
+    return tokenizer.Start()
+}
+
+func TryTokenizingString(input string) {
+    tokens := Tokenize(input)
+
+    for _, token := range tokens {
+        monolog.Info("Token %s", token)
+    }
+}
+
+
+

+ 59 - 0
raku/tokenizer_test.go

@@ -0,0 +1,59 @@
+package raku
+
+import (
+    // "strings"
+    "os"
+    "testing"
+
+    "gitlab.com/beoran/woe/monolog"
+    // "gitlab.com/beoran/woe/tree"
+)
+
+func HelperTryTokenizing(input string, test *testing.T) {
+    test.Logf("Tokenizing started:")
+    output := Tokenize(input)
+    for _, token := range output {
+        test.Logf("Token %s", token.String())
+    }
+}
+
+
+func TestTokenizing1(test *testing.T) {
+    const input = `
+say "hello \"world\\"
+
+define open a door do
+    set (door's open) true
+    let door 's open be true 
+end
+
+def increment variable by value do
+    ( variable = ( variable + value ) )
+end
+
+"
+`
+    HelperTryTokenizing(input, test)
+    test.Log("Hi test!")
+} 
+
+func TestTokenizing2(test *testing.T) {
+    const input = `say`
+    HelperTryTokenizing(input, test)
+    test.Log("Hi test!")
+}
+
+func TestTokenizing3(test *testing.T) {
+    const input = `$sym`
+    HelperTryTokenizing(input, test)
+    test.Log("Hi test!")
+}
+
+
+func TestMain(m *testing.M) { 
+    monolog.Setup("raku_test.log", true, false)
+    // monolog.EnableLevel("DEBUG")
+    retCode := m.Run()
+    monolog.Close()
+    os.Exit(retCode)
+}