- package raku
- import "strings"
- import "unicode"
- import "fmt"
- import "reflect"
- import "runtime"
- import "gitlab.com/beoran/woe/monolog"
/* The tokenizer splits up text in tokens without classifying Word tokens. */
type Tokenizer struct {
	CurrentPosition Position      // position of the rune about to be read (end of the token in progress)
	LastPosition    Position      // position where the previous token ended; a token spans Last..Current
	Input           []rune        // the whole input, decoded to runes up front
	Output          []*Token      // tokens emitted so far
	rule            TokenizerRule // initial state function (NOTE(review): Start() uses TokenizeNormal and ignores this field)
}

// TokenizerRule is a state function: it consumes input from the tokenizer
// and returns the next rule to run, or nil to stop tokenizing.
type TokenizerRule func(tkz *Tokenizer) TokenizerRule
- func NewTokenizer(input string) * Tokenizer {
- tokenizer := &Tokenizer{}
- tokenizer.Input = []rune(input);
- tokenizer.rule = TokenizeStart
- return tokenizer;
- }
// Last returns the position where the previous token ended.
func (tkz *Tokenizer) Last() Position {
	return tkz.LastPosition
}

// Current returns the position of the rune about to be read.
func (tkz *Tokenizer) Current() Position {
	return tkz.CurrentPosition
}

// LastPtr returns a pointer to the last position, for in-place updates.
func (tkz *Tokenizer) LastPtr() *Position {
	return &tkz.LastPosition
}

// CurrentPtr returns a pointer to the current position, for in-place updates.
func (tkz *Tokenizer) CurrentPtr() *Position {
	return &tkz.CurrentPosition
}
// Emit constructs a Token of the given type and text at the current
// position and appends it to the tokenizer's output.
func (tkz *Tokenizer) Emit(t TokenType, v TokenText) {
	tok := &Token{t, v, tkz.Current()}
	tkz.EmitToken(tok)
}
// Error formats a message, logs it, and emits a TokenError token
// carrying the formatted text.
func (tkz *Tokenizer) Error(message string, args ...interface{}) {
	value := fmt.Sprintf(message, args...)
	monolog.Error("Tokenize Error: %s", value)
	tkz.Emit(TokenError, TokenText(value))
}
// TokenizeError is the terminal error rule: it emits a generic error
// token and stops the tokenizer by returning nil.
func TokenizeError(tkz *Tokenizer) TokenizerRule {
	tkz.Error("Error")
	return nil
}
// SkipComment skips a comment introduced by '#', called with the '#'
// as the current rune. A "#(" comment runs until the next ')'; any
// other '#' comment runs to the end of the line. Returns false when
// the comment is not terminated before EOF (see SkipNotIn).
func (tkz *Tokenizer) SkipComment() bool {
	if tkz.Peek() == '#' {
		if tkz.Next() == '(' {
			return tkz.SkipNotIn(")")
		} else {
			return tkz.SkipNotIn("\r\n")
		}
	}
	return true
}
- func TokenizeSigil(tkz *Tokenizer) TokenizerRule {
- tkz.Found(TokenType(tkz.Peek()))
- _ = tkz.Next()
- tkz.Advance()
- return TokenizeNormal
- }
// tokenDelimiter lists the runes that terminate a Word, Symbol or Number token.
const tokenDelimiter = " \t\r\n'({[]})"
// TokenizeWord scans up to the next delimiter and emits the span as a
// TokenWord.
func TokenizeWord(tkz *Tokenizer) TokenizerRule {
	tkz.SkipNotIn(tokenDelimiter)
	tkz.Found(TokenWord)
	return TokenizeNormal
}

// TokenizeSymbol scans up to the next delimiter and emits the span
// (including the leading '$') as a TokenSymbol.
func TokenizeSymbol(tkz *Tokenizer) TokenizerRule {
	tkz.SkipNotIn(tokenDelimiter)
	tkz.Found(TokenSymbol)
	return TokenizeNormal
}

// TokenizeNumber scans up to the next delimiter and emits the span as a
// TokenNumber. No numeric validation is performed here.
func TokenizeNumber(tkz *Tokenizer) TokenizerRule {
	tkz.SkipNotIn(tokenDelimiter)
	tkz.Found(TokenNumber)
	return TokenizeNormal
}
// TokenizeWhitespace skips spaces and tabs without emitting a token,
// then resumes normal tokenizing.
func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule {
	tkz.SkipWhitespace()
	tkz.Advance()
	return TokenizeNormal
}
// TokenizeComment skips a '#' comment without emitting a token; an
// unterminated "#(" comment is reported as an error.
func TokenizeComment(tkz *Tokenizer) TokenizerRule {
	if !tkz.SkipComment() {
		tkz.Error("Unterminated comment")
		return TokenizeError
	}
	tkz.Advance()
	return TokenizeNormal
}
- func TokenizePunctuator(tkz *Tokenizer) TokenizerRule {
- tkz.Found(TokenType(tkz.Peek()))
- _ = tkz.Next()
- tkz.Advance()
- return TokenizeNormal
- }
// TokenizeEOL consumes a run of CR/LF runes and emits it as one
// TokenEOL token.
func TokenizeEOL(tkz *Tokenizer) TokenizerRule {
	tkz.SkipIn("\r\n")
	tkz.Found(TokenEOL)
	return TokenizeNormal
}
// TokenizeOperator consumes a run of operator runes and emits it as one
// TokenOperator token. (operator_chars is defined elsewhere in the
// package.)
func TokenizeOperator(tkz *Tokenizer) TokenizerRule {
	tkz.SkipIn(operator_chars)
	tkz.Found(TokenOperator)
	return TokenizeNormal
}
// tokenizeEscape consumes the rune following a backslash inside a
// double-quoted string. NOTE(review): it only skips the rune so the
// closing-quote check is not fooled; no escape translation happens
// here, and it currently never returns an error.
func tokenizeEscape(tkz *Tokenizer) error {
	_ = tkz.Next()
	return nil
}
// TokenizeString scans a string literal delimited by the current rune
// (called with '"' or '`' as the current rune, per TokenizeNormal).
// Backslash escapes are honored only in double-quoted strings. The
// emitted TokenString text excludes both delimiters. On an
// unterminated string an error token is emitted and tokenizing stops.
func TokenizeString(tkz *Tokenizer) TokenizerRule {
	open := tkz.Peek()
	do_escape := open == '"' // only "-strings process backslash escapes
	peek := tkz.Next()
	tkz.Advance() // token text starts after the opening delimiter
	for ; peek != '\000'; peek = tkz.Next() {
		if do_escape && peek == '\\' {
			if err := tokenizeEscape(tkz); err != nil {
				return TokenizeError
			}
		} else if peek == open {
			// Current position is on the closing quote, so the
			// emitted text spans Last..Current without the quotes.
			tkz.Found(TokenString)
			_ = tkz.Next()
			tkz.Advance()
			return TokenizeNormal
		}
	}
	tkz.Error("Unexpected EOF in string.")
	return nil
}
// TokenizeNumberOrOperator disambiguates a leading '+' or '-': when the
// following rune is a digit the span is a signed number, otherwise the
// sign starts an operator.
func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule {
	if unicode.IsDigit(tkz.Next()) {
		return TokenizeNumber
	} else {
		_ = tkz.Previous() // step back so the sign is included in the operator
		return TokenizeOperator
	}
}
- func TokenizeNormal(tkz *Tokenizer) TokenizerRule {
- peek := tkz.Peek()
- if peek == '#' {
- return TokenizeComment
- } else if strings.ContainsRune(" \t", peek) {
- return TokenizeWhitespace
- } else if strings.ContainsRune(".,;:", peek) {
- return TokenizePunctuator
- } else if strings.ContainsRune("([{}])", peek) {
- return TokenizeSigil
- } else if strings.ContainsRune("$", peek) {
- return TokenizeSymbol
- } else if strings.ContainsRune("\r\n", peek) {
- return TokenizeEOL
- } else if strings.ContainsRune("+-", peek) {
- return TokenizeNumberOrOperator
- } else if strings.ContainsRune("\"`", peek) {
- return TokenizeString
- } else if peek == '\000' {
- tkz.Emit(TokenEOF, "")
- return nil
- } else if unicode.IsLetter(peek) {
- return TokenizeWord
- } else if unicode.IsDigit(peek) {
- return TokenizeNumber
- } else if strings.ContainsRune(operator_chars, peek) {
- return TokenizeOperator
- } else {
- return TokenizeError
- }
- }
- func (tkz *Tokenizer) Peek() rune {
- if (tkz.Current().Index) >= len(tkz.Input) {
- return '\000'
- }
- return tkz.Input[tkz.Current().Index]
- }
- func (tkz *Tokenizer) PeekNext() rune {
- if (tkz.Current().Index + 1) >= len(tkz.Input) {
- return '\000'
- }
- return tkz.Input[tkz.Current().Index+1]
- }
- func (tkz *Tokenizer) Next() rune {
- monolog.Debug("Next: %c %s", tkz.Peek(), tkz.Current())
- if tkz.Peek() == '\n' {
- tkz.CurrentPtr().Column = 0
- tkz.CurrentPtr().Row++
- }
- tkz.CurrentPtr().Index++
- return tkz.Peek()
- }
- func (tkz *Tokenizer) Previous() rune {
- if tkz.Current().Index > 0 {
- tkz.CurrentPtr().Index--
- if tkz.Peek() == '\n' {
- tkz.CurrentPtr().Column = 0
- tkz.CurrentPtr().Row++
- }
- }
- return tkz.Peek()
- }
// SkipRune consumes the current rune, discarding it.
func (tkz *Tokenizer) SkipRune() {
	_ = tkz.Next()
}
// SkipIn consumes the current rune unconditionally, then keeps
// consuming while the new current rune is a member of set.
// NOTE(review): at end of input Next yields '\000', which ends the loop
// with a true result unless '\000' itself is in set — only then can the
// inner EOF check return false.
func (tkz *Tokenizer) SkipIn(set string) bool {
	for strings.ContainsRune(set, tkz.Next()) {
		monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek())
		if tkz.Peek() == '\000' {
			return false
		}
	}
	return true
}
// SkipNotIn consumes the current rune unconditionally, then keeps
// consuming until the current rune is a member of set. Returns false
// when end of input ('\000') is reached before any rune from set is
// found, true otherwise.
func (tkz *Tokenizer) SkipNotIn(set string) bool {
	_ = tkz.Next()
	for c := tkz.Peek(); !strings.ContainsRune(set, c); c = tkz.Next() {
		monolog.Debug("SkipNotIn: %c %s", c, tkz.Current())
		if c == '\000' {
			return false
		}
	}
	return true
}
- func (tkz *Tokenizer) SkipWhile(should_skip func(r rune) bool) bool {
- for should_skip(tkz.Peek()) {
- if tkz.Next() == '\000' {
- return false
- }
- }
- return true
- }
// SkipWhitespace consumes a run of spaces and tabs (not newlines, which
// are EOL tokens).
func (tkz *Tokenizer) SkipWhitespace() {
	tkz.SkipIn(" \t")
}
- func (tkz *Tokenizer) Advance() {
- (*tkz.LastPtr()) = tkz.Current()
- }
- func (tkz *Tokenizer) Rewind() {
- (*tkz.CurrentPtr()) = tkz.Last()
- }
// CurrentRuneValue returns the runes between the last recorded position
// and the current position — the text of the token in progress.
func (tkz *Tokenizer) CurrentRuneValue() []rune {
	return tkz.Input[tkz.Last().Index:tkz.Current().Index]
}

// CurrentStringValue returns CurrentRuneValue converted to a string.
func (tkz *Tokenizer) CurrentStringValue() string {
	return string(tkz.CurrentRuneValue())
}
// Found emits a token of the given kind whose text is the input span
// between the last and current positions, then marks the current
// position as the start of the next token.
func (tkz *Tokenizer) Found(kind TokenType) {
	tkz.Emit(kind, TokenText(tkz.CurrentStringValue()))
	tkz.Advance()
}
// getFunctionName reports the runtime name of the given function value,
// used for debug-logging which tokenizer rule is running.
func getFunctionName(fun interface{}) string {
	pc := reflect.ValueOf(fun).Pointer()
	return runtime.FuncForPC(pc).Name()
}
// Start runs tokenizer rules beginning with TokenizeNormal until a rule
// returns nil, then returns all accumulated tokens.
// NOTE(review): the rule field set by NewTokenizer is not consulted here.
func (tkz *Tokenizer) Start() []*Token {
	rule := TokenizeNormal
	for rule != nil {
		monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule))
		rule = rule(tkz)
	}
	return tkz.Output
}
- func (tkz *Tokenizer) TryTokenizing() {
- tokens := tkz.Start()
- for token := range tokens {
- monolog.Info("Token %s", token)
- }
- }
- func (tkz * Tokenizer) EmitToken(token * Token) * Token {
- tkz.Output = append(tkz.Output, token)
- return token
- }
// TokenizeStart is the initial rule assigned by NewTokenizer.
// NOTE(review): it is a stub that stops immediately; Start() begins
// with TokenizeNormal directly and never invokes this.
func TokenizeStart(tkz *Tokenizer) TokenizerRule {
	return nil
}
// Tokenize splits input into tokens using a fresh Tokenizer and returns
// them.
func Tokenize(input string) []*Token {
	tokenizer := NewTokenizer(input)
	return tokenizer.Start()
}
- func TryTokenizingString(input string) {
- tokens := Tokenize(input)
- for token := range tokens {
- monolog.Info("Token %s", token)
- }
- }