package flexer

import (
	"fmt"
	"regexp"
	"strconv"
	"strings"

	. "src.eruta.nl/beoran/ll1/common"
)

/* Flexer is a flexible regexp and lexeme based lexer that can be used
 * as an implementation for generated code. */

type BasicToken struct {
	location Location
	kind     Kind
	text     string
	value    Value
}

func (bt BasicToken) Kind() Kind {
	return bt.kind
}

func (bt BasicToken) Location() Location {
	return bt.location
}

func (bt BasicToken) Text() string {
	return bt.text
}

func (bt BasicToken) Value() Value {
	// Check the value field directly; calling bt.Value() here
	// would recurse forever.
	if bt.value == nil {
		return StringValue(bt.text)
	}
	return bt.value
}

func MakeToken(location Location, kind Kind, form string,
	args ...interface{}) BasicToken {
	text := fmt.Sprintf(form, args...)
	return BasicToken{location, kind, text, StringValue(text)}
}

func MakeValueToken(location Location, kind Kind, value Value) BasicToken {
	text := value.String()
	return BasicToken{location, kind, text, value}
}

type ErrorToken struct {
	BasicToken
}

func MakeErrorToken(location Location, form string, args ...interface{}) ErrorToken {
	err := fmt.Errorf(form, args...)
	tok := MakeValueToken(location, ErrorKind, ErrorValue{err})
	return ErrorToken{tok}
}

func (e ErrorToken) Error() string {
	return fmt.Sprintf("%s%s", e.Location(), e.text)
}

/* A Lexeme for the Flexer is based on a regular expression.
 * While the lexeme may have submatches, the lexer will consume
 * the whole match if it matches at the beginning of the current input. */
type Lexeme struct {
	Kind
	*regexp.Regexp
	Context string
	Action
}

// DefaultAction is the default action on a match.
// If there is only one match, that becomes the token;
// otherwise all submatches, excluding the first
// whole-string match, become the tokens.
func DefaultAction(lex Lexer, k Kind, matches ...string) []Token {
	if len(matches) == 1 {
		tok := lex.MakeToken(k, "%s", matches[0])
		return []Token{tok}
	}
	res := []Token{}
	// Start at 1 to skip the whole-string match in matches[0].
	for i := 1; i < len(matches); i++ {
		tok := lex.MakeToken(k, "%s", matches[i])
		res = append(res, tok)
	}
	return res
}

// ContextAction returns an action that returns
// no tokens, but switches the lexer context and
// empties the buffer.
func ContextAction(context string) func(lex Lexer, k Kind, matches ...string) []Token {
	return func(lex Lexer, k Kind, matches ...string) []Token {
		lex.PushContext(context)
		lex.Builder().Reset()
		return []Token{}
	}
}

// PopAction returns an action that pops the context and
// returns the token in the buffer with the given kind.
func PopAction(kind Kind) func(lex Lexer, k Kind, matches ...string) []Token {
	return func(lex Lexer, k Kind, matches ...string) []Token {
		lex.PopContext()
		tok := lex.MakeBuilderToken(kind)
		return []Token{tok}
	}
}

// StoreAction returns an action that stores the match in the lexer buffer.
func StoreAction() func(lex Lexer, k Kind, matches ...string) []Token {
	return func(lex Lexer, k Kind, matches ...string) []Token {
		for _, m := range matches {
			lex.Builder().WriteString(m)
		}
		return []Token{}
	}
}

// EscapeAction returns an action that applies strconv.UnquoteChar to the
// match to resolve an escape sequence, and stores the result in the
// lexer buffer.
func EscapeAction(quote byte) func(lex Lexer, k Kind, matches ...string) []Token {
	return func(lex Lexer, k Kind, matches ...string) []Token {
		r, _, tail, err := strconv.UnquoteChar(matches[0], quote)
		if err != nil {
			et := lex.MakeToken(ErrorKind, "%s", err)
			return []Token{et}
		}
		lex.Builder().WriteRune(r)
		lex.Builder().WriteString(tail)
		return []Token{}
	}
}
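// As a sketch, a custom Action can be written in the same style as the
// actions above. KeywordKind below is a hypothetical kind constant,
// used for illustration only:
//
//	func UpcaseAction(lex Lexer, k Kind, matches ...string) []Token {
//		// Emit the whole match upcased as a single keyword token.
//		tok := lex.MakeToken(KeywordKind, "%s", strings.ToUpper(matches[0]))
//		return []Token{tok}
//	}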
// Try tries to apply the lexeme.
// It returns nil if there is no match.
func (r Lexeme) Try(lex Lexer) []Token {
	matches := lex.Accept(r.Regexp)
	if len(matches) == 0 {
		return nil
	}
	if r.Action != nil {
		return r.Action(lex, r.Kind, matches...)
	}
	// No action set, so use the default action.
	return DefaultAction(lex, r.Kind, matches...)
}

type Flexer struct {
	index    int
	location Location
	lexemes  []Lexeme
	input    string
	name     string
	contexts []string
	builder  strings.Builder
}

func (f Flexer) MakeToken(kind Kind, form string, args ...interface{}) Token {
	return MakeToken(f.location, kind, form, args...)
}

func (f *Flexer) MakeBuilderToken(kind Kind) Token {
	text := f.builder.String()
	f.builder.Reset()
	// Pass the text as an argument so any % characters in it
	// are not misread as format verbs.
	return f.MakeToken(kind, "%s", text)
}

// advanceTo advances the flexer to the given index,
// updating the location.
func (f *Flexer) advanceTo(index int) {
	start := f.index
	end := index
	for i := start; i < end; i++ {
		c := f.input[i]
		// This works because newlines are ASCII.
		if c == '\r' || c == '\n' {
			// Treat a \r\n sequence as a single newline.
			if c == '\r' && (i+1) < len(f.input) && f.input[i+1] == '\n' {
				i++
			}
			f.location.Line++
			f.location.Col = 1
		} else {
			f.location.Col++
		}
	}
	f.index = end
}

// Accept tries to match the regular expression at the current position
// of the input. On a match it advances the flexer and returns the whole
// match followed by any submatches; otherwise it returns nil.
func (f *Flexer) Accept(re *regexp.Regexp) []string {
	indexes := re.FindStringSubmatchIndex(f.input[f.index:])
	if len(indexes) < 2 {
		return nil
	}
	end := f.index + indexes[1]
	matches := []string{}
	for i := 1; i < len(indexes); i += 2 {
		// A submatch that did not participate in the match
		// has index -1; store it as an empty string.
		if indexes[i-1] < 0 || indexes[i] < 0 {
			matches = append(matches, "")
			continue
		}
		subStart, subEnd := indexes[i-1]+f.index, indexes[i]+f.index
		matches = append(matches, f.input[subStart:subEnd])
	}
	f.advanceTo(end)
	return matches
}

// Lexeme compiles the expression, anchored to the beginning of the
// input, and appends the resulting lexeme to the flexer.
func (f *Flexer) Lexeme(kind Kind, expr, context string, act Action) error {
	re, err := regexp.Compile(`\A` + expr)
	if err != nil {
		return err
	}
	lexeme := Lexeme{kind, re, context, act}
	f.lexemes = append(f.lexemes, lexeme)
	return nil
}

// EscapedStringLexeme registers the lexemes for a string literal
// delimited by first and last, in which escape sequences are resolved.
func (f *Flexer) EscapedStringLexeme(kind Kind, first, last, context string) {
	f.Lexeme(SkipKind, first, "", ContextAction(context))
	f.Lexeme(kind, last, context, PopAction(kind))
	f.Lexeme(SkipKind, `\\[etnru][0-9a-f]*`, context, EscapeAction(last[0]))
	f.Lexeme(SkipKind, `.`, context, StoreAction())
}

// RawStringLexeme registers the lexemes for a raw string literal
// delimited by first and last, without escape sequences.
func (f *Flexer) RawStringLexeme(kind Kind, first, last, context string) {
	f.Lexeme(SkipKind, first, "", ContextAction(context))
	f.Lexeme(kind, last, context, PopAction(kind))
	f.Lexeme(SkipKind, `.`, context, StoreAction())
}

func (f *Flexer) PushContext(context string) {
	f.contexts = append(f.contexts, context)
}

func (f *Flexer) Context() string {
	context := ""
	clen := len(f.contexts)
	if clen > 0 {
		context = f.contexts[clen-1]
	}
	return context
}

func (f *Flexer) PopContext() {
	clen := len(f.contexts)
	if clen > 0 {
		f.contexts = f.contexts[0 : clen-1]
	}
}

func (f *Flexer) Builder() *strings.Builder {
	return &f.builder
}
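// A minimal sketch of how these helpers could be wired together,
// assuming a hypothetical StringKind constant for illustration:
//
//	func setupStrings(f *Flexer) {
//		// "..." strings with escape sequences, lexed in a "string" context.
//		f.EscapedStringLexeme(StringKind, `"`, `"`, "string")
//		// `...` raw strings without escape sequences, in a "raw" context.
//		f.RawStringLexeme(StringKind, "`", "`", "raw")
//	}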
// LexOnce runs the lexer once.
// It returns nil if no more progress can be made.
func (f *Flexer) LexOnce() []Token {
	for _, lexeme := range f.lexemes {
		if lexeme.Context != f.Context() {
			continue
		}
		tokens := lexeme.Try(f)
		if tokens != nil {
			return tokens
		}
	}
	return nil
}

func (f Flexer) Location() Location {
	return f.location
}

func (f Flexer) EOF() bool {
	return f.index >= len(f.input)
}

func NewFlexer(name, text string) *Flexer {
	res := &Flexer{}
	res.location.Line = 1
	res.location.Col = 1
	res.location.Name = &name
	res.input = text
	return res
}

// KeepToken returns true if the token should be kept,
// and false if its kind is in the skip list.
func KeepToken(tok Token, skips ...Kind) bool {
	for _, skip := range skips {
		if skip == tok.Kind() {
			return false
		}
	}
	return true
}

// LexAll lexes tokens from the lexer until it reaches EOF,
// or until it cannot progress anymore.
// All tokens with a kind in the skips list are left out of
// the results. If the lexer reaches the end of the input,
// a token with kind EndKind is appended.
func LexAll(lex Lexer, skips ...Kind) []Token {
	res := []Token{}
	for !lex.EOF() {
		toks := lex.LexOnce()
		if toks == nil {
			err := lex.MakeToken(ErrorKind,
				"Lexer error: no lexeme matches. Context: %s.", lex.Context())
			res = append(res, err)
			return res
		}
		for _, tok := range toks {
			if KeepToken(tok, skips...) {
				res = append(res, tok)
			}
		}
	}
	// Here we reached the end of the input.
	res = append(res, lex.MakeToken(EndKind, ""))
	return res
}
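// The sketch below shows one way to drive the flexer end to end; it is
// illustrative only, and WordKind is a hypothetical kind constant:
//
//	func lexWords(input string) []Token {
//		lexer := NewFlexer("example", input)
//		lexer.Lexeme(WordKind, `[a-zA-Z]+`, "", nil)
//		lexer.Lexeme(SkipKind, `[ \t\r\n]+`, "", nil)
//		// SkipKind tokens are filtered out; an EndKind token marks EOF.
//		return LexAll(lexer, SkipKind)
//	}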