
Flexible lexer based on regular expressions. Seems to work OK in test.

Beoran committed 2 years ago
parent commit ab01c0613d

2 changed files with 388 additions and 0 deletions:
  1. flexer/flexer.go      +327 -0
  2. flexer/flexer_test.go  +61 -0

flexer/flexer.go  +327 -0

@@ -0,0 +1,327 @@
+package flexer
+
+import "fmt"
+import "regexp"
+import "strings"
+import "strconv"
+
+/* Flexer is a flexible regexp- and rule-based
+lexer that can be used as an implementation for
+generated code.
+*/
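+
+/* A minimal usage sketch; the kind and the patterns here are
+hypothetical examples, only the flexer API itself is as defined below:
+
+	lex := NewFlexer("example", "foo bar")
+	lex.Rule(Kind(1), `[a-z]+`, "", nil)  // words become tokens
+	lex.Rule(SkipKind, `[ \t]+`, "", nil) // whitespace is skipped by LexAll
+	toks := LexAll(lex)
+*/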
+
+type Position struct {
+	Name *string
+	Line int
+	Col  int
+}
+
+type Kind int
+
+const (
+	SkipKind  Kind = -30000
+	ErrorKind Kind = -31000
+)
+
+type Token interface {
+	Position() Position
+	Kind() Kind
+	Text() string
+}
+
+type Lexer interface {
+	// Accept tries to match the regexp at the current position.
+	// On a match it advances past it and returns the matches;
+	// it returns nil if there was no match.
+	Accept(re *regexp.Regexp) []string
+	// Position returns the current lexer position.
+	Position() Position
+	// EOF returns whether the lexer is at the end of the input.
+	EOF() bool
+
+	// MakeToken creates a token with the current lexer position and
+	// the given kind and formatted text.
+	MakeToken(kind Kind, form string, args ...interface{}) Token
+
+	// MakeBuilderToken creates a token with the current lexer position
+	// and the given kind. The text is taken from the lexer's string
+	// builder, and that builder is then reset.
+	MakeBuilderToken(kind Kind) Token
+
+	// Builder returns the lexer's string builder, to which strings
+	// or runes can be appended while a token is being accumulated.
+	Builder() *strings.Builder
+
+	// Rule adds a rule to the lexer.
+	Rule(kind Kind, re, context string, act Action) error
+	// LexOnce runs the lexer once and returns the resulting tokens,
+	// or nil if no rule matched.
+	LexOnce() []Token
+
+	// Context returns the current lexer context.
+	Context() string
+	// PushContext pushes the named context on the lexer context stack.
+	PushContext(name string)
+	// PopContext pops the current context from the lexer context stack.
+	PopContext()
+}
+
+type Action func(f Lexer, k Kind, matches ...string) []Token
+
+type BasicToken struct {
+	position Position
+	kind     Kind
+	text     string
+}
+
+func (bt BasicToken) Kind() Kind {
+	return bt.kind
+}
+
+func (bt BasicToken) Position() Position {
+	return bt.position
+}
+
+func (bt BasicToken) Text() string {
+	return bt.text
+}
+
+func MakeToken(position Position, kind Kind, form string,
+	args ...interface{}) BasicToken {
+	text := fmt.Sprintf(form, args...)
+	return BasicToken{position, kind, text}
+}
+
+type ErrorToken struct {
+	BasicToken
+}
+
+// A Rule for Flexer is based on a regular expression.
+// While the rule may have submatches, the lexer will consume
+// the whole match if it matches at the beginning of the current input.
+type Rule struct {
+	Kind
+	*regexp.Regexp
+	Context string
+	Action
+}
+
+// DefaultAction is the default action on a match.
+// If there is only one match, then that is the token;
+// otherwise all submatches, excluding the first
+// whole-string match, become tokens.
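+// For example (illustrative patterns, not built-in rules): a rule
+// `[a-z]+` matching "abc" yields the single token "abc", while
+// `([a-z]+)=([0-9]+)` matching "a=1" yields the two tokens "a" and "1".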
+func DefaultAction(lex Lexer, k Kind, matches ...string) []Token {
+	if len(matches) == 1 {
+		tok := lex.MakeToken(k, matches[0])
+		return []Token{tok}
+	}
+	res := []Token{}
+	for i := 1; i < len(matches); i++ {
+		tok := lex.MakeToken(k, matches[i])
+		res = append(res, tok)
+	}
+	return res
+}
+
+// ContextAction returns an action that produces no tokens,
+// but pushes the given context on the lexer context stack
+// and empties the builder.
+func ContextAction(context string) Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		lex.PushContext(context)
+		lex.Builder().Reset()
+		return []Token{}
+	}
+}
+
+// PopAction returns an action that pops the current context and
+// returns the token accumulated in the builder with the given kind.
+func PopAction(kind Kind) Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		lex.PopContext()
+		tok := lex.MakeBuilderToken(kind)
+		return []Token{tok}
+	}
+}
+
+// StoreAction returns an action that appends the matches to the
+// lexer's builder and produces no tokens.
+func StoreAction() Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		for _, m := range matches {
+			lex.Builder().WriteString(m)
+		}
+		return []Token{}
+	}
+}
+
+// EscapeAction returns an action that unescapes the matched escape
+// sequence with strconv.UnquoteChar and appends the result to the
+// lexer's builder.
+func EscapeAction(quote byte) Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		r, _, tail, err := strconv.UnquoteChar(matches[0], quote)
+		if err != nil {
+			et := lex.MakeToken(ErrorKind, "%s", err)
+			return []Token{et}
+		}
+		lex.Builder().WriteRune(r)
+		// Append any characters the escape did not consume verbatim.
+		lex.Builder().WriteString(tail)
+		return []Token{}
+	}
+}
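+
+// Together these actions can lex quoted strings, as the tests do.
+// A sketch, where stringKind stands for a user-defined token kind:
+//
+//	f.Rule(SkipKind, `"`, "", ContextAction("string"))
+//	f.Rule(stringKind, `"`, "string", PopAction(stringKind))
+//	f.Rule(SkipKind, `\\[etnru][0-9a-f]*`, "string", EscapeAction('"'))
+//	f.Rule(SkipKind, `.`, "string", StoreAction())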
+
+// Try tries to apply a rule.
+// Returns nil on no match.
+func (r Rule) Try(lex Lexer) []Token {
+	matches := lex.Accept(r.Regexp)
+	if len(matches) == 0 {
+		return nil
+	}
+	if r.Action != nil {
+		return r.Action(lex, r.Kind, matches...)
+	}
+	// No action, use default action
+	return DefaultAction(lex, r.Kind, matches...)
+}
+
+type Flexer struct {
+	index    int
+	position Position
+	rules    []Rule
+	input    string
+	name     string
+	contexts []string
+	builder  strings.Builder
+}
+
+func (f *Flexer) MakeToken(kind Kind, form string, args ...interface{}) Token {
+	return MakeToken(f.position, kind, form, args...)
+}
+
+func (f *Flexer) MakeBuilderToken(kind Kind) Token {
+	text := f.builder.String()
+	f.builder.Reset()
+	return f.MakeToken(kind, text)
+}
+
+// advanceTo advances the flexer to the given index,
+// updating the line and column of the position.
+func (f *Flexer) advanceTo(index int) {
+	start := f.index
+	end := index
+	for i := start; i < end; i++ {
+		c := f.input[i] // This works because newlines are ascii.
+		if c == '\r' || c == '\n' {
+			if c == '\r' && (i+1) < len(f.input) {
+				if f.input[i+1] == '\n' {
+					i++
+				}
+			}
+			f.position.Line++
+			f.position.Col = 1
+		} else {
+			f.position.Col++
+		}
+	}
+	f.index = end
+}
+
+func (f *Flexer) Accept(re *regexp.Regexp) []string {
+	indexes := re.FindStringSubmatchIndex(f.input[f.index:])
+	if len(indexes) < 2 {
+		return nil
+	}
+	end := f.index + indexes[1]
+	// matches[0] is the whole match, matches[1:] are the submatches.
+	matches := []string{}
+	for i := 1; i < len(indexes); i += 2 {
+		// A submatch that did not participate has index -1.
+		if indexes[i-1] < 0 {
+			matches = append(matches, "")
+			continue
+		}
+		subStart, subEnd := indexes[i-1]+f.index, indexes[i]+f.index
+		sub := f.input[subStart:subEnd]
+		matches = append(matches, sub)
+	}
+	f.advanceTo(end)
+	return matches
+}
+
+// Rule adds a rule to the lexer. The expression is anchored with \A
+// so that it only matches at the start of the remaining input.
+func (f *Flexer) Rule(kind Kind, expr, context string, act Action) error {
+	re, err := regexp.Compile(`\A` + expr)
+	if err != nil {
+		return err
+	}
+	rule := Rule{kind, re, context, act}
+	f.rules = append(f.rules, rule)
+	return nil
+}
+
+func (f *Flexer) PushContext(context string) {
+	f.contexts = append(f.contexts, context)
+}
+
+func (f *Flexer) Context() string {
+	context := ""
+	clen := len(f.contexts)
+	if clen > 0 {
+		context = f.contexts[clen-1]
+	}
+	return context
+}
+
+func (f *Flexer) PopContext() {
+	clen := len(f.contexts)
+	if clen > 0 {
+		f.contexts = f.contexts[0 : clen-1]
+	}
+}
+
+func (f *Flexer) Builder() *strings.Builder {
+	return &f.builder
+}
+
+// LexOnce runs the lexer once, trying the rules for the current
+// context in order. It returns nil if no rule matches.
+func (f *Flexer) LexOnce() []Token {
+	for _, rule := range f.rules {
+		if rule.Context != f.Context() {
+			continue
+		}
+		tokens := rule.Try(f)
+		if tokens != nil {
+			return tokens
+		}
+	}
+	return nil
+}
+
+func (f *Flexer) Position() Position {
+	return f.position
+}
+
+func (f *Flexer) EOF() bool {
+	return f.index >= len(f.input)
+}
+
+func NewFlexer(name, text string) *Flexer {
+	res := &Flexer{}
+	res.position.Line = 1
+	res.position.Col = 1
+	res.position.Name = &name
+	res.input = text
+	return res
+}
+
+// Lexes all tokens from the lexer until it reaches
+// EOF, or until it cannot progress anymore.
+// All tokens of kind SkipKind will be skipped
+// from the results.
+func LexAll(lex Lexer) []Token {
+	res := []Token{}
+	for !lex.EOF() {
+		toks := lex.LexOnce()
+		if toks == nil {
+			err := lex.MakeToken(ErrorKind, "Lexer error: no rule matches. Context: %s.", lex.Context())
+			res = append(res, err)
+			return res
+		}
+		// Drop tokens of kind SkipKind from the results.
+		for _, tok := range toks {
+			if tok.Kind() != SkipKind {
+				res = append(res, tok)
+			}
+		}
+	}
+	return res
+}

flexer/flexer_test.go  +61 -0

@@ -0,0 +1,61 @@
+package flexer
+
+import "testing"
+
+const (
+	tWord = Kind(-1 - iota)
+	tArrow
+	tSpace
+	tString
+	tPlus = Kind('+')
+	tEos  = Kind('.')
+)
+
+func TestFlexer(t *testing.T) {
+	pos := Position{}
+	expected := []Token{
+		MakeToken(pos, tSpace, "\t "),
+		MakeToken(pos, tWord, "PROGRAM"),
+		MakeToken(pos, tSpace, "  "),
+		MakeToken(pos, tArrow, "->"),
+		MakeToken(pos, tSpace, "  "),
+		MakeToken(pos, tWord, "STATEMENT"),
+		MakeToken(pos, tPlus, "+"),
+		MakeToken(pos, tSpace, " "),
+		MakeToken(pos, tEos, ".\n"),
+		MakeToken(pos, tWord, "say"),
+		MakeToken(pos, tSpace, " "),
+		MakeToken(pos, tString, "hello\nworld"),
+		MakeToken(pos, tEos, "."),
+	}
+	f := NewFlexer(`test`, "\t PROGRAM  ->  STATEMENT+ .\nsay \"hello\\nworld\".")
+	f.Rule(tSpace, `[ \t]+`, "", nil)
+	f.Rule(tWord, `[A-Za-z_]+`, "", nil)
+	f.Rule(tArrow, `\->`, "", nil)
+	f.Rule(tPlus, `\+`, "", nil)
+	f.Rule(tEos, `\.[\n\r]*`, "", nil)
+	f.Rule(SkipKind, `"`, "", ContextAction("string"))
+	f.Rule(tString, `"`, "string", PopAction(tString))
+	f.Rule(SkipKind, `\\[etnru][0-9a-f]*`, "string", EscapeAction('"'))
+	f.Rule(SkipKind, `.`, "string", StoreAction())
+
+	toks := LexAll(f)
+	if len(toks) != len(expected) {
+		t.Fatalf("error: expected %d tokens, got %d", len(expected), len(toks))
+	}
+
+	for i, e := range expected {
+		tok := toks[i]
+		t.Logf("toks: %d, %v", i, tok)
+		ko := tok.Kind()
+		ke := e.Kind()
+		if ko != ke {
+			t.Errorf("error: kind:%d|%d|", ko, ke)
+		}
+		to := tok.Text()
+		te := e.Text()
+		if to != te {
+			t.Errorf("error: text:%s|%s|", to, te)
+		}
+	}
+	if !f.EOF() {
+		t.Errorf("error: should be EOF")
+	}
+}