
Flexible lexer based on regular expressions. Seems to work OK in test.

Beoran committed 2 years ago
parent commit ab01c0613d

2 changed files with 388 additions and 0 deletions:
  1. flexer/flexer.go      +327 -0
  2. flexer/flexer_test.go  +61 -0

flexer/flexer.go  +327 -0

@@ -0,0 +1,327 @@
+package flexer
+
+import "fmt"
+import "regexp"
+import "strings"
+import "strconv"
+
+/* Flexer is a flexible regexp- and rule-based
+lexer that can be used as an implementation for
+generated code.
+*/
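+
+/* A minimal usage sketch; the kind and the patterns here are
+hypothetical examples, only the flexer API itself is as defined below:
+
+	lex := NewFlexer("example", "foo bar")
+	lex.Rule(Kind(1), `[a-z]+`, "", nil)  // words become tokens
+	lex.Rule(SkipKind, `[ \t]+`, "", nil) // whitespace is skipped by LexAll
+	toks := LexAll(lex)
+*/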
+
+type Position struct {
+	Name *string
+	Line int
+	Col  int
+}
+
+type Kind int
+
+const (
+	SkipKind  Kind = -30000
+	ErrorKind Kind = -31000
+)
+
+type Token interface {
+	Position() Position
+	Kind() Kind
+	Text() string
+}
+
+type Lexer interface {
+	// Accept tries to match the regexp at the current position.
+	// On a match it advances past it and returns the matches;
+	// it returns nil if there was no match.
+	Accept(re *regexp.Regexp) []string
+	// Position returns the current lexer position.
+	Position() Position
+	// EOF returns whether the lexer is at the end of the input.
+	EOF() bool
+
+	// MakeToken creates a token with the current lexer position and
+	// the given kind and formatted text.
+	MakeToken(kind Kind, form string, args ...interface{}) Token
+
+	// MakeBuilderToken creates a token with the current lexer position
+	// and the given kind. The text is taken from the lexer's string
+	// builder, and that builder is then reset.
+	MakeBuilderToken(kind Kind) Token
+
+	// Builder returns the lexer's string builder, to which strings
+	// or runes can be appended while a token is being accumulated.
+	Builder() *strings.Builder
+
+	// Rule adds a rule to the lexer.
+	Rule(kind Kind, re, context string, act Action) error
+	// LexOnce runs the lexer once and returns the resulting tokens,
+	// or nil if no rule matched.
+	LexOnce() []Token
+
+	// Context returns the current lexer context.
+	Context() string
+	// PushContext pushes the named context on the lexer context stack.
+	PushContext(name string)
+	// PopContext pops the current context from the lexer context stack.
+	PopContext()
+}
+
+type Action func(f Lexer, k Kind, matches ...string) []Token
+
+type BasicToken struct {
+	position Position
+	kind     Kind
+	text     string
+}
+
+func (bt BasicToken) Kind() Kind {
+	return bt.kind
+}
+
+func (bt BasicToken) Position() Position {
+	return bt.position
+}
+
+func (bt BasicToken) Text() string {
+	return bt.text
+}
+
+func MakeToken(position Position, kind Kind, form string,
+	args ...interface{}) BasicToken {
+	text := fmt.Sprintf(form, args...)
+	return BasicToken{position, kind, text}
+}
+
+type ErrorToken struct {
+	BasicToken
+}
+
+// A Rule for Flexer is based on a regular expression.
+// While the rule may have submatches, the lexer will consume
+// the whole match if it matches at the beginning of the current input.
+type Rule struct {
+	Kind
+	*regexp.Regexp
+	Context string
+	Action
+}
+
+// DefaultAction is the default action on a match.
+// If there is only one match, then that is the token;
+// otherwise all submatches, excluding the first
+// whole-string match, become tokens.
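+// For example (illustrative patterns, not built-in rules): a rule
+// `[a-z]+` matching "abc" yields the single token "abc", while
+// `([a-z]+)=([0-9]+)` matching "a=1" yields the two tokens "a" and "1".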
+func DefaultAction(lex Lexer, k Kind, matches ...string) []Token {
+	if len(matches) == 1 {
+		tok := lex.MakeToken(k, matches[0])
+		return []Token{tok}
+	}
+	res := []Token{}
+	for i := 1; i < len(matches); i++ {
+		tok := lex.MakeToken(k, matches[i])
+		res = append(res, tok)
+	}
+	return res
+}
+
+// ContextAction returns an action that produces no tokens,
+// but pushes the given context on the lexer context stack
+// and empties the builder.
+func ContextAction(context string) Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		lex.PushContext(context)
+		lex.Builder().Reset()
+		return []Token{}
+	}
+}
+
+// PopAction returns an action that pops the current context and
+// returns the token accumulated in the builder with the given kind.
+func PopAction(kind Kind) Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		lex.PopContext()
+		tok := lex.MakeBuilderToken(kind)
+		return []Token{tok}
+	}
+}
+
+// StoreAction returns an action that appends the matches to the
+// lexer's builder and produces no tokens.
+func StoreAction() Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		for _, m := range matches {
+			lex.Builder().WriteString(m)
+		}
+		return []Token{}
+	}
+}
+
+// EscapeAction returns an action that unescapes the matched escape
+// sequence with strconv.UnquoteChar and appends the result to the
+// lexer's builder.
+func EscapeAction(quote byte) Action {
+	return func(lex Lexer, k Kind, matches ...string) []Token {
+		r, _, tail, err := strconv.UnquoteChar(matches[0], quote)
+		if err != nil {
+			et := lex.MakeToken(ErrorKind, "%s", err)
+			return []Token{et}
+		}
+		lex.Builder().WriteRune(r)
+		// Append any characters the escape did not consume verbatim.
+		lex.Builder().WriteString(tail)
+		return []Token{}
+	}
+}
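+
+// Together these actions can lex quoted strings, as the tests do.
+// A sketch, where stringKind stands for a user-defined token kind:
+//
+//	f.Rule(SkipKind, `"`, "", ContextAction("string"))
+//	f.Rule(stringKind, `"`, "string", PopAction(stringKind))
+//	f.Rule(SkipKind, `\\[etnru][0-9a-f]*`, "string", EscapeAction('"'))
+//	f.Rule(SkipKind, `.`, "string", StoreAction())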
+
+// Try tries to apply a rule.
+// Returns nil on no match.
+func (r Rule) Try(lex Lexer) []Token {
+	matches := lex.Accept(r.Regexp)
+	if len(matches) == 0 {
+		return nil
+	}
+	if r.Action != nil {
+		return r.Action(lex, r.Kind, matches...)
+	}
+	// No action, use default action
+	return DefaultAction(lex, r.Kind, matches...)
+}
+
+type Flexer struct {
+	index    int
+	position Position
+	rules    []Rule
+	input    string
+	name     string
+	contexts []string
+	builder  strings.Builder
+}
+
+func (f *Flexer) MakeToken(kind Kind, form string, args ...interface{}) Token {
+	return MakeToken(f.position, kind, form, args...)
+}
+
+func (f *Flexer) MakeBuilderToken(kind Kind) Token {
+	text := f.builder.String()
+	f.builder.Reset()
+	return f.MakeToken(kind, text)
+}
+
+// advanceTo advances the flexer to the given index,
+// updating the line and column of the position.
+func (f *Flexer) advanceTo(index int) {
+	start := f.index
+	end := index
+	for i := start; i < end; i++ {
+		c := f.input[i] // This works because newlines are ascii.
+		if c == '\r' || c == '\n' {
+			if c == '\r' && (i+1) < len(f.input) {
+				if f.input[i+1] == '\n' {
+					i++
+				}
+			}
+			f.position.Line++
+			f.position.Col = 1
+		} else {
+			f.position.Col++
+		}
+	}
+	f.index = end
+}
+
+func (f *Flexer) Accept(re *regexp.Regexp) []string {
+	indexes := re.FindStringSubmatchIndex(f.input[f.index:])
+	if len(indexes) < 2 {
+		return nil
+	}
+	end := f.index + indexes[1]
+	// matches[0] is the whole match, matches[1:] are the submatches.
+	matches := []string{}
+	for i := 1; i < len(indexes); i += 2 {
+		// A submatch that did not participate has index -1.
+		if indexes[i-1] < 0 {
+			matches = append(matches, "")
+			continue
+		}
+		subStart, subEnd := indexes[i-1]+f.index, indexes[i]+f.index
+		sub := f.input[subStart:subEnd]
+		matches = append(matches, sub)
+	}
+	f.advanceTo(end)
+	return matches
+}
+
+// Rule adds a rule to the lexer. The expression is anchored with \A
+// so that it only matches at the start of the remaining input.
+func (f *Flexer) Rule(kind Kind, expr, context string, act Action) error {
+	re, err := regexp.Compile(`\A` + expr)
+	if err != nil {
+		return err
+	}
+	rule := Rule{kind, re, context, act}
+	f.rules = append(f.rules, rule)
+	return nil
+}
+
+func (f *Flexer) PushContext(context string) {
+	f.contexts = append(f.contexts, context)
+}
+
+func (f *Flexer) Context() string {
+	context := ""
+	clen := len(f.contexts)
+	if clen > 0 {
+		context = f.contexts[clen-1]
+	}
+	return context
+}
+
+func (f *Flexer) PopContext() {
+	clen := len(f.contexts)
+	if clen > 0 {
+		f.contexts = f.contexts[0 : clen-1]
+	}
+}
+
+func (f *Flexer) Builder() *strings.Builder {
+	return &f.builder
+}
+
+// LexOnce runs the lexer once, trying the rules for the current
+// context in order. It returns nil if no rule matches.
+func (f *Flexer) LexOnce() []Token {
+	for _, rule := range f.rules {
+		if rule.Context != f.Context() {
+			continue
+		}
+		tokens := rule.Try(f)
+		if tokens != nil {
+			return tokens
+		}
+	}
+	return nil
+}
+
+func (f *Flexer) Position() Position {
+	return f.position
+}
+
+func (f *Flexer) EOF() bool {
+	return f.index >= len(f.input)
+}
+
+func NewFlexer(name, text string) *Flexer {
+	res := &Flexer{}
+	res.position.Line = 1
+	res.position.Col = 1
+	res.position.Name = &name
+	res.input = text
+	return res
+}
+
+// Lexes all tokens from the lexer until it reaches
+// EOF, or until it cannot progress anymore.
+// All tokens of kind SkipKind will be skipped
+// from the results.
+func LexAll(lex Lexer) []Token {
+	res := []Token{}
+	for !lex.EOF() {
+		toks := lex.LexOnce()
+		if toks == nil {
+			err := lex.MakeToken(ErrorKind, "Lexer error: no rule matches. Context: %s.", lex.Context())
+			res = append(res, err)
+			return res
+		}
+		// Drop tokens of kind SkipKind from the results.
+		for _, tok := range toks {
+			if tok.Kind() != SkipKind {
+				res = append(res, tok)
+			}
+		}
+	}
+	return res
+}

flexer/flexer_test.go  +61 -0

@@ -0,0 +1,61 @@
+package flexer
+
+import "testing"
+
+const (
+	tWord = Kind(-1 - iota)
+	tArrow
+	tSpace
+	tString
+	tPlus = Kind('+')
+	tEos  = Kind('.')
+)
+
+func TestFlexer(t *testing.T) {
+	pos := Position{}
+	expected := []Token{
+		MakeToken(pos, tSpace, "\t "),
+		MakeToken(pos, tWord, "PROGRAM"),
+		MakeToken(pos, tSpace, "  "),
+		MakeToken(pos, tArrow, "->"),
+		MakeToken(pos, tSpace, "  "),
+		MakeToken(pos, tWord, "STATEMENT"),
+		MakeToken(pos, tPlus, "+"),
+		MakeToken(pos, tSpace, " "),
+		MakeToken(pos, tEos, ".\n"),
+		MakeToken(pos, tWord, "say"),
+		MakeToken(pos, tSpace, " "),
+		MakeToken(pos, tString, "hello\nworld"),
+		MakeToken(pos, tEos, "."),
+	}
+	f := NewFlexer(`test`, "\t PROGRAM  ->  STATEMENT+ .\nsay \"hello\\nworld\".")
+	f.Rule(tSpace, `[ \t]+`, "", nil)
+	f.Rule(tWord, `[A-Za-z_]+`, "", nil)
+	f.Rule(tArrow, `\->`, "", nil)
+	f.Rule(tPlus, `\+`, "", nil)
+	f.Rule(tEos, `\.[\n\r]*`, "", nil)
+	f.Rule(SkipKind, `"`, "", ContextAction("string"))
+	f.Rule(tString, `"`, "string", PopAction(tString))
+	f.Rule(SkipKind, `\\[etnru][0-9a-f]*`, "string", EscapeAction('"'))
+	f.Rule(SkipKind, `.`, "string", StoreAction())
+
+	toks := LexAll(f)
+	if len(toks) != len(expected) {
+		t.Fatalf("error: expected %d tokens, got %d", len(expected), len(toks))
+	}
+
+	for i, e := range expected {
+		tok := toks[i]
+		t.Logf("toks: %d, %v", i, tok)
+		ko := tok.Kind()
+		ke := e.Kind()
+		if ko != ke {
+			t.Errorf("error: kind:%d|%d|", ko, ke)
+		}
+		to := tok.Text()
+		te := e.Text()
+		if to != te {
+			t.Errorf("error: text:%s|%s|", to, te)
+		}
+	}
+	if !f.EOF() {
+		t.Errorf("error: should be EOF")
+	}
+}