
Try a fresh start and work on the lexer.

Beoran 2 years ago
parent
commit
7787c932de
13 changed files with 569 additions and 0 deletions
  1. go.mod (+3 -0)
  2. lexer.go (+566 -0)
  3. old/generate.go (+0 -0)
  4. old/grammar.go (+0 -0)
  5. old/ll1.debug.tpl (+0 -0)
  6. old/ll1.dot.tpl (+0 -0)
  7. old/ll1.go (+0 -0)
  8. old/ll1.parser.go.lined.tpl (+0 -0)
  9. old/ll1.parser.go.tpl (+0 -0)
  10. old/ll1_parser.go (+0 -0)
  11. old/main.go (+0 -0)
  12. old/parser.go (+0 -0)
  13. old/template_functions.go (+0 -0)

+ 3 - 0
go.mod

@@ -0,0 +1,3 @@
+module src.eruta.nl/ll1
+
+go 1.16

+ 566 - 0
lexer.go

@@ -0,0 +1,566 @@
+package main
+
+import (
+	"fmt"
+	"strings"
+	"unicode"
+)
+
+// Ucalgary syntax:
+//
+// Grammar Structure
+//
+// The format for a context free grammar is as follows:
+//
+// Any reference to "punctuation" when describing legal identifiers is restricted
+// to certain pieces of punctuation. In no particular order, here are the
+// punctuation characters that are legal: ~ ! @ # * ( ) _ + ' ; : / ?
+//
+// Terminals are strings which either start with a lowercase letter, a number,
+// or a piece of punctuation, followed by any amount of letters, numbers and
+// punctuation. i.e. a terminal matches the regular expression
+// [{punctuation}a-z0-9][{punctuation}a-zA-Z0-9]*. Examples are id + begin end
+// plus 9 8' 7+? '' ....
+//
+// Nonterminals start with uppercase letters, followed by any number of legal
+// characters. i.e. a nonterminal matches the regular expression
+// [A-Z][{punctuation}a-zA-Z0-9]*. Examples are S A EXPR TERM~3 Stop ....
+//
+// It is assumed that the start symbol is the first nonterminal whose productions
+// are given.
+//
+// The productions associated with a nonterminal are indicated as
+// head -> RHS1 | RHS2 | ... | RHSn.
+// where the alternative right-hand sides of the productions are separated by
+// a vertical bar, "|", and terminated by a period, ".".
+//
+// The RHS of a production is a sequence of terminals or nonterminals separated
+// by spaces. For example: Expr add Expr.
+//
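+// For illustration, a small grammar in this format might look like the
+// following sketch (the symbol names here are made up):
+//
+// Expr -> Term ExprRest .
+// ExprRest -> plus Term ExprRest | epsilon .
+// Term -> id | number .
+//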
+// C-style comments are allowed, e.g.
+// /* this is a comment     possibly spanning many lines */
+//
+// Empty grammars
+// The grammar epsilon or ε is empty.
+// For any nonterminal, epsilon is a valid grammar.
+// This represents a grammar with no productions.
+//
+// In contrast to the syntax above, ll1 only allows the _ punctuation
+// character in identifiers.
+//
+
+// Location is the location of the lexer in its input, or of a token.
+type Location struct {
+	// Name is the name of the file or input.
+	Name string
+	// Index is the current index in the input buffer of the lexer,
+	// or of the token.
+	Index int
+	// Start is the index in the input buffer at which the token begins.
+	Start int
+	// Line is the line the lexer is at, or on which a token begins.
+	Line int
+	// Col is the column in the line on which the token begins.
+	Col int
+}
+
+func (l Location) String() string {
+	return fmt.Sprintf("%s:%d:%d", l.Name, l.Line, l.Col)
+}
+
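+// TokenKind indicates the kind of a lexed token.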
+type TokenKind rune
+
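+// TokenKindSkip marks a token that the lexer should discard rather than
+// include in its output.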
+const TokenKindSkip = TokenKind(-2)
+
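+// Token is a single lexed token, with its kind and its location in the input.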
+type Token struct {
+	Location
+	TokenKind
+	Text string
+}
+
+// Advance moves the location to the next rune in input, updating the
+// location's Index, Line and Col.
+// Returns the rune found, or -1 if the end of the buffer has been reached.
+func (l *Location) Advance(input []rune) rune {
+	if l.Index >= len(input) {
+		return -1
+	}
+	r := input[l.Index]
+	// Treat CR, LF and CRLF each as a single line ending: an LF only
+	// counts as a new line if it does not directly follow a CR.
+	if (r == '\r') ||
+		(r == '\n' && (l.Index == 0 || input[l.Index-1] != '\r')) {
+		l.Line++
+	}
+	if (r == '\n') || (r == '\r') {
+		l.Col = 0
+	} else {
+		l.Col++
+	}
+	l.Index++
+	return r
+}
+
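+// Lexer lexes an input buffer of runes by applying its Rules, keeping track
+// of where it is in the embedded Location.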
+type Lexer struct {
+	Location
+	Input []rune
+	Rules []LexerFunc
+}
+
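+// LexerFuncs maps names to lexer functions.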
+type LexerFuncs map[string]LexerFunc
+
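+// Advance moves the lexer to the next rune in its input, updating its
+// Location, and returns that rune, or -1 at the end of the input.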
+func (l *Lexer) Advance() rune {
+	return l.Location.Advance(l.Input)
+}
+
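+// IsEof returns true if the lexer has reached the end of its input.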
+func (l Lexer) IsEof() bool {
+	return l.Index >= len(l.Input)
+}
+
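+// Peek returns the rune at the current position without advancing,
+// or -1 if the end of the input has been reached.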
+func (l Lexer) Peek() rune {
+	if l.IsEof() {
+		return -1
+	}
+	return l.Input[l.Location.Index]
+}
+
+// LexerFunc is a lexer function.
+// It lexes the input buffer starting from lex.Index, which the caller must
+// guarantee to be non negative.
+// A LexerFunc must make progress if it lexes a token or returns an error,
+// and must NOT make progress if it does not match.
+// It should return as follows:
+// * If the LexerFunc matched what it is intended to lex,
+//   it should return the lexed token(s) and nil errors; lex.Start points to
+//   the start of the token, and lex.Index must be moved to point right after
+//   the lexed part of the input. To indicate that the lexer should skip the
+//   lexed token, set its token kind to TokenKindSkip.
+// * If the LexerFunc did not match what it is intended to lex,
+//   it should return nil, nil, and lex.Index must be left unchanged.
+// * If the LexerFunc did match what it is intended to lex, but there is a
+//   lexing error, it should return an empty token slice and a slice of
+//   errors, and lex.Index should be set to the error's location.
+type LexerFunc func(lex *Lexer) ([]Token, []error)
+
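+// For illustration, a hypothetical rule that lexes and skips runs of
+// whitespace could follow the contract above like this (LexSpace is only a
+// sketch, not part of the lexer itself):
+//
+//	func LexSpace(lex *Lexer) ([]Token, []error) {
+//		for !lex.IsEof() && unicode.IsSpace(lex.Peek()) {
+//			lex.Advance()
+//		}
+//		if lex.Index == lex.Start {
+//			return nil, nil // no match: no progress, no tokens, no errors
+//		}
+//		// mark the lexed whitespace as skippable so Lex drops it
+//		return lex.Tokenize(TokenKindSkip)
+//	}
+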
+var Debug = false
+
+func debug(msg string) {
+	if Debug {
+		print(msg)
+	}
+}
+
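+// WhileOk lexes runes from the current position for as long as ok returns
+// true, returning the text lexed up to and including the first rune for
+// which ok returned false, the empty string if ok fails on the first rune,
+// or an error if the end of the input is reached first.
+// WhileOk has a value receiver, so the caller's lexer state is not changed.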
+func (l Lexer) WhileOk(ok func(r rune, l Lexer) bool) (string, error) {
+	l.Start = l.Index
+	now := 0
+	for !l.IsEof() {
+		r := l.Advance()
+		if !ok(r, l) {
+			if now == 0 {
+				return "", nil
+			}
+			return string(l.Input[l.Start:l.Index]), nil
+		}
+		now++
+	}
+	return "", fmt.Errorf("unexpected EOF: >%s<", string(l.Input[l.Start:l.Index]))
+}
+
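+// NewLexerFromString returns a Lexer that will lex the given input string,
+// under the given name, using funcs as its rules.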
+func NewLexerFromString(input, name string, funcs []LexerFunc) *Lexer {
+	loc := Location{}
+	loc.Name = name
+	return &Lexer{loc, []rune(input), funcs}
+}
+
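+// Lex lexes the whole input into tokens using the lexer's rules.
+// A panic that carries an error value is recovered and reported as an error.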
+func (l *Lexer) Lex() (result []Token, rerr []error) {
+	defer func() {
+		val := recover()
+		err, ok := val.(error)
+		if ok {
+			rerr = append(rerr, err)
+		}
+	}()
+	return l.lex()
+}
+
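+// lex is the main lexer loop: it applies the rules to the remaining input,
+// collecting tokens and errors, until the input is exhausted or no rule
+// makes progress.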
+func (l *Lexer) lex() (result []Token, rerr []error) {
+	for !l.IsEof() {
+		progress := false
+		for _, lf := range l.Rules {
+			tokens, errs := lf(l)
+			if len(errs) > 0 {
+				rerr = append(rerr, errs...)
+				// skip until the next whitespace to resynchronize
+				for !l.IsEof() {
+					r2 := l.Advance()
+					if unicode.IsSpace(r2) {
+						break
+					}
+				}
+			}
+			for _, token := range tokens {
+				if token.TokenKind != TokenKindSkip {
+					result = append(result, token)
+				}
+			}
+			if l.Index != l.Start {
+				// this rule made progress: advance Start and try the
+				// rules again from the first on the remaining input
+				l.Start = l.Index
+				progress = true
+				break
+			}
+		}
+		// no rule made any progress, which indicates a fatal lex error
+		if !progress {
+			rerr = append(rerr, fmt.Errorf("Lex error: %s", l.Location))
+			return result, rerr
+		}
+	}
+	return result, rerr
+}
+
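+// Tokenize turns the input between l.Start and l.Index into a single token
+// of the given kind, or returns an error if the lexer has not advanced
+// since l.Start.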
+func (l *Lexer) Tokenize(kind TokenKind) (toks []Token, errs []error) {
+	if l.Index == l.Start {
+		err := fmt.Errorf("Lex error: %s, expected %c", l.Location, kind)
+		errs = append(errs, err)
+		return toks, errs
+	}
+	str := string(l.Input[l.Start:l.Index])
+	return []Token{Token{l.Location, kind, str}}, errs
+}
+
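+// LexContains lexes a run of runes that are all contained in chars into a
+// single token of the given kind. If the rune at the current position is
+// not in chars, it does not match and returns no tokens and no errors.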
+func (l *Lexer) LexContains(chars string, kind TokenKind) (toks []Token, errs []error) {
+	if !strings.ContainsRune(chars, l.Peek()) {
+		return toks, errs
+	}
+
+	for !l.IsEof() {
+		// re-peek on every iteration so the run stops at the first
+		// rune that is not in chars
+		if strings.ContainsRune(chars, l.Peek()) {
+			l.Advance()
+		} else {
+			return l.Tokenize(kind)
+		}
+	}
+	return l.Tokenize(kind)
+}
+
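+// For illustration, a hypothetical lexer built from these pieces, with one
+// rule for runs of digits and one that skips whitespace (the rule names and
+// the 'N' kind are made up for this sketch):
+//
+//	digits := func(lex *Lexer) ([]Token, []error) {
+//		return lex.LexContains("0123456789", TokenKind('N'))
+//	}
+//	spaces := func(lex *Lexer) ([]Token, []error) {
+//		return lex.LexContains(" \t\r\n", TokenKindSkip)
+//	}
+//	lexer := NewLexerFromString("7 42 9", "example", []LexerFunc{digits, spaces})
+//	tokens, errs := lexer.Lex()
+//	// tokens should now hold one token per run of digits; errs stays empty.
+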
+/*
+func LexerRs(input []rune, index *int) (Token, error) {
+	debug("LexerRs")
+	SkipWs(input, index)
+	return LexerWhileRuneOk(input, index, func(r rune) bool {
+		return r == '\n' || r == '\r' || r == ';'
+	})
+}
+
+func LexerWs(input []rune, index *int) (Token, error) {
+	debug("LexerWs")
+	return LexerWhileRuneOk(input, index, func(r rune) bool {
+		return r == ' ' || r == '\t'
+	})
+}
+
+func LexerWsRs(input []rune, index *int) (Token, error) {
+	debug("LexerRs")
+	SkipWs(input, index)
+	return LexerWhileRuneOk(input, index, func(r rune) bool {
+		return r == '\n' || r == '\r' || r == ';' || r == ' ' || r == '\t'
+	})
+}
+
+func SkipWs(input []rune, index *int) {
+	LexerWs(input, index)
+}
+
+func SkipRs(input []rune, index *int) {
+	LexerRs(input, index)
+}
+
+func SkipWsRs(input []rune, index *int) {
+	LexerWsRs(input, index)
+}
+
+func LexerComment(input []rune, index *int) (Token, error) {
+	debug("LexerComment")
+	start := *index
+	if !RequireRune(input, index, '#') {
+		return nil, nil
+	}
+	for ; *index < len(input); *index++ {
+		r := input[*index]
+		if r == '\n' || r == '\r' {
+			end := *index
+			return Comment(string(input[start:end])), nil
+		}
+	}
+	return nil, ErrorFromString("unexpected EOF in comment")
+}
+
+func LexerStatement(input []rune, index *int) (Token, error) {
+	debug("LexerStatement")
+	SkipWs(input, index)
+	return LexerAlternative(input, index, LexerCommand, LexerBlock, LexerComment)
+}
+
+func LexerParameters(input []rune, index *int) (Token, error) {
+	debug("LexerParameters")
+	params := List{}
+	for {
+		sep, err := LexerWs(input, index)
+		if err != nil {
+			return nil, err
+		}
+		if sep == nil {
+			return params, nil
+		}
+		val, err := LexerParameter(input, index)
+		if err != nil {
+			return nil, err
+		}
+		if val == nil {
+			return params, nil
+		}
+		params = append(params, val)
+	}
+}
+
+func LexerParameter(input []rune, index *int) (Token, error) {
+	debug("LexerParameter")
+	funcs := []LexerFunc{LexerLiteral, LexerEvaluation, LexerBlock, LexerGetter}
+	return LexerAlternative(input, index, funcs...)
+}
+
+func LexerOrder(input []rune, index *int) (Token, error) {
+	debug("LexerOrder")
+	return LexerAlternative(input, index, LexerLiteral, LexerEvaluation)
+}
+
+func LexerCommand(input []rune, index *int) (Token, error) {
+	debug("LexerCommand")
+	order, err := LexerOrder(input, index)
+	if err != nil || order == nil {
+		return order, err
+	}
+	params, err := LexerParameters(input, index)
+	if err != nil {
+		return params, err
+	}
+	if params == nil {
+		params = List{}
+	}
+	return Command{order, params.(List)}, nil
+}
+
+// RequireRune requires a single rune to be present and skips it;
+// the rune itself is discarded.
+// Returns true if the rune was found, false if not.
+func RequireRune(input []rune, index *int, req rune) bool {
+	if input[*index] == req {
+		*index++
+		return true
+	}
+	return false
+}
+
+func LexerEvaluation(input []rune, index *int) (Token, error) {
+	debug("LexerEvaluation")
+	if !RequireRune(input, index, '[') {
+		return nil, nil
+	}
+	res, err := LexerCommand(input, index)
+	if err != nil {
+		return nil, err
+	}
+	if !RequireRune(input, index, ']') {
+		print(input[*index])
+		return nil, ErrorFromString("Expected end of evaluation ]")
+	}
+	if res != nil {
+		res = Evaluation{Command: res.(Command)}
+	}
+	return res, nil
+}
+
+func LexerBlock(input []rune, index *int) (Token, error) {
+	debug("LexerBlock")
+	if !RequireRune(input, index, '{') {
+		return nil, nil
+	}
+	res, err := LexerStatements(input, index)
+	if err != nil {
+		return nil, err
+	}
+	SkipWsRs(input, index)
+	if !RequireRune(input, index, '}') {
+		return nil, ErrorFromString("Expected end of block }")
+	}
+	return Block{Statements: res.(List)}, nil
+}
+
+func LexerGetter(input []rune, index *int) (Token, error) {
+	debug("LexerGetter")
+	if RequireRune(input, index, '$') {
+		if input[*index] == '$' { // recursively lex double getters
+			val, err := LexerGetter(input, index)
+			if err == nil { // Getter with a getter inside.
+				return Getter{val}, err
+			} else {
+				return nil, err
+			}
+		} else { // integer, string or getter name
+			key, err := LexerLiteral(input, index)
+			if key == nil {
+				return nil, ErrorFromString("Expected literal after getter $")
+			}
+			if err == nil {
+				return Getter{key}, nil
+			}
+			return nil, err
+		}
+	}
+	return nil, nil
+}
+
+func LexerLiteral(input []rune, index *int) (Token, error) {
+	debug("LexerLiteral")
+	return LexerAlternative(input, index, LexerWord, LexerString, LexerInteger,
+		LexerRawString)
+}
+
+func IsLetter(r rune) bool {
+	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r > rune(128)) ||
+		r == '_' || r == '/'
+}
+
+func IsNumber(r rune) bool {
+	return (r >= '0' && r <= '9')
+}
+
+func LexerWord(input []rune, index *int) (Token, error) {
+	debug("LexerWord")
+	// a word consists of an ASCII letter, a non-ASCII character, or an underscore,
+	// followed by any number of ASCII letters or digits, non-ASCII characters, or underscores
+	start := *index
+	r := input[*index]
+	if !IsLetter(r) {
+		return nil, nil
+	}
+	for *index++; *index < len(input); *index++ {
+		r := input[*index]
+		if !(IsLetter(r) || IsNumber(r)) {
+			return Word(string(input[start:*index])), nil
+		}
+	}
+	return nil, ErrorFromString("unexpected EOF in word")
+}
+
+func next(input []rune, index *int) {
+	*index++
+	if *index >= len(input) {
+		panic(ErrorFromString("Unexpected end of input."))
+	}
+}
+
+func LexerEscape(input []rune, index *int) (Token, error) {
+	res := ""
+	if input[*index] != '\\' {
+		return nil, nil
+	}
+	next(input, index)
+	switch input[*index] {
+	case 'a':
+		res += "\a"
+	case 'b':
+		res += "\b"
+	case 'e':
+		res += "\033"
+	case 'f':
+		res += "\f"
+	case 'n':
+		res += "\n"
+	case 'r':
+		res += "\r"
+	case 't':
+		res += "\t"
+	case '\\':
+		res += "\\"
+	case '"':
+		res += "\""
+	default:
+		return nil, ErrorFromString("Unknown escape sequence character")
+	}
+
+	return String(res), nil
+}
+
+func LexerString(input []rune, index *int) (Token, error) {
+	debug("LexerString")
+	res := ""
+	ch := input[*index]
+	if ch != '"' {
+		return nil, nil
+	}
+	*index++
+	for *index < len(input) {
+		ch = input[*index]
+		esc, err := LexerEscape(input, index)
+		if err != nil {
+			return nil, err
+		}
+		if esc != nil {
+			res += string(esc.(String))
+		} else if ch == '"' {
+			*index++
+			return String(res), nil
+		} else {
+			res += string(ch)
+		}
+		*index++
+	}
+	return nil, ErrorFromString("Unexpected end of input.")
+}
+
+func LexerRawString(input []rune, index *int) (Token, error) {
+	debug("LexerRawString")
+	res := ""
+	ch := input[*index]
+	if ch != '`' {
+		return nil, nil
+	}
+	*index++
+	for *index < len(input) {
+		ch = input[*index]
+		if ch == '`' {
+			*index++
+			return String(res), nil
+		} else {
+			res += string(ch)
+		}
+		*index++
+	}
+	return nil, ErrorFromString("Unexpected end of input.")
+}
+
+func LexerInteger(input []rune, index *int) (Token, error) {
+	debug("LexerInteger")
+	ch := input[*index]
+	neg := 1
+	res := 0
+	if ch == '-' {
+		neg = -1
+	} else if ch == '+' {
+		// do nothing, ignore + as an integer prefix
+	} else {
+		res = int(ch - '0')
+		if res < 0 || res > 9 { // Not a digit, no integer
+			return nil, nil
+		}
+	}
+	*index++
+	for *index < len(input) {
+		ch = input[*index]
+		ch -= '0'
+		if ch < 0 || ch > 9 { // Not a digit, finished
+			return Int(neg * res), nil
+		}
+		res = res * 10
+		res = res + int(ch)
+		*index++
+	}
+	return nil, ErrorFromString("unexpected EOF in number")
+}
+*/

+ 0 - 0
generate.go → old/generate.go


+ 0 - 0
grammar.go → old/grammar.go


+ 0 - 0
ll1.debug.tpl → old/ll1.debug.tpl


+ 0 - 0
ll1.dot.tpl → old/ll1.dot.tpl


+ 0 - 0
ll1.go → old/ll1.go


+ 0 - 0
ll1.parser.go.lined.tpl → old/ll1.parser.go.lined.tpl


+ 0 - 0
ll1.parser.go.tpl → old/ll1.parser.go.tpl


+ 0 - 0
ll1_parser.go → old/ll1_parser.go


+ 0 - 0
main.go → old/main.go


+ 0 - 0
parser.go → old/parser.go


+ 0 - 0
template_functions.go → old/template_functions.go