Browse Source

WIP lexer generator

Beoran 2 years ago
parent
commit
f8d4540c67
8 changed files with 253 additions and 27 deletions
  1. 40 23
      README.md
  2. 60 0
      cmd/ll1lex/flexer.go
  3. 31 3
      flexer/flexer.go
  4. 18 0
      flexgen/flexer.flex
  5. 61 0
      flexgen/flexer_lexer.go
  6. 41 0
      flexgen/flexer_parser.go
  7. 1 0
      flexgen/generator.go
  8. 1 1
      go.mod

+ 40 - 23
README.md

@@ -1,8 +1,8 @@
 # ll1
 
-ll1 is a tool to parse and check LL(1) specifications, and to generate 
-code or reports using Go templates based on these specifications. 
-ll1 specifications must contain a definition for an ll1 grammar, and 
+ll1 is a tool to parse and check LL(1) specifications, and to generate
+code or reports using Go templates based on these specifications.
+ll1 specifications must contain a definition for an ll1 grammar, and
 may optionally also specify a lexer for that grammar.
 
 # Usage
@@ -14,27 +14,27 @@ The [options] are:
     -append file
         Name of output file to append. Takes precedence over -out.
     -define definition
-    	Add a definition for the template, in the form of key:value or 
-        []key:value. Keys that start with a [] are arrays and can be 
+    	Add a definition for the template, in the form of key:value or
+        []key:value. Keys that start with a [] are arrays and can be
         concatenated to by specifying the same definition key again.
-        Non array keys will be overwoitten if they are specified again. 
+        Non-array keys will be overwritten if they are specified again.
     -help
         Shows the help page.
     -out file
-        Name of output file to overwrite. 
+        Name of output file to overwrite.
     -template file
-    	Template file to expand. This may be repeated to make use 
+    	Template file to expand. This may be repeated to make use
         of several templates to generate one output file.
     -verbose
     	Be more verbose. Shows the scanned tokens as well.
 
-The names of template files may be given with the -t option, or after the 
+The names of template files may be given with the -t option, or after the
 ll1 input file.
 
 # Syntax
 
-The syntax of an LL1 grammar itself is: 
-    
+The syntax of an LL1 grammar itself is:
+
     Specification -> Grammar OptLexer.
     Grammar -> Rules.
     Rules -> Rule OptRules .
@@ -43,10 +43,10 @@ The syntax of an LL1 grammar itself is:
     Name -> ruleName .
     Template -> rawString | epsilon .
     // Alternates consist of sequences.
-    Definition -> Alternates . 
+    Definition -> Alternates .
     Alternates -> Sequence OptSequences .
     OptSequences -> or Alternates | epsilon.
-    Sequence -> Element OptElements . 
+    Sequence -> Element OptElements .
     OptElements -> Element OptElements | epsilon .
     Element -> Parenthesis .
     Element -> Name .
@@ -54,7 +54,7 @@ The syntax of an LL1 grammar itself is:
     Parenthesis -> '(' Definition ')' .
     OptLexer -> LexerTerminal OptLexerTerminals | epsilon .
     LexerTerminal -> terminalName arrow LexerDefinition Template .
-    LexerDefinition -> LexerAlternates . 
+    LexerDefinition -> LexerAlternates .
     LexerAlternates -> LexerPattern OptLexerMatches .
     OptLexerMatches -> or LexerPattern | epsilon.
     LexerPattern -> literal .
@@ -71,38 +71,55 @@ The syntax of an LL1 grammar itself is:
     epsilon      -> "epsilon" | 'ε'
     arrow        -> "->" | '→'
 
-The syntax of an ll1 grammar has the following elements:  
+The syntax of an ll1 grammar has the following elements:
   - //comment : Line comments start with //, /*block comments*/ are C-like
-  - RuleName  : names that start with an upper case letter are 
+  - RuleName  : names that start with an upper case letter are
                 rule names or nonterminals defined by the grammar.
-  - terminal  : names that start with a lower case letter are names of 
+  - terminal  : names that start with a lower case letter are names of
                terminals that the lexer produces.
   - 'l'       : single quoted strings are rune literals that the lexer produces.
   - "literal" : double quoted strings are rune literals that the lexer produces.
   - arrow     : a literal -> → as a separator.
  - epsilon   : a literal "epsilon" or 'ε', which indicates the empty rule.
-                this is used in conjunction with alternates to make a rule 
+                this is used in conjunction with alternates to make a rule
                 optional.
 
 # Templates
 
-If no templates are given, ll1 simply checks the grammar and outputs a 
+If no templates are given, ll1 simply checks the grammar and outputs a
 simple text report to the output file.
 
-If a template is given, it will be expanded and output to the output file. 
+If a template is given, it will be expanded and output to the output file.
 
-Inside the template the following variables are available: 
+Inside the template the following variables are available:
   - .Grammar: contains the .Rules of the grammar.
   - .InName: contains the name of the ll1 input file.
   - .OutName: contains the name of the output file specified with -a or -o.
   - .Templates: contains the names of the templates read.
   - .Definitions: contains the keys of the available definitions.
   - All other variables defined with -d
-    
+
 Inside the ll1 templates, the following template functions are available:
   - Most functions from the strings package (see go doc strings).
   - CompileRegexp compiles a regexp package regexp which can be used as such.
  - ToString to convert anything that isn't a string to a string.
  - NewMap creates a map based on its arguments, which have string keys and interface{} values.
    This is handy to pass multiple arguments to a sub-template.
-  - NewList creates a list from the given arguments.
+  - NewList creates a list from the given arguments.
+
+
+
+Follow conflict example:
+
+A -> B | C .
+
+B -> D e .
+C -> e .
+D -> f | epsilon .
+
+Since D is optional and can be empty, an LL(1)
+parser cannot decide between B and C, because
+when D is empty, both alternatives start with e.
+
+
+

+ 60 - 0
cmd/ll1lex/flexer.go

@@ -0,0 +1,60 @@
+package main
+
+import "os"
+import "io"
+import "path/filepath"
+import "fmt"
+import "flag"
+import "strings"
+import "src.eruta.nl/beoran/ll1/flexer"
+
// help prints the usage message for the flexer tool on standard
// output and exits with status 1.
func help() {
	const usage = "flexer usage: flexer [-o output.go] input.flex\n" +
		"\nGenerates a lexer in Go language.\n"
	fmt.Print(usage)
	os.Exit(1)
}
+
// showError reports err, attributed to the input in, on standard error.
func showError(in string, err interface{}) {
	msg := fmt.Sprintf("%s: error: %s\n", in, err)
	fmt.Fprint(os.Stderr, msg)
}
+
// fatal reports err on standard error and terminates the program
// with the given exit code.
func fatal(err error, code int) {
	fmt.Fprint(os.Stderr, "flexer: error: "+err.Error()+"\n")
	os.Exit(code)
}
+
+func main() {
+	outPtr := flag.String("o", "", "")
+	flag.Parse()
+	if flag.NArg() < 1 {
+		help()
+	}
+	inName := flag.Arg(0)
+	inFile, err := os.Open(inName)
+	if err != nil {
+		fatal(err, 1)
+	}
+	defer inFile.Close()
+	buf, err := io.ReadAll(inFile)
+	if err != nil {
+		fatal(err, 2)
+	}
+	outName := inName + ".go"
+	if outPtr != nil && *outPtr != "" {
+		outName = *outPtr
+	}
+	abs, err := filepath.Abs(outName)
+	if err != nil {
+		fatal(err, 3)
+	}
+	parts := filepath.SplitList(abs)
+	pkg := strings.TrimSuffix(filepath.Base(inName), filepath.Ext(inName))
+	if len(parts) > 2 {
+		pkg = parts[len(parts)-1]
+	}
+	tokens := flexer.LexFlexerInputString(inName, string(buf))
+	fmt.Printf("package %s\n\n", pkg)
+	for _, tok := range tokens {
+		fmt.Printf("tok: %v\n", tok)
+	}
+}

+ 31 - 3
flexer/flexer.go

@@ -252,6 +252,19 @@ func (f *Flexer) Rule(kind Kind, expr, context string, act Action) error {
 	return nil
 }
 
+func (f *Flexer) EscapedStringRule(kind Kind, first, last, context string) {
+	f.Rule(SkipKind, first, "", ContextAction(context))
+	f.Rule(kind, last, context, PopAction(kind))
+	f.Rule(SkipKind, `\\[etnru][0-9a-f]*`, context, EscapeAction(last[0]))
+	f.Rule(SkipKind, `.`, context, StoreAction())
+}
+
+func (f *Flexer) RawStringRule(kind Kind, first, last, context string) {
+	f.Rule(SkipKind, first, "", ContextAction(context))
+	f.Rule(kind, last, context, PopAction(kind))
+	f.Rule(SkipKind, `.`, context, StoreAction())
+}
+
 func (f *Flexer) PushContext(context string) {
 	f.contexts = append(f.contexts, context)
 }
@@ -308,11 +321,22 @@ func NewFlexer(name, text string) *Flexer {
 	return res
 }
 
+// KeepToken returns true if the token should be kept
+// and false if it is of the kind in the skip list
+func KeepToken(tok Token, skips ...Kind) bool {
+	for _, skip := range skips {
+		if skip == tok.Kind() {
+			return false
+		}
+	}
+	return true
+}
+
 // Lexes all tokens from the lexer until it reaches
 // EOF, or until it cannot progress anymore.
-// All tokens of kind SkipKind will be skipped
+// All tokens in the skip array will be skipped
 // from the results.
-func LexAll(lex Lexer) []Token {
+func LexAll(lex Lexer, skips ...Kind) []Token {
 	res := []Token{}
 	for !lex.EOF() {
 		toks := lex.LexOnce()
@@ -321,7 +345,11 @@ func LexAll(lex Lexer) []Token {
 			res = append(res, err)
 			return res
 		}
-		res = append(res, toks...)
+		for _, tok := range toks {
+			if KeepToken(tok, skips...) {
+				res = append(res, tok)
+			}
+		}
 	}
 	return res
 }

+ 18 - 0
flexgen/flexer.flex

@@ -0,0 +1,18 @@
+// The flexer generator's own input lexer specification
+line-comment 	-> "//[^\n\r]+" flex-skip .
+block-comment	-> "/\*(?ms:.)*?\*/" flex-skip .
+dot          	-> '\.' .
+literal-string  -> `"` flex-esc-string  .
+literal-raw     -> "`" flex-string  .
+literal-char    -> "'" flex-string  .
+terminal     	-> "[[:lower:]][[:alnum:]_-]+" .
+epsilon      	-> "(?:epsilon|ε)" .
+arrow        	-> "(?:->|→)" .
+flexer-keyword	-> "flex-skip|flex-string|flex-esc-string" .
+whitespace   	-> "[ \t\n\r]+" flex-skip .
+flexer-action   -> `@{(?ms:.)*?}@`
+@{
+// This is the rule action.
+@ { } @
+}@
+.

+ 61 - 0
flexgen/flexer_lexer.go

@@ -0,0 +1,61 @@
+package flexer
+
// Token kinds for the flexer generator's own input language (see
// flexer.flex). The -1 - iota expression makes them count down from
// -1, presumably to keep them clear of other Kind values — TODO
// confirm against the Kind type's other users.
const (
	FlexerKindDot = Kind(-1 - iota) // '.' that terminates a rule
	FlexerKindLiteralString         // double-quoted string with escapes
	FlexerKindLiteralRaw            // backquoted raw string
	FlexerKindLiteralChar           // single-quoted character literal
	FlexerKindTerminal              // lower-case terminal name
	FlexerKindArrow                 // "->" or '→'
	FlexerKindFlexerKeyword         // flex-skip, flex-string, flex-esc-string
	FlexerKindWhitespace            // spaces, tabs, newlines
	FlexerKindLineComment           // // line comment
	FlexerKindBlockComment          // /* block comment */
	FlexerKindFlexerAction          // @{ ... }@ action block
)
+
+/*
+
+// Flexer generator's input own lexer specification
+dot          	-> '\.' .
+arrow        	-> "(?:->|→)" .
+terminal     	-> "[[:lower:]][[:alnum:]_-]+" .
+literal-string  -> `"` flex-esc-string  .
+literal-raw     -> "`" flex-string  .
+literal-char    -> "'" flex-string  .
+flexer-keyword	-> "flex-skip|flex-string|flex-esc-string" .
+whitespace   	-> "[ \t\n\r]+" flex-skip .
+line-comment 	-> "//[^\n\r]+" flex-skip .
+block-comment	-> "/\*(?ms:.)*?\* /" .
+flexer-action   -> `@{(?ms:.)*?}@` .
+
+*/
+
// CheckedError wraps an error raised by Check, so that such panics
// can be told apart from any other panic when recovered.
type CheckedError struct {
	Error error
}

// Check panics with a CheckedError wrapping err when err is not nil.
// It lets lexer rule setup bail out without explicit error plumbing.
func Check(err error) {
	if err == nil {
		return
	}
	panic(CheckedError{err})
}
+
+func LexFlexerInputString(name, input string) []Token {
+	f := NewFlexer(name, input)
+	Check(f.Rule(FlexerKindWhitespace, `[ \t\n\r]+`, "", nil))
+	Check(f.Rule(FlexerKindLineComment, `//[^\n\r]+[\n\r]+`, "", nil))
+	Check(f.Rule(FlexerKindBlockComment, `/\*(?ms:.)*?\*/`, "", nil))
+	Check(f.Rule(FlexerKindDot, `\.`, "", nil))
+	Check(f.Rule(FlexerKindArrow, `(?:->|→)`, "", nil))
+	Check(f.Rule(FlexerKindTerminal, `[[:lower:]][[:alnum:]_-]+`, "", nil))
+	f.EscapedStringRule(FlexerKindLiteralString, `"`, `"`, "literal-string")
+	f.RawStringRule(FlexerKindLiteralRaw, "`", "`", "literal-raw")
+	f.RawStringRule(FlexerKindLiteralChar, `''`, `''`, "literal-char")
+	Check(f.Rule(FlexerKindFlexerKeyword, `flex-skip|flex-string|flex-esc-string`, "", nil))
+	Check(f.Rule(FlexerKindFlexerAction, `@{(?ms:.)*?}@`, "", nil))
+
+	skipKinds := []Kind{SkipKind, FlexerKindWhitespace, FlexerKindBlockComment, FlexerKindLineComment}
+
+	return LexAll(f, skipKinds...)
+}

+ 41 - 0
flexgen/flexer_parser.go

@@ -0,0 +1,41 @@
+package flexer
+
+/* Parser for the flexer lexer generator.  */
+
// GeneratorRule is one parsed lexer rule from a .flex specification:
// a terminal name, its pattern, optional flex-* modifier keywords and
// an optional @{ ... }@ action body.
type GeneratorRule struct {
	Name     string   // terminal name to the left of the arrow
	Regexp   string   // pattern the rule matches
	Keywords []string // flex-skip / flex-string / flex-esc-string modifiers
	Action   string   // raw action text, empty when absent
}

// GeneratorParser is a simple index-based parser over the token
// stream produced by LexFlexerInputString.
type GeneratorParser struct {
	Tokens []Token         // input token stream
	Index  int             // position of the current token
	Rules  []GeneratorRule // rules accumulated so far
}
+
+func (g GeneratorParser) Token() Token {
+	return g.Tokens[g.Index]
+}
+
+func (g GeneratorParser) Accept() (Token, err) {
+
+}
+
+func (g *GeneratorParser) ParseRule() error {
+	tok := g.Token()
+	if tok.Kind() != FlexerKind Terminal
+
+	for g.Index < len(g.Tokens) {
+		g.ParseRule()
+	}
+	return nil
+}
+
+func (g *GeneratorParser) Parse() error {
+	for g.Index < len(g.Tokens) {
+		g.ParseRule()
+	}
+	return nil
+}

+ 1 - 0
flexgen/generator.go

@@ -0,0 +1 @@
+package flexer

+ 1 - 1
go.mod

@@ -1,3 +1,3 @@
-module src.eruta.nl/ll1
+module src.eruta.nl/beoran/ll1
 
 go 1.16