// parser.go — lexer and recursive-descent parser for LL(1) grammar definitions.
  1. package main
  2. import "text/scanner"
  3. import "fmt"
  4. import "io"
  5. import "os"
  6. import "unicode"
// Value is the lexical value of a lexer token.
// This is an alias for string in the case of ll1.
type Value = string

// Position is a position within a source file. Since the lexer is based on
// text/scanner, we use that package's Position.
type Position = scanner.Position

// TokenKind is the kind or type of a token.
// This has rune as the underlying type so one-character tokens can be easily
// supported. EOF will be 65535 (i.e. -1 cast to rune). Non-character token
// kinds will start from 65533 down (i.e. -3, -4, -5, etc).
type TokenKind rune

// NoTokenKind means "no token kind", i.e. no token.
const NoTokenKind TokenKind = TokenKind(0)

// TokenKindEOF means the end of the input.
const TokenKindEOF TokenKind = TokenKind(scanner.EOF)

// TokenKindError means a parsing or lexing error was encountered.
//line ll1.parser.go.tpl:80
const TokenKindError TokenKind = TokenKind(-21)

// Scanner-based token kinds, mirroring the text/scanner token classes.
const TokenKindIdent TokenKind = TokenKind(scanner.Ident)
const TokenKindInt TokenKind = TokenKind(scanner.Int)
const TokenKindFloat TokenKind = TokenKind(scanner.Float)
const TokenKindChar TokenKind = TokenKind(scanner.Char)
const TokenKindString TokenKind = TokenKind(scanner.String)
const TokenKindRawString TokenKind = TokenKind(scanner.RawString)
const TokenKindComment TokenKind = TokenKind(scanner.Comment)

// Grammar-based token kinds: the meta-symbols of the grammar notation itself.
const (
	TokenKindEpsilon  TokenKind = TokenKind('ε')
	TokenKindOr       TokenKind = TokenKind('|')
	TokenKindArrow    TokenKind = TokenKind('→')
	TokenKindStar     TokenKind = TokenKind('*')
	TokenKindOptional TokenKind = TokenKind('?')
	TokenKindList     TokenKind = TokenKind('+')
	TokenKindRuleEnd  TokenKind = TokenKind('.')
)
  43. // Convert token kind to a string representation
  44. //line ll1.parser.go.tpl:86
  45. func (tk TokenKind) String() string {
  46. return scanner.TokenString(rune(tk))
  47. }
// Token is the result of a single lexical analysis step by the lexer.
//line ll1.parser.go.tpl:109
type Token struct {
	Position  // Position in the source where the token was found.
	TokenKind // Type of the token.
	Value     // Value of the token.
}
  55. // MakeToken makes a token with the given position, type and value.
  56. //line ll1.parser.go.tpl:118
  57. func MakeToken(pos Position, typ TokenKind, val Value) Token {
  58. return Token{ pos, typ, val}
  59. }
// Lexer performs the lexical analysis of the input.
//line ll1.parser.go.tpl:124
type Lexer struct {
	// Embed scanner.Scanner so Lexer inherits its scanning machinery.
	scanner.Scanner
	// Filename is stamped onto the Position of every token produced.
	Filename string
}
  67. // NewLexerFromReader creates a new lexer for the given parser and input.
  68. //line ll1.parser.go.tpl:133
  69. func NewLexerFromReader(parser *Parser, reader io.Reader, filename string) *Lexer {
  70. lexer := &Lexer{}
  71. lexer.Filename = filename
  72. lexer.Scanner.Init(reader)
  73. lexer.Scanner.Mode = scanner.GoTokens
  74. lexer.Scanner.Error = func (s *scanner.Scanner, msg string) {
  75. parser.Panicf("%s: scanner error: %s, %s", s.Position, s.TokenText(), msg)
  76. }
  77. // XXX: needs to be generated from the identifier rule in the syntax!
  78. lexer.Scanner.IsIdentRune = func(ch rune, i int) bool {
  79. if i == 0 {
  80. return unicode.IsLetter(ch)
  81. }
  82. return unicode.IsLetter(ch) ||
  83. unicode.IsNumber(ch) ||
  84. ch == '_' ||
  85. ch == '-'
  86. }
  87. return lexer
  88. }
  89. //line ll1.parser.go.tpl:155
  90. func (lex *Lexer) Lex() Token {
  91. scanned := lex.Scanner.Scan()
  92. pos := lex.Scanner.Position
  93. pos.Filename = lex.Filename
  94. value := lex.Scanner.TokenText()
  95. // Get rid of the quotes
  96. if scanned == scanner.Char ||
  97. scanned == scanner.String ||
  98. scanned == scanner.RawString {
  99. value = value[1:len(value) - 1]
  100. }
  101. token := Token {
  102. TokenKind: TokenKind(scanned),
  103. Value: value,
  104. Position: pos,
  105. }
  106. return token
  107. }
// This parser is for parsing LL1 grammars, not Beo.
type Parser struct {
	reader  io.Reader       // Input being parsed.
	scanner scanner.Scanner // Scanner used for lexical analysis.
	current Token           // Current lookahead token; zero kind means "not yet lexed".
	Errors  []ParserError   // All errors recorded so far by Errorf/Panicf.
	Filename string         // Name of the input, used in positions and messages.
	Debug   io.Writer       // Debug output destination; nil disables debug logging.
}
  117. func NewParserFromReader(reader io.Reader, filename string, debug bool) *Parser {
  118. parser := &Parser{}
  119. parser.reader = reader
  120. parser.scanner.Init(parser.reader)
  121. parser.Filename = filename
  122. parser.current.Kind = 0
  123. parser.Debug = nil
  124. if debug {
  125. parser.Debug = os.Stderr
  126. }
  127. parser.scanner.Mode = scanner.GoTokens
  128. parser.scanner.Error = func (s *scanner.Scanner, msg string) {
  129. parser.Panicf("%s: scanner error: %s, %s", s.Position, s.TokenText(), msg)
  130. }
  131. parser.scanner.IsIdentRune = func(ch rune, i int) bool {
  132. if i == 0 {
  133. return unicode.IsLetter(ch)
  134. }
  135. return unicode.IsLetter(ch) ||
  136. unicode.IsNumber(ch) ||
  137. ch == '_' ||
  138. ch == '-'
  139. }
  140. return parser
  141. }
  142. func (p *Parser) Lex() Token {
  143. scanned := p.scanner.Scan()
  144. pos := p.scanner.Position
  145. pos.Filename = p.Filename
  146. value := p.scanner.TokenText()
  147. // Get rid of the quotes
  148. if scanned == scanner.Char ||
  149. scanned == scanner.String ||
  150. scanned == scanner.RawString {
  151. value = value[1:len(value) - 1]
  152. }
  153. token := Token {
  154. scanned,
  155. value,
  156. pos,
  157. }
  158. p.Debugf("Lexed: %s\n", p.Filename, token)
  159. return token
  160. }
  161. func (p *Parser) Advance() {
  162. token := p.Lex()
  163. p.current = token
  164. }
// panic with this type on errors that would prevent the parser
// from making progress.
type ParserError struct {
	*Parser       // Parser that produced the error.
	*Token        // Token at which the error was detected.
	Chain   error // Underlying error being wrapped.
}

// Error implements the error interface by decorating the chained error's
// message via Token.MakeError (defined elsewhere in this package —
// presumably it adds position information; verify at its definition).
func (pe ParserError) Error() string {
	return pe.Token.MakeError(pe.Chain.Error()).Error()
}
  175. func (parser *Parser) Errorf(message string, args ...interface{}) ParserError {
  176. err := fmt.Errorf(message, args...)
  177. pe := ParserError { Parser: parser, Token:&parser.current, Chain: err }
  178. parser.Errors = append(parser.Errors, pe)
  179. return pe
  180. }
  181. func (parser *Parser) Panicf(message string, args ...interface{}) {
  182. pe := parser.Errorf(message, args...)
  183. panic(pe)
  184. }
  185. func (p *Parser) Debugf(message string, args ...interface{}) {
  186. if p.Debug != nil {
  187. fmt.Fprintf(p.Debug, message, args)
  188. }
  189. }
  190. /* Looks at the current token and advances the lexer if the token is of any of
  191. the token kinds given in kinds. In this case it will return the accepted
  192. token and advance the parser. Otherwise, it will call parser.Panicf.*/
  193. func (parser *Parser) Require(kinds ...rune) Token {
  194. parser.Debugf("Require: %v\n", kinds)
  195. if parser.current.Kind == 0 {
  196. parser.Advance()
  197. }
  198. expected := ""
  199. sep := ""
  200. for _, kind := range kinds {
  201. if kind == parser.current.Kind {
  202. accepted := parser.current
  203. parser.Advance()
  204. return accepted
  205. }
  206. expected = fmt.Sprintf("%s%s%s", expected, sep, scanner.TokenString(kind))
  207. }
  208. parser.Panicf("error: expected one of the following: %s", expected)
  209. return Token{}
  210. }
  211. func (parser Parser) NextIs(kinds ...rune) bool {
  212. parser.Debugf("NextIs: %v\n", kinds)
  213. if (parser.current.Kind == 0) {
  214. parser.Advance()
  215. }
  216. for _, kind := range kinds {
  217. if kind == parser.current.Kind {
  218. return true
  219. }
  220. }
  221. return false
  222. }
  223. /*
  224. Sequence -> Element Sequence | .
  225. Element -> Parenthesis | Name | Literal .
  226. Parenthesis -> '(' Definition ')' .
  227. */
  228. func (p *Parser) NextIsElement() (bool) {
  229. return p.NextIs('(', scanner.Ident, scanner.Char, scanner.String)
  230. }
  231. func (p *Parser) NextIsOperator() (bool) {
  232. return p.NextIs('+', '*')
  233. }
  234. func (p *Parser) ParseElement() (Element, error) {
  235. switch {
  236. case p.NextIs('('):
  237. return p.ParseParenthesis()
  238. case p.NextIs(scanner.Ident):
  239. ident := p.Require(scanner.Ident)
  240. if ident.Value == "epsilon" || ident.Value == "ε" {
  241. ident.Value = "ε"
  242. return Epsilon{ element { token: ident } }, nil
  243. }
  244. // Upper case means a nonterminal, otherwise terminal
  245. if unicode.IsUpper(([]rune(ident.Value))[0]) {
  246. return Nonterminal{ element { token: ident }, nil }, nil
  247. }
  248. return Terminal{ element { token: ident } }, nil
  249. case p.NextIs(scanner.Char, scanner.String):
  250. literal := p.Require(scanner.Char, scanner.String)
  251. return Literal{ Terminal { element { token: literal } } }, nil
  252. default:
  253. p.Panicf("error: unexpected for grammar Element: %s", p.current)
  254. }
  255. return nil, nil
  256. }
  257. func (p *Parser) ParseSequence() (*Sequence, error) {
  258. sequence := &Sequence{}
  259. for p.NextIsElement() {
  260. element, err := p.ParseElement()
  261. p.Debugf("Sequence parsed element: %v\n", element)
  262. if err != nil {
  263. return sequence, err
  264. }
  265. sequence.Elements = append(sequence.Elements, element)
  266. }
  267. return sequence, nil
  268. }
  269. func (p *Parser) ParseParenthesis() (*Definition, error) {
  270. // Parse the sub definition
  271. p.Require('(')
  272. sub, err := p.ParseDefinition()
  273. p.Require(')')
  274. return sub, err
  275. }
  276. func (p *Parser) ParseAlternates() (*Definition, error) {
  277. alternates := &Alternates{}
  278. for p.NextIsElement() {
  279. sequence, err := p.ParseSequence()
  280. if err != nil {
  281. return alternates, err
  282. }
  283. alternates.Sequences = append(alternates.Sequences, *sequence)
  284. if !p.NextIs('.', scanner.RawString, scanner.EOF, ')') {
  285. p.Require('|')
  286. }
  287. }
  288. return alternates, nil
  289. }
  290. func (p *Parser) ParseDefinition() (*Definition, error) {
  291. return p.ParseAlternates()
  292. /*if p.NextIs('(') {
  293. sub, err := p.ParseParenthesis()
  294. if err != nil {
  295. return nil, err
  296. }
  297. }*/
  298. }
  299. func (p *Parser) ParseRule() (*Rule, error) {
  300. rule := &Rule{}
  301. name := p.Require(scanner.Ident)
  302. // XXX Require or → or -> separator.
  303. // This is a hack, needs to be moved to a lexer.
  304. if p.NextIs('→') {
  305. p.Require('→')
  306. } else {
  307. p.Require('-')
  308. p.Require('>')
  309. }
  310. definition, err := p.ParseDefinition()
  311. if err != nil {
  312. return rule, err
  313. }
  314. if p.NextIs(scanner.RawString) {
  315. rs := p.Require(scanner.RawString)
  316. rule.Template = rs.Value
  317. }
  318. // require '.' terminator
  319. p.Require('.')
  320. rule.Name = name.Value
  321. rule.Definition = *definition
  322. return rule, nil
  323. }
  324. func (p *Parser) ParseRules() ([]*Rule, error) {
  325. var rules []*Rule
  326. for p.current.Kind != scanner.EOF {
  327. rule, err := p.ParseRule()
  328. if err != nil {
  329. return rules, err
  330. }
  331. appended := false
  332. for i, existing := range(rules) {
  333. if existing.Name == rule.Name {
  334. // Add to the definition in stead
  335. existing.Definition.Sequences = append(existing.Definition.Sequences, rule.Definition.Sequences...)
  336. rules[i] = existing
  337. appended = true
  338. break
  339. }
  340. }
  341. if !appended {
  342. rules = append(rules, rule)
  343. }
  344. }
  345. return rules, nil
  346. }
  347. func (p *Parser) Parse() (g *Grammar, e error) {
  348. defer func() {
  349. thrown := recover()
  350. if thrown != nil {
  351. g = nil
  352. e = thrown.(error)
  353. return
  354. }
  355. }()
  356. p.Advance()
  357. rules, err := p.ParseRules()
  358. if err != nil {
  359. return nil, err
  360. }
  361. if len(rules) < 1 {
  362. return nil, nil
  363. }
  364. grammar := &Grammar {
  365. Top: rules[0],
  366. Rules: rules,
  367. }
  368. return grammar, nil
  369. }
  370. func ParseFile(filename string, debug bool) (*Parser, *Grammar, error) {
  371. file, err := os.Open(filename)
  372. defer file.Close()
  373. parser := NewParserFromReader(file, filename, debug)
  374. grammar, err := parser.Parse()
  375. return parser, grammar, err
  376. }