flexer.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. package flexer
  2. import "fmt"
  3. import "regexp"
  4. import "strings"
  5. import "strconv"
  6. import . "src.eruta.nl/beoran/ll1/common"
  7. /* Flexer is a flexible regexp and lexeme based
  8. lexer that can be used as an implementation for
  9. generated code.
  10. */
// BasicToken is a simple implementation of a token: a location,
// a kind, the matched text and an optional explicit value.
type BasicToken struct {
	location Location // where in the input the token was lexed
	kind     Kind     // the kind of the token
	text     string   // the raw matched text
	value    Value    // optional explicit value; may be nil
}
// Kind returns the kind of this token.
func (bt BasicToken) Kind() Kind {
	return bt.kind
}
// Location returns the location where this token was lexed.
func (bt BasicToken) Location() Location {
	return bt.location
}
// Text returns the raw matched text of this token.
func (bt BasicToken) Text() string {
	return bt.text
}
  26. func (bt BasicToken) Value() Value {
  27. if bt.Value() == nil {
  28. return StringValue(bt.text)
  29. } else {
  30. return bt.value
  31. }
  32. }
  33. func MakeToken(Location Location, kind Kind, form string,
  34. args ...interface{}) BasicToken {
  35. text := fmt.Sprintf(form, args...)
  36. return BasicToken{Location, kind, text, StringValue(text)}
  37. }
  38. func MakeValueToken(Location Location, kind Kind, value Value) BasicToken {
  39. text := value.String()
  40. return BasicToken{Location, kind, text, value}
  41. }
// ErrorToken is a token that represents a lexing error.
// It embeds BasicToken and additionally implements the error interface.
type ErrorToken struct {
	BasicToken
}
  45. func MakeErrorToken(Location Location, form string, args ...interface{}) ErrorToken {
  46. err := fmt.Errorf(form, args...)
  47. tok := MakeValueToken(Location, ErrorKind, ErrorValue{err})
  48. return ErrorToken{tok}
  49. }
// Error implements the error interface: the token's location formatted
// with %s, followed directly by the error text (no separator).
func (e ErrorToken) Error() string {
	return fmt.Sprintf("%s%s", e.Location(), e.text)
}
/* Lexeme for Flexer is based on a regular expression.
 * While the lexeme may have submatches, the lexer will consume
 * the whole match if it matches at the beginning of the current input.
 */
type Lexeme struct {
	Kind           // kind of token this lexeme produces
	*regexp.Regexp // pattern that must match at the start of the input
	Context string // lexer context in which this lexeme applies; "" is the default
	Action         // action run on a match; nil falls back to DefaultAction
}
  63. // DefaultAction is the default action on a match.
  64. // If there is only 1 match, then that is the token,
  65. // otherwise all sub-macthes excluding the first
  66. // whole string match are the tokens.
  67. func DefaultAction(lex Lexer, k Kind, matches ...string) []Token {
  68. if len(matches) == 1 {
  69. tok := lex.MakeToken(k, matches[0])
  70. return []Token{tok}
  71. }
  72. res := []Token{}
  73. for i := 1; 1 < len(matches); i++ {
  74. tok := lex.MakeToken(k, matches[i])
  75. res = append(res, tok)
  76. }
  77. return res
  78. }
  79. // ContextAction returns an action that returns
  80. // no tokens but switches the lexer context and
  81. // empties the buffer.
  82. func ContextAction(context string) func(lex Lexer, k Kind, matches ...string) []Token {
  83. return func(lex Lexer, k Kind, matches ...string) []Token {
  84. lex.PushContext(context)
  85. lex.Builder().Reset()
  86. return []Token{}
  87. }
  88. }
  89. // Returns an action that pops the context and
  90. // returns the token in the buffer with the given kind
  91. func PopAction(kind Kind) func(lex Lexer, k Kind, matches ...string) []Token {
  92. return func(lex Lexer, k Kind, matches ...string) []Token {
  93. lex.PopContext()
  94. tok := lex.MakeBuilderToken(kind)
  95. return []Token{tok}
  96. }
  97. }
  98. // Returns an action that stores the match in the lexer buffer.
  99. func StoreAction() func(lex Lexer, k Kind, matches ...string) []Token {
  100. return func(lex Lexer, k Kind, matches ...string) []Token {
  101. for _, m := range matches {
  102. lex.Builder().WriteString(m)
  103. }
  104. return []Token{}
  105. }
  106. }
  107. // Returns an action that stores the match in the lexer buffer after applying UnquoteChar to apply
  108. // an escape sequence.
  109. func EscapeAction(quote byte) func(lex Lexer, k Kind, matches ...string) []Token {
  110. return func(lex Lexer, k Kind, matches ...string) []Token {
  111. s, _, t, e := strconv.UnquoteChar(matches[0], quote)
  112. print("escape", s, t, e)
  113. if e != nil {
  114. et := lex.MakeToken(ErrorKind, "%s", e)
  115. return []Token{et}
  116. }
  117. lex.Builder().WriteRune(s)
  118. lex.Builder().WriteString(t)
  119. return []Token{}
  120. }
  121. }
  122. // Try tries to apply a lexeme.
  123. // Returns nil on no match.
  124. func (r Lexeme) Try(lex Lexer) []Token {
  125. matches := lex.Accept(r.Regexp)
  126. if matches == nil || len(matches) == 0 {
  127. return nil
  128. }
  129. if r.Action != nil {
  130. return r.Action(lex, r.Kind, matches...)
  131. }
  132. // No action, use default action
  133. return DefaultAction(lex, r.Kind, matches...)
  134. }
// Flexer is a flexible regexp-and-lexeme based lexer implementation.
type Flexer struct {
	index    int             // current byte offset into input
	location Location        // current location (line/column) in input
	lexemes  []Lexeme        // registered lexemes, tried in order
	input    string          // the whole input being lexed
	name     string          // name of the input, e.g. a file name
	contexts []string        // stack of lexer contexts
	builder  strings.Builder // buffer used by string-like lexemes
}
// MakeToken creates a token of the given kind at the flexer's current
// location. form is a fmt-style format string applied to args.
func (f Flexer) MakeToken(kind Kind, form string, args ...interface{}) Token {
	return MakeToken(f.location, kind, form, args...)
}
  147. func (f *Flexer) MakeBuilderToken(kind Kind) Token {
  148. text := f.builder.String()
  149. f.builder.Reset()
  150. return f.MakeToken(kind, text)
  151. }
// advanceTo advances the flexer to the given byte index, updating the
// location's line and column counters for every character passed.
func (f *Flexer) advanceTo(index int) {
	start := f.index
	end := index
	for i := start; i < end; i++ {
		c := f.input[i] // This works because newlines are ascii.
		if c == '\r' || c == '\n' {
			// Treat a CR LF pair as one line ending: skip the LF so
			// the pair only increments Line once.
			// NOTE(review): the lookahead is bounded by len(f.input),
			// not by end; if an advance ever stops exactly between CR
			// and LF the LF could be double-counted on the next call —
			// confirm callers always advance past whole matches.
			if c == '\r' && (i+1) < len(f.input) {
				if f.input[i+1] == '\n' {
					i++
				}
			}
			f.location.Line++
			f.location.Col = 1
		} else {
			f.location.Col++
		}
	}
	f.index = end
}
  173. func (f *Flexer) Accept(re *regexp.Regexp) []string {
  174. indexes := re.FindStringSubmatchIndex(f.input[f.index:len(f.input)])
  175. if indexes == nil || len(indexes) < 1 {
  176. return nil
  177. }
  178. _, end := f.index+indexes[0], f.index+indexes[1]
  179. matches := []string{}
  180. for i := 1; i < len(indexes); i += 2 {
  181. subStart, subEnd := indexes[i-1]+f.index, indexes[i]+f.index
  182. sub := f.input[subStart:subEnd]
  183. matches = append(matches, sub)
  184. }
  185. f.advanceTo(end)
  186. return matches
  187. }
  188. func (f *Flexer) Lexeme(kind Kind, expr, context string, act Action) error {
  189. re, err := regexp.Compile(`\A` + expr)
  190. if err != nil {
  191. return err
  192. }
  193. lexeme := Lexeme{kind, re, context, act}
  194. f.lexemes = append(f.lexemes, lexeme)
  195. return nil
  196. }
// EscapedStringLexeme registers the lexemes needed to lex a string
// with escape sequences: first enters the given context, last leaves
// it again and emits the buffered text as a token of the given kind.
// Sequences matching \\[etnru][0-9a-f]* are unescaped; every other
// character is buffered as-is.
// NOTE(review): errors returned by f.Lexeme (bad regexps) are ignored.
// NOTE(review): EscapeAction receives last[0]; this assumes the first
// byte of last is the closing quote character — confirm for escaped
// patterns such as `\"` where last[0] is the backslash.
func (f *Flexer) EscapedStringLexeme(kind Kind, first, last, context string) {
	f.Lexeme(SkipKind, first, "", ContextAction(context))
	f.Lexeme(kind, last, context, PopAction(kind))
	f.Lexeme(SkipKind, `\\[etnru][0-9a-f]*`, context, EscapeAction(last[0]))
	f.Lexeme(SkipKind, `.`, context, StoreAction())
}
// RawStringLexeme registers the lexemes needed to lex a raw string
// without escape sequences: first enters the given context, last
// leaves it again and emits the buffered text as a token of the given
// kind; every other character is buffered as-is.
// NOTE(review): errors returned by f.Lexeme (bad regexps) are ignored.
func (f *Flexer) RawStringLexeme(kind Kind, first, last, context string) {
	f.Lexeme(SkipKind, first, "", ContextAction(context))
	f.Lexeme(kind, last, context, PopAction(kind))
	f.Lexeme(SkipKind, `.`, context, StoreAction())
}
// PushContext pushes the given context onto the flexer's context stack.
func (f *Flexer) PushContext(context string) {
	f.contexts = append(f.contexts, context)
}
  211. func (f *Flexer) Context() string {
  212. context := ""
  213. clen := len(f.contexts)
  214. if clen > 0 {
  215. context = f.contexts[clen-1]
  216. }
  217. return context
  218. }
  219. func (f *Flexer) PopContext() {
  220. clen := len(f.contexts)
  221. if clen > 0 {
  222. f.contexts = f.contexts[0 : clen-1]
  223. }
  224. }
// Builder returns a pointer to the flexer's internal string builder,
// used by actions to accumulate text for string-like lexemes.
func (f *Flexer) Builder() *strings.Builder {
	return &f.builder
}
  228. // Runs the lexer once.
  229. // Return nil if no more progress can be made
  230. func (f *Flexer) LexOnce() []Token {
  231. for _, lexeme := range f.lexemes {
  232. if lexeme.Context != f.Context() {
  233. continue
  234. }
  235. tokens := lexeme.Try(f)
  236. if tokens != nil {
  237. return tokens
  238. }
  239. }
  240. return nil
  241. }
// Location returns the flexer's current location in the input.
func (f Flexer) Location() Location {
	return f.location
}
// EOF reports whether the flexer has consumed all of its input.
func (f Flexer) EOF() bool {
	return f.index >= len(f.input)
}
  248. func NewFlexer(name, text string) *Flexer {
  249. res := &Flexer{}
  250. res.location.Line = 1
  251. res.location.Col = 1
  252. res.location.Name = &name
  253. res.input = text
  254. return res
  255. }
  256. // KeepToken returns true if the token should be kept
  257. // and false if it is of the kind in the skip list
  258. func KeepToken(tok Token, skips ...Kind) bool {
  259. for _, skip := range skips {
  260. if skip == tok.Kind() {
  261. return false
  262. }
  263. }
  264. return true
  265. }
  266. // Lexes all tokens from the lexer until it reaches
  267. // EOF, or until it cannot progress anymore.
  268. // All tokens in the skip array will be skipped
  269. // from the results. If the lexer reachest he end of input,
  270. // a token with kind EndKind will be appended
  271. func LexAll(lex Lexer, skips ...Kind) []Token {
  272. res := []Token{}
  273. for !lex.EOF() {
  274. toks := lex.LexOnce()
  275. if toks == nil {
  276. err := lex.MakeToken(ErrorKind, " Lexer error: no lexeme matches. Context:%s.", lex.Context())
  277. res = append(res, err)
  278. return res
  279. }
  280. for _, tok := range toks {
  281. if KeepToken(tok, skips...) {
  282. res = append(res, tok)
  283. }
  284. }
  285. }
  286. // here we reached EOF
  287. res = append(res, lex.MakeToken(EndKind, "<end>"))
  288. return res
  289. }