lexer.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. package main
  2. import (
  3. "fmt"
  4. "strings"
  5. "unicode"
  6. )
// Ucalgary syntax:
  8. //
  9. // Grammar Structure
  10. //
  11. // The format for a context free grammar is as follows:
  12. //
  13. // Any reference to "punctuation" when describing legal identifiers is restricted
  14. // to certain pieces of punctuation. In no particular order, here are the
  15. // punctuation characters that are legal: ~ ! @ # * ( ) _ + ' ; : / ?
  16. //
  17. // Terminals are strings which either start with a lowercase letter, a number,
  18. // or a piece of punctuation, followed by any amount of letters, numbers and
  19. // punctuation. i.e. a terminal matches the regular expression
  20. // [{punctuation}a-z0-9][{punctuation}a-zA-Z0-9]*. Examples are id + begin end
  21. // plus 9 8' 7+? '' ....
  22. //
  23. // Nonterminals start with uppercase letters, followed by any number of legal
  24. // characters. i.e. a nonterminal matches the regular expression
  25. // [A-Z][{punctuation}a-zA-Z0-9]*. Examples are S A EXPR TERM~3 Stop ....
  26. //
  27. // It is assumed that the start symbol is the first nonterminal whose productions
  28. // are given.
  29. //
  30. // The productions associated with a nonterminal are indicated as
  31. // head -> RHS1 | RHS2 .... | RHSn.
  32. // where the alternative righthand sides of the productions are separated by
// a pipe, "|", and terminated by a period, ".".
  34. //
  35. // The RHS of a production is a sequence of terminals or nonterminals separated
  36. // by spaces. For example: Expr add Expr.
  37. //
  38. // C-style comments are allowed, e.g.
  39. // /* this is a comment possibly spanning many lines */
  40. //
  41. // Empty grammars
  42. // The grammar epsilon or ε IS EMPTY.
  43. // For any nonterminal epsilon is a valid grammar.
  44. // This represents a grammar with no productions.
  45. //
  46. // Contrary to this, ll1 only allows the _ punctuation in identifiers.
  47. //
  48. // Location of the lexer or of a token
  49. type Location struct {
  50. //Name is the name of the file or input.
  51. Name string
  52. //Current index in the input buffer of the token, or of the lex step.
  53. Index int
  54. //Start index in the input buffer at which the token begins.
  55. Start int
  56. //Line the lexer is at or on which a token begins.
  57. Line int
  58. //Col is the column in the line on which the token begins.
  59. Col int
  60. //Index in the input buffer from which the token begins.
  61. }
  62. func (l Location) String() string {
  63. return fmt.Sprintf("%s:%d:%d", l.Name, l.Line, l.Col)
  64. }
  65. type TokenKind rune
  66. const TokenKindSkip = TokenKind(-2)
  67. type Token struct {
  68. Location
  69. TokenKind
  70. Text string
  71. }
  72. // Advance moves to the next rune in input location, updating the
  73. // location's Index, Line and Col.
  74. // Returns the rune found, or -1 if the end of the buffer has been reached.
  75. func (l *Location) Advance(input []rune) rune {
  76. if l.Index >= len(input) {
  77. return -1
  78. }
  79. r := input[l.Index]
  80. if (r == '\n' && l.Index > 0 && input[l.Index] != '\r') ||
  81. (r == '\r') {
  82. l.Line++
  83. }
  84. if (r == '\n') || (r == '\r') {
  85. l.Col = 0
  86. }
  87. l.Col++
  88. l.Index++
  89. return r
  90. }
  91. type Lexer struct {
  92. Location
  93. Input []rune
  94. Rules []LexerFunc
  95. }
  96. type LexerFuncs map[string]LexerFunc
  97. func (l *Lexer) Advance() rune {
  98. return l.Location.Advance(l.Input)
  99. }
  100. func (l Lexer) IsEof() bool {
  101. return l.Index >= len(l.Input)
  102. }
  103. func (l Lexer) Peek() rune {
  104. if l.IsEof() {
  105. return -1
  106. }
  107. return l.Input[l.Location.Index]
  108. }
// LexerFunc is a single lexer rule.
// It lexes the input buffer starting from lex.Index, which must be
// guaranteed by the caller to be non negative.
// The LexerFunc must make progress if it parses a token or returns an error.
// It should NOT make progress if it does not match.
// It should return as follows:
//   - If the lexer function matched what it is intended to lex,
//     it should return the lexed token(s) and nil; lex.Start points to the
//     start of the token, and lex.Index should be moved to point right after
//     the lexed part of the string. To indicate that the lexer should skip
//     the parsed token, set its token kind to TokenKindSkip.
//   - If the lexer function did not match what it is intended to lex,
//     it should return nil, nil, and lex.Index should be unchanged.
//   - If the lexer function did match what it is intended to lex
//     but there is a lexer error, it should return an empty token slice and a
//     non-empty error slice, and lex.Index should be set to the error's
//     location.
type LexerFunc func(lex *Lexer) ([]Token, []error)
  126. var Debug = false
  127. func debug(msg string) {
  128. if Debug {
  129. print(msg)
  130. }
  131. }
  132. func (l Lexer) WhileOk(ok func(r rune, l Lexer) bool) (string, error) {
  133. l.Start = l.Index
  134. now := 0
  135. for !l.IsEof() {
  136. r := l.Advance()
  137. if !ok(r, l) {
  138. if now == 0 {
  139. return "", nil
  140. }
  141. return string(l.Input[l.Start:l.Index]), nil
  142. }
  143. now++
  144. }
  145. return "", fmt.Errorf("unexpected EOF: >" + string(l.Input[l.Start:l.Index]) + "<")
  146. }
  147. func NewLexerFromString(input, name string, funcs []LexerFunc) *Lexer {
  148. loc := Location{}
  149. loc.Name = name
  150. return &Lexer{loc, []rune(input), funcs}
  151. }
  152. func (l *Lexer) Lex() (result []Token, rerr []error) {
  153. defer func() {
  154. val := recover()
  155. err, ok := val.(error)
  156. if ok {
  157. rerr = append(rerr, err)
  158. }
  159. }()
  160. return l.lex()
  161. }
  162. func (l *Lexer) lex() (result []Token, rerr []error) {
  163. for !l.IsEof() {
  164. for _, lf := range l.Rules {
  165. tokens, errs := lf(l)
  166. if len(errs) > 0 {
  167. rerr = append(rerr, errs...)
  168. // skip until next whitespace
  169. for !l.IsEof() {
  170. r2 := l.Advance()
  171. if unicode.IsSpace(r2) {
  172. break
  173. }
  174. }
  175. }
  176. for _, token := range tokens {
  177. if token.TokenKind != TokenKindSkip {
  178. result = append(result, token)
  179. }
  180. }
  181. // no progress made, indicates fatal lex error
  182. if l.Index == l.Start {
  183. err := fmt.Errorf("Lex error: %s", l.Location)
  184. rerr = append(rerr, err)
  185. return result, rerr
  186. }
  187. // advance start based on progress
  188. l.Start = l.Index
  189. }
  190. }
  191. return result, rerr
  192. }
  193. func (l *Lexer) Tokenize(kind TokenKind) (toks []Token, errs []error) {
  194. if l.Index == l.Start {
  195. err := fmt.Errorf("Lex error: %s, expected %c", l.Location, kind)
  196. errs = append(errs, err)
  197. return toks, errs
  198. }
  199. str := string(l.Input[l.Start:l.Index])
  200. return []Token{Token{l.Location, kind, str}}, errs
  201. }
  202. func (l *Lexer) LexContains(chars string, kind TokenKind) (toks []Token, errs []error) {
  203. r := l.Peek()
  204. if !strings.ContainsRune(chars, r) {
  205. return toks, errs
  206. }
  207. for !l.IsEof() {
  208. if strings.ContainsRune(chars, r) {
  209. l.Advance()
  210. } else {
  211. return l.Tokenize(kind)
  212. }
  213. }
  214. return l.Tokenize(kind)
  215. }
  216. /*
  217. func LexerRs(input []rune, index *int) (Token, error) {
  218. debug("LexerRs")
  219. SkipWs(input, index)
  220. return LexerWhileRuneOk(input, index, func(r rune) bool {
  221. return r == '\n' || r == '\r' || r == ';'
  222. })
  223. }
  224. func LexerWs(input []rune, index *int) (Token, error) {
  225. debug("LexerWs")
  226. return LexerWhileRuneOk(input, index, func(r rune) bool {
  227. return r == ' ' || r == '\t'
  228. })
  229. }
  230. func LexerWsRs(input []rune, index *int) (Token, error) {
  231. debug("LexerRs")
  232. SkipWs(input, index)
  233. return LexerWhileRuneOk(input, index, func(r rune) bool {
  234. return r == '\n' || r == '\r' || r == ';' || r == ' ' || r == '\t'
  235. })
  236. }
  237. func SkipWs(input []rune, index *int) {
  238. LexerWs(input, index)
  239. }
  240. func SkipRs(input []rune, index *int) {
  241. LexerRs(input, index)
  242. }
  243. func SkipWsRs(input []rune, index *int) {
  244. LexerWsRs(input, index)
  245. }
  246. func LexerComment(input []rune, index *int) (Token, error) {
  247. debug("LexerComment")
  248. start := *index
  249. if !RequireRune(input, index, '#') {
  250. return nil, nil
  251. }
  252. for ; *index < len(input); *index++ {
  253. r := input[*index]
  254. if r == '\n' || r == '\r' {
  255. end := *index
  256. return Comment(string(input[start:end])), nil
  257. }
  258. }
  259. return nil, ErrorFromString("unexpected EOF in comment")
  260. }
  261. func LexerStatement(input []rune, index *int) (Token, error) {
  262. debug("LexerStatement")
  263. SkipWs(input, index)
  264. return LexerAlternative(input, index, LexerCommand, LexerBlock, LexerComment)
  265. }
  266. func LexerParameters(input []rune, index *int) (Token, error) {
  267. debug("LexerParameters")
  268. params := List{}
  269. for {
  270. sep, err := LexerWs(input, index)
  271. if err != nil {
  272. return nil, err
  273. }
  274. if sep == nil {
  275. return params, nil
  276. }
  277. val, err := LexerParameter(input, index)
  278. if err != nil {
  279. return nil, err
  280. }
  281. if val == nil {
  282. return params, nil
  283. }
  284. params = append(params, val)
  285. }
  286. }
  287. func LexerParameter(input []rune, index *int) (Token, error) {
  288. debug("LexerParameter")
  289. funcs := []LexerFunc{LexerLiteral, LexerEvaluation, LexerBlock, LexerGetter}
  290. return LexerAlternative(input, index, funcs...)
  291. }
  292. func LexerOrder(input []rune, index *int) (Token, error) {
  293. debug("LexerOrder")
  294. return LexerAlternative(input, index, LexerLiteral, LexerEvaluation)
  295. }
  296. func LexerCommand(input []rune, index *int) (Token, error) {
  297. debug("LexerCommand")
  298. order, err := LexerOrder(input, index)
  299. if err != nil || order == nil {
  300. return order, err
  301. }
  302. params, err := LexerParameters(input, index)
  303. if err != nil {
  304. return params, err
  305. }
  306. if params == nil {
  307. params = List{}
  308. }
  309. return Command{order, params.(List)}, nil
  310. }
  311. // RequireRune requires a single rune to be present,
// and skips it; however, that rune is discarded.
  313. // Returns true if the rune was found, false if not
  314. func RequireRune(input []rune, index *int, req rune) bool {
  315. if input[*index] == req {
  316. *index++
  317. return true
  318. }
  319. return false
  320. }
  321. func LexerEvaluation(input []rune, index *int) (Token, error) {
  322. debug("LexerEvaluation")
  323. if !RequireRune(input, index, '[') {
  324. return nil, nil
  325. }
  326. res, err := LexerCommand(input, index)
  327. if err != nil {
  328. return nil, err
  329. }
  330. if !RequireRune(input, index, ']') {
  331. print(input[*index])
  332. return nil, ErrorFromString("Expected end of evaluation ]")
  333. }
  334. if res != nil {
  335. res = Evaluation{Command: res.(Command)}
  336. }
  337. return res, nil
  338. }
  339. func LexerBlock(input []rune, index *int) (Token, error) {
  340. debug("LexerBlock")
  341. if !RequireRune(input, index, '{') {
  342. return nil, nil
  343. }
  344. res, err := LexerStatements(input, index)
  345. if err != nil {
  346. return nil, err
  347. }
  348. SkipWsRs(input, index)
  349. if !RequireRune(input, index, '}') {
  350. return nil, ErrorFromString("Expected end of block }")
  351. }
  352. return Block{Statements: res.(List)}, nil
  353. return nil, nil
  354. }
  355. func LexerGetter(input []rune, index *int) (Token, error) {
  356. debug("LexerGetter")
  357. if RequireRune(input, index, '$') {
if input[*index] == '$' { // recursively lex double getters
  359. val, err := LexerGetter(input, index)
  360. if err == nil { // Getter with a getter inside.
  361. return Getter{val}, err
  362. } else {
  363. return nil, err
  364. }
} else { // integer, string or getter name
  366. key, err := LexerLiteral(input, index)
  367. if key == nil {
  368. return nil, ErrorFromString("Expected literal after getter $")
  369. }
  370. if err == nil {
  371. return Getter{key}, nil
  372. }
  373. return nil, err
  374. }
  375. }
  376. return nil, nil
  377. }
  378. func LexerLiteral(input []rune, index *int) (Token, error) {
  379. debug("LexerLiteral")
  380. return LexerAlternative(input, index, LexerWord, LexerString, LexerInteger,
  381. LexerRawString)
  382. }
  383. func IsLetter(r rune) bool {
  384. return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r > rune(128)) ||
  385. r == '_' || r == '/'
  386. }
  387. func IsNumber(r rune) bool {
  388. return (r >= '0' && r <= '9')
  389. }
  390. func LexerWord(input []rune, index *int) (Token, error) {
  391. debug("LexerWord")
  392. // a word consists of an ascii letter or non asci characters, or underscore
  393. // followed by an ascii letter or number, or non ascii characters, or underscore
  394. start := *index
  395. r := input[*index]
  396. if !IsLetter(r) {
  397. return nil, nil
  398. }
  399. for *index++; *index < len(input); *index++ {
  400. r := input[*index]
  401. if !(IsLetter(r) || IsNumber(r)) {
  402. return Word(string(input[start:*index])), nil
  403. }
  404. }
  405. return nil, ErrorFromString("unexpected EOF in string")
  406. }
  407. func next(input []rune, index *int) {
  408. *index++
  409. if *index >= len(input) {
  410. panic(ErrorFromString("Unexpected end of input."))
  411. }
  412. }
  413. func LexerEscape(input []rune, index *int) (Token, error) {
  414. res := ""
  415. if input[*index] != '\\' {
  416. return nil, nil
  417. }
  418. next(input, index)
  419. switch input[*index] {
  420. case 'a':
  421. res += "\a"
  422. case 'b':
  423. res += "\b"
  424. case 'e':
  425. res += "\033"
  426. case 'f':
  427. res += "\f"
  428. case 'n':
  429. res += "\n"
  430. case 'r':
  431. res += "\r"
  432. case 't':
  433. res += "\t"
  434. case '\\':
  435. res += "\\"
  436. case '"':
  437. res += "\""
  438. default:
  439. return nil, ErrorFromString("Unknown escape sequence character")
  440. }
  441. return String(res), nil
  442. }
  443. func LexerString(input []rune, index *int) (Token, error) {
  444. debug("LexerString")
  445. res := ""
  446. ch := input[*index]
  447. if ch != '"' {
  448. return nil, nil
  449. }
  450. *index++
  451. for *index < len(input) {
  452. ch = input[*index]
  453. esc, err := LexerEscape(input, index)
  454. if err != nil {
  455. return nil, err
  456. }
  457. if esc != nil {
  458. res += string(esc.(String))
  459. } else if ch == '"' {
  460. *index++
  461. return String(res), nil
  462. } else {
  463. res += string(ch)
  464. }
  465. *index++
  466. }
  467. return nil, ErrorFromString("Unexpected end of input.")
  468. }
  469. func LexerRawString(input []rune, index *int) (Token, error) {
  470. debug("LexerRawString")
  471. res := ""
  472. ch := input[*index]
  473. if ch != '`' {
  474. return nil, nil
  475. }
  476. *index++
  477. for *index < len(input) {
  478. ch = input[*index]
  479. if ch == '`' {
  480. *index++
  481. return String(res), nil
  482. } else {
  483. res += string(ch)
  484. }
  485. *index++
  486. }
  487. return nil, ErrorFromString("Unexpected end of input.")
  488. }
  489. func LexerInteger(input []rune, index *int) (Token, error) {
  490. debug("LexerInteger")
  491. ch := input[*index]
  492. neg := 1
  493. res := 0
  494. if ch == '-' {
  495. neg = -1
  496. } else if ch == '+' {
  497. // do nothing, ignore + as an integer prefix
  498. } else {
  499. res = int(ch - '0')
  500. if res < 0 || res > 9 { // Not a digit, no integer
  501. return nil, nil
  502. }
  503. }
  504. *index++
  505. for *index < len(input) {
  506. ch = input[*index]
  507. ch -= '0'
  508. if ch < 0 || ch > 9 { // Not a digit, finished
  509. return Int(neg * res), nil
  510. }
  511. res = res * 10
  512. res = res + int(ch)
  513. *index++
  514. }
  515. return nil, ErrorFromString("unexpected EOF in number")
  516. }
  517. */