tokenizer.go

package raku

import (
    "fmt"
    "reflect"
    "runtime"
    "strings"
    "unicode"

    "gitlab.com/beoran/woe/monolog"
)

// tokenDelimiter lists the runes that terminate a word, symbol or number token.
const tokenDelimiter = " \t\r\n'({[]}),;.:"

// operatorChars lists the runes that may appear in an operator token.
const operatorChars = "&|@'^-*%/+=<>~"
// TokenText holds the text of a single token.
type TokenText string

// TokenType identifies the kind of a token. Single-character tokens use the
// character's own code point as their type; named token kinds use negative values.
type TokenType int64

// Position records where in the input a token starts.
type Position struct {
    Index int
    Row int
    Column int
}
const (
    TokenPeriod TokenType = TokenType('.')
    TokenComma TokenType = TokenType(',')
    TokenSemicolon TokenType = TokenType(';')
    TokenColon TokenType = TokenType(':')
    TokenOpenParen TokenType = TokenType('(')
    TokenCloseParen TokenType = TokenType(')')
    TokenOpenBrace TokenType = TokenType('{')
    TokenCloseBrace TokenType = TokenType('}')
    TokenOpenBracket TokenType = TokenType('[')
    TokenCloseBracket TokenType = TokenType(']')
    TokenNone TokenType = 0
    TokenError TokenType = -1
    TokenWord TokenType = -2
    TokenEOL TokenType = -3
    TokenEOF TokenType = -4
    TokenNumber TokenType = -5
    TokenOperator TokenType = -6
    TokenString TokenType = -7
    TokenSymbol TokenType = -8
    TokenFirstKeyword TokenType = -9
    TokenArticle TokenType = -10
    TokenDo TokenType = -11
    TokenEnd TokenType = -12
    TokenDef TokenType = -13
    TokenPreposition TokenType = -14
    TokenVerb TokenType = -15
    TokenNoun TokenType = -16
    TokenAdverb TokenType = -17
    TokenAdjective TokenType = -18
    TokenLastKeyword TokenType = -19
    TokenLast TokenType = -19
)
// Token is a single token: its type, its text, and the position where it was found.
type Token struct {
    TokenType
    TokenText
    Position
}
var tokenTypeMap map[TokenType]string = map[TokenType]string{
    TokenNone: "None",
    TokenError: "Error",
    TokenWord: "Word",
    TokenEOL: "EOL",
    TokenEOF: "EOF",
    TokenNumber: "Number",
    TokenOperator: "Operator",
    TokenString: "String",
    TokenSymbol: "Symbol",
    TokenArticle: "Article",
    TokenPreposition: "Preposition",
    TokenDo: "Do",
    TokenEnd: "End",
    TokenDef: "KeywordDef",
    TokenVerb: "Verb",
    TokenAdjective: "Adjective",
    TokenAdverb: "Adverb",
    TokenNoun: "Noun",
}
// keywordMap maps reserved words to keyword token types; Word tokens can be
// classified against it after tokenizing.
var keywordMap map[string]TokenType = map[string]TokenType{
    "a": TokenArticle,
    "an": TokenArticle,
    "the": TokenArticle,
    "do": TokenDo,
    "begin": TokenDo,
    "then": TokenDo,
    "has": TokenDo,
    "end": TokenEnd,
    "done": TokenEnd,
    "endif": TokenEnd,
    "def": TokenDef,
    "define": TokenDef,
    "aboard": TokenPreposition,
    "about": TokenPreposition,
    "above": TokenPreposition,
    "absent": TokenPreposition,
    "across": TokenPreposition,
    "after": TokenPreposition,
    "against": TokenPreposition,
    "along": TokenPreposition,
    "alongside": TokenPreposition,
    "amid": TokenPreposition,
    "amidst": TokenPreposition,
    "among": TokenPreposition,
    "apropos": TokenPreposition,
    "apud": TokenPreposition,
    "around": TokenPreposition,
    "as": TokenPreposition,
    "astride": TokenPreposition,
    "at": TokenPreposition,
    "atop": TokenPreposition,
    "ontop": TokenPreposition,
    "bar": TokenPreposition,
    "before": TokenPreposition,
    "behind": TokenPreposition,
    "below": TokenPreposition,
    "beneath": TokenPreposition,
    "beside": TokenPreposition,
    "besides": TokenPreposition,
    "between": TokenPreposition,
    "beyond": TokenPreposition,
    "but": TokenPreposition,
    "by": TokenPreposition,
    "chez": TokenPreposition,
    "circa": TokenPreposition,
    "come": TokenPreposition,
    "dehors": TokenPreposition,
    "despite": TokenPreposition,
    "down": TokenPreposition,
    "during": TokenPreposition,
    "except": TokenPreposition,
    "for": TokenPreposition,
    "from": TokenPreposition,
    "in": TokenPreposition,
    "inside": TokenPreposition,
    "into": TokenPreposition,
    "less": TokenPreposition,
    "like": TokenPreposition,
    "minus": TokenPreposition,
    "near": TokenPreposition,
    "nearer": TokenPreposition,
    "nearest": TokenPreposition,
    "notwithstanding": TokenPreposition,
    "of": TokenPreposition,
    "off": TokenPreposition,
    "on": TokenPreposition,
    "onto": TokenPreposition,
    "opposite": TokenPreposition,
    "out": TokenPreposition,
    "outside": TokenPreposition,
    "over": TokenPreposition,
    "pace": TokenPreposition,
    "past": TokenPreposition,
    "per": TokenPreposition,
    "post": TokenPreposition,
    "pre": TokenPreposition,
    "pro": TokenPreposition,
    "qua": TokenPreposition,
    "re": TokenPreposition,
    "sans": TokenPreposition,
    "save": TokenPreposition,
    "short": TokenPreposition,
    "since": TokenPreposition,
    "than": TokenPreposition,
    "through": TokenPreposition,
    "thru": TokenPreposition,
    "throughout": TokenPreposition,
    "to": TokenPreposition,
    "toward": TokenPreposition,
    "towards": TokenPreposition,
    "under": TokenPreposition,
    "underneath": TokenPreposition,
    "unlike": TokenPreposition,
    "until": TokenPreposition,
    "up": TokenPreposition,
    "upon": TokenPreposition,
    "upside": TokenPreposition,
    "versus": TokenPreposition,
    "via": TokenPreposition,
    "vice": TokenPreposition,
    "vis-à-vis": TokenPreposition,
    "with": TokenPreposition,
    "within": TokenPreposition,
    "without": TokenPreposition,
    "worth": TokenPreposition,
}
// sigilMap maps bracket runes to their token types.
var sigilMap map[string]TokenType = map[string]TokenType{
    "[": TokenOpenBracket,
    "{": TokenOpenBrace,
    "(": TokenOpenParen,
    "]": TokenCloseBracket,
    "}": TokenCloseBrace,
    ")": TokenCloseParen,
}
// String returns a human-readable name for the token type.
func (me TokenType) String() string {
    if name, found := tokenTypeMap[me]; found {
        return name
    }
    if (me > 0) && (me < 256) {
        return fmt.Sprintf("Char<%c>", byte(me))
    }
    return fmt.Sprintf("Unknown Token %d", int(me))
}

// String returns a detailed description of the token, including its position.
func (me Token) String() string {
    return fmt.Sprintf("Token: %s >%s< %d %d %d.", me.TokenType, string(me.TokenText), me.Index, me.Row, me.Column)
}

// ShortString returns a brief description of the token.
func (me Token) ShortString() string {
    return fmt.Sprintf("T: %s >%s<", me.TokenType, string(me.TokenText))
}
/* The tokenizer splits the input text into tokens, without classifying Word tokens further. */
type Tokenizer struct {
    CurrentPosition Position
    LastPosition Position
    Input []rune
    Output []*Token
    rule TokenizerRule
}

// A TokenizerRule consumes input from the tokenizer, may emit tokens, and
// returns the next rule to run, or nil to stop tokenizing.
type TokenizerRule func(tkz *Tokenizer) TokenizerRule
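// The function below is an illustrative sketch and is not part of the original
// tokenizer: it shows the shape of a rule. Each rule consumes input through the
// tokenizer, may emit tokens, and returns the next rule to run, or nil to stop.
// This hypothetical rule swallows the rest of the input as a single Word token.
func tokenizeRestAsWordSketch(tkz *Tokenizer) TokenizerRule {
    for tkz.Peek() != '\000' {
        tkz.Next() // consume every remaining rune
    }
    tkz.Found(TokenWord) // emit everything gathered since the last position
    return nil           // returning nil ends the tokenizing loop
}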
// NewTokenizer sets up a Tokenizer for the given input string.
func NewTokenizer(input string) *Tokenizer {
    tokenizer := &Tokenizer{}
    tokenizer.Input = []rune(input)
    tokenizer.rule = TokenizeStart
    return tokenizer
}
func (tkz *Tokenizer) Last() Position {
    return tkz.LastPosition
}

func (tkz *Tokenizer) Current() Position {
    return tkz.CurrentPosition
}

func (tkz *Tokenizer) LastPtr() *Position {
    return &tkz.LastPosition
}

func (tkz *Tokenizer) CurrentPtr() *Position {
    return &tkz.CurrentPosition
}

// Emit appends a token of the given type and text at the current position.
func (tkz *Tokenizer) Emit(t TokenType, v TokenText) {
    tok := &Token{t, v, tkz.Current()}
    tkz.EmitToken(tok)
}
// Error reports a tokenizing error both to the log and as a TokenError token.
func (tkz *Tokenizer) Error(message string, args ...interface{}) {
    value := fmt.Sprintf(message, args...)
    monolog.Error("Tokenize Error: %s", value)
    tkz.Emit(TokenError, TokenText(value))
}

func TokenizeError(tkz *Tokenizer) TokenizerRule {
    tkz.Error("Error")
    return nil
}
// SkipComment skips a comment: either "#( ..." up to the closing parenthesis,
// or "# ..." up to the end of the line. It reports false on unterminated input.
func (tkz *Tokenizer) SkipComment() bool {
    if tkz.Peek() == '#' {
        if tkz.Next() == '(' {
            return tkz.SkipNotIn(")")
        } else {
            return tkz.SkipNotIn("\r\n")
        }
    }
    return true
}
func TokenizeSigil(tkz *Tokenizer) TokenizerRule {
    tkz.Found(TokenType(tkz.Peek()))
    _ = tkz.Next()
    tkz.Advance()
    return TokenizeNormal
}

func TokenizeWord(tkz *Tokenizer) TokenizerRule {
    tkz.SkipNotIn(tokenDelimiter)
    tkz.Found(TokenWord)
    return TokenizeNormal
}

func TokenizeSymbol(tkz *Tokenizer) TokenizerRule {
    tkz.SkipNotIn(tokenDelimiter)
    tkz.Found(TokenSymbol)
    return TokenizeNormal
}

func TokenizeNumber(tkz *Tokenizer) TokenizerRule {
    tkz.SkipNotIn(tokenDelimiter)
    tkz.Found(TokenNumber)
    return TokenizeNormal
}

func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule {
    tkz.SkipWhitespace()
    tkz.Advance()
    return TokenizeNormal
}

func TokenizeComment(tkz *Tokenizer) TokenizerRule {
    if !tkz.SkipComment() {
        tkz.Error("Unterminated comment")
        return TokenizeError
    }
    tkz.Advance()
    return TokenizeNormal
}

func TokenizePunctuator(tkz *Tokenizer) TokenizerRule {
    tkz.Found(TokenType(tkz.Peek()))
    _ = tkz.Next()
    tkz.Advance()
    return TokenizeNormal
}

func TokenizeEOL(tkz *Tokenizer) TokenizerRule {
    tkz.SkipIn("\r\n")
    tkz.Found(TokenEOL)
    return TokenizeNormal
}

func TokenizeOperator(tkz *Tokenizer) TokenizerRule {
    tkz.SkipIn(operatorChars)
    tkz.SkipCurrentNotIn(tokenDelimiter)
    tkz.Found(TokenOperator)
    return TokenizeNormal
}
// tokenizeEscape skips over an escape sequence in a string literal, so that an
// escaped quote does not terminate the string.
func tokenizeEscape(tkz *Tokenizer) error {
    _ = tkz.Next()
    return nil
}

// TokenizeString tokenizes a double-quoted string, in which backslash escapes
// are honored, or a backquoted string, in which they are not.
func TokenizeString(tkz *Tokenizer) TokenizerRule {
    open := tkz.Peek()
    doEscape := open == '"'
    peek := tkz.Next()
    tkz.Advance()
    for ; peek != '\000'; peek = tkz.Next() {
        if doEscape && peek == '\\' {
            if err := tokenizeEscape(tkz); err != nil {
                return TokenizeError
            }
        } else if peek == open {
            tkz.Found(TokenString)
            _ = tkz.Next()
            tkz.Advance()
            return TokenizeNormal
        }
    }
    tkz.Error("Unexpected EOF in string.")
    return nil
}
// TokenizeNumberOrOperator disambiguates a leading '+' or '-': if a digit
// follows it is part of a number, otherwise it starts an operator.
func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule {
    if unicode.IsDigit(tkz.Next()) {
        return TokenizeNumber
    } else {
        _ = tkz.Previous()
        return TokenizeOperator
    }
}
// TokenizeEscapedNewline skips a backslash-escaped line ending, which lets a
// logical line continue on the next physical line.
func TokenizeEscapedNewline(tkz *Tokenizer) TokenizerRule {
    tkz.SkipWhitespace()
    peek := tkz.Peek()
    if strings.ContainsRune("\n\r", peek) {
        tkz.SkipIn("\n\r")
    } else {
        tkz.Error("Stray backslash character.")
    }
    // Advance so the skipped backslash and newline do not end up in the next token.
    tkz.Advance()
    return TokenizeNormal
}
// TokenizeNormal is the main dispatch rule: it looks at the next rune and
// selects the rule that can tokenize it.
func TokenizeNormal(tkz *Tokenizer) TokenizerRule {
    peek := tkz.Peek()
    if peek == '#' {
        return TokenizeComment
    } else if strings.ContainsRune(" \t", peek) {
        return TokenizeWhitespace
    } else if strings.ContainsRune(".,;:", peek) {
        return TokenizePunctuator
    } else if strings.ContainsRune("([{}])", peek) {
        return TokenizeSigil
    } else if strings.ContainsRune("$", peek) {
        return TokenizeSymbol
    } else if strings.ContainsRune("\r\n", peek) {
        return TokenizeEOL
    } else if strings.ContainsRune("+-", peek) {
        return TokenizeNumberOrOperator
    } else if strings.ContainsRune("\"`", peek) {
        return TokenizeString
    } else if strings.ContainsRune("\\", peek) {
        return TokenizeEscapedNewline
    } else if peek == '\000' {
        tkz.Emit(TokenEOF, "")
        return nil
    } else if unicode.IsLetter(peek) {
        return TokenizeWord
    } else if unicode.IsDigit(peek) {
        return TokenizeNumber
    } else if strings.ContainsRune(operatorChars, peek) {
        return TokenizeOperator
    } else {
        return TokenizeError
    }
}
// Peek returns the rune at the current position, or '\000' at the end of the input.
func (tkz *Tokenizer) Peek() rune {
    if tkz.Current().Index >= len(tkz.Input) {
        return '\000'
    }
    return tkz.Input[tkz.Current().Index]
}

// PeekNext returns the rune after the current position, or '\000' at the end of the input.
func (tkz *Tokenizer) PeekNext() rune {
    if (tkz.Current().Index + 1) >= len(tkz.Input) {
        return '\000'
    }
    return tkz.Input[tkz.Current().Index+1]
}
// Next advances the current position by one rune, keeping the row and column
// up to date, and returns the rune at the new position.
func (tkz *Tokenizer) Next() rune {
    monolog.Debug("Next: %c %s", tkz.Peek(), tkz.Current())
    if tkz.Peek() == '\n' {
        tkz.CurrentPtr().Column = 0
        tkz.CurrentPtr().Row++
    } else {
        tkz.CurrentPtr().Column++
    }
    tkz.CurrentPtr().Index++
    return tkz.Peek()
}

// Previous moves the current position back by one rune and returns the rune there.
func (tkz *Tokenizer) Previous() rune {
    if tkz.Current().Index > 0 {
        tkz.CurrentPtr().Index--
        if tkz.Peek() == '\n' {
            tkz.CurrentPtr().Column = 0
            tkz.CurrentPtr().Row--
        }
    }
    return tkz.Peek()
}
// SkipRune skips a single rune.
func (tkz *Tokenizer) SkipRune() {
    _ = tkz.Next()
}

// SkipIn advances past the current rune and keeps advancing as long as the
// current rune is in the given set. It reports false if the end of the input is reached.
func (tkz *Tokenizer) SkipIn(set string) bool {
    for strings.ContainsRune(set, tkz.Next()) {
        monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek())
        if tkz.Peek() == '\000' {
            return false
        }
    }
    return true
}

// SkipCurrentNotIn advances from the current rune until a rune in the given
// set is found. It reports false if the end of the input is reached first.
func (tkz *Tokenizer) SkipCurrentNotIn(set string) bool {
    for c := tkz.Peek(); !strings.ContainsRune(set, c); c = tkz.Next() {
        monolog.Debug("SkipNotIn: %c %s", c, tkz.Current())
        if c == '\000' {
            return false
        }
    }
    return true
}

// SkipNotIn advances past the current rune and then skips until a rune in the
// given set is found. It reports false if the end of the input is reached first.
func (tkz *Tokenizer) SkipNotIn(set string) bool {
    _ = tkz.Next()
    return tkz.SkipCurrentNotIn(set)
}

// SkipWhile skips runes as long as shouldSkip returns true for the current rune.
func (tkz *Tokenizer) SkipWhile(shouldSkip func(r rune) bool) bool {
    for shouldSkip(tkz.Peek()) {
        if tkz.Next() == '\000' {
            return false
        }
    }
    return true
}
// SkipWhitespace skips spaces and tabs.
func (tkz *Tokenizer) SkipWhitespace() {
    tkz.SkipIn(" \t")
}

// Advance moves the last position up to the current position, marking the
// start of the next token.
func (tkz *Tokenizer) Advance() {
    (*tkz.LastPtr()) = tkz.Current()
}

// Rewind moves the current position back to the last position.
func (tkz *Tokenizer) Rewind() {
    (*tkz.CurrentPtr()) = tkz.Last()
}

// CurrentRuneValue returns the runes between the last and the current position.
func (tkz *Tokenizer) CurrentRuneValue() []rune {
    return tkz.Input[tkz.Last().Index:tkz.Current().Index]
}

// CurrentStringValue returns the text between the last and the current position.
func (tkz *Tokenizer) CurrentStringValue() string {
    return string(tkz.CurrentRuneValue())
}

// Found emits a token of the given kind with the text gathered since the last
// position, then advances past it.
func (tkz *Tokenizer) Found(kind TokenType) {
    tkz.Emit(kind, TokenText(tkz.CurrentStringValue()))
    tkz.Advance()
}
// getFunctionName reports the name of the given function, for debug logging.
func getFunctionName(fun interface{}) string {
    return runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name()
}

// Start runs the tokenizer rules until one of them returns nil, then returns
// the tokens that were emitted.
func (tkz *Tokenizer) Start() []*Token {
    rule := TokenizeNormal
    for rule != nil {
        monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule))
        rule = rule(tkz)
    }
    return tkz.Output
}
// TryTokenizing runs the tokenizer and logs every token it produced.
func (tkz *Tokenizer) TryTokenizing() {
    tokens := tkz.Start()
    for _, token := range tokens {
        monolog.Info("Token %s", token)
    }
}
// EmitToken appends the given token to the output.
func (tkz *Tokenizer) EmitToken(token *Token) *Token {
    tkz.Output = append(tkz.Output, token)
    return token
}

// TokenizeStart is the initial rule; it hands over to TokenizeNormal.
func TokenizeStart(tkz *Tokenizer) TokenizerRule {
    return TokenizeNormal
}
// Tokenize tokenizes the given input string and returns the resulting tokens.
func Tokenize(input string) []*Token {
    tokenizer := NewTokenizer(input)
    return tokenizer.Start()
}

// TryTokenizingString tokenizes the input and logs every token it produced.
func TryTokenizingString(input string) {
    tokens := Tokenize(input)
    for _, token := range tokens {
        monolog.Info("Token %s", token)
    }
}
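
// The function below is a usage sketch added for illustration and is not part
// of the original file: it tokenizes a small, hypothetical input and prints a
// short form of every token. The exact token stream depends on the rules above.
func exampleTokenizeUsage() {
    for _, token := range Tokenize("def greet the world do sing \"hello\" end\n") {
        fmt.Println(token.ShortString())
    }
}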