tokenizer.go

package raku

import (
    "fmt"
    "reflect"
    "runtime"
    "strings"
    "unicode"

    "gitlab.com/beoran/woe/monolog"
)

/* The tokenizer splits up text into tokens without classifying Word tokens. */
type Tokenizer struct {
    CurrentPosition Position // read head: position of the next rune to examine
    LastPosition    Position // start of the token currently being scanned
    Input           []rune
    Output          []*Token
    rule            TokenizerRule
}
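
// A TokenizerRule is a state function: it consumes input, emits tokens and
// returns the next rule to run, or nil when tokenizing is finished. Start()
// drives this state machine. For example, tokenizing the input `foo 1`
// steps through roughly
//
//     TokenizeNormal -> TokenizeWord -> TokenizeNormal -> TokenizeWhitespace ->
//     TokenizeNormal -> TokenizeNumber -> TokenizeNormal -> nil (after TokenEOF)
//
// with each rule emitting zero or more tokens along the way.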
type TokenizerRule func(tkz *Tokenizer) TokenizerRule

func NewTokenizer(input string) *Tokenizer {
    tokenizer := &Tokenizer{}
    tokenizer.Input = []rune(input)
    tokenizer.rule = TokenizeStart
    return tokenizer
}

func (tkz *Tokenizer) Last() Position {
    return tkz.LastPosition
}

func (tkz *Tokenizer) Current() Position {
    return tkz.CurrentPosition
}

func (tkz *Tokenizer) LastPtr() *Position {
    return &tkz.LastPosition
}

func (tkz *Tokenizer) CurrentPtr() *Position {
    return &tkz.CurrentPosition
}

// Emit appends a token of the given type and text, recorded at the current position.
func (tkz *Tokenizer) Emit(t TokenType, v TokenText) {
    tok := &Token{t, v, tkz.Current()}
    tkz.EmitToken(tok)
}

// Error reports a tokenizing error both to the log and as a TokenError token.
func (tkz *Tokenizer) Error(message string, args ...interface{}) {
    value := fmt.Sprintf(message, args...)
    monolog.Error("Tokenize Error: %s", value)
    tkz.Emit(TokenError, TokenText(value))
}

// TokenizeError emits a generic error token and ends tokenizing.
func TokenizeError(tkz *Tokenizer) TokenizerRule {
    tkz.Error("Error")
    return nil
}
func (tkz *Tokenizer) SkipComment() bool {
    if tkz.Peek() == '#' {
        if tkz.Next() == '(' {
            return tkz.SkipNotIn(")")
        }
        return tkz.SkipNotIn("\r\n")
    }
    return true
}

func TokenizeSigil(tkz *Tokenizer) TokenizerRule {
    tkz.Found(TokenType(tkz.Peek()))
    _ = tkz.Next()
    tkz.Advance()
    return TokenizeNormal
}

const tokenDelimiter = " \t\r\n'({[]})"

func TokenizeWord(tkz *Tokenizer) TokenizerRule {
    tkz.SkipNotIn(tokenDelimiter)
    tkz.Found(TokenWord)
    return TokenizeNormal
}

func TokenizeSymbol(tkz *Tokenizer) TokenizerRule {
    tkz.SkipNotIn(tokenDelimiter)
    tkz.Found(TokenSymbol)
    return TokenizeNormal
}

func TokenizeNumber(tkz *Tokenizer) TokenizerRule {
    tkz.SkipNotIn(tokenDelimiter)
    tkz.Found(TokenNumber)
    return TokenizeNormal
}

func TokenizeWhitespace(tkz *Tokenizer) TokenizerRule {
    tkz.SkipWhitespace()
    tkz.Advance()
    return TokenizeNormal
}

func TokenizeComment(tkz *Tokenizer) TokenizerRule {
    if !tkz.SkipComment() {
        tkz.Error("Unterminated comment")
        return TokenizeError
    }
    tkz.Advance()
    return TokenizeNormal
}

func TokenizePunctuator(tkz *Tokenizer) TokenizerRule {
    tkz.Found(TokenType(tkz.Peek()))
    _ = tkz.Next()
    tkz.Advance()
    return TokenizeNormal
}

func TokenizeEOL(tkz *Tokenizer) TokenizerRule {
    tkz.SkipIn("\r\n")
    tkz.Found(TokenEOL)
    return TokenizeNormal
}

func TokenizeOperator(tkz *Tokenizer) TokenizerRule {
    tkz.SkipIn(operator_chars)
    tkz.Found(TokenOperator)
    return TokenizeNormal
}

// tokenizeEscape steps over the rune that follows a backslash so it cannot
// terminate the string; escape sequences are kept verbatim in the token text.
func tokenizeEscape(tkz *Tokenizer) error {
    _ = tkz.Next()
    return nil
}

func TokenizeString(tkz *Tokenizer) TokenizerRule {
    open := tkz.Peek()
    do_escape := open == '"'
    peek := tkz.Next()
    tkz.Advance()
    for ; peek != '\000'; peek = tkz.Next() {
        if do_escape && peek == '\\' {
            if err := tokenizeEscape(tkz); err != nil {
                return TokenizeError
            }
        } else if peek == open {
            tkz.Found(TokenString)
            _ = tkz.Next()
            tkz.Advance()
            return TokenizeNormal
        }
    }
    tkz.Error("Unexpected EOF in string.")
    return nil
}

// TokenizeNumberOrOperator disambiguates a leading '+' or '-': if a digit
// follows it is the sign of a number, otherwise it starts an operator.
func TokenizeNumberOrOperator(tkz *Tokenizer) TokenizerRule {
    if unicode.IsDigit(tkz.Next()) {
        return TokenizeNumber
    }
    _ = tkz.Previous()
    return TokenizeOperator
}

// TokenizeNormal dispatches to a specialized rule based on the next rune,
// or emits TokenEOF and stops at the end of the input.
func TokenizeNormal(tkz *Tokenizer) TokenizerRule {
    peek := tkz.Peek()
    if peek == '#' {
        return TokenizeComment
    } else if strings.ContainsRune(" \t", peek) {
        return TokenizeWhitespace
    } else if strings.ContainsRune(".,;:", peek) {
        return TokenizePunctuator
    } else if strings.ContainsRune("([{}])", peek) {
        return TokenizeSigil
    } else if strings.ContainsRune("$", peek) {
        return TokenizeSymbol
    } else if strings.ContainsRune("\r\n", peek) {
        return TokenizeEOL
    } else if strings.ContainsRune("+-", peek) {
        return TokenizeNumberOrOperator
    } else if strings.ContainsRune("\"`", peek) {
        return TokenizeString
    } else if peek == '\000' {
        tkz.Emit(TokenEOF, "")
        return nil
    } else if unicode.IsLetter(peek) {
        return TokenizeWord
    } else if unicode.IsDigit(peek) {
        return TokenizeNumber
    } else if strings.ContainsRune(operator_chars, peek) {
        return TokenizeOperator
    }
    return TokenizeError
}

func (tkz *Tokenizer) Peek() rune {
    if tkz.Current().Index >= len(tkz.Input) {
        return '\000'
    }
    return tkz.Input[tkz.Current().Index]
}

func (tkz *Tokenizer) PeekNext() rune {
    if (tkz.Current().Index + 1) >= len(tkz.Input) {
        return '\000'
    }
    return tkz.Input[tkz.Current().Index+1]
}

// Next advances the read position by one rune, keeping the row and column
// bookkeeping up to date, and returns the rune now under the cursor.
func (tkz *Tokenizer) Next() rune {
    monolog.Debug("Next: %c %s", tkz.Peek(), tkz.Current())
    if tkz.Peek() == '\n' {
        tkz.CurrentPtr().Column = 0
        tkz.CurrentPtr().Row++
    } else {
        tkz.CurrentPtr().Column++
    }
    tkz.CurrentPtr().Index++
    return tkz.Peek()
}

// Previous steps the read position back by one rune. When stepping back over
// a newline the row is decremented; the exact column is not restored.
func (tkz *Tokenizer) Previous() rune {
    if tkz.Current().Index > 0 {
        tkz.CurrentPtr().Index--
        if tkz.Peek() == '\n' {
            tkz.CurrentPtr().Column = 0
            tkz.CurrentPtr().Row--
        }
    }
    return tkz.Peek()
}

func (tkz *Tokenizer) SkipRune() {
    _ = tkz.Next()
}

// SkipIn skips runes for as long as they are in the given set.
func (tkz *Tokenizer) SkipIn(set string) bool {
    for strings.ContainsRune(set, tkz.Next()) {
        monolog.Debug("SkipIn: %s %c\n", set, tkz.Peek())
        if tkz.Peek() == '\000' {
            return false
        }
    }
    return true
}

// SkipNotIn skips runes until one from the given set is found. It returns
// false if the end of the input is reached first.
func (tkz *Tokenizer) SkipNotIn(set string) bool {
    _ = tkz.Next()
    for c := tkz.Peek(); !strings.ContainsRune(set, c); c = tkz.Next() {
        monolog.Debug("SkipNotIn: %c %s", c, tkz.Current())
        if c == '\000' {
            return false
        }
    }
    return true
}

func (tkz *Tokenizer) SkipWhile(should_skip func(r rune) bool) bool {
    for should_skip(tkz.Peek()) {
        if tkz.Next() == '\000' {
            return false
        }
    }
    return true
}

func (tkz *Tokenizer) SkipWhitespace() {
    tkz.SkipIn(" \t")
}

// Advance marks the current position as the start of the next token.
func (tkz *Tokenizer) Advance() {
    tkz.LastPosition = tkz.Current()
}

// Rewind moves the read position back to the start of the current token.
func (tkz *Tokenizer) Rewind() {
    tkz.CurrentPosition = tkz.Last()
}

// CurrentRuneValue returns the runes scanned since the last Advance.
func (tkz *Tokenizer) CurrentRuneValue() []rune {
    return tkz.Input[tkz.Last().Index:tkz.Current().Index]
}

func (tkz *Tokenizer) CurrentStringValue() string {
    return string(tkz.CurrentRuneValue())
}

// Found emits a token of the given kind with the text scanned since the
// last Advance, then advances past it.
func (tkz *Tokenizer) Found(kind TokenType) {
    tkz.Emit(kind, TokenText(tkz.CurrentStringValue()))
    tkz.Advance()
}

func getFunctionName(fun interface{}) string {
    return runtime.FuncForPC(reflect.ValueOf(fun).Pointer()).Name()
}

// Start runs the tokenizer rules, beginning with the rule installed by
// NewTokenizer, until one of them returns nil, and returns the tokens
// accumulated along the way.
func (tkz *Tokenizer) Start() []*Token {
    rule := tkz.rule
    for rule != nil {
        monolog.Debug("Tokenizer Rule: %s\n", getFunctionName(rule))
        rule = rule(tkz)
    }
    return tkz.Output
}

func (tkz *Tokenizer) TryTokenizing() {
    tokens := tkz.Start()
    for _, token := range tokens {
        monolog.Info("Token %s", token)
    }
}

func (tkz *Tokenizer) EmitToken(token *Token) *Token {
    tkz.Output = append(tkz.Output, token)
    return token
}

// TokenizeStart is the initial rule; it simply hands over to normal tokenizing.
func TokenizeStart(tkz *Tokenizer) TokenizerRule {
    return TokenizeNormal
}

// Tokenize turns an input string into a list of tokens.
func Tokenize(input string) []*Token {
    tokenizer := NewTokenizer(input)
    return tokenizer.Start()
}

func TryTokenizingString(input string) {
    tokens := Tokenize(input)
    for _, token := range tokens {
        monolog.Info("Token %s", token)
    }
}
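
// Sketch of how a further rule would slot into this state machine; it is not
// part of the original file and TokenKeyword is an invented token type used
// only for illustration. A rule scans from the last Advance position, emits
// what it found, and hands control back to TokenizeNormal:
//
//     func TokenizeKeyword(tkz *Tokenizer) TokenizerRule {
//         tkz.SkipNotIn(tokenDelimiter) // consume runes up to the next delimiter
//         tkz.Found(TokenKeyword)       // emit Input[Last:Current] and advance
//         return TokenizeNormal         // resume normal dispatch
//     }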