package main import ( "fmt" "strings" "unicode" ) // Ucalgary stntax: // // Grammar Structure // // The format for a context free grammar is as follows: // // Any reference to "punctuation" when describing legal identifiers is restricted // to certain pieces of punctuation. In no particular order, here are the // punctuation characters that are legal: ~ ! @ # * ( ) _ + ' ; : / ? // // Terminals are strings which either start with a lowercase letter, a number, // or a piece of punctuation, followed by any amount of letters, numbers and // punctuation. i.e. a terminal matches the regular expression // [{punctuation}a-z0-9][{punctuation}a-zA-Z0-9]*. Examples are id + begin end // plus 9 8' 7+? '' .... // // Nonterminals start with uppercase letters, followed by any number of legal // characters. i.e. a nonterminal matches the regular expression // [A-Z][{punctuation}a-zA-Z0-9]*. Examples are S A EXPR TERM~3 Stop .... // // It is assumed that the start symbol is the first nonterminal whose productions // are given. // // The productions associated with a nonterminal are indicated as // head -> RHS1 | RHS2 .... | RHSn. // where the alternative righthand sides of the productions are separated by // a slash, "|", and terminated by a period, ".". // // The RHS of a production is a sequence of terminals or nonterminals separated // by spaces. For example: Expr add Expr. // // C-style comments are allowed, e.g. // /* this is a comment possibly spanning many lines */ // // Empty grammars // The grammar epsilon or ε IS EMPTY. // For any nonterminal epsilon is a valid grammar. // This represents a grammar with no productions. // // Contrary to this, ll1 only allows the _ punctuation in identifiers. // // Location of the lexer or of a token type Location struct { //Name is the name of the file or input. Name string //Current index in the input buffer of the token, or of the lex step. Index int //Start index in the input buffer at which the token begins. Start int //Line the lexer is at or on which a token begins. Line int //Col is the column in the line on which the token begins. Col int //Index in the input buffer from which the token begins. } func (l Location) String() string { return fmt.Sprintf("%s:%d:%d", l.Name, l.Line, l.Col) } type TokenKind rune const TokenKindSkip = TokenKind(-2) type Token struct { Location TokenKind Text string } // Advance moves to the next rune in input location, updating the // location's Index, Line and Col. // Returns the rune found, or -1 if the end of the buffer has been reached. func (l *Location) Advance(input []rune) rune { if l.Index >= len(input) { return -1 } r := input[l.Index] if (r == '\n' && l.Index > 0 && input[l.Index] != '\r') || (r == '\r') { l.Line++ } if (r == '\n') || (r == '\r') { l.Col = 0 } l.Col++ l.Index++ return r } type Lexer struct { Location Input []rune Rules []LexerFunc } type LexerFuncs map[string]LexerFunc func (l *Lexer) Advance() rune { return l.Location.Advance(l.Input) } func (l Lexer) IsEof() bool { return l.Index >= len(l.Input) } func (l Lexer) Peek() rune { if l.IsEof() { return -1 } return l.Input[l.Location.Index] } // LexerFunc is a Lexer function. // It Lexes the input buffer starting from lex.Index, which must be // guaranteed by the caller to be non negative. // The lexerfunc must make progress if it parses a token or returns an error. // It should NOT make progress if it does not match. // It should return as follows: // * If the Lexer function matched what it is intended to lex // it should return the lexed token(s), nil, and lex.Start points to the start // of the token, and lex.Index should be moved to point right after the lexed // part of the string. To indicate that the lexer should skip the parsed token, // set it's token kind to TokenKindSkip // * If the Lexer function did not match what it is intended to lex // it should return nil, nil, and lex.Index should be unchanged. // * If the Lexer function did match what it is intended to lex // but there is a Lexer error, it should return empty slice, error slice, // and l.Index should be set to the error's location. type LexerFunc func(lex *Lexer) ([]Token, []error) var Debug = false func debug(msg string) { if Debug { print(msg) } } func (l Lexer) WhileOk(ok func(r rune, l Lexer) bool) (string, error) { l.Start = l.Index now := 0 for !l.IsEof() { r := l.Advance() if !ok(r, l) { if now == 0 { return "", nil } return string(l.Input[l.Start:l.Index]), nil } now++ } return "", fmt.Errorf("unexpected EOF: >" + string(l.Input[l.Start:l.Index]) + "<") } func NewLexerFromString(input, name string, funcs []LexerFunc) *Lexer { loc := Location{} loc.Name = name return &Lexer{loc, []rune(input), funcs} } func (l *Lexer) Lex() (result []Token, rerr []error) { defer func() { val := recover() err, ok := val.(error) if ok { rerr = append(rerr, err) } }() return l.lex() } func (l *Lexer) lex() (result []Token, rerr []error) { for !l.IsEof() { for _, lf := range l.Rules { tokens, errs := lf(l) if len(errs) > 0 { rerr = append(rerr, errs...) // skip until next whitespace for !l.IsEof() { r2 := l.Advance() if unicode.IsSpace(r2) { break } } } for _, token := range tokens { if token.TokenKind != TokenKindSkip { result = append(result, token) } } // no progress made, indicates fatal lex error if l.Index == l.Start { err := fmt.Errorf("Lex error: %s", l.Location) rerr = append(rerr, err) return result, rerr } // advance start based on progress l.Start = l.Index } } return result, rerr } func (l *Lexer) Tokenize(kind TokenKind) (toks []Token, errs []error) { if l.Index == l.Start { err := fmt.Errorf("Lex error: %s, expected %c", l.Location, kind) errs = append(errs, err) return toks, errs } str := string(l.Input[l.Start:l.Index]) return []Token{Token{l.Location, kind, str}}, errs } func (l *Lexer) LexContains(chars string, kind TokenKind) (toks []Token, errs []error) { r := l.Peek() if !strings.ContainsRune(chars, r) { return toks, errs } for !l.IsEof() { if strings.ContainsRune(chars, r) { l.Advance() } else { return l.Tokenize(kind) } } return l.Tokenize(kind) } /* func LexerRs(input []rune, index *int) (Token, error) { debug("LexerRs") SkipWs(input, index) return LexerWhileRuneOk(input, index, func(r rune) bool { return r == '\n' || r == '\r' || r == ';' }) } func LexerWs(input []rune, index *int) (Token, error) { debug("LexerWs") return LexerWhileRuneOk(input, index, func(r rune) bool { return r == ' ' || r == '\t' }) } func LexerWsRs(input []rune, index *int) (Token, error) { debug("LexerRs") SkipWs(input, index) return LexerWhileRuneOk(input, index, func(r rune) bool { return r == '\n' || r == '\r' || r == ';' || r == ' ' || r == '\t' }) } func SkipWs(input []rune, index *int) { LexerWs(input, index) } func SkipRs(input []rune, index *int) { LexerRs(input, index) } func SkipWsRs(input []rune, index *int) { LexerWsRs(input, index) } func LexerComment(input []rune, index *int) (Token, error) { debug("LexerComment") start := *index if !RequireRune(input, index, '#') { return nil, nil } for ; *index < len(input); *index++ { r := input[*index] if r == '\n' || r == '\r' { end := *index return Comment(string(input[start:end])), nil } } return nil, ErrorFromString("unexpected EOF in comment") } func LexerStatement(input []rune, index *int) (Token, error) { debug("LexerStatement") SkipWs(input, index) return LexerAlternative(input, index, LexerCommand, LexerBlock, LexerComment) } func LexerParameters(input []rune, index *int) (Token, error) { debug("LexerParameters") params := List{} for { sep, err := LexerWs(input, index) if err != nil { return nil, err } if sep == nil { return params, nil } val, err := LexerParameter(input, index) if err != nil { return nil, err } if val == nil { return params, nil } params = append(params, val) } } func LexerParameter(input []rune, index *int) (Token, error) { debug("LexerParameter") funcs := []LexerFunc{LexerLiteral, LexerEvaluation, LexerBlock, LexerGetter} return LexerAlternative(input, index, funcs...) } func LexerOrder(input []rune, index *int) (Token, error) { debug("LexerOrder") return LexerAlternative(input, index, LexerLiteral, LexerEvaluation) } func LexerCommand(input []rune, index *int) (Token, error) { debug("LexerCommand") order, err := LexerOrder(input, index) if err != nil || order == nil { return order, err } params, err := LexerParameters(input, index) if err != nil { return params, err } if params == nil { params = List{} } return Command{order, params.(List)}, nil } // RequireRune requires a single rune to be present, // and skips it, however that rune is discared. // Returns true if the rune was found, false if not func RequireRune(input []rune, index *int, req rune) bool { if input[*index] == req { *index++ return true } return false } func LexerEvaluation(input []rune, index *int) (Token, error) { debug("LexerEvaluation") if !RequireRune(input, index, '[') { return nil, nil } res, err := LexerCommand(input, index) if err != nil { return nil, err } if !RequireRune(input, index, ']') { print(input[*index]) return nil, ErrorFromString("Expected end of evaluation ]") } if res != nil { res = Evaluation{Command: res.(Command)} } return res, nil } func LexerBlock(input []rune, index *int) (Token, error) { debug("LexerBlock") if !RequireRune(input, index, '{') { return nil, nil } res, err := LexerStatements(input, index) if err != nil { return nil, err } SkipWsRs(input, index) if !RequireRune(input, index, '}') { return nil, ErrorFromString("Expected end of block }") } return Block{Statements: res.(List)}, nil return nil, nil } func LexerGetter(input []rune, index *int) (Token, error) { debug("LexerGetter") if RequireRune(input, index, '$') { if input[*index] == '$' { // recusively Lexer double getters val, err := LexerGetter(input, index) if err == nil { // Getter with a getter inside. return Getter{val}, err } else { return nil, err } } else { // integer, sring or getter name key, err := LexerLiteral(input, index) if key == nil { return nil, ErrorFromString("Expected literal after getter $") } if err == nil { return Getter{key}, nil } return nil, err } } return nil, nil } func LexerLiteral(input []rune, index *int) (Token, error) { debug("LexerLiteral") return LexerAlternative(input, index, LexerWord, LexerString, LexerInteger, LexerRawString) } func IsLetter(r rune) bool { return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r > rune(128)) || r == '_' || r == '/' } func IsNumber(r rune) bool { return (r >= '0' && r <= '9') } func LexerWord(input []rune, index *int) (Token, error) { debug("LexerWord") // a word consists of an ascii letter or non asci characters, or underscore // followed by an ascii letter or number, or non ascii characters, or underscore start := *index r := input[*index] if !IsLetter(r) { return nil, nil } for *index++; *index < len(input); *index++ { r := input[*index] if !(IsLetter(r) || IsNumber(r)) { return Word(string(input[start:*index])), nil } } return nil, ErrorFromString("unexpected EOF in string") } func next(input []rune, index *int) { *index++ if *index >= len(input) { panic(ErrorFromString("Unexpected end of input.")) } } func LexerEscape(input []rune, index *int) (Token, error) { res := "" if input[*index] != '\\' { return nil, nil } next(input, index) switch input[*index] { case 'a': res += "\a" case 'b': res += "\b" case 'e': res += "\033" case 'f': res += "\f" case 'n': res += "\n" case 'r': res += "\r" case 't': res += "\t" case '\\': res += "\\" case '"': res += "\"" default: return nil, ErrorFromString("Unknown escape sequence character") } return String(res), nil } func LexerString(input []rune, index *int) (Token, error) { debug("LexerString") res := "" ch := input[*index] if ch != '"' { return nil, nil } *index++ for *index < len(input) { ch = input[*index] esc, err := LexerEscape(input, index) if err != nil { return nil, err } if esc != nil { res += string(esc.(String)) } else if ch == '"' { *index++ return String(res), nil } else { res += string(ch) } *index++ } return nil, ErrorFromString("Unexpected end of input.") } func LexerRawString(input []rune, index *int) (Token, error) { debug("LexerRawString") res := "" ch := input[*index] if ch != '`' { return nil, nil } *index++ for *index < len(input) { ch = input[*index] if ch == '`' { *index++ return String(res), nil } else { res += string(ch) } *index++ } return nil, ErrorFromString("Unexpected end of input.") } func LexerInteger(input []rune, index *int) (Token, error) { debug("LexerInteger") ch := input[*index] neg := 1 res := 0 if ch == '-' { neg = -1 } else if ch == '+' { // do nothing, ignore + as an integer prefix } else { res = int(ch - '0') if res < 0 || res > 9 { // Not a digit, no integer return nil, nil } } *index++ for *index < len(input) { ch = input[*index] ch -= '0' if ch < 0 || ch > 9 { // Not a digit, finished return Int(neg * res), nil } res = res * 10 res = res + int(ch) *index++ } return nil, ErrorFromString("unexpected EOF in number") } */