// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

// A lexical scanner for Starlark.

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math/big"
	"os"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A Token represents a Starlark lexical token.
type Token int8

// The token kinds.
//
// The relative order of these constants is significant: GoString treats
// the contiguous range PLUS..STARSTAR as punctuation, and tokenNames is
// an array indexed by Token value.
const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	maxToken // sentinel: one greater than the last token value above
)

// String returns the name of the token as recorded in tokenNames.
// Punctuation is returned unquoted; see GoString for the quoted form.
func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
109*4947cdc7SCole Faustfunc (tok Token) GoString() string { 110*4947cdc7SCole Faust if tok >= PLUS && tok <= STARSTAR { 111*4947cdc7SCole Faust return "'" + tokenNames[tok] + "'" 112*4947cdc7SCole Faust } 113*4947cdc7SCole Faust return tokenNames[tok] 114*4947cdc7SCole Faust} 115*4947cdc7SCole Faust 116*4947cdc7SCole Faustvar tokenNames = [...]string{ 117*4947cdc7SCole Faust ILLEGAL: "illegal token", 118*4947cdc7SCole Faust EOF: "end of file", 119*4947cdc7SCole Faust NEWLINE: "newline", 120*4947cdc7SCole Faust INDENT: "indent", 121*4947cdc7SCole Faust OUTDENT: "outdent", 122*4947cdc7SCole Faust IDENT: "identifier", 123*4947cdc7SCole Faust INT: "int literal", 124*4947cdc7SCole Faust FLOAT: "float literal", 125*4947cdc7SCole Faust STRING: "string literal", 126*4947cdc7SCole Faust PLUS: "+", 127*4947cdc7SCole Faust MINUS: "-", 128*4947cdc7SCole Faust STAR: "*", 129*4947cdc7SCole Faust SLASH: "/", 130*4947cdc7SCole Faust SLASHSLASH: "//", 131*4947cdc7SCole Faust PERCENT: "%", 132*4947cdc7SCole Faust AMP: "&", 133*4947cdc7SCole Faust PIPE: "|", 134*4947cdc7SCole Faust CIRCUMFLEX: "^", 135*4947cdc7SCole Faust LTLT: "<<", 136*4947cdc7SCole Faust GTGT: ">>", 137*4947cdc7SCole Faust TILDE: "~", 138*4947cdc7SCole Faust DOT: ".", 139*4947cdc7SCole Faust COMMA: ",", 140*4947cdc7SCole Faust EQ: "=", 141*4947cdc7SCole Faust SEMI: ";", 142*4947cdc7SCole Faust COLON: ":", 143*4947cdc7SCole Faust LPAREN: "(", 144*4947cdc7SCole Faust RPAREN: ")", 145*4947cdc7SCole Faust LBRACK: "[", 146*4947cdc7SCole Faust RBRACK: "]", 147*4947cdc7SCole Faust LBRACE: "{", 148*4947cdc7SCole Faust RBRACE: "}", 149*4947cdc7SCole Faust LT: "<", 150*4947cdc7SCole Faust GT: ">", 151*4947cdc7SCole Faust GE: ">=", 152*4947cdc7SCole Faust LE: "<=", 153*4947cdc7SCole Faust EQL: "==", 154*4947cdc7SCole Faust NEQ: "!=", 155*4947cdc7SCole Faust PLUS_EQ: "+=", 156*4947cdc7SCole Faust MINUS_EQ: "-=", 157*4947cdc7SCole Faust STAR_EQ: "*=", 158*4947cdc7SCole Faust SLASH_EQ: "/=", 159*4947cdc7SCole Faust 
SLASHSLASH_EQ: "//=", 160*4947cdc7SCole Faust PERCENT_EQ: "%=", 161*4947cdc7SCole Faust AMP_EQ: "&=", 162*4947cdc7SCole Faust PIPE_EQ: "|=", 163*4947cdc7SCole Faust CIRCUMFLEX_EQ: "^=", 164*4947cdc7SCole Faust LTLT_EQ: "<<=", 165*4947cdc7SCole Faust GTGT_EQ: ">>=", 166*4947cdc7SCole Faust STARSTAR: "**", 167*4947cdc7SCole Faust AND: "and", 168*4947cdc7SCole Faust BREAK: "break", 169*4947cdc7SCole Faust CONTINUE: "continue", 170*4947cdc7SCole Faust DEF: "def", 171*4947cdc7SCole Faust ELIF: "elif", 172*4947cdc7SCole Faust ELSE: "else", 173*4947cdc7SCole Faust FOR: "for", 174*4947cdc7SCole Faust IF: "if", 175*4947cdc7SCole Faust IN: "in", 176*4947cdc7SCole Faust LAMBDA: "lambda", 177*4947cdc7SCole Faust LOAD: "load", 178*4947cdc7SCole Faust NOT: "not", 179*4947cdc7SCole Faust NOT_IN: "not in", 180*4947cdc7SCole Faust OR: "or", 181*4947cdc7SCole Faust PASS: "pass", 182*4947cdc7SCole Faust RETURN: "return", 183*4947cdc7SCole Faust WHILE: "while", 184*4947cdc7SCole Faust} 185*4947cdc7SCole Faust 186*4947cdc7SCole Faust// A FilePortion describes the content of a portion of a file. 187*4947cdc7SCole Faust// Callers may provide a FilePortion for the src argument of Parse 188*4947cdc7SCole Faust// when the desired initial line and column numbers are not (1, 1), 189*4947cdc7SCole Faust// such as when an expression is parsed from within larger file. 190*4947cdc7SCole Fausttype FilePortion struct { 191*4947cdc7SCole Faust Content []byte 192*4947cdc7SCole Faust FirstLine, FirstCol int32 193*4947cdc7SCole Faust} 194*4947cdc7SCole Faust 195*4947cdc7SCole Faust// A Position describes the location of a rune of input. 
type Position struct {
	file *string // name of the file; held by pointer to keep Position small
	Line int32   // line number, counting from 1; 0 means the line is unknown
	Col  int32   // column number in runes, counting from 1; 0 means unknown
}

// IsValid reports whether the position is valid,
// that is, whether it refers to a file.
func (p Position) IsValid() bool {
	return p.file != nil
}

// Filename returns the name of the file containing this position,
// or "<invalid>" if the position has no file.
func (p Position) Filename() string {
	if p.file == nil {
		return "<invalid>"
	}
	return *p.file
}

// MakePosition returns a Position built from the given file name
// pointer, line, and column.
func MakePosition(file *string, line, col int32) Position {
	return Position{file: file, Line: line, Col: col}
}

// add returns the position at the end of s, assuming it starts at p.
217*4947cdc7SCole Faustfunc (p Position) add(s string) Position { 218*4947cdc7SCole Faust if n := strings.Count(s, "\n"); n > 0 { 219*4947cdc7SCole Faust p.Line += int32(n) 220*4947cdc7SCole Faust s = s[strings.LastIndex(s, "\n")+1:] 221*4947cdc7SCole Faust p.Col = 1 222*4947cdc7SCole Faust } 223*4947cdc7SCole Faust p.Col += int32(utf8.RuneCountInString(s)) 224*4947cdc7SCole Faust return p 225*4947cdc7SCole Faust} 226*4947cdc7SCole Faust 227*4947cdc7SCole Faustfunc (p Position) String() string { 228*4947cdc7SCole Faust file := p.Filename() 229*4947cdc7SCole Faust if p.Line > 0 { 230*4947cdc7SCole Faust if p.Col > 0 { 231*4947cdc7SCole Faust return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col) 232*4947cdc7SCole Faust } 233*4947cdc7SCole Faust return fmt.Sprintf("%s:%d", file, p.Line) 234*4947cdc7SCole Faust } 235*4947cdc7SCole Faust return file 236*4947cdc7SCole Faust} 237*4947cdc7SCole Faust 238*4947cdc7SCole Faustfunc (p Position) isBefore(q Position) bool { 239*4947cdc7SCole Faust if p.Line != q.Line { 240*4947cdc7SCole Faust return p.Line < q.Line 241*4947cdc7SCole Faust } 242*4947cdc7SCole Faust return p.Col < q.Col 243*4947cdc7SCole Faust} 244*4947cdc7SCole Faust 245*4947cdc7SCole Faust// An scanner represents a single input file being parsed. 
246*4947cdc7SCole Fausttype scanner struct { 247*4947cdc7SCole Faust rest []byte // rest of input (in REPL, a line of input) 248*4947cdc7SCole Faust token []byte // token being scanned 249*4947cdc7SCole Faust pos Position // current input position 250*4947cdc7SCole Faust depth int // nesting of [ ] { } ( ) 251*4947cdc7SCole Faust indentstk []int // stack of indentation levels 252*4947cdc7SCole Faust dents int // number of saved INDENT (>0) or OUTDENT (<0) tokens to return 253*4947cdc7SCole Faust lineStart bool // after NEWLINE; convert spaces to indentation tokens 254*4947cdc7SCole Faust keepComments bool // accumulate comments in slice 255*4947cdc7SCole Faust lineComments []Comment // list of full line comments (if keepComments) 256*4947cdc7SCole Faust suffixComments []Comment // list of suffix comments (if keepComments) 257*4947cdc7SCole Faust 258*4947cdc7SCole Faust readline func() ([]byte, error) // read next line of input (REPL only) 259*4947cdc7SCole Faust} 260*4947cdc7SCole Faust 261*4947cdc7SCole Faustfunc newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) { 262*4947cdc7SCole Faust var firstLine, firstCol int32 = 1, 1 263*4947cdc7SCole Faust if portion, ok := src.(FilePortion); ok { 264*4947cdc7SCole Faust firstLine, firstCol = portion.FirstLine, portion.FirstCol 265*4947cdc7SCole Faust } 266*4947cdc7SCole Faust sc := &scanner{ 267*4947cdc7SCole Faust pos: MakePosition(&filename, firstLine, firstCol), 268*4947cdc7SCole Faust indentstk: make([]int, 1, 10), // []int{0} + spare capacity 269*4947cdc7SCole Faust lineStart: true, 270*4947cdc7SCole Faust keepComments: keepComments, 271*4947cdc7SCole Faust } 272*4947cdc7SCole Faust sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only 273*4947cdc7SCole Faust if sc.readline == nil { 274*4947cdc7SCole Faust data, err := readSource(filename, src) 275*4947cdc7SCole Faust if err != nil { 276*4947cdc7SCole Faust return nil, err 277*4947cdc7SCole Faust } 
278*4947cdc7SCole Faust sc.rest = data 279*4947cdc7SCole Faust } 280*4947cdc7SCole Faust return sc, nil 281*4947cdc7SCole Faust} 282*4947cdc7SCole Faust 283*4947cdc7SCole Faustfunc readSource(filename string, src interface{}) ([]byte, error) { 284*4947cdc7SCole Faust switch src := src.(type) { 285*4947cdc7SCole Faust case string: 286*4947cdc7SCole Faust return []byte(src), nil 287*4947cdc7SCole Faust case []byte: 288*4947cdc7SCole Faust return src, nil 289*4947cdc7SCole Faust case io.Reader: 290*4947cdc7SCole Faust data, err := ioutil.ReadAll(src) 291*4947cdc7SCole Faust if err != nil { 292*4947cdc7SCole Faust err = &os.PathError{Op: "read", Path: filename, Err: err} 293*4947cdc7SCole Faust return nil, err 294*4947cdc7SCole Faust } 295*4947cdc7SCole Faust return data, nil 296*4947cdc7SCole Faust case FilePortion: 297*4947cdc7SCole Faust return src.Content, nil 298*4947cdc7SCole Faust case nil: 299*4947cdc7SCole Faust return ioutil.ReadFile(filename) 300*4947cdc7SCole Faust default: 301*4947cdc7SCole Faust return nil, fmt.Errorf("invalid source: %T", src) 302*4947cdc7SCole Faust } 303*4947cdc7SCole Faust} 304*4947cdc7SCole Faust 305*4947cdc7SCole Faust// An Error describes the nature and position of a scanner or parser error. 306*4947cdc7SCole Fausttype Error struct { 307*4947cdc7SCole Faust Pos Position 308*4947cdc7SCole Faust Msg string 309*4947cdc7SCole Faust} 310*4947cdc7SCole Faust 311*4947cdc7SCole Faustfunc (e Error) Error() string { return e.Pos.String() + ": " + e.Msg } 312*4947cdc7SCole Faust 313*4947cdc7SCole Faust// errorf is called to report an error. 314*4947cdc7SCole Faust// errorf does not return: it panics. 
315*4947cdc7SCole Faustfunc (sc *scanner) error(pos Position, s string) { 316*4947cdc7SCole Faust panic(Error{pos, s}) 317*4947cdc7SCole Faust} 318*4947cdc7SCole Faust 319*4947cdc7SCole Faustfunc (sc *scanner) errorf(pos Position, format string, args ...interface{}) { 320*4947cdc7SCole Faust sc.error(pos, fmt.Sprintf(format, args...)) 321*4947cdc7SCole Faust} 322*4947cdc7SCole Faust 323*4947cdc7SCole Faustfunc (sc *scanner) recover(err *error) { 324*4947cdc7SCole Faust // The scanner and parser panic both for routine errors like 325*4947cdc7SCole Faust // syntax errors and for programmer bugs like array index 326*4947cdc7SCole Faust // errors. Turn both into error returns. Catching bug panics 327*4947cdc7SCole Faust // is especially important when processing many files. 328*4947cdc7SCole Faust switch e := recover().(type) { 329*4947cdc7SCole Faust case nil: 330*4947cdc7SCole Faust // no panic 331*4947cdc7SCole Faust case Error: 332*4947cdc7SCole Faust *err = e 333*4947cdc7SCole Faust default: 334*4947cdc7SCole Faust *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)} 335*4947cdc7SCole Faust if debug { 336*4947cdc7SCole Faust log.Fatal(*err) 337*4947cdc7SCole Faust } 338*4947cdc7SCole Faust } 339*4947cdc7SCole Faust} 340*4947cdc7SCole Faust 341*4947cdc7SCole Faust// eof reports whether the input has reached end of file. 342*4947cdc7SCole Faustfunc (sc *scanner) eof() bool { 343*4947cdc7SCole Faust return len(sc.rest) == 0 && !sc.readLine() 344*4947cdc7SCole Faust} 345*4947cdc7SCole Faust 346*4947cdc7SCole Faust// readLine attempts to read another line of input. 347*4947cdc7SCole Faust// Precondition: len(sc.rest)==0. 
348*4947cdc7SCole Faustfunc (sc *scanner) readLine() bool { 349*4947cdc7SCole Faust if sc.readline != nil { 350*4947cdc7SCole Faust var err error 351*4947cdc7SCole Faust sc.rest, err = sc.readline() 352*4947cdc7SCole Faust if err != nil { 353*4947cdc7SCole Faust sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt 354*4947cdc7SCole Faust } 355*4947cdc7SCole Faust return len(sc.rest) > 0 356*4947cdc7SCole Faust } 357*4947cdc7SCole Faust return false 358*4947cdc7SCole Faust} 359*4947cdc7SCole Faust 360*4947cdc7SCole Faust// peekRune returns the next rune in the input without consuming it. 361*4947cdc7SCole Faust// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 362*4947cdc7SCole Faustfunc (sc *scanner) peekRune() rune { 363*4947cdc7SCole Faust // TODO(adonovan): opt: measure and perhaps inline eof. 364*4947cdc7SCole Faust if sc.eof() { 365*4947cdc7SCole Faust return 0 366*4947cdc7SCole Faust } 367*4947cdc7SCole Faust 368*4947cdc7SCole Faust // fast path: ASCII 369*4947cdc7SCole Faust if b := sc.rest[0]; b < utf8.RuneSelf { 370*4947cdc7SCole Faust if b == '\r' { 371*4947cdc7SCole Faust return '\n' 372*4947cdc7SCole Faust } 373*4947cdc7SCole Faust return rune(b) 374*4947cdc7SCole Faust } 375*4947cdc7SCole Faust 376*4947cdc7SCole Faust r, _ := utf8.DecodeRune(sc.rest) 377*4947cdc7SCole Faust return r 378*4947cdc7SCole Faust} 379*4947cdc7SCole Faust 380*4947cdc7SCole Faust// readRune consumes and returns the next rune in the input. 381*4947cdc7SCole Faust// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 382*4947cdc7SCole Faustfunc (sc *scanner) readRune() rune { 383*4947cdc7SCole Faust // eof() has been inlined here, both to avoid a call 384*4947cdc7SCole Faust // and to establish len(rest)>0 to avoid a bounds check. 
385*4947cdc7SCole Faust if len(sc.rest) == 0 { 386*4947cdc7SCole Faust if !sc.readLine() { 387*4947cdc7SCole Faust sc.error(sc.pos, "internal scanner error: readRune at EOF") 388*4947cdc7SCole Faust } 389*4947cdc7SCole Faust // Redundant, but eliminates the bounds-check below. 390*4947cdc7SCole Faust if len(sc.rest) == 0 { 391*4947cdc7SCole Faust return 0 392*4947cdc7SCole Faust } 393*4947cdc7SCole Faust } 394*4947cdc7SCole Faust 395*4947cdc7SCole Faust // fast path: ASCII 396*4947cdc7SCole Faust if b := sc.rest[0]; b < utf8.RuneSelf { 397*4947cdc7SCole Faust r := rune(b) 398*4947cdc7SCole Faust sc.rest = sc.rest[1:] 399*4947cdc7SCole Faust if r == '\r' { 400*4947cdc7SCole Faust if len(sc.rest) > 0 && sc.rest[0] == '\n' { 401*4947cdc7SCole Faust sc.rest = sc.rest[1:] 402*4947cdc7SCole Faust } 403*4947cdc7SCole Faust r = '\n' 404*4947cdc7SCole Faust } 405*4947cdc7SCole Faust if r == '\n' { 406*4947cdc7SCole Faust sc.pos.Line++ 407*4947cdc7SCole Faust sc.pos.Col = 1 408*4947cdc7SCole Faust } else { 409*4947cdc7SCole Faust sc.pos.Col++ 410*4947cdc7SCole Faust } 411*4947cdc7SCole Faust return r 412*4947cdc7SCole Faust } 413*4947cdc7SCole Faust 414*4947cdc7SCole Faust r, size := utf8.DecodeRune(sc.rest) 415*4947cdc7SCole Faust sc.rest = sc.rest[size:] 416*4947cdc7SCole Faust sc.pos.Col++ 417*4947cdc7SCole Faust return r 418*4947cdc7SCole Faust} 419*4947cdc7SCole Faust 420*4947cdc7SCole Faust// tokenValue records the position and value associated with each token. 
421*4947cdc7SCole Fausttype tokenValue struct { 422*4947cdc7SCole Faust raw string // raw text of token 423*4947cdc7SCole Faust int int64 // decoded int 424*4947cdc7SCole Faust bigInt *big.Int // decoded integers > int64 425*4947cdc7SCole Faust float float64 // decoded float 426*4947cdc7SCole Faust string string // decoded string or bytes 427*4947cdc7SCole Faust pos Position // start position of token 428*4947cdc7SCole Faust} 429*4947cdc7SCole Faust 430*4947cdc7SCole Faust// startToken marks the beginning of the next input token. 431*4947cdc7SCole Faust// It must be followed by a call to endToken once the token has 432*4947cdc7SCole Faust// been consumed using readRune. 433*4947cdc7SCole Faustfunc (sc *scanner) startToken(val *tokenValue) { 434*4947cdc7SCole Faust sc.token = sc.rest 435*4947cdc7SCole Faust val.raw = "" 436*4947cdc7SCole Faust val.pos = sc.pos 437*4947cdc7SCole Faust} 438*4947cdc7SCole Faust 439*4947cdc7SCole Faust// endToken marks the end of an input token. 440*4947cdc7SCole Faust// It records the actual token string in val.raw if the caller 441*4947cdc7SCole Faust// has not done that already. 442*4947cdc7SCole Faustfunc (sc *scanner) endToken(val *tokenValue) { 443*4947cdc7SCole Faust if val.raw == "" { 444*4947cdc7SCole Faust val.raw = string(sc.token[:len(sc.token)-len(sc.rest)]) 445*4947cdc7SCole Faust } 446*4947cdc7SCole Faust} 447*4947cdc7SCole Faust 448*4947cdc7SCole Faust// nextToken is called by the parser to obtain the next input token. 449*4947cdc7SCole Faust// It returns the token value and sets val to the data associated with 450*4947cdc7SCole Faust// the token. 451*4947cdc7SCole Faust// 452*4947cdc7SCole Faust// For all our input tokens, the associated data is val.pos (the 453*4947cdc7SCole Faust// position where the token begins), val.raw (the input string 454*4947cdc7SCole Faust// corresponding to the token). 
// For string and int tokens, the string
// and int fields additionally contain the token's interpreted value.
func (sc *scanner) nextToken(val *tokenValue) Token {

	// The following distribution of tokens guides case ordering:
	//
	//      COMMA          27   %
	//      STRING         23   %
	//      IDENT          15   %
	//      EQL            11   %
	//      LBRACK          5.5 %
	//      RBRACK          5.5 %
	//      NEWLINE         3   %
	//      LPAREN          2.9 %
	//      RPAREN          2.9 %
	//      INT             2   %
	//      others        < 1   %
	//
	// Although NEWLINE tokens are infrequent, and lineStart is
	// usually (~97%) false on entry, skipped newlines account for
	// about 50% of all iterations of the 'start' loop.

	// Control returns to 'start' (via goto) whenever input was consumed
	// without producing a token, or after pending dents were scheduled.
start:
	var c rune

	// Deal with leading spaces and indentation.
	blank := false
	savedLineStart := sc.lineStart // lineStart as it was on entry to this iteration
	if sc.lineStart {
		sc.lineStart = false
		col := 0
		for {
			c = sc.peekRune()
			if c == ' ' {
				col++
				sc.readRune()
			} else if c == '\t' {
				// A tab advances col by the distance from the current
				// input column (sc.pos.Col) to the next multiple of tab.
				const tab = 8
				col += int(tab - (sc.pos.Col-1)%tab)
				sc.readRune()
			} else {
				break
			}
		}

		// The third clause matches EOF.
		if c == '#' || c == '\n' || c == 0 {
			blank = true
		}

		// Compute indentation level for non-blank lines not
		// inside an expression.  This is not the common case.
		if !blank && sc.depth == 0 {
			cur := sc.indentstk[len(sc.indentstk)-1]
			if col > cur {
				// indent
				sc.dents++
				sc.indentstk = append(sc.indentstk, col)
			} else if col < cur {
				// outdent(s)
				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
					sc.dents--
					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
				}
				// After popping, the new top must equal col exactly.
				if col != sc.indentstk[len(sc.indentstk)-1] {
					sc.error(sc.pos, "unindent does not match any outer indentation level")
				}
			}
		}
	}

	// Return saved indentation tokens.
	// These are zero-width: startToken and endToken are called back to back.
	if sc.dents != 0 {
		sc.startToken(val)
		sc.endToken(val)
		if sc.dents < 0 {
			sc.dents++
			return OUTDENT
		} else {
			sc.dents--
			return INDENT
		}
	}

	// start of line proper
	c = sc.peekRune()

	// Skip spaces.
	for c == ' ' || c == '\t' {
		sc.readRune()
		c = sc.peekRune()
	}

	// comment
	if c == '#' {
		if sc.keepComments {
			sc.startToken(val)
		}
		// Consume up to, but not including, the newline (or end of input).
		for c != 0 && c != '\n' {
			sc.readRune()
			c = sc.peekRune()
		}
		if sc.keepComments {
			sc.endToken(val)
			// A comment on an otherwise blank line is a line comment;
			// one that follows other tokens is a suffix comment.
			if blank {
				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
			} else {
				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
			}
		}
	}

	// newline
	if c == '\n' {
		sc.lineStart = true

		// Ignore newlines within expressions (common case).
		if sc.depth > 0 {
			sc.readRune()
			goto start
		}

		// Ignore blank lines, except in the REPL,
		// where they emit OUTDENTs and NEWLINE.
		if blank {
			if sc.readline == nil {
				sc.readRune()
				goto start
			} else if len(sc.indentstk) > 1 {
				// Schedule one OUTDENT per open indentation level and
				// reset the stack; the newline itself is not consumed yet.
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			}
		}

		// At top-level (not in an expression).
		sc.startToken(val)
		sc.readRune()
		val.raw = "\n"
		return NEWLINE
	}

	// end of file
	if c == 0 {
		// Emit OUTDENTs for unfinished indentation,
		// preceded by a NEWLINE if we haven't just emitted one.
		if len(sc.indentstk) > 1 {
			if savedLineStart {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			} else {
				// Synthesize a NEWLINE without consuming any input.
				sc.lineStart = true
				sc.startToken(val)
				val.raw = "\n"
				return NEWLINE
			}
		}

		sc.startToken(val)
		sc.endToken(val)
		return EOF
	}

	// line continuation
	if c == '\\' {
		sc.readRune()
		if sc.peekRune() != '\n' {
			sc.errorf(sc.pos, "stray backslash in program")
		}
		sc.readRune()
		goto start
	}

	// start of the next token
	sc.startToken(val)

	// comma (common case)
	if c == ',' {
		sc.readRune()
		sc.endToken(val)
		return COMMA
	}

	// string literal
	if c == '"' || c == '\'' {
		return sc.scanString(val, c)
	}

	// identifier or keyword
	if isIdentStart(c) {
		// A leading r, b, or rb immediately followed by a quote is a
		// string prefix: consume it and hand the quote to scanString.
		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
			//  r"..."
			//  b"..."
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
			// rb"..."
			sc.readRune()
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		}

		for isIdent(c) {
			sc.readRune()
			c = sc.peekRune()
		}
		sc.endToken(val)
		if k, ok := keywordToken[val.raw]; ok {
			return k
		}

		return IDENT
	}

	// brackets
	switch c {
	case '[', '(', '{':
		sc.depth++
		sc.readRune()
		sc.endToken(val)
		switch c {
		case '[':
			return LBRACK
		case '(':
			return LPAREN
		case '{':
			return LBRACE
		}
		panic("unreachable")

	case ']', ')', '}':
		// A closer with no open bracket is an error; depth is never decremented below zero.
		if sc.depth == 0 {
			sc.errorf(sc.pos, "unexpected %q", c)
		} else {
			sc.depth--
		}
		sc.readRune()
		sc.endToken(val)
		switch c {
		case ']':
			return RBRACK
		case ')':
			return RPAREN
		case '}':
			return RBRACE
		}
		panic("unreachable")
	}

	// int or float literal, or period
	if isdigit(c) || c == '.' {
		return sc.scanNumber(val, c)
	}

	// other punctuation
	// The deferred endToken runs on every return below and fills in val.raw.
	defer sc.endToken(val)
	switch c {
	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
		start := sc.pos
		sc.readRune()
		if sc.peekRune() == '=' {
			sc.readRune()
			switch c {
			case '<':
				return LE
			case '>':
				return GE
			case '=':
				return EQL
			case '!':
				return NEQ
			case '+':
				return PLUS_EQ
			case '-':
				return MINUS_EQ
			case '/':
				return SLASH_EQ
			case '%':
				return PERCENT_EQ
			case '&':
				return AMP_EQ
			case '|':
				return PIPE_EQ
			case '^':
				return CIRCUMFLEX_EQ
			}
		}
		// Not followed by '=': a single operator, or the doubled
		// forms <<, >>, // and their assignment variants.
		switch c {
		case '=':
			return EQ
		case '<':
			if sc.peekRune() == '<' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return LTLT_EQ
				} else {
					return LTLT
				}
			}
			return LT
		case '>':
			if sc.peekRune() == '>' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return GTGT_EQ
				} else {
					return GTGT
				}
			}
			return GT
		case '!':
			// '!' is only valid as part of "!="; sc.error panics.
			sc.error(start, "unexpected input character '!'")
		case '+':
			return PLUS
		case '-':
			return MINUS
		case '/':
			if sc.peekRune() == '/' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return SLASHSLASH_EQ
				} else {
					return SLASHSLASH
				}
			}
			return SLASH
		case '%':
			return PERCENT
		case '&':
			return AMP
		case '|':
			return PIPE
		case '^':
			return CIRCUMFLEX
		}
		panic("unreachable")

	case ':', ';', '~': // single-char tokens (except comma)
		sc.readRune()
		switch c {
		case ':':
			return COLON
		case ';':
			return SEMI
		case '~':
			return TILDE
		}
		panic("unreachable")

	case '*': // possibly followed by '*' or '='
		sc.readRune()
		switch sc.peekRune() {
		case '*':
			sc.readRune()
			return STARSTAR
		case '=':
			sc.readRune()
			return STAR_EQ
		}
		return STAR
	}

	sc.errorf(sc.pos, "unexpected input character %#q", c)
	panic("unreachable")
}

func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
	start := sc.pos
	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
	sc.readRune()

	// String literals may contain escaped or unescaped newlines,
	// causing them to span multiple lines (gulps) of REPL input;
	// they are the only such token. Thus we cannot call endToken,
	// as it assumes sc.rest is unchanged since startToken.
	// Instead, buffer the token here.
	// TODO(adonovan): opt: buffer only if we encounter a newline.
	raw := new(strings.Builder)

	// Copy the prefix, e.g. r' or " (see startToken).
841*4947cdc7SCole Faust raw.Write(sc.token[:len(sc.token)-len(sc.rest)]) 842*4947cdc7SCole Faust 843*4947cdc7SCole Faust if !triple { 844*4947cdc7SCole Faust // single-quoted string literal 845*4947cdc7SCole Faust for { 846*4947cdc7SCole Faust if sc.eof() { 847*4947cdc7SCole Faust sc.error(val.pos, "unexpected EOF in string") 848*4947cdc7SCole Faust } 849*4947cdc7SCole Faust c := sc.readRune() 850*4947cdc7SCole Faust raw.WriteRune(c) 851*4947cdc7SCole Faust if c == quote { 852*4947cdc7SCole Faust break 853*4947cdc7SCole Faust } 854*4947cdc7SCole Faust if c == '\n' { 855*4947cdc7SCole Faust sc.error(val.pos, "unexpected newline in string") 856*4947cdc7SCole Faust } 857*4947cdc7SCole Faust if c == '\\' { 858*4947cdc7SCole Faust if sc.eof() { 859*4947cdc7SCole Faust sc.error(val.pos, "unexpected EOF in string") 860*4947cdc7SCole Faust } 861*4947cdc7SCole Faust c = sc.readRune() 862*4947cdc7SCole Faust raw.WriteRune(c) 863*4947cdc7SCole Faust } 864*4947cdc7SCole Faust } 865*4947cdc7SCole Faust } else { 866*4947cdc7SCole Faust // triple-quoted string literal 867*4947cdc7SCole Faust sc.readRune() 868*4947cdc7SCole Faust raw.WriteRune(quote) 869*4947cdc7SCole Faust sc.readRune() 870*4947cdc7SCole Faust raw.WriteRune(quote) 871*4947cdc7SCole Faust 872*4947cdc7SCole Faust quoteCount := 0 873*4947cdc7SCole Faust for { 874*4947cdc7SCole Faust if sc.eof() { 875*4947cdc7SCole Faust sc.error(val.pos, "unexpected EOF in string") 876*4947cdc7SCole Faust } 877*4947cdc7SCole Faust c := sc.readRune() 878*4947cdc7SCole Faust raw.WriteRune(c) 879*4947cdc7SCole Faust if c == quote { 880*4947cdc7SCole Faust quoteCount++ 881*4947cdc7SCole Faust if quoteCount == 3 { 882*4947cdc7SCole Faust break 883*4947cdc7SCole Faust } 884*4947cdc7SCole Faust } else { 885*4947cdc7SCole Faust quoteCount = 0 886*4947cdc7SCole Faust } 887*4947cdc7SCole Faust if c == '\\' { 888*4947cdc7SCole Faust if sc.eof() { 889*4947cdc7SCole Faust sc.error(val.pos, "unexpected EOF in string") 890*4947cdc7SCole Faust } 
891*4947cdc7SCole Faust c = sc.readRune() 892*4947cdc7SCole Faust raw.WriteRune(c) 893*4947cdc7SCole Faust } 894*4947cdc7SCole Faust } 895*4947cdc7SCole Faust } 896*4947cdc7SCole Faust val.raw = raw.String() 897*4947cdc7SCole Faust 898*4947cdc7SCole Faust s, _, isByte, err := unquote(val.raw) 899*4947cdc7SCole Faust if err != nil { 900*4947cdc7SCole Faust sc.error(start, err.Error()) 901*4947cdc7SCole Faust } 902*4947cdc7SCole Faust val.string = s 903*4947cdc7SCole Faust if isByte { 904*4947cdc7SCole Faust return BYTES 905*4947cdc7SCole Faust } else { 906*4947cdc7SCole Faust return STRING 907*4947cdc7SCole Faust } 908*4947cdc7SCole Faust} 909*4947cdc7SCole Faust 910*4947cdc7SCole Faustfunc (sc *scanner) scanNumber(val *tokenValue, c rune) Token { 911*4947cdc7SCole Faust // https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements 912*4947cdc7SCole Faust // 913*4947cdc7SCole Faust // Python features not supported: 914*4947cdc7SCole Faust // - integer literals of >64 bits of precision 915*4947cdc7SCole Faust // - 123L or 123l long suffix 916*4947cdc7SCole Faust // - traditional octal: 0755 917*4947cdc7SCole Faust // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals 918*4947cdc7SCole Faust 919*4947cdc7SCole Faust start := sc.pos 920*4947cdc7SCole Faust fraction, exponent := false, false 921*4947cdc7SCole Faust 922*4947cdc7SCole Faust if c == '.' { 923*4947cdc7SCole Faust // dot or start of fraction 924*4947cdc7SCole Faust sc.readRune() 925*4947cdc7SCole Faust c = sc.peekRune() 926*4947cdc7SCole Faust if !isdigit(c) { 927*4947cdc7SCole Faust sc.endToken(val) 928*4947cdc7SCole Faust return DOT 929*4947cdc7SCole Faust } 930*4947cdc7SCole Faust fraction = true 931*4947cdc7SCole Faust } else if c == '0' { 932*4947cdc7SCole Faust // hex, octal, binary or float 933*4947cdc7SCole Faust sc.readRune() 934*4947cdc7SCole Faust c = sc.peekRune() 935*4947cdc7SCole Faust 936*4947cdc7SCole Faust if c == '.' 
{ 937*4947cdc7SCole Faust fraction = true 938*4947cdc7SCole Faust } else if c == 'x' || c == 'X' { 939*4947cdc7SCole Faust // hex 940*4947cdc7SCole Faust sc.readRune() 941*4947cdc7SCole Faust c = sc.peekRune() 942*4947cdc7SCole Faust if !isxdigit(c) { 943*4947cdc7SCole Faust sc.error(start, "invalid hex literal") 944*4947cdc7SCole Faust } 945*4947cdc7SCole Faust for isxdigit(c) { 946*4947cdc7SCole Faust sc.readRune() 947*4947cdc7SCole Faust c = sc.peekRune() 948*4947cdc7SCole Faust } 949*4947cdc7SCole Faust } else if c == 'o' || c == 'O' { 950*4947cdc7SCole Faust // octal 951*4947cdc7SCole Faust sc.readRune() 952*4947cdc7SCole Faust c = sc.peekRune() 953*4947cdc7SCole Faust if !isodigit(c) { 954*4947cdc7SCole Faust sc.error(sc.pos, "invalid octal literal") 955*4947cdc7SCole Faust } 956*4947cdc7SCole Faust for isodigit(c) { 957*4947cdc7SCole Faust sc.readRune() 958*4947cdc7SCole Faust c = sc.peekRune() 959*4947cdc7SCole Faust } 960*4947cdc7SCole Faust } else if c == 'b' || c == 'B' { 961*4947cdc7SCole Faust // binary 962*4947cdc7SCole Faust sc.readRune() 963*4947cdc7SCole Faust c = sc.peekRune() 964*4947cdc7SCole Faust if !isbdigit(c) { 965*4947cdc7SCole Faust sc.error(sc.pos, "invalid binary literal") 966*4947cdc7SCole Faust } 967*4947cdc7SCole Faust for isbdigit(c) { 968*4947cdc7SCole Faust sc.readRune() 969*4947cdc7SCole Faust c = sc.peekRune() 970*4947cdc7SCole Faust } 971*4947cdc7SCole Faust } else { 972*4947cdc7SCole Faust // float (or obsolete octal "0755") 973*4947cdc7SCole Faust allzeros, octal := true, true 974*4947cdc7SCole Faust for isdigit(c) { 975*4947cdc7SCole Faust if c != '0' { 976*4947cdc7SCole Faust allzeros = false 977*4947cdc7SCole Faust } 978*4947cdc7SCole Faust if c > '7' { 979*4947cdc7SCole Faust octal = false 980*4947cdc7SCole Faust } 981*4947cdc7SCole Faust sc.readRune() 982*4947cdc7SCole Faust c = sc.peekRune() 983*4947cdc7SCole Faust } 984*4947cdc7SCole Faust if c == '.' 
{ 985*4947cdc7SCole Faust fraction = true 986*4947cdc7SCole Faust } else if c == 'e' || c == 'E' { 987*4947cdc7SCole Faust exponent = true 988*4947cdc7SCole Faust } else if octal && !allzeros { 989*4947cdc7SCole Faust sc.endToken(val) 990*4947cdc7SCole Faust sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:]) 991*4947cdc7SCole Faust } 992*4947cdc7SCole Faust } 993*4947cdc7SCole Faust } else { 994*4947cdc7SCole Faust // decimal 995*4947cdc7SCole Faust for isdigit(c) { 996*4947cdc7SCole Faust sc.readRune() 997*4947cdc7SCole Faust c = sc.peekRune() 998*4947cdc7SCole Faust } 999*4947cdc7SCole Faust 1000*4947cdc7SCole Faust if c == '.' { 1001*4947cdc7SCole Faust fraction = true 1002*4947cdc7SCole Faust } else if c == 'e' || c == 'E' { 1003*4947cdc7SCole Faust exponent = true 1004*4947cdc7SCole Faust } 1005*4947cdc7SCole Faust } 1006*4947cdc7SCole Faust 1007*4947cdc7SCole Faust if fraction { 1008*4947cdc7SCole Faust sc.readRune() // consume '.' 1009*4947cdc7SCole Faust c = sc.peekRune() 1010*4947cdc7SCole Faust for isdigit(c) { 1011*4947cdc7SCole Faust sc.readRune() 1012*4947cdc7SCole Faust c = sc.peekRune() 1013*4947cdc7SCole Faust } 1014*4947cdc7SCole Faust 1015*4947cdc7SCole Faust if c == 'e' || c == 'E' { 1016*4947cdc7SCole Faust exponent = true 1017*4947cdc7SCole Faust } 1018*4947cdc7SCole Faust } 1019*4947cdc7SCole Faust 1020*4947cdc7SCole Faust if exponent { 1021*4947cdc7SCole Faust sc.readRune() // consume [eE] 1022*4947cdc7SCole Faust c = sc.peekRune() 1023*4947cdc7SCole Faust if c == '+' || c == '-' { 1024*4947cdc7SCole Faust sc.readRune() 1025*4947cdc7SCole Faust c = sc.peekRune() 1026*4947cdc7SCole Faust if !isdigit(c) { 1027*4947cdc7SCole Faust sc.error(sc.pos, "invalid float literal") 1028*4947cdc7SCole Faust } 1029*4947cdc7SCole Faust } 1030*4947cdc7SCole Faust for isdigit(c) { 1031*4947cdc7SCole Faust sc.readRune() 1032*4947cdc7SCole Faust c = sc.peekRune() 1033*4947cdc7SCole Faust } 1034*4947cdc7SCole Faust } 1035*4947cdc7SCole 
Faust 1036*4947cdc7SCole Faust sc.endToken(val) 1037*4947cdc7SCole Faust if fraction || exponent { 1038*4947cdc7SCole Faust var err error 1039*4947cdc7SCole Faust val.float, err = strconv.ParseFloat(val.raw, 64) 1040*4947cdc7SCole Faust if err != nil { 1041*4947cdc7SCole Faust sc.error(sc.pos, "invalid float literal") 1042*4947cdc7SCole Faust } 1043*4947cdc7SCole Faust return FLOAT 1044*4947cdc7SCole Faust } else { 1045*4947cdc7SCole Faust var err error 1046*4947cdc7SCole Faust s := val.raw 1047*4947cdc7SCole Faust val.bigInt = nil 1048*4947cdc7SCole Faust if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { 1049*4947cdc7SCole Faust val.int, err = strconv.ParseInt(s[2:], 8, 64) 1050*4947cdc7SCole Faust } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { 1051*4947cdc7SCole Faust val.int, err = strconv.ParseInt(s[2:], 2, 64) 1052*4947cdc7SCole Faust } else { 1053*4947cdc7SCole Faust val.int, err = strconv.ParseInt(s, 0, 64) 1054*4947cdc7SCole Faust if err != nil { 1055*4947cdc7SCole Faust num := new(big.Int) 1056*4947cdc7SCole Faust var ok bool 1057*4947cdc7SCole Faust val.bigInt, ok = num.SetString(s, 0) 1058*4947cdc7SCole Faust if ok { 1059*4947cdc7SCole Faust err = nil 1060*4947cdc7SCole Faust } 1061*4947cdc7SCole Faust } 1062*4947cdc7SCole Faust } 1063*4947cdc7SCole Faust if err != nil { 1064*4947cdc7SCole Faust sc.error(start, "invalid int literal") 1065*4947cdc7SCole Faust } 1066*4947cdc7SCole Faust return INT 1067*4947cdc7SCole Faust } 1068*4947cdc7SCole Faust} 1069*4947cdc7SCole Faust 1070*4947cdc7SCole Faust// isIdent reports whether c is an identifier rune. 
// An identifier rune is a decimal digit or any rune accepted by isIdentStart.
func isIdent(c rune) bool {
	if isdigit(c) {
		return true
	}
	return isIdentStart(c)
}

// isIdentStart reports whether c may begin an identifier: an underscore,
// an ASCII letter, or any other rune for which unicode.IsLetter is true.
func isIdentStart(c rune) bool {
	switch {
	case c == '_':
		return true
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		// ASCII letters are answered without consulting the Unicode tables.
		return true
	}
	return unicode.IsLetter(c)
}

// isdigit reports whether c is an ASCII decimal digit (0-9).
func isdigit(c rune) bool { return c >= '0' && c <= '9' }

// isodigit reports whether c is an octal digit (0-7).
func isodigit(c rune) bool { return c >= '0' && c <= '7' }

// isxdigit reports whether c is a hexadecimal digit (0-9, a-f, A-F).
func isxdigit(c rune) bool {
	switch {
	case '0' <= c && c <= '9', 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
		return true
	}
	return false
}

// isbdigit reports whether c is a binary digit (0 or 1).
func isbdigit(c rune) bool { return c == '0' || c == '1' }

// keywordToken records the special tokens for
// strings that should not be treated as ordinary identifiers.
1089*4947cdc7SCole Faustvar keywordToken = map[string]Token{ 1090*4947cdc7SCole Faust "and": AND, 1091*4947cdc7SCole Faust "break": BREAK, 1092*4947cdc7SCole Faust "continue": CONTINUE, 1093*4947cdc7SCole Faust "def": DEF, 1094*4947cdc7SCole Faust "elif": ELIF, 1095*4947cdc7SCole Faust "else": ELSE, 1096*4947cdc7SCole Faust "for": FOR, 1097*4947cdc7SCole Faust "if": IF, 1098*4947cdc7SCole Faust "in": IN, 1099*4947cdc7SCole Faust "lambda": LAMBDA, 1100*4947cdc7SCole Faust "load": LOAD, 1101*4947cdc7SCole Faust "not": NOT, 1102*4947cdc7SCole Faust "or": OR, 1103*4947cdc7SCole Faust "pass": PASS, 1104*4947cdc7SCole Faust "return": RETURN, 1105*4947cdc7SCole Faust "while": WHILE, 1106*4947cdc7SCole Faust 1107*4947cdc7SCole Faust // reserved words: 1108*4947cdc7SCole Faust "as": ILLEGAL, 1109*4947cdc7SCole Faust // "assert": ILLEGAL, // heavily used by our tests 1110*4947cdc7SCole Faust "class": ILLEGAL, 1111*4947cdc7SCole Faust "del": ILLEGAL, 1112*4947cdc7SCole Faust "except": ILLEGAL, 1113*4947cdc7SCole Faust "finally": ILLEGAL, 1114*4947cdc7SCole Faust "from": ILLEGAL, 1115*4947cdc7SCole Faust "global": ILLEGAL, 1116*4947cdc7SCole Faust "import": ILLEGAL, 1117*4947cdc7SCole Faust "is": ILLEGAL, 1118*4947cdc7SCole Faust "nonlocal": ILLEGAL, 1119*4947cdc7SCole Faust "raise": ILLEGAL, 1120*4947cdc7SCole Faust "try": ILLEGAL, 1121*4947cdc7SCole Faust "with": ILLEGAL, 1122*4947cdc7SCole Faust "yield": ILLEGAL, 1123*4947cdc7SCole Faust} 1124