xref: /aosp_15_r20/external/starlark-go/syntax/scan.go (revision 4947cdc739c985f6d86941e22894f5cefe7c9e9a)
1*4947cdc7SCole Faust// Copyright 2017 The Bazel Authors. All rights reserved.
2*4947cdc7SCole Faust// Use of this source code is governed by a BSD-style
3*4947cdc7SCole Faust// license that can be found in the LICENSE file.
4*4947cdc7SCole Faust
5*4947cdc7SCole Faustpackage syntax
6*4947cdc7SCole Faust
7*4947cdc7SCole Faust// A lexical scanner for Starlark.
8*4947cdc7SCole Faust
9*4947cdc7SCole Faustimport (
10*4947cdc7SCole Faust	"fmt"
11*4947cdc7SCole Faust	"io"
12*4947cdc7SCole Faust	"io/ioutil"
13*4947cdc7SCole Faust	"log"
14*4947cdc7SCole Faust	"math/big"
15*4947cdc7SCole Faust	"os"
16*4947cdc7SCole Faust	"strconv"
17*4947cdc7SCole Faust	"strings"
18*4947cdc7SCole Faust	"unicode"
19*4947cdc7SCole Faust	"unicode/utf8"
20*4947cdc7SCole Faust)
21*4947cdc7SCole Faust
// A Token represents a Starlark lexical token.
//
// Every Token value in the range [ILLEGAL, maxToken) has a
// human-readable name in tokenNames; see String and GoString.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	// maxToken is one past the last valid token; it is not itself a token.
	maxToken
)

// String returns the human-readable name of the token,
// as recorded in tokenNames.
func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
func (tok Token) GoString() string {
	// The punctuation tokens form the contiguous range PLUS..STARSTAR.
	if tok >= PLUS && tok <= STARSTAR {
		return "'" + tokenNames[tok] + "'"
	}
	return tokenNames[tok]
}

// tokenNames maps each Token to its human-readable name.
// Every constant declared above (other than maxToken) must have an
// entry here, otherwise String and GoString yield an empty string
// for that token.
var tokenNames = [...]string{
	ILLEGAL:       "illegal token",
	EOF:           "end of file",
	NEWLINE:       "newline",
	INDENT:        "indent",
	OUTDENT:       "outdent",
	IDENT:         "identifier",
	INT:           "int literal",
	FLOAT:         "float literal",
	STRING:        "string literal",
	BYTES:         "bytes literal",
	PLUS:          "+",
	MINUS:         "-",
	STAR:          "*",
	SLASH:         "/",
	SLASHSLASH:    "//",
	PERCENT:       "%",
	AMP:           "&",
	PIPE:          "|",
	CIRCUMFLEX:    "^",
	LTLT:          "<<",
	GTGT:          ">>",
	TILDE:         "~",
	DOT:           ".",
	COMMA:         ",",
	EQ:            "=",
	SEMI:          ";",
	COLON:         ":",
	LPAREN:        "(",
	RPAREN:        ")",
	LBRACK:        "[",
	RBRACK:        "]",
	LBRACE:        "{",
	RBRACE:        "}",
	LT:            "<",
	GT:            ">",
	GE:            ">=",
	LE:            "<=",
	EQL:           "==",
	NEQ:           "!=",
	PLUS_EQ:       "+=",
	MINUS_EQ:      "-=",
	STAR_EQ:       "*=",
	SLASH_EQ:      "/=",
	SLASHSLASH_EQ: "//=",
	PERCENT_EQ:    "%=",
	AMP_EQ:        "&=",
	PIPE_EQ:       "|=",
	CIRCUMFLEX_EQ: "^=",
	LTLT_EQ:       "<<=",
	GTGT_EQ:       ">>=",
	STARSTAR:      "**",
	AND:           "and",
	BREAK:         "break",
	CONTINUE:      "continue",
	DEF:           "def",
	ELIF:          "elif",
	ELSE:          "else",
	FOR:           "for",
	IF:            "if",
	IN:            "in",
	LAMBDA:        "lambda",
	LOAD:          "load",
	NOT:           "not",
	NOT_IN:        "not in",
	OR:            "or",
	PASS:          "pass",
	RETURN:        "return",
	WHILE:         "while",
}
185*4947cdc7SCole Faust
// A FilePortion describes the content of a portion of a file.
// Callers may provide a FilePortion for the src argument of Parse
// when the desired initial line and column numbers are not (1, 1),
// such as when an expression is parsed from within a larger file.
type FilePortion struct {
	// Content is the source text of the portion.
	Content []byte
	// FirstLine is the line number assigned to the start of Content.
	FirstLine int32
	// FirstCol is the column number assigned to the start of Content.
	FirstCol int32
}
194*4947cdc7SCole Faust
// A Position describes the location of a rune of input.
type Position struct {
	file *string // filename (indirect for compactness)
	Line int32   // 1-based line number; 0 if line unknown
	Col  int32   // 1-based column (rune) number; 0 if column unknown
}

// IsValid reports whether the position is valid,
// that is, whether it has an associated file name.
func (p Position) IsValid() bool {
	return p.file != nil
}

// Filename returns the name of the file containing this position,
// or "<invalid>" if the position has no associated file.
func (p Position) Filename() string {
	if p.file == nil {
		return "<invalid>"
	}
	return *p.file
}

// MakePosition returns position with the specified components.
func MakePosition(file *string, line, col int32) Position {
	return Position{file: file, Line: line, Col: col}
}

// add returns the position at the end of s, assuming it starts at p.
func (p Position) add(s string) Position {
	// If s spans lines, only the text after its final newline
	// contributes to the column, which restarts at 1.
	if last := strings.LastIndex(s, "\n"); last >= 0 {
		p.Line += int32(strings.Count(s, "\n"))
		p.Col = 1
		s = s[last+1:]
	}
	// Columns count runes, not bytes.
	p.Col += int32(utf8.RuneCountInString(s))
	return p
}

// String formats the position as "file:line:col", omitting the
// column if it is unknown, and both line and column if the line
// is unknown.
func (p Position) String() string {
	name := p.Filename()
	switch {
	case p.Line <= 0:
		return name
	case p.Col <= 0:
		return fmt.Sprintf("%s:%d", name, p.Line)
	default:
		return fmt.Sprintf("%s:%d:%d", name, p.Line, p.Col)
	}
}

// isBefore reports whether p precedes q, comparing by line and
// then by column. File names are not considered.
func (p Position) isBefore(q Position) bool {
	if p.Line == q.Line {
		return p.Col < q.Col
	}
	return p.Line < q.Line
}
244*4947cdc7SCole Faust
245*4947cdc7SCole Faust// An scanner represents a single input file being parsed.
246*4947cdc7SCole Fausttype scanner struct {
247*4947cdc7SCole Faust	rest           []byte    // rest of input (in REPL, a line of input)
248*4947cdc7SCole Faust	token          []byte    // token being scanned
249*4947cdc7SCole Faust	pos            Position  // current input position
250*4947cdc7SCole Faust	depth          int       // nesting of [ ] { } ( )
251*4947cdc7SCole Faust	indentstk      []int     // stack of indentation levels
252*4947cdc7SCole Faust	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
253*4947cdc7SCole Faust	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
254*4947cdc7SCole Faust	keepComments   bool      // accumulate comments in slice
255*4947cdc7SCole Faust	lineComments   []Comment // list of full line comments (if keepComments)
256*4947cdc7SCole Faust	suffixComments []Comment // list of suffix comments (if keepComments)
257*4947cdc7SCole Faust
258*4947cdc7SCole Faust	readline func() ([]byte, error) // read next line of input (REPL only)
259*4947cdc7SCole Faust}
260*4947cdc7SCole Faust
261*4947cdc7SCole Faustfunc newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
262*4947cdc7SCole Faust	var firstLine, firstCol int32 = 1, 1
263*4947cdc7SCole Faust	if portion, ok := src.(FilePortion); ok {
264*4947cdc7SCole Faust		firstLine, firstCol = portion.FirstLine, portion.FirstCol
265*4947cdc7SCole Faust	}
266*4947cdc7SCole Faust	sc := &scanner{
267*4947cdc7SCole Faust		pos:          MakePosition(&filename, firstLine, firstCol),
268*4947cdc7SCole Faust		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
269*4947cdc7SCole Faust		lineStart:    true,
270*4947cdc7SCole Faust		keepComments: keepComments,
271*4947cdc7SCole Faust	}
272*4947cdc7SCole Faust	sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
273*4947cdc7SCole Faust	if sc.readline == nil {
274*4947cdc7SCole Faust		data, err := readSource(filename, src)
275*4947cdc7SCole Faust		if err != nil {
276*4947cdc7SCole Faust			return nil, err
277*4947cdc7SCole Faust		}
278*4947cdc7SCole Faust		sc.rest = data
279*4947cdc7SCole Faust	}
280*4947cdc7SCole Faust	return sc, nil
281*4947cdc7SCole Faust}
282*4947cdc7SCole Faust
283*4947cdc7SCole Faustfunc readSource(filename string, src interface{}) ([]byte, error) {
284*4947cdc7SCole Faust	switch src := src.(type) {
285*4947cdc7SCole Faust	case string:
286*4947cdc7SCole Faust		return []byte(src), nil
287*4947cdc7SCole Faust	case []byte:
288*4947cdc7SCole Faust		return src, nil
289*4947cdc7SCole Faust	case io.Reader:
290*4947cdc7SCole Faust		data, err := ioutil.ReadAll(src)
291*4947cdc7SCole Faust		if err != nil {
292*4947cdc7SCole Faust			err = &os.PathError{Op: "read", Path: filename, Err: err}
293*4947cdc7SCole Faust			return nil, err
294*4947cdc7SCole Faust		}
295*4947cdc7SCole Faust		return data, nil
296*4947cdc7SCole Faust	case FilePortion:
297*4947cdc7SCole Faust		return src.Content, nil
298*4947cdc7SCole Faust	case nil:
299*4947cdc7SCole Faust		return ioutil.ReadFile(filename)
300*4947cdc7SCole Faust	default:
301*4947cdc7SCole Faust		return nil, fmt.Errorf("invalid source: %T", src)
302*4947cdc7SCole Faust	}
303*4947cdc7SCole Faust}
304*4947cdc7SCole Faust
305*4947cdc7SCole Faust// An Error describes the nature and position of a scanner or parser error.
306*4947cdc7SCole Fausttype Error struct {
307*4947cdc7SCole Faust	Pos Position
308*4947cdc7SCole Faust	Msg string
309*4947cdc7SCole Faust}
310*4947cdc7SCole Faust
311*4947cdc7SCole Faustfunc (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
312*4947cdc7SCole Faust
313*4947cdc7SCole Faust// errorf is called to report an error.
314*4947cdc7SCole Faust// errorf does not return: it panics.
315*4947cdc7SCole Faustfunc (sc *scanner) error(pos Position, s string) {
316*4947cdc7SCole Faust	panic(Error{pos, s})
317*4947cdc7SCole Faust}
318*4947cdc7SCole Faust
319*4947cdc7SCole Faustfunc (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
320*4947cdc7SCole Faust	sc.error(pos, fmt.Sprintf(format, args...))
321*4947cdc7SCole Faust}
322*4947cdc7SCole Faust
323*4947cdc7SCole Faustfunc (sc *scanner) recover(err *error) {
324*4947cdc7SCole Faust	// The scanner and parser panic both for routine errors like
325*4947cdc7SCole Faust	// syntax errors and for programmer bugs like array index
326*4947cdc7SCole Faust	// errors.  Turn both into error returns.  Catching bug panics
327*4947cdc7SCole Faust	// is especially important when processing many files.
328*4947cdc7SCole Faust	switch e := recover().(type) {
329*4947cdc7SCole Faust	case nil:
330*4947cdc7SCole Faust		// no panic
331*4947cdc7SCole Faust	case Error:
332*4947cdc7SCole Faust		*err = e
333*4947cdc7SCole Faust	default:
334*4947cdc7SCole Faust		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
335*4947cdc7SCole Faust		if debug {
336*4947cdc7SCole Faust			log.Fatal(*err)
337*4947cdc7SCole Faust		}
338*4947cdc7SCole Faust	}
339*4947cdc7SCole Faust}
340*4947cdc7SCole Faust
341*4947cdc7SCole Faust// eof reports whether the input has reached end of file.
342*4947cdc7SCole Faustfunc (sc *scanner) eof() bool {
343*4947cdc7SCole Faust	return len(sc.rest) == 0 && !sc.readLine()
344*4947cdc7SCole Faust}
345*4947cdc7SCole Faust
346*4947cdc7SCole Faust// readLine attempts to read another line of input.
347*4947cdc7SCole Faust// Precondition: len(sc.rest)==0.
348*4947cdc7SCole Faustfunc (sc *scanner) readLine() bool {
349*4947cdc7SCole Faust	if sc.readline != nil {
350*4947cdc7SCole Faust		var err error
351*4947cdc7SCole Faust		sc.rest, err = sc.readline()
352*4947cdc7SCole Faust		if err != nil {
353*4947cdc7SCole Faust			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
354*4947cdc7SCole Faust		}
355*4947cdc7SCole Faust		return len(sc.rest) > 0
356*4947cdc7SCole Faust	}
357*4947cdc7SCole Faust	return false
358*4947cdc7SCole Faust}
359*4947cdc7SCole Faust
360*4947cdc7SCole Faust// peekRune returns the next rune in the input without consuming it.
361*4947cdc7SCole Faust// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
362*4947cdc7SCole Faustfunc (sc *scanner) peekRune() rune {
363*4947cdc7SCole Faust	// TODO(adonovan): opt: measure and perhaps inline eof.
364*4947cdc7SCole Faust	if sc.eof() {
365*4947cdc7SCole Faust		return 0
366*4947cdc7SCole Faust	}
367*4947cdc7SCole Faust
368*4947cdc7SCole Faust	// fast path: ASCII
369*4947cdc7SCole Faust	if b := sc.rest[0]; b < utf8.RuneSelf {
370*4947cdc7SCole Faust		if b == '\r' {
371*4947cdc7SCole Faust			return '\n'
372*4947cdc7SCole Faust		}
373*4947cdc7SCole Faust		return rune(b)
374*4947cdc7SCole Faust	}
375*4947cdc7SCole Faust
376*4947cdc7SCole Faust	r, _ := utf8.DecodeRune(sc.rest)
377*4947cdc7SCole Faust	return r
378*4947cdc7SCole Faust}
379*4947cdc7SCole Faust
380*4947cdc7SCole Faust// readRune consumes and returns the next rune in the input.
381*4947cdc7SCole Faust// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
382*4947cdc7SCole Faustfunc (sc *scanner) readRune() rune {
383*4947cdc7SCole Faust	// eof() has been inlined here, both to avoid a call
384*4947cdc7SCole Faust	// and to establish len(rest)>0 to avoid a bounds check.
385*4947cdc7SCole Faust	if len(sc.rest) == 0 {
386*4947cdc7SCole Faust		if !sc.readLine() {
387*4947cdc7SCole Faust			sc.error(sc.pos, "internal scanner error: readRune at EOF")
388*4947cdc7SCole Faust		}
389*4947cdc7SCole Faust		// Redundant, but eliminates the bounds-check below.
390*4947cdc7SCole Faust		if len(sc.rest) == 0 {
391*4947cdc7SCole Faust			return 0
392*4947cdc7SCole Faust		}
393*4947cdc7SCole Faust	}
394*4947cdc7SCole Faust
395*4947cdc7SCole Faust	// fast path: ASCII
396*4947cdc7SCole Faust	if b := sc.rest[0]; b < utf8.RuneSelf {
397*4947cdc7SCole Faust		r := rune(b)
398*4947cdc7SCole Faust		sc.rest = sc.rest[1:]
399*4947cdc7SCole Faust		if r == '\r' {
400*4947cdc7SCole Faust			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
401*4947cdc7SCole Faust				sc.rest = sc.rest[1:]
402*4947cdc7SCole Faust			}
403*4947cdc7SCole Faust			r = '\n'
404*4947cdc7SCole Faust		}
405*4947cdc7SCole Faust		if r == '\n' {
406*4947cdc7SCole Faust			sc.pos.Line++
407*4947cdc7SCole Faust			sc.pos.Col = 1
408*4947cdc7SCole Faust		} else {
409*4947cdc7SCole Faust			sc.pos.Col++
410*4947cdc7SCole Faust		}
411*4947cdc7SCole Faust		return r
412*4947cdc7SCole Faust	}
413*4947cdc7SCole Faust
414*4947cdc7SCole Faust	r, size := utf8.DecodeRune(sc.rest)
415*4947cdc7SCole Faust	sc.rest = sc.rest[size:]
416*4947cdc7SCole Faust	sc.pos.Col++
417*4947cdc7SCole Faust	return r
418*4947cdc7SCole Faust}
419*4947cdc7SCole Faust
420*4947cdc7SCole Faust// tokenValue records the position and value associated with each token.
421*4947cdc7SCole Fausttype tokenValue struct {
422*4947cdc7SCole Faust	raw    string   // raw text of token
423*4947cdc7SCole Faust	int    int64    // decoded int
424*4947cdc7SCole Faust	bigInt *big.Int // decoded integers > int64
425*4947cdc7SCole Faust	float  float64  // decoded float
426*4947cdc7SCole Faust	string string   // decoded string or bytes
427*4947cdc7SCole Faust	pos    Position // start position of token
428*4947cdc7SCole Faust}
429*4947cdc7SCole Faust
430*4947cdc7SCole Faust// startToken marks the beginning of the next input token.
431*4947cdc7SCole Faust// It must be followed by a call to endToken once the token has
432*4947cdc7SCole Faust// been consumed using readRune.
433*4947cdc7SCole Faustfunc (sc *scanner) startToken(val *tokenValue) {
434*4947cdc7SCole Faust	sc.token = sc.rest
435*4947cdc7SCole Faust	val.raw = ""
436*4947cdc7SCole Faust	val.pos = sc.pos
437*4947cdc7SCole Faust}
438*4947cdc7SCole Faust
439*4947cdc7SCole Faust// endToken marks the end of an input token.
440*4947cdc7SCole Faust// It records the actual token string in val.raw if the caller
441*4947cdc7SCole Faust// has not done that already.
442*4947cdc7SCole Faustfunc (sc *scanner) endToken(val *tokenValue) {
443*4947cdc7SCole Faust	if val.raw == "" {
444*4947cdc7SCole Faust		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
445*4947cdc7SCole Faust	}
446*4947cdc7SCole Faust}
447*4947cdc7SCole Faust
448*4947cdc7SCole Faust// nextToken is called by the parser to obtain the next input token.
449*4947cdc7SCole Faust// It returns the token value and sets val to the data associated with
450*4947cdc7SCole Faust// the token.
451*4947cdc7SCole Faust//
452*4947cdc7SCole Faust// For all our input tokens, the associated data is val.pos (the
453*4947cdc7SCole Faust// position where the token begins), val.raw (the input string
454*4947cdc7SCole Faust// corresponding to the token).  For string and int tokens, the string
455*4947cdc7SCole Faust// and int fields additionally contain the token's interpreted value.
456*4947cdc7SCole Faustfunc (sc *scanner) nextToken(val *tokenValue) Token {
457*4947cdc7SCole Faust
458*4947cdc7SCole Faust	// The following distribution of tokens guides case ordering:
459*4947cdc7SCole Faust	//
460*4947cdc7SCole Faust	//      COMMA          27   %
461*4947cdc7SCole Faust	//      STRING         23   %
462*4947cdc7SCole Faust	//      IDENT          15   %
463*4947cdc7SCole Faust	//      EQL            11   %
464*4947cdc7SCole Faust	//      LBRACK          5.5 %
465*4947cdc7SCole Faust	//      RBRACK          5.5 %
466*4947cdc7SCole Faust	//      NEWLINE         3   %
467*4947cdc7SCole Faust	//      LPAREN          2.9 %
468*4947cdc7SCole Faust	//      RPAREN          2.9 %
469*4947cdc7SCole Faust	//      INT             2   %
470*4947cdc7SCole Faust	//      others        < 1   %
471*4947cdc7SCole Faust	//
472*4947cdc7SCole Faust	// Although NEWLINE tokens are infrequent, and lineStart is
473*4947cdc7SCole Faust	// usually (~97%) false on entry, skipped newlines account for
474*4947cdc7SCole Faust	// about 50% of all iterations of the 'start' loop.
475*4947cdc7SCole Faust
476*4947cdc7SCole Fauststart:
477*4947cdc7SCole Faust	var c rune
478*4947cdc7SCole Faust
479*4947cdc7SCole Faust	// Deal with leading spaces and indentation.
480*4947cdc7SCole Faust	blank := false
481*4947cdc7SCole Faust	savedLineStart := sc.lineStart
482*4947cdc7SCole Faust	if sc.lineStart {
483*4947cdc7SCole Faust		sc.lineStart = false
484*4947cdc7SCole Faust		col := 0
485*4947cdc7SCole Faust		for {
486*4947cdc7SCole Faust			c = sc.peekRune()
487*4947cdc7SCole Faust			if c == ' ' {
488*4947cdc7SCole Faust				col++
489*4947cdc7SCole Faust				sc.readRune()
490*4947cdc7SCole Faust			} else if c == '\t' {
491*4947cdc7SCole Faust				const tab = 8
492*4947cdc7SCole Faust				col += int(tab - (sc.pos.Col-1)%tab)
493*4947cdc7SCole Faust				sc.readRune()
494*4947cdc7SCole Faust			} else {
495*4947cdc7SCole Faust				break
496*4947cdc7SCole Faust			}
497*4947cdc7SCole Faust		}
498*4947cdc7SCole Faust
499*4947cdc7SCole Faust		// The third clause matches EOF.
500*4947cdc7SCole Faust		if c == '#' || c == '\n' || c == 0 {
501*4947cdc7SCole Faust			blank = true
502*4947cdc7SCole Faust		}
503*4947cdc7SCole Faust
504*4947cdc7SCole Faust		// Compute indentation level for non-blank lines not
505*4947cdc7SCole Faust		// inside an expression.  This is not the common case.
506*4947cdc7SCole Faust		if !blank && sc.depth == 0 {
507*4947cdc7SCole Faust			cur := sc.indentstk[len(sc.indentstk)-1]
508*4947cdc7SCole Faust			if col > cur {
509*4947cdc7SCole Faust				// indent
510*4947cdc7SCole Faust				sc.dents++
511*4947cdc7SCole Faust				sc.indentstk = append(sc.indentstk, col)
512*4947cdc7SCole Faust			} else if col < cur {
513*4947cdc7SCole Faust				// outdent(s)
514*4947cdc7SCole Faust				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
515*4947cdc7SCole Faust					sc.dents--
516*4947cdc7SCole Faust					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
517*4947cdc7SCole Faust				}
518*4947cdc7SCole Faust				if col != sc.indentstk[len(sc.indentstk)-1] {
519*4947cdc7SCole Faust					sc.error(sc.pos, "unindent does not match any outer indentation level")
520*4947cdc7SCole Faust				}
521*4947cdc7SCole Faust			}
522*4947cdc7SCole Faust		}
523*4947cdc7SCole Faust	}
524*4947cdc7SCole Faust
525*4947cdc7SCole Faust	// Return saved indentation tokens.
526*4947cdc7SCole Faust	if sc.dents != 0 {
527*4947cdc7SCole Faust		sc.startToken(val)
528*4947cdc7SCole Faust		sc.endToken(val)
529*4947cdc7SCole Faust		if sc.dents < 0 {
530*4947cdc7SCole Faust			sc.dents++
531*4947cdc7SCole Faust			return OUTDENT
532*4947cdc7SCole Faust		} else {
533*4947cdc7SCole Faust			sc.dents--
534*4947cdc7SCole Faust			return INDENT
535*4947cdc7SCole Faust		}
536*4947cdc7SCole Faust	}
537*4947cdc7SCole Faust
538*4947cdc7SCole Faust	// start of line proper
539*4947cdc7SCole Faust	c = sc.peekRune()
540*4947cdc7SCole Faust
541*4947cdc7SCole Faust	// Skip spaces.
542*4947cdc7SCole Faust	for c == ' ' || c == '\t' {
543*4947cdc7SCole Faust		sc.readRune()
544*4947cdc7SCole Faust		c = sc.peekRune()
545*4947cdc7SCole Faust	}
546*4947cdc7SCole Faust
547*4947cdc7SCole Faust	// comment
548*4947cdc7SCole Faust	if c == '#' {
549*4947cdc7SCole Faust		if sc.keepComments {
550*4947cdc7SCole Faust			sc.startToken(val)
551*4947cdc7SCole Faust		}
552*4947cdc7SCole Faust		// Consume up to newline (included).
553*4947cdc7SCole Faust		for c != 0 && c != '\n' {
554*4947cdc7SCole Faust			sc.readRune()
555*4947cdc7SCole Faust			c = sc.peekRune()
556*4947cdc7SCole Faust		}
557*4947cdc7SCole Faust		if sc.keepComments {
558*4947cdc7SCole Faust			sc.endToken(val)
559*4947cdc7SCole Faust			if blank {
560*4947cdc7SCole Faust				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
561*4947cdc7SCole Faust			} else {
562*4947cdc7SCole Faust				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
563*4947cdc7SCole Faust			}
564*4947cdc7SCole Faust		}
565*4947cdc7SCole Faust	}
566*4947cdc7SCole Faust
567*4947cdc7SCole Faust	// newline
568*4947cdc7SCole Faust	if c == '\n' {
569*4947cdc7SCole Faust		sc.lineStart = true
570*4947cdc7SCole Faust
571*4947cdc7SCole Faust		// Ignore newlines within expressions (common case).
572*4947cdc7SCole Faust		if sc.depth > 0 {
573*4947cdc7SCole Faust			sc.readRune()
574*4947cdc7SCole Faust			goto start
575*4947cdc7SCole Faust		}
576*4947cdc7SCole Faust
577*4947cdc7SCole Faust		// Ignore blank lines, except in the REPL,
578*4947cdc7SCole Faust		// where they emit OUTDENTs and NEWLINE.
579*4947cdc7SCole Faust		if blank {
580*4947cdc7SCole Faust			if sc.readline == nil {
581*4947cdc7SCole Faust				sc.readRune()
582*4947cdc7SCole Faust				goto start
583*4947cdc7SCole Faust			} else if len(sc.indentstk) > 1 {
584*4947cdc7SCole Faust				sc.dents = 1 - len(sc.indentstk)
585*4947cdc7SCole Faust				sc.indentstk = sc.indentstk[:1]
586*4947cdc7SCole Faust				goto start
587*4947cdc7SCole Faust			}
588*4947cdc7SCole Faust		}
589*4947cdc7SCole Faust
590*4947cdc7SCole Faust		// At top-level (not in an expression).
591*4947cdc7SCole Faust		sc.startToken(val)
592*4947cdc7SCole Faust		sc.readRune()
593*4947cdc7SCole Faust		val.raw = "\n"
594*4947cdc7SCole Faust		return NEWLINE
595*4947cdc7SCole Faust	}
596*4947cdc7SCole Faust
597*4947cdc7SCole Faust	// end of file
598*4947cdc7SCole Faust	if c == 0 {
599*4947cdc7SCole Faust		// Emit OUTDENTs for unfinished indentation,
600*4947cdc7SCole Faust		// preceded by a NEWLINE if we haven't just emitted one.
601*4947cdc7SCole Faust		if len(sc.indentstk) > 1 {
602*4947cdc7SCole Faust			if savedLineStart {
603*4947cdc7SCole Faust				sc.dents = 1 - len(sc.indentstk)
604*4947cdc7SCole Faust				sc.indentstk = sc.indentstk[:1]
605*4947cdc7SCole Faust				goto start
606*4947cdc7SCole Faust			} else {
607*4947cdc7SCole Faust				sc.lineStart = true
608*4947cdc7SCole Faust				sc.startToken(val)
609*4947cdc7SCole Faust				val.raw = "\n"
610*4947cdc7SCole Faust				return NEWLINE
611*4947cdc7SCole Faust			}
612*4947cdc7SCole Faust		}
613*4947cdc7SCole Faust
614*4947cdc7SCole Faust		sc.startToken(val)
615*4947cdc7SCole Faust		sc.endToken(val)
616*4947cdc7SCole Faust		return EOF
617*4947cdc7SCole Faust	}
618*4947cdc7SCole Faust
619*4947cdc7SCole Faust	// line continuation
620*4947cdc7SCole Faust	if c == '\\' {
621*4947cdc7SCole Faust		sc.readRune()
622*4947cdc7SCole Faust		if sc.peekRune() != '\n' {
623*4947cdc7SCole Faust			sc.errorf(sc.pos, "stray backslash in program")
624*4947cdc7SCole Faust		}
625*4947cdc7SCole Faust		sc.readRune()
626*4947cdc7SCole Faust		goto start
627*4947cdc7SCole Faust	}
628*4947cdc7SCole Faust
629*4947cdc7SCole Faust	// start of the next token
630*4947cdc7SCole Faust	sc.startToken(val)
631*4947cdc7SCole Faust
632*4947cdc7SCole Faust	// comma (common case)
633*4947cdc7SCole Faust	if c == ',' {
634*4947cdc7SCole Faust		sc.readRune()
635*4947cdc7SCole Faust		sc.endToken(val)
636*4947cdc7SCole Faust		return COMMA
637*4947cdc7SCole Faust	}
638*4947cdc7SCole Faust
639*4947cdc7SCole Faust	// string literal
640*4947cdc7SCole Faust	if c == '"' || c == '\'' {
641*4947cdc7SCole Faust		return sc.scanString(val, c)
642*4947cdc7SCole Faust	}
643*4947cdc7SCole Faust
644*4947cdc7SCole Faust	// identifier or keyword
645*4947cdc7SCole Faust	if isIdentStart(c) {
646*4947cdc7SCole Faust		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
647*4947cdc7SCole Faust			//  r"..."
648*4947cdc7SCole Faust			//  b"..."
649*4947cdc7SCole Faust			sc.readRune()
650*4947cdc7SCole Faust			c = sc.peekRune()
651*4947cdc7SCole Faust			return sc.scanString(val, c)
652*4947cdc7SCole Faust		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
653*4947cdc7SCole Faust			// rb"..."
654*4947cdc7SCole Faust			sc.readRune()
655*4947cdc7SCole Faust			sc.readRune()
656*4947cdc7SCole Faust			c = sc.peekRune()
657*4947cdc7SCole Faust			return sc.scanString(val, c)
658*4947cdc7SCole Faust		}
659*4947cdc7SCole Faust
660*4947cdc7SCole Faust		for isIdent(c) {
661*4947cdc7SCole Faust			sc.readRune()
662*4947cdc7SCole Faust			c = sc.peekRune()
663*4947cdc7SCole Faust		}
664*4947cdc7SCole Faust		sc.endToken(val)
665*4947cdc7SCole Faust		if k, ok := keywordToken[val.raw]; ok {
666*4947cdc7SCole Faust			return k
667*4947cdc7SCole Faust		}
668*4947cdc7SCole Faust
669*4947cdc7SCole Faust		return IDENT
670*4947cdc7SCole Faust	}
671*4947cdc7SCole Faust
672*4947cdc7SCole Faust	// brackets
673*4947cdc7SCole Faust	switch c {
674*4947cdc7SCole Faust	case '[', '(', '{':
675*4947cdc7SCole Faust		sc.depth++
676*4947cdc7SCole Faust		sc.readRune()
677*4947cdc7SCole Faust		sc.endToken(val)
678*4947cdc7SCole Faust		switch c {
679*4947cdc7SCole Faust		case '[':
680*4947cdc7SCole Faust			return LBRACK
681*4947cdc7SCole Faust		case '(':
682*4947cdc7SCole Faust			return LPAREN
683*4947cdc7SCole Faust		case '{':
684*4947cdc7SCole Faust			return LBRACE
685*4947cdc7SCole Faust		}
686*4947cdc7SCole Faust		panic("unreachable")
687*4947cdc7SCole Faust
688*4947cdc7SCole Faust	case ']', ')', '}':
689*4947cdc7SCole Faust		if sc.depth == 0 {
690*4947cdc7SCole Faust			sc.errorf(sc.pos, "unexpected %q", c)
691*4947cdc7SCole Faust		} else {
692*4947cdc7SCole Faust			sc.depth--
693*4947cdc7SCole Faust		}
694*4947cdc7SCole Faust		sc.readRune()
695*4947cdc7SCole Faust		sc.endToken(val)
696*4947cdc7SCole Faust		switch c {
697*4947cdc7SCole Faust		case ']':
698*4947cdc7SCole Faust			return RBRACK
699*4947cdc7SCole Faust		case ')':
700*4947cdc7SCole Faust			return RPAREN
701*4947cdc7SCole Faust		case '}':
702*4947cdc7SCole Faust			return RBRACE
703*4947cdc7SCole Faust		}
704*4947cdc7SCole Faust		panic("unreachable")
705*4947cdc7SCole Faust	}
706*4947cdc7SCole Faust
707*4947cdc7SCole Faust	// int or float literal, or period
708*4947cdc7SCole Faust	if isdigit(c) || c == '.' {
709*4947cdc7SCole Faust		return sc.scanNumber(val, c)
710*4947cdc7SCole Faust	}
711*4947cdc7SCole Faust
712*4947cdc7SCole Faust	// other punctuation
713*4947cdc7SCole Faust	defer sc.endToken(val)
714*4947cdc7SCole Faust	switch c {
715*4947cdc7SCole Faust	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
716*4947cdc7SCole Faust		start := sc.pos
717*4947cdc7SCole Faust		sc.readRune()
718*4947cdc7SCole Faust		if sc.peekRune() == '=' {
719*4947cdc7SCole Faust			sc.readRune()
720*4947cdc7SCole Faust			switch c {
721*4947cdc7SCole Faust			case '<':
722*4947cdc7SCole Faust				return LE
723*4947cdc7SCole Faust			case '>':
724*4947cdc7SCole Faust				return GE
725*4947cdc7SCole Faust			case '=':
726*4947cdc7SCole Faust				return EQL
727*4947cdc7SCole Faust			case '!':
728*4947cdc7SCole Faust				return NEQ
729*4947cdc7SCole Faust			case '+':
730*4947cdc7SCole Faust				return PLUS_EQ
731*4947cdc7SCole Faust			case '-':
732*4947cdc7SCole Faust				return MINUS_EQ
733*4947cdc7SCole Faust			case '/':
734*4947cdc7SCole Faust				return SLASH_EQ
735*4947cdc7SCole Faust			case '%':
736*4947cdc7SCole Faust				return PERCENT_EQ
737*4947cdc7SCole Faust			case '&':
738*4947cdc7SCole Faust				return AMP_EQ
739*4947cdc7SCole Faust			case '|':
740*4947cdc7SCole Faust				return PIPE_EQ
741*4947cdc7SCole Faust			case '^':
742*4947cdc7SCole Faust				return CIRCUMFLEX_EQ
743*4947cdc7SCole Faust			}
744*4947cdc7SCole Faust		}
745*4947cdc7SCole Faust		switch c {
746*4947cdc7SCole Faust		case '=':
747*4947cdc7SCole Faust			return EQ
748*4947cdc7SCole Faust		case '<':
749*4947cdc7SCole Faust			if sc.peekRune() == '<' {
750*4947cdc7SCole Faust				sc.readRune()
751*4947cdc7SCole Faust				if sc.peekRune() == '=' {
752*4947cdc7SCole Faust					sc.readRune()
753*4947cdc7SCole Faust					return LTLT_EQ
754*4947cdc7SCole Faust				} else {
755*4947cdc7SCole Faust					return LTLT
756*4947cdc7SCole Faust				}
757*4947cdc7SCole Faust			}
758*4947cdc7SCole Faust			return LT
759*4947cdc7SCole Faust		case '>':
760*4947cdc7SCole Faust			if sc.peekRune() == '>' {
761*4947cdc7SCole Faust				sc.readRune()
762*4947cdc7SCole Faust				if sc.peekRune() == '=' {
763*4947cdc7SCole Faust					sc.readRune()
764*4947cdc7SCole Faust					return GTGT_EQ
765*4947cdc7SCole Faust				} else {
766*4947cdc7SCole Faust					return GTGT
767*4947cdc7SCole Faust				}
768*4947cdc7SCole Faust			}
769*4947cdc7SCole Faust			return GT
770*4947cdc7SCole Faust		case '!':
771*4947cdc7SCole Faust			sc.error(start, "unexpected input character '!'")
772*4947cdc7SCole Faust		case '+':
773*4947cdc7SCole Faust			return PLUS
774*4947cdc7SCole Faust		case '-':
775*4947cdc7SCole Faust			return MINUS
776*4947cdc7SCole Faust		case '/':
777*4947cdc7SCole Faust			if sc.peekRune() == '/' {
778*4947cdc7SCole Faust				sc.readRune()
779*4947cdc7SCole Faust				if sc.peekRune() == '=' {
780*4947cdc7SCole Faust					sc.readRune()
781*4947cdc7SCole Faust					return SLASHSLASH_EQ
782*4947cdc7SCole Faust				} else {
783*4947cdc7SCole Faust					return SLASHSLASH
784*4947cdc7SCole Faust				}
785*4947cdc7SCole Faust			}
786*4947cdc7SCole Faust			return SLASH
787*4947cdc7SCole Faust		case '%':
788*4947cdc7SCole Faust			return PERCENT
789*4947cdc7SCole Faust		case '&':
790*4947cdc7SCole Faust			return AMP
791*4947cdc7SCole Faust		case '|':
792*4947cdc7SCole Faust			return PIPE
793*4947cdc7SCole Faust		case '^':
794*4947cdc7SCole Faust			return CIRCUMFLEX
795*4947cdc7SCole Faust		}
796*4947cdc7SCole Faust		panic("unreachable")
797*4947cdc7SCole Faust
798*4947cdc7SCole Faust	case ':', ';', '~': // single-char tokens (except comma)
799*4947cdc7SCole Faust		sc.readRune()
800*4947cdc7SCole Faust		switch c {
801*4947cdc7SCole Faust		case ':':
802*4947cdc7SCole Faust			return COLON
803*4947cdc7SCole Faust		case ';':
804*4947cdc7SCole Faust			return SEMI
805*4947cdc7SCole Faust		case '~':
806*4947cdc7SCole Faust			return TILDE
807*4947cdc7SCole Faust		}
808*4947cdc7SCole Faust		panic("unreachable")
809*4947cdc7SCole Faust
810*4947cdc7SCole Faust	case '*': // possibly followed by '*' or '='
811*4947cdc7SCole Faust		sc.readRune()
812*4947cdc7SCole Faust		switch sc.peekRune() {
813*4947cdc7SCole Faust		case '*':
814*4947cdc7SCole Faust			sc.readRune()
815*4947cdc7SCole Faust			return STARSTAR
816*4947cdc7SCole Faust		case '=':
817*4947cdc7SCole Faust			sc.readRune()
818*4947cdc7SCole Faust			return STAR_EQ
819*4947cdc7SCole Faust		}
820*4947cdc7SCole Faust		return STAR
821*4947cdc7SCole Faust	}
822*4947cdc7SCole Faust
823*4947cdc7SCole Faust	sc.errorf(sc.pos, "unexpected input character %#q", c)
824*4947cdc7SCole Faust	panic("unreachable")
825*4947cdc7SCole Faust}
826*4947cdc7SCole Faust
827*4947cdc7SCole Faustfunc (sc *scanner) scanString(val *tokenValue, quote rune) Token {
828*4947cdc7SCole Faust	start := sc.pos
829*4947cdc7SCole Faust	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
830*4947cdc7SCole Faust	sc.readRune()
831*4947cdc7SCole Faust
832*4947cdc7SCole Faust	// String literals may contain escaped or unescaped newlines,
833*4947cdc7SCole Faust	// causing them to span multiple lines (gulps) of REPL input;
834*4947cdc7SCole Faust	// they are the only such token. Thus we cannot call endToken,
835*4947cdc7SCole Faust	// as it assumes sc.rest is unchanged since startToken.
836*4947cdc7SCole Faust	// Instead, buffer the token here.
837*4947cdc7SCole Faust	// TODO(adonovan): opt: buffer only if we encounter a newline.
838*4947cdc7SCole Faust	raw := new(strings.Builder)
839*4947cdc7SCole Faust
840*4947cdc7SCole Faust	// Copy the prefix, e.g. r' or " (see startToken).
841*4947cdc7SCole Faust	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
842*4947cdc7SCole Faust
843*4947cdc7SCole Faust	if !triple {
844*4947cdc7SCole Faust		// single-quoted string literal
845*4947cdc7SCole Faust		for {
846*4947cdc7SCole Faust			if sc.eof() {
847*4947cdc7SCole Faust				sc.error(val.pos, "unexpected EOF in string")
848*4947cdc7SCole Faust			}
849*4947cdc7SCole Faust			c := sc.readRune()
850*4947cdc7SCole Faust			raw.WriteRune(c)
851*4947cdc7SCole Faust			if c == quote {
852*4947cdc7SCole Faust				break
853*4947cdc7SCole Faust			}
854*4947cdc7SCole Faust			if c == '\n' {
855*4947cdc7SCole Faust				sc.error(val.pos, "unexpected newline in string")
856*4947cdc7SCole Faust			}
857*4947cdc7SCole Faust			if c == '\\' {
858*4947cdc7SCole Faust				if sc.eof() {
859*4947cdc7SCole Faust					sc.error(val.pos, "unexpected EOF in string")
860*4947cdc7SCole Faust				}
861*4947cdc7SCole Faust				c = sc.readRune()
862*4947cdc7SCole Faust				raw.WriteRune(c)
863*4947cdc7SCole Faust			}
864*4947cdc7SCole Faust		}
865*4947cdc7SCole Faust	} else {
866*4947cdc7SCole Faust		// triple-quoted string literal
867*4947cdc7SCole Faust		sc.readRune()
868*4947cdc7SCole Faust		raw.WriteRune(quote)
869*4947cdc7SCole Faust		sc.readRune()
870*4947cdc7SCole Faust		raw.WriteRune(quote)
871*4947cdc7SCole Faust
872*4947cdc7SCole Faust		quoteCount := 0
873*4947cdc7SCole Faust		for {
874*4947cdc7SCole Faust			if sc.eof() {
875*4947cdc7SCole Faust				sc.error(val.pos, "unexpected EOF in string")
876*4947cdc7SCole Faust			}
877*4947cdc7SCole Faust			c := sc.readRune()
878*4947cdc7SCole Faust			raw.WriteRune(c)
879*4947cdc7SCole Faust			if c == quote {
880*4947cdc7SCole Faust				quoteCount++
881*4947cdc7SCole Faust				if quoteCount == 3 {
882*4947cdc7SCole Faust					break
883*4947cdc7SCole Faust				}
884*4947cdc7SCole Faust			} else {
885*4947cdc7SCole Faust				quoteCount = 0
886*4947cdc7SCole Faust			}
887*4947cdc7SCole Faust			if c == '\\' {
888*4947cdc7SCole Faust				if sc.eof() {
889*4947cdc7SCole Faust					sc.error(val.pos, "unexpected EOF in string")
890*4947cdc7SCole Faust				}
891*4947cdc7SCole Faust				c = sc.readRune()
892*4947cdc7SCole Faust				raw.WriteRune(c)
893*4947cdc7SCole Faust			}
894*4947cdc7SCole Faust		}
895*4947cdc7SCole Faust	}
896*4947cdc7SCole Faust	val.raw = raw.String()
897*4947cdc7SCole Faust
898*4947cdc7SCole Faust	s, _, isByte, err := unquote(val.raw)
899*4947cdc7SCole Faust	if err != nil {
900*4947cdc7SCole Faust		sc.error(start, err.Error())
901*4947cdc7SCole Faust	}
902*4947cdc7SCole Faust	val.string = s
903*4947cdc7SCole Faust	if isByte {
904*4947cdc7SCole Faust		return BYTES
905*4947cdc7SCole Faust	} else {
906*4947cdc7SCole Faust		return STRING
907*4947cdc7SCole Faust	}
908*4947cdc7SCole Faust}
909*4947cdc7SCole Faust
910*4947cdc7SCole Faustfunc (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
911*4947cdc7SCole Faust	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
912*4947cdc7SCole Faust	//
913*4947cdc7SCole Faust	// Python features not supported:
914*4947cdc7SCole Faust	// - integer literals of >64 bits of precision
915*4947cdc7SCole Faust	// - 123L or 123l long suffix
916*4947cdc7SCole Faust	// - traditional octal: 0755
917*4947cdc7SCole Faust	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
918*4947cdc7SCole Faust
919*4947cdc7SCole Faust	start := sc.pos
920*4947cdc7SCole Faust	fraction, exponent := false, false
921*4947cdc7SCole Faust
922*4947cdc7SCole Faust	if c == '.' {
923*4947cdc7SCole Faust		// dot or start of fraction
924*4947cdc7SCole Faust		sc.readRune()
925*4947cdc7SCole Faust		c = sc.peekRune()
926*4947cdc7SCole Faust		if !isdigit(c) {
927*4947cdc7SCole Faust			sc.endToken(val)
928*4947cdc7SCole Faust			return DOT
929*4947cdc7SCole Faust		}
930*4947cdc7SCole Faust		fraction = true
931*4947cdc7SCole Faust	} else if c == '0' {
932*4947cdc7SCole Faust		// hex, octal, binary or float
933*4947cdc7SCole Faust		sc.readRune()
934*4947cdc7SCole Faust		c = sc.peekRune()
935*4947cdc7SCole Faust
936*4947cdc7SCole Faust		if c == '.' {
937*4947cdc7SCole Faust			fraction = true
938*4947cdc7SCole Faust		} else if c == 'x' || c == 'X' {
939*4947cdc7SCole Faust			// hex
940*4947cdc7SCole Faust			sc.readRune()
941*4947cdc7SCole Faust			c = sc.peekRune()
942*4947cdc7SCole Faust			if !isxdigit(c) {
943*4947cdc7SCole Faust				sc.error(start, "invalid hex literal")
944*4947cdc7SCole Faust			}
945*4947cdc7SCole Faust			for isxdigit(c) {
946*4947cdc7SCole Faust				sc.readRune()
947*4947cdc7SCole Faust				c = sc.peekRune()
948*4947cdc7SCole Faust			}
949*4947cdc7SCole Faust		} else if c == 'o' || c == 'O' {
950*4947cdc7SCole Faust			// octal
951*4947cdc7SCole Faust			sc.readRune()
952*4947cdc7SCole Faust			c = sc.peekRune()
953*4947cdc7SCole Faust			if !isodigit(c) {
954*4947cdc7SCole Faust				sc.error(sc.pos, "invalid octal literal")
955*4947cdc7SCole Faust			}
956*4947cdc7SCole Faust			for isodigit(c) {
957*4947cdc7SCole Faust				sc.readRune()
958*4947cdc7SCole Faust				c = sc.peekRune()
959*4947cdc7SCole Faust			}
960*4947cdc7SCole Faust		} else if c == 'b' || c == 'B' {
961*4947cdc7SCole Faust			// binary
962*4947cdc7SCole Faust			sc.readRune()
963*4947cdc7SCole Faust			c = sc.peekRune()
964*4947cdc7SCole Faust			if !isbdigit(c) {
965*4947cdc7SCole Faust				sc.error(sc.pos, "invalid binary literal")
966*4947cdc7SCole Faust			}
967*4947cdc7SCole Faust			for isbdigit(c) {
968*4947cdc7SCole Faust				sc.readRune()
969*4947cdc7SCole Faust				c = sc.peekRune()
970*4947cdc7SCole Faust			}
971*4947cdc7SCole Faust		} else {
972*4947cdc7SCole Faust			// float (or obsolete octal "0755")
973*4947cdc7SCole Faust			allzeros, octal := true, true
974*4947cdc7SCole Faust			for isdigit(c) {
975*4947cdc7SCole Faust				if c != '0' {
976*4947cdc7SCole Faust					allzeros = false
977*4947cdc7SCole Faust				}
978*4947cdc7SCole Faust				if c > '7' {
979*4947cdc7SCole Faust					octal = false
980*4947cdc7SCole Faust				}
981*4947cdc7SCole Faust				sc.readRune()
982*4947cdc7SCole Faust				c = sc.peekRune()
983*4947cdc7SCole Faust			}
984*4947cdc7SCole Faust			if c == '.' {
985*4947cdc7SCole Faust				fraction = true
986*4947cdc7SCole Faust			} else if c == 'e' || c == 'E' {
987*4947cdc7SCole Faust				exponent = true
988*4947cdc7SCole Faust			} else if octal && !allzeros {
989*4947cdc7SCole Faust				sc.endToken(val)
990*4947cdc7SCole Faust				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
991*4947cdc7SCole Faust			}
992*4947cdc7SCole Faust		}
993*4947cdc7SCole Faust	} else {
994*4947cdc7SCole Faust		// decimal
995*4947cdc7SCole Faust		for isdigit(c) {
996*4947cdc7SCole Faust			sc.readRune()
997*4947cdc7SCole Faust			c = sc.peekRune()
998*4947cdc7SCole Faust		}
999*4947cdc7SCole Faust
1000*4947cdc7SCole Faust		if c == '.' {
1001*4947cdc7SCole Faust			fraction = true
1002*4947cdc7SCole Faust		} else if c == 'e' || c == 'E' {
1003*4947cdc7SCole Faust			exponent = true
1004*4947cdc7SCole Faust		}
1005*4947cdc7SCole Faust	}
1006*4947cdc7SCole Faust
1007*4947cdc7SCole Faust	if fraction {
1008*4947cdc7SCole Faust		sc.readRune() // consume '.'
1009*4947cdc7SCole Faust		c = sc.peekRune()
1010*4947cdc7SCole Faust		for isdigit(c) {
1011*4947cdc7SCole Faust			sc.readRune()
1012*4947cdc7SCole Faust			c = sc.peekRune()
1013*4947cdc7SCole Faust		}
1014*4947cdc7SCole Faust
1015*4947cdc7SCole Faust		if c == 'e' || c == 'E' {
1016*4947cdc7SCole Faust			exponent = true
1017*4947cdc7SCole Faust		}
1018*4947cdc7SCole Faust	}
1019*4947cdc7SCole Faust
1020*4947cdc7SCole Faust	if exponent {
1021*4947cdc7SCole Faust		sc.readRune() // consume [eE]
1022*4947cdc7SCole Faust		c = sc.peekRune()
1023*4947cdc7SCole Faust		if c == '+' || c == '-' {
1024*4947cdc7SCole Faust			sc.readRune()
1025*4947cdc7SCole Faust			c = sc.peekRune()
1026*4947cdc7SCole Faust			if !isdigit(c) {
1027*4947cdc7SCole Faust				sc.error(sc.pos, "invalid float literal")
1028*4947cdc7SCole Faust			}
1029*4947cdc7SCole Faust		}
1030*4947cdc7SCole Faust		for isdigit(c) {
1031*4947cdc7SCole Faust			sc.readRune()
1032*4947cdc7SCole Faust			c = sc.peekRune()
1033*4947cdc7SCole Faust		}
1034*4947cdc7SCole Faust	}
1035*4947cdc7SCole Faust
1036*4947cdc7SCole Faust	sc.endToken(val)
1037*4947cdc7SCole Faust	if fraction || exponent {
1038*4947cdc7SCole Faust		var err error
1039*4947cdc7SCole Faust		val.float, err = strconv.ParseFloat(val.raw, 64)
1040*4947cdc7SCole Faust		if err != nil {
1041*4947cdc7SCole Faust			sc.error(sc.pos, "invalid float literal")
1042*4947cdc7SCole Faust		}
1043*4947cdc7SCole Faust		return FLOAT
1044*4947cdc7SCole Faust	} else {
1045*4947cdc7SCole Faust		var err error
1046*4947cdc7SCole Faust		s := val.raw
1047*4947cdc7SCole Faust		val.bigInt = nil
1048*4947cdc7SCole Faust		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
1049*4947cdc7SCole Faust			val.int, err = strconv.ParseInt(s[2:], 8, 64)
1050*4947cdc7SCole Faust		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
1051*4947cdc7SCole Faust			val.int, err = strconv.ParseInt(s[2:], 2, 64)
1052*4947cdc7SCole Faust		} else {
1053*4947cdc7SCole Faust			val.int, err = strconv.ParseInt(s, 0, 64)
1054*4947cdc7SCole Faust			if err != nil {
1055*4947cdc7SCole Faust				num := new(big.Int)
1056*4947cdc7SCole Faust				var ok bool
1057*4947cdc7SCole Faust				val.bigInt, ok = num.SetString(s, 0)
1058*4947cdc7SCole Faust				if ok {
1059*4947cdc7SCole Faust					err = nil
1060*4947cdc7SCole Faust				}
1061*4947cdc7SCole Faust			}
1062*4947cdc7SCole Faust		}
1063*4947cdc7SCole Faust		if err != nil {
1064*4947cdc7SCole Faust			sc.error(start, "invalid int literal")
1065*4947cdc7SCole Faust		}
1066*4947cdc7SCole Faust		return INT
1067*4947cdc7SCole Faust	}
1068*4947cdc7SCole Faust}
1069*4947cdc7SCole Faust
1070*4947cdc7SCole Faust// isIdent reports whether c is an identifier rune.
1071*4947cdc7SCole Faustfunc isIdent(c rune) bool {
1072*4947cdc7SCole Faust	return isdigit(c) || isIdentStart(c)
1073*4947cdc7SCole Faust}
1074*4947cdc7SCole Faust
// isIdentStart reports whether c may begin an identifier: an ASCII
// letter, an underscore, or any rune that unicode.IsLetter accepts.
func isIdentStart(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		c == '_':
		// Fast path for ASCII.
		return true
	}
	return unicode.IsLetter(c)
}
1081*4947cdc7SCole Faust
// isdigit reports whether c is an ASCII decimal digit.
func isdigit(c rune) bool {
	return c >= '0' && c <= '9'
}

// isodigit reports whether c is an ASCII octal digit.
func isodigit(c rune) bool {
	return c >= '0' && c <= '7'
}

// isxdigit reports whether c is an ASCII hexadecimal digit of either case.
func isxdigit(c rune) bool {
	switch {
	case isdigit(c):
		return true
	case c >= 'A' && c <= 'F':
		return true
	case c >= 'a' && c <= 'f':
		return true
	}
	return false
}

// isbdigit reports whether c is a binary digit.
func isbdigit(c rune) bool {
	switch c {
	case '0', '1':
		return true
	}
	return false
}
1086*4947cdc7SCole Faust
1087*4947cdc7SCole Faust// keywordToken records the special tokens for
1088*4947cdc7SCole Faust// strings that should not be treated as ordinary identifiers.
1089*4947cdc7SCole Faustvar keywordToken = map[string]Token{
1090*4947cdc7SCole Faust	"and":      AND,
1091*4947cdc7SCole Faust	"break":    BREAK,
1092*4947cdc7SCole Faust	"continue": CONTINUE,
1093*4947cdc7SCole Faust	"def":      DEF,
1094*4947cdc7SCole Faust	"elif":     ELIF,
1095*4947cdc7SCole Faust	"else":     ELSE,
1096*4947cdc7SCole Faust	"for":      FOR,
1097*4947cdc7SCole Faust	"if":       IF,
1098*4947cdc7SCole Faust	"in":       IN,
1099*4947cdc7SCole Faust	"lambda":   LAMBDA,
1100*4947cdc7SCole Faust	"load":     LOAD,
1101*4947cdc7SCole Faust	"not":      NOT,
1102*4947cdc7SCole Faust	"or":       OR,
1103*4947cdc7SCole Faust	"pass":     PASS,
1104*4947cdc7SCole Faust	"return":   RETURN,
1105*4947cdc7SCole Faust	"while":    WHILE,
1106*4947cdc7SCole Faust
1107*4947cdc7SCole Faust	// reserved words:
1108*4947cdc7SCole Faust	"as": ILLEGAL,
1109*4947cdc7SCole Faust	// "assert":   ILLEGAL, // heavily used by our tests
1110*4947cdc7SCole Faust	"class":    ILLEGAL,
1111*4947cdc7SCole Faust	"del":      ILLEGAL,
1112*4947cdc7SCole Faust	"except":   ILLEGAL,
1113*4947cdc7SCole Faust	"finally":  ILLEGAL,
1114*4947cdc7SCole Faust	"from":     ILLEGAL,
1115*4947cdc7SCole Faust	"global":   ILLEGAL,
1116*4947cdc7SCole Faust	"import":   ILLEGAL,
1117*4947cdc7SCole Faust	"is":       ILLEGAL,
1118*4947cdc7SCole Faust	"nonlocal": ILLEGAL,
1119*4947cdc7SCole Faust	"raise":    ILLEGAL,
1120*4947cdc7SCole Faust	"try":      ILLEGAL,
1121*4947cdc7SCole Faust	"with":     ILLEGAL,
1122*4947cdc7SCole Faust	"yield":    ILLEGAL,
1123*4947cdc7SCole Faust}
1124