// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
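//
// A typical use looks like the following (a minimal sketch; the file
// name, the source slice src, and the output format are illustrative):
//
//	var s Scanner
//	fset := token.NewFileSet() // positions are relative to fset
//	file := fset.AddFile("hello.go", fset.Base(), len(src))
//	s.Init(file, src, nil /* no error handler */, ScanComments)
//	for {
//		pos, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		fmt.Printf("%s\t%s\t%q\n", fset.Position(pos), tok, lit)
//	}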
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to [Scanner.Init]. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
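//
// For example, a handler that counts errors and reports them on
// standard error might look like this (a sketch; the names eh and
// count are illustrative):
//
//	var count int
//	eh := func(pos token.Position, msg string) {
//		count++
//		fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
//	}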
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via [Scanner.Init] before use.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune      // current character
	offset     int       // character offset
	rdOffset   int       // reading offset (position after current character)
	lineOffset int       // current line offset
	insertSemi bool      // insert a semicolon before next newline
	nlPos      token.Pos // position of newline in preceding comment

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file
)

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// For optimization, there is some overlap between this method and
// s.scanIdentifier.
func (s *Scanner) next() {
	if s.rdOffset < len(s.src) {
		s.offset = s.rdOffset
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.rdOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
			if r == utf8.RuneError && w == 1 {
				s.error(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.rdOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = eof
	}
}

// peek returns the byte following the most recently read character without
// advancing the scanner. If the scanner is at EOF, peek returns 0.
func (s *Scanner) peek() byte {
	if s.rdOffset < len(s.src) {
		return s.src[s.rdOffset]
	}
	return 0
}

// A Mode value is a set of flags (or 0)
// that controls scanner behavior.
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)

// Init prepares the scanner s to tokenize the text src by setting the
// scanner at the beginning of src. The scanner uses the file set file
// for position information and it adds line information for each line.
// It is ok to re-use the same file when re-scanning the same source, as
// line information that is already present is ignored. Init causes a
// panic if the file size does not match the src size.
//
// Calls to [Scanner.Scan] will invoke the error handler err if they encounter a
// syntax error and err is not nil. Also, for each error encountered,
// the [Scanner] field ErrorCount is incremented by one. The mode parameter
// determines how comments are handled.
//
// Note that Init may call err if there is an error in the first character
// of the file.
func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
	// Explicitly initialize all fields since a scanner may be reused.
	if file.Size() != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
	}
	s.file = file
	s.dir, _ = filepath.Split(file.Name())
	s.src = src
	s.err = err
	s.mode = mode

	s.ch = ' '
	s.offset = 0
	s.rdOffset = 0
	s.lineOffset = 0
	s.insertSemi = false
	s.ErrorCount = 0

	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}
}

func (s *Scanner) error(offs int, msg string) {
	if s.err != nil {
		s.err(s.file.Position(s.file.Pos(offs)), msg)
	}
	s.ErrorCount++
}

func (s *Scanner) errorf(offs int, format string, args ...any) {
	s.error(offs, fmt.Sprintf(format, args...))
}

// scanComment returns the text of the comment and (if nonzero)
// the offset of the first newline within it, which implies a
// /*...*/ comment.
func (s *Scanner) scanComment() (string, int) {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means invalid comment
	numCR := 0
	nlOffset := 0 // offset of first newline within /*...*/ comment

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		} else if ch == '\n' && nlOffset == 0 {
			nlOffset = s.offset
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			next = s.offset
			goto exit
		}
	}

	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a (//-comment) line may end in "\r\n".
	// Remove the final '\r' before analyzing the text for
	// line directives (matching the compiler). Remove any
	// other '\r' afterwards (matching the pre-existing
	// behavior of the scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// interpret line directives
	// (//line directives must start at the beginning of the current line)
	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit), nlOffset
}

var prefix = []byte("line ")

// updateLineInfo parses the incoming comment text at offset offs
// as a line directive. If successful, it updates the line info table
// for the position next per the line directive.
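//
// Line directives take one of the following forms (the file names
// here are illustrative):
//
//	//line users.go:42
//	//line users.go:42:7
//	/*line generated.go:1:1*/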
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7

	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // ignore (not a line directive)
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// Put a cap on the maximum size of line and column numbers.
	// 30 bits allows for some additional space before wrapping an int32.
	// Keep this consistent with cmd/compile/internal/syntax.PosMax.
	const maxLineCol = 1 << 30
	var line, col int
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		//line filename:line:col
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 || col > maxLineCol {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		//line filename:line
		line = n
	}

	if line == 0 || line > maxLineCol {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// If we have a column (//line filename:line:col form),
	// an empty filename means to use the previous filename.
	filename := string(text[:i-1]) // lop off ":line"
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the current directory.
		// This is for compatibility with earlier releases.
		// See issue 26671.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}

func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
	if i < 0 {
		return 0, 0, false // no ":"
	}
	// i >= 0
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	return i + 1, int(n), err == nil
}

func isLetter(ch rune) bool {
	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}

func isDigit(ch rune) bool {
	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}

// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}

func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= lower(ch) && lower(ch) <= 'f':
		return int(lower(ch) - 'a' + 10)
	}
	return 16 // larger than any legal digit val
}

func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }

// digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records
// the offset (relative to the source start) of a digit >= base
// in *invalid, if *invalid < 0.
// digits returns a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set).
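//
// For example, scanning the decimal digits of "4_2" yields digsep == 3
// (a digit and a separator were seen), while "42" yields digsep == 1.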
func (s *Scanner) digits(base int, invalid *int) (digsep int) {
	if base <= 10 {
		max := rune('0' + base)
		for isDecimal(s.ch) || s.ch == '_' {
			ds := 1
			if s.ch == '_' {
				ds = 2
			} else if s.ch >= max && *invalid < 0 {
				*invalid = s.offset // record invalid rune offset
			}
			digsep |= ds
			s.next()
		}
	} else {
		for isHex(s.ch) || s.ch == '_' {
			ds := 1
			if s.ch == '_' {
				ds = 2
			}
			digsep |= ds
			s.next()
		}
	}
	return
}

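// scanNumber scans an integer, floating-point, or imaginary literal
// such as 42, 0x2A, 0o17, 0b101, 1_000, 3.14, 1e9, 0x1p-2, or 2i,
// reporting malformed literals via s.error.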
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}

func litname(prefix rune) string {
	switch prefix {
	case 'x':
		return "hexadecimal literal"
	case 'o', '0':
		return "octal literal"
	case 'b':
		return "binary literal"
	}
	return "decimal literal"
}

// invalidSep returns the index of the first invalid separator in x, or -1.
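// For example, invalidSep("1__2") == 2, invalidSep("42_") == 2, and
// invalidSep("0x_1") == -1.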
func invalidSep(x string) int {
	x1 := ' ' // prefix char, we only care if it's 'x'
	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
	i := 0

	// a prefix counts as a digit
	if len(x) >= 2 && x[0] == '0' {
		x1 = lower(rune(x[1]))
		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
			d = '0'
			i = 2
		}
	}

	// mantissa and exponent
	for ; i < len(x); i++ {
		p := d // previous digit
		d = rune(x[i])
		switch {
		case d == '_':
			if p != '0' {
				return i
			}
		case isDecimal(d) || x1 == 'x' && isHex(d):
			d = '0'
		default:
			if p == '_' {
				return i - 1
			}
			d = '.'
		}
	}
	if d == '_' {
		return len(x) - 1
	}

	return -1
}

// scanEscape parses an escape sequence; quote is the accepted
// escaped quote character. In case of a syntax error, it stops at the
// offending character (without consuming it) and returns false.
// Otherwise it returns true.
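//
// For example (with quote == '\''), the accepted forms include:
//
//	\n \t \\ \'         // single-character escapes
//	\101                // octal, exactly three digits
//	\x41                // hexadecimal, exactly two digits
//	\u00e9              // exactly four hex digits
//	\U0001F600          // exactly eight hex digits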
func (s *Scanner) scanEscape(quote rune) bool {
	offs := s.offset

	var n int
	var base, max uint32
	switch s.ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		s.next()
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.error(offs, msg)
		return false
	}

	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
			if s.ch < 0 {
				msg = "escape sequence not terminated"
			}
			s.error(s.offset, msg)
			return false
		}
		x = x*base + d
		s.next()
		n--
	}

	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
		return false
	}

	return true
}

func (s *Scanner) scanRune() string {
	// '\'' opening already consumed
	offs := s.offset - 1

	valid := true
	n := 0
	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			// only report error if we don't have one already
			if valid {
				s.error(offs, "rune literal not terminated")
				valid = false
			}
			break
		}
		s.next()
		if ch == '\'' {
			break
		}
		n++
		if ch == '\\' {
			if !s.scanEscape('\'') {
				valid = false
			}
			// continue to read to closing quote
		}
	}

	if valid && n != 1 {
		s.error(offs, "illegal rune literal")
	}

	return string(s.src[offs:s.offset])
}

func (s *Scanner) scanString() string {
	// '"' opening already consumed
	offs := s.offset - 1

	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			s.error(offs, "string literal not terminated")
			break
		}
		s.next()
		if ch == '"' {
			break
		}
		if ch == '\\' {
			s.scanEscape('"')
		}
	}

	return string(s.src[offs:s.offset])
}

func stripCR(b []byte, comment bool) []byte {
	c := make([]byte, len(b))
	i := 0
	for j, ch := range b {
		// In a /*-style comment, don't strip \r from *\r/ (incl.
		// sequences of \r from *\r\r...\r/) since the resulting
		// */ would terminate the comment too early unless the \r
		// is immediately following the opening /* in which case
		// it's ok because /*/ is not closed yet (issue #11151).
		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}

func (s *Scanner) scanRawString() string {
	// '`' opening already consumed
	offs := s.offset - 1

	hasCR := false
	for {
		ch := s.ch
		if ch < 0 {
			s.error(offs, "raw string literal not terminated")
			break
		}
		s.next()
		if ch == '`' {
			break
		}
		if ch == '\r' {
			hasCR = true
		}
	}

	lit := s.src[offs:s.offset]
	if hasCR {
		lit = stripCR(lit, false)
	}

	return string(lit)
}

func (s *Scanner) skipWhitespace() {
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
		s.next()
	}
}

// Helper functions for scanning multi-byte tokens such as >> += >>= .
// Different routines recognize tokens of different lengths based on the
// characters that follow. If the next character is '=', the result is
// tok1 (or tok3 when '=' follows ch2 in switch4). Otherwise the result
// is tok0 if no other character matched, or tok2 if ch2 matched.
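//
// For example, for '<' the call
//
//	s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
//
// yields LSS for "<", LEQ for "<=", SHL for "<<", and SHL_ASSIGN for "<<=".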

func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}

func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		return tok2
	}
	return tok0
}

func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		if s.ch == '=' {
			s.next()
			return tok3
		}
		return tok2
	}
	return tok0
}

// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// [token.EOF].
//
// If the returned token is a literal ([token.IDENT], [token.INT], [token.FLOAT],
// [token.IMAG], [token.CHAR], [token.STRING]) or [token.COMMENT], the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is [token.SEMICOLON], the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is [token.ILLEGAL], the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	if s.nlPos.IsValid() {
		// Return artificial ';' token after /*...*/ comment
		// containing newline, at position of first newline.
		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
		s.nlPos = token.NoPos
		return
	}

	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case eof:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				comment, nlOffset := s.scanComment()
				if s.insertSemi && nlOffset != 0 {
					// For /*...*/ containing \n, return
					// COMMENT then artificial SEMICOLON.
					s.nlPos = s.file.Pos(nlOffset)
					s.insertSemi = false
				} else {
					insertSemi = s.insertSemi // preserve insertSemi info
				}
				if s.mode&ScanComments == 0 {
					// skip comment
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				// Report an informative error for U+201[CD] quotation
				// marks, which are easily introduced via copy and paste.
				if ch == '“' || ch == '”' {
					s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
				} else {
					s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
				}
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}