1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package parse
6
7import (
8	"fmt"
9	"strings"
10	"unicode"
11	"unicode/utf8"
12)
13
14// item represents a token or text string returned from the scanner.
15type item struct {
16	typ  itemType // The type of this item.
17	pos  Pos      // The starting position, in bytes, of this item in the input string.
18	val  string   // The value of this item.
19	line int      // The line number at the start of this item.
20}
21
22func (i item) String() string {
23	switch {
24	case i.typ == itemEOF:
25		return "EOF"
26	case i.typ == itemError:
27		return i.val
28	case i.typ > itemKeyword:
29		return fmt.Sprintf("<%s>", i.val)
30	case len(i.val) > 10:
31		return fmt.Sprintf("%.10q...", i.val)
32	}
33	return fmt.Sprintf("%q", i.val)
34}
35
36// itemType identifies the type of lex items.
37type itemType int
38
39const (
40	itemError        itemType = iota // error occurred; value is text of error
41	itemBool                         // boolean constant
42	itemChar                         // printable ASCII character; grab bag for comma etc.
43	itemCharConstant                 // character constant
44	itemComment                      // comment text
45	itemComplex                      // complex constant (1+2i); imaginary is just a number
46	itemAssign                       // equals ('=') introducing an assignment
47	itemDeclare                      // colon-equals (':=') introducing a declaration
48	itemEOF
49	itemField      // alphanumeric identifier starting with '.'
50	itemIdentifier // alphanumeric identifier not starting with '.'
51	itemLeftDelim  // left action delimiter
52	itemLeftParen  // '(' inside action
53	itemNumber     // simple number, including imaginary
54	itemPipe       // pipe symbol
55	itemRawString  // raw quoted string (includes quotes)
56	itemRightDelim // right action delimiter
57	itemRightParen // ')' inside action
58	itemSpace      // run of spaces separating arguments
59	itemString     // quoted string (includes quotes)
60	itemText       // plain text
61	itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'
62	// Keywords appear after all the rest.
63	itemKeyword  // used only to delimit the keywords
64	itemBlock    // block keyword
65	itemBreak    // break keyword
66	itemContinue // continue keyword
67	itemDot      // the cursor, spelled '.'
68	itemDefine   // define keyword
69	itemElse     // else keyword
70	itemEnd      // end keyword
71	itemIf       // if keyword
72	itemNil      // the untyped nil constant, easiest to treat as a keyword
73	itemRange    // range keyword
74	itemTemplate // template keyword
75	itemWith     // with keyword
76)
77
78var key = map[string]itemType{
79	".":        itemDot,
80	"block":    itemBlock,
81	"break":    itemBreak,
82	"continue": itemContinue,
83	"define":   itemDefine,
84	"else":     itemElse,
85	"end":      itemEnd,
86	"if":       itemIf,
87	"range":    itemRange,
88	"nil":      itemNil,
89	"template": itemTemplate,
90	"with":     itemWith,
91}
92
93const eof = -1
94
95// Trimming spaces.
96// If the action begins "{{- " rather than "{{", then all space/tab/newlines
97// preceding the action are trimmed; conversely if it ends " -}}" the
98// leading spaces are trimmed. This is done entirely in the lexer; the
99// parser never sees it happen. We require an ASCII space (' ', \t, \r, \n)
100// to be present to avoid ambiguity with things like "{{-3}}". It reads
101// better with the space present anyway. For simplicity, only ASCII
102// does the job.
103const (
104	spaceChars    = " \t\r\n"  // These are the space characters defined by Go itself.
105	trimMarker    = '-'        // Attached to left/right delimiter, trims trailing spaces from preceding/following text.
106	trimMarkerLen = Pos(1 + 1) // marker plus space before or after
107)
108
109// stateFn represents the state of the scanner as a function that returns the next state.
110type stateFn func(*lexer) stateFn
111
112// lexer holds the state of the scanner.
113type lexer struct {
114	name         string // the name of the input; used only for error reports
115	input        string // the string being scanned
116	leftDelim    string // start of action marker
117	rightDelim   string // end of action marker
118	pos          Pos    // current position in the input
119	start        Pos    // start position of this item
120	atEOF        bool   // we have hit the end of input and returned eof
121	parenDepth   int    // nesting depth of ( ) exprs
122	line         int    // 1+number of newlines seen
123	startLine    int    // start line of this item
124	item         item   // item to return to parser
125	insideAction bool   // are we inside an action?
126	options      lexOptions
127}
128
129// lexOptions control behavior of the lexer. All default to false.
130type lexOptions struct {
131	emitComment bool // emit itemComment tokens.
132	breakOK     bool // break keyword allowed
133	continueOK  bool // continue keyword allowed
134}
135
136// next returns the next rune in the input.
137func (l *lexer) next() rune {
138	if int(l.pos) >= len(l.input) {
139		l.atEOF = true
140		return eof
141	}
142	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
143	l.pos += Pos(w)
144	if r == '\n' {
145		l.line++
146	}
147	return r
148}
149
150// peek returns but does not consume the next rune in the input.
151func (l *lexer) peek() rune {
152	r := l.next()
153	l.backup()
154	return r
155}
156
157// backup steps back one rune.
158func (l *lexer) backup() {
159	if !l.atEOF && l.pos > 0 {
160		r, w := utf8.DecodeLastRuneInString(l.input[:l.pos])
161		l.pos -= Pos(w)
162		// Correct newline count.
163		if r == '\n' {
164			l.line--
165		}
166	}
167}
168
169// thisItem returns the item at the current input point with the specified type
170// and advances the input.
171func (l *lexer) thisItem(t itemType) item {
172	i := item{t, l.start, l.input[l.start:l.pos], l.startLine}
173	l.start = l.pos
174	l.startLine = l.line
175	return i
176}
177
178// emit passes the trailing text as an item back to the parser.
179func (l *lexer) emit(t itemType) stateFn {
180	return l.emitItem(l.thisItem(t))
181}
182
183// emitItem passes the specified item to the parser.
184func (l *lexer) emitItem(i item) stateFn {
185	l.item = i
186	return nil
187}
188
189// ignore skips over the pending input before this point.
190// It tracks newlines in the ignored text, so use it only
191// for text that is skipped without calling l.next.
192func (l *lexer) ignore() {
193	l.line += strings.Count(l.input[l.start:l.pos], "\n")
194	l.start = l.pos
195	l.startLine = l.line
196}
197
198// accept consumes the next rune if it's from the valid set.
199func (l *lexer) accept(valid string) bool {
200	if strings.ContainsRune(valid, l.next()) {
201		return true
202	}
203	l.backup()
204	return false
205}
206
207// acceptRun consumes a run of runes from the valid set.
208func (l *lexer) acceptRun(valid string) {
209	for strings.ContainsRune(valid, l.next()) {
210	}
211	l.backup()
212}
213
214// errorf returns an error token and terminates the scan by passing
215// back a nil pointer that will be the next state, terminating l.nextItem.
216func (l *lexer) errorf(format string, args ...any) stateFn {
217	l.item = item{itemError, l.start, fmt.Sprintf(format, args...), l.startLine}
218	l.start = 0
219	l.pos = 0
220	l.input = l.input[:0]
221	return nil
222}
223
224// nextItem returns the next item from the input.
225// Called by the parser, not in the lexing goroutine.
226func (l *lexer) nextItem() item {
227	l.item = item{itemEOF, l.pos, "EOF", l.startLine}
228	state := lexText
229	if l.insideAction {
230		state = lexInsideAction
231	}
232	for {
233		state = state(l)
234		if state == nil {
235			return l.item
236		}
237	}
238}
239
240// lex creates a new scanner for the input string.
241func lex(name, input, left, right string) *lexer {
242	if left == "" {
243		left = leftDelim
244	}
245	if right == "" {
246		right = rightDelim
247	}
248	l := &lexer{
249		name:         name,
250		input:        input,
251		leftDelim:    left,
252		rightDelim:   right,
253		line:         1,
254		startLine:    1,
255		insideAction: false,
256	}
257	return l
258}
259
260// state functions
261
262const (
263	leftDelim    = "{{"
264	rightDelim   = "}}"
265	leftComment  = "/*"
266	rightComment = "*/"
267)
268
269// lexText scans until an opening action delimiter, "{{".
270func lexText(l *lexer) stateFn {
271	if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
272		if x > 0 {
273			l.pos += Pos(x)
274			// Do we trim any trailing space?
275			trimLength := Pos(0)
276			delimEnd := l.pos + Pos(len(l.leftDelim))
277			if hasLeftTrimMarker(l.input[delimEnd:]) {
278				trimLength = rightTrimLength(l.input[l.start:l.pos])
279			}
280			l.pos -= trimLength
281			l.line += strings.Count(l.input[l.start:l.pos], "\n")
282			i := l.thisItem(itemText)
283			l.pos += trimLength
284			l.ignore()
285			if len(i.val) > 0 {
286				return l.emitItem(i)
287			}
288		}
289		return lexLeftDelim
290	}
291	l.pos = Pos(len(l.input))
292	// Correctly reached EOF.
293	if l.pos > l.start {
294		l.line += strings.Count(l.input[l.start:l.pos], "\n")
295		return l.emit(itemText)
296	}
297	return l.emit(itemEOF)
298}
299
300// rightTrimLength returns the length of the spaces at the end of the string.
301func rightTrimLength(s string) Pos {
302	return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
303}
304
305// atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
306func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
307	if hasRightTrimMarker(l.input[l.pos:]) && strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) { // With trim marker.
308		return true, true
309	}
310	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { // Without trim marker.
311		return true, false
312	}
313	return false, false
314}
315
316// leftTrimLength returns the length of the spaces at the beginning of the string.
317func leftTrimLength(s string) Pos {
318	return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
319}
320
321// lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
322// (The text to be trimmed has already been emitted.)
323func lexLeftDelim(l *lexer) stateFn {
324	l.pos += Pos(len(l.leftDelim))
325	trimSpace := hasLeftTrimMarker(l.input[l.pos:])
326	afterMarker := Pos(0)
327	if trimSpace {
328		afterMarker = trimMarkerLen
329	}
330	if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
331		l.pos += afterMarker
332		l.ignore()
333		return lexComment
334	}
335	i := l.thisItem(itemLeftDelim)
336	l.insideAction = true
337	l.pos += afterMarker
338	l.ignore()
339	l.parenDepth = 0
340	return l.emitItem(i)
341}
342
343// lexComment scans a comment. The left comment marker is known to be present.
344func lexComment(l *lexer) stateFn {
345	l.pos += Pos(len(leftComment))
346	x := strings.Index(l.input[l.pos:], rightComment)
347	if x < 0 {
348		return l.errorf("unclosed comment")
349	}
350	l.pos += Pos(x + len(rightComment))
351	delim, trimSpace := l.atRightDelim()
352	if !delim {
353		return l.errorf("comment ends before closing delimiter")
354	}
355	i := l.thisItem(itemComment)
356	if trimSpace {
357		l.pos += trimMarkerLen
358	}
359	l.pos += Pos(len(l.rightDelim))
360	if trimSpace {
361		l.pos += leftTrimLength(l.input[l.pos:])
362	}
363	l.ignore()
364	if l.options.emitComment {
365		return l.emitItem(i)
366	}
367	return lexText
368}
369
370// lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
371func lexRightDelim(l *lexer) stateFn {
372	_, trimSpace := l.atRightDelim()
373	if trimSpace {
374		l.pos += trimMarkerLen
375		l.ignore()
376	}
377	l.pos += Pos(len(l.rightDelim))
378	i := l.thisItem(itemRightDelim)
379	if trimSpace {
380		l.pos += leftTrimLength(l.input[l.pos:])
381		l.ignore()
382	}
383	l.insideAction = false
384	return l.emitItem(i)
385}
386
387// lexInsideAction scans the elements inside action delimiters.
388func lexInsideAction(l *lexer) stateFn {
389	// Either number, quoted string, or identifier.
390	// Spaces separate arguments; runs of spaces turn into itemSpace.
391	// Pipe symbols separate and are emitted.
392	delim, _ := l.atRightDelim()
393	if delim {
394		if l.parenDepth == 0 {
395			return lexRightDelim
396		}
397		return l.errorf("unclosed left paren")
398	}
399	switch r := l.next(); {
400	case r == eof:
401		return l.errorf("unclosed action")
402	case isSpace(r):
403		l.backup() // Put space back in case we have " -}}".
404		return lexSpace
405	case r == '=':
406		return l.emit(itemAssign)
407	case r == ':':
408		if l.next() != '=' {
409			return l.errorf("expected :=")
410		}
411		return l.emit(itemDeclare)
412	case r == '|':
413		return l.emit(itemPipe)
414	case r == '"':
415		return lexQuote
416	case r == '`':
417		return lexRawQuote
418	case r == '$':
419		return lexVariable
420	case r == '\'':
421		return lexChar
422	case r == '.':
423		// special look-ahead for ".field" so we don't break l.backup().
424		if l.pos < Pos(len(l.input)) {
425			r := l.input[l.pos]
426			if r < '0' || '9' < r {
427				return lexField
428			}
429		}
430		fallthrough // '.' can start a number.
431	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
432		l.backup()
433		return lexNumber
434	case isAlphaNumeric(r):
435		l.backup()
436		return lexIdentifier
437	case r == '(':
438		l.parenDepth++
439		return l.emit(itemLeftParen)
440	case r == ')':
441		l.parenDepth--
442		if l.parenDepth < 0 {
443			return l.errorf("unexpected right paren")
444		}
445		return l.emit(itemRightParen)
446	case r <= unicode.MaxASCII && unicode.IsPrint(r):
447		return l.emit(itemChar)
448	default:
449		return l.errorf("unrecognized character in action: %#U", r)
450	}
451}
452
453// lexSpace scans a run of space characters.
454// We have not consumed the first space, which is known to be present.
455// Take care if there is a trim-marked right delimiter, which starts with a space.
456func lexSpace(l *lexer) stateFn {
457	var r rune
458	var numSpaces int
459	for {
460		r = l.peek()
461		if !isSpace(r) {
462			break
463		}
464		l.next()
465		numSpaces++
466	}
467	// Be careful about a trim-marked closing delimiter, which has a minus
468	// after a space. We know there is a space, so check for the '-' that might follow.
469	if hasRightTrimMarker(l.input[l.pos-1:]) && strings.HasPrefix(l.input[l.pos-1+trimMarkerLen:], l.rightDelim) {
470		l.backup() // Before the space.
471		if numSpaces == 1 {
472			return lexRightDelim // On the delim, so go right to that.
473		}
474	}
475	return l.emit(itemSpace)
476}
477
478// lexIdentifier scans an alphanumeric.
479func lexIdentifier(l *lexer) stateFn {
480	for {
481		switch r := l.next(); {
482		case isAlphaNumeric(r):
483			// absorb.
484		default:
485			l.backup()
486			word := l.input[l.start:l.pos]
487			if !l.atTerminator() {
488				return l.errorf("bad character %#U", r)
489			}
490			switch {
491			case key[word] > itemKeyword:
492				item := key[word]
493				if item == itemBreak && !l.options.breakOK || item == itemContinue && !l.options.continueOK {
494					return l.emit(itemIdentifier)
495				}
496				return l.emit(item)
497			case word[0] == '.':
498				return l.emit(itemField)
499			case word == "true", word == "false":
500				return l.emit(itemBool)
501			default:
502				return l.emit(itemIdentifier)
503			}
504		}
505	}
506}
507
508// lexField scans a field: .Alphanumeric.
509// The . has been scanned.
510func lexField(l *lexer) stateFn {
511	return lexFieldOrVariable(l, itemField)
512}
513
514// lexVariable scans a Variable: $Alphanumeric.
515// The $ has been scanned.
516func lexVariable(l *lexer) stateFn {
517	if l.atTerminator() { // Nothing interesting follows -> "$".
518		return l.emit(itemVariable)
519	}
520	return lexFieldOrVariable(l, itemVariable)
521}
522
523// lexFieldOrVariable scans a field or variable: [.$]Alphanumeric.
524// The . or $ has been scanned.
525func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
526	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
527		if typ == itemVariable {
528			return l.emit(itemVariable)
529		}
530		return l.emit(itemDot)
531	}
532	var r rune
533	for {
534		r = l.next()
535		if !isAlphaNumeric(r) {
536			l.backup()
537			break
538		}
539	}
540	if !l.atTerminator() {
541		return l.errorf("bad character %#U", r)
542	}
543	return l.emit(typ)
544}
545
546// atTerminator reports whether the input is at valid termination character to
547// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
548// like "$x+2" not being acceptable without a space, in case we decide one
549// day to implement arithmetic.
550func (l *lexer) atTerminator() bool {
551	r := l.peek()
552	if isSpace(r) {
553		return true
554	}
555	switch r {
556	case eof, '.', ',', '|', ':', ')', '(':
557		return true
558	}
559	return strings.HasPrefix(l.input[l.pos:], l.rightDelim)
560}
561
562// lexChar scans a character constant. The initial quote is already
563// scanned. Syntax checking is done by the parser.
564func lexChar(l *lexer) stateFn {
565Loop:
566	for {
567		switch l.next() {
568		case '\\':
569			if r := l.next(); r != eof && r != '\n' {
570				break
571			}
572			fallthrough
573		case eof, '\n':
574			return l.errorf("unterminated character constant")
575		case '\'':
576			break Loop
577		}
578	}
579	return l.emit(itemCharConstant)
580}
581
582// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
583// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
584// and "089" - but when it's wrong the input is invalid and the parser (via
585// strconv) will notice.
586func lexNumber(l *lexer) stateFn {
587	if !l.scanNumber() {
588		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
589	}
590	if sign := l.peek(); sign == '+' || sign == '-' {
591		// Complex: 1+2i. No spaces, must end in 'i'.
592		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
593			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
594		}
595		return l.emit(itemComplex)
596	}
597	return l.emit(itemNumber)
598}
599
600func (l *lexer) scanNumber() bool {
601	// Optional leading sign.
602	l.accept("+-")
603	// Is it hex?
604	digits := "0123456789_"
605	if l.accept("0") {
606		// Note: Leading 0 does not mean octal in floats.
607		if l.accept("xX") {
608			digits = "0123456789abcdefABCDEF_"
609		} else if l.accept("oO") {
610			digits = "01234567_"
611		} else if l.accept("bB") {
612			digits = "01_"
613		}
614	}
615	l.acceptRun(digits)
616	if l.accept(".") {
617		l.acceptRun(digits)
618	}
619	if len(digits) == 10+1 && l.accept("eE") {
620		l.accept("+-")
621		l.acceptRun("0123456789_")
622	}
623	if len(digits) == 16+6+1 && l.accept("pP") {
624		l.accept("+-")
625		l.acceptRun("0123456789_")
626	}
627	// Is it imaginary?
628	l.accept("i")
629	// Next thing mustn't be alphanumeric.
630	if isAlphaNumeric(l.peek()) {
631		l.next()
632		return false
633	}
634	return true
635}
636
637// lexQuote scans a quoted string.
638func lexQuote(l *lexer) stateFn {
639Loop:
640	for {
641		switch l.next() {
642		case '\\':
643			if r := l.next(); r != eof && r != '\n' {
644				break
645			}
646			fallthrough
647		case eof, '\n':
648			return l.errorf("unterminated quoted string")
649		case '"':
650			break Loop
651		}
652	}
653	return l.emit(itemString)
654}
655
656// lexRawQuote scans a raw quoted string.
657func lexRawQuote(l *lexer) stateFn {
658Loop:
659	for {
660		switch l.next() {
661		case eof:
662			return l.errorf("unterminated raw quoted string")
663		case '`':
664			break Loop
665		}
666	}
667	return l.emit(itemRawString)
668}
669
670// isSpace reports whether r is a space character.
671func isSpace(r rune) bool {
672	return r == ' ' || r == '\t' || r == '\r' || r == '\n'
673}
674
675// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
676func isAlphaNumeric(r rune) bool {
677	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
678}
679
680func hasLeftTrimMarker(s string) bool {
681	return len(s) >= 2 && s[0] == trimMarker && isSpace(rune(s[1]))
682}
683
684func hasRightTrimMarker(s string) bool {
685	return len(s) >= 2 && isSpace(rune(s[0])) && s[1] == trimMarker
686}
687