// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package parse

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// item represents a token or text string returned from the scanner.
type item struct {
	typ  itemType // The type of this item.
	pos  Pos      // The starting position, in bytes, of this item in the input string.
	val  string   // The value of this item.
	line int      // The line number at the start of this item.
}

func (i item) String() string {
	switch {
	case i.typ == itemEOF:
		return "EOF"
	case i.typ == itemError:
		return i.val
	case i.typ > itemKeyword:
		return fmt.Sprintf("<%s>", i.val)
	case len(i.val) > 10:
		return fmt.Sprintf("%.10q...", i.val)
	}
	return fmt.Sprintf("%q", i.val)
}

// itemType identifies the type of lex items.
type itemType int

const (
	itemError        itemType = iota // error occurred; value is text of error
	itemBool                         // boolean constant
	itemChar                         // printable ASCII character; grab bag for comma etc.
	itemCharConstant                 // character constant
	itemComment                      // comment text
	itemComplex                      // complex constant (1+2i); imaginary is just a number
	itemAssign                       // equals ('=') introducing an assignment
	itemDeclare                      // colon-equals (':=') introducing a declaration
	itemEOF
	itemField      // alphanumeric identifier starting with '.'
	itemIdentifier // alphanumeric identifier not starting with '.'
	itemLeftDelim  // left action delimiter
	itemLeftParen  // '(' inside action
	itemNumber     // simple number, including imaginary
	itemPipe       // pipe symbol
	itemRawString  // raw quoted string (includes quotes)
	itemRightDelim // right action delimiter
	itemRightParen // ')' inside action
	itemSpace      // run of spaces separating arguments
	itemString     // quoted string (includes quotes)
	itemText       // plain text
	itemVariable   // variable starting with '$', such as '$' or '$1' or '$hello'
	// Keywords appear after all the rest.
	itemKeyword  // used only to delimit the keywords
	itemBlock    // block keyword
	itemBreak    // break keyword
	itemContinue // continue keyword
	itemDot      // the cursor, spelled '.'
	itemDefine   // define keyword
	itemElse     // else keyword
	itemEnd      // end keyword
	itemIf       // if keyword
	itemNil      // the untyped nil constant, easiest to treat as a keyword
	itemRange    // range keyword
	itemTemplate // template keyword
	itemWith     // with keyword
)

var key = map[string]itemType{
	".":        itemDot,
	"block":    itemBlock,
	"break":    itemBreak,
	"continue": itemContinue,
	"define":   itemDefine,
	"else":     itemElse,
	"end":      itemEnd,
	"if":       itemIf,
	"range":    itemRange,
	"nil":      itemNil,
	"template": itemTemplate,
	"with":     itemWith,
}

const eof = -1

// Trimming spaces.
// If the action begins "{{- " rather than "{{", then all space/tab/newlines
// preceding the action are trimmed; conversely, if it ends " -}}", the
// leading spaces of the text that follows are trimmed. This is done entirely
// in the lexer; the parser never sees it happen. We require an ASCII space
// (' ', \t, \r, \n) to be present to avoid ambiguity with things like
// "{{-3}}". It reads better with the space present anyway. For simplicity,
// only ASCII spaces do the job.
const (
	spaceChars    = " \t\r\n"  // These are the space characters defined by Go itself.
	trimMarker    = '-'        // Attached to left/right delimiter; trims trailing spaces from preceding text or leading spaces from following text.
	trimMarkerLen = Pos(1 + 1) // marker plus space before or after
)

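// For instance (this example also appears in the text/template package
// documentation), the template source "{{23 -}} < {{- 45}}" produces the
// output "23<45": the " -}}" trims the space before "<" and the "{{- "
// trims the space after it.
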
// stateFn represents the state of the scanner as a function that returns the next state.
type stateFn func(*lexer) stateFn

// lexer holds the state of the scanner.
type lexer struct {
	name         string // the name of the input; used only for error reports
	input        string // the string being scanned
	leftDelim    string // start of action marker
	rightDelim   string // end of action marker
	pos          Pos    // current position in the input
	start        Pos    // start position of this item
	atEOF        bool   // we have hit the end of input and returned eof
	parenDepth   int    // nesting depth of ( ) exprs
	line         int    // 1+number of newlines seen
	startLine    int    // start line of this item
	item         item   // item to return to parser
	insideAction bool   // are we inside an action?
	options      lexOptions
}

// lexOptions control behavior of the lexer. All default to false.
type lexOptions struct {
	emitComment bool // emit itemComment tokens.
	breakOK     bool // break keyword allowed
	continueOK  bool // continue keyword allowed
}

// next returns the next rune in the input.
func (l *lexer) next() rune {
	if int(l.pos) >= len(l.input) {
		l.atEOF = true
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += Pos(w)
	if r == '\n' {
		l.line++
	}
	return r
}

// peek returns but does not consume the next rune in the input.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one rune.
func (l *lexer) backup() {
	if !l.atEOF && l.pos > 0 {
		r, w := utf8.DecodeLastRuneInString(l.input[:l.pos])
		l.pos -= Pos(w)
		// Correct newline count.
		if r == '\n' {
			l.line--
		}
	}
}

// thisItem returns the item at the current input point with the specified type
// and advances the input.
func (l *lexer) thisItem(t itemType) item {
	i := item{t, l.start, l.input[l.start:l.pos], l.startLine}
	l.start = l.pos
	l.startLine = l.line
	return i
}

// emit passes the trailing text as an item back to the parser.
func (l *lexer) emit(t itemType) stateFn {
	return l.emitItem(l.thisItem(t))
}

// emitItem passes the specified item to the parser.
func (l *lexer) emitItem(i item) stateFn {
	l.item = i
	return nil
}

// ignore skips over the pending input before this point.
// It tracks newlines in the ignored text, so use it only
// for text that is skipped without calling l.next.
func (l *lexer) ignore() {
	l.line += strings.Count(l.input[l.start:l.pos], "\n")
	l.start = l.pos
	l.startLine = l.line
}

// accept consumes the next rune if it's from the valid set.
func (l *lexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes a run of runes from the valid set.
func (l *lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}
	l.backup()
}

// errorf returns an error token and terminates the scan by passing
// back a nil pointer that will be the next state, terminating l.nextItem.
func (l *lexer) errorf(format string, args ...any) stateFn {
	l.item = item{itemError, l.start, fmt.Sprintf(format, args...), l.startLine}
	l.start = 0
	l.pos = 0
	l.input = l.input[:0]
	return nil
}

// nextItem returns the next item from the input.
// Called by the parser, which drives the lexer one item at a time.
func (l *lexer) nextItem() item {
	l.item = item{itemEOF, l.pos, "EOF", l.startLine}
	state := lexText
	if l.insideAction {
		state = lexInsideAction
	}
	for {
		state = state(l)
		if state == nil {
			return l.item
		}
	}
}

// lex creates a new scanner for the input string.
func lex(name, input, left, right string) *lexer {
	if left == "" {
		left = leftDelim
	}
	if right == "" {
		right = rightDelim
	}
	l := &lexer{
		name:         name,
		input:        input,
		leftDelim:    left,
		rightDelim:   right,
		line:         1,
		startLine:    1,
		insideAction: false,
	}
	return l
}

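// A minimal sketch of how the lexer is driven (illustrative only; these
// identifiers are unexported, so this works only from inside this package,
// for example in a test):
//
//	l := lex("example", "hello, {{.Name}}!", "", "")
//	for {
//		it := l.nextItem()
//		fmt.Println(it)
//		if it.typ == itemEOF || it.typ == itemError {
//			break
//		}
//	}
//
// The parser drives the lexer the same way: each call to nextItem runs
// state functions until one of them emits an item or reports an error.
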
// state functions

const (
	leftDelim    = "{{"
	rightDelim   = "}}"
	leftComment  = "/*"
	rightComment = "*/"
)

// lexText scans until an opening action delimiter, "{{".
func lexText(l *lexer) stateFn {
	if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
		if x > 0 {
			l.pos += Pos(x)
			// Do we trim any trailing space?
			trimLength := Pos(0)
			delimEnd := l.pos + Pos(len(l.leftDelim))
			if hasLeftTrimMarker(l.input[delimEnd:]) {
				trimLength = rightTrimLength(l.input[l.start:l.pos])
			}
			l.pos -= trimLength
			l.line += strings.Count(l.input[l.start:l.pos], "\n")
			i := l.thisItem(itemText)
			l.pos += trimLength
			l.ignore()
			if len(i.val) > 0 {
				return l.emitItem(i)
			}
		}
		return lexLeftDelim
	}
	l.pos = Pos(len(l.input))
	// Correctly reached EOF.
	if l.pos > l.start {
		l.line += strings.Count(l.input[l.start:l.pos], "\n")
		return l.emit(itemText)
	}
	return l.emit(itemEOF)
}

// rightTrimLength returns the length of the spaces at the end of the string.
func rightTrimLength(s string) Pos {
	return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
}

// atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
	if hasRightTrimMarker(l.input[l.pos:]) && strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) { // With trim marker.
		return true, true
	}
	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { // Without trim marker.
		return true, false
	}
	return false, false
}

// leftTrimLength returns the length of the spaces at the beginning of the string.
func leftTrimLength(s string) Pos {
	return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
}

// lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
// (The text to be trimmed has already been emitted.)
func lexLeftDelim(l *lexer) stateFn {
	l.pos += Pos(len(l.leftDelim))
	trimSpace := hasLeftTrimMarker(l.input[l.pos:])
	afterMarker := Pos(0)
	if trimSpace {
		afterMarker = trimMarkerLen
	}
	if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
		l.pos += afterMarker
		l.ignore()
		return lexComment
	}
	i := l.thisItem(itemLeftDelim)
	l.insideAction = true
	l.pos += afterMarker
	l.ignore()
	l.parenDepth = 0
	return l.emitItem(i)
}

// lexComment scans a comment. The left comment marker is known to be present.
func lexComment(l *lexer) stateFn {
	l.pos += Pos(len(leftComment))
	x := strings.Index(l.input[l.pos:], rightComment)
	if x < 0 {
		return l.errorf("unclosed comment")
	}
	l.pos += Pos(x + len(rightComment))
	delim, trimSpace := l.atRightDelim()
	if !delim {
		return l.errorf("comment ends before closing delimiter")
	}
	i := l.thisItem(itemComment)
	if trimSpace {
		l.pos += trimMarkerLen
	}
	l.pos += Pos(len(l.rightDelim))
	if trimSpace {
		l.pos += leftTrimLength(l.input[l.pos:])
	}
	l.ignore()
	if l.options.emitComment {
		return l.emitItem(i)
	}
	return lexText
}

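// For example, the action "{{/* a note */}}" is consumed entirely by
// lexLeftDelim and lexComment: the itemComment item (value "/* a note */")
// is handed to the parser only when lexOptions.emitComment is set;
// otherwise lexing simply resumes with the text that follows.
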
// lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
func lexRightDelim(l *lexer) stateFn {
	_, trimSpace := l.atRightDelim()
	if trimSpace {
		l.pos += trimMarkerLen
		l.ignore()
	}
	l.pos += Pos(len(l.rightDelim))
	i := l.thisItem(itemRightDelim)
	if trimSpace {
		l.pos += leftTrimLength(l.input[l.pos:])
		l.ignore()
	}
	l.insideAction = false
	return l.emitItem(i)
}

// lexInsideAction scans the elements inside action delimiters.
func lexInsideAction(l *lexer) stateFn {
	// Either number, quoted string, or identifier.
	// Spaces separate arguments; runs of spaces turn into itemSpace.
	// Pipe symbols separate and are emitted.
	delim, _ := l.atRightDelim()
	if delim {
		if l.parenDepth == 0 {
			return lexRightDelim
		}
		return l.errorf("unclosed left paren")
	}
	switch r := l.next(); {
	case r == eof:
		return l.errorf("unclosed action")
	case isSpace(r):
		l.backup() // Put space back in case we have " -}}".
		return lexSpace
	case r == '=':
		return l.emit(itemAssign)
	case r == ':':
		if l.next() != '=' {
			return l.errorf("expected :=")
		}
		return l.emit(itemDeclare)
	case r == '|':
		return l.emit(itemPipe)
	case r == '"':
		return lexQuote
	case r == '`':
		return lexRawQuote
	case r == '$':
		return lexVariable
	case r == '\'':
		return lexChar
	case r == '.':
		// special look-ahead for ".field" so we don't break l.backup().
		if l.pos < Pos(len(l.input)) {
			r := l.input[l.pos]
			if r < '0' || '9' < r {
				return lexField
			}
		}
		fallthrough // '.' can start a number.
	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
		l.backup()
		return lexNumber
	case isAlphaNumeric(r):
		l.backup()
		return lexIdentifier
	case r == '(':
		l.parenDepth++
		return l.emit(itemLeftParen)
	case r == ')':
		l.parenDepth--
		if l.parenDepth < 0 {
			return l.errorf("unexpected right paren")
		}
		return l.emit(itemRightParen)
	case r <= unicode.MaxASCII && unicode.IsPrint(r):
		return l.emit(itemChar)
	default:
		return l.errorf("unrecognized character in action: %#U", r)
	}
}

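// As a rough illustration of the dispatch above, lexing the action
// `{{printf "%d" .Count | html}}` should produce, in order: itemLeftDelim,
// itemIdentifier ("printf"), itemSpace, itemString (`"%d"`), itemSpace,
// itemField (".Count"), itemSpace, itemPipe, itemSpace,
// itemIdentifier ("html"), itemRightDelim, and finally itemEOF.
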
// lexSpace scans a run of space characters.
// We have not consumed the first space, which is known to be present.
// Take care if there is a trim-marked right delimiter, which starts with a space.
func lexSpace(l *lexer) stateFn {
	var r rune
	var numSpaces int
	for {
		r = l.peek()
		if !isSpace(r) {
			break
		}
		l.next()
		numSpaces++
	}
	// Be careful about a trim-marked closing delimiter, which has a minus
	// after a space. We know there is a space, so check for the '-' that might follow.
	if hasRightTrimMarker(l.input[l.pos-1:]) && strings.HasPrefix(l.input[l.pos-1+trimMarkerLen:], l.rightDelim) {
		l.backup() // Before the space.
		if numSpaces == 1 {
			return lexRightDelim // On the delim, so go right to that.
		}
	}
	return l.emit(itemSpace)
}

// lexIdentifier scans an alphanumeric.
func lexIdentifier(l *lexer) stateFn {
	for {
		switch r := l.next(); {
		case isAlphaNumeric(r):
			// absorb.
		default:
			l.backup()
			word := l.input[l.start:l.pos]
			if !l.atTerminator() {
				return l.errorf("bad character %#U", r)
			}
			switch {
			case key[word] > itemKeyword:
				item := key[word]
				if item == itemBreak && !l.options.breakOK || item == itemContinue && !l.options.continueOK {
					return l.emit(itemIdentifier)
				}
				return l.emit(item)
			case word[0] == '.':
				return l.emit(itemField)
			case word == "true", word == "false":
				return l.emit(itemBool)
			default:
				return l.emit(itemIdentifier)
			}
		}
	}
}

// lexField scans a field: .Alphanumeric.
// The . has been scanned.
func lexField(l *lexer) stateFn {
	return lexFieldOrVariable(l, itemField)
}

// lexVariable scans a variable: $Alphanumeric.
// The $ has been scanned.
func lexVariable(l *lexer) stateFn {
	if l.atTerminator() { // Nothing interesting follows -> "$".
		return l.emit(itemVariable)
	}
	return lexFieldOrVariable(l, itemVariable)
}

// lexFieldOrVariable scans a field or variable: [.$]Alphanumeric.
// The . or $ has been scanned.
func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
		if typ == itemVariable {
			return l.emit(itemVariable)
		}
		return l.emit(itemDot)
	}
	var r rune
	for {
		r = l.next()
		if !isAlphaNumeric(r) {
			l.backup()
			break
		}
	}
	if !l.atTerminator() {
		return l.errorf("bad character %#U", r)
	}
	return l.emit(typ)
}

// atTerminator reports whether the input is at a valid termination character
// to appear after an identifier. Breaks .X.Y into two pieces. Also catches
// cases like "$x+2" not being acceptable without a space, in case we decide
// one day to implement arithmetic.
func (l *lexer) atTerminator() bool {
	r := l.peek()
	if isSpace(r) {
		return true
	}
	switch r {
	case eof, '.', ',', '|', ':', ')', '(':
		return true
	}
	return strings.HasPrefix(l.input[l.pos:], l.rightDelim)
}

// lexChar scans a character constant. The initial quote is already
// scanned. Syntax checking is done by the parser.
func lexChar(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated character constant")
		case '\'':
			break Loop
		}
	}
	return l.emit(itemCharConstant)
}

// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
// and "089" - but when it's wrong the input is invalid and the parser (via
// strconv) will notice.
func lexNumber(l *lexer) stateFn {
	if !l.scanNumber() {
		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
	}
	if sign := l.peek(); sign == '+' || sign == '-' {
		// Complex: 1+2i. No spaces, must end in 'i'.
		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
		}
		return l.emit(itemComplex)
	}
	return l.emit(itemNumber)
}

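// scanNumber consumes as much of a number as it can. A few examples of forms
// it accepts (illustrative, not exhaustive; validation is ultimately done by
// the parser via strconv): "7", "-12.5", "0x7f", "0o17", "0b1010", "1_000",
// "1e3", "0x1p-2", and "3i".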
func (l *lexer) scanNumber() bool {
	// Optional leading sign.
	l.accept("+-")
	// Is it hex?
	digits := "0123456789_"
	if l.accept("0") {
		// Note: Leading 0 does not mean octal in floats.
		if l.accept("xX") {
			digits = "0123456789abcdefABCDEF_"
		} else if l.accept("oO") {
			digits = "01234567_"
		} else if l.accept("bB") {
			digits = "01_"
		}
	}
	l.acceptRun(digits)
	if l.accept(".") {
		l.acceptRun(digits)
	}
	if len(digits) == 10+1 && l.accept("eE") {
		l.accept("+-")
		l.acceptRun("0123456789_")
	}
	if len(digits) == 16+6+1 && l.accept("pP") {
		l.accept("+-")
		l.acceptRun("0123456789_")
	}
	// Is it imaginary?
	l.accept("i")
	// Next thing mustn't be alphanumeric.
	if isAlphaNumeric(l.peek()) {
		l.next()
		return false
	}
	return true
}

// lexQuote scans a quoted string.
func lexQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated quoted string")
		case '"':
			break Loop
		}
	}
	return l.emit(itemString)
}

// lexRawQuote scans a raw quoted string.
func lexRawQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case eof:
			return l.errorf("unterminated raw quoted string")
		case '`':
			break Loop
		}
	}
	return l.emit(itemRawString)
}

// isSpace reports whether r is a space character.
func isSpace(r rune) bool {
	return r == ' ' || r == '\t' || r == '\r' || r == '\n'
}

// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}

func hasLeftTrimMarker(s string) bool {
	return len(s) >= 2 && s[0] == trimMarker && isSpace(rune(s[1]))
}

func hasRightTrimMarker(s string) bool {
	return len(s) >= 2 && isSpace(rune(s[0])) && s[1] == trimMarker
}