1*1c12ee1eSDan Willemsen// Copyright 2018 The Go Authors. All rights reserved. 2*1c12ee1eSDan Willemsen// Use of this source code is governed by a BSD-style 3*1c12ee1eSDan Willemsen// license that can be found in the LICENSE file. 4*1c12ee1eSDan Willemsen 5*1c12ee1eSDan Willemsenpackage text 6*1c12ee1eSDan Willemsen 7*1c12ee1eSDan Willemsenimport ( 8*1c12ee1eSDan Willemsen "bytes" 9*1c12ee1eSDan Willemsen "fmt" 10*1c12ee1eSDan Willemsen "io" 11*1c12ee1eSDan Willemsen "strconv" 12*1c12ee1eSDan Willemsen "unicode/utf8" 13*1c12ee1eSDan Willemsen 14*1c12ee1eSDan Willemsen "google.golang.org/protobuf/internal/errors" 15*1c12ee1eSDan Willemsen) 16*1c12ee1eSDan Willemsen 17*1c12ee1eSDan Willemsen// Decoder is a token-based textproto decoder. 18*1c12ee1eSDan Willemsentype Decoder struct { 19*1c12ee1eSDan Willemsen // lastCall is last method called, either readCall or peekCall. 20*1c12ee1eSDan Willemsen // Initial value is readCall. 21*1c12ee1eSDan Willemsen lastCall call 22*1c12ee1eSDan Willemsen 23*1c12ee1eSDan Willemsen // lastToken contains the last read token. 24*1c12ee1eSDan Willemsen lastToken Token 25*1c12ee1eSDan Willemsen 26*1c12ee1eSDan Willemsen // lastErr contains the last read error. 27*1c12ee1eSDan Willemsen lastErr error 28*1c12ee1eSDan Willemsen 29*1c12ee1eSDan Willemsen // openStack is a stack containing the byte characters for MessageOpen and 30*1c12ee1eSDan Willemsen // ListOpen kinds. The top of stack represents the message or the list that 31*1c12ee1eSDan Willemsen // the current token is nested in. An empty stack means the current token is 32*1c12ee1eSDan Willemsen // at the top level message. The characters '{' and '<' both represent the 33*1c12ee1eSDan Willemsen // MessageOpen kind. 34*1c12ee1eSDan Willemsen openStack []byte 35*1c12ee1eSDan Willemsen 36*1c12ee1eSDan Willemsen // orig is used in reporting line and column. 37*1c12ee1eSDan Willemsen orig []byte 38*1c12ee1eSDan Willemsen // in contains the unconsumed input. 
39*1c12ee1eSDan Willemsen in []byte 40*1c12ee1eSDan Willemsen} 41*1c12ee1eSDan Willemsen 42*1c12ee1eSDan Willemsen// NewDecoder returns a Decoder to read the given []byte. 43*1c12ee1eSDan Willemsenfunc NewDecoder(b []byte) *Decoder { 44*1c12ee1eSDan Willemsen return &Decoder{orig: b, in: b} 45*1c12ee1eSDan Willemsen} 46*1c12ee1eSDan Willemsen 47*1c12ee1eSDan Willemsen// ErrUnexpectedEOF means that EOF was encountered in the middle of the input. 48*1c12ee1eSDan Willemsenvar ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF) 49*1c12ee1eSDan Willemsen 50*1c12ee1eSDan Willemsen// call specifies which Decoder method was invoked. 51*1c12ee1eSDan Willemsentype call uint8 52*1c12ee1eSDan Willemsen 53*1c12ee1eSDan Willemsenconst ( 54*1c12ee1eSDan Willemsen readCall call = iota 55*1c12ee1eSDan Willemsen peekCall 56*1c12ee1eSDan Willemsen) 57*1c12ee1eSDan Willemsen 58*1c12ee1eSDan Willemsen// Peek looks ahead and returns the next token and error without advancing a read. 59*1c12ee1eSDan Willemsenfunc (d *Decoder) Peek() (Token, error) { 60*1c12ee1eSDan Willemsen defer func() { d.lastCall = peekCall }() 61*1c12ee1eSDan Willemsen if d.lastCall == readCall { 62*1c12ee1eSDan Willemsen d.lastToken, d.lastErr = d.Read() 63*1c12ee1eSDan Willemsen } 64*1c12ee1eSDan Willemsen return d.lastToken, d.lastErr 65*1c12ee1eSDan Willemsen} 66*1c12ee1eSDan Willemsen 67*1c12ee1eSDan Willemsen// Read returns the next token. 68*1c12ee1eSDan Willemsen// It will return an error if there is no valid token. 
func (d *Decoder) Read() (Token, error) {
	defer func() { d.lastCall = readCall }()
	if d.lastCall == peekCall {
		// A preceding Peek already parsed this token; hand back its
		// buffered result instead of parsing again.
		return d.lastToken, d.lastErr
	}

	tok, err := d.parseNext(d.lastToken.kind)
	if err != nil {
		return Token{}, err
	}

	// Separators are never returned to the caller: parse once more to
	// obtain the token that follows the comma or semicolon.
	switch tok.kind {
	case comma, semicolon:
		tok, err = d.parseNext(tok.kind)
		if err != nil {
			return Token{}, err
		}
	}
	d.lastToken = tok
	return tok, nil
}

// Format strings for syntax errors reported by parseNext.
const (
	mismatchedFmt = "mismatched close character %q"
	unexpectedFmt = "unexpected character %q"
)

// parseNext parses the next Token based on given last kind.
//
// lastKind is the kind of the previously parsed token; together with the
// container reported by currentOpenKind it decides which tokens are legal
// next. It returns ErrUnexpectedEOF when input ends where a token is still
// required, a syntax error for an illegal character, and panics if it reaches
// a state combination that the logic here should never produce.
func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
	// Trim leading spaces.
	d.consume(0)
	isEOF := false
	if len(d.in) == 0 {
		isEOF = true
	}

	switch lastKind {
	case EOF:
		return d.consumeToken(EOF, 0, 0), nil

	case bof:
		// Start of top level message. Next token can be EOF or Name.
		if isEOF {
			return d.consumeToken(EOF, 0, 0), nil
		}
		return d.parseFieldName()

	case Name:
		// Next token can be MessageOpen, ListOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		case '[':
			d.pushOpenStack(ch)
			return d.consumeToken(ListOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case Scalar:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch d.in[0] {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// The other message close character was used.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case ']':
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case MessageOpen:
		// Next token can be MessageClose or Name.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		_, closeCh := d.currentOpenKind()
		switch ch := d.in[0]; ch {
		case closeCh:
			d.popOpenStack()
			return d.consumeToken(MessageClose, 1, 0), nil
		case otherCloseChar[closeCh]:
			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
		default:
			return d.parseFieldName()
		}

	case MessageClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case ListOpen:
		// Next token can be ListClose, MessageOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case ']':
			d.popOpenStack()
			return d.consumeToken(ListClose, 1, 0), nil
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case ListClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		default:
			// It is not possible to have this case. Let it panic below.
		}

	case comma, semicolon:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message. Next token can be EOF or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			return d.parseFieldName()

		case MessageOpen:
			// Next token can be MessageClose or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			if lastKind == semicolon {
				// It should not be possible to have this case as logic here
				// should not have produced a semicolon Token when inside a
				// list. Let it panic below.
				break
			}
			// Next token can be MessageOpen or Scalar.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case '{', '<':
				d.pushOpenStack(ch)
				return d.consumeToken(MessageOpen, 1, 0), nil
			default:
				return d.parseScalar()
			}
		}
	}

	// Reached only when no branch above returned, i.e. an unexpected
	// combination of lastKind and open container.
	line, column := d.Position(len(d.orig) - len(d.in))
	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
}

// otherCloseChar maps each message close character to the other one, so a
// message opened with '{' but closed with '>' (or '<' closed with '}') can be
// reported as a mismatch.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}

// currentOpenKind indicates whether current position is inside a message, list
// or top-level message by returning MessageOpen, ListOpen or bof respectively.
// If the returned kind is either a MessageOpen or ListOpen, it also returns the
// corresponding closing character.
370*1c12ee1eSDan Willemsenfunc (d *Decoder) currentOpenKind() (Kind, byte) { 371*1c12ee1eSDan Willemsen if len(d.openStack) == 0 { 372*1c12ee1eSDan Willemsen return bof, 0 373*1c12ee1eSDan Willemsen } 374*1c12ee1eSDan Willemsen openCh := d.openStack[len(d.openStack)-1] 375*1c12ee1eSDan Willemsen switch openCh { 376*1c12ee1eSDan Willemsen case '{': 377*1c12ee1eSDan Willemsen return MessageOpen, '}' 378*1c12ee1eSDan Willemsen case '<': 379*1c12ee1eSDan Willemsen return MessageOpen, '>' 380*1c12ee1eSDan Willemsen case '[': 381*1c12ee1eSDan Willemsen return ListOpen, ']' 382*1c12ee1eSDan Willemsen } 383*1c12ee1eSDan Willemsen panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh)) 384*1c12ee1eSDan Willemsen} 385*1c12ee1eSDan Willemsen 386*1c12ee1eSDan Willemsenfunc (d *Decoder) pushOpenStack(ch byte) { 387*1c12ee1eSDan Willemsen d.openStack = append(d.openStack, ch) 388*1c12ee1eSDan Willemsen} 389*1c12ee1eSDan Willemsen 390*1c12ee1eSDan Willemsenfunc (d *Decoder) popOpenStack() { 391*1c12ee1eSDan Willemsen d.openStack = d.openStack[:len(d.openStack)-1] 392*1c12ee1eSDan Willemsen} 393*1c12ee1eSDan Willemsen 394*1c12ee1eSDan Willemsen// parseFieldName parses field name and separator. 395*1c12ee1eSDan Willemsenfunc (d *Decoder) parseFieldName() (tok Token, err error) { 396*1c12ee1eSDan Willemsen defer func() { 397*1c12ee1eSDan Willemsen if err == nil && d.tryConsumeChar(':') { 398*1c12ee1eSDan Willemsen tok.attrs |= hasSeparator 399*1c12ee1eSDan Willemsen } 400*1c12ee1eSDan Willemsen }() 401*1c12ee1eSDan Willemsen 402*1c12ee1eSDan Willemsen // Extension or Any type URL. 403*1c12ee1eSDan Willemsen if d.in[0] == '[' { 404*1c12ee1eSDan Willemsen return d.parseTypeName() 405*1c12ee1eSDan Willemsen } 406*1c12ee1eSDan Willemsen 407*1c12ee1eSDan Willemsen // Identifier. 
408*1c12ee1eSDan Willemsen if size := parseIdent(d.in, false); size > 0 { 409*1c12ee1eSDan Willemsen return d.consumeToken(Name, size, uint8(IdentName)), nil 410*1c12ee1eSDan Willemsen } 411*1c12ee1eSDan Willemsen 412*1c12ee1eSDan Willemsen // Field number. Identify if input is a valid number that is not negative 413*1c12ee1eSDan Willemsen // and is decimal integer within 32-bit range. 414*1c12ee1eSDan Willemsen if num := parseNumber(d.in); num.size > 0 { 415*1c12ee1eSDan Willemsen str := num.string(d.in) 416*1c12ee1eSDan Willemsen if !num.neg && num.kind == numDec { 417*1c12ee1eSDan Willemsen if _, err := strconv.ParseInt(str, 10, 32); err == nil { 418*1c12ee1eSDan Willemsen return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil 419*1c12ee1eSDan Willemsen } 420*1c12ee1eSDan Willemsen } 421*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError("invalid field number: %s", str) 422*1c12ee1eSDan Willemsen } 423*1c12ee1eSDan Willemsen 424*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in)) 425*1c12ee1eSDan Willemsen} 426*1c12ee1eSDan Willemsen 427*1c12ee1eSDan Willemsen// parseTypeName parses Any type URL or extension field name. The name is 428*1c12ee1eSDan Willemsen// enclosed in [ and ] characters. The C++ parser does not handle many legal URL 429*1c12ee1eSDan Willemsen// strings. This implementation is more liberal and allows for the pattern 430*1c12ee1eSDan Willemsen// ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed 431*1c12ee1eSDan Willemsen// in between [ ], '.', '/' and the sub names. 432*1c12ee1eSDan Willemsenfunc (d *Decoder) parseTypeName() (Token, error) { 433*1c12ee1eSDan Willemsen startPos := len(d.orig) - len(d.in) 434*1c12ee1eSDan Willemsen // Use alias s to advance first in order to use d.in for error handling. 435*1c12ee1eSDan Willemsen // Caller already checks for [ as first character. 
436*1c12ee1eSDan Willemsen s := consume(d.in[1:], 0) 437*1c12ee1eSDan Willemsen if len(s) == 0 { 438*1c12ee1eSDan Willemsen return Token{}, ErrUnexpectedEOF 439*1c12ee1eSDan Willemsen } 440*1c12ee1eSDan Willemsen 441*1c12ee1eSDan Willemsen var name []byte 442*1c12ee1eSDan Willemsen for len(s) > 0 && isTypeNameChar(s[0]) { 443*1c12ee1eSDan Willemsen name = append(name, s[0]) 444*1c12ee1eSDan Willemsen s = s[1:] 445*1c12ee1eSDan Willemsen } 446*1c12ee1eSDan Willemsen s = consume(s, 0) 447*1c12ee1eSDan Willemsen 448*1c12ee1eSDan Willemsen var closed bool 449*1c12ee1eSDan Willemsen for len(s) > 0 && !closed { 450*1c12ee1eSDan Willemsen switch { 451*1c12ee1eSDan Willemsen case s[0] == ']': 452*1c12ee1eSDan Willemsen s = s[1:] 453*1c12ee1eSDan Willemsen closed = true 454*1c12ee1eSDan Willemsen 455*1c12ee1eSDan Willemsen case s[0] == '/', s[0] == '.': 456*1c12ee1eSDan Willemsen if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') { 457*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s", 458*1c12ee1eSDan Willemsen d.orig[startPos:len(d.orig)-len(s)+1]) 459*1c12ee1eSDan Willemsen } 460*1c12ee1eSDan Willemsen name = append(name, s[0]) 461*1c12ee1eSDan Willemsen s = s[1:] 462*1c12ee1eSDan Willemsen s = consume(s, 0) 463*1c12ee1eSDan Willemsen for len(s) > 0 && isTypeNameChar(s[0]) { 464*1c12ee1eSDan Willemsen name = append(name, s[0]) 465*1c12ee1eSDan Willemsen s = s[1:] 466*1c12ee1eSDan Willemsen } 467*1c12ee1eSDan Willemsen s = consume(s, 0) 468*1c12ee1eSDan Willemsen 469*1c12ee1eSDan Willemsen default: 470*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError( 471*1c12ee1eSDan Willemsen "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1]) 472*1c12ee1eSDan Willemsen } 473*1c12ee1eSDan Willemsen } 474*1c12ee1eSDan Willemsen 475*1c12ee1eSDan Willemsen if !closed { 476*1c12ee1eSDan Willemsen return Token{}, ErrUnexpectedEOF 477*1c12ee1eSDan Willemsen } 478*1c12ee1eSDan 
Willemsen 479*1c12ee1eSDan Willemsen // First character cannot be '.'. Last character cannot be '.' or '/'. 480*1c12ee1eSDan Willemsen size := len(name) 481*1c12ee1eSDan Willemsen if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' { 482*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s", 483*1c12ee1eSDan Willemsen d.orig[startPos:len(d.orig)-len(s)]) 484*1c12ee1eSDan Willemsen } 485*1c12ee1eSDan Willemsen 486*1c12ee1eSDan Willemsen d.in = s 487*1c12ee1eSDan Willemsen endPos := len(d.orig) - len(d.in) 488*1c12ee1eSDan Willemsen d.consume(0) 489*1c12ee1eSDan Willemsen 490*1c12ee1eSDan Willemsen return Token{ 491*1c12ee1eSDan Willemsen kind: Name, 492*1c12ee1eSDan Willemsen attrs: uint8(TypeName), 493*1c12ee1eSDan Willemsen pos: startPos, 494*1c12ee1eSDan Willemsen raw: d.orig[startPos:endPos], 495*1c12ee1eSDan Willemsen str: string(name), 496*1c12ee1eSDan Willemsen }, nil 497*1c12ee1eSDan Willemsen} 498*1c12ee1eSDan Willemsen 499*1c12ee1eSDan Willemsenfunc isTypeNameChar(b byte) bool { 500*1c12ee1eSDan Willemsen return (b == '-' || b == '_' || 501*1c12ee1eSDan Willemsen ('0' <= b && b <= '9') || 502*1c12ee1eSDan Willemsen ('a' <= b && b <= 'z') || 503*1c12ee1eSDan Willemsen ('A' <= b && b <= 'Z')) 504*1c12ee1eSDan Willemsen} 505*1c12ee1eSDan Willemsen 506*1c12ee1eSDan Willemsenfunc isWhiteSpace(b byte) bool { 507*1c12ee1eSDan Willemsen switch b { 508*1c12ee1eSDan Willemsen case ' ', '\n', '\r', '\t': 509*1c12ee1eSDan Willemsen return true 510*1c12ee1eSDan Willemsen default: 511*1c12ee1eSDan Willemsen return false 512*1c12ee1eSDan Willemsen } 513*1c12ee1eSDan Willemsen} 514*1c12ee1eSDan Willemsen 515*1c12ee1eSDan Willemsen// parseIdent parses an unquoted proto identifier and returns size. 516*1c12ee1eSDan Willemsen// If allowNeg is true, it allows '-' to be the first character in the 517*1c12ee1eSDan Willemsen// identifier. This is used when parsing literal values like -infinity, etc. 
518*1c12ee1eSDan Willemsen// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*` 519*1c12ee1eSDan Willemsenfunc parseIdent(input []byte, allowNeg bool) int { 520*1c12ee1eSDan Willemsen var size int 521*1c12ee1eSDan Willemsen 522*1c12ee1eSDan Willemsen s := input 523*1c12ee1eSDan Willemsen if len(s) == 0 { 524*1c12ee1eSDan Willemsen return 0 525*1c12ee1eSDan Willemsen } 526*1c12ee1eSDan Willemsen 527*1c12ee1eSDan Willemsen if allowNeg && s[0] == '-' { 528*1c12ee1eSDan Willemsen s = s[1:] 529*1c12ee1eSDan Willemsen size++ 530*1c12ee1eSDan Willemsen if len(s) == 0 { 531*1c12ee1eSDan Willemsen return 0 532*1c12ee1eSDan Willemsen } 533*1c12ee1eSDan Willemsen } 534*1c12ee1eSDan Willemsen 535*1c12ee1eSDan Willemsen switch { 536*1c12ee1eSDan Willemsen case s[0] == '_', 537*1c12ee1eSDan Willemsen 'a' <= s[0] && s[0] <= 'z', 538*1c12ee1eSDan Willemsen 'A' <= s[0] && s[0] <= 'Z': 539*1c12ee1eSDan Willemsen s = s[1:] 540*1c12ee1eSDan Willemsen size++ 541*1c12ee1eSDan Willemsen default: 542*1c12ee1eSDan Willemsen return 0 543*1c12ee1eSDan Willemsen } 544*1c12ee1eSDan Willemsen 545*1c12ee1eSDan Willemsen for len(s) > 0 && (s[0] == '_' || 546*1c12ee1eSDan Willemsen 'a' <= s[0] && s[0] <= 'z' || 547*1c12ee1eSDan Willemsen 'A' <= s[0] && s[0] <= 'Z' || 548*1c12ee1eSDan Willemsen '0' <= s[0] && s[0] <= '9') { 549*1c12ee1eSDan Willemsen s = s[1:] 550*1c12ee1eSDan Willemsen size++ 551*1c12ee1eSDan Willemsen } 552*1c12ee1eSDan Willemsen 553*1c12ee1eSDan Willemsen if len(s) > 0 && !isDelim(s[0]) { 554*1c12ee1eSDan Willemsen return 0 555*1c12ee1eSDan Willemsen } 556*1c12ee1eSDan Willemsen 557*1c12ee1eSDan Willemsen return size 558*1c12ee1eSDan Willemsen} 559*1c12ee1eSDan Willemsen 560*1c12ee1eSDan Willemsen// parseScalar parses for a string, literal or number value. 
561*1c12ee1eSDan Willemsenfunc (d *Decoder) parseScalar() (Token, error) { 562*1c12ee1eSDan Willemsen if d.in[0] == '"' || d.in[0] == '\'' { 563*1c12ee1eSDan Willemsen return d.parseStringValue() 564*1c12ee1eSDan Willemsen } 565*1c12ee1eSDan Willemsen 566*1c12ee1eSDan Willemsen if tok, ok := d.parseLiteralValue(); ok { 567*1c12ee1eSDan Willemsen return tok, nil 568*1c12ee1eSDan Willemsen } 569*1c12ee1eSDan Willemsen 570*1c12ee1eSDan Willemsen if tok, ok := d.parseNumberValue(); ok { 571*1c12ee1eSDan Willemsen return tok, nil 572*1c12ee1eSDan Willemsen } 573*1c12ee1eSDan Willemsen 574*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in)) 575*1c12ee1eSDan Willemsen} 576*1c12ee1eSDan Willemsen 577*1c12ee1eSDan Willemsen// parseLiteralValue parses a literal value. A literal value is used for 578*1c12ee1eSDan Willemsen// bools, special floats and enums. This function simply identifies that the 579*1c12ee1eSDan Willemsen// field value is a literal. 580*1c12ee1eSDan Willemsenfunc (d *Decoder) parseLiteralValue() (Token, bool) { 581*1c12ee1eSDan Willemsen size := parseIdent(d.in, true) 582*1c12ee1eSDan Willemsen if size == 0 { 583*1c12ee1eSDan Willemsen return Token{}, false 584*1c12ee1eSDan Willemsen } 585*1c12ee1eSDan Willemsen return d.consumeToken(Scalar, size, literalValue), true 586*1c12ee1eSDan Willemsen} 587*1c12ee1eSDan Willemsen 588*1c12ee1eSDan Willemsen// consumeToken constructs a Token for given Kind from d.in and consumes given 589*1c12ee1eSDan Willemsen// size-length from it. 590*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token { 591*1c12ee1eSDan Willemsen // Important to compute raw and pos before consuming. 
592*1c12ee1eSDan Willemsen tok := Token{ 593*1c12ee1eSDan Willemsen kind: kind, 594*1c12ee1eSDan Willemsen attrs: attrs, 595*1c12ee1eSDan Willemsen pos: len(d.orig) - len(d.in), 596*1c12ee1eSDan Willemsen raw: d.in[:size], 597*1c12ee1eSDan Willemsen } 598*1c12ee1eSDan Willemsen d.consume(size) 599*1c12ee1eSDan Willemsen return tok 600*1c12ee1eSDan Willemsen} 601*1c12ee1eSDan Willemsen 602*1c12ee1eSDan Willemsen// newSyntaxError returns a syntax error with line and column information for 603*1c12ee1eSDan Willemsen// current position. 604*1c12ee1eSDan Willemsenfunc (d *Decoder) newSyntaxError(f string, x ...interface{}) error { 605*1c12ee1eSDan Willemsen e := errors.New(f, x...) 606*1c12ee1eSDan Willemsen line, column := d.Position(len(d.orig) - len(d.in)) 607*1c12ee1eSDan Willemsen return errors.New("syntax error (line %d:%d): %v", line, column, e) 608*1c12ee1eSDan Willemsen} 609*1c12ee1eSDan Willemsen 610*1c12ee1eSDan Willemsen// Position returns line and column number of given index of the original input. 611*1c12ee1eSDan Willemsen// It will panic if index is out of range. 
612*1c12ee1eSDan Willemsenfunc (d *Decoder) Position(idx int) (line int, column int) { 613*1c12ee1eSDan Willemsen b := d.orig[:idx] 614*1c12ee1eSDan Willemsen line = bytes.Count(b, []byte("\n")) + 1 615*1c12ee1eSDan Willemsen if i := bytes.LastIndexByte(b, '\n'); i >= 0 { 616*1c12ee1eSDan Willemsen b = b[i+1:] 617*1c12ee1eSDan Willemsen } 618*1c12ee1eSDan Willemsen column = utf8.RuneCount(b) + 1 // ignore multi-rune characters 619*1c12ee1eSDan Willemsen return line, column 620*1c12ee1eSDan Willemsen} 621*1c12ee1eSDan Willemsen 622*1c12ee1eSDan Willemsenfunc (d *Decoder) tryConsumeChar(c byte) bool { 623*1c12ee1eSDan Willemsen if len(d.in) > 0 && d.in[0] == c { 624*1c12ee1eSDan Willemsen d.consume(1) 625*1c12ee1eSDan Willemsen return true 626*1c12ee1eSDan Willemsen } 627*1c12ee1eSDan Willemsen return false 628*1c12ee1eSDan Willemsen} 629*1c12ee1eSDan Willemsen 630*1c12ee1eSDan Willemsen// consume consumes n bytes of input and any subsequent whitespace or comments. 631*1c12ee1eSDan Willemsenfunc (d *Decoder) consume(n int) { 632*1c12ee1eSDan Willemsen d.in = consume(d.in, n) 633*1c12ee1eSDan Willemsen return 634*1c12ee1eSDan Willemsen} 635*1c12ee1eSDan Willemsen 636*1c12ee1eSDan Willemsen// consume consumes n bytes of input and any subsequent whitespace or comments. 
func consume(b []byte, n int) []byte {
	rest := b[n:]
	for len(rest) > 0 {
		c := rest[0]
		if c == '#' {
			// A comment runs to the end of the line, or to the end of
			// input when no newline follows.
			eol := bytes.IndexByte(rest, '\n')
			if eol < 0 {
				return nil
			}
			rest = rest[eol+1:]
			continue
		}
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		rest = rest[1:]
	}
	return rest
}

// errId returns the leading part of seq that looks like an invalid ID, for
// use in error messages. It stops at the first non-ASCII rune or delimiter
// other than '/', and shortens long sequences with a trailing "…".
func errId(seq []byte) []byte {
	const maxLen = 32
	end := 0
	for end < len(seq) {
		if end > maxLen {
			// The capacity is capped so append cannot write into seq.
			clipped := seq[:end:end]
			return append(clipped, "…"...)
		}
		r, width := utf8.DecodeRune(seq[end:])
		stop := r > utf8.RuneSelf || (r != '/' && isDelim(byte(r)))
		if !stop {
			end += width
			continue
		}
		if end == 0 {
			// The very first rune is invalid UTF-8, non-ASCII or a
			// delimiter: return it as-is.
			end = width
		}
		return seq[:end:end]
	}
	// No delimiter found.
	return seq
}

// isDelim reports whether c is a delimiter, meaning any byte other than
// '-', '+', '.', '_', an ASCII letter or an ASCII digit.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	}
	return true
}