1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package json
6
7// JSON value parser state machine.
8// Just about at the limit of what is reasonable to write by hand.
9// Some parts are a bit tedious, but overall it nicely factors out the
10// otherwise common code from the multiple scanning functions
11// in this package (Compact, Indent, checkValid, etc).
12//
13// This file starts with two simple examples using the scanner
14// before diving into the scanner itself.
15
16import (
17	"strconv"
18	"sync"
19)
20
21// Valid reports whether data is a valid JSON encoding.
22func Valid(data []byte) bool {
23	scan := newScanner()
24	defer freeScanner(scan)
25	return checkValid(data, scan) == nil
26}
27
28// checkValid verifies that data is valid JSON-encoded data.
29// scan is passed in for use by checkValid to avoid an allocation.
30// checkValid returns nil or a SyntaxError.
31func checkValid(data []byte, scan *scanner) error {
32	scan.reset()
33	for _, c := range data {
34		scan.bytes++
35		if scan.step(scan, c) == scanError {
36			return scan.err
37		}
38	}
39	if scan.eof() == scanError {
40		return scan.err
41	}
42	return nil
43}
44
45// A SyntaxError is a description of a JSON syntax error.
46// [Unmarshal] will return a SyntaxError if the JSON can't be parsed.
47type SyntaxError struct {
48	msg    string // description of error
49	Offset int64  // error occurred after reading Offset bytes
50}
51
52func (e *SyntaxError) Error() string { return e.msg }
53
54// A scanner is a JSON scanning state machine.
55// Callers call scan.reset and then pass bytes in one at a time
56// by calling scan.step(&scan, c) for each byte.
57// The return value, referred to as an opcode, tells the
58// caller about significant parsing events like beginning
59// and ending literals, objects, and arrays, so that the
60// caller can follow along if it wishes.
61// The return value scanEnd indicates that a single top-level
62// JSON value has been completed, *before* the byte that
63// just got passed in.  (The indication must be delayed in order
64// to recognize the end of numbers: is 123 a whole value or
65// the beginning of 12345e+6?).
66type scanner struct {
67	// The step is a func to be called to execute the next transition.
68	// Also tried using an integer constant and a single func
69	// with a switch, but using the func directly was 10% faster
70	// on a 64-bit Mac Mini, and it's nicer to read.
71	step func(*scanner, byte) int
72
73	// Reached end of top-level value.
74	endTop bool
75
76	// Stack of what we're in the middle of - array values, object keys, object values.
77	parseState []int
78
79	// Error that happened, if any.
80	err error
81
82	// total bytes consumed, updated by decoder.Decode (and deliberately
83	// not set to zero by scan.reset)
84	bytes int64
85}
86
87var scannerPool = sync.Pool{
88	New: func() any {
89		return &scanner{}
90	},
91}
92
93func newScanner() *scanner {
94	scan := scannerPool.Get().(*scanner)
95	// scan.reset by design doesn't set bytes to zero
96	scan.bytes = 0
97	scan.reset()
98	return scan
99}
100
101func freeScanner(scan *scanner) {
102	// Avoid hanging on to too much memory in extreme cases.
103	if len(scan.parseState) > 1024 {
104		scan.parseState = nil
105	}
106	scannerPool.Put(scan)
107}
108
109// These values are returned by the state transition functions
110// assigned to scanner.state and the method scanner.eof.
111// They give details about the current state of the scan that
112// callers might be interested to know about.
113// It is okay to ignore the return value of any particular
114// call to scanner.state: if one call returns scanError,
115// every subsequent call will return scanError too.
116const (
117	// Continue.
118	scanContinue     = iota // uninteresting byte
119	scanBeginLiteral        // end implied by next result != scanContinue
120	scanBeginObject         // begin object
121	scanObjectKey           // just finished object key (string)
122	scanObjectValue         // just finished non-last object value
123	scanEndObject           // end object (implies scanObjectValue if possible)
124	scanBeginArray          // begin array
125	scanArrayValue          // just finished array value
126	scanEndArray            // end array (implies scanArrayValue if possible)
127	scanSkipSpace           // space byte; can skip; known to be last "continue" result
128
129	// Stop.
130	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
131	scanError // hit an error, scanner.err.
132)
133
134// These values are stored in the parseState stack.
135// They give the current state of a composite value
136// being scanned. If the parser is inside a nested value
137// the parseState describes the nested state, outermost at entry 0.
138const (
139	parseObjectKey   = iota // parsing object key (before colon)
140	parseObjectValue        // parsing object value (after colon)
141	parseArrayValue         // parsing array value
142)
143
144// This limits the max nesting depth to prevent stack overflow.
145// This is permitted by https://tools.ietf.org/html/rfc7159#section-9
146const maxNestingDepth = 10000
147
148// reset prepares the scanner for use.
149// It must be called before calling s.step.
150func (s *scanner) reset() {
151	s.step = stateBeginValue
152	s.parseState = s.parseState[0:0]
153	s.err = nil
154	s.endTop = false
155}
156
157// eof tells the scanner that the end of input has been reached.
158// It returns a scan status just as s.step does.
159func (s *scanner) eof() int {
160	if s.err != nil {
161		return scanError
162	}
163	if s.endTop {
164		return scanEnd
165	}
166	s.step(s, ' ')
167	if s.endTop {
168		return scanEnd
169	}
170	if s.err == nil {
171		s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
172	}
173	return scanError
174}
175
176// pushParseState pushes a new parse state p onto the parse stack.
177// an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
178func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
179	s.parseState = append(s.parseState, newParseState)
180	if len(s.parseState) <= maxNestingDepth {
181		return successState
182	}
183	return s.error(c, "exceeded max depth")
184}
185
186// popParseState pops a parse state (already obtained) off the stack
187// and updates s.step accordingly.
188func (s *scanner) popParseState() {
189	n := len(s.parseState) - 1
190	s.parseState = s.parseState[0:n]
191	if n == 0 {
192		s.step = stateEndTop
193		s.endTop = true
194	} else {
195		s.step = stateEndValue
196	}
197}
198
199func isSpace(c byte) bool {
200	return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
201}
202
203// stateBeginValueOrEmpty is the state after reading `[`.
204func stateBeginValueOrEmpty(s *scanner, c byte) int {
205	if isSpace(c) {
206		return scanSkipSpace
207	}
208	if c == ']' {
209		return stateEndValue(s, c)
210	}
211	return stateBeginValue(s, c)
212}
213
214// stateBeginValue is the state at the beginning of the input.
215func stateBeginValue(s *scanner, c byte) int {
216	if isSpace(c) {
217		return scanSkipSpace
218	}
219	switch c {
220	case '{':
221		s.step = stateBeginStringOrEmpty
222		return s.pushParseState(c, parseObjectKey, scanBeginObject)
223	case '[':
224		s.step = stateBeginValueOrEmpty
225		return s.pushParseState(c, parseArrayValue, scanBeginArray)
226	case '"':
227		s.step = stateInString
228		return scanBeginLiteral
229	case '-':
230		s.step = stateNeg
231		return scanBeginLiteral
232	case '0': // beginning of 0.123
233		s.step = state0
234		return scanBeginLiteral
235	case 't': // beginning of true
236		s.step = stateT
237		return scanBeginLiteral
238	case 'f': // beginning of false
239		s.step = stateF
240		return scanBeginLiteral
241	case 'n': // beginning of null
242		s.step = stateN
243		return scanBeginLiteral
244	}
245	if '1' <= c && c <= '9' { // beginning of 1234.5
246		s.step = state1
247		return scanBeginLiteral
248	}
249	return s.error(c, "looking for beginning of value")
250}
251
252// stateBeginStringOrEmpty is the state after reading `{`.
253func stateBeginStringOrEmpty(s *scanner, c byte) int {
254	if isSpace(c) {
255		return scanSkipSpace
256	}
257	if c == '}' {
258		n := len(s.parseState)
259		s.parseState[n-1] = parseObjectValue
260		return stateEndValue(s, c)
261	}
262	return stateBeginString(s, c)
263}
264
265// stateBeginString is the state after reading `{"key": value,`.
266func stateBeginString(s *scanner, c byte) int {
267	if isSpace(c) {
268		return scanSkipSpace
269	}
270	if c == '"' {
271		s.step = stateInString
272		return scanBeginLiteral
273	}
274	return s.error(c, "looking for beginning of object key string")
275}
276
277// stateEndValue is the state after completing a value,
278// such as after reading `{}` or `true` or `["x"`.
279func stateEndValue(s *scanner, c byte) int {
280	n := len(s.parseState)
281	if n == 0 {
282		// Completed top-level before the current byte.
283		s.step = stateEndTop
284		s.endTop = true
285		return stateEndTop(s, c)
286	}
287	if isSpace(c) {
288		s.step = stateEndValue
289		return scanSkipSpace
290	}
291	ps := s.parseState[n-1]
292	switch ps {
293	case parseObjectKey:
294		if c == ':' {
295			s.parseState[n-1] = parseObjectValue
296			s.step = stateBeginValue
297			return scanObjectKey
298		}
299		return s.error(c, "after object key")
300	case parseObjectValue:
301		if c == ',' {
302			s.parseState[n-1] = parseObjectKey
303			s.step = stateBeginString
304			return scanObjectValue
305		}
306		if c == '}' {
307			s.popParseState()
308			return scanEndObject
309		}
310		return s.error(c, "after object key:value pair")
311	case parseArrayValue:
312		if c == ',' {
313			s.step = stateBeginValue
314			return scanArrayValue
315		}
316		if c == ']' {
317			s.popParseState()
318			return scanEndArray
319		}
320		return s.error(c, "after array element")
321	}
322	return s.error(c, "")
323}
324
325// stateEndTop is the state after finishing the top-level value,
326// such as after reading `{}` or `[1,2,3]`.
327// Only space characters should be seen now.
328func stateEndTop(s *scanner, c byte) int {
329	if !isSpace(c) {
330		// Complain about non-space byte on next call.
331		s.error(c, "after top-level value")
332	}
333	return scanEnd
334}
335
336// stateInString is the state after reading `"`.
337func stateInString(s *scanner, c byte) int {
338	if c == '"' {
339		s.step = stateEndValue
340		return scanContinue
341	}
342	if c == '\\' {
343		s.step = stateInStringEsc
344		return scanContinue
345	}
346	if c < 0x20 {
347		return s.error(c, "in string literal")
348	}
349	return scanContinue
350}
351
352// stateInStringEsc is the state after reading `"\` during a quoted string.
353func stateInStringEsc(s *scanner, c byte) int {
354	switch c {
355	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
356		s.step = stateInString
357		return scanContinue
358	case 'u':
359		s.step = stateInStringEscU
360		return scanContinue
361	}
362	return s.error(c, "in string escape code")
363}
364
365// stateInStringEscU is the state after reading `"\u` during a quoted string.
366func stateInStringEscU(s *scanner, c byte) int {
367	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
368		s.step = stateInStringEscU1
369		return scanContinue
370	}
371	// numbers
372	return s.error(c, "in \\u hexadecimal character escape")
373}
374
375// stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
376func stateInStringEscU1(s *scanner, c byte) int {
377	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
378		s.step = stateInStringEscU12
379		return scanContinue
380	}
381	// numbers
382	return s.error(c, "in \\u hexadecimal character escape")
383}
384
385// stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
386func stateInStringEscU12(s *scanner, c byte) int {
387	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
388		s.step = stateInStringEscU123
389		return scanContinue
390	}
391	// numbers
392	return s.error(c, "in \\u hexadecimal character escape")
393}
394
395// stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
396func stateInStringEscU123(s *scanner, c byte) int {
397	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
398		s.step = stateInString
399		return scanContinue
400	}
401	// numbers
402	return s.error(c, "in \\u hexadecimal character escape")
403}
404
405// stateNeg is the state after reading `-` during a number.
406func stateNeg(s *scanner, c byte) int {
407	if c == '0' {
408		s.step = state0
409		return scanContinue
410	}
411	if '1' <= c && c <= '9' {
412		s.step = state1
413		return scanContinue
414	}
415	return s.error(c, "in numeric literal")
416}
417
418// state1 is the state after reading a non-zero integer during a number,
419// such as after reading `1` or `100` but not `0`.
420func state1(s *scanner, c byte) int {
421	if '0' <= c && c <= '9' {
422		s.step = state1
423		return scanContinue
424	}
425	return state0(s, c)
426}
427
428// state0 is the state after reading `0` during a number.
429func state0(s *scanner, c byte) int {
430	if c == '.' {
431		s.step = stateDot
432		return scanContinue
433	}
434	if c == 'e' || c == 'E' {
435		s.step = stateE
436		return scanContinue
437	}
438	return stateEndValue(s, c)
439}
440
441// stateDot is the state after reading the integer and decimal point in a number,
442// such as after reading `1.`.
443func stateDot(s *scanner, c byte) int {
444	if '0' <= c && c <= '9' {
445		s.step = stateDot0
446		return scanContinue
447	}
448	return s.error(c, "after decimal point in numeric literal")
449}
450
451// stateDot0 is the state after reading the integer, decimal point, and subsequent
452// digits of a number, such as after reading `3.14`.
453func stateDot0(s *scanner, c byte) int {
454	if '0' <= c && c <= '9' {
455		return scanContinue
456	}
457	if c == 'e' || c == 'E' {
458		s.step = stateE
459		return scanContinue
460	}
461	return stateEndValue(s, c)
462}
463
464// stateE is the state after reading the mantissa and e in a number,
465// such as after reading `314e` or `0.314e`.
466func stateE(s *scanner, c byte) int {
467	if c == '+' || c == '-' {
468		s.step = stateESign
469		return scanContinue
470	}
471	return stateESign(s, c)
472}
473
474// stateESign is the state after reading the mantissa, e, and sign in a number,
475// such as after reading `314e-` or `0.314e+`.
476func stateESign(s *scanner, c byte) int {
477	if '0' <= c && c <= '9' {
478		s.step = stateE0
479		return scanContinue
480	}
481	return s.error(c, "in exponent of numeric literal")
482}
483
484// stateE0 is the state after reading the mantissa, e, optional sign,
485// and at least one digit of the exponent in a number,
486// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
487func stateE0(s *scanner, c byte) int {
488	if '0' <= c && c <= '9' {
489		return scanContinue
490	}
491	return stateEndValue(s, c)
492}
493
494// stateT is the state after reading `t`.
495func stateT(s *scanner, c byte) int {
496	if c == 'r' {
497		s.step = stateTr
498		return scanContinue
499	}
500	return s.error(c, "in literal true (expecting 'r')")
501}
502
503// stateTr is the state after reading `tr`.
504func stateTr(s *scanner, c byte) int {
505	if c == 'u' {
506		s.step = stateTru
507		return scanContinue
508	}
509	return s.error(c, "in literal true (expecting 'u')")
510}
511
512// stateTru is the state after reading `tru`.
513func stateTru(s *scanner, c byte) int {
514	if c == 'e' {
515		s.step = stateEndValue
516		return scanContinue
517	}
518	return s.error(c, "in literal true (expecting 'e')")
519}
520
521// stateF is the state after reading `f`.
522func stateF(s *scanner, c byte) int {
523	if c == 'a' {
524		s.step = stateFa
525		return scanContinue
526	}
527	return s.error(c, "in literal false (expecting 'a')")
528}
529
530// stateFa is the state after reading `fa`.
531func stateFa(s *scanner, c byte) int {
532	if c == 'l' {
533		s.step = stateFal
534		return scanContinue
535	}
536	return s.error(c, "in literal false (expecting 'l')")
537}
538
539// stateFal is the state after reading `fal`.
540func stateFal(s *scanner, c byte) int {
541	if c == 's' {
542		s.step = stateFals
543		return scanContinue
544	}
545	return s.error(c, "in literal false (expecting 's')")
546}
547
548// stateFals is the state after reading `fals`.
549func stateFals(s *scanner, c byte) int {
550	if c == 'e' {
551		s.step = stateEndValue
552		return scanContinue
553	}
554	return s.error(c, "in literal false (expecting 'e')")
555}
556
557// stateN is the state after reading `n`.
558func stateN(s *scanner, c byte) int {
559	if c == 'u' {
560		s.step = stateNu
561		return scanContinue
562	}
563	return s.error(c, "in literal null (expecting 'u')")
564}
565
566// stateNu is the state after reading `nu`.
567func stateNu(s *scanner, c byte) int {
568	if c == 'l' {
569		s.step = stateNul
570		return scanContinue
571	}
572	return s.error(c, "in literal null (expecting 'l')")
573}
574
575// stateNul is the state after reading `nul`.
576func stateNul(s *scanner, c byte) int {
577	if c == 'l' {
578		s.step = stateEndValue
579		return scanContinue
580	}
581	return s.error(c, "in literal null (expecting 'l')")
582}
583
584// stateError is the state after reaching a syntax error,
585// such as after reading `[1}` or `5.1.2`.
586func stateError(s *scanner, c byte) int {
587	return scanError
588}
589
590// error records an error and switches to the error state.
591func (s *scanner) error(c byte, context string) int {
592	s.step = stateError
593	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
594	return scanError
595}
596
597// quoteChar formats c as a quoted character literal.
598func quoteChar(c byte) string {
599	// special cases - different from quoted strings
600	if c == '\'' {
601		return `'\''`
602	}
603	if c == '"' {
604		return `'"'`
605	}
606
607	// use quoted string with different quotation marks
608	s := strconv.Quote(string(c))
609	return "'" + s[1:len(s)-1] + "'"
610}
611