xref: /aosp_15_r20/external/cronet/third_party/protobuf/src/google/protobuf/util/internal/json_stream_parser.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32 
33 #include <algorithm>
34 #include <cctype>
35 #include <cmath>
36 #include <memory>
37 #include <stack>
38 #include <string>
39 
40 #include <google/protobuf/stubs/common.h>
41 #include <google/protobuf/stubs/logging.h>
42 #include <google/protobuf/stubs/strutil.h>
43 #include <google/protobuf/stubs/status.h>
44 #include <google/protobuf/util/internal/object_writer.h>
45 #include <google/protobuf/util/internal/json_escaping.h>
46 
47 
48 namespace google {
49 namespace protobuf {
50 namespace util {
51 
52 namespace converter {
53 
54 // Number of digits in an escaped UTF-16 code unit ('\\' 'u' X X X X)
55 static const int kUnicodeEscapedLength = 6;
56 
57 static const int kDefaultMaxRecursionDepth = 100;
58 
59 // These cannot be constexpr for portability with VS2015.
60 static const StringPiece kKeywordTrue = "true";
61 static const StringPiece kKeywordFalse = "false";
62 static const StringPiece kKeywordNull = "null";
63 
// Returns true when `c` is an ASCII letter, an underscore or a dollar sign.
inline bool IsLetter(char c) {
  if (c >= 'a' && c <= 'z') return true;
  if (c >= 'A' && c <= 'Z') return true;
  return c == '_' || c == '$';
}
68 
IsAlphanumeric(char c)69 inline bool IsAlphanumeric(char c) {
70   return IsLetter(c) || ('0' <= c && c <= '9');
71 }
72 
73 // Indicates a character may not be part of an unquoted key.
IsKeySeparator(char c)74 inline bool IsKeySeparator(char c) {
75   return (ascii_isspace(c) || c == '"' || c == '\'' || c == '{' ||
76           c == '}' || c == '[' || c == ']' || c == ':' || c == ',');
77 }
78 
ReplaceInvalidCodePoints(StringPiece str,const std::string & replacement,std::string * dst)79 inline void ReplaceInvalidCodePoints(StringPiece str,
80                                      const std::string& replacement,
81                                      std::string* dst) {
82   while (!str.empty()) {
83     int n_valid_bytes = internal::UTF8SpnStructurallyValid(str);
84     StringPiece valid_part = str.substr(0, n_valid_bytes);
85     StrAppend(dst, valid_part);
86 
87     if (n_valid_bytes == str.size()) {
88       break;
89     }
90 
91     // Append replacement value.
92     StrAppend(dst, replacement);
93 
94     // Move past valid bytes + one invalid byte.
95     str.remove_prefix(n_valid_bytes + 1);
96   }
97 }
98 
ConsumeKey(StringPiece * input,StringPiece * key)99 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
100   if (input->empty() || !IsLetter((*input)[0])) return false;
101   int len = 1;
102   for (; len < input->size(); ++len) {
103     if (!IsAlphanumeric((*input)[len])) {
104       break;
105     }
106   }
107   *key = StringPiece(input->data(), len);
108   *input = StringPiece(input->data() + len, input->size() - len);
109   return true;
110 }
111 
112 // Same as 'ConsumeKey', but allows a widened set of key characters.
ConsumeKeyPermissive(StringPiece * input,StringPiece * key)113 static bool ConsumeKeyPermissive(StringPiece* input,
114                                  StringPiece* key) {
115   if (input->empty() || !IsLetter((*input)[0])) return false;
116   int len = 1;
117   for (; len < input->size(); ++len) {
118     if (IsKeySeparator((*input)[len])) {
119       break;
120     }
121   }
122   *key = StringPiece(input->data(), len);
123   *input = StringPiece(input->data() + len, input->size() - len);
124   return true;
125 }
126 
MatchKey(StringPiece input)127 static bool MatchKey(StringPiece input) {
128   return !input.empty() && IsLetter(input[0]);
129 }
130 
JsonStreamParser(ObjectWriter * ow)131 JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
132     : ow_(ow),
133       stack_(),
134       leftover_(),
135       json_(),
136       p_(),
137       key_(),
138       key_storage_(),
139       finishing_(false),
140       seen_non_whitespace_(false),
141       allow_no_root_element_(false),
142       parsed_(),
143       parsed_storage_(),
144       string_open_(0),
145       chunk_storage_(),
146       coerce_to_utf8_(false),
147       utf8_replacement_character_(" "),
148       allow_empty_null_(false),
149       allow_permissive_key_naming_(false),
150       loose_float_number_conversion_(false),
151       recursion_depth_(0),
152       max_recursion_depth_(kDefaultMaxRecursionDepth) {
153   // Initialize the stack with a single value to be parsed.
154   stack_.push(VALUE);
155 }
156 
~JsonStreamParser()157 JsonStreamParser::~JsonStreamParser() {}
158 
159 
Parse(StringPiece json)160 util::Status JsonStreamParser::Parse(StringPiece json) {
161   StringPiece chunk = json;
162   // If we have leftovers from a previous chunk, append the new chunk to it
163   // and create a new StringPiece pointing at the string's data. This could
164   // be large but we rely on the chunks to be small, assuming they are
165   // fragments of a Cord.
166   if (!leftover_.empty()) {
167     // Don't point chunk to leftover_ because leftover_ will be updated in
168     // ParseChunk(chunk).
169     chunk_storage_.swap(leftover_);
170     StrAppend(&chunk_storage_, json);
171     chunk = StringPiece(chunk_storage_);
172   }
173 
174   // Find the structurally valid UTF8 prefix and parse only that.
175   int n = internal::UTF8SpnStructurallyValid(chunk);
176   if (n > 0) {
177     util::Status status = ParseChunk(chunk.substr(0, n));
178 
179     // Any leftover characters are stashed in leftover_ for later parsing when
180     // there is more data available.
181     StrAppend(&leftover_, chunk.substr(n));
182     return status;
183   } else {
184     leftover_.assign(chunk.data(), chunk.size());
185     return util::Status();
186   }
187 }
188 
FinishParse()189 util::Status JsonStreamParser::FinishParse() {
190   // If we do not expect anything and there is nothing left to parse we're all
191   // done.
192   if (stack_.empty() && leftover_.empty()) {
193     return util::Status();
194   }
195 
196   // Lifetime needs to last until RunParser returns, so keep this variable
197   // outside of the coerce_to_utf8 block.
198   std::unique_ptr<std::string> scratch;
199 
200   bool is_valid_utf8 = internal::IsStructurallyValidUTF8(leftover_);
201   if (coerce_to_utf8_ && !is_valid_utf8) {
202     scratch.reset(new std::string);
203     scratch->reserve(leftover_.size() * utf8_replacement_character_.size());
204     ReplaceInvalidCodePoints(leftover_, utf8_replacement_character_,
205                              scratch.get());
206     p_ = json_ = *scratch;
207   } else {
208     p_ = json_ = leftover_;
209     if (!is_valid_utf8) {
210       return ReportFailure("Encountered non UTF-8 code points.",
211                            ParseErrorType::NON_UTF_8);
212     }
213   }
214 
215   // Parse the remainder in finishing mode, which reports errors for things like
216   // unterminated strings or unknown tokens that would normally be retried.
217   finishing_ = true;
218   util::Status result = RunParser();
219   if (result.ok()) {
220     SkipWhitespace();
221     if (!p_.empty()) {
222       result =
223           ReportFailure("Parsing terminated before end of input.",
224                         ParseErrorType::PARSING_TERMINATED_BEFORE_END_OF_INPUT);
225     }
226   }
227   return result;
228 }
229 
ParseChunk(StringPiece chunk)230 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
231   // Do not do any work if the chunk is empty.
232   if (chunk.empty()) return util::Status();
233 
234   p_ = json_ = chunk;
235 
236   finishing_ = false;
237   util::Status result = RunParser();
238   if (!result.ok()) return result;
239 
240   SkipWhitespace();
241   if (p_.empty()) {
242     // If we parsed everything we had, clear the leftover.
243     leftover_.clear();
244   } else {
245     // If we do not expect anything i.e. stack is empty, and we have non-empty
246     // string left to parse, we report an error.
247     if (stack_.empty()) {
248       return ReportFailure(
249           "Parsing terminated before end of input.",
250           ParseErrorType::PARSING_TERMINATED_BEFORE_END_OF_INPUT);
251     }
252     // If we expect future data i.e. stack is non-empty, and we have some
253     // unparsed data left, we save it for later parse.
254     leftover_ = std::string(p_);
255   }
256   return util::Status();
257 }
258 
IsInputAllWhiteSpaces(TokenType type)259 bool JsonStreamParser::IsInputAllWhiteSpaces(TokenType type) {
260   // Conclude the whole input is full of white spaces by:
261   // - it is at the finishing stage
262   // - we have run out of the input data
263   // - haven't seen non-whitespace char so far
264   if (finishing_ && p_.empty() && type == UNKNOWN && !seen_non_whitespace_) {
265     return true;
266   }
267   return false;
268 }
269 
RunParser()270 util::Status JsonStreamParser::RunParser() {
271   while (!stack_.empty()) {
272     ParseType type = stack_.top();
273     TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
274     stack_.pop();
275     util::Status result;
276     switch (type) {
277       case VALUE:
278         if (allow_no_root_element_ && IsInputAllWhiteSpaces(t)) {
279           return util::Status();
280         }
281         result = ParseValue(t);
282         break;
283 
284       case OBJ_MID:
285         result = ParseObjectMid(t);
286         break;
287 
288       case ENTRY:
289         result = ParseEntry(t);
290         break;
291 
292       case ENTRY_MID:
293         result = ParseEntryMid(t);
294         break;
295 
296       case ARRAY_VALUE:
297         result = ParseArrayValue(t);
298         break;
299 
300       case ARRAY_MID:
301         result = ParseArrayMid(t);
302         break;
303 
304       default:
305         result =
306             util::InternalError(StrCat("Unknown parse type: ", type));
307         break;
308     }
309     if (!result.ok()) {
310       // If we were cancelled, save our state and try again later.
311       if (!finishing_ && util::IsCancelled(result)) {
312         stack_.push(type);
313         // If we have a key we still need to render, make sure to save off the
314         // contents in our own storage.
315         if (!key_.empty() && key_storage_.empty()) {
316           StrAppend(&key_storage_, key_);
317           key_ = StringPiece(key_storage_);
318         }
319         result = util::Status();
320       }
321       return result;
322     }
323   }
324   return util::Status();
325 }
326 
ParseValue(TokenType type)327 util::Status JsonStreamParser::ParseValue(TokenType type) {
328   switch (type) {
329     case BEGIN_OBJECT:
330       return HandleBeginObject();
331     case BEGIN_ARRAY:
332       return HandleBeginArray();
333     case BEGIN_STRING:
334       return ParseString();
335     case BEGIN_NUMBER:
336       return ParseNumber();
337     case BEGIN_TRUE:
338       return ParseTrue();
339     case BEGIN_FALSE:
340       return ParseFalse();
341     case BEGIN_NULL:
342       return ParseNull();
343     case UNKNOWN:
344       return ReportUnknown("Expected a value.", ParseErrorType::EXPECTED_VALUE);
345     default: {
346       // Special case for having been cut off while parsing, wait for more data.
347       // This handles things like 'fals' being at the end of the string, we
348       // don't know if the next char would be e, completing it, or something
349       // else, making it invalid.
350       if (!finishing_ && p_.length() < kKeywordFalse.length()) {
351         return util::CancelledError("");
352       }
353 
354       if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
355         return ParseEmptyNull();
356       }
357       return ReportFailure("Unexpected token.",
358                            ParseErrorType::UNEXPECTED_TOKEN);
359     }
360   }
361 }
362 
ParseString()363 util::Status JsonStreamParser::ParseString() {
364   util::Status result = ParseStringHelper();
365   if (result.ok()) {
366     ow_->RenderString(key_, parsed_);
367     key_ = StringPiece();
368     parsed_ = StringPiece();
369     parsed_storage_.clear();
370   }
371   return result;
372 }
373 
ParseStringHelper()374 util::Status JsonStreamParser::ParseStringHelper() {
375   // If we haven't seen the start quote, grab it and remember it for later.
376   if (string_open_ == 0) {
377     string_open_ = *p_.data();
378     GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
379     Advance();
380   }
381   // Track where we last copied data from so we can minimize copying.
382   const char* last = p_.data();
383   while (!p_.empty()) {
384     const char* data = p_.data();
385     if (*data == '\\') {
386       // We're about to handle an escape, copy all bytes from last to data.
387       if (last < data) {
388         parsed_storage_.append(last, data - last);
389       }
390       // If we ran out of string after the \, cancel or report an error
391       // depending on if we expect more data later.
392       if (p_.length() == 1) {
393         if (!finishing_) {
394           return util::CancelledError("");
395         }
396         return ReportFailure("Closing quote expected in string.",
397                              ParseErrorType::EXPECTED_CLOSING_QUOTE);
398       }
399       // Parse a unicode escape if we found \u in the string.
400       if (data[1] == 'u') {
401         util::Status result = ParseUnicodeEscape();
402         if (!result.ok()) {
403           return result;
404         }
405         // Move last pointer past the unicode escape and continue.
406         last = p_.data();
407         continue;
408       }
409       // Handle the standard set of backslash-escaped characters.
410       switch (data[1]) {
411         case 'b':
412           parsed_storage_.push_back('\b');
413           break;
414         case 'f':
415           parsed_storage_.push_back('\f');
416           break;
417         case 'n':
418           parsed_storage_.push_back('\n');
419           break;
420         case 'r':
421           parsed_storage_.push_back('\r');
422           break;
423         case 't':
424           parsed_storage_.push_back('\t');
425           break;
426         case 'v':
427           parsed_storage_.push_back('\v');
428           break;
429         default:
430           parsed_storage_.push_back(data[1]);
431       }
432       // We handled two characters, so advance past them and continue.
433       p_.remove_prefix(2);
434       last = p_.data();
435       continue;
436     }
437     // If we found the closing quote note it, advance past it, and return.
438     if (*data == string_open_) {
439       // If we didn't copy anything, reuse the input buffer.
440       if (parsed_storage_.empty()) {
441         parsed_ = StringPiece(last, data - last);
442       } else {
443         if (last < data) {
444           parsed_storage_.append(last, data - last);
445         }
446         parsed_ = StringPiece(parsed_storage_);
447       }
448       // Clear the quote char so next time we try to parse a string we'll
449       // start fresh.
450       string_open_ = 0;
451       Advance();
452       return util::Status();
453     }
454     // Normal character, just advance past it.
455     Advance();
456   }
457   // If we ran out of characters, copy over what we have so far.
458   if (last < p_.data()) {
459     parsed_storage_.append(last, p_.data() - last);
460   }
461   // If we didn't find the closing quote but we expect more data, cancel for now
462   if (!finishing_) {
463     return util::CancelledError("");
464   }
465   // End of string reached without a closing quote, report an error.
466   string_open_ = 0;
467   return ReportFailure("Closing quote expected in string.",
468                        ParseErrorType::EXPECTED_CLOSING_QUOTE);
469 }
470 
471 // Converts a unicode escaped character to a decimal value stored in a char32
472 // for use in UTF8 encoding utility.  We assume that str begins with \uhhhh and
473 // convert that from the hex number to a decimal value.
474 //
475 // There are some security exploits with UTF-8 that we should be careful of:
476 //   - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
477 //   - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()478 util::Status JsonStreamParser::ParseUnicodeEscape() {
479   if (p_.length() < kUnicodeEscapedLength) {
480     if (!finishing_) {
481       return util::CancelledError("");
482     }
483     return ReportFailure("Illegal hex string.",
484                          ParseErrorType::ILLEGAL_HEX_STRING);
485   }
486   GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
487   GOOGLE_DCHECK_EQ('u', p_.data()[1]);
488   uint32_t code = 0;
489   for (int i = 2; i < kUnicodeEscapedLength; ++i) {
490     if (!isxdigit(p_.data()[i])) {
491       return ReportFailure("Invalid escape sequence.",
492                            ParseErrorType::INVALID_ESCAPE_SEQUENCE);
493     }
494     code = (code << 4) + hex_digit_to_int(p_.data()[i]);
495   }
496   if (code >= JsonEscaping::kMinHighSurrogate &&
497       code <= JsonEscaping::kMaxHighSurrogate) {
498     if (p_.length() < 2 * kUnicodeEscapedLength) {
499       if (!finishing_) {
500         return util::CancelledError("");
501       }
502       if (!coerce_to_utf8_) {
503         return ReportFailure("Missing low surrogate.",
504                              ParseErrorType::MISSING_LOW_SURROGATE);
505       }
506     } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
507                p_.data()[kUnicodeEscapedLength + 1] == 'u') {
508       uint32_t low_code = 0;
509       for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
510            ++i) {
511         if (!isxdigit(p_.data()[i])) {
512           return ReportFailure("Invalid escape sequence.",
513                                ParseErrorType::INVALID_ESCAPE_SEQUENCE);
514         }
515         low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
516       }
517       if (low_code >= JsonEscaping::kMinLowSurrogate &&
518           low_code <= JsonEscaping::kMaxLowSurrogate) {
519         // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
520         code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
521                JsonEscaping::kMinSupplementaryCodePoint;
522         // Advance past the first code unit escape.
523         p_.remove_prefix(kUnicodeEscapedLength);
524       } else if (!coerce_to_utf8_) {
525         return ReportFailure("Invalid low surrogate.",
526                              ParseErrorType::INVALID_LOW_SURROGATE);
527       }
528     } else if (!coerce_to_utf8_) {
529       return ReportFailure("Missing low surrogate.",
530                            ParseErrorType::MISSING_LOW_SURROGATE);
531     }
532   }
533   if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
534     return ReportFailure("Invalid unicode code point.",
535                          ParseErrorType::INVALID_UNICODE);
536   }
537   char buf[UTFmax];
538   int len = EncodeAsUTF8Char(code, buf);
539   // Advance past the [final] code unit escape.
540   p_.remove_prefix(kUnicodeEscapedLength);
541   parsed_storage_.append(buf, len);
542   return util::Status();
543 }
544 
ParseNumber()545 util::Status JsonStreamParser::ParseNumber() {
546   NumberResult number;
547   util::Status result = ParseNumberHelper(&number);
548   if (result.ok()) {
549     switch (number.type) {
550       case NumberResult::DOUBLE:
551         ow_->RenderDouble(key_, number.double_val);
552         key_ = StringPiece();
553         break;
554 
555       case NumberResult::INT:
556         ow_->RenderInt64(key_, number.int_val);
557         key_ = StringPiece();
558         break;
559 
560       case NumberResult::UINT:
561         ow_->RenderUint64(key_, number.uint_val);
562         key_ = StringPiece();
563         break;
564 
565       default:
566         return ReportFailure("Unable to parse number.",
567                              ParseErrorType::UNABLE_TO_PARSE_NUMBER);
568     }
569   }
570   return result;
571 }
572 
ParseDoubleHelper(const std::string & number,NumberResult * result)573 util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
574                                                  NumberResult* result) {
575   if (!safe_strtod(number, &result->double_val)) {
576     return ReportFailure("Unable to parse number.",
577                          ParseErrorType::UNABLE_TO_PARSE_NUMBER);
578   }
579   if (!loose_float_number_conversion_ && !std::isfinite(result->double_val)) {
580     return ReportFailure("Number exceeds the range of double.",
581                          ParseErrorType::NUMBER_EXCEEDS_RANGE_DOUBLE);
582   }
583   result->type = NumberResult::DOUBLE;
584   return util::Status();
585 }
586 
ParseNumberHelper(NumberResult * result)587 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
588   const char* data = p_.data();
589   int length = p_.length();
590 
591   // Look for the first non-numeric character, or the end of the string.
592   int index = 0;
593   bool floating = false;
594   bool negative = data[index] == '-';
595   // Find the first character that cannot be part of the number. Along the way
596   // detect if the number needs to be parsed as a double.
597   // Note that this restricts numbers to the JSON specification, so for example
598   // we do not support hex or octal notations.
599   for (; index < length; ++index) {
600     char c = data[index];
601     if (isdigit(c)) continue;
602     if (c == '.' || c == 'e' || c == 'E') {
603       floating = true;
604       continue;
605     }
606     if (c == '+' || c == '-' || c == 'x') continue;
607     // Not a valid number character, break out.
608     break;
609   }
610 
611   // If the entire input is a valid number, and we may have more content in the
612   // future, we abort for now and resume when we know more.
613   if (index == length && !finishing_) {
614     return util::CancelledError("");
615   }
616 
617   // Create a string containing just the number, so we can use safe_strtoX
618   std::string number = std::string(p_.substr(0, index));
619 
620   // Floating point number, parse as a double.
621   if (floating) {
622     util::Status status = ParseDoubleHelper(number, result);
623     if (status.ok()) {
624       p_.remove_prefix(index);
625     }
626     return status;
627   }
628 
629   // Positive non-floating point number, parse as a uint64_t.
630   if (!negative) {
631     // Octal/Hex numbers are not valid JSON values.
632     if (number.length() >= 2 && number[0] == '0') {
633       return ReportFailure(
634           "Octal/hex numbers are not valid JSON values.",
635           ParseErrorType::OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES);
636     }
637     if (safe_strtou64(number, &result->uint_val)) {
638       result->type = NumberResult::UINT;
639       p_.remove_prefix(index);
640       return util::Status();
641     } else {
642       // If the value is too large, parse it as double.
643       util::Status status = ParseDoubleHelper(number, result);
644       if (status.ok()) {
645         p_.remove_prefix(index);
646       }
647       return status;
648     }
649   }
650 
651   // Octal/Hex numbers are not valid JSON values.
652   if (number.length() >= 3 && number[1] == '0') {
653     return ReportFailure(
654         "Octal/hex numbers are not valid JSON values.",
655         ParseErrorType::OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES);
656   }
657   // Negative non-floating point number, parse as an int64_t.
658   if (safe_strto64(number, &result->int_val)) {
659     result->type = NumberResult::INT;
660     p_.remove_prefix(index);
661     return util::Status();
662   } else {
663     // If the value is too large, parse it as double.
664     util::Status status = ParseDoubleHelper(number, result);
665     if (status.ok()) {
666       p_.remove_prefix(index);
667     }
668     return status;
669   }
670 }
671 
HandleBeginObject()672 util::Status JsonStreamParser::HandleBeginObject() {
673   GOOGLE_DCHECK_EQ('{', *p_.data());
674   Advance();
675   ow_->StartObject(key_);
676   auto status = IncrementRecursionDepth(key_);
677   if (!status.ok()) {
678     return status;
679   }
680   key_ = StringPiece();
681   stack_.push(ENTRY);
682   return util::Status();
683 }
684 
ParseObjectMid(TokenType type)685 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
686   if (type == UNKNOWN) {
687     return ReportUnknown("Expected , or } after key:value pair.",
688                          ParseErrorType::EXPECTED_COMMA_OR_BRACES);
689   }
690 
691   // Object is complete, advance past the comma and render the EndObject.
692   if (type == END_OBJECT) {
693     Advance();
694     ow_->EndObject();
695     --recursion_depth_;
696     return util::Status();
697   }
698   // Found a comma, advance past it and get ready for an entry.
699   if (type == VALUE_SEPARATOR) {
700     Advance();
701     stack_.push(ENTRY);
702     return util::Status();
703   }
704   // Illegal token after key:value pair.
705   return ReportFailure("Expected , or } after key:value pair.",
706                        ParseErrorType::EXPECTED_COMMA_OR_BRACES);
707 }
708 
ParseEntry(TokenType type)709 util::Status JsonStreamParser::ParseEntry(TokenType type) {
710   if (type == UNKNOWN) {
711     return ReportUnknown("Expected an object key or }.",
712                          ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
713   }
714 
715   // Close the object and return. This allows for trailing commas.
716   if (type == END_OBJECT) {
717     ow_->EndObject();
718     Advance();
719     --recursion_depth_;
720     return util::Status();
721   }
722 
723   util::Status result;
724   if (type == BEGIN_STRING) {
725     // Key is a string (standard JSON), parse it and store the string.
726     result = ParseStringHelper();
727     if (result.ok()) {
728       key_storage_.clear();
729       if (!parsed_storage_.empty()) {
730         parsed_storage_.swap(key_storage_);
731         key_ = StringPiece(key_storage_);
732       } else {
733         key_ = parsed_;
734       }
735       parsed_ = StringPiece();
736     }
737   } else if (type == BEGIN_KEY) {
738     // Key is a bare key (back compat), create a StringPiece pointing to it.
739     result = ParseKey();
740   } else if (type == BEGIN_NULL || type == BEGIN_TRUE || type == BEGIN_FALSE) {
741     // Key may be a bare key that begins with a reserved word.
742     result = ParseKey();
743     if (result.ok() && (key_ == kKeywordNull || key_ == kKeywordTrue ||
744                         key_ == kKeywordFalse)) {
745       result = ReportFailure("Expected an object key or }.",
746                              ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
747     }
748   } else {
749     // Unknown key type, report an error.
750     result = ReportFailure("Expected an object key or }.",
751                            ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
752   }
753   // On success we next expect an entry mid ':' then an object mid ',' or '}'
754   if (result.ok()) {
755     stack_.push(OBJ_MID);
756     stack_.push(ENTRY_MID);
757   }
758   return result;
759 }
760 
ParseEntryMid(TokenType type)761 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
762   if (type == UNKNOWN) {
763     return ReportUnknown("Expected : between key:value pair.",
764                          ParseErrorType::EXPECTED_COLON);
765   }
766   if (type == ENTRY_SEPARATOR) {
767     Advance();
768     stack_.push(VALUE);
769     return util::Status();
770   }
771   return ReportFailure("Expected : between key:value pair.",
772                        ParseErrorType::EXPECTED_COLON);
773 }
774 
HandleBeginArray()775 util::Status JsonStreamParser::HandleBeginArray() {
776   GOOGLE_DCHECK_EQ('[', *p_.data());
777   Advance();
778   ow_->StartList(key_);
779   key_ = StringPiece();
780   stack_.push(ARRAY_VALUE);
781   return util::Status();
782 }
783 
ParseArrayValue(TokenType type)784 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
785   if (type == UNKNOWN) {
786     return ReportUnknown("Expected a value or ] within an array.",
787                          ParseErrorType::EXPECTED_VALUE_OR_BRACKET);
788   }
789 
790   if (type == END_ARRAY) {
791     ow_->EndList();
792     Advance();
793     return util::Status();
794   }
795 
796   // The ParseValue call may push something onto the stack so we need to make
797   // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
798   // empty-null array value is relying on this ARRAY_MID token.
799   stack_.push(ARRAY_MID);
800   util::Status result = ParseValue(type);
801   if (util::IsCancelled(result)) {
802     // If we were cancelled, pop back off the ARRAY_MID so we don't try to
803     // push it on again when we try over.
804     stack_.pop();
805   }
806   return result;
807 }
808 
ParseArrayMid(TokenType type)809 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
810   if (type == UNKNOWN) {
811     return ReportUnknown("Expected , or ] after array value.",
812                          ParseErrorType::EXPECTED_COMMA_OR_BRACKET);
813   }
814 
815   if (type == END_ARRAY) {
816     ow_->EndList();
817     Advance();
818     return util::Status();
819   }
820 
821   // Found a comma, advance past it and expect an array value next.
822   if (type == VALUE_SEPARATOR) {
823     Advance();
824     stack_.push(ARRAY_VALUE);
825     return util::Status();
826   }
827   // Illegal token after array value.
828   return ReportFailure("Expected , or ] after array value.",
829                        ParseErrorType::EXPECTED_COMMA_OR_BRACKET);
830 }
831 
ParseTrue()832 util::Status JsonStreamParser::ParseTrue() {
833   ow_->RenderBool(key_, true);
834   key_ = StringPiece();
835   p_.remove_prefix(kKeywordTrue.length());
836   return util::Status();
837 }
838 
ParseFalse()839 util::Status JsonStreamParser::ParseFalse() {
840   ow_->RenderBool(key_, false);
841   key_ = StringPiece();
842   p_.remove_prefix(kKeywordFalse.length());
843   return util::Status();
844 }
845 
ParseNull()846 util::Status JsonStreamParser::ParseNull() {
847   ow_->RenderNull(key_);
848   key_ = StringPiece();
849   p_.remove_prefix(kKeywordNull.length());
850   return util::Status();
851 }
852 
ParseEmptyNull()853 util::Status JsonStreamParser::ParseEmptyNull() {
854   ow_->RenderNull(key_);
855   key_ = StringPiece();
856   return util::Status();
857 }
858 
IsEmptyNullAllowed(TokenType type)859 bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
860   if (stack_.empty()) return false;
861   return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
862          stack_.top() == OBJ_MID;
863 }
864 
ReportFailure(StringPiece message,ParseErrorType parse_code)865 util::Status JsonStreamParser::ReportFailure(StringPiece message,
866                                              ParseErrorType parse_code) {
867   (void)parse_code;  // Parameter is used in Google-internal code.
868   static const int kContextLength = 20;
869   const char* p_start = p_.data();
870   const char* json_start = json_.data();
871   const char* begin = std::max(p_start - kContextLength, json_start);
872   const char* end =
873       std::min(p_start + kContextLength, json_start + json_.size());
874   StringPiece segment(begin, end - begin);
875   std::string location(p_start - begin, ' ');
876   location.push_back('^');
877   auto status = util::InvalidArgumentError(
878       StrCat(message, "\n", segment, "\n", location));
879   return status;
880 }
881 
ReportUnknown(StringPiece message,ParseErrorType parse_code)882 util::Status JsonStreamParser::ReportUnknown(StringPiece message,
883                                              ParseErrorType parse_code) {
884   // If we aren't finishing the parse, cancel parsing and try later.
885   if (!finishing_) {
886     return util::CancelledError("");
887   }
888   if (p_.empty()) {
889     return ReportFailure(StrCat("Unexpected end of string. ", message),
890                          parse_code);
891   }
892   return ReportFailure(message, parse_code);
893 }
894 
IncrementRecursionDepth(StringPiece key) const895 util::Status JsonStreamParser::IncrementRecursionDepth(
896     StringPiece key) const {
897   if (++recursion_depth_ > max_recursion_depth_) {
898     return util::InvalidArgumentError(StrCat(
899         "Message too deep. Max recursion depth reached for key '", key, "'"));
900   }
901   return util::Status();
902 }
903 
SkipWhitespace()904 void JsonStreamParser::SkipWhitespace() {
905   while (!p_.empty() && ascii_isspace(*p_.data())) {
906     Advance();
907   }
908   if (!p_.empty() && !ascii_isspace(*p_.data())) {
909     seen_non_whitespace_ = true;
910   }
911 }
912 
Advance()913 void JsonStreamParser::Advance() {
914   // Advance by moving one UTF8 character while making sure we don't go beyond
915   // the length of StringPiece.
916   p_.remove_prefix(std::min<int>(
917       p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
918 }
919 
ParseKey()920 util::Status JsonStreamParser::ParseKey() {
921   StringPiece original = p_;
922 
923   if (allow_permissive_key_naming_) {
924     if (!ConsumeKeyPermissive(&p_, &key_)) {
925       return ReportFailure("Invalid key or variable name.",
926                            ParseErrorType::INVALID_KEY_OR_VARIABLE_NAME);
927     }
928   } else {
929     if (!ConsumeKey(&p_, &key_)) {
930       return ReportFailure("Invalid key or variable name.",
931                            ParseErrorType::INVALID_KEY_OR_VARIABLE_NAME);
932     }
933   }
934 
935   // If we consumed everything but expect more data, reset p_ and cancel since
936   // we can't know if the key was complete or not.
937   if (!finishing_ && p_.empty()) {
938     p_ = original;
939     return util::CancelledError("");
940   }
941   // Since we aren't using the key storage, clear it out.
942   key_storage_.clear();
943   return util::Status();
944 }
945 
GetNextTokenType()946 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
947   SkipWhitespace();
948 
949   int size = p_.size();
950   if (size == 0) {
951     // If we ran out of data, report unknown and we'll place the previous parse
952     // type onto the stack and try again when we have more data.
953     return UNKNOWN;
954   }
955   // TODO(sven): Split this method based on context since different contexts
956   // support different tokens. Would slightly speed up processing?
957   const char* data = p_.data();
958   StringPiece data_view = StringPiece(data, size);
959   if (*data == '\"' || *data == '\'') return BEGIN_STRING;
960   if (*data == '-' || ('0' <= *data && *data <= '9')) {
961     return BEGIN_NUMBER;
962   }
963   if (size >= kKeywordTrue.length() &&
964       HasPrefixString(data_view, kKeywordTrue)) {
965     return BEGIN_TRUE;
966   }
967   if (size >= kKeywordFalse.length() &&
968       HasPrefixString(data_view, kKeywordFalse)) {
969     return BEGIN_FALSE;
970   }
971   if (size >= kKeywordNull.length() &&
972       HasPrefixString(data_view, kKeywordNull)) {
973     return BEGIN_NULL;
974   }
975   if (*data == '{') return BEGIN_OBJECT;
976   if (*data == '}') return END_OBJECT;
977   if (*data == '[') return BEGIN_ARRAY;
978   if (*data == ']') return END_ARRAY;
979   if (*data == ':') return ENTRY_SEPARATOR;
980   if (*data == ',') return VALUE_SEPARATOR;
981   if (MatchKey(p_)) {
982     return BEGIN_KEY;
983   }
984 
985   // We don't know that we necessarily have an invalid token here, just that we
986   // can't parse what we have so far. So we don't report an error and just
987   // return UNKNOWN so we can try again later when we have more data, or if we
988   // finish and we have leftovers.
989   return UNKNOWN;
990 }
991 
992 }  // namespace converter
993 }  // namespace util
994 }  // namespace protobuf
995 }  // namespace google
996