1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32
33 #include <algorithm>
34 #include <cctype>
35 #include <cmath>
36 #include <memory>
37 #include <stack>
38 #include <string>
39
40 #include <google/protobuf/stubs/common.h>
41 #include <google/protobuf/stubs/logging.h>
42 #include <google/protobuf/stubs/strutil.h>
43 #include <google/protobuf/stubs/status.h>
44 #include <google/protobuf/util/internal/object_writer.h>
45 #include <google/protobuf/util/internal/json_escaping.h>
46
47
48 namespace google {
49 namespace protobuf {
50 namespace util {
51
52 namespace converter {
53
// Number of characters in an escaped UTF-16 code unit ('\\' 'u' X X X X).
static const int kUnicodeEscapedLength = 6;

// Value the constructor stores in max_recursion_depth_; opening an object
// fails once the nesting depth exceeds that limit.
static const int kDefaultMaxRecursionDepth = 100;

// Literal keywords matched by the tokenizer and rejected as bare object keys.
// These cannot be constexpr for portability with VS2015.
static const StringPiece kKeywordTrue = "true";
static const StringPiece kKeywordFalse = "false";
static const StringPiece kKeywordNull = "null";
63
// Returns true if `c` may start an unquoted key: an ASCII letter, '_' or '$'.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
68
IsAlphanumeric(char c)69 inline bool IsAlphanumeric(char c) {
70 return IsLetter(c) || ('0' <= c && c <= '9');
71 }
72
73 // Indicates a character may not be part of an unquoted key.
IsKeySeparator(char c)74 inline bool IsKeySeparator(char c) {
75 return (ascii_isspace(c) || c == '"' || c == '\'' || c == '{' ||
76 c == '}' || c == '[' || c == ']' || c == ':' || c == ',');
77 }
78
ReplaceInvalidCodePoints(StringPiece str,const std::string & replacement,std::string * dst)79 inline void ReplaceInvalidCodePoints(StringPiece str,
80 const std::string& replacement,
81 std::string* dst) {
82 while (!str.empty()) {
83 int n_valid_bytes = internal::UTF8SpnStructurallyValid(str);
84 StringPiece valid_part = str.substr(0, n_valid_bytes);
85 StrAppend(dst, valid_part);
86
87 if (n_valid_bytes == str.size()) {
88 break;
89 }
90
91 // Append replacement value.
92 StrAppend(dst, replacement);
93
94 // Move past valid bytes + one invalid byte.
95 str.remove_prefix(n_valid_bytes + 1);
96 }
97 }
98
ConsumeKey(StringPiece * input,StringPiece * key)99 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
100 if (input->empty() || !IsLetter((*input)[0])) return false;
101 int len = 1;
102 for (; len < input->size(); ++len) {
103 if (!IsAlphanumeric((*input)[len])) {
104 break;
105 }
106 }
107 *key = StringPiece(input->data(), len);
108 *input = StringPiece(input->data() + len, input->size() - len);
109 return true;
110 }
111
112 // Same as 'ConsumeKey', but allows a widened set of key characters.
ConsumeKeyPermissive(StringPiece * input,StringPiece * key)113 static bool ConsumeKeyPermissive(StringPiece* input,
114 StringPiece* key) {
115 if (input->empty() || !IsLetter((*input)[0])) return false;
116 int len = 1;
117 for (; len < input->size(); ++len) {
118 if (IsKeySeparator((*input)[len])) {
119 break;
120 }
121 }
122 *key = StringPiece(input->data(), len);
123 *input = StringPiece(input->data() + len, input->size() - len);
124 return true;
125 }
126
MatchKey(StringPiece input)127 static bool MatchKey(StringPiece input) {
128 return !input.empty() && IsLetter(input[0]);
129 }
130
JsonStreamParser(ObjectWriter * ow)131 JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
132 : ow_(ow),
133 stack_(),
134 leftover_(),
135 json_(),
136 p_(),
137 key_(),
138 key_storage_(),
139 finishing_(false),
140 seen_non_whitespace_(false),
141 allow_no_root_element_(false),
142 parsed_(),
143 parsed_storage_(),
144 string_open_(0),
145 chunk_storage_(),
146 coerce_to_utf8_(false),
147 utf8_replacement_character_(" "),
148 allow_empty_null_(false),
149 allow_permissive_key_naming_(false),
150 loose_float_number_conversion_(false),
151 recursion_depth_(0),
152 max_recursion_depth_(kDefaultMaxRecursionDepth) {
153 // Initialize the stack with a single value to be parsed.
154 stack_.push(VALUE);
155 }
156
~JsonStreamParser()157 JsonStreamParser::~JsonStreamParser() {}
158
159
Parse(StringPiece json)160 util::Status JsonStreamParser::Parse(StringPiece json) {
161 StringPiece chunk = json;
162 // If we have leftovers from a previous chunk, append the new chunk to it
163 // and create a new StringPiece pointing at the string's data. This could
164 // be large but we rely on the chunks to be small, assuming they are
165 // fragments of a Cord.
166 if (!leftover_.empty()) {
167 // Don't point chunk to leftover_ because leftover_ will be updated in
168 // ParseChunk(chunk).
169 chunk_storage_.swap(leftover_);
170 StrAppend(&chunk_storage_, json);
171 chunk = StringPiece(chunk_storage_);
172 }
173
174 // Find the structurally valid UTF8 prefix and parse only that.
175 int n = internal::UTF8SpnStructurallyValid(chunk);
176 if (n > 0) {
177 util::Status status = ParseChunk(chunk.substr(0, n));
178
179 // Any leftover characters are stashed in leftover_ for later parsing when
180 // there is more data available.
181 StrAppend(&leftover_, chunk.substr(n));
182 return status;
183 } else {
184 leftover_.assign(chunk.data(), chunk.size());
185 return util::Status();
186 }
187 }
188
FinishParse()189 util::Status JsonStreamParser::FinishParse() {
190 // If we do not expect anything and there is nothing left to parse we're all
191 // done.
192 if (stack_.empty() && leftover_.empty()) {
193 return util::Status();
194 }
195
196 // Lifetime needs to last until RunParser returns, so keep this variable
197 // outside of the coerce_to_utf8 block.
198 std::unique_ptr<std::string> scratch;
199
200 bool is_valid_utf8 = internal::IsStructurallyValidUTF8(leftover_);
201 if (coerce_to_utf8_ && !is_valid_utf8) {
202 scratch.reset(new std::string);
203 scratch->reserve(leftover_.size() * utf8_replacement_character_.size());
204 ReplaceInvalidCodePoints(leftover_, utf8_replacement_character_,
205 scratch.get());
206 p_ = json_ = *scratch;
207 } else {
208 p_ = json_ = leftover_;
209 if (!is_valid_utf8) {
210 return ReportFailure("Encountered non UTF-8 code points.",
211 ParseErrorType::NON_UTF_8);
212 }
213 }
214
215 // Parse the remainder in finishing mode, which reports errors for things like
216 // unterminated strings or unknown tokens that would normally be retried.
217 finishing_ = true;
218 util::Status result = RunParser();
219 if (result.ok()) {
220 SkipWhitespace();
221 if (!p_.empty()) {
222 result =
223 ReportFailure("Parsing terminated before end of input.",
224 ParseErrorType::PARSING_TERMINATED_BEFORE_END_OF_INPUT);
225 }
226 }
227 return result;
228 }
229
ParseChunk(StringPiece chunk)230 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
231 // Do not do any work if the chunk is empty.
232 if (chunk.empty()) return util::Status();
233
234 p_ = json_ = chunk;
235
236 finishing_ = false;
237 util::Status result = RunParser();
238 if (!result.ok()) return result;
239
240 SkipWhitespace();
241 if (p_.empty()) {
242 // If we parsed everything we had, clear the leftover.
243 leftover_.clear();
244 } else {
245 // If we do not expect anything i.e. stack is empty, and we have non-empty
246 // string left to parse, we report an error.
247 if (stack_.empty()) {
248 return ReportFailure(
249 "Parsing terminated before end of input.",
250 ParseErrorType::PARSING_TERMINATED_BEFORE_END_OF_INPUT);
251 }
252 // If we expect future data i.e. stack is non-empty, and we have some
253 // unparsed data left, we save it for later parse.
254 leftover_ = std::string(p_);
255 }
256 return util::Status();
257 }
258
IsInputAllWhiteSpaces(TokenType type)259 bool JsonStreamParser::IsInputAllWhiteSpaces(TokenType type) {
260 // Conclude the whole input is full of white spaces by:
261 // - it is at the finishing stage
262 // - we have run out of the input data
263 // - haven't seen non-whitespace char so far
264 if (finishing_ && p_.empty() && type == UNKNOWN && !seen_non_whitespace_) {
265 return true;
266 }
267 return false;
268 }
269
RunParser()270 util::Status JsonStreamParser::RunParser() {
271 while (!stack_.empty()) {
272 ParseType type = stack_.top();
273 TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
274 stack_.pop();
275 util::Status result;
276 switch (type) {
277 case VALUE:
278 if (allow_no_root_element_ && IsInputAllWhiteSpaces(t)) {
279 return util::Status();
280 }
281 result = ParseValue(t);
282 break;
283
284 case OBJ_MID:
285 result = ParseObjectMid(t);
286 break;
287
288 case ENTRY:
289 result = ParseEntry(t);
290 break;
291
292 case ENTRY_MID:
293 result = ParseEntryMid(t);
294 break;
295
296 case ARRAY_VALUE:
297 result = ParseArrayValue(t);
298 break;
299
300 case ARRAY_MID:
301 result = ParseArrayMid(t);
302 break;
303
304 default:
305 result =
306 util::InternalError(StrCat("Unknown parse type: ", type));
307 break;
308 }
309 if (!result.ok()) {
310 // If we were cancelled, save our state and try again later.
311 if (!finishing_ && util::IsCancelled(result)) {
312 stack_.push(type);
313 // If we have a key we still need to render, make sure to save off the
314 // contents in our own storage.
315 if (!key_.empty() && key_storage_.empty()) {
316 StrAppend(&key_storage_, key_);
317 key_ = StringPiece(key_storage_);
318 }
319 result = util::Status();
320 }
321 return result;
322 }
323 }
324 return util::Status();
325 }
326
ParseValue(TokenType type)327 util::Status JsonStreamParser::ParseValue(TokenType type) {
328 switch (type) {
329 case BEGIN_OBJECT:
330 return HandleBeginObject();
331 case BEGIN_ARRAY:
332 return HandleBeginArray();
333 case BEGIN_STRING:
334 return ParseString();
335 case BEGIN_NUMBER:
336 return ParseNumber();
337 case BEGIN_TRUE:
338 return ParseTrue();
339 case BEGIN_FALSE:
340 return ParseFalse();
341 case BEGIN_NULL:
342 return ParseNull();
343 case UNKNOWN:
344 return ReportUnknown("Expected a value.", ParseErrorType::EXPECTED_VALUE);
345 default: {
346 // Special case for having been cut off while parsing, wait for more data.
347 // This handles things like 'fals' being at the end of the string, we
348 // don't know if the next char would be e, completing it, or something
349 // else, making it invalid.
350 if (!finishing_ && p_.length() < kKeywordFalse.length()) {
351 return util::CancelledError("");
352 }
353
354 if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
355 return ParseEmptyNull();
356 }
357 return ReportFailure("Unexpected token.",
358 ParseErrorType::UNEXPECTED_TOKEN);
359 }
360 }
361 }
362
ParseString()363 util::Status JsonStreamParser::ParseString() {
364 util::Status result = ParseStringHelper();
365 if (result.ok()) {
366 ow_->RenderString(key_, parsed_);
367 key_ = StringPiece();
368 parsed_ = StringPiece();
369 parsed_storage_.clear();
370 }
371 return result;
372 }
373
// Parses a quoted string starting at p_ and leaves its unescaped contents in
// parsed_. When no escape had to be processed and nothing was carried over,
// parsed_ points directly into the input; otherwise it points at
// parsed_storage_.
//
// The function is resumable: the opening quote is remembered in string_open_
// and partial content is accumulated in parsed_storage_, so a cancelled call
// can continue with the next chunk. Returns a cancelled status when input
// runs out in non-finishing mode, and a failure status when the string is
// unterminated in finishing mode or contains a bad unicode escape.
util::Status JsonStreamParser::ParseStringHelper() {
  // If we haven't seen the start quote, grab it and remember it for later.
  if (string_open_ == 0) {
    string_open_ = *p_.data();
    GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
    Advance();
  }
  // Track where we last copied data from so we can minimize copying.
  const char* last = p_.data();
  while (!p_.empty()) {
    const char* data = p_.data();
    if (*data == '\\') {
      // We're about to handle an escape, copy all bytes from last to data.
      if (last < data) {
        parsed_storage_.append(last, data - last);
      }
      // If we ran out of string after the \, cancel or report an error
      // depending on if we expect more data later.
      if (p_.length() == 1) {
        if (!finishing_) {
          return util::CancelledError("");
        }
        return ReportFailure("Closing quote expected in string.",
                             ParseErrorType::EXPECTED_CLOSING_QUOTE);
      }
      // Parse a unicode escape if we found \u in the string.
      if (data[1] == 'u') {
        util::Status result = ParseUnicodeEscape();
        if (!result.ok()) {
          return result;
        }
        // Move last pointer past the unicode escape and continue.
        last = p_.data();
        continue;
      }
      // Handle the standard set of backslash-escaped characters.
      switch (data[1]) {
        case 'b':
          parsed_storage_.push_back('\b');
          break;
        case 'f':
          parsed_storage_.push_back('\f');
          break;
        case 'n':
          parsed_storage_.push_back('\n');
          break;
        case 'r':
          parsed_storage_.push_back('\r');
          break;
        case 't':
          parsed_storage_.push_back('\t');
          break;
        case 'v':
          parsed_storage_.push_back('\v');
          break;
        default:
          // Any other escaped character stands for itself (e.g. \" \\ \/).
          parsed_storage_.push_back(data[1]);
      }
      // We handled two characters, so advance past them and continue.
      p_.remove_prefix(2);
      last = p_.data();
      continue;
    }
    // If we found the closing quote note it, advance past it, and return.
    if (*data == string_open_) {
      // If we didn't copy anything, reuse the input buffer.
      if (parsed_storage_.empty()) {
        parsed_ = StringPiece(last, data - last);
      } else {
        // Flush the bytes seen since the last copy, then expose the storage.
        if (last < data) {
          parsed_storage_.append(last, data - last);
        }
        parsed_ = StringPiece(parsed_storage_);
      }
      // Clear the quote char so next time we try to parse a string we'll
      // start fresh.
      string_open_ = 0;
      Advance();
      return util::Status();
    }
    // Normal character, just advance past it.
    Advance();
  }
  // If we ran out of characters, copy over what we have so far.
  if (last < p_.data()) {
    parsed_storage_.append(last, p_.data() - last);
  }
  // If we didn't find the closing quote but we expect more data, cancel for now
  if (!finishing_) {
    return util::CancelledError("");
  }
  // End of string reached without a closing quote, report an error.
  string_open_ = 0;
  return ReportFailure("Closing quote expected in string.",
                       ParseErrorType::EXPECTED_CLOSING_QUOTE);
}
470
471 // Converts a unicode escaped character to a decimal value stored in a char32
472 // for use in UTF8 encoding utility. We assume that str begins with \uhhhh and
473 // convert that from the hex number to a decimal value.
474 //
475 // There are some security exploits with UTF-8 that we should be careful of:
476 // - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
477 // - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()478 util::Status JsonStreamParser::ParseUnicodeEscape() {
479 if (p_.length() < kUnicodeEscapedLength) {
480 if (!finishing_) {
481 return util::CancelledError("");
482 }
483 return ReportFailure("Illegal hex string.",
484 ParseErrorType::ILLEGAL_HEX_STRING);
485 }
486 GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
487 GOOGLE_DCHECK_EQ('u', p_.data()[1]);
488 uint32_t code = 0;
489 for (int i = 2; i < kUnicodeEscapedLength; ++i) {
490 if (!isxdigit(p_.data()[i])) {
491 return ReportFailure("Invalid escape sequence.",
492 ParseErrorType::INVALID_ESCAPE_SEQUENCE);
493 }
494 code = (code << 4) + hex_digit_to_int(p_.data()[i]);
495 }
496 if (code >= JsonEscaping::kMinHighSurrogate &&
497 code <= JsonEscaping::kMaxHighSurrogate) {
498 if (p_.length() < 2 * kUnicodeEscapedLength) {
499 if (!finishing_) {
500 return util::CancelledError("");
501 }
502 if (!coerce_to_utf8_) {
503 return ReportFailure("Missing low surrogate.",
504 ParseErrorType::MISSING_LOW_SURROGATE);
505 }
506 } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
507 p_.data()[kUnicodeEscapedLength + 1] == 'u') {
508 uint32_t low_code = 0;
509 for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
510 ++i) {
511 if (!isxdigit(p_.data()[i])) {
512 return ReportFailure("Invalid escape sequence.",
513 ParseErrorType::INVALID_ESCAPE_SEQUENCE);
514 }
515 low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
516 }
517 if (low_code >= JsonEscaping::kMinLowSurrogate &&
518 low_code <= JsonEscaping::kMaxLowSurrogate) {
519 // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
520 code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
521 JsonEscaping::kMinSupplementaryCodePoint;
522 // Advance past the first code unit escape.
523 p_.remove_prefix(kUnicodeEscapedLength);
524 } else if (!coerce_to_utf8_) {
525 return ReportFailure("Invalid low surrogate.",
526 ParseErrorType::INVALID_LOW_SURROGATE);
527 }
528 } else if (!coerce_to_utf8_) {
529 return ReportFailure("Missing low surrogate.",
530 ParseErrorType::MISSING_LOW_SURROGATE);
531 }
532 }
533 if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
534 return ReportFailure("Invalid unicode code point.",
535 ParseErrorType::INVALID_UNICODE);
536 }
537 char buf[UTFmax];
538 int len = EncodeAsUTF8Char(code, buf);
539 // Advance past the [final] code unit escape.
540 p_.remove_prefix(kUnicodeEscapedLength);
541 parsed_storage_.append(buf, len);
542 return util::Status();
543 }
544
ParseNumber()545 util::Status JsonStreamParser::ParseNumber() {
546 NumberResult number;
547 util::Status result = ParseNumberHelper(&number);
548 if (result.ok()) {
549 switch (number.type) {
550 case NumberResult::DOUBLE:
551 ow_->RenderDouble(key_, number.double_val);
552 key_ = StringPiece();
553 break;
554
555 case NumberResult::INT:
556 ow_->RenderInt64(key_, number.int_val);
557 key_ = StringPiece();
558 break;
559
560 case NumberResult::UINT:
561 ow_->RenderUint64(key_, number.uint_val);
562 key_ = StringPiece();
563 break;
564
565 default:
566 return ReportFailure("Unable to parse number.",
567 ParseErrorType::UNABLE_TO_PARSE_NUMBER);
568 }
569 }
570 return result;
571 }
572
ParseDoubleHelper(const std::string & number,NumberResult * result)573 util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
574 NumberResult* result) {
575 if (!safe_strtod(number, &result->double_val)) {
576 return ReportFailure("Unable to parse number.",
577 ParseErrorType::UNABLE_TO_PARSE_NUMBER);
578 }
579 if (!loose_float_number_conversion_ && !std::isfinite(result->double_val)) {
580 return ReportFailure("Number exceeds the range of double.",
581 ParseErrorType::NUMBER_EXCEEDS_RANGE_DOUBLE);
582 }
583 result->type = NumberResult::DOUBLE;
584 return util::Status();
585 }
586
ParseNumberHelper(NumberResult * result)587 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
588 const char* data = p_.data();
589 int length = p_.length();
590
591 // Look for the first non-numeric character, or the end of the string.
592 int index = 0;
593 bool floating = false;
594 bool negative = data[index] == '-';
595 // Find the first character that cannot be part of the number. Along the way
596 // detect if the number needs to be parsed as a double.
597 // Note that this restricts numbers to the JSON specification, so for example
598 // we do not support hex or octal notations.
599 for (; index < length; ++index) {
600 char c = data[index];
601 if (isdigit(c)) continue;
602 if (c == '.' || c == 'e' || c == 'E') {
603 floating = true;
604 continue;
605 }
606 if (c == '+' || c == '-' || c == 'x') continue;
607 // Not a valid number character, break out.
608 break;
609 }
610
611 // If the entire input is a valid number, and we may have more content in the
612 // future, we abort for now and resume when we know more.
613 if (index == length && !finishing_) {
614 return util::CancelledError("");
615 }
616
617 // Create a string containing just the number, so we can use safe_strtoX
618 std::string number = std::string(p_.substr(0, index));
619
620 // Floating point number, parse as a double.
621 if (floating) {
622 util::Status status = ParseDoubleHelper(number, result);
623 if (status.ok()) {
624 p_.remove_prefix(index);
625 }
626 return status;
627 }
628
629 // Positive non-floating point number, parse as a uint64_t.
630 if (!negative) {
631 // Octal/Hex numbers are not valid JSON values.
632 if (number.length() >= 2 && number[0] == '0') {
633 return ReportFailure(
634 "Octal/hex numbers are not valid JSON values.",
635 ParseErrorType::OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES);
636 }
637 if (safe_strtou64(number, &result->uint_val)) {
638 result->type = NumberResult::UINT;
639 p_.remove_prefix(index);
640 return util::Status();
641 } else {
642 // If the value is too large, parse it as double.
643 util::Status status = ParseDoubleHelper(number, result);
644 if (status.ok()) {
645 p_.remove_prefix(index);
646 }
647 return status;
648 }
649 }
650
651 // Octal/Hex numbers are not valid JSON values.
652 if (number.length() >= 3 && number[1] == '0') {
653 return ReportFailure(
654 "Octal/hex numbers are not valid JSON values.",
655 ParseErrorType::OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES);
656 }
657 // Negative non-floating point number, parse as an int64_t.
658 if (safe_strto64(number, &result->int_val)) {
659 result->type = NumberResult::INT;
660 p_.remove_prefix(index);
661 return util::Status();
662 } else {
663 // If the value is too large, parse it as double.
664 util::Status status = ParseDoubleHelper(number, result);
665 if (status.ok()) {
666 p_.remove_prefix(index);
667 }
668 return status;
669 }
670 }
671
HandleBeginObject()672 util::Status JsonStreamParser::HandleBeginObject() {
673 GOOGLE_DCHECK_EQ('{', *p_.data());
674 Advance();
675 ow_->StartObject(key_);
676 auto status = IncrementRecursionDepth(key_);
677 if (!status.ok()) {
678 return status;
679 }
680 key_ = StringPiece();
681 stack_.push(ENTRY);
682 return util::Status();
683 }
684
ParseObjectMid(TokenType type)685 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
686 if (type == UNKNOWN) {
687 return ReportUnknown("Expected , or } after key:value pair.",
688 ParseErrorType::EXPECTED_COMMA_OR_BRACES);
689 }
690
691 // Object is complete, advance past the comma and render the EndObject.
692 if (type == END_OBJECT) {
693 Advance();
694 ow_->EndObject();
695 --recursion_depth_;
696 return util::Status();
697 }
698 // Found a comma, advance past it and get ready for an entry.
699 if (type == VALUE_SEPARATOR) {
700 Advance();
701 stack_.push(ENTRY);
702 return util::Status();
703 }
704 // Illegal token after key:value pair.
705 return ReportFailure("Expected , or } after key:value pair.",
706 ParseErrorType::EXPECTED_COMMA_OR_BRACES);
707 }
708
ParseEntry(TokenType type)709 util::Status JsonStreamParser::ParseEntry(TokenType type) {
710 if (type == UNKNOWN) {
711 return ReportUnknown("Expected an object key or }.",
712 ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
713 }
714
715 // Close the object and return. This allows for trailing commas.
716 if (type == END_OBJECT) {
717 ow_->EndObject();
718 Advance();
719 --recursion_depth_;
720 return util::Status();
721 }
722
723 util::Status result;
724 if (type == BEGIN_STRING) {
725 // Key is a string (standard JSON), parse it and store the string.
726 result = ParseStringHelper();
727 if (result.ok()) {
728 key_storage_.clear();
729 if (!parsed_storage_.empty()) {
730 parsed_storage_.swap(key_storage_);
731 key_ = StringPiece(key_storage_);
732 } else {
733 key_ = parsed_;
734 }
735 parsed_ = StringPiece();
736 }
737 } else if (type == BEGIN_KEY) {
738 // Key is a bare key (back compat), create a StringPiece pointing to it.
739 result = ParseKey();
740 } else if (type == BEGIN_NULL || type == BEGIN_TRUE || type == BEGIN_FALSE) {
741 // Key may be a bare key that begins with a reserved word.
742 result = ParseKey();
743 if (result.ok() && (key_ == kKeywordNull || key_ == kKeywordTrue ||
744 key_ == kKeywordFalse)) {
745 result = ReportFailure("Expected an object key or }.",
746 ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
747 }
748 } else {
749 // Unknown key type, report an error.
750 result = ReportFailure("Expected an object key or }.",
751 ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
752 }
753 // On success we next expect an entry mid ':' then an object mid ',' or '}'
754 if (result.ok()) {
755 stack_.push(OBJ_MID);
756 stack_.push(ENTRY_MID);
757 }
758 return result;
759 }
760
ParseEntryMid(TokenType type)761 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
762 if (type == UNKNOWN) {
763 return ReportUnknown("Expected : between key:value pair.",
764 ParseErrorType::EXPECTED_COLON);
765 }
766 if (type == ENTRY_SEPARATOR) {
767 Advance();
768 stack_.push(VALUE);
769 return util::Status();
770 }
771 return ReportFailure("Expected : between key:value pair.",
772 ParseErrorType::EXPECTED_COLON);
773 }
774
HandleBeginArray()775 util::Status JsonStreamParser::HandleBeginArray() {
776 GOOGLE_DCHECK_EQ('[', *p_.data());
777 Advance();
778 ow_->StartList(key_);
779 key_ = StringPiece();
780 stack_.push(ARRAY_VALUE);
781 return util::Status();
782 }
783
ParseArrayValue(TokenType type)784 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
785 if (type == UNKNOWN) {
786 return ReportUnknown("Expected a value or ] within an array.",
787 ParseErrorType::EXPECTED_VALUE_OR_BRACKET);
788 }
789
790 if (type == END_ARRAY) {
791 ow_->EndList();
792 Advance();
793 return util::Status();
794 }
795
796 // The ParseValue call may push something onto the stack so we need to make
797 // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
798 // empty-null array value is relying on this ARRAY_MID token.
799 stack_.push(ARRAY_MID);
800 util::Status result = ParseValue(type);
801 if (util::IsCancelled(result)) {
802 // If we were cancelled, pop back off the ARRAY_MID so we don't try to
803 // push it on again when we try over.
804 stack_.pop();
805 }
806 return result;
807 }
808
ParseArrayMid(TokenType type)809 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
810 if (type == UNKNOWN) {
811 return ReportUnknown("Expected , or ] after array value.",
812 ParseErrorType::EXPECTED_COMMA_OR_BRACKET);
813 }
814
815 if (type == END_ARRAY) {
816 ow_->EndList();
817 Advance();
818 return util::Status();
819 }
820
821 // Found a comma, advance past it and expect an array value next.
822 if (type == VALUE_SEPARATOR) {
823 Advance();
824 stack_.push(ARRAY_VALUE);
825 return util::Status();
826 }
827 // Illegal token after array value.
828 return ReportFailure("Expected , or ] after array value.",
829 ParseErrorType::EXPECTED_COMMA_OR_BRACKET);
830 }
831
ParseTrue()832 util::Status JsonStreamParser::ParseTrue() {
833 ow_->RenderBool(key_, true);
834 key_ = StringPiece();
835 p_.remove_prefix(kKeywordTrue.length());
836 return util::Status();
837 }
838
ParseFalse()839 util::Status JsonStreamParser::ParseFalse() {
840 ow_->RenderBool(key_, false);
841 key_ = StringPiece();
842 p_.remove_prefix(kKeywordFalse.length());
843 return util::Status();
844 }
845
ParseNull()846 util::Status JsonStreamParser::ParseNull() {
847 ow_->RenderNull(key_);
848 key_ = StringPiece();
849 p_.remove_prefix(kKeywordNull.length());
850 return util::Status();
851 }
852
ParseEmptyNull()853 util::Status JsonStreamParser::ParseEmptyNull() {
854 ow_->RenderNull(key_);
855 key_ = StringPiece();
856 return util::Status();
857 }
858
IsEmptyNullAllowed(TokenType type)859 bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
860 if (stack_.empty()) return false;
861 return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
862 stack_.top() == OBJ_MID;
863 }
864
ReportFailure(StringPiece message,ParseErrorType parse_code)865 util::Status JsonStreamParser::ReportFailure(StringPiece message,
866 ParseErrorType parse_code) {
867 (void)parse_code; // Parameter is used in Google-internal code.
868 static const int kContextLength = 20;
869 const char* p_start = p_.data();
870 const char* json_start = json_.data();
871 const char* begin = std::max(p_start - kContextLength, json_start);
872 const char* end =
873 std::min(p_start + kContextLength, json_start + json_.size());
874 StringPiece segment(begin, end - begin);
875 std::string location(p_start - begin, ' ');
876 location.push_back('^');
877 auto status = util::InvalidArgumentError(
878 StrCat(message, "\n", segment, "\n", location));
879 return status;
880 }
881
ReportUnknown(StringPiece message,ParseErrorType parse_code)882 util::Status JsonStreamParser::ReportUnknown(StringPiece message,
883 ParseErrorType parse_code) {
884 // If we aren't finishing the parse, cancel parsing and try later.
885 if (!finishing_) {
886 return util::CancelledError("");
887 }
888 if (p_.empty()) {
889 return ReportFailure(StrCat("Unexpected end of string. ", message),
890 parse_code);
891 }
892 return ReportFailure(message, parse_code);
893 }
894
IncrementRecursionDepth(StringPiece key) const895 util::Status JsonStreamParser::IncrementRecursionDepth(
896 StringPiece key) const {
897 if (++recursion_depth_ > max_recursion_depth_) {
898 return util::InvalidArgumentError(StrCat(
899 "Message too deep. Max recursion depth reached for key '", key, "'"));
900 }
901 return util::Status();
902 }
903
SkipWhitespace()904 void JsonStreamParser::SkipWhitespace() {
905 while (!p_.empty() && ascii_isspace(*p_.data())) {
906 Advance();
907 }
908 if (!p_.empty() && !ascii_isspace(*p_.data())) {
909 seen_non_whitespace_ = true;
910 }
911 }
912
Advance()913 void JsonStreamParser::Advance() {
914 // Advance by moving one UTF8 character while making sure we don't go beyond
915 // the length of StringPiece.
916 p_.remove_prefix(std::min<int>(
917 p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
918 }
919
ParseKey()920 util::Status JsonStreamParser::ParseKey() {
921 StringPiece original = p_;
922
923 if (allow_permissive_key_naming_) {
924 if (!ConsumeKeyPermissive(&p_, &key_)) {
925 return ReportFailure("Invalid key or variable name.",
926 ParseErrorType::INVALID_KEY_OR_VARIABLE_NAME);
927 }
928 } else {
929 if (!ConsumeKey(&p_, &key_)) {
930 return ReportFailure("Invalid key or variable name.",
931 ParseErrorType::INVALID_KEY_OR_VARIABLE_NAME);
932 }
933 }
934
935 // If we consumed everything but expect more data, reset p_ and cancel since
936 // we can't know if the key was complete or not.
937 if (!finishing_ && p_.empty()) {
938 p_ = original;
939 return util::CancelledError("");
940 }
941 // Since we aren't using the key storage, clear it out.
942 key_storage_.clear();
943 return util::Status();
944 }
945
GetNextTokenType()946 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
947 SkipWhitespace();
948
949 int size = p_.size();
950 if (size == 0) {
951 // If we ran out of data, report unknown and we'll place the previous parse
952 // type onto the stack and try again when we have more data.
953 return UNKNOWN;
954 }
955 // TODO(sven): Split this method based on context since different contexts
956 // support different tokens. Would slightly speed up processing?
957 const char* data = p_.data();
958 StringPiece data_view = StringPiece(data, size);
959 if (*data == '\"' || *data == '\'') return BEGIN_STRING;
960 if (*data == '-' || ('0' <= *data && *data <= '9')) {
961 return BEGIN_NUMBER;
962 }
963 if (size >= kKeywordTrue.length() &&
964 HasPrefixString(data_view, kKeywordTrue)) {
965 return BEGIN_TRUE;
966 }
967 if (size >= kKeywordFalse.length() &&
968 HasPrefixString(data_view, kKeywordFalse)) {
969 return BEGIN_FALSE;
970 }
971 if (size >= kKeywordNull.length() &&
972 HasPrefixString(data_view, kKeywordNull)) {
973 return BEGIN_NULL;
974 }
975 if (*data == '{') return BEGIN_OBJECT;
976 if (*data == '}') return END_OBJECT;
977 if (*data == '[') return BEGIN_ARRAY;
978 if (*data == ']') return END_ARRAY;
979 if (*data == ':') return ENTRY_SEPARATOR;
980 if (*data == ',') return VALUE_SEPARATOR;
981 if (MatchKey(p_)) {
982 return BEGIN_KEY;
983 }
984
985 // We don't know that we necessarily have an invalid token here, just that we
986 // can't parse what we have so far. So we don't report an error and just
987 // return UNKNOWN so we can try again later when we have more data, or if we
988 // finish and we have leftovers.
989 return UNKNOWN;
990 }
991
992 } // namespace converter
993 } // namespace util
994 } // namespace protobuf
995 } // namespace google
996