xref: /aosp_15_r20/external/pdfium/core/fxcrt/xml/cfx_xmlparser.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8 
9 #include <stdint.h>
10 
11 #include <algorithm>
12 #include <iterator>
13 #include <stack>
14 #include <utility>
15 
16 #include "core/fxcrt/autorestorer.h"
17 #include "core/fxcrt/cfx_seekablestreamproxy.h"
18 #include "core/fxcrt/data_vector.h"
19 #include "core/fxcrt/fx_codepage.h"
20 #include "core/fxcrt/fx_extension.h"
21 #include "core/fxcrt/fx_safe_types.h"
22 #include "core/fxcrt/xml/cfx_xmlchardata.h"
23 #include "core/fxcrt/xml/cfx_xmldocument.h"
24 #include "core/fxcrt/xml/cfx_xmlelement.h"
25 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
26 #include "core/fxcrt/xml/cfx_xmlnode.h"
27 #include "core/fxcrt/xml/cfx_xmltext.h"
28 #include "third_party/base/check.h"
29 #include "third_party/base/notreached.h"
30 
31 namespace {
32 
// Capacity reserved up front for the scratch buffer that accumulates the
// characters of the token currently being parsed.
constexpr size_t kCurrentTextReserve{128};

// Largest value accepted for a numeric character reference (&#...;); larger
// values are replaced by a space.
constexpr uint32_t kMaxCharRange{0x10FFFF};
35 
// Returns true when |ch| is one of the four characters this parser treats as
// XML white space: space, tab, line feed or carriage return.
bool IsXMLWhiteSpace(wchar_t ch) {
  switch (ch) {
    case L' ':
    case 0x09:
    case 0x0A:
    case 0x0D:
      return true;
    default:
      return false;
  }
}
39 
// One inclusive range of character values that may appear in an XML name.
struct FX_XMLNAMECHAR {
  // First value covered by the range (inclusive).
  uint16_t wStart;
  // Last value covered by the range (inclusive).
  uint16_t wEnd;
  // True when characters in this range may also begin a name; false when they
  // are only allowed after the first character.
  bool bStartChar;
};
45 
46 constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
47     {L'-', L'.', false},    {L'0', L'9', false},     {L':', L':', false},
48     {L'A', L'Z', true},     {L'_', L'_', true},      {L'a', L'z', true},
49     {0xB7, 0xB7, false},    {0xC0, 0xD6, true},      {0xD8, 0xF6, true},
50     {0xF8, 0x02FF, true},   {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
51     {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true},  {0x203F, 0x2040, false},
52     {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true},  {0x3001, 0xD7FF, true},
53     {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
54 };
55 
56 }  // namespace
57 
58 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)59 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
60   auto* it = std::lower_bound(
61       std::begin(kXMLNameChars), std::end(kXMLNameChars), ch,
62       [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
63   return it != std::end(kXMLNameChars) && ch >= it->wStart &&
64          (!bFirstChar || it->bStartChar);
65 }
66 
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)67 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
68   DCHECK(pStream);
69 
70   auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
71   FX_CodePage wCodePage = proxy->GetCodePage();
72   if (wCodePage != FX_CodePage::kUTF16LE &&
73       wCodePage != FX_CodePage::kUTF16BE && wCodePage != FX_CodePage::kUTF8) {
74     proxy->SetCodePage(FX_CodePage::kUTF8);
75   }
76   stream_ = proxy;
77 
78   xml_plane_size_ = std::min(
79       xml_plane_size_, pdfium::base::checked_cast<size_t>(stream_->GetSize()));
80 
81   current_text_.reserve(kCurrentTextReserve);
82 }
83 
84 CFX_XMLParser::~CFX_XMLParser() = default;
85 
Parse()86 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
87   auto doc = std::make_unique<CFX_XMLDocument>();
88   AutoRestorer<UnownedPtr<CFX_XMLNode>> restorer(&current_node_);
89   current_node_ = doc->GetRoot();
90   return DoSyntaxParse(doc.get()) ? std::move(doc) : nullptr;
91 }
92 
DoSyntaxParse(CFX_XMLDocument * doc)93 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
94   if (xml_plane_size_ <= 0)
95     return false;
96 
97   FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
98   alloc_size_safe += 1;  // For NUL.
99   if (!alloc_size_safe.IsValid())
100     return false;
101 
102   size_t current_buffer_idx = 0;
103   size_t buffer_size = 0;
104 
105   DataVector<wchar_t> buffer;
106   buffer.resize(alloc_size_safe.ValueOrDie());
107 
108   std::stack<wchar_t> character_to_skip_too_stack;
109   std::stack<CFX_XMLNode::Type> node_type_stack;
110   WideString current_attribute_name;
111   FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
112   wchar_t current_quote_character = 0;
113   wchar_t current_character_to_skip_to = 0;
114 
115   while (true) {
116     if (current_buffer_idx >= buffer_size) {
117       if (stream_->IsEOF())
118         return true;
119 
120       size_t buffer_chars = stream_->ReadBlock(buffer.data(), xml_plane_size_);
121       if (buffer_chars == 0)
122         return true;
123 
124       current_buffer_idx = 0;
125       buffer_size = buffer_chars;
126     }
127 
128     while (current_buffer_idx < buffer_size) {
129       wchar_t ch = buffer[current_buffer_idx];
130       switch (current_parser_state) {
131         case FDE_XmlSyntaxState::Text:
132           if (ch == L'<') {
133             if (!current_text_.empty()) {
134               current_node_->AppendLastChild(
135                   doc->CreateNode<CFX_XMLText>(GetTextData()));
136             } else {
137               current_buffer_idx++;
138               current_parser_state = FDE_XmlSyntaxState::Node;
139             }
140           } else {
141             // Fail if there is text outside of the root element, ignore
142             // whitespace/null.
143             if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
144               return false;
145             ProcessTextChar(ch);
146             current_buffer_idx++;
147           }
148           break;
149         case FDE_XmlSyntaxState::Node:
150           if (ch == L'!') {
151             current_buffer_idx++;
152             current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
153           } else if (ch == L'/') {
154             current_buffer_idx++;
155             current_parser_state = FDE_XmlSyntaxState::CloseElement;
156           } else if (ch == L'?') {
157             node_type_stack.push(CFX_XMLNode::Type::kInstruction);
158             current_buffer_idx++;
159             current_parser_state = FDE_XmlSyntaxState::Target;
160           } else {
161             node_type_stack.push(CFX_XMLNode::Type::kElement);
162             current_parser_state = FDE_XmlSyntaxState::Tag;
163           }
164           break;
165         case FDE_XmlSyntaxState::Target:
166           if (!IsXMLNameChar(ch, current_text_.empty())) {
167             if (current_text_.empty())
168               return false;
169 
170             current_parser_state = FDE_XmlSyntaxState::TargetData;
171 
172             WideString target_name = GetTextData();
173             if (target_name.EqualsASCII("originalXFAVersion") ||
174                 target_name.EqualsASCII("acrobat")) {
175               auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
176               current_node_->AppendLastChild(node);
177               current_node_ = node;
178             }
179           } else {
180             current_text_.push_back(ch);
181             current_buffer_idx++;
182           }
183           break;
184         case FDE_XmlSyntaxState::Tag:
185           if (!IsXMLNameChar(ch, current_text_.empty())) {
186             if (current_text_.empty())
187               return false;
188 
189             current_parser_state = FDE_XmlSyntaxState::AttriName;
190 
191             auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
192             current_node_->AppendLastChild(child);
193             current_node_ = child;
194           } else {
195             current_text_.push_back(ch);
196             current_buffer_idx++;
197           }
198           break;
199         case FDE_XmlSyntaxState::AttriName:
200           if (current_text_.empty() && IsXMLWhiteSpace(ch)) {
201             current_buffer_idx++;
202             break;
203           }
204           if (!IsXMLNameChar(ch, current_text_.empty())) {
205             if (current_text_.empty()) {
206               if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
207                 if (ch == L'>' || ch == L'/') {
208                   current_parser_state = FDE_XmlSyntaxState::BreakElement;
209                   break;
210                 }
211               } else if (node_type_stack.top() ==
212                          CFX_XMLNode::Type::kInstruction) {
213                 if (ch == L'?') {
214                   current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
215                   current_buffer_idx++;
216                 } else {
217                   current_parser_state = FDE_XmlSyntaxState::TargetData;
218                 }
219                 break;
220               }
221               return false;
222             } else {
223               if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
224                 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
225                   current_parser_state = FDE_XmlSyntaxState::TargetData;
226                   break;
227                 }
228               }
229               current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
230               current_attribute_name = GetTextData();
231             }
232           } else {
233             current_text_.push_back(ch);
234             current_buffer_idx++;
235           }
236           break;
237         case FDE_XmlSyntaxState::AttriEqualSign:
238           if (IsXMLWhiteSpace(ch)) {
239             current_buffer_idx++;
240             break;
241           }
242           if (ch != L'=') {
243             if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
244               current_parser_state = FDE_XmlSyntaxState::TargetData;
245               break;
246             }
247             return false;
248           } else {
249             current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
250             current_buffer_idx++;
251           }
252           break;
253         case FDE_XmlSyntaxState::AttriQuotation:
254           if (IsXMLWhiteSpace(ch)) {
255             current_buffer_idx++;
256             break;
257           }
258           if (ch != L'\"' && ch != L'\'') {
259             return false;
260           }
261 
262           current_quote_character = ch;
263           current_parser_state = FDE_XmlSyntaxState::AttriValue;
264           current_buffer_idx++;
265           break;
266         case FDE_XmlSyntaxState::AttriValue:
267           if (ch == current_quote_character) {
268             if (entity_start_.has_value())
269               return false;
270 
271             current_quote_character = 0;
272             current_buffer_idx++;
273             current_parser_state = FDE_XmlSyntaxState::AttriName;
274 
275             CFX_XMLElement* elem = ToXMLElement(current_node_);
276             if (elem)
277               elem->SetAttribute(current_attribute_name, GetTextData());
278 
279             current_attribute_name.clear();
280           } else {
281             ProcessTextChar(ch);
282             current_buffer_idx++;
283           }
284           break;
285         case FDE_XmlSyntaxState::CloseInstruction:
286           if (ch != L'>') {
287             current_text_.push_back(ch);
288             current_parser_state = FDE_XmlSyntaxState::TargetData;
289           } else if (!current_text_.empty()) {
290             ProcessTargetData();
291           } else {
292             current_buffer_idx++;
293             if (node_type_stack.empty())
294               return false;
295 
296             node_type_stack.pop();
297             current_parser_state = FDE_XmlSyntaxState::Text;
298 
299             if (current_node_ &&
300                 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
301               current_node_ = current_node_->GetParent();
302           }
303           break;
304         case FDE_XmlSyntaxState::BreakElement:
305           if (ch == L'>') {
306             current_parser_state = FDE_XmlSyntaxState::Text;
307           } else if (ch == L'/') {
308             current_parser_state = FDE_XmlSyntaxState::CloseElement;
309           } else {
310             return false;
311           }
312           current_buffer_idx++;
313           break;
314         case FDE_XmlSyntaxState::CloseElement:
315           if (!IsXMLNameChar(ch, current_text_.empty())) {
316             if (ch == L'>') {
317               if (node_type_stack.empty())
318                 return false;
319 
320               node_type_stack.pop();
321               current_parser_state = FDE_XmlSyntaxState::Text;
322 
323               CFX_XMLElement* element = ToXMLElement(current_node_);
324               if (!element)
325                 return false;
326 
327               WideString element_name = GetTextData();
328               if (element_name.GetLength() > 0 &&
329                   element_name != element->GetName()) {
330                 return false;
331               }
332 
333               current_node_ = current_node_->GetParent();
334             } else if (!IsXMLWhiteSpace(ch)) {
335               return false;
336             }
337           } else {
338             current_text_.push_back(ch);
339           }
340           current_buffer_idx++;
341           break;
342         case FDE_XmlSyntaxState::SkipCommentOrDecl: {
343           auto current_span =
344               pdfium::make_span(buffer).subspan(current_buffer_idx);
345           if (FXSYS_wcsnicmp(current_span.data(), L"--", 2) == 0) {
346             current_buffer_idx += 2;
347             current_parser_state = FDE_XmlSyntaxState::SkipComment;
348           } else if (FXSYS_wcsnicmp(current_span.data(), L"[CDATA[", 7) == 0) {
349             current_buffer_idx += 7;
350             current_parser_state = FDE_XmlSyntaxState::SkipCData;
351           } else {
352             current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
353             current_character_to_skip_to = L'>';
354             character_to_skip_too_stack.push(L'>');
355           }
356           break;
357         }
358         case FDE_XmlSyntaxState::SkipCData: {
359           auto current_span =
360               pdfium::make_span(buffer).subspan(current_buffer_idx);
361           if (FXSYS_wcsnicmp(current_span.data(), L"]]>", 3) == 0) {
362             current_buffer_idx += 3;
363             current_parser_state = FDE_XmlSyntaxState::Text;
364             current_node_->AppendLastChild(
365                 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
366           } else {
367             current_text_.push_back(ch);
368             current_buffer_idx++;
369           }
370           break;
371         }
372         case FDE_XmlSyntaxState::SkipDeclNode:
373           if (current_character_to_skip_to == L'\'' ||
374               current_character_to_skip_to == L'\"') {
375             current_buffer_idx++;
376             if (ch != current_character_to_skip_to)
377               break;
378 
379             character_to_skip_too_stack.pop();
380             if (character_to_skip_too_stack.empty())
381               current_parser_state = FDE_XmlSyntaxState::Text;
382             else
383               current_character_to_skip_to = character_to_skip_too_stack.top();
384           } else {
385             switch (ch) {
386               case L'<':
387                 current_character_to_skip_to = L'>';
388                 character_to_skip_too_stack.push(L'>');
389                 break;
390               case L'[':
391                 current_character_to_skip_to = L']';
392                 character_to_skip_too_stack.push(L']');
393                 break;
394               case L'(':
395                 current_character_to_skip_to = L')';
396                 character_to_skip_too_stack.push(L')');
397                 break;
398               case L'\'':
399                 current_character_to_skip_to = L'\'';
400                 character_to_skip_too_stack.push(L'\'');
401                 break;
402               case L'\"':
403                 current_character_to_skip_to = L'\"';
404                 character_to_skip_too_stack.push(L'\"');
405                 break;
406               default:
407                 if (ch == current_character_to_skip_to) {
408                   character_to_skip_too_stack.pop();
409                   if (character_to_skip_too_stack.empty()) {
410                     current_parser_state = FDE_XmlSyntaxState::Text;
411                   } else {
412                     current_character_to_skip_to =
413                         character_to_skip_too_stack.top();
414                   }
415                 }
416                 break;
417             }
418             current_buffer_idx++;
419           }
420           break;
421         case FDE_XmlSyntaxState::SkipComment: {
422           auto current_span =
423               pdfium::make_span(buffer).subspan(current_buffer_idx);
424           if (FXSYS_wcsnicmp(current_span.data(), L"-->", 3) == 0) {
425             current_buffer_idx += 2;
426             current_parser_state = FDE_XmlSyntaxState::Text;
427           }
428           current_buffer_idx++;
429           break;
430         }
431         case FDE_XmlSyntaxState::TargetData:
432           if (IsXMLWhiteSpace(ch)) {
433             if (current_text_.empty()) {
434               current_buffer_idx++;
435               break;
436             }
437             if (current_quote_character == 0) {
438               current_buffer_idx++;
439               ProcessTargetData();
440               break;
441             }
442           }
443           if (ch == '?') {
444             current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
445             current_buffer_idx++;
446           } else if (ch == '\"') {
447             if (current_quote_character == 0) {
448               current_quote_character = ch;
449               current_buffer_idx++;
450             } else if (ch == current_quote_character) {
451               current_quote_character = 0;
452               current_buffer_idx++;
453               ProcessTargetData();
454             } else {
455               return false;
456             }
457           } else {
458             current_text_.push_back(ch);
459             current_buffer_idx++;
460           }
461           break;
462       }
463     }
464   }
465 
466   NOTREACHED();
467   return false;
468 }
469 
ProcessTextChar(wchar_t character)470 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
471   current_text_.push_back(character);
472 
473   if (entity_start_.has_value() && character == L';') {
474     // Copy the entity out into a string and remove from the vector. When we
475     // copy the entity we don't want to copy out the & or the ; so we start
476     // shifted by one and want to copy 2 less characters in total.
477     WideString csEntity(current_text_.data() + entity_start_.value() + 1,
478                         current_text_.size() - entity_start_.value() - 2);
479     current_text_.erase(current_text_.begin() + entity_start_.value(),
480                         current_text_.end());
481 
482     size_t iLen = csEntity.GetLength();
483     if (iLen > 0) {
484       if (csEntity[0] == L'#') {
485         uint32_t ch = 0;
486         if (iLen > 1 && csEntity[1] == L'x') {
487           for (size_t i = 2; i < iLen; i++) {
488             if (!FXSYS_IsHexDigit(csEntity[i]))
489               break;
490             ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
491           }
492         } else {
493           for (size_t i = 1; i < iLen; i++) {
494             if (!FXSYS_IsDecimalDigit(csEntity[i]))
495               break;
496             ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
497           }
498         }
499         if (ch > kMaxCharRange)
500           ch = ' ';
501 
502         character = static_cast<wchar_t>(ch);
503         if (character != 0)
504           current_text_.push_back(character);
505       } else {
506         if (csEntity == L"amp") {
507           current_text_.push_back(L'&');
508         } else if (csEntity == L"lt") {
509           current_text_.push_back(L'<');
510         } else if (csEntity == L"gt") {
511           current_text_.push_back(L'>');
512         } else if (csEntity == L"apos") {
513           current_text_.push_back(L'\'');
514         } else if (csEntity == L"quot") {
515           current_text_.push_back(L'"');
516         }
517       }
518     }
519     entity_start_ = absl::nullopt;
520   } else if (!entity_start_.has_value() && character == L'&') {
521     entity_start_ = current_text_.size() - 1;
522   }
523 }
524 
ProcessTargetData()525 void CFX_XMLParser::ProcessTargetData() {
526   WideString target_data = GetTextData();
527   if (target_data.IsEmpty())
528     return;
529 
530   CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
531   if (instruction)
532     instruction->AppendData(target_data);
533 }
534 
GetTextData()535 WideString CFX_XMLParser::GetTextData() {
536   WideString ret(current_text_.data(), current_text_.size());
537   entity_start_ = absl::nullopt;
538   current_text_.clear();
539   current_text_.reserve(kCurrentTextReserve);
540   return ret;
541 }
542