xref: /aosp_15_r20/external/pdfium/core/fpdfapi/parser/cpdf_parser.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/parser/cpdf_parser.h"
8 
9 #include <ctype.h>
10 #include <stdint.h>
11 
12 #include <algorithm>
13 #include <utility>
14 #include <vector>
15 
16 #include "core/fpdfapi/parser/cpdf_array.h"
17 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
18 #include "core/fpdfapi/parser/cpdf_dictionary.h"
19 #include "core/fpdfapi/parser/cpdf_document.h"
20 #include "core/fpdfapi/parser/cpdf_linearized_header.h"
21 #include "core/fpdfapi/parser/cpdf_number.h"
22 #include "core/fpdfapi/parser/cpdf_object_stream.h"
23 #include "core/fpdfapi/parser/cpdf_read_validator.h"
24 #include "core/fpdfapi/parser/cpdf_reference.h"
25 #include "core/fpdfapi/parser/cpdf_security_handler.h"
26 #include "core/fpdfapi/parser/cpdf_stream.h"
27 #include "core/fpdfapi/parser/cpdf_stream_acc.h"
28 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
29 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
30 #include "core/fxcrt/autorestorer.h"
31 #include "core/fxcrt/data_vector.h"
32 #include "core/fxcrt/fx_extension.h"
33 #include "core/fxcrt/fx_safe_types.h"
34 #include "core/fxcrt/scoped_set_insertion.h"
35 #include "third_party/base/check.h"
36 #include "third_party/base/check_op.h"
37 #include "third_party/base/containers/contains.h"
38 #include "third_party/base/containers/span.h"
39 #include "third_party/base/notreached.h"
40 
41 namespace {
42 
43 // Upper bound on the size of the xref table. Theoretical limits are higher,
44 // but this should be large enough in practice. The max size must always be 1
45 // more than the max object number.
46 constexpr int32_t kMaxXRefSize = CPDF_Parser::kMaxObjectNumber + 1;
47 
48 // Byte length of a PDF header line such as "%PDF-1.7\n".
49 constexpr FX_FILESIZE kPDFHeaderSize = 9;
50 
51 // The minimum number of fields required in the /W array of a cross-reference
52 // stream dictionary.
53 constexpr size_t kMinFieldCount = 3;
54 
55 // V4 trailers are inline, so 0 is passed as their trailer object number.
56 constexpr uint32_t kNoV4TrailerObjectNumber = 0;
57 
// One (start object number, entry count) pair, as produced by
// GetCrossRefV5Indices() from the /Index array of a cross-reference stream.
58 struct CrossRefV5IndexEntry {
59   uint32_t start_obj_num;  // Object number of the first entry in the run.
60   uint32_t obj_count;      // Number of consecutive entries in the run.
61 };
62 
GetObjectTypeFromCrossRefStreamType(uint32_t cross_ref_stream_type)63 CPDF_Parser::ObjectType GetObjectTypeFromCrossRefStreamType(
64     uint32_t cross_ref_stream_type) {
65   switch (cross_ref_stream_type) {
66     case 0:
67       return CPDF_Parser::ObjectType::kFree;
68     case 1:
69       return CPDF_Parser::ObjectType::kNotCompressed;
70     case 2:
71       return CPDF_Parser::ObjectType::kCompressed;
72     default:
73       return CPDF_Parser::ObjectType::kNull;
74   }
75 }
76 
77 // Use the Get*XRefStreamEntry() functions below, instead of calling this
78 // directly.
GetVarInt(pdfium::span<const uint8_t> input)79 uint32_t GetVarInt(pdfium::span<const uint8_t> input) {
80   uint32_t result = 0;
81   for (uint8_t c : input)
82     result = result * 256 + c;
83   return result;
84 }
85 
86 // The following 3 functions retrieve variable length entries from
87 // cross-reference streams, as described in ISO 32000-1:2008 table 18. There are
88 // only 3 fields for any given entry.
GetFirstXRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths)89 uint32_t GetFirstXRefStreamEntry(pdfium::span<const uint8_t> entry_span,
90                                  pdfium::span<const uint32_t> field_widths) {
91   return GetVarInt(entry_span.first(field_widths[0]));
92 }
93 
GetSecondXRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths)94 uint32_t GetSecondXRefStreamEntry(pdfium::span<const uint8_t> entry_span,
95                                   pdfium::span<const uint32_t> field_widths) {
96   return GetVarInt(entry_span.subspan(field_widths[0], field_widths[1]));
97 }
98 
GetThirdXRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths)99 uint32_t GetThirdXRefStreamEntry(pdfium::span<const uint8_t> entry_span,
100                                  pdfium::span<const uint32_t> field_widths) {
101   return GetVarInt(
102       entry_span.subspan(field_widths[0] + field_widths[1], field_widths[2]));
103 }
104 
GetCrossRefV5Indices(const CPDF_Array * array,uint32_t size)105 std::vector<CrossRefV5IndexEntry> GetCrossRefV5Indices(const CPDF_Array* array,
106                                                        uint32_t size) {
107   std::vector<CrossRefV5IndexEntry> indices;
108   if (array) {
109     for (size_t i = 0; i < array->size() / 2; i++) {
110       RetainPtr<const CPDF_Number> pStartNumObj = array->GetNumberAt(i * 2);
111       if (!pStartNumObj)
112         continue;
113 
114       RetainPtr<const CPDF_Number> pCountObj = array->GetNumberAt(i * 2 + 1);
115       if (!pCountObj)
116         continue;
117 
118       int nStartNum = pStartNumObj->GetInteger();
119       int nCount = pCountObj->GetInteger();
120       if (nStartNum < 0 || nCount <= 0)
121         continue;
122 
123       indices.push_back(
124           {static_cast<uint32_t>(nStartNum), static_cast<uint32_t>(nCount)});
125     }
126   }
127 
128   if (indices.empty())
129     indices.push_back({0, size});
130   return indices;
131 }
132 
GetFieldWidths(const CPDF_Array * array)133 std::vector<uint32_t> GetFieldWidths(const CPDF_Array* array) {
134   std::vector<uint32_t> results;
135   if (!array)
136     return results;
137 
138   CPDF_ArrayLocker locker(array);
139   for (const auto& obj : locker)
140     results.push_back(obj->GetInteger());
141   return results;
142 }
143 
144 class ObjectsHolderStub final : public CPDF_Parser::ParsedObjectsHolder {
145  public:
146   ObjectsHolderStub() = default;
147   ~ObjectsHolderStub() override = default;
TryInit()148   bool TryInit() override { return true; }
149 };
150 
151 }  // namespace
152 
CPDF_Parser(ParsedObjectsHolder * holder)153 CPDF_Parser::CPDF_Parser(ParsedObjectsHolder* holder)
154     : m_pObjectsHolder(holder),
155       m_CrossRefTable(std::make_unique<CPDF_CrossRefTable>()) {
156   if (!holder) {
157     m_pOwnedObjectsHolder = std::make_unique<ObjectsHolderStub>();
158     m_pObjectsHolder = m_pOwnedObjectsHolder.get();
159   }
160 }
161 
CPDF_Parser()162 CPDF_Parser::CPDF_Parser() : CPDF_Parser(nullptr) {}
163 
164 CPDF_Parser::~CPDF_Parser() = default;
165 
GetLastObjNum() const166 uint32_t CPDF_Parser::GetLastObjNum() const {
167   return m_CrossRefTable->objects_info().empty()
168              ? 0
169              : m_CrossRefTable->objects_info().rbegin()->first;
170 }
171 
IsValidObjectNumber(uint32_t objnum) const172 bool CPDF_Parser::IsValidObjectNumber(uint32_t objnum) const {
173   return objnum <= GetLastObjNum();
174 }
175 
GetObjectPositionOrZero(uint32_t objnum) const176 FX_FILESIZE CPDF_Parser::GetObjectPositionOrZero(uint32_t objnum) const {
177   const auto* info = m_CrossRefTable->GetObjectInfo(objnum);
178   return (info && info->type == ObjectType::kNormal) ? info->pos : 0;
179 }
180 
GetObjectType(uint32_t objnum) const181 CPDF_Parser::ObjectType CPDF_Parser::GetObjectType(uint32_t objnum) const {
182   DCHECK(IsValidObjectNumber(objnum));
183   const auto* info = m_CrossRefTable->GetObjectInfo(objnum);
184   return info ? info->type : ObjectType::kFree;
185 }
186 
IsObjectFreeOrNull(uint32_t objnum) const187 bool CPDF_Parser::IsObjectFreeOrNull(uint32_t objnum) const {
188   switch (GetObjectType(objnum)) {
189     case ObjectType::kFree:
190     case ObjectType::kNull:
191       return true;
192     case ObjectType::kNotCompressed:
193     case ObjectType::kCompressed:
194       return false;
195   }
196   NOTREACHED();
197   return false;
198 }
199 
IsObjectFree(uint32_t objnum) const200 bool CPDF_Parser::IsObjectFree(uint32_t objnum) const {
201   return GetObjectType(objnum) == ObjectType::kFree;
202 }
203 
InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator)204 bool CPDF_Parser::InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator) {
205   const absl::optional<FX_FILESIZE> header_offset = GetHeaderOffset(validator);
206   if (!header_offset.has_value())
207     return false;
208   if (validator->GetSize() < header_offset.value() + kPDFHeaderSize)
209     return false;
210 
211   m_pSyntax = std::make_unique<CPDF_SyntaxParser>(std::move(validator),
212                                                   header_offset.value());
213   return ParseFileVersion();
214 }
215 
ParseFileVersion()216 bool CPDF_Parser::ParseFileVersion() {
217   m_FileVersion = 0;
218   uint8_t ch;
219   if (!m_pSyntax->GetCharAt(5, ch))
220     return false;
221 
222   if (isdigit(ch))
223     m_FileVersion = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch)) * 10;
224 
225   if (!m_pSyntax->GetCharAt(7, ch))
226     return false;
227 
228   if (isdigit(ch))
229     m_FileVersion += FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
230   return true;
231 }
232 
StartParse(RetainPtr<IFX_SeekableReadStream> pFileAccess,const ByteString & password)233 CPDF_Parser::Error CPDF_Parser::StartParse(
234     RetainPtr<IFX_SeekableReadStream> pFileAccess,
235     const ByteString& password) {
236   if (!InitSyntaxParser(pdfium::MakeRetain<CPDF_ReadValidator>(
237           std::move(pFileAccess), nullptr)))
238     return FORMAT_ERROR;
239   SetPassword(password);
240   return StartParseInternal();
241 }
242 
StartParseInternal()243 CPDF_Parser::Error CPDF_Parser::StartParseInternal() {
244   DCHECK(!m_bHasParsed);
245   DCHECK(!m_bXRefTableRebuilt);
246   m_bHasParsed = true;
247   m_bXRefStream = false;
248 
249   m_LastXRefOffset = ParseStartXRef();
250   if (m_LastXRefOffset >= kPDFHeaderSize) {
251     if (!LoadAllCrossRefV4(m_LastXRefOffset) &&
252         !LoadAllCrossRefV5(m_LastXRefOffset)) {
253       if (!RebuildCrossRef())
254         return FORMAT_ERROR;
255 
256       m_bXRefTableRebuilt = true;
257       m_LastXRefOffset = 0;
258     }
259   } else {
260     if (!RebuildCrossRef())
261       return FORMAT_ERROR;
262 
263     m_bXRefTableRebuilt = true;
264   }
265   Error eRet = SetEncryptHandler();
266   if (eRet != SUCCESS)
267     return eRet;
268 
269   if (!GetRoot() || !m_pObjectsHolder->TryInit()) {
270     if (m_bXRefTableRebuilt)
271       return FORMAT_ERROR;
272 
273     ReleaseEncryptHandler();
274     if (!RebuildCrossRef())
275       return FORMAT_ERROR;
276 
277     eRet = SetEncryptHandler();
278     if (eRet != SUCCESS)
279       return eRet;
280 
281     m_pObjectsHolder->TryInit();
282     if (!GetRoot())
283       return FORMAT_ERROR;
284   }
285   if (GetRootObjNum() == CPDF_Object::kInvalidObjNum) {
286     ReleaseEncryptHandler();
287     if (!RebuildCrossRef() || GetRootObjNum() == CPDF_Object::kInvalidObjNum)
288       return FORMAT_ERROR;
289 
290     eRet = SetEncryptHandler();
291     if (eRet != SUCCESS)
292       return eRet;
293   }
294   if (m_pSecurityHandler && !m_pSecurityHandler->IsMetadataEncrypted()) {
295     RetainPtr<const CPDF_Reference> pMetadata =
296         ToReference(GetRoot()->GetObjectFor("Metadata"));
297     if (pMetadata)
298       m_MetadataObjnum = pMetadata->GetRefObjNum();
299   }
300   return SUCCESS;
301 }
302 
ParseStartXRef()303 FX_FILESIZE CPDF_Parser::ParseStartXRef() {
304   static constexpr char kStartXRefKeyword[] = "startxref";
305   m_pSyntax->SetPos(m_pSyntax->GetDocumentSize() - strlen(kStartXRefKeyword));
306   if (!m_pSyntax->BackwardsSearchToWord(kStartXRefKeyword, 4096))
307     return 0;
308 
309   // Skip "startxref" keyword.
310   m_pSyntax->GetKeyword();
311 
312   // Read XRef offset.
313   const CPDF_SyntaxParser::WordResult xref_offset_result =
314       m_pSyntax->GetNextWord();
315   if (!xref_offset_result.is_number || xref_offset_result.word.IsEmpty())
316     return 0;
317 
318   const FX_SAFE_FILESIZE result = FXSYS_atoi64(xref_offset_result.word.c_str());
319   if (!result.IsValid() || result.ValueOrDie() >= m_pSyntax->GetDocumentSize())
320     return 0;
321 
322   return result.ValueOrDie();
323 }
324 
SetEncryptHandler()325 CPDF_Parser::Error CPDF_Parser::SetEncryptHandler() {
326   ReleaseEncryptHandler();
327   if (!GetTrailer())
328     return FORMAT_ERROR;
329 
330   RetainPtr<const CPDF_Dictionary> pEncryptDict = GetEncryptDict();
331   if (!pEncryptDict)
332     return SUCCESS;
333 
334   if (pEncryptDict->GetNameFor("Filter") != "Standard")
335     return HANDLER_ERROR;
336 
337   auto pSecurityHandler = pdfium::MakeRetain<CPDF_SecurityHandler>();
338   if (!pSecurityHandler->OnInit(pEncryptDict, GetIDArray(), GetPassword()))
339     return PASSWORD_ERROR;
340 
341   m_pSecurityHandler = std::move(pSecurityHandler);
342   return SUCCESS;
343 }
344 
ReleaseEncryptHandler()345 void CPDF_Parser::ReleaseEncryptHandler() {
346   m_pSecurityHandler.Reset();
347 }
348 
349 // Ideally, all the cross reference entries should be verified.
350 // In reality, we rarely see well-formed cross references don't match
351 // with the objects. crbug/602650 showed a case where object numbers
352 // in the cross reference table are all off by one.
VerifyCrossRefV4()353 bool CPDF_Parser::VerifyCrossRefV4() {
354   for (const auto& it : m_CrossRefTable->objects_info()) {
355     if (it.second.pos <= 0)
356       continue;
357     // Find the first non-zero position.
358     FX_FILESIZE SavedPos = m_pSyntax->GetPos();
359     m_pSyntax->SetPos(it.second.pos);
360     CPDF_SyntaxParser::WordResult word_result = m_pSyntax->GetNextWord();
361     m_pSyntax->SetPos(SavedPos);
362     if (!word_result.is_number || word_result.word.IsEmpty() ||
363         FXSYS_atoui(word_result.word.c_str()) != it.first) {
364       // If the object number read doesn't match the one stored,
365       // something is wrong with the cross reference table.
366       return false;
367     }
368     break;
369   }
370   return true;
371 }
372 
LoadAllCrossRefV4(FX_FILESIZE xref_offset)373 bool CPDF_Parser::LoadAllCrossRefV4(FX_FILESIZE xref_offset) {
374   if (!LoadCrossRefV4(xref_offset, true))
375     return false;
376 
377   RetainPtr<CPDF_Dictionary> trailer = LoadTrailerV4();
378   if (!trailer)
379     return false;
380 
381   m_CrossRefTable->SetTrailer(std::move(trailer), kNoV4TrailerObjectNumber);
382   const int32_t xrefsize = GetTrailer()->GetDirectIntegerFor("Size");
383   if (xrefsize > 0 && xrefsize <= kMaxXRefSize)
384     m_CrossRefTable->SetObjectMapSize(xrefsize);
385 
386   FX_FILESIZE xref_stm = GetTrailer()->GetDirectIntegerFor("XRefStm");
387   std::vector<FX_FILESIZE> xref_stream_list{xref_stm};
388   std::vector<FX_FILESIZE> xref_list{xref_offset};
389   std::set<FX_FILESIZE> seen_xref_offset{xref_offset};
390 
391   // When the trailer doesn't have Prev entry or Prev entry value is not
392   // numerical, GetDirectInteger() returns 0. Loading will end.
393   xref_offset = GetTrailer()->GetDirectIntegerFor("Prev");
394   while (xref_offset > 0) {
395     // Check for circular references.
396     if (pdfium::Contains(seen_xref_offset, xref_offset))
397       return false;
398 
399     seen_xref_offset.insert(xref_offset);
400     xref_list.insert(xref_list.begin(), xref_offset);
401 
402     // SLOW ...
403     LoadCrossRefV4(xref_offset, true);
404 
405     RetainPtr<CPDF_Dictionary> pDict(LoadTrailerV4());
406     if (!pDict)
407       return false;
408 
409     xref_offset = pDict->GetDirectIntegerFor("Prev");
410     xref_stm = pDict->GetIntegerFor("XRefStm");
411     xref_stream_list.insert(xref_stream_list.begin(), xref_stm);
412 
413     // SLOW ...
414     m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
415         std::make_unique<CPDF_CrossRefTable>(std::move(pDict),
416                                              kNoV4TrailerObjectNumber),
417         std::move(m_CrossRefTable));
418   }
419 
420   for (size_t i = 0; i < xref_list.size(); ++i) {
421     if (xref_list[i] > 0 && !LoadCrossRefV4(xref_list[i], false))
422       return false;
423 
424     if (xref_stream_list[i] > 0 && !LoadCrossRefV5(&xref_stream_list[i], false))
425       return false;
426 
427     if (i == 0 && !VerifyCrossRefV4())
428       return false;
429   }
430   return true;
431 }
432 
LoadLinearizedAllCrossRefV4(FX_FILESIZE main_xref_offset)433 bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE main_xref_offset) {
434   if (!LoadCrossRefV4(main_xref_offset, false))
435     return false;
436 
437   RetainPtr<CPDF_Dictionary> main_trailer = LoadTrailerV4();
438   if (!main_trailer)
439     return false;
440 
441   // GetTrailer() currently returns the first-page trailer.
442   if (GetTrailer()->GetDirectIntegerFor("Size") == 0)
443     return false;
444 
445   // Read /XRefStm from the first-page trailer. No need to read /Prev for the
446   // first-page trailer, as the caller already did that and passed it in as
447   // |main_xref_offset|.
448   FX_FILESIZE xref_stm = GetTrailer()->GetDirectIntegerFor("XRefStm");
449   std::vector<FX_FILESIZE> xref_stream_list{xref_stm};
450   std::vector<FX_FILESIZE> xref_list{main_xref_offset};
451   std::set<FX_FILESIZE> seen_xref_offset{main_xref_offset};
452 
453   // Merge the trailers.
454   m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
455       std::make_unique<CPDF_CrossRefTable>(std::move(main_trailer),
456                                            kNoV4TrailerObjectNumber),
457       std::move(m_CrossRefTable));
458 
459   // Now GetTrailer() returns the merged trailer, where /Prev is from the
460   // main-trailer.
461   FX_FILESIZE xref_offset = GetTrailer()->GetDirectIntegerFor("Prev");
462   while (xref_offset > 0) {
463     // Check for circular references.
464     if (pdfium::Contains(seen_xref_offset, xref_offset))
465       return false;
466 
467     seen_xref_offset.insert(xref_offset);
468     xref_list.insert(xref_list.begin(), xref_offset);
469 
470     // SLOW ...
471     LoadCrossRefV4(xref_offset, true);
472 
473     RetainPtr<CPDF_Dictionary> pDict(LoadTrailerV4());
474     if (!pDict)
475       return false;
476 
477     xref_offset = pDict->GetDirectIntegerFor("Prev");
478     xref_stm = pDict->GetIntegerFor("XRefStm");
479     xref_stream_list.insert(xref_stream_list.begin(), xref_stm);
480 
481     // SLOW ...
482     m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
483         std::make_unique<CPDF_CrossRefTable>(std::move(pDict),
484                                              kNoV4TrailerObjectNumber),
485         std::move(m_CrossRefTable));
486   }
487 
488   if (xref_stream_list[0] > 0 && !LoadCrossRefV5(&xref_stream_list[0], false))
489     return false;
490 
491   for (size_t i = 1; i < xref_list.size(); ++i) {
492     if (xref_list[i] > 0 && !LoadCrossRefV4(xref_list[i], false))
493       return false;
494 
495     if (xref_stream_list[i] > 0 && !LoadCrossRefV5(&xref_stream_list[i], false))
496       return false;
497   }
498   return true;
499 }
500 
ParseAndAppendCrossRefSubsectionData(uint32_t start_objnum,uint32_t count,std::vector<CrossRefObjData> * out_objects)501 bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData(
502     uint32_t start_objnum,
503     uint32_t count,
504     std::vector<CrossRefObjData>* out_objects) {
505   if (!count)
506     return true;
507 
508   // Each entry shall be exactly 20 byte.
509   // A sample entry looks like:
510   // "0000000000 00007 f\r\n"
511   static constexpr int32_t kEntrySize = 20;
512 
513   if (!out_objects) {
514     FX_SAFE_FILESIZE pos = count;
515     pos *= kEntrySize;
516     pos += m_pSyntax->GetPos();
517     if (!pos.IsValid())
518       return false;
519     m_pSyntax->SetPos(pos.ValueOrDie());
520     return true;
521   }
522   const size_t start_obj_index = out_objects->size();
523   FX_SAFE_SIZE_T new_size = start_obj_index;
524   new_size += count;
525   if (!new_size.IsValid())
526     return false;
527 
528   if (new_size.ValueOrDie() > kMaxXRefSize)
529     return false;
530 
531   const size_t max_entries_in_file = m_pSyntax->GetDocumentSize() / kEntrySize;
532   if (new_size.ValueOrDie() > max_entries_in_file)
533     return false;
534 
535   out_objects->resize(new_size.ValueOrDie());
536 
537   DataVector<char> buf(1024 * kEntrySize + 1);
538   buf.back() = '\0';
539 
540   uint32_t entries_to_read = count;
541   while (entries_to_read > 0) {
542     const uint32_t entries_in_block = std::min(entries_to_read, 1024u);
543     const uint32_t bytes_to_read = entries_in_block * kEntrySize;
544     auto block_span = pdfium::make_span(buf).first(bytes_to_read);
545     if (!m_pSyntax->ReadBlock(pdfium::as_writable_bytes(block_span)))
546       return false;
547 
548     for (uint32_t i = 0; i < entries_in_block; i++) {
549       uint32_t iObjectIndex = count - entries_to_read + i;
550       CrossRefObjData& obj_data =
551           (*out_objects)[start_obj_index + iObjectIndex];
552       const uint32_t objnum = start_objnum + iObjectIndex;
553       obj_data.obj_num = objnum;
554       ObjectInfo& info = obj_data.info;
555 
556       const char* pEntry = &buf[i * kEntrySize];
557       if (pEntry[17] == 'f') {
558         info.pos = 0;
559         info.type = ObjectType::kFree;
560       } else {
561         const FX_SAFE_FILESIZE offset = FXSYS_atoi64(pEntry);
562         if (!offset.IsValid())
563           return false;
564 
565         if (offset.ValueOrDie() == 0) {
566           for (int32_t c = 0; c < 10; c++) {
567             if (!isdigit(pEntry[c]))
568               return false;
569           }
570         }
571 
572         info.pos = offset.ValueOrDie();
573 
574         // TODO(art-snake): The info.gennum is uint16_t, but version may be
575         // greated than max<uint16_t>. Needs solve this issue.
576         const int32_t version = FXSYS_atoi(pEntry + 11);
577         info.gennum = version;
578         info.type = ObjectType::kNotCompressed;
579       }
580     }
581     entries_to_read -= entries_in_block;
582   }
583   return true;
584 }
585 
ParseCrossRefV4(std::vector<CrossRefObjData> * out_objects)586 bool CPDF_Parser::ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects) {
587   if (out_objects)
588     out_objects->clear();
589 
590   if (m_pSyntax->GetKeyword() != "xref")
591     return false;
592   std::vector<CrossRefObjData> result_objects;
593   while (true) {
594     FX_FILESIZE saved_pos = m_pSyntax->GetPos();
595     CPDF_SyntaxParser::WordResult word_result = m_pSyntax->GetNextWord();
596     const ByteString& word = word_result.word;
597     if (word.IsEmpty())
598       return false;
599 
600     if (!word_result.is_number) {
601       m_pSyntax->SetPos(saved_pos);
602       break;
603     }
604 
605     uint32_t start_objnum = FXSYS_atoui(word.c_str());
606     if (start_objnum >= kMaxObjectNumber)
607       return false;
608 
609     uint32_t count = m_pSyntax->GetDirectNum();
610     m_pSyntax->ToNextWord();
611 
612     if (!ParseAndAppendCrossRefSubsectionData(
613             start_objnum, count, out_objects ? &result_objects : nullptr)) {
614       return false;
615     }
616   }
617   if (out_objects)
618     *out_objects = std::move(result_objects);
619   return true;
620 }
621 
LoadCrossRefV4(FX_FILESIZE pos,bool bSkip)622 bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos, bool bSkip) {
623   m_pSyntax->SetPos(pos);
624   std::vector<CrossRefObjData> objects;
625   if (!ParseCrossRefV4(bSkip ? nullptr : &objects))
626     return false;
627 
628   MergeCrossRefObjectsData(objects);
629   return true;
630 }
631 
MergeCrossRefObjectsData(const std::vector<CrossRefObjData> & objects)632 void CPDF_Parser::MergeCrossRefObjectsData(
633     const std::vector<CrossRefObjData>& objects) {
634   for (const auto& obj : objects) {
635     switch (obj.info.type) {
636       case ObjectType::kFree:
637         if (obj.info.gennum > 0)
638           m_CrossRefTable->SetFree(obj.obj_num);
639         break;
640       case ObjectType::kNormal:
641       case ObjectType::kObjStream:
642         m_CrossRefTable->AddNormal(obj.obj_num, obj.info.gennum, obj.info.pos);
643         break;
644       case ObjectType::kCompressed:
645         m_CrossRefTable->AddCompressed(obj.obj_num, obj.info.archive.obj_num,
646                                        obj.info.archive.obj_index);
647         break;
648     }
649   }
650 }
651 
LoadAllCrossRefV5(FX_FILESIZE xref_offset)652 bool CPDF_Parser::LoadAllCrossRefV5(FX_FILESIZE xref_offset) {
653   if (!LoadCrossRefV5(&xref_offset, true))
654     return false;
655 
656   std::set<FX_FILESIZE> seen_xref_offset;
657   while (xref_offset > 0) {
658     seen_xref_offset.insert(xref_offset);
659     if (!LoadCrossRefV5(&xref_offset, false))
660       return false;
661 
662     // Check for circular references.
663     if (pdfium::Contains(seen_xref_offset, xref_offset))
664       return false;
665   }
666   m_ObjectStreamMap.clear();
667   m_bXRefStream = true;
668   return true;
669 }
670 
RebuildCrossRef()671 bool CPDF_Parser::RebuildCrossRef() {
672   auto cross_ref_table = std::make_unique<CPDF_CrossRefTable>();
673 
674   const uint32_t kBufferSize = 4096;
675   m_pSyntax->SetReadBufferSize(kBufferSize);
676   m_pSyntax->SetPos(0);
677 
678   std::vector<std::pair<uint32_t, FX_FILESIZE>> numbers;
679   for (CPDF_SyntaxParser::WordResult result = m_pSyntax->GetNextWord();
680        !result.word.IsEmpty(); result = m_pSyntax->GetNextWord()) {
681     const ByteString& word = result.word;
682     if (result.is_number) {
683       numbers.emplace_back(FXSYS_atoui(word.c_str()),
684                            m_pSyntax->GetPos() - word.GetLength());
685       if (numbers.size() > 2u)
686         numbers.erase(numbers.begin());
687       continue;
688     }
689 
690     if (word == "(") {
691       m_pSyntax->ReadString();
692     } else if (word == "<") {
693       m_pSyntax->ReadHexString();
694     } else if (word == "trailer") {
695       RetainPtr<CPDF_Object> pTrailer = m_pSyntax->GetObjectBody(nullptr);
696       if (pTrailer) {
697         CPDF_Stream* stream_trailer = pTrailer->AsMutableStream();
698         // Grab the object number from `pTrailer` before potentially calling
699         // std::move(pTrailer) below.
700         const uint32_t trailer_object_number = pTrailer->GetObjNum();
701         RetainPtr<CPDF_Dictionary> trailer_dict =
702             stream_trailer ? stream_trailer->GetMutableDict()
703                            : ToDictionary(std::move(pTrailer));
704         cross_ref_table = CPDF_CrossRefTable::MergeUp(
705             std::move(cross_ref_table),
706             std::make_unique<CPDF_CrossRefTable>(std::move(trailer_dict),
707                                                  trailer_object_number));
708       }
709     } else if (word == "obj" && numbers.size() == 2u) {
710       const FX_FILESIZE obj_pos = numbers[0].second;
711       const uint32_t obj_num = numbers[0].first;
712       const uint32_t gen_num = numbers[1].first;
713 
714       m_pSyntax->SetPos(obj_pos);
715       const RetainPtr<CPDF_Stream> pStream =
716           ToStream(m_pSyntax->GetIndirectObject(
717               nullptr, CPDF_SyntaxParser::ParseType::kStrict));
718 
719       if (pStream && pStream->GetDict()->GetNameFor("Type") == "XRef") {
720         cross_ref_table = CPDF_CrossRefTable::MergeUp(
721             std::move(cross_ref_table),
722             std::make_unique<CPDF_CrossRefTable>(
723                 ToDictionary(pStream->GetDict()->Clone()),
724                 pStream->GetObjNum()));
725       }
726 
727       if (obj_num < kMaxObjectNumber) {
728         cross_ref_table->AddNormal(obj_num, gen_num, obj_pos);
729         const auto object_stream =
730             CPDF_ObjectStream::Create(std::move(pStream));
731         if (object_stream) {
732           const auto& object_info = object_stream->object_info();
733           for (size_t i = 0; i < object_info.size(); ++i) {
734             const auto& info = object_info[i];
735             if (info.obj_num < kMaxObjectNumber)
736               cross_ref_table->AddCompressed(info.obj_num, obj_num, i);
737           }
738         }
739       }
740     }
741     numbers.clear();
742   }
743 
744   m_CrossRefTable = CPDF_CrossRefTable::MergeUp(std::move(m_CrossRefTable),
745                                                 std::move(cross_ref_table));
746   // Resore default buffer size.
747   m_pSyntax->SetReadBufferSize(CPDF_Stream::kFileBufSize);
748 
749   return GetTrailer() && !m_CrossRefTable->objects_info().empty();
750 }
751 
LoadCrossRefV5(FX_FILESIZE * pos,bool bMainXRef)752 bool CPDF_Parser::LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef) {
753   RetainPtr<CPDF_Object> pObject(ParseIndirectObjectAt(*pos, 0));
754   if (!pObject || !pObject->GetObjNum())
755     return false;
756 
757   RetainPtr<const CPDF_Stream> pStream(pObject->AsStream());
758   if (!pStream)
759     return false;
760 
761   RetainPtr<const CPDF_Dictionary> pDict = pStream->GetDict();
762   int32_t prev = pDict->GetIntegerFor("Prev");
763   if (prev < 0)
764     return false;
765 
766   int32_t size = pDict->GetIntegerFor("Size");
767   if (size < 0)
768     return false;
769 
770   *pos = prev;
771 
772   RetainPtr<CPDF_Dictionary> pNewTrailer = ToDictionary(pDict->Clone());
773   if (bMainXRef) {
774     m_CrossRefTable = std::make_unique<CPDF_CrossRefTable>(
775         std::move(pNewTrailer), pStream->GetObjNum());
776     m_CrossRefTable->SetObjectMapSize(size);
777   } else {
778     m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
779         std::make_unique<CPDF_CrossRefTable>(std::move(pNewTrailer),
780                                              pStream->GetObjNum()),
781         std::move(m_CrossRefTable));
782   }
783 
784   std::vector<CrossRefV5IndexEntry> indices =
785       GetCrossRefV5Indices(pDict->GetArrayFor("Index").Get(), size);
786 
787   std::vector<uint32_t> field_widths =
788       GetFieldWidths(pDict->GetArrayFor("W").Get());
789   if (field_widths.size() < kMinFieldCount)
790     return false;
791 
792   FX_SAFE_UINT32 dwAccWidth;
793   for (uint32_t width : field_widths)
794     dwAccWidth += width;
795   if (!dwAccWidth.IsValid())
796     return false;
797 
798   uint32_t total_width = dwAccWidth.ValueOrDie();
799   auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
800   pAcc->LoadAllDataFiltered();
801 
802   pdfium::span<const uint8_t> data_span = pAcc->GetSpan();
803   uint32_t segindex = 0;
804   for (const auto& index : indices) {
805     FX_SAFE_UINT32 seg_end = segindex;
806     seg_end += index.obj_count;
807     seg_end *= total_width;
808     if (!seg_end.IsValid() || seg_end.ValueOrDie() > data_span.size())
809       continue;
810 
811     pdfium::span<const uint8_t> seg_span = data_span.subspan(
812         segindex * total_width, index.obj_count * total_width);
813     FX_SAFE_UINT32 safe_new_size = index.start_obj_num;
814     safe_new_size += index.obj_count;
815     if (!safe_new_size.IsValid()) {
816       continue;
817     }
818 
819     // Until SetObjectMapSize() below has been called by a prior loop iteration,
820     // `current_size` is based on the /Size value parsed in LoadCrossRefV5().
821     // PDFs may not always have the correct /Size. In this case, other PDF
822     // implementations ignore the incorrect size, and PDFium also ignores
823     // incorrect size in trailers for V4 xrefs.
824     const uint32_t current_size =
825         m_CrossRefTable->objects_info().empty() ? 0 : GetLastObjNum() + 1;
826     // So allow `new_size` to be greater than `current_size`, but avoid going
827     // over `kMaxXRefSize`. This works just fine because the loop below checks
828     // against `kMaxObjectNumber`, and the two "max" constants are in sync.
829     const uint32_t new_size =
830         std::min<uint32_t>(safe_new_size.ValueOrDie(), kMaxXRefSize);
831     if (new_size > current_size) {
832       m_CrossRefTable->SetObjectMapSize(new_size);
833     }
834 
835     for (uint32_t i = 0; i < index.obj_count; ++i) {
836       const uint32_t obj_num = index.start_obj_num + i;
837       if (obj_num >= kMaxObjectNumber) {
838         break;
839       }
840 
841       ProcessCrossRefV5Entry(seg_span.subspan(i * total_width, total_width),
842                              field_widths, obj_num);
843     }
844 
845     segindex += index.obj_count;
846   }
847   return true;
848 }
849 
ProcessCrossRefV5Entry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths,uint32_t obj_num)850 void CPDF_Parser::ProcessCrossRefV5Entry(
851     pdfium::span<const uint8_t> entry_span,
852     pdfium::span<const uint32_t> field_widths,
853     uint32_t obj_num) {
854   DCHECK_GE(field_widths.size(), kMinFieldCount);
855   ObjectType type;
856   if (field_widths[0]) {
857     const uint32_t cross_ref_stream_obj_type =
858         GetFirstXRefStreamEntry(entry_span, field_widths);
859     type = GetObjectTypeFromCrossRefStreamType(cross_ref_stream_obj_type);
860     if (type == ObjectType::kNull)
861       return;
862   } else {
863     // Per ISO 32000-1:2008 table 17, use the default value of 1 for the xref
864     // stream entry when it is not specified. The `type` assignment is the
865     // equivalent to calling GetObjectTypeFromCrossRefStreamType(1).
866     type = ObjectType::kNotCompressed;
867   }
868 
869   const ObjectType existing_type = GetObjectType(obj_num);
870   if (existing_type == ObjectType::kNull) {
871     const uint32_t offset = GetSecondXRefStreamEntry(entry_span, field_widths);
872     if (pdfium::base::IsValueInRangeForNumericType<FX_FILESIZE>(offset))
873       m_CrossRefTable->AddNormal(obj_num, 0, offset);
874     return;
875   }
876 
877   if (existing_type != ObjectType::kFree)
878     return;
879 
880   if (type == ObjectType::kFree) {
881     m_CrossRefTable->SetFree(obj_num);
882     return;
883   }
884 
885   if (type == ObjectType::kNotCompressed) {
886     const uint32_t offset = GetSecondXRefStreamEntry(entry_span, field_widths);
887     if (pdfium::base::IsValueInRangeForNumericType<FX_FILESIZE>(offset))
888       m_CrossRefTable->AddNormal(obj_num, 0, offset);
889     return;
890   }
891 
892   DCHECK_EQ(type, ObjectType::kCompressed);
893   const uint32_t archive_obj_num =
894       GetSecondXRefStreamEntry(entry_span, field_widths);
895   if (!IsValidObjectNumber(archive_obj_num)) {
896     return;
897   }
898 
899   const uint32_t archive_obj_index =
900       GetThirdXRefStreamEntry(entry_span, field_widths);
901   m_CrossRefTable->AddCompressed(obj_num, archive_obj_num, archive_obj_index);
902 }
903 
GetIDArray() const904 RetainPtr<const CPDF_Array> CPDF_Parser::GetIDArray() const {
905   return GetTrailer() ? GetTrailer()->GetArrayFor("ID") : nullptr;
906 }
907 
GetRoot() const908 RetainPtr<const CPDF_Dictionary> CPDF_Parser::GetRoot() const {
909   RetainPtr<CPDF_Object> obj =
910       m_pObjectsHolder->GetOrParseIndirectObject(GetRootObjNum());
911   return obj ? obj->GetDict() : nullptr;
912 }
913 
GetEncryptDict() const914 RetainPtr<const CPDF_Dictionary> CPDF_Parser::GetEncryptDict() const {
915   if (!GetTrailer())
916     return nullptr;
917 
918   RetainPtr<const CPDF_Object> pEncryptObj =
919       GetTrailer()->GetObjectFor("Encrypt");
920   if (!pEncryptObj)
921     return nullptr;
922 
923   if (pEncryptObj->IsDictionary())
924     return pdfium::WrapRetain(pEncryptObj->AsDictionary());
925 
926   if (pEncryptObj->IsReference()) {
927     return ToDictionary(m_pObjectsHolder->GetOrParseIndirectObject(
928         pEncryptObj->AsReference()->GetRefObjNum()));
929   }
930   return nullptr;
931 }
932 
GetEncodedPassword() const933 ByteString CPDF_Parser::GetEncodedPassword() const {
934   return GetSecurityHandler()->GetEncodedPassword(GetPassword().AsStringView());
935 }
936 
GetTrailer() const937 const CPDF_Dictionary* CPDF_Parser::GetTrailer() const {
938   return m_CrossRefTable->trailer();
939 }
940 
GetMutableTrailerForTesting()941 CPDF_Dictionary* CPDF_Parser::GetMutableTrailerForTesting() {
942   return m_CrossRefTable->GetMutableTrailerForTesting();
943 }
944 
GetTrailerObjectNumber() const945 uint32_t CPDF_Parser::GetTrailerObjectNumber() const {
946   return m_CrossRefTable->trailer_object_number();
947 }
948 
GetCombinedTrailer() const949 RetainPtr<CPDF_Dictionary> CPDF_Parser::GetCombinedTrailer() const {
950   return m_CrossRefTable->trailer()
951              ? ToDictionary(m_CrossRefTable->trailer()->Clone())
952              : RetainPtr<CPDF_Dictionary>();
953 }
954 
GetInfoObjNum() const955 uint32_t CPDF_Parser::GetInfoObjNum() const {
956   RetainPtr<const CPDF_Reference> pRef =
957       ToReference(m_CrossRefTable->trailer()
958                       ? m_CrossRefTable->trailer()->GetObjectFor("Info")
959                       : nullptr);
960   return pRef ? pRef->GetRefObjNum() : CPDF_Object::kInvalidObjNum;
961 }
962 
GetRootObjNum() const963 uint32_t CPDF_Parser::GetRootObjNum() const {
964   RetainPtr<const CPDF_Reference> pRef =
965       ToReference(m_CrossRefTable->trailer()
966                       ? m_CrossRefTable->trailer()->GetObjectFor("Root")
967                       : nullptr);
968   return pRef ? pRef->GetRefObjNum() : CPDF_Object::kInvalidObjNum;
969 }
970 
ParseIndirectObject(uint32_t objnum)971 RetainPtr<CPDF_Object> CPDF_Parser::ParseIndirectObject(uint32_t objnum) {
972   if (!IsValidObjectNumber(objnum))
973     return nullptr;
974 
975   // Prevent circular parsing the same object.
976   if (pdfium::Contains(m_ParsingObjNums, objnum))
977     return nullptr;
978 
979   ScopedSetInsertion<uint32_t> local_insert(&m_ParsingObjNums, objnum);
980   if (GetObjectType(objnum) == ObjectType::kNotCompressed) {
981     FX_FILESIZE pos = GetObjectPositionOrZero(objnum);
982     if (pos <= 0)
983       return nullptr;
984     return ParseIndirectObjectAt(pos, objnum);
985   }
986   if (GetObjectType(objnum) != ObjectType::kCompressed)
987     return nullptr;
988 
989   const ObjectInfo& info = *m_CrossRefTable->GetObjectInfo(objnum);
990   const CPDF_ObjectStream* pObjStream = GetObjectStream(info.archive.obj_num);
991   if (!pObjStream)
992     return nullptr;
993 
994   return pObjStream->ParseObject(m_pObjectsHolder, objnum,
995                                  info.archive.obj_index);
996 }
997 
GetObjectStream(uint32_t object_number)998 const CPDF_ObjectStream* CPDF_Parser::GetObjectStream(uint32_t object_number) {
999   // Prevent circular parsing the same object.
1000   if (pdfium::Contains(m_ParsingObjNums, object_number))
1001     return nullptr;
1002 
1003   auto it = m_ObjectStreamMap.find(object_number);
1004   if (it != m_ObjectStreamMap.end())
1005     return it->second.get();
1006 
1007   const auto* info = m_CrossRefTable->GetObjectInfo(object_number);
1008   if (!info || info->type != ObjectType::kObjStream)
1009     return nullptr;
1010 
1011   const FX_FILESIZE object_pos = info->pos;
1012   if (object_pos <= 0)
1013     return nullptr;
1014 
1015   // Keep track of `object_number` before doing more parsing.
1016   ScopedSetInsertion<uint32_t> local_insert(&m_ParsingObjNums, object_number);
1017 
1018   RetainPtr<CPDF_Object> object =
1019       ParseIndirectObjectAt(object_pos, object_number);
1020   if (!object)
1021     return nullptr;
1022 
1023   std::unique_ptr<CPDF_ObjectStream> objs_stream =
1024       CPDF_ObjectStream::Create(ToStream(object));
1025   const CPDF_ObjectStream* result = objs_stream.get();
1026   m_ObjectStreamMap[object_number] = std::move(objs_stream);
1027 
1028   return result;
1029 }
1030 
ParseIndirectObjectAt(FX_FILESIZE pos,uint32_t objnum)1031 RetainPtr<CPDF_Object> CPDF_Parser::ParseIndirectObjectAt(FX_FILESIZE pos,
1032                                                           uint32_t objnum) {
1033   const FX_FILESIZE saved_pos = m_pSyntax->GetPos();
1034   m_pSyntax->SetPos(pos);
1035 
1036   auto result = m_pSyntax->GetIndirectObject(
1037       m_pObjectsHolder, CPDF_SyntaxParser::ParseType::kLoose);
1038   m_pSyntax->SetPos(saved_pos);
1039   if (result && objnum && result->GetObjNum() != objnum)
1040     return nullptr;
1041 
1042   const bool should_decrypt = m_pSecurityHandler &&
1043                               m_pSecurityHandler->GetCryptoHandler() &&
1044                               objnum != m_MetadataObjnum;
1045   if (should_decrypt &&
1046       !m_pSecurityHandler->GetCryptoHandler()->DecryptObjectTree(result)) {
1047     return nullptr;
1048   }
1049   return result;
1050 }
1051 
GetDocumentSize() const1052 FX_FILESIZE CPDF_Parser::GetDocumentSize() const {
1053   return m_pSyntax->GetDocumentSize();
1054 }
1055 
GetFirstPageNo() const1056 uint32_t CPDF_Parser::GetFirstPageNo() const {
1057   return m_pLinearized ? m_pLinearized->GetFirstPageNo() : 0;
1058 }
1059 
SetLinearizedHeaderForTesting(std::unique_ptr<CPDF_LinearizedHeader> pLinearized)1060 void CPDF_Parser::SetLinearizedHeaderForTesting(
1061     std::unique_ptr<CPDF_LinearizedHeader> pLinearized) {
1062   m_pLinearized = std::move(pLinearized);
1063 }
1064 
LoadTrailerV4()1065 RetainPtr<CPDF_Dictionary> CPDF_Parser::LoadTrailerV4() {
1066   if (m_pSyntax->GetKeyword() != "trailer")
1067     return nullptr;
1068 
1069   return ToDictionary(m_pSyntax->GetObjectBody(m_pObjectsHolder));
1070 }
1071 
GetPermissions() const1072 uint32_t CPDF_Parser::GetPermissions() const {
1073   return m_pSecurityHandler ? m_pSecurityHandler->GetPermissions() : 0xFFFFFFFF;
1074 }
1075 
ParseLinearizedHeader()1076 std::unique_ptr<CPDF_LinearizedHeader> CPDF_Parser::ParseLinearizedHeader() {
1077   return CPDF_LinearizedHeader::Parse(m_pSyntax.get());
1078 }
1079 
StartLinearizedParse(RetainPtr<CPDF_ReadValidator> validator,const ByteString & password)1080 CPDF_Parser::Error CPDF_Parser::StartLinearizedParse(
1081     RetainPtr<CPDF_ReadValidator> validator,
1082     const ByteString& password) {
1083   DCHECK(!m_bHasParsed);
1084   DCHECK(!m_bXRefTableRebuilt);
1085   SetPassword(password);
1086   m_bXRefStream = false;
1087   m_LastXRefOffset = 0;
1088 
1089   if (!InitSyntaxParser(std::move(validator)))
1090     return FORMAT_ERROR;
1091 
1092   m_pLinearized = ParseLinearizedHeader();
1093   if (!m_pLinearized)
1094     return StartParseInternal();
1095 
1096   m_bHasParsed = true;
1097 
1098   m_LastXRefOffset = m_pLinearized->GetLastXRefOffset();
1099   FX_FILESIZE dwFirstXRefOffset = m_LastXRefOffset;
1100   bool bLoadV4 = LoadCrossRefV4(dwFirstXRefOffset, false);
1101   if (!bLoadV4 && !LoadCrossRefV5(&dwFirstXRefOffset, true)) {
1102     if (!RebuildCrossRef())
1103       return FORMAT_ERROR;
1104 
1105     m_bXRefTableRebuilt = true;
1106     m_LastXRefOffset = 0;
1107   }
1108   if (bLoadV4) {
1109     RetainPtr<CPDF_Dictionary> trailer = LoadTrailerV4();
1110     if (!trailer)
1111       return SUCCESS;
1112 
1113     m_CrossRefTable->SetTrailer(std::move(trailer), kNoV4TrailerObjectNumber);
1114     const int32_t xrefsize = GetTrailer()->GetDirectIntegerFor("Size");
1115     if (xrefsize > 0) {
1116       // Check if `xrefsize` is correct. If it is incorrect, give up and rebuild
1117       // the xref table.
1118       const uint32_t expected_last_obj_num = xrefsize - 1;
1119       if (GetLastObjNum() != expected_last_obj_num && !RebuildCrossRef()) {
1120         return FORMAT_ERROR;
1121       }
1122     }
1123   }
1124 
1125   Error eRet = SetEncryptHandler();
1126   if (eRet != SUCCESS)
1127     return eRet;
1128 
1129   if (!GetRoot() || !m_pObjectsHolder->TryInit()) {
1130     if (m_bXRefTableRebuilt)
1131       return FORMAT_ERROR;
1132 
1133     ReleaseEncryptHandler();
1134     if (!RebuildCrossRef())
1135       return FORMAT_ERROR;
1136 
1137     eRet = SetEncryptHandler();
1138     if (eRet != SUCCESS)
1139       return eRet;
1140 
1141     m_pObjectsHolder->TryInit();
1142     if (!GetRoot())
1143       return FORMAT_ERROR;
1144   }
1145 
1146   if (GetRootObjNum() == CPDF_Object::kInvalidObjNum) {
1147     ReleaseEncryptHandler();
1148     if (!RebuildCrossRef() || GetRootObjNum() == CPDF_Object::kInvalidObjNum)
1149       return FORMAT_ERROR;
1150 
1151     eRet = SetEncryptHandler();
1152     if (eRet != SUCCESS)
1153       return eRet;
1154   }
1155 
1156   if (m_pSecurityHandler && m_pSecurityHandler->IsMetadataEncrypted()) {
1157     RetainPtr<const CPDF_Reference> pMetadata =
1158         ToReference(GetRoot()->GetObjectFor("Metadata"));
1159     if (pMetadata)
1160       m_MetadataObjnum = pMetadata->GetRefObjNum();
1161   }
1162   return SUCCESS;
1163 }
1164 
LoadLinearizedAllCrossRefV5(FX_FILESIZE main_xref_offset)1165 bool CPDF_Parser::LoadLinearizedAllCrossRefV5(FX_FILESIZE main_xref_offset) {
1166   FX_FILESIZE xref_offset = main_xref_offset;
1167   if (!LoadCrossRefV5(&xref_offset, false))
1168     return false;
1169 
1170   std::set<FX_FILESIZE> seen_xref_offset;
1171   while (xref_offset) {
1172     seen_xref_offset.insert(xref_offset);
1173     if (!LoadCrossRefV5(&xref_offset, false))
1174       return false;
1175 
1176     // Check for circular references.
1177     if (pdfium::Contains(seen_xref_offset, xref_offset))
1178       return false;
1179   }
1180   m_ObjectStreamMap.clear();
1181   m_bXRefStream = true;
1182   return true;
1183 }
1184 
LoadLinearizedMainXRefTable()1185 CPDF_Parser::Error CPDF_Parser::LoadLinearizedMainXRefTable() {
1186   const FX_SAFE_FILESIZE prev = GetTrailer()->GetIntegerFor("Prev");
1187   const FX_FILESIZE main_xref_offset = prev.ValueOrDefault(-1);
1188   if (main_xref_offset < 0)
1189     return FORMAT_ERROR;
1190 
1191   if (main_xref_offset == 0)
1192     return SUCCESS;
1193 
1194   const AutoRestorer<uint32_t> save_metadata_objnum(&m_MetadataObjnum);
1195   m_MetadataObjnum = 0;
1196   m_ObjectStreamMap.clear();
1197 
1198   if (!LoadLinearizedAllCrossRefV4(main_xref_offset) &&
1199       !LoadLinearizedAllCrossRefV5(main_xref_offset)) {
1200     m_LastXRefOffset = 0;
1201     return FORMAT_ERROR;
1202   }
1203 
1204   return SUCCESS;
1205 }
1206 
SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser)1207 void CPDF_Parser::SetSyntaxParserForTesting(
1208     std::unique_ptr<CPDF_SyntaxParser> parser) {
1209   m_pSyntax = std::move(parser);
1210 }
1211 
GetTrailerEnds()1212 std::vector<unsigned int> CPDF_Parser::GetTrailerEnds() {
1213   std::vector<unsigned int> trailer_ends;
1214   m_pSyntax->SetTrailerEnds(&trailer_ends);
1215 
1216   // Traverse the document.
1217   m_pSyntax->SetPos(0);
1218   while (true) {
1219     CPDF_SyntaxParser::WordResult word_result = m_pSyntax->GetNextWord();
1220     if (word_result.is_number) {
1221       // The object number was read. Read the generation number.
1222       word_result = m_pSyntax->GetNextWord();
1223       if (!word_result.is_number)
1224         break;
1225 
1226       word_result = m_pSyntax->GetNextWord();
1227       if (word_result.word != "obj")
1228         break;
1229 
1230       m_pSyntax->GetObjectBody(nullptr);
1231 
1232       word_result = m_pSyntax->GetNextWord();
1233       if (word_result.word != "endobj")
1234         break;
1235     } else if (word_result.word == "trailer") {
1236       m_pSyntax->GetObjectBody(nullptr);
1237     } else if (word_result.word == "startxref") {
1238       m_pSyntax->GetNextWord();
1239     } else if (word_result.word == "xref") {
1240       while (true) {
1241         word_result = m_pSyntax->GetNextWord();
1242         if (word_result.word.IsEmpty() || word_result.word == "startxref")
1243           break;
1244       }
1245       m_pSyntax->GetNextWord();
1246     } else {
1247       break;
1248     }
1249   }
1250 
1251   // Stop recording trailer ends.
1252   m_pSyntax->SetTrailerEnds(nullptr);
1253   return trailer_ends;
1254 }
1255 
WriteToArchive(IFX_ArchiveStream * archive,FX_FILESIZE src_size)1256 bool CPDF_Parser::WriteToArchive(IFX_ArchiveStream* archive,
1257                                  FX_FILESIZE src_size) {
1258   static constexpr FX_FILESIZE kBufferSize = 4096;
1259   DataVector<uint8_t> buffer(kBufferSize);
1260   m_pSyntax->SetPos(0);
1261   while (src_size) {
1262     const uint32_t block_size =
1263         static_cast<uint32_t>(std::min(kBufferSize, src_size));
1264     auto block_span = pdfium::make_span(buffer).first(block_size);
1265     if (!m_pSyntax->ReadBlock(block_span))
1266       return false;
1267     if (!archive->WriteBlock(pdfium::make_span(buffer).first(block_size)))
1268       return false;
1269     src_size -= block_size;
1270   }
1271   return true;
1272 }
1273