xref: /aosp_15_r20/external/pdfium/core/fpdfapi/parser/object_tree_traversal_util.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2023 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "core/fpdfapi/parser/object_tree_traversal_util.h"
6 
7 #include <stdint.h>
8 
9 #include <map>
10 #include <queue>
11 #include <set>
12 #include <utility>
13 #include <vector>
14 
15 #include "core/fpdfapi/parser/cpdf_array.h"
16 #include "core/fpdfapi/parser/cpdf_dictionary.h"
17 #include "core/fpdfapi/parser/cpdf_document.h"
18 #include "core/fpdfapi/parser/cpdf_reference.h"
19 #include "core/fpdfapi/parser/cpdf_stream.h"
20 #include "core/fxcrt/unowned_ptr.h"
21 #include "third_party/base/check.h"
22 #include "third_party/base/containers/contains.h"
23 
24 namespace {
25 
26 class ObjectTreeTraverser {
27  public:
ObjectTreeTraverser(const CPDF_Document * document)28   explicit ObjectTreeTraverser(const CPDF_Document* document)
29       : document_(document) {
30     const CPDF_Parser* parser = document_->GetParser();
31     const CPDF_Dictionary* trailer = parser ? parser->GetTrailer() : nullptr;
32     const CPDF_Dictionary* root = trailer ? trailer : document_->GetRoot();
33     const uint32_t root_object_number =
34         trailer ? parser->GetTrailerObjectNumber() : root->GetObjNum();
35     // If `root` is a trailer, then it may not have an object number, as many
36     // trailers are inlined.
37     if (root_object_number) {
38       referenced_objects_[root_object_number] = 1;
39       object_number_map_[root] = root_object_number;
40     }
41 
42     object_queue_.push(pdfium::WrapRetain(root));
43     seen_objects_.insert(root);
44   }
45   ~ObjectTreeTraverser() = default;
46 
Traverse()47   void Traverse() { CalculateReferenceCounts(GetReferenceEntries()); }
48 
referenced_objects()49   const std::map<uint32_t, int>& referenced_objects() {
50     return referenced_objects_;
51   }
52 
53  private:
54   struct ReferenceEntry {
55     uint32_t ref_object_number;
56     uint32_t referenced_object_number;
57   };
58 
GetReferenceEntries()59   std::vector<ReferenceEntry> GetReferenceEntries() {
60     std::vector<ReferenceEntry> reference_entries;
61     while (!object_queue_.empty()) {
62       RetainPtr<const CPDF_Object> current_object = object_queue_.front();
63       object_queue_.pop();
64 
65       switch (current_object->GetType()) {
66         case CPDF_Object::kArray: {
67           CPDF_ArrayLocker locker(current_object->AsArray());
68           for (const auto& it : locker) {
69             PushNewObject(current_object, it);
70           }
71           break;
72         }
73         case CPDF_Object::kDictionary: {
74           CPDF_DictionaryLocker locker(current_object->AsDictionary());
75           for (const auto& it : locker) {
76             PushNewObject(current_object, it.second);
77           }
78           break;
79         }
80         case CPDF_Object::kReference: {
81           const CPDF_Reference* ref_object = current_object->AsReference();
82           const uint32_t ref_object_number = GetObjectNumber(ref_object);
83           const uint32_t referenced_object_number = ref_object->GetRefObjNum();
84 
85           RetainPtr<const CPDF_Object> referenced_object;
86           if (ref_object->HasIndirectObjectHolder()) {
87             // Calling GetIndirectObject() does not work for normal references.
88             referenced_object = ref_object->GetDirect();
89           } else {
90             // Calling GetDirect() does not work for references from trailers.
91             referenced_object =
92                 document_->GetIndirectObject(referenced_object_number);
93           }
94           // Unlike the other object types, CPDF_Reference can point at nullptr.
95           if (referenced_object) {
96             CHECK(referenced_object_number);
97             reference_entries.push_back(
98                 {ref_object_number, referenced_object_number});
99             PushNewObject(ref_object, referenced_object);
100           }
101           break;
102         }
103         case CPDF_Object::kStream: {
104           RetainPtr<const CPDF_Dictionary> dict =
105               current_object->AsStream()->GetDict();
106           CHECK(dict->IsInline());  // i.e. No object number.
107           CPDF_DictionaryLocker locker(dict);
108           for (const auto& it : locker) {
109             PushNewObject(current_object, it.second);
110           }
111           break;
112         }
113         default: {
114           break;
115         }
116       }
117     }
118     return reference_entries;
119   }
120 
CalculateReferenceCounts(const std::vector<ReferenceEntry> & reference_entries)121   void CalculateReferenceCounts(
122       const std::vector<ReferenceEntry>& reference_entries) {
123     // Tracks PDF objects that referenced other PDF objects, identified by their
124     // object numbers. Never 0.
125     std::set<uint32_t> seen_ref_objects;
126 
127     for (const ReferenceEntry& entry : reference_entries) {
128       // Make sure this is not a self-reference.
129       if (entry.referenced_object_number == entry.ref_object_number) {
130         continue;
131       }
132 
133       // Make sure this is not a circular reference.
134       if (pdfium::Contains(seen_ref_objects, entry.ref_object_number) &&
135           pdfium::Contains(seen_ref_objects, entry.referenced_object_number)) {
136         continue;
137       }
138 
139       ++referenced_objects_[entry.referenced_object_number];
140       if (entry.ref_object_number) {
141         seen_ref_objects.insert(entry.ref_object_number);
142       }
143     }
144   }
145 
PushNewObject(const CPDF_Object * parent_object,RetainPtr<const CPDF_Object> child_object)146   void PushNewObject(const CPDF_Object* parent_object,
147                      RetainPtr<const CPDF_Object> child_object) {
148     CHECK(parent_object);
149     CHECK(child_object);
150     const bool inserted = seen_objects_.insert(child_object).second;
151     if (!inserted) {
152       return;
153     }
154     const uint32_t child_object_number = child_object->GetObjNum();
155     if (child_object_number) {
156       object_number_map_[child_object] = child_object_number;
157     } else {
158       // This search can fail for inlined trailers.
159       auto it = object_number_map_.find(parent_object);
160       if (it != object_number_map_.end()) {
161         object_number_map_[child_object] = it->second;
162       }
163     }
164     object_queue_.push(std::move(child_object));
165   }
166 
167   // Returns 0 if not found.
GetObjectNumber(const CPDF_Object * object) const168   uint32_t GetObjectNumber(const CPDF_Object* object) const {
169     auto it = object_number_map_.find(object);
170     return it != object_number_map_.end() ? it->second : 0;
171   }
172 
173   UnownedPtr<const CPDF_Document> const document_;
174 
175   // Queue of objects to traverse.
176   // - Pointers in the queue are non-null.
177   // - The same pointer never enters the queue twice.
178   std::queue<RetainPtr<const CPDF_Object>> object_queue_;
179 
180   // Map of objects to "top-level" object numbers. For inline objects, this is
181   // the ancestor object with an object number. The keys are non-null and the
182   // values are never 0.
183   // This is used to prevent self-references, as a single PDF object, with
184   // inlined objects, is represented by multiple CPDF_Objects.
185   std::map<const CPDF_Object*, uint32_t> object_number_map_;
186 
187   // Tracks traversed objects to prevent duplicates from getting into
188   // `object_queue_` and `object_number_map_`.
189   std::set<const CPDF_Object*> seen_objects_;
190 
191   // Tracks which PDF objects are referenced.
192   // Key: object number
193   // Value: number of times referenced
194   std::map<uint32_t, int> referenced_objects_;
195 };
196 
197 }  // namespace
198 
GetObjectsWithReferences(const CPDF_Document * document)199 std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document) {
200   ObjectTreeTraverser traverser(document);
201   traverser.Traverse();
202 
203   std::set<uint32_t> results;
204   for (const auto& it : traverser.referenced_objects()) {
205     results.insert(it.first);
206   }
207   return results;
208 }
209 
GetObjectsWithMultipleReferences(const CPDF_Document * document)210 std::set<uint32_t> GetObjectsWithMultipleReferences(
211     const CPDF_Document* document) {
212   ObjectTreeTraverser traverser(document);
213   traverser.Traverse();
214 
215   std::set<uint32_t> results;
216   for (const auto& it : traverser.referenced_objects()) {
217     if (it.second > 1) {
218       results.insert(it.first);
219     }
220   }
221   return results;
222 }
223