1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // We store the trie in three areas: nodes, nexts and suffixes.
16 //
17 // Nodes contain an index to a children array (kept in nexts) or to
18 // suffixes (for leaf nodes). Nexts contain children arrays of
19 // different sizes. Each child entry has the matched char and an index
20 // back into the nodes. Leaf nodes index into suffixes instead of the
21 // nexts array. Each suffix is a NULL-terminated suffix off the trie,
22 // followed by a 4-byte value associated with that key.
23 //
24 // Allocation
25 //
26 // Nodes are allocated and never removed. Nexts contain arrays of
27 // sizes in power-of-2 increments, i.e. 1, 2, 4, ..., 256. When the
28 // number of children of a node increases, it is relocated to an array
29 // with the proper size. The (smaller) unused array is added to a free
30 // list. A free list is kept for each array size. Allocations happen
31 // from the free list first, and then from the end of the nexts
32 // array. Suffixes are never freed or compacted. If a node wants to
33 // refer to a smaller suffix, it moves the pointer forward and the
34 // characters before the new pointer are wasted.
35 //
36 // Keys can contain any character except '\0'. The '\0' char is
37 // special in that it specifies an end-of-key in the child array.
38 //
39 // Ideas to try:
40 //
41 // - Put suffix index in a Next instead of creating a leaf node.
42 // - Change allocation buckets to 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, ..., 256
43 // - Compact next array
44 // - GroupVarByte and delta-encode the next array
45 // - Collapse nodes with single children
46 //
47 // Persistence
48 //
49 // We persist the trie in a binary format such that resurrecting the
50 // trie is simply a few file reads. The file is laid out as such:
51 //
52 // - Header
53 // - Nodes
54 // - Nexts
55 // - Suffixes
56 //
57 // Each section is aligned to IcingMMapper::system_page_size(). The max
58 // requested value for each array is pre-allocated in the file. When
59 // we make modifications to the arrays, we set bits in a dirty bitmap
60 // of pages. No changes get written to disk until an explicit call to
61 // Flush. Then we only write the pages that have their dirty bit set.
62
63 #include "icing/legacy/index/icing-dynamic-trie.h"
64
65 #include <fcntl.h>
66 #include <sys/mman.h>
67 #include <sys/stat.h>
68 #include <unistd.h>
69
70 #include <algorithm>
71 #include <cerrno>
72 #include <cinttypes>
73 #include <cstdint>
74 #include <cstring>
75 #include <memory>
76 #include <ostream>
77 #include <string>
78 #include <string_view>
79 #include <utility>
80 #include <vector>
81
82 #include "icing/text_classifier/lib3/utils/base/status.h"
83 #include "icing/absl_ports/canonical_errors.h"
84 #include "icing/legacy/core/icing-string-util.h"
85 #include "icing/legacy/core/icing-timer.h"
86 #include "icing/legacy/index/icing-array-storage.h"
87 #include "icing/legacy/index/icing-filesystem.h"
88 #include "icing/legacy/index/icing-flash-bitmap.h"
89 #include "icing/legacy/index/icing-mmapper.h"
90 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
91 #include "icing/util/crc32.h"
92 #include "icing/util/i18n-utils.h"
93 #include "icing/util/logging.h"
94 #include "icing/util/math-util.h"
95 #include "icing/util/status-macros.h"
96
97 namespace icing {
98 namespace lib {
99
100 namespace {
101
102 constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1;
103 constexpr uint32_t kInvalidNextIndex = ~0U;
104
ResetMutableNext(IcingDynamicTrie::Next & mutable_next)105 void ResetMutableNext(IcingDynamicTrie::Next &mutable_next) {
106 mutable_next.set_val(0xff);
107 mutable_next.set_node_index(kInvalidNodeIndex);
108 }
109
110 // Helper function to check that there is no termination character '\0' in the
111 // key.
IsKeyValid(std::string_view key)112 bool IsKeyValid(std::string_view key) {
113 return key.find('\0') == std::string_view::npos; // NOLINT
114 }
115
GetCharOrNull(std::string_view s,int pos)116 char GetCharOrNull(std::string_view s, int pos) {
117 return (pos < s.size()) ? s[pos] : '\0';
118 }
119
120 } // namespace
121
122 // Based on the bit field widths.
123 const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
124 const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
125 const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
126 const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
127
128 const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
129
130 const int IcingDynamicTrie::kMaxNextArraySize;
131 const int IcingDynamicTrie::kNumNextAllocationBuckets;
132
133 const uint32_t IcingDynamicTrie::kMaxPropertyId;
134
135 const uint32_t IcingDynamicTrie::kInvalidValueIndex;
136
137 const uint32_t IcingDynamicTrie::kNoCrc;
138
139 // Manages logical node candidates while searching for possible
140 // variant matches. Currently implemented as depth first search. The
141 // max stack depth is key length * variant fanout. Since max variant
142 // fanout is 3, we don't need to worry about blowup of the depth first
143 // search stack.
144 //
145 // Keeps track of original matched string (the string actually present
146 // in the trie) for every candidate.
147 class IcingDynamicTrie::CandidateSet {
148 public:
149 struct Candidate {
150 LogicalNode logical_node;
151 const char *key;
152 int matched_prefix_len;
153 std::string matched_span;
154
Candidateicing::lib::IcingDynamicTrie::CandidateSet::Candidate155 Candidate() {}
156
Candidateicing::lib::IcingDynamicTrie::CandidateSet::Candidate157 Candidate(const LogicalNode &logical_node_in, const char *key_in,
158 int matched_prefix_len_in, const char *matched_span_in,
159 int matched_span_len_in)
160 : logical_node(logical_node_in),
161 key(key_in),
162 matched_prefix_len(matched_prefix_len_in),
163 matched_span(matched_span_in, matched_span_len_in) {}
164
matched_lenicing::lib::IcingDynamicTrie::CandidateSet::Candidate165 int matched_len() const { return matched_prefix_len + matched_span.size(); }
166 };
167
CandidateSet(bool prefix)168 explicit CandidateSet(bool prefix) : prefix_(prefix) {}
169
IsTerminal(const char * key,uint32_t value_index) const170 bool IsTerminal(const char *key, uint32_t value_index) const {
171 // Terminal match condition:
172 //
173 // 1. Key was entirely consumed.
174 // 2. The entire suffix was consumed (hence value index is
175 // valid). OR, we are ok with prefix matches.
176 return *key == 0 && (value_index != kInvalidValueIndex || prefix_);
177 }
178
179 // Push a terminal or non-terminal.
Push(const LogicalNode & logical_node,const char * key,uint32_t value_index,int matched_prefix_len,const char * matched_span,int matched_span_len)180 void Push(const LogicalNode &logical_node, const char *key,
181 uint32_t value_index, int matched_prefix_len,
182 const char *matched_span, int matched_span_len) {
183 if (!AddMatchIfTerminal(key, value_index, matched_span, matched_span_len)) {
184 PushNonTerminal(logical_node, key, matched_prefix_len, matched_span,
185 matched_span_len);
186 }
187 }
188
AddMatchIfTerminal(const char * key,uint32_t value_index,const char * matched_span,int matched_span_len)189 bool AddMatchIfTerminal(const char *key, uint32_t value_index,
190 const char *matched_span, int matched_span_len) {
191 if (!IsTerminal(key, value_index)) {
192 return false;
193 }
194
195 // Terminal match.
196 matches_.push_back(OriginalMatch());
197 OriginalMatch *match = &matches_.back();
198 match->value_index = value_index;
199 match->orig.reserve(cur_prefix_.size() + matched_span_len);
200 match->orig.append(cur_prefix_).append(matched_span, matched_span_len);
201 return true;
202 }
203
204 // Push a definite non-terminal.
PushNonTerminal(const LogicalNode & logical_node,const char * key,int matched_prefix_len,const char * matched_span,int matched_span_len)205 void PushNonTerminal(const LogicalNode &logical_node, const char *key,
206 int matched_prefix_len, const char *matched_span,
207 int matched_span_len) {
208 candidates_.push_back(Candidate(logical_node, key, matched_prefix_len,
209 matched_span, matched_span_len));
210 }
211
Pop(Candidate * candidate)212 void Pop(Candidate *candidate) {
213 *candidate = candidates_.back();
214 if (cur_prefix_.size() < candidate->matched_prefix_len) {
215 ICING_LOG(FATAL)
216 << "Length of current prefix is smaller than length of matched "
217 "prefer, there're inconsistencies in dynamic trie.";
218 }
219
220 cur_prefix_.resize(candidate->matched_prefix_len);
221 cur_prefix_.append(candidate->matched_span);
222 candidates_.pop_back();
223 }
224
empty() const225 bool empty() const { return candidates_.empty(); }
226
Release(std::vector<OriginalMatch> * ret)227 void Release(std::vector<OriginalMatch> *ret) {
228 if (!empty()) {
229 ICING_LOG(FATAL) << "Candidate set not empty before releasing matches";
230 }
231
232 ret->swap(matches_);
233
234 cur_prefix_.clear();
235 candidates_.clear();
236 matches_.clear();
237 }
238
239 private:
240 const bool prefix_;
241
242 std::string cur_prefix_;
243 std::vector<Candidate> candidates_;
244
245 std::vector<IcingDynamicTrie::OriginalMatch> matches_;
246 };
247
248 // Options.
is_valid() const249 bool IcingDynamicTrie::Options::is_valid() const {
250 return max_nodes <= kMaxNodes && max_nodes > 0 && max_nexts <= kMaxNexts &&
251 max_nexts > 0 && max_suffixes_size <= kMaxSuffixesSize &&
252 max_suffixes_size > 0 && value_size <= kMaxValueSize;
253 }
254
255 // IcingDynamicTrieStorage
256 class IcingDynamicTrie::IcingDynamicTrieStorage {
257 public:
258 IcingDynamicTrieStorage(const std::string &file_basename,
259 const RuntimeOptions &runtime_options,
260 const IcingFilesystem *filesystem);
261 ~IcingDynamicTrieStorage();
262
is_initialized() const263 bool is_initialized() const { return hdr_mmapper_.is_valid(); }
264
265 bool CreateIfNotExist(const Options &options);
266 bool Init();
267 static bool Remove(const std::string &file_basename,
268 const IcingFilesystem &filesystem);
269 bool Sync();
270 uint64_t GetDiskUsage() const;
271
272 // Returns the size of the elements held in the trie. This excludes the size
273 // of any internal metadata of the trie, e.g. the trie's header.
274 uint64_t GetElementsFileSize() const;
275
276 void Warm();
277
278 void Clear();
279
empty() const280 bool empty() const { return hdr().num_nodes() == 0; }
281
282 // Never cast off these consts when writing to the arrays. Always
283 // use the GetMutable* helpers above.
GetNode(uint32_t idx) const284 const Node *GetNode(uint32_t idx) const {
285 return &array_storage_[NODE].array_cast<Node>()[idx];
286 }
287
288 // REQUIRES: !empty(). Otherwise node 0 could contain invalid data
289 // (next_index, is_leaf, log2_num_children).
GetRootNode() const290 const Node *GetRootNode() const { return GetNode(0); }
291
GetNext(uint32_t idx,int child) const292 const Next *GetNext(uint32_t idx, int child) const {
293 return &array_storage_[NEXT].array_cast<Next>()[idx + child];
294 }
GetSuffix(uint32_t idx) const295 const char *GetSuffix(uint32_t idx) const {
296 return &array_storage_[SUFFIX].array_cast<char>()[idx];
297 }
298
GetNodeIndex(const Node * node) const299 uint32_t GetNodeIndex(const Node *node) const { return node - GetNode(0); }
GetNextArrayIndex(const Next * next) const300 uint32_t GetNextArrayIndex(const Next *next) const {
301 return next - GetNext(0, 0);
302 }
GetSuffixIndex(const char * suffix) const303 uint32_t GetSuffixIndex(const char *suffix) const {
304 return suffix - GetSuffix(0);
305 }
306
307 // By default, nodes_, nexts_ and suffixes_ are read-only. This
308 // returns a writable element or array within and sets
309 // dirty_pages_[array_type] as a side effect, assuming the mutable
310 // area will get written to.
311 Node *GetMutableNode(uint32_t idx);
312 Next *GetMutableNextArray(uint32_t idx, uint32_t len);
313 char *GetMutableSuffix(uint32_t idx, uint32_t len);
314
315 // Update crcs based on current contents. Returns all_crc or kNoCrc.
316 Crc32 UpdateCrc();
317
318 // Calculates the current crc and returns it.
319 Crc32 GetCrc() const;
320
321 // Allocators.
322 uint32_t nodes_left() const;
323 uint32_t nexts_left() const;
324 uint32_t suffixes_left() const;
325
326 // REQUIRES: nodes_left() > 0.
327 Node *AllocNode();
328 // REQUIRES: nexts_left() >= kMaxNextArraySize.
329 libtextclassifier3::StatusOr<Next *> AllocNextArray(int size);
330 void FreeNextArray(Next *next, int log2_size);
331 // REQUIRES: suffixes_left() >= strlen(suffix) + 1 + value_size()
332 uint32_t MakeSuffix(std::string_view suffix, const void *value,
333 uint32_t *value_index);
334
hdr() const335 const IcingDynamicTrieHeader &hdr() const { return hdr_.hdr; }
336
value_size() const337 uint32_t value_size() const { return hdr().value_size(); }
338
339 void FillDirtyPageStats(Stats *stats) const;
340
inc_num_keys()341 void inc_num_keys() { hdr_.hdr.set_num_keys(hdr_.hdr.num_keys() + 1); }
342
dec_num_keys()343 void dec_num_keys() { hdr_.hdr.set_num_keys(hdr_.hdr.num_keys() - 1); }
344
345 private:
346 friend void IcingDynamicTrie::SetHeader(
347 const IcingDynamicTrieHeader &new_hdr);
348
349 enum ArrayType { NODE, NEXT, SUFFIX, NUM_ARRAY_TYPES };
350
351 // Returns all filenames that are part of the storage. First
352 // filename is the header and the rest correspond to ArrayType enum
353 // values.
354 static void GetFilenames(const std::string &file_basename,
355 std::vector<std::string> *filenames);
356 static std::string GetHeaderFilename(const std::string &file_basename);
357
358 // TODO(b/353398502) Improve the handling of the header to avoid this weird
359 // pattern with both an mmapped header and an in-memory header.
360 // Calculates the crc of the header as it is represented in the
361 // header_mmapper_.
362 Crc32 GetWrittenHeaderCrc() const;
363
364 // Calculates the crc of the hdr_.
365 //
366 // This value will deviate from GetWrittenHeaderCrc() if the header is
367 // modified after initialization or the last call to WriteHeader().
368 //
369 // NOTE: This function will need to serialize hdr_ to calculate the crc.
370 // Therefore, if given the choice, prefer to use GetWrittenHeaderCrc().
371 Crc32 GetHeaderCrc() const;
372
373 // Returns the crc of the cached crcs.
374 Crc32 GetCachedCrc() const;
375
376 Crc32 UpdateCrcInternal(bool write_hdr);
377
378 // Initializes hdr_ with options and writes the resulting header to disk.
379 bool CreateNewHeader(IcingScopedFd sfd, const Options &options);
380 bool WriteHeader();
381
382 // Header block. On-disk header block format is as follows:
383 //
384 // |serialized-header|pad|crcs|
385 // <--- system_page_size() --->
386
387 // Wrapper for header protobuf.
388 class Header {
389 // Serialized format:
390 //
391 // magic(4)|size(4)|serialized hdr(size)
392 static const uint32_t kMagic;
393 // TODO(b/77482303) : Remove version from the IcingFlashBitmap header -
394 // magic makes it unnecessary.
395 static const uint32_t kCurVersion;
396
397 public:
398 void Init(const Options &options);
399 bool Init(const uint8_t *buf, uint32_t buf_size);
Invalidate()400 void Invalidate() { hdr.Clear(); }
401 bool SerializeToArray(uint8_t *buf, uint32_t buf_size) const;
402 bool Verify();
403
404 IcingDynamicTrieHeader hdr;
405 };
406
407 std::string file_basename_;
408
409 Header hdr_;
410
411 IcingMMapper hdr_mmapper_;
412
413 struct Crcs {
414 uint32_t all_crc;
415 uint32_t header_crc;
416 uint32_t array_crcs[NUM_ARRAY_TYPES];
417 };
418 Crcs *cached_crcs_;
419
serialized_header_max()420 static uint32_t serialized_header_max() {
421 return IcingMMapper::system_page_size() - sizeof(Crcs);
422 }
423
424 static Crc32 GetAllCrcs(const Crcs &crcs);
425
426 RuntimeOptions runtime_options_;
427
428 // Info kept about each array (NODE, NEXT, SUFFIX) to manage
429 // storage.
430 IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
431 std::vector<IcingArrayStorage> array_storage_;
432
433 // Legacy file system. Switch to use the new Filesystem class instead.
434 const IcingFilesystem *filesystem_;
435 };
436
IcingDynamicTrieStorage(const std::string & file_basename,const RuntimeOptions & runtime_options,const IcingFilesystem * filesystem)437 IcingDynamicTrie::IcingDynamicTrieStorage::IcingDynamicTrieStorage(
438 const std::string &file_basename, const RuntimeOptions &runtime_options,
439 const IcingFilesystem *filesystem)
440 : file_basename_(file_basename),
441 hdr_mmapper_(false, MAP_SHARED),
442 cached_crcs_(nullptr),
443 runtime_options_(runtime_options),
444 array_storage_(NUM_ARRAY_TYPES, IcingArrayStorage(*filesystem)),
445 filesystem_(filesystem) {}
446
~IcingDynamicTrieStorage()447 IcingDynamicTrie::IcingDynamicTrieStorage::~IcingDynamicTrieStorage() {
448 if (is_initialized()) {
449 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
450 array_storage_[i].Reset();
451 }
452 }
453 }
454
GetFilenames(const std::string & file_basename,std::vector<std::string> * filenames)455 void IcingDynamicTrie::IcingDynamicTrieStorage::GetFilenames(
456 const std::string &file_basename, std::vector<std::string> *filenames) {
457 const char *kArrayFilenameSuffixes[NUM_ARRAY_TYPES] = {
458 ".n",
459 ".x",
460 ".s",
461 };
462
463 filenames->clear();
464 filenames->push_back(GetHeaderFilename(file_basename));
465 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
466 filenames->push_back(file_basename + kArrayFilenameSuffixes[i]);
467 }
468 }
469
GetHeaderFilename(const std::string & file_basename)470 std::string IcingDynamicTrie::IcingDynamicTrieStorage::GetHeaderFilename(
471 const std::string &file_basename) {
472 constexpr char kHeaderFilenameSuffix[] = ".h";
473 return file_basename + kHeaderFilenameSuffix;
474 }
475
Init()476 bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
477 bool init_crcs = false;
478 const bool map_shared =
479 runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc;
480
481 // Open files.
482 std::vector<std::string> filenames;
483 GetFilenames(file_basename_, &filenames);
484 for (size_t i = 0; i < filenames.size(); i++) {
485 uint64_t file_size = filesystem_->GetFileSize(filenames[i].c_str());
486 if (file_size == IcingFilesystem::kBadFileSize) {
487 goto failed;
488 }
489 IcingScopedFd sfd(filesystem_->OpenForWrite(filenames[i].c_str()));
490 if (!sfd.is_valid()) {
491 goto failed;
492 }
493 // The first filename is the header and the rest correspond to ArrayType
494 // enum values. The header's fd can be closed immediately after mmapping
495 // (see b/114830334). Other files' fds are tracked in array_fds_ for later
496 // closing.
497 if (i == 0) {
498 // Header.
499 if (file_size != IcingMMapper::system_page_size()) {
500 ICING_LOG(ERROR) << "Trie hdr wrong size: " << file_size;
501 goto failed;
502 }
503
504 // Open hdr.
505 hdr_mmapper_.Remap(sfd.get(), 0, IcingMMapper::system_page_size());
506 if (!hdr_mmapper_.is_valid()) {
507 ICING_LOG(ERROR) << "Trie map header failed";
508 goto failed;
509 }
510 } else {
511 array_fds_[i - 1] = std::move(sfd);
512 }
513 }
514
515 // Point crcs_ to correct region.
516 cached_crcs_ = reinterpret_cast<Crcs *>(hdr_mmapper_.address() +
517 serialized_header_max());
518 // Header hasn't been initialized yet. So we should check what's actually
519 // written to checksum the header.
520 if (cached_crcs_->header_crc == kNoCrc) {
521 // Create crcs.
522 cached_crcs_->header_crc = GetWrittenHeaderCrc().Get();
523
524 // Do the same for the arrays.
525 init_crcs = true;
526 } else {
527 // Verify crc.
528 if (cached_crcs_->header_crc != GetWrittenHeaderCrc().Get()) {
529 ICING_LOG(ERROR) << "Trie header crc failed";
530 goto failed;
531 }
532 }
533
534 // Deserialize and verify header.
535 if (!hdr_.Init(hdr_mmapper_.address(),
536 IcingMMapper::system_page_size() - sizeof(Crcs)) ||
537 !hdr_.Verify()) {
538 ICING_LOG(ERROR) << "Trie reading header failed";
539 goto failed;
540 }
541
542 // We have the header set up. Now read in the arrays.
543 if (!array_storage_[NODE].Init(array_fds_[NODE].get(), 0, map_shared,
544 sizeof(Node), hdr_.hdr.num_nodes(),
545 hdr_.hdr.max_nodes(),
546 &cached_crcs_->array_crcs[NODE], init_crcs)) {
547 ICING_LOG(ERROR) << "Trie mmap node failed";
548 goto failed;
549 }
550
551 if (!array_storage_[NEXT].Init(array_fds_[NEXT].get(), 0, map_shared,
552 sizeof(Next), hdr_.hdr.num_nexts(),
553 hdr_.hdr.max_nexts(),
554 &cached_crcs_->array_crcs[NEXT], init_crcs)) {
555 ICING_LOG(ERROR) << "Trie mmap next failed";
556 goto failed;
557 }
558
559 if (!array_storage_[SUFFIX].Init(
560 array_fds_[SUFFIX].get(), 0, map_shared, sizeof(char),
561 hdr_.hdr.suffixes_size(), hdr_.hdr.max_suffixes_size(),
562 &cached_crcs_->array_crcs[SUFFIX], init_crcs)) {
563 ICING_LOG(ERROR) << "Trie mmap suffix failed";
564 goto failed;
565 }
566
567 // All of the cached crcs are now up to date. Either calculate+set the all crc
568 // or calculate+check the all crc.
569 if (init_crcs) {
570 cached_crcs_->all_crc = GetCachedCrc().Get();
571 } else {
572 // Verify crc.
573 if (cached_crcs_->all_crc != GetCachedCrc().Get()) {
574 ICING_LOG(ERROR) << "Trie all crc failed";
575 goto failed;
576 }
577 }
578
579 return true;
580
581 failed:
582 cached_crcs_ = nullptr;
583 hdr_mmapper_.Unmap();
584 hdr_.Invalidate();
585 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
586 array_storage_[i].Reset();
587 array_fds_[i].reset();
588 }
589
590 return false;
591 }
592
CreateIfNotExist(const Options & options)593 bool IcingDynamicTrie::IcingDynamicTrieStorage::CreateIfNotExist(
594 const Options &options) {
595 std::vector<std::string> filenames;
596 GetFilenames(file_basename_, &filenames);
597
598 // Check already exists. Just header file check is enough.
599 if (filesystem_->FileExists(filenames[0].c_str())) {
600 return true;
601 }
602
603 // Ensure the storage directory exists
604 std::string storage_dir = filesystem_->GetDirname(filenames[0].c_str());
605 if (!filesystem_->CreateDirectoryRecursively(storage_dir.c_str())) {
606 return false;
607 }
608
609 // Create files.
610 for (size_t i = 0; i < filenames.size(); i++) {
611 IcingScopedFd sfd(filesystem_->OpenForWrite(filenames[i].c_str()));
612 if (!sfd.is_valid()) {
613 Remove(file_basename_, *filesystem_);
614 return false;
615 }
616
617 if (i == 0) {
618 if (!CreateNewHeader(std::move(sfd), options)) {
619 ICING_LOG(ERROR) << "Serialize trie header failed";
620 Remove(file_basename_, *filesystem_);
621 return false;
622 }
623 } else {
624 // Crcs are automatically kNoCrc so they will be initialized
625 // upon first call to Init.
626 if (!filesystem_->Truncate(*sfd, 0)) {
627 Remove(file_basename_, *filesystem_);
628 return false;
629 }
630 }
631 }
632 return true;
633 }
634
CreateNewHeader(IcingScopedFd sfd,const Options & options)635 bool IcingDynamicTrie::IcingDynamicTrieStorage::CreateNewHeader(
636 IcingScopedFd sfd, const Options &options) {
637 ICING_VLOG(1) << "Creating header with write+sync";
638 hdr_.Init(options);
639 auto buf = std::make_unique<uint8_t[]>(IcingMMapper::system_page_size());
640 // serialized_header_max must be less than system_page_size so we don't
641 // overflow buf when serializing the header.
642 if (serialized_header_max() > IcingMMapper::system_page_size()) {
643 ICING_LOG(FATAL) << "serialized_header_max exceeds system page size";
644 }
645
646 return hdr_.SerializeToArray(buf.get(), serialized_header_max()) &&
647 filesystem_->Write(sfd.get(), buf.get(),
648 IcingMMapper::system_page_size()) &&
649 filesystem_->DataSync(sfd.get());
650 }
651
Remove(const std::string & file_basename,const IcingFilesystem & filesystem)652 bool IcingDynamicTrie::IcingDynamicTrieStorage::Remove(
653 const std::string &file_basename, const IcingFilesystem &filesystem) {
654 bool success = true;
655 std::vector<std::string> files;
656 GetFilenames(file_basename, &files);
657 for (size_t i = 0; i < files.size(); i++) {
658 if (!filesystem.DeleteFile(files[i].c_str())) {
659 success = false;
660 }
661 }
662 return success;
663 }
664
Warm()665 void IcingDynamicTrie::IcingDynamicTrieStorage::Warm() {
666 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
667 array_storage_[i].Warm();
668 }
669 }
670
Clear()671 void IcingDynamicTrie::IcingDynamicTrieStorage::Clear() {
672 if (!is_initialized()) {
673 ICING_LOG(FATAL) << "DynamicTrie not initialized";
674 }
675
676 // Clear header.
677 hdr_.hdr.set_num_nodes(0);
678 hdr_.hdr.set_num_nexts(0);
679 hdr_.hdr.set_suffixes_size(0);
680 for (int i = 0; i < hdr_.hdr.free_lists_size(); i++) {
681 hdr_.hdr.set_free_lists(i, kInvalidNextIndex);
682 }
683 hdr_.hdr.set_num_keys(0);
684
685 // Clear array storage.
686 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
687 array_storage_[i].Clear();
688 }
689
690 // Copy to persistence.
691 WriteHeader();
692 }
693
Sync()694 bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
695 if (!is_initialized()) {
696 ICING_LOG(FATAL) << "DynamicTrie not initialized";
697 }
698
699 uint32_t total_flushed = 0;
700 bool success = true;
701
702 // Sync all array types.
703 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
704 total_flushed += array_storage_[i].Sync();
705 if (!filesystem_->DataSync(array_fds_[i].get())) {
706 ICING_LOG(ERROR) << "Unable to sync data for flushing";
707 success = false;
708 }
709 }
710
711 if (!WriteHeader()) {
712 ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
713 success = false;
714 }
715
716 // Need to update CRCs before we sync the header mmap.
717 UpdateCrcInternal(false);
718
719 // Sync header.
720 if (!hdr_mmapper_.Sync()) {
721 ICING_LOG(ERROR) << "Unable to sync trie header for flushing";
722 success = false;
723 }
724
725 if (total_flushed > 0) {
726 ICING_VLOG(1) << "Flushing " << total_flushed << " pages of trie";
727 }
728
729 return success;
730 }
731
GetDiskUsage() const732 uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetDiskUsage() const {
733 // Trie files themselves.
734 uint64_t total = 0;
735 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
736 IcingFilesystem::IncrementByOrSetInvalid(
737 filesystem_->GetDiskUsage(array_fds_[i].get()), &total);
738 }
739
740 // Header.
741 std::string header_filename = GetHeaderFilename(file_basename_);
742 IcingFilesystem::IncrementByOrSetInvalid(
743 filesystem_->GetFileDiskUsage(header_filename.c_str()), &total);
744
745 return total;
746 }
747
GetElementsFileSize() const748 uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetElementsFileSize()
749 const {
750 // Trie files themselves, exclude size of the header. These arrays are dense,
751 // not sparse, so use file size for more accurate numbers.
752 uint64_t total = 0;
753 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
754 IcingFilesystem::IncrementByOrSetInvalid(
755 filesystem_->GetFileSize(array_fds_[i].get()), &total);
756 }
757 return total;
758 }
759
AllocNode()760 IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() {
761 if (nodes_left() == 0) {
762 ICING_LOG(FATAL) << "No allocated nodes left";
763 }
764
765 hdr_.hdr.set_num_nodes(hdr_.hdr.num_nodes() + 1);
766 return GetMutableNode(hdr_.hdr.num_nodes() - 1);
767 }
768
769 libtextclassifier3::StatusOr<IcingDynamicTrie::Next *>
AllocNextArray(int size)770 IcingDynamicTrie::IcingDynamicTrieStorage::AllocNextArray(int size) {
771 if (size > kMaxNextArraySize) {
772 return absl_ports::InternalError(
773 "Array size exceeds the max 'next' array size");
774 }
775
776 if (nexts_left() < static_cast<uint32_t>(kMaxNextArraySize)) {
777 ICING_LOG(FATAL) << "'next' buffer not enough";
778 }
779
780 // Compute ceil(log2(size)).
781 int log2_size = 0;
782 while ((1 << log2_size) < size) log2_size++;
783 // Note: size <= aligned_size <= kMaxNextArraySize
784 int aligned_size = 1 << log2_size;
785
786 // Look in free list.
787 Next *ret;
788 if (hdr_.hdr.free_lists(log2_size) != kInvalidNextIndex) {
789 ret = GetMutableNextArray(hdr_.hdr.free_lists(log2_size), aligned_size);
790 uint32_t next_link = ret->next_index();
791 if (next_link != kInvalidNextIndex && next_link >= hdr_.hdr.max_nexts()) {
792 ICING_LOG(FATAL) << "'next' index is out of range";
793 }
794 hdr_.hdr.set_free_lists(log2_size, next_link);
795 } else {
796 // Allocate a new one.
797 ret = GetMutableNextArray(hdr_.hdr.num_nexts(), aligned_size);
798 hdr_.hdr.set_num_nexts(hdr_.hdr.num_nexts() + aligned_size);
799 }
800
801 // Fill with char 0xff so we are sorted properly.
802 for (int i = 0; i < aligned_size; i++) {
803 ResetMutableNext(ret[i]);
804 }
805 return ret;
806 }
807
FreeNextArray(Next * next,int log2_size)808 void IcingDynamicTrie::IcingDynamicTrieStorage::FreeNextArray(Next *next,
809 int log2_size) {
810 if (GetNextArrayIndex(next) + (1 << log2_size) > hdr_.hdr.max_nexts()) {
811 ICING_LOG(FATAL) << "'next' array is out of range";
812 }
813
814 // Put it in free list.
815 next->set_next_index(hdr_.hdr.free_lists(log2_size));
816 hdr_.hdr.set_free_lists(log2_size, GetNextArrayIndex(next));
817 }
818
MakeSuffix(std::string_view suffix,const void * value,uint32_t * value_index)819 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::MakeSuffix(
820 std::string_view suffix, const void *value, uint32_t *value_index) {
821 if (suffixes_left() < suffix.size() + 1 + value_size()) {
822 ICING_LOG(FATAL) << "'suffix' buffer not enough";
823 }
824
825 char *start = GetMutableSuffix(hdr_.hdr.suffixes_size(),
826 suffix.size() + 1 + value_size());
827 // Copy suffix.
828 memcpy(start, suffix.data(), suffix.size());
829 // Set a '\0' after suffix.
830 memset(start + suffix.size(), /*value=*/0, /*num=*/1);
831 // Copy value.
832 memcpy(start + suffix.size() + 1, value, value_size());
833 if (value_index) *value_index = GetSuffixIndex(start + suffix.size() + 1);
834 hdr_.hdr.set_suffixes_size(hdr_.hdr.suffixes_size() + suffix.size() + 1 +
835 value_size());
836
837 return GetSuffixIndex(start);
838 }
839
GetWrittenHeaderCrc() const840 Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::GetWrittenHeaderCrc() const {
841 std::string_view data(reinterpret_cast<const char *>(hdr_mmapper_.address()),
842 serialized_header_max());
843 return Crc32(data);
844 }
845
GetHeaderCrc() const846 Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::GetHeaderCrc() const {
847 // Create a buffer that is the same as the mmapped header.
848 auto hdr_data = std::make_unique<uint8_t[]>(serialized_header_max());
849 std::memcpy(hdr_data.get(), hdr_mmapper_.address(), serialized_header_max());
850
851 // Serialize the in-memory header to the buffer and then checksum it.
852 hdr_.SerializeToArray(hdr_data.get(), serialized_header_max());
853 std::string_view data(reinterpret_cast<const char *>(hdr_data.get()),
854 serialized_header_max());
855 return Crc32(data);
856 }
857
GetCachedCrc() const858 Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::GetCachedCrc() const {
859 return GetAllCrcs(*cached_crcs_);
860 }
861
GetAllCrcs(const IcingDynamicTrie::IcingDynamicTrieStorage::Crcs & crcs)862 /*static*/ Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::GetAllCrcs(
863 const IcingDynamicTrie::IcingDynamicTrieStorage::Crcs &crcs) {
864 // Append array crcs to header crc.
865 Crc32 crc(crcs.header_crc);
866 std::string_view data(reinterpret_cast<const char *>(crcs.array_crcs),
867 sizeof(cached_crcs_->array_crcs));
868 crc.Append(data);
869 return crc;
870 }
871
GetCrc() const872 Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::GetCrc() const {
873 // crcs_ holds cached values. We want the crcs that represent the content *as
874 // it exists now*.
875 Crcs crcs;
876 crcs.header_crc = GetHeaderCrc().Get();
877 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
878 crcs.array_crcs[i] = array_storage_[i].GetCrc().Get();
879 }
880 return GetAllCrcs(crcs);
881 }
882
UpdateCrc()883 Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrc() {
884 return UpdateCrcInternal(true);
885 }
886
UpdateCrcInternal(bool write_hdr)887 Crc32 IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrcInternal(
888 bool write_hdr) {
889 if (write_hdr && !WriteHeader()) {
890 ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
891 }
892
893 // Since we just wrote the header, GetHeaderCrc() and GetWrittenHeaderCrc()
894 // are equivalent. We call GetWrittenHeaderCrc() because it avoids serializing
895 // the header to recalculate the crc.
896 cached_crcs_->header_crc = GetWrittenHeaderCrc().Get();
897 for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
898 array_storage_[i].UpdateCrc();
899 }
900
901 // All of the cached crcs are now up to date.
902 Crc32 all_crc = GetAllCrcs(*cached_crcs_);
903 cached_crcs_->all_crc = all_crc.Get();
904 return all_crc;
905 }
906
WriteHeader()907 bool IcingDynamicTrie::IcingDynamicTrieStorage::WriteHeader() {
908 return hdr_.SerializeToArray(hdr_mmapper_.address(), serialized_header_max());
909 }
910
911 IcingDynamicTrie::Node *
GetMutableNode(uint32_t idx)912 IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableNode(uint32_t idx) {
913 return array_storage_[NODE].GetMutableMem<Node>(idx, 1);
914 }
915
916 IcingDynamicTrie::Next *
GetMutableNextArray(uint32_t idx,uint32_t len)917 IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableNextArray(uint32_t idx,
918 uint32_t len) {
919 return array_storage_[NEXT].GetMutableMem<Next>(idx, len);
920 }
921
GetMutableSuffix(uint32_t idx,uint32_t len)922 char *IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableSuffix(
923 uint32_t idx, uint32_t len) {
924 return array_storage_[SUFFIX].GetMutableMem<char>(idx, len);
925 }
926
927 // Header functions.
928 const uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::Header::kMagic =
929 0x6dfba6ae;
930 // For future revisions, this should be synced with global index version.
931 // See comments on Upgrade() in native-index-impl.h for versioning.
932 const uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::Header::kCurVersion =
933 4;
934
Init(const IcingDynamicTrie::Options & options)935 void IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
936 const IcingDynamicTrie::Options &options) {
937 hdr.Clear();
938
939 hdr.set_version(kCurVersion);
940 hdr.set_max_nodes(options.max_nodes);
941 hdr.set_max_nexts(options.max_nexts);
942 hdr.set_max_suffixes_size(options.max_suffixes_size);
943 hdr.set_value_size(options.value_size);
944
945 for (int i = 0; i < kNumNextAllocationBuckets; i++) {
946 hdr.add_free_lists(kInvalidNextIndex);
947 }
948 }
949
Init(const uint8_t * buf,uint32_t buf_size)950 bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
951 const uint8_t *buf, uint32_t buf_size) {
952 // Check magic and length.
953 if (buf_size <= sizeof(kMagic) + sizeof(uint32_t)) {
954 ICING_LOG(ERROR) << "Trie header too short";
955 return false;
956 }
957
958 uint32_t magic;
959 memcpy(&magic, buf, sizeof(magic));
960 if (magic != kMagic) {
961 ICING_LOG(ERROR) << "Trie header magic mismatch";
962 return false;
963 }
964 uint32_t len;
965 memcpy(&len, buf + sizeof(magic), sizeof(len));
966 if (len > buf_size - sizeof(magic) - sizeof(len)) {
967 ICING_LOG(ERROR) << "Trie header too short";
968 return false;
969 }
970
971 return hdr.ParseFromArray(buf + sizeof(magic) + sizeof(len), len);
972 }
973
SerializeToArray(uint8_t * buf,uint32_t buf_size) const974 bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
975 uint8_t *buf, uint32_t buf_size) const {
976 uint32_t size = hdr.ByteSizeLong();
977 if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
978 memcpy(buf, &kMagic, sizeof(kMagic));
979 memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
980 hdr.SerializeWithCachedSizesToArray(buf + sizeof(kMagic) + sizeof(uint32_t));
981 return true;
982 }
983
Verify()984 bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Verify() {
985 // Check version.
986 if (hdr.version() != kCurVersion) {
987 ICING_LOG(ERROR) << "Trie version " << hdr.version() << " mismatch";
988 return false;
989 }
990
991 // Check that indices in hdr are within bounds. Note that this is
992 // not a comprehensive integrity check for the entire trie.
993 if (hdr.num_nodes() > hdr.max_nodes() || hdr.num_nexts() > hdr.max_nexts() ||
994 hdr.suffixes_size() > hdr.max_suffixes_size() ||
995 hdr.value_size() >= hdr.max_suffixes_size()) {
996 ICING_LOG(ERROR) << "Trie header array size out of bounds";
997 return false;
998 }
999
1000 if (hdr.free_lists_size() != kNumNextAllocationBuckets) {
1001 ICING_LOG(ERROR) << "Bad number of free lists";
1002 return false;
1003 }
1004
1005 for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1006 if (hdr.free_lists(i) != kInvalidNextIndex &&
1007 hdr.free_lists(i) >= hdr.max_nexts()) {
1008 ICING_LOG(ERROR) << "Free list index out of bounds";
1009 return false;
1010 }
1011 }
1012
1013 return true;
1014 }
1015
nodes_left() const1016 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::nodes_left() const {
1017 return hdr_.hdr.max_nodes() - hdr_.hdr.num_nodes();
1018 }
1019
nexts_left() const1020 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::nexts_left() const {
1021 return hdr_.hdr.max_nexts() - hdr_.hdr.num_nexts();
1022 }
1023
suffixes_left() const1024 uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::suffixes_left() const {
1025 return hdr_.hdr.max_suffixes_size() - hdr_.hdr.suffixes_size();
1026 }
1027
FillDirtyPageStats(Stats * stats) const1028 void IcingDynamicTrie::IcingDynamicTrieStorage::FillDirtyPageStats(
1029 Stats *stats) const {
1030 stats->dirty_pages_nodes = array_storage_[NODE].num_dirty_pages();
1031 stats->dirty_pages_nexts = array_storage_[NEXT].num_dirty_pages();
1032 stats->dirty_pages_suffixes = array_storage_[SUFFIX].num_dirty_pages();
1033 }
1034
1035 // Dumper.
1036 class IcingDynamicTrie::Dumper {
1037 public:
Dumper(const IcingDynamicTrie & trie)1038 explicit Dumper(const IcingDynamicTrie &trie)
1039 : all_props_(trie), del_prop_(trie), storage_(trie.storage_.get()) {}
1040
Dump(std::ostream * pretty_print,std::vector<std::string> * keys) const1041 void Dump(std::ostream *pretty_print, std::vector<std::string> *keys) const {
1042 if (storage_->empty()) {
1043 *pretty_print << "(empty)\n";
1044 } else {
1045 DumpNodeRecursive("", *storage_->GetRootNode(), 0, pretty_print, keys);
1046 }
1047 }
1048
1049 private:
SuffixToValueAsString(const char * suffix) const1050 std::string SuffixToValueAsString(const char *suffix) const {
1051 int suffix_len = strlen(suffix);
1052 std::string ret;
1053 ret.reserve(storage_->value_size() * 2);
1054 for (uint32_t i = 0; i < storage_->value_size(); i++) {
1055 IcingStringUtil::SStringAppendF(&ret, 10, "%02x",
1056 suffix[suffix_len + 1 + i]);
1057 }
1058
1059 // Now dump set properties.
1060 uint32_t value_index = storage_->GetSuffixIndex(suffix + suffix_len + 1);
1061 if (del_prop_.HasProperty(value_index)) {
1062 ret += " (deleted)";
1063 }
1064 ret += " [";
1065 for (size_t i = 0; i < all_props_.size(); i++) {
1066 if (all_props_.HasProperty(i, value_index)) {
1067 IcingStringUtil::SStringAppendF(&ret, 10, "%zu", i);
1068 }
1069 }
1070 ret += ']';
1071
1072 return ret;
1073 }
1074
1075 // Inputs:
1076 // prefix - the key prefix of the current node (so we can rebuild the key)
1077 // node - the node we're at
1078 // level - how many levels deep we are in the trie
1079 // ret - the stream to pretty print to
1080 // keys - the keys encountered are appended to this
1081 //
1082 // REQUIRES: node is valid.
1083 // - Since we only invalidate Next to remove the edge from the trie and Node
1084 // is not invalidated after deletion, the caller should ensure that it
1085 // traverses correctly to a valid node according to the trie structure.
1086 // Calling this function with an invalid node is undefined behavior since
1087 // it could traverse into a deleted subtree, or invalid memory addresses.
1088 // - This also means storage_->empty() should be checked before calling this
1089 // function with the root node.
DumpNodeRecursive(const std::string & prefix,const Node & node,int level,std::ostream * ret,std::vector<std::string> * keys) const1090 void DumpNodeRecursive(const std::string &prefix, const Node &node, int level,
1091 std::ostream *ret,
1092 std::vector<std::string> *keys) const {
1093 // This function should be called only if the node is valid. The first call
1094 // is always from the root node, so it means the trie should not be empty at
1095 // this moment. Otherwise (root) node could contain invalid next_index(),
1096 // is_leaf(), and log2_num_children().
1097 if (node.is_leaf()) {
1098 // Dump suffix and value.
1099 for (int i = 0; i < level; i++) {
1100 *ret << ' ';
1101 }
1102 const char *suffix = storage_->GetSuffix(node.next_index());
1103 *ret << suffix;
1104 *ret << ' ';
1105 *ret << SuffixToValueAsString(suffix);
1106 *ret << '\n';
1107 keys->push_back(prefix + suffix);
1108 } else {
1109 // Go through each child (next) node. Print char and recursively
1110 // print trie underneath.
1111 for (uint32_t i = 0; i < (1U << node.log2_num_children()); i++) {
1112 const Next &next = *storage_->GetNext(node.next_index(), i);
1113 if (next.node_index() == kInvalidNodeIndex) break;
1114 for (int j = 0; j < level; j++) {
1115 *ret << ' ';
1116 }
1117 std::string new_prefix = prefix;
1118 if (next.val()) {
1119 *ret << static_cast<char>(next.val());
1120 new_prefix += next.val();
1121 } else {
1122 *ret << "null";
1123 }
1124 *ret << '\n';
1125 DumpNodeRecursive(new_prefix, *storage_->GetNode(next.node_index()),
1126 level + 1, ret, keys);
1127 }
1128 }
1129 }
1130
1131 PropertyReadersAll all_props_;
1132 PropertyDeletedReader del_prop_;
1133 const IcingDynamicTrie::IcingDynamicTrieStorage *storage_;
1134 };
1135
1136 // IcingDynamicTrie.
IcingDynamicTrie(const std::string & filename_base,const RuntimeOptions & runtime_options,const IcingFilesystem * filesystem)1137 IcingDynamicTrie::IcingDynamicTrie(const std::string &filename_base,
1138 const RuntimeOptions &runtime_options,
1139 const IcingFilesystem *filesystem)
1140 : IIcingStorage(),
1141 filename_base_(filename_base),
1142 is_initialized_(false),
1143 runtime_options_(runtime_options),
1144 storage_(nullptr),
1145 property_bitmaps_prefix_(filename_base_ + ".prop."),
1146 deleted_bitmap_filename_(filename_base_ + ".deleted"),
1147 deleted_bitmap_(nullptr),
1148 filesystem_(filesystem) {}
1149
~IcingDynamicTrie()1150 IcingDynamicTrie::~IcingDynamicTrie() { Close(); }
1151
Init()1152 bool IcingDynamicTrie::Init() {
1153 if (is_initialized_) return true;
1154
1155 if (storage_ != nullptr) {
1156 ICING_LOG(FATAL) << "Storage is not null before initialization";
1157 }
1158
1159 storage_ = std::make_unique<IcingDynamicTrieStorage>(
1160 filename_base_, runtime_options_, filesystem_);
1161 if (!storage_->Init() || !InitPropertyBitmaps()) {
1162 storage_.reset();
1163 return false;
1164 }
1165 is_initialized_ = true;
1166 return true;
1167 }
1168
CreateIfNotExist(const Options & options)1169 bool IcingDynamicTrie::CreateIfNotExist(const Options &options) {
1170 // Initialized means exists.
1171 if (is_initialized_) return true;
1172
1173 if (!options.is_valid()) {
1174 ICING_LOG(ERROR) << "Trie options invalid";
1175 return false;
1176 }
1177
1178 auto storage = std::make_unique<IcingDynamicTrieStorage>(
1179 filename_base_, runtime_options_, filesystem_);
1180 return storage->CreateIfNotExist(options);
1181 }
1182
Close()1183 void IcingDynamicTrie::Close() {
1184 if (!is_initialized_) return;
1185
1186 UpdateCrc();
1187
1188 storage_.reset();
1189 property_bitmaps_.clear();
1190 deleted_bitmap_.reset();
1191 is_initialized_ = false;
1192 }
1193
Remove()1194 bool IcingDynamicTrie::Remove() {
1195 if (is_initialized()) {
1196 Close();
1197 }
1198
1199 bool success = true;
1200
1201 // Remove storage files.
1202 if (!IcingDynamicTrieStorage::Remove(filename_base_, *filesystem_)) {
1203 success = false;
1204 }
1205
1206 // Also remove property bitmaps.
1207 std::vector<std::string> files;
1208 if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
1209 &files)) {
1210 return false;
1211 }
1212 for (size_t i = 0; i < files.size(); i++) {
1213 if (!filesystem_->DeleteFile(files[i].c_str())) success = false;
1214 }
1215 // And deleted bitmap.
1216 if (!filesystem_->DeleteFile(deleted_bitmap_filename_.c_str()))
1217 success = false;
1218
1219 return success;
1220 }
1221
Sync()1222 bool IcingDynamicTrie::Sync() {
1223 if (!is_initialized_) {
1224 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1225 }
1226
1227 bool success = true;
1228 IcingTimer timer;
1229
1230 // Sync property bitmaps.
1231 for (size_t i = 0; i < property_bitmaps_.size(); i++) {
1232 if (property_bitmaps_[i]) {
1233 if (!property_bitmaps_[i]->Sync()) success = false;
1234 }
1235 }
1236 if (!deleted_bitmap_->Sync()) success = false;
1237
1238 // Sync storage.
1239 if (!storage_->Sync()) success = false;
1240
1241 Warm();
1242
1243 ICING_VLOG(1) << "Syncing dynamic trie " << filename_base_.c_str() << " took "
1244 << timer.Elapsed() * 1000 << "ms";
1245
1246 return success;
1247 }
1248
GetDiskUsage() const1249 uint64_t IcingDynamicTrie::GetDiskUsage() const {
1250 uint64_t total = 0;
1251 // Property bitmaps.
1252 IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
1253 &total);
1254
1255 for (auto &bitmap : property_bitmaps_) {
1256 if (bitmap == nullptr) continue;
1257 IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
1258 }
1259
1260 // Storage.
1261 IcingFilesystem::IncrementByOrSetInvalid(storage_->GetDiskUsage(), &total);
1262 return total;
1263 }
1264
GetElementsSize() const1265 uint64_t IcingDynamicTrie::GetElementsSize() const {
1266 uint64_t total = 0;
1267
1268 // Bitmaps are sparsely populated, so disk usage is more accurate for those.
1269 // Property bitmaps.
1270 IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
1271 &total);
1272 // The deleted bitmap is always initially grown to kGrowSize, whether there
1273 // are elements or not. So even if there are no elements in the trie, we'll
1274 // still have the bitmap of size kGrowSize, so subtract that from the size of
1275 // the trie's elements.
1276 total -= IcingFlashBitmap::kGrowSize;
1277
1278 for (auto &bitmap : property_bitmaps_) {
1279 if (bitmap == nullptr) continue;
1280 IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
1281 }
1282
1283 // Storage. We can use file size here since the storage files aren't sparse.
1284 IcingFilesystem::IncrementByOrSetInvalid(storage_->GetElementsFileSize(),
1285 &total);
1286 return total;
1287 }
1288
OpenAndInitBitmap(const std::string & filename,bool verify,const IcingFilesystem * filesystem)1289 std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
1290 const std::string &filename, bool verify,
1291 const IcingFilesystem *filesystem) {
1292 auto bitmap = std::make_unique<IcingFlashBitmap>(filename, filesystem);
1293 if (!bitmap->Init() || (verify && !bitmap->Verify())) {
1294 ICING_LOG(ERROR) << "Init of " << filename.c_str() << " failed";
1295 return nullptr;
1296 }
1297 return bitmap;
1298 }
1299
InitPropertyBitmaps()1300 bool IcingDynamicTrie::InitPropertyBitmaps() {
1301 // Only called on init.
1302 if (!property_bitmaps_.empty()) {
1303 ICING_LOG(FATAL) << "Property bitmaps not empty before initialization";
1304 }
1305
1306 if (deleted_bitmap_ != nullptr) {
1307 ICING_LOG(FATAL) << "Deleted bitmap not null before initialization";
1308 }
1309
1310 // Truncate property bitmap files at current value index. Last value
1311 // is at suffixes_size - value_size(). We want to clear everything
1312 // after that.
1313 uint64_t truncate_idx =
1314 storage_->hdr().suffixes_size() > 0
1315 ? ValueIndexToPropertyBitmapIndex(storage_->hdr().suffixes_size() -
1316 value_size()) +
1317 1
1318 : 0;
1319
1320 // Discover property bitmaps by scanning the dir.
1321 std::vector<std::string> files;
1322 if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
1323 &files)) {
1324 ICING_LOG(ERROR) << "Could not get files at prefix "
1325 << property_bitmaps_prefix_;
1326 goto failed;
1327 }
1328 for (size_t i = 0; i < files.size(); i++) {
1329 // Decode property id from filename.
1330 size_t property_id_start_idx = files[i].rfind('.');
1331 if (property_id_start_idx == std::string::npos) {
1332 ICING_LOG(ERROR) << "Malformed filename " << files[i];
1333 continue;
1334 }
1335 property_id_start_idx++; // skip dot
1336 char *end;
1337 uint32_t property_id =
1338 strtol(files[i].c_str() + property_id_start_idx, &end, 10); // NOLINT
1339 if (!end || end != (files[i].c_str() + files[i].size())) {
1340 ICING_LOG(ERROR) << "Malformed filename " << files[i];
1341 continue;
1342 }
1343 std::unique_ptr<IcingFlashBitmap> bitmap = OpenAndInitBitmap(
1344 files[i],
1345 runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
1346 filesystem_);
1347 if (!bitmap) {
1348 ICING_LOG(ERROR) << "Open prop bitmap failed: " << files[i];
1349 goto failed;
1350 }
1351 bitmap->Truncate(truncate_idx);
1352 if (property_id >= property_bitmaps_.size()) {
1353 property_bitmaps_.resize(property_id + 1);
1354 }
1355 property_bitmaps_[property_id] = std::move(bitmap);
1356 }
1357
1358 deleted_bitmap_ = OpenAndInitBitmap(
1359 deleted_bitmap_filename_,
1360 runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
1361 filesystem_);
1362 if (!deleted_bitmap_) {
1363 goto failed;
1364 }
1365 deleted_bitmap_->Truncate(truncate_idx);
1366
1367 return true;
1368
1369 failed:
1370 property_bitmaps_.clear();
1371 deleted_bitmap_.reset();
1372 return false;
1373 }
1374
Warm() const1375 void IcingDynamicTrie::Warm() const {
1376 if (!is_initialized()) {
1377 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1378 }
1379
1380 return storage_->Warm();
1381 }
1382
size() const1383 uint32_t IcingDynamicTrie::size() const {
1384 if (!is_initialized()) {
1385 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1386 }
1387 return storage_->hdr().num_keys();
1388 }
1389
empty() const1390 bool IcingDynamicTrie::empty() const {
1391 if (!is_initialized()) {
1392 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1393 }
1394 return storage_->empty();
1395 }
1396
CollectStatsRecursive(const Node & node,Stats * stats,uint32_t depth) const1397 void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
1398 uint32_t depth) const {
1399 // This function should be called only if the node is valid. The first call is
1400 // always from the root node, so it means the trie should not be empty at this
1401 // moment. Otherwise (root) node could contain invalid next_index(),
1402 // is_leaf(), and log2_num_children().
1403 if (node.is_leaf()) {
1404 stats->num_leaves++;
1405 stats->sum_depth += depth;
1406 stats->max_depth = std::max(stats->max_depth, depth);
1407 const char *suffix = storage_->GetSuffix(node.next_index());
1408 stats->suffixes_used += strlen(suffix) + 1 + value_size();
1409 if (!suffix[0]) {
1410 stats->null_suffixes++;
1411 }
1412 } else {
1413 stats->num_intermediates++;
1414 uint32_t i = 0;
1415 for (; i < (1U << node.log2_num_children()); i++) {
1416 const Next &next = *storage_->GetNext(node.next_index(), i);
1417 if (next.node_index() == kInvalidNodeIndex) break;
1418 CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
1419 depth + 1);
1420 }
1421
1422 // At least one valid node in each next array
1423 if (i == 0) {
1424 ICING_LOG(FATAL) << "No valid node in 'next' array";
1425 }
1426 stats->sum_children += i;
1427 stats->max_children = std::max(stats->max_children, i);
1428
1429 stats->child_counts[i - 1]++;
1430 stats->wasted[node.log2_num_children()] +=
1431 (1 << node.log2_num_children()) - i;
1432 stats->total_wasted += (1 << node.log2_num_children()) - i;
1433 }
1434 }
1435
CollectStats(Stats * stats) const1436 void IcingDynamicTrie::CollectStats(Stats *stats) const {
1437 if (!is_initialized()) {
1438 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1439 }
1440
1441 memset(stats, 0, sizeof(*stats));
1442
1443 stats->num_keys = storage_->hdr().num_keys();
1444 stats->num_nodes = storage_->hdr().num_nodes();
1445 stats->max_nodes = storage_->hdr().max_nodes();
1446 stats->num_nexts = storage_->hdr().num_nexts();
1447 stats->max_nexts = storage_->hdr().max_nexts();
1448 stats->suffixes_size = storage_->hdr().suffixes_size();
1449 stats->max_suffixes_size = storage_->hdr().max_suffixes_size();
1450
1451 // Stats collected from traversing the trie.
1452 if (!storage_->empty()) {
1453 CollectStatsRecursive(*storage_->GetRootNode(), stats);
1454 }
1455
1456 // Free-list stats.
1457 for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1458 for (uint32_t cur = storage_->hdr().free_lists(i); cur != kInvalidNextIndex;
1459 cur = storage_->GetNext(cur, 0)->next_index()) {
1460 stats->num_free[i]++;
1461 }
1462 stats->total_free += stats->num_free[i] * (1 << i);
1463 }
1464
1465 // Dirty page counts.
1466 storage_->FillDirtyPageStats(stats);
1467 }
1468
DumpStats(int verbosity) const1469 std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
1470 std::string ret;
1471 IcingStringUtil::SStringAppendF(
1472 &ret, 0,
1473 "Keys %u "
1474 "Nodes (%u/%u) %.3f%% "
1475 "Nexts (%u/%u) %.3f%% "
1476 "Suffixes (%u/%u) %.3f%%\n",
1477 num_keys, num_nodes, max_nodes,
1478 100. * math_util::SafeDivide(num_nodes, max_nodes), num_nexts, max_nexts,
1479 100. * math_util::SafeDivide(num_nexts, max_nexts), suffixes_size,
1480 max_suffixes_size,
1481 100. * math_util::SafeDivide(suffixes_size, max_suffixes_size));
1482
1483 if (verbosity > 0) {
1484 for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1485 if (num_free[i] > 0) {
1486 IcingStringUtil::SStringAppendF(&ret, 0, "Freelist@%d: %u\n", 1 << i,
1487 num_free[i]);
1488 }
1489 }
1490 IcingStringUtil::SStringAppendF(
1491 &ret, 0, "Freelist total: %u/%u %.3f%%\n", total_free, num_nexts,
1492 100. * math_util::SafeDivide(total_free, num_nexts));
1493
1494 for (int i = 0; i < 256; i++) {
1495 if (child_counts[i] > 0) {
1496 IcingStringUtil::SStringAppendF(&ret, 0, "Child count@%d: %u\n", i + 1,
1497 child_counts[i]);
1498 }
1499 }
1500 for (int i = 0; i < kNumNextAllocationBuckets; i++) {
1501 IcingStringUtil::SStringAppendF(&ret, 0, "Wasted@%d: %u\n", 1 << i,
1502 wasted[i]);
1503 }
1504 IcingStringUtil::SStringAppendF(
1505 &ret, 0,
1506 "Wasted total: %u\n"
1507 "Num intermediates %u num leaves %u "
1508 "suffixes used %u null %u\n"
1509 "avg and max children for intermediates: %.3f, %u\n"
1510 "avg and max depth for leaves: %.3f, %u\n"
1511 "Total next frag: %.3f%%\n",
1512 total_wasted, num_intermediates, num_leaves, suffixes_used,
1513 null_suffixes, 1. * sum_children / num_intermediates, max_children,
1514 1. * sum_depth / num_leaves, max_depth,
1515 100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
1516 }
1517 IcingStringUtil::SStringAppendF(
1518 &ret, 0, "Memory usage: %zu/%zu bytes\n",
1519 num_nodes * sizeof(Node) + num_nexts * sizeof(Next) + suffixes_size,
1520 max_nodes * sizeof(Node) + max_nexts * sizeof(Next) + max_suffixes_size);
1521
1522 IcingStringUtil::SStringAppendF(
1523 &ret, 0, "Dirty pages: nodes %u/%.0f nexts %u/%.0f suffixes %u/%.0f\n",
1524 dirty_pages_nodes,
1525 math_util::SafeDivide(num_nodes * sizeof(Node) + getpagesize() - 1,
1526 getpagesize()),
1527 dirty_pages_nexts,
1528 math_util::SafeDivide(num_nexts * sizeof(Next) + getpagesize() - 1,
1529 getpagesize()),
1530 dirty_pages_suffixes,
1531 math_util::SafeDivide(suffixes_size + getpagesize() - 1, getpagesize()));
1532
1533 return ret;
1534 }
1535
DumpTrie(std::ostream * pretty_print,std::vector<std::string> * keys) const1536 void IcingDynamicTrie::DumpTrie(std::ostream *pretty_print,
1537 std::vector<std::string> *keys) const {
1538 if (!is_initialized()) {
1539 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1540 }
1541
1542 Dumper dumper(*this);
1543 dumper.Dump(pretty_print, keys);
1544 }
1545
Clear()1546 void IcingDynamicTrie::Clear() {
1547 if (!is_initialized()) {
1548 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1549 }
1550
1551 storage_->Clear();
1552 for (auto &bitmap : property_bitmaps_) {
1553 if (bitmap) {
1554 bitmap->Delete();
1555 bitmap.reset();
1556 }
1557 }
1558 deleted_bitmap_->Truncate(0);
1559 }
1560
ClearSuffixAndValue(uint32_t suffix_value_index)1561 bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) {
1562 // The size 1 below is for a '\0' between the suffix and the value.
1563 size_t suffix_and_value_length =
1564 strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 +
1565 this->value_size();
1566 char *mutable_suffix_and_value = this->storage_->GetMutableSuffix(
1567 suffix_value_index, suffix_and_value_length);
1568
1569 if (mutable_suffix_and_value == nullptr) {
1570 return false;
1571 }
1572
1573 memset(mutable_suffix_and_value, 0, suffix_and_value_length);
1574 return true;
1575 }
1576
ResetNext(uint32_t next_index)1577 bool IcingDynamicTrie::ResetNext(uint32_t next_index) {
1578 Next *mutable_next =
1579 this->storage_->GetMutableNextArray(next_index, /*len=*/1);
1580
1581 if (mutable_next == nullptr) {
1582 return false;
1583 }
1584 ResetMutableNext(*mutable_next);
1585 return true;
1586 }
1587
SortNextArray(const Node * node)1588 bool IcingDynamicTrie::SortNextArray(const Node *node) {
1589 if (node == nullptr) {
1590 // Nothing to sort, return success directly.
1591 return true;
1592 }
1593
1594 uint32_t next_array_buffer_size = 1u << node->log2_num_children();
1595 Next *next_array_start = this->storage_->GetMutableNextArray(
1596 node->next_index(), next_array_buffer_size);
1597
1598 if (next_array_start == nullptr) {
1599 return false;
1600 }
1601
1602 std::sort(next_array_start, next_array_start + next_array_buffer_size);
1603 return true;
1604 }
1605
Insert(std::string_view key,const void * value,uint32_t * value_index,bool replace,bool * pnew_key)1606 libtextclassifier3::Status IcingDynamicTrie::Insert(std::string_view key,
1607 const void *value,
1608 uint32_t *value_index,
1609 bool replace,
1610 bool *pnew_key) {
1611 if (!is_initialized()) {
1612 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1613 }
1614
1615 if (pnew_key) *pnew_key = false;
1616
1617 if (!IsKeyValid(key)) {
1618 return absl_ports::InvalidArgumentError(
1619 "Key cannot contain a null character '\\0'");
1620 }
1621
1622 // Find out ahead of time whether things will fit. A conservative
1623 // check based on allocations made below.
1624 //
1625 // IMPORTANT: This needs to be updated if the alloc patterns below
1626 // change.
1627 if (!(storage_->nodes_left() >= 2 + key.size() + 1 &&
1628 storage_->nexts_left() >= 2 + key.size() + 1 + kMaxNextArraySize &&
1629 storage_->suffixes_left() >= key.size() + 1 + value_size())) {
1630 return absl_ports::ResourceExhaustedError("No more space left");
1631 }
1632
1633 uint32_t best_node_index;
1634 int key_offset;
1635 FindBestNode(key, &best_node_index, &key_offset, false);
1636
1637 // A negative key_offset indicates that storage_ is empty
1638 if (key_offset < 0) {
1639 // First key.
1640 if (!storage_->empty()) {
1641 ICING_LOG(FATAL) << "Key offset is negative but storage is not empty, "
1642 "there're inconsistencies in dynamic trie.";
1643 }
1644 Node *node = storage_->AllocNode();
1645 node->set_next_index(storage_->MakeSuffix(key, value, value_index));
1646 node->set_is_leaf(true);
1647 node->set_log2_num_children(0);
1648 } else if (storage_->GetNode(best_node_index)->is_leaf()) {
1649 // Prefix in the trie. Split at leaf.
1650 Node *split_node = storage_->GetMutableNode(best_node_index);
1651 const char *prev_suffix = storage_->GetSuffix(split_node->next_index());
1652
1653 // Find the common prefix length starting from prev_suffix[0] and
1654 // key[key_offset].
1655 // - prev_suffix terminates with '\0'.
1656 // - key is a std::string_view object, so it may not be null-terminated.
1657 // - key doesn't contain '\0' because it's checked in IsKeyValid() above.
1658 int common_prefix_len = 0;
1659 while (key_offset + common_prefix_len < key.size() &&
1660 prev_suffix[common_prefix_len] ==
1661 key[key_offset + common_prefix_len]) {
1662 ++common_prefix_len;
1663 }
1664
1665 // Equal strings
1666 bool strings_equal = prev_suffix[common_prefix_len] == '\0' &&
1667 key_offset + common_prefix_len >= key.size();
1668 if (strings_equal) {
1669 if (value_index) {
1670 *value_index =
1671 storage_->GetSuffixIndex(prev_suffix + common_prefix_len + 1);
1672 }
1673 // Update value if replace == true and return.
1674 if (replace) {
1675 char *mutable_prev_suffix_cur = storage_->GetMutableSuffix(
1676 storage_->GetSuffixIndex(prev_suffix + common_prefix_len + 1),
1677 value_size());
1678 memcpy(mutable_prev_suffix_cur, value, value_size());
1679 }
1680 return libtextclassifier3::Status::OK;
1681 }
1682
1683 // Create single-branch children for the common prefix
1684 // length. After the loop, split_node points to the node that
1685 // will have more than 1 char.
1686 for (int i = 0; i < common_prefix_len; i++) {
1687 // Create a single-branch child node.
1688 ICING_ASSIGN_OR_RETURN(Next * split_next, storage_->AllocNextArray(1));
1689 split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
1690 split_node->set_is_leaf(false);
1691 split_node->set_log2_num_children(0);
1692 Node *child_node = storage_->AllocNode();
1693 split_next[0].set_val(*(prev_suffix + i));
1694 split_next[0].set_node_index(storage_->GetNodeIndex(child_node));
1695
1696 split_node = child_node;
1697 }
1698
1699 // Fill a split.
1700 ICING_ASSIGN_OR_RETURN(Next * split_next, storage_->AllocNextArray(2));
1701 split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
1702 split_node->set_is_leaf(false);
1703 split_node->set_log2_num_children(1);
1704 Node *prev_suffix_node = storage_->AllocNode();
1705 Node *key_node = storage_->AllocNode();
1706 split_next[0].set_val(*(prev_suffix + common_prefix_len));
1707 split_next[0].set_node_index(storage_->GetNodeIndex(prev_suffix_node));
1708 if (*(prev_suffix + common_prefix_len)) {
1709 uint32_t next_index =
1710 storage_->GetSuffixIndex(prev_suffix + common_prefix_len) + 1;
1711 prev_suffix_node->set_next_index(next_index);
1712 } else {
1713 uint32_t next_index =
1714 storage_->GetSuffixIndex(prev_suffix + common_prefix_len);
1715 prev_suffix_node->set_next_index(next_index);
1716 }
1717
1718 char next_val = GetCharOrNull(key, key_offset + common_prefix_len);
1719 prev_suffix_node->set_is_leaf(true);
1720 prev_suffix_node->set_log2_num_children(0);
1721 split_next[1].set_val(next_val);
1722 split_next[1].set_node_index(storage_->GetNodeIndex(key_node));
1723 if (next_val != '\0') {
1724 uint32_t next_index = storage_->MakeSuffix(
1725 key.substr(key_offset + common_prefix_len + 1), value, value_index);
1726 key_node->set_next_index(next_index);
1727 } else {
1728 uint32_t next_index = storage_->MakeSuffix(
1729 key.substr(key_offset + common_prefix_len), value, value_index);
1730 key_node->set_next_index(next_index);
1731 }
1732 key_node->set_is_leaf(true);
1733 key_node->set_log2_num_children(0);
1734
1735 std::sort(split_next, split_next + 2);
1736 } else {
1737 // Insert into intermediate node.
1738 const Node *best_node = storage_->GetNode(best_node_index);
1739
1740 // Add our value as a node + suffix.
1741 Node *new_leaf_node = storage_->AllocNode();
1742 if (key_offset < key.size()) {
1743 uint32_t next_index =
1744 storage_->MakeSuffix(key.substr(key_offset + 1), value, value_index);
1745 new_leaf_node->set_next_index(next_index);
1746 } else {
1747 uint32_t next_index =
1748 storage_->MakeSuffix(key.substr(key_offset), value, value_index);
1749 new_leaf_node->set_next_index(next_index);
1750 }
1751 new_leaf_node->set_is_leaf(true);
1752 new_leaf_node->set_log2_num_children(0);
1753
1754 // Figure out the real length of the existing next array.
1755 uint32_t next_array_buffer_size = 1u << best_node->log2_num_children();
1756 Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(),
1757 next_array_buffer_size);
1758 int next_len = GetValidNextsSize(cur_next, next_array_buffer_size);
1759 Next *new_next = cur_next;
1760 if (next_len == (next_array_buffer_size)) {
1761 // Allocate a new, larger, array.
1762 ICING_ASSIGN_OR_RETURN(new_next, storage_->AllocNextArray(next_len + 1));
1763 memcpy(new_next, cur_next, sizeof(Next) * next_len);
1764 }
1765
1766 // Write a link to our new leaf node and sort.
1767 new_next[next_len].set_val(GetCharOrNull(key, key_offset));
1768 new_next[next_len].set_node_index(storage_->GetNodeIndex(new_leaf_node));
1769 std::inplace_merge(new_next, new_next + next_len, new_next + next_len + 1);
1770 next_len++;
1771
1772 // If this was new, update the parent node and free the old next
1773 // array.
1774 if (new_next != cur_next) {
1775 Node *mutable_best_node =
1776 storage_->GetMutableNode(storage_->GetNodeIndex(best_node));
1777 mutable_best_node->set_next_index(storage_->GetNextArrayIndex(new_next));
1778 mutable_best_node->set_is_leaf(false);
1779 uint8_t log2_num_children = mutable_best_node->log2_num_children();
1780
1781 // 8 == log2(256)
1782 if (log2_num_children >= 8) {
1783 return absl_ports::InternalError(
1784 "Number of children exceeds the max allowed size");
1785 }
1786
1787 mutable_best_node->set_log2_num_children(log2_num_children + 1);
1788
1789 storage_->FreeNextArray(cur_next,
1790 mutable_best_node->log2_num_children() - 1);
1791 }
1792 }
1793
1794 // We added a new key.
1795 storage_->inc_num_keys();
1796
1797 if (pnew_key) *pnew_key = true;
1798 return libtextclassifier3::Status::OK;
1799 }
1800
GetValueAtIndex(uint32_t value_index) const1801 const void *IcingDynamicTrie::GetValueAtIndex(uint32_t value_index) const {
1802 if (!is_initialized()) {
1803 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1804 }
1805
1806 return static_cast<const void *>(storage_->GetSuffix(value_index));
1807 }
1808
SetValueAtIndex(uint32_t value_index,const void * value)1809 void IcingDynamicTrie::SetValueAtIndex(uint32_t value_index,
1810 const void *value) {
1811 if (!is_initialized()) {
1812 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1813 }
1814
1815 if (value_index > storage_->hdr().max_suffixes_size() - value_size()) {
1816 ICING_LOG(FATAL) << "Value index is out of range";
1817 }
1818
1819 memcpy(storage_->GetMutableSuffix(value_index, value_size()), value,
1820 value_size());
1821 }
1822
Find(std::string_view key,void * value,uint32_t * value_index) const1823 bool IcingDynamicTrie::Find(std::string_view key, void *value,
1824 uint32_t *value_index) const {
1825 if (!is_initialized()) {
1826 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1827 }
1828
1829 if (!IsKeyValid(key)) {
1830 return false;
1831 }
1832
1833 uint32_t best_node_index;
1834 int key_offset;
1835 FindBestNode(key, &best_node_index, &key_offset, false);
1836
1837 if (key_offset < 0) {
1838 return false;
1839 }
1840
1841 const Node *best_node = storage_->GetNode(best_node_index);
1842 if (!best_node->is_leaf()) {
1843 return false;
1844 }
1845
1846 std::string_view suffix(storage_->GetSuffix(best_node->next_index()));
1847 if (key.substr(key_offset) != suffix) {
1848 return false;
1849 }
1850
1851 uint32_t vidx = best_node->next_index() + suffix.size() + 1;
1852 if (value_index) *value_index = vidx;
1853 if (value) memcpy(value, storage_->GetSuffix(vidx), value_size());
1854 return true;
1855 }
1856
Iterator(const IcingDynamicTrie & trie,std::string prefix,bool reverse)1857 IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
1858 std::string prefix, bool reverse)
1859 : cur_key_(std::move(prefix)),
1860 cur_suffix_(nullptr),
1861 cur_suffix_len_(0),
1862 single_leaf_match_(false),
1863 reverse_(reverse),
1864 trie_(trie) {
1865 if (!trie.is_initialized()) {
1866 ICING_LOG(FATAL) << "DynamicTrie not initialized";
1867 }
1868
1869 Reset();
1870 }
1871
BranchToLeaf(uint32_t node_index,BranchType branch_type)1872 void IcingDynamicTrie::Iterator::BranchToLeaf(uint32_t node_index,
1873 BranchType branch_type) {
1874 // Go down the trie, following the left-most child until we hit a
1875 // leaf. Push to stack and cur_key nodes and chars as we go.
1876 // When reverse_ is true, the method will follow the right-most child.
1877 const Node *node = trie_.storage_->GetNode(node_index);
1878 while (!node->is_leaf()) {
1879 const Next *next_start = trie_.storage_->GetNext(node->next_index(), 0);
1880 int child_idx;
1881 if (branch_type == BranchType::kRightMost) {
1882 uint32_t next_array_size = 1u << node->log2_num_children();
1883 child_idx = trie_.GetValidNextsSize(next_start, next_array_size) - 1;
1884 } else {
1885 // node isn't a leaf. So it must have >0 children.
1886 // 0 is the left-most child.
1887 child_idx = 0;
1888 }
1889 const Next &child_next = next_start[child_idx];
1890 branch_stack_.push_back(Branch(node_index, child_idx));
1891 cur_key_.push_back(child_next.val());
1892
1893 node_index = child_next.node_index();
1894 node = trie_.storage_->GetNode(node_index);
1895 }
1896
1897 // We're at a leaf.
1898 cur_suffix_ = trie_.storage_->GetSuffix(
1899 trie_.storage_->GetNode(node_index)->next_index());
1900 cur_suffix_len_ = strlen(cur_suffix_);
1901 cur_key_.append(cur_suffix_, cur_suffix_len_);
1902 }
1903
Reset()1904 void IcingDynamicTrie::Iterator::Reset() {
1905 if (!IsKeyValid(cur_key_)) {
1906 // Set invalid and return.
1907 cur_suffix_ = nullptr;
1908 cur_suffix_len_ = 0;
1909 return;
1910 }
1911
1912 size_t strip_len = branch_stack_.size() + cur_suffix_len_;
1913
1914 if (cur_key_.size() < strip_len) {
1915 ICING_LOG(FATAL) << "Key size < visited trie depth + remaining suffix "
1916 "size, there're inconsistencies in dynamic trie";
1917 }
1918
1919 // Trim back cur_key_ to original prefix.
1920 cur_key_.resize(cur_key_.size() - strip_len);
1921 cur_suffix_ = nullptr;
1922 cur_suffix_len_ = 0;
1923 single_leaf_match_ = false;
1924 branch_stack_.clear();
1925
1926 // Nothing to do with an empty trie.
1927 if (trie_.storage_->empty()) return;
1928
1929 // Find node matching prefix.
1930 uint32_t node_index;
1931 int key_offset;
1932 trie_.FindBestNode(cur_key_, &node_index, &key_offset, true);
1933
1934 // Two cases/states:
1935 //
1936 // - Found an intermediate node. If we matched all of prefix
1937 // (cur_key_), BranchToLeaf.
1938 //
1939 // - Found a leaf node, which is the ONLY matching key for this
1940 // prefix. Check that suffix matches the prefix. Then we set
1941 // single_leaf_match_ = true and apply different logic for
1942 // Advance.
1943 if (key_offset < 0) {
1944 // A negative key_offset indicates that trie_.storage_ is empty
1945 ICING_LOG(FATAL) << "Trie storage is empty";
1946 }
1947
1948 const Node *best_node = trie_.storage_->GetNode(node_index);
1949 if (best_node->is_leaf() &&
1950 !strncmp(cur_key_.c_str() + key_offset,
1951 trie_.storage_->GetSuffix(best_node->next_index()),
1952 cur_key_.size() - key_offset)) {
1953 // Copy the entire suffix into the current key.
1954 cur_key_.resize(key_offset);
1955 cur_key_.append(trie_.storage_->GetSuffix(best_node->next_index()));
1956 cur_suffix_ = trie_.storage_->GetSuffix(best_node->next_index());
1957 cur_suffix_len_ = strlen(cur_suffix_);
1958 single_leaf_match_ = true;
1959 } else if (static_cast<size_t>(key_offset) == cur_key_.size()) {
1960 BranchType branch_type =
1961 (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
1962 BranchToLeaf(node_index, branch_type);
1963 }
1964 }
1965
Advance()1966 bool IcingDynamicTrie::Iterator::Advance() {
1967 if (!IsValid()) return false;
1968 if (single_leaf_match_) {
1969 // If we only have an exact match, the Advance logic does not
1970 // apply. Invalidate the iterator and return.
1971 cur_suffix_ = nullptr;
1972 cur_suffix_len_ = 0;
1973 return false;
1974 }
1975
1976 if (cur_key_.size() < (branch_stack_.size() + cur_suffix_len_)) {
1977 ICING_LOG(FATAL) << "Key size < visited trie depth + remaining suffix "
1978 "size, there're inconsistencies in dynamic trie";
1979 }
1980
1981 // Move up from the current leaf.
1982 cur_key_.resize(cur_key_.size() - cur_suffix_len_);
1983 cur_suffix_ = nullptr;
1984 cur_suffix_len_ = 0;
1985
1986 while (!branch_stack_.empty()) {
1987 Branch *branch = &branch_stack_.back();
1988 const Node *node = trie_.storage_->GetNode(branch->node_idx);
1989 if (reverse_) {
1990 branch->child_idx--;
1991 } else {
1992 branch->child_idx++;
1993 }
1994 if (branch->child_idx >= 0 &&
1995 branch->child_idx < (1 << node->log2_num_children())) {
1996 const Next *child_next =
1997 trie_.storage_->GetNext(node->next_index(), branch->child_idx);
1998 if (child_next->node_index() != kInvalidNodeIndex) {
1999 // Successfully incremented to the next child. Update the char
2000 // value at this depth.
2001 cur_key_[cur_key_.size() - 1] = child_next->val();
2002 // We successfully found a sub-trie to explore.
2003 BranchType branch_type =
2004 (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
2005 BranchToLeaf(child_next->node_index(), branch_type);
2006 return true;
2007 }
2008 }
2009 branch_stack_.pop_back();
2010 cur_key_.resize(cur_key_.size() - 1);
2011 }
2012
2013 // Un-wound the entire stack. We are done.
2014 return false;
2015 }
2016
IsValid() const2017 bool IcingDynamicTrie::Iterator::IsValid() const {
2018 return cur_suffix_ != nullptr;
2019 }
2020
GetKey() const2021 std::string_view IcingDynamicTrie::Iterator::GetKey() const {
2022 // cur_key_ can have a NULL in it so cur_key_ can be wrong but
2023 // cur_key_.c_str() is always right.
2024 return IsValid() ? cur_key_.c_str() : std::string_view();
2025 }
2026
GetValue() const2027 const void *IcingDynamicTrie::Iterator::GetValue() const {
2028 if (!IsValid()) return nullptr;
2029
2030 return static_cast<const void *>(cur_suffix_ + cur_suffix_len_ + 1);
2031 }
2032
GetValueIndex() const2033 uint32_t IcingDynamicTrie::Iterator::GetValueIndex() const {
2034 if (!IsValid()) return kInvalidSuffixIndex;
2035
2036 return trie_.storage_->GetSuffixIndex(cur_suffix_ + cur_suffix_len_ + 1);
2037 }
2038
LeftBranchToUtf8End()2039 void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() {
2040 if (cur_len_ <= 0) {
2041 ICING_LOG(FATAL) << "Invalid UTF-8 character length";
2042 }
2043
2044 if (branch_end_ - branch_stack_ != cur_len_) {
2045 ICING_LOG(FATAL) << "Depth from first visited node to last visited node "
2046 "doesn't match the current UTF-8 character length";
2047 }
2048
2049 // Use branch at top of stack to determine where to follow.
2050 const Branch &branch = *(branch_end_ - 1);
2051 const Node *node = trie_.storage_->GetNode(branch.child->node_index());
2052
2053 // If we start with non-ascii, take all left branches while there is
2054 // a continuation byte.
2055 if (!i18n_utils::IsAscii(cur_[cur_len_ - 1])) {
2056 while (!node->is_leaf()) {
2057 if (cur_len_ >= U8_MAX_LENGTH) break;
2058
2059 InitBranch(branch_end_, node, 0);
2060 // When we are looking to complete a utf8 char, skip 0s.
2061 if (branch_end_->child->val() == 0) {
2062 // Check if we already have a valid cur_.
2063 cur_[cur_len_] = 0;
2064 UChar32 uchar32 = i18n_utils::GetUChar32At(cur_, cur_len_, 0);
2065 if (uchar32 == i18n_utils::kInvalidUChar32 &&
2066 node->log2_num_children() > 0) {
2067 branch_end_->child++;
2068 } else {
2069 // Good termination. Just break.
2070 break;
2071 }
2072 }
2073
2074 if (!IcingStringUtil::IsContinuationByte(branch_end_->child->val()))
2075 break;
2076
2077 cur_[cur_len_++] = branch_end_->child->val();
2078 node = trie_.storage_->GetNode(branch_end_->child->node_index());
2079 branch_end_++;
2080 }
2081
2082 cur_logical_node_.node = node;
2083
2084 // Maybe go into suffixes and set suffix_offset.
2085 if (node->is_leaf()) {
2086 GoIntoSuffix(node);
2087 } else {
2088 cur_logical_node_.suffix_offset = 0;
2089 }
2090 } else { // ascii
2091 cur_logical_node_.node = node;
2092 cur_logical_node_.suffix_offset = 0;
2093 }
2094
2095 // NULL-terminate.
2096 cur_[cur_len_] = 0;
2097 }
2098
GoIntoSuffix(const Node * node)2099 void IcingDynamicTrie::Utf8Iterator::GoIntoSuffix(const Node *node) {
2100 const char *suffix = trie_.storage_->GetSuffix(node->next_index());
2101 const char *cur_suffix;
2102 for (cur_suffix = suffix; cur_len_ < U8_MAX_LENGTH &&
2103 IcingStringUtil::IsContinuationByte(*cur_suffix);
2104 cur_suffix++) {
2105 cur_[cur_len_++] = *cur_suffix;
2106 }
2107 cur_logical_node_.suffix_offset = cur_suffix - suffix;
2108 }
2109
Reset()2110 void IcingDynamicTrie::Utf8Iterator::Reset() {
2111 cur_[0] = 0;
2112 cur_len_ = 0;
2113 branch_end_ = branch_stack_;
2114
2115 if (start_node_) {
2116 // Take the first char node's children.
2117 const Next *next = trie_.storage_->GetNext(start_node_->next_index(), 0);
2118 branch_end_->node = start_node_;
2119 branch_end_->child_end = next + (1 << start_node_->log2_num_children());
2120 if (next->val() == 0) {
2121 // Skip any nulls at this position. We don't return empty string
2122 // as an iteration.
2123 next++;
2124 }
2125 branch_end_->child = next;
2126 cur_[cur_len_++] = next->val();
2127 branch_end_++;
2128
2129 // Will NULL-terminate cur_.
2130 LeftBranchToUtf8End();
2131 } else {
2132 // Nothing to return.
2133 cur_logical_node_.node = nullptr;
2134 cur_logical_node_.suffix_offset = 0;
2135 }
2136 }
2137
Advance()2138 bool IcingDynamicTrie::Utf8Iterator::Advance() {
2139 if (!IsValid()) return false;
2140
2141 // Clip to branch.
2142 cur_len_ = branch_end_ - branch_stack_;
2143
2144 while (branch_end_ > branch_stack_) {
2145 Branch *branch = branch_end_ - 1;
2146 branch->child++;
2147 if (!branch->IsFinished()) {
2148 // Successfully incremented to the next child. Update the char
2149 // value at this depth.
2150 cur_[cur_len_ - 1] = branch->child->val();
2151
2152 // We successfully found a sub-trie to explore.
2153 LeftBranchToUtf8End();
2154 return true;
2155 }
2156 cur_len_--;
2157 branch_end_--;
2158 }
2159
2160 // Un-wound the entire stack. We are done.
2161 return false;
2162 }
2163
InitBranch(Branch * branch,const Node * start,char key_char)2164 void IcingDynamicTrie::Utf8Iterator::InitBranch(Branch *branch,
2165 const Node *start,
2166 char key_char) {
2167 branch->node = start;
2168 branch->child = trie_.storage_->GetNext(start->next_index(), 0);
2169 branch->child_end = branch->child + (1 << start->log2_num_children());
2170 if (key_char) {
2171 branch->child =
2172 trie_.LowerBound(branch->child, branch->child_end, key_char);
2173 }
2174 }
2175
IsFinished()2176 bool IcingDynamicTrie::Utf8Iterator::Branch::IsFinished() {
2177 return child >= child_end || child->node_index() == kInvalidNodeIndex;
2178 }
2179
IsValid() const2180 bool IcingDynamicTrie::Utf8Iterator::IsValid() const { return cur_len_ > 0; }
2181
GetNextByChar(const Node * node,uint8_t key_char) const2182 const IcingDynamicTrie::Next *IcingDynamicTrie::GetNextByChar(
2183 const Node *node, uint8_t key_char) const {
2184 const Next *next_start = storage_->GetNext(node->next_index(), 0);
2185 const Next *next_end = next_start + (1 << node->log2_num_children());
2186
2187 const Next *found = LowerBound(next_start, next_end, key_char);
2188 if (found >= next_end || found->val() != key_char ||
2189 found->node_index() == kInvalidNodeIndex) {
2190 return nullptr;
2191 }
2192
2193 return found;
2194 }
2195
GetValidNextsSize(const IcingDynamicTrie::Next * next_array_start,int next_array_length) const2196 int IcingDynamicTrie::GetValidNextsSize(
2197 const IcingDynamicTrie::Next *next_array_start,
2198 int next_array_length) const {
2199 // Only searching for key char 0xff is not sufficient, as 0xff can be a valid
2200 // character. We must also specify kInvalidNodeIndex as the target node index
2201 // when searching the next array.
2202 return LowerBound(next_array_start, next_array_start + next_array_length,
2203 /*key_char=*/0xff, /*node_index=*/kInvalidNodeIndex) -
2204 next_array_start;
2205 }
2206
LowerBound(const Next * start,const Next * end,uint8_t key_char,uint32_t node_index) const2207 const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound(
2208 const Next *start, const Next *end, uint8_t key_char,
2209 uint32_t node_index) const {
2210 // Above this value will use binary search instead of linear
2211 // search. 16 was chosen from running some benchmarks with
2212 // different values.
2213 static const uint32_t kBinarySearchCutoff = 16;
2214
2215 Next key_next(key_char, node_index);
2216 if (end - start >= kBinarySearchCutoff) {
2217 // Binary search.
2218 return std::lower_bound(start, end, key_next);
2219 } else {
2220 // Linear search.
2221 const Next *found;
2222 for (found = start; found < end; found++) {
2223 if (!(*found < key_next)) {
2224 // Should have gotten match.
2225 break;
2226 }
2227 }
2228 return found;
2229 }
2230 }
2231
FindBestNode(std::string_view key,uint32_t * best_node_index,int * key_offset,bool prefix,bool utf8) const2232 void IcingDynamicTrie::FindBestNode(std::string_view key,
2233 uint32_t *best_node_index, int *key_offset,
2234 bool prefix, bool utf8) const {
2235 // Find the best node such that:
2236 //
2237 // - If key is NOT in the trie, key[0..key_offset) is a prefix to
2238 // everything under best_node_index.
2239 //
2240 // - If key is in the trie, best_node_index is the leaf that points
2241 // to the key suffix and key_offset == strlen(key).
2242 //
2243 // If prefix is true, when key is both in the trie AND a prefix
2244 // (e.g. "ab" and "abc" are in the trie), we return the intermediate
2245 // node with key as the prefix as opposed to the exactly matching
2246 // leaf node.
2247 if (storage_->empty()) {
2248 *best_node_index = 0;
2249 *key_offset = -1;
2250 return;
2251 }
2252
2253 const Node *cur_node = storage_->GetRootNode();
2254 int cur_key_idx = 0;
2255 int utf8_key_idx = 0;
2256 const Node *utf8_node = cur_node;
2257 while (!cur_node->is_leaf()) {
2258 char cur_char = GetCharOrNull(key, cur_key_idx);
2259 const Next *found = GetNextByChar(cur_node, cur_char);
2260 if (!found) break;
2261
2262 if (prefix && found->val() == 0) {
2263 break;
2264 }
2265
2266 cur_node = storage_->GetNode(found->node_index());
2267
2268 // End of key.
2269 if (cur_key_idx >= key.size()) {
2270 break;
2271 }
2272
2273 ++cur_key_idx;
2274 cur_char = GetCharOrNull(key, cur_key_idx);
2275
2276 if (utf8 && i18n_utils::IsLeadUtf8Byte(cur_char)) {
2277 utf8_node = cur_node;
2278 utf8_key_idx = cur_key_idx;
2279 }
2280 }
2281
2282 if (utf8) {
2283 // Rewind.
2284 cur_node = utf8_node;
2285 cur_key_idx = utf8_key_idx;
2286 }
2287
2288 *best_node_index = storage_->GetNodeIndex(cur_node);
2289 *key_offset = cur_key_idx;
2290 }
2291
FindNewBranchingPrefixLength(std::string_view key,bool utf8) const2292 int IcingDynamicTrie::FindNewBranchingPrefixLength(std::string_view key,
2293 bool utf8) const {
2294 if (!IsKeyValid(key)) {
2295 return kNoBranchFound;
2296 }
2297
2298 if (storage_->empty()) {
2299 return kNoBranchFound;
2300 }
2301
2302 uint32_t best_node_index;
2303 int key_offset;
2304 FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8);
2305 if (key_offset < 0) {
2306 return kNoBranchFound;
2307 }
2308
2309 const Node *cur_node = storage_->GetNode(best_node_index);
2310 if (cur_node->is_leaf()) {
2311 // Prefix in the trie. Split at leaf.
2312 const char *prev_suffix = storage_->GetSuffix(cur_node->next_index());
2313 int additional_branch_prefix_len = 0;
2314 // Find the additional prefix length starting from prev_suffix[0] and
2315 // key[key_offset].
2316 // - prev_suffix terminates with '\0'.
2317 // - key is a std::string_view object, so it may not be null-terminated.
2318 // - key doesn't contain '\0' because it's checked in IsKeyValid() above.
2319 while (key_offset + additional_branch_prefix_len < key.size() &&
2320 prev_suffix[additional_branch_prefix_len] ==
2321 key[key_offset + additional_branch_prefix_len]) {
2322 ++additional_branch_prefix_len;
2323 }
2324
2325 // Equal strings. No branching.
2326 bool strings_equal =
2327 prev_suffix[additional_branch_prefix_len] == '\0' &&
2328 key_offset + additional_branch_prefix_len >= key.size();
2329 if (strings_equal) {
2330 return kNoBranchFound;
2331 }
2332
2333 // The remaining key (after key_offset) is a prefix of the suffix, so the
2334 // branching prefix length is key length.
2335 if (key_offset + additional_branch_prefix_len >= key.size()) {
2336 return key.size();
2337 }
2338
2339 if (utf8) {
2340 // Rewind to utf8 boundary.
2341 return i18n_utils::SafeTruncateUtf8Length(
2342 key.data(), key_offset + additional_branch_prefix_len);
2343 }
2344
2345 return key_offset + additional_branch_prefix_len;
2346 } else if (cur_node->log2_num_children() == 0) {
2347 // Intermediate node going from no branching to branching.
2348 return key_offset;
2349 }
2350
2351 // If we've reached this point, then we're already at a branch point. So there
2352 // is no *new* branch point.
2353 return kNoBranchFound;
2354 }
2355
FindBranchingPrefixLengths(std::string_view key,bool utf8) const2356 std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(
2357 std::string_view key, bool utf8) const {
2358 std::vector<int> prefix_lengths;
2359
2360 if (!IsKeyValid(key)) {
2361 return prefix_lengths;
2362 }
2363
2364 if (storage_->empty()) {
2365 return prefix_lengths;
2366 }
2367
2368 const Node *cur_node = storage_->GetRootNode();
2369 int idx = 0;
2370 while (idx < key.size() && !cur_node->is_leaf()) {
2371 // Branching prefix?
2372 if (cur_node->log2_num_children() > 0) {
2373 int len = idx;
2374 if (utf8) {
2375 // Do not cut mid-utf8. Walk up to utf8 boundary.
2376 len = i18n_utils::SafeTruncateUtf8Length(key.data(), len);
2377 if (prefix_lengths.empty() || len != prefix_lengths.back()) {
2378 prefix_lengths.push_back(len);
2379 }
2380 } else {
2381 prefix_lengths.push_back(len);
2382 }
2383 }
2384
2385 // Move to next.
2386 const Next *found = GetNextByChar(cur_node, key[idx]);
2387 if (found == nullptr) {
2388 break;
2389 }
2390 cur_node = storage_->GetNode(found->node_index());
2391
2392 ++idx;
2393 }
2394 return prefix_lengths;
2395 }
2396
IsBranchingTerm(std::string_view key) const2397 bool IcingDynamicTrie::IsBranchingTerm(std::string_view key) const {
2398 if (!is_initialized()) {
2399 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2400 }
2401
2402 if (!IsKeyValid(key)) {
2403 return false;
2404 }
2405
2406 if (storage_->empty()) {
2407 return false;
2408 }
2409
2410 uint32_t best_node_index;
2411 int key_offset;
2412 FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true);
2413 if (key_offset < 0) {
2414 return false;
2415 }
2416
2417 const Node *cur_node = storage_->GetNode(best_node_index);
2418
2419 if (cur_node->is_leaf()) {
2420 return false;
2421 }
2422
2423 // There is no intermediate node for key in the trie.
2424 if (key_offset < key.size()) {
2425 return false;
2426 }
2427
2428 // Found key as an intermediate node, but key is not a valid term stored in
2429 // the trie. In this case, we need at least two children for key to be a
2430 // branching term.
2431 if (GetNextByChar(cur_node, '\0') == nullptr) {
2432 return cur_node->log2_num_children() >= 1;
2433 }
2434
2435 // The intermediate node for key must have more than two children for key to
2436 // be a branching term, one of which represents the leaf node for key itself.
2437 return cur_node->log2_num_children() > 1;
2438 }
2439
GetDebugInfo(int verbosity,std::string * out) const2440 void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
2441 Stats stats;
2442 CollectStats(&stats);
2443 out->append(stats.DumpStats(verbosity));
2444
2445 // Property files.
2446 std::vector<std::string> files;
2447 if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
2448 &files)) {
2449 ICING_LOG(ERROR) << "Could not get files at prefix "
2450 << property_bitmaps_prefix_;
2451 return;
2452 }
2453 for (size_t i = 0; i < files.size(); i++) {
2454 IcingStringUtil::SStringAppendF(
2455 out, 1000, "Prop file %s size %" PRIu64 "\n",
2456 filesystem_->GetBasename(files[i].c_str()).c_str(),
2457 filesystem_->GetFileSize(files[i].c_str()));
2458 }
2459 IcingStringUtil::SStringAppendF(
2460 out, 1000, "Deleted file %s size %" PRIu64 "\n",
2461 filesystem_->GetBasename(deleted_bitmap_filename_.c_str()).c_str(),
2462 filesystem_->GetFileSize(deleted_bitmap_filename_.c_str()));
2463 }
2464
min_free_fraction() const2465 double IcingDynamicTrie::min_free_fraction() const {
2466 if (!is_initialized()) {
2467 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2468 }
2469
2470 return 1.0 -
2471 std::max(std::max(static_cast<double>(storage_->hdr().num_nodes()) /
2472 storage_->hdr().max_nodes(),
2473 static_cast<double>(storage_->hdr().num_nexts()) /
2474 storage_->hdr().max_nexts()),
2475 static_cast<double>(storage_->hdr().suffixes_size()) /
2476 storage_->hdr().max_suffixes_size());
2477 }
2478
value_size() const2479 uint32_t IcingDynamicTrie::value_size() const {
2480 return storage_->hdr().value_size();
2481 }
2482
max_value_index() const2483 uint32_t IcingDynamicTrie::max_value_index() const {
2484 return storage_->hdr().max_suffixes_size();
2485 }
2486
UpdateCrc()2487 Crc32 IcingDynamicTrie::UpdateCrc() {
2488 if (!is_initialized()) {
2489 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2490 }
2491
2492 if (runtime_options_.storage_policy != RuntimeOptions::kMapSharedWithCrc) {
2493 return Crc32();
2494 }
2495
2496 // Combine storage crc with property bitmap crcs.
2497 Crc32 crc = storage_->UpdateCrc();
2498
2499 // Update crcs on bitmaps.
2500 for (size_t i = 0; i < property_bitmaps_.size(); ++i) {
2501 if (property_bitmaps_[i]) {
2502 // Combine property id with the bitmap crc.
2503 uint64_t property_crc = property_bitmaps_[i]->UpdateCrc().Get();
2504 property_crc = (property_crc << 32) | i;
2505 std::string_view property_crc_str(
2506 reinterpret_cast<const char *>(&property_crc), sizeof(property_crc));
2507 crc.Append(property_crc_str);
2508 }
2509 }
2510 uint32_t deleted_crc = deleted_bitmap_->UpdateCrc().Get();
2511 std::string_view deleted_crc_str(reinterpret_cast<const char *>(&deleted_crc),
2512 sizeof(deleted_crc));
2513 crc.Append(deleted_crc_str);
2514 return crc;
2515 }
2516
GetCrc() const2517 Crc32 IcingDynamicTrie::GetCrc() const {
2518 if (!is_initialized()) {
2519 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2520 }
2521
2522 if (runtime_options_.storage_policy != RuntimeOptions::kMapSharedWithCrc) {
2523 return Crc32();
2524 }
2525
2526 // Combine storage crc with property bitmap crcs.
2527 Crc32 crc = storage_->GetCrc();
2528
2529 // Update crcs on bitmaps.
2530 for (size_t i = 0; i < property_bitmaps_.size(); ++i) {
2531 if (property_bitmaps_[i]) {
2532 // Combine property id with the bitmap crc.
2533 uint64_t property_crc = property_bitmaps_[i]->GetCrc().Get();
2534 property_crc = (property_crc << 32) | i;
2535 std::string_view property_crc_str(
2536 reinterpret_cast<const char *>(&property_crc), sizeof(property_crc));
2537 crc.Append(property_crc_str);
2538 }
2539 }
2540 uint32_t deleted_crc = deleted_bitmap_->UpdateCrc().Get();
2541 std::string_view deleted_crc_str(reinterpret_cast<const char *>(&deleted_crc),
2542 sizeof(deleted_crc));
2543 crc.Append(deleted_crc_str);
2544 return crc;
2545 }
2546
OpenOrCreatePropertyBitmap(uint32_t property_id)2547 IcingFlashBitmap *IcingDynamicTrie::OpenOrCreatePropertyBitmap(
2548 uint32_t property_id) {
2549 if (!is_initialized()) {
2550 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2551 }
2552
2553 if (property_id > kMaxPropertyId) {
2554 ICING_LOG(ERROR) << "Property id " << property_id << " out of range";
2555 return nullptr;
2556 }
2557
2558 if (property_id >= property_bitmaps_.size()) {
2559 property_bitmaps_.resize(property_id + 1);
2560 }
2561 if (!property_bitmaps_[property_id]) {
2562 std::string filename;
2563 IcingStringUtil::SStringAppendF(
2564 &filename, property_bitmaps_prefix_.size() + 10, "%s%u",
2565 property_bitmaps_prefix_.c_str(), property_id);
2566 property_bitmaps_[property_id] =
2567 OpenAndInitBitmap(filename, false, filesystem_);
2568 }
2569 return property_bitmaps_[property_id].get();
2570 }
2571
SetProperty(uint32_t value_index,uint32_t property_id)2572 bool IcingDynamicTrie::SetProperty(uint32_t value_index, uint32_t property_id) {
2573 IcingFlashBitmap *bitmap = OpenOrCreatePropertyBitmap(property_id);
2574 if (!bitmap) {
2575 return false;
2576 }
2577 uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2578
2579 // Also clear deleted bit.
2580 return bitmap->SetBit(idx, true) && deleted_bitmap_->SetBit(idx, false);
2581 }
2582
ClearProperty(uint32_t value_index,uint32_t property_id)2583 bool IcingDynamicTrie::ClearProperty(uint32_t value_index,
2584 uint32_t property_id) {
2585 if (property_id >= property_bitmaps_.size() ||
2586 !property_bitmaps_[property_id]) {
2587 // No bitmap is ok for clearing.
2588 return true;
2589 }
2590
2591 uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2592 return property_bitmaps_[property_id]->SetBit(idx, false);
2593 }
2594
SetDeleted(uint32_t value_index)2595 bool IcingDynamicTrie::SetDeleted(uint32_t value_index) {
2596 uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2597 return deleted_bitmap_->SetBit(idx, true);
2598 }
2599
ClearDeleted(uint32_t value_index)2600 bool IcingDynamicTrie::ClearDeleted(uint32_t value_index) {
2601 uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
2602 return deleted_bitmap_->SetBit(idx, false);
2603 }
2604
2605 // Steps:
2606 // 1. Find the key in the trie.
2607 // 2. Remove the suffix and the value.
2608 // 3. Reset the nexts that point to the nodes to be removed.
2609 // 4. Sort any next array if needed.
2610 // 5. Reset the trie state if the trie is empty after deletion.
2611 // - This is essential for storage_->empty(), which is a critical check for
2612 // all trie APIs before accessing the root node via
2613 // storage_->GetRootNode().
2614 // - When the trie is empty, it is possible that the root node (i.e.
2615 // Node(0)):
2616 // - Contains an invalid next_index(), and accessing it will cause a crash
2617 // or fetch incorrect data.
2618 // - Points to a valid next array but the next elements in the array
2619 // contain kInvalidNodeIndex. Accessing the next node via the next
2620 // element will cause a crash or fetch incorrect data.
2621 // - So we must reset the trie state to make sure storage_->empty() works
2622 // correctly and prevents trie APIs from accessing the root node.
Delete(std::string_view key)2623 bool IcingDynamicTrie::Delete(std::string_view key) {
2624 if (!is_initialized()) {
2625 ICING_LOG(ERROR) << "DynamicTrie not initialized";
2626 return false;
2627 }
2628
2629 if (!IsKeyValid(key)) {
2630 return false;
2631 }
2632
2633 if (storage_->empty()) {
2634 // Nothing to delete.
2635 return true;
2636 }
2637
2638 // Tries to find the key in the trie, starting from the root.
2639 const Node *current_node = storage_->GetRootNode();
2640
2641 // The node after which we start to remove data.
2642 const Node *last_multichild_node = nullptr;
2643
2644 // While visiting the trie nodes, we store the indices of Nexts that point
2645 // to all the nodes after last_multichild_node. Those nodes must be
2646 // consecutive and all have only one child. Resetting those Nexts means that
2647 // we remove the data of the key.
2648 std::vector<uint32_t> nexts_to_reset;
2649 nexts_to_reset.reserve(key.length());
2650
2651 // Iterates through chars in the key, finds nodes in the trie until a leaf
2652 // node is reached. The max number of loops is key.length() + 1 because we
2653 // start from the root.
2654 for (size_t i = 0; i <= key.length(); ++i) {
2655 if (current_node->is_leaf()) {
2656 // Leaf node, now check the suffix.
2657 if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) {
2658 // Key does not exist in the trie, nothing to delete.
2659 return true;
2660 }
2661 // Otherwise, key is found.
2662 break;
2663 }
2664
2665 // Finds the next char.
2666 const Next *next;
2667 if (i == key.length()) {
2668 // When we're at the end of the key, the next char is the termination char
2669 // '\0'.
2670 next = GetNextByChar(current_node, '\0');
2671 } else {
2672 next = GetNextByChar(current_node, key[i]);
2673 }
2674
2675 if (next == nullptr) {
2676 // Key does not exist in the trie, nothing to delete.
2677 return true;
2678 }
2679
2680 // Checks the real size of next array.
2681 uint32_t next_array_buffer_size = 1u << current_node->log2_num_children();
2682 Next *next_array_start = storage_->GetMutableNextArray(
2683 current_node->next_index(), next_array_buffer_size);
2684 int valid_next_array_size =
2685 GetValidNextsSize(next_array_start, next_array_buffer_size);
2686 if (valid_next_array_size == 0) {
2687 // Key does not exist in the trie, nothing to delete.
2688 // This shouldn't happen, but we put a sanity check here in case something
2689 // is wrong.
2690 return true;
2691 } else if (valid_next_array_size == 1) {
2692 // Single-child branch will be deleted.
2693 nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
2694 } else {
2695 // We see a new node with multiple children, all the previously seen nodes
2696 // shouldn't be removed.
2697 last_multichild_node = current_node;
2698 nexts_to_reset.clear();
2699 nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
2700 }
2701
2702 // Updates current_node.
2703 current_node = storage_->GetNode(next->node_index());
2704 }
2705 // Now we've found the key in the trie.
2706
2707 ClearSuffixAndValue(current_node->next_index());
2708
2709 // Resets nexts to remove key information.
2710 for (uint32_t next_index : nexts_to_reset) {
2711 ResetNext(next_index);
2712 }
2713
2714 if (last_multichild_node != nullptr) {
2715 SortNextArray(last_multichild_node);
2716 uint32_t next_array_buffer_size =
2717 1u << last_multichild_node->log2_num_children();
2718 Next *next_array_start = this->storage_->GetMutableNextArray(
2719 last_multichild_node->next_index(), next_array_buffer_size);
2720 uint32_t num_children =
2721 GetValidNextsSize(next_array_start, next_array_buffer_size);
2722 // Shrink the next array if we can.
2723 if (num_children == next_array_buffer_size / 2) {
2724 Node *mutable_node = storage_->GetMutableNode(
2725 storage_->GetNodeIndex(last_multichild_node));
2726 mutable_node->set_log2_num_children(mutable_node->log2_num_children() -
2727 1);
2728 // Add the unused second half of the next array to the free list.
2729 storage_->FreeNextArray(next_array_start + next_array_buffer_size / 2,
2730 mutable_node->log2_num_children());
2731 }
2732 }
2733
2734 storage_->dec_num_keys();
2735 if (storage_->hdr().num_keys() == 0) {
2736 // Reset the trie state to empty by calling Clear() directly.
2737 //
2738 // Note: in this case, last_multichild_node will be nullptr as well.
2739 // - If we never saw a node with multiple children before deletion, then all
2740 // the traversed nodes, including the root node, are single-child nodes
2741 // before deletion.
2742 // - Therefore, after deletion, there should be no valid nodes or nexts in
2743 // the trie.
2744 Clear();
2745 }
2746
2747 return true;
2748 }
2749
ClearPropertyForAllValues(uint32_t property_id)2750 bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
2751 if (!is_initialized()) {
2752 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2753 }
2754
2755 PropertyReadersAll readers(*this);
2756 if (!readers.Exists(property_id)) {
2757 ICING_VLOG(1) << "Properties for id " << property_id << " don't exist";
2758 return true;
2759 }
2760
2761 // Mark values that have no other properties set as as deleted.
2762 uint64_t max_idx =
2763 ValueIndexToPropertyBitmapIndex(storage_->hdr().suffixes_size());
2764 // TODO(vishwajith) Inefficient to do this bit by bit, should be word by
2765 // word. Removing a corpus is likely rare enough that this is low priority.
2766 for (uint64_t i = 0; i < max_idx; ++i) {
2767 // See if the bit is set in our property map.
2768 if (readers.IsPropertyUnique(property_id, i)) {
2769 deleted_bitmap_->SetBit(i, true);
2770 }
2771 }
2772
2773 // Now delete the bitmap file for this property.
2774 std::unique_ptr<IcingFlashBitmap> bitmap(
2775 std::move(property_bitmaps_[property_id]));
2776 // bitmap cannot be null here, because then readers.Exists(property_id) would
2777 // have returned false earlier, and we wouldn't get here.
2778 if (bitmap == nullptr) {
2779 ICING_LOG(ERROR) << "Property bitmap is null";
2780 return false;
2781 }
2782
2783 return bitmap->Delete();
2784 }
2785
Exists() const2786 bool IcingDynamicTrie::PropertyReaderBase::Exists() const {
2787 return bitmap_ != nullptr;
2788 }
2789
HasProperty(uint32_t value_index) const2790 bool IcingDynamicTrie::PropertyReaderBase::HasProperty(
2791 uint32_t value_index) const {
2792 return bitmap_ &&
2793 bitmap_->GetBit(trie_.ValueIndexToPropertyBitmapIndex(value_index));
2794 }
2795
PropertyReaderBase(const IcingDynamicTrie & trie,bool deleted,uint32_t property_id)2796 IcingDynamicTrie::PropertyReaderBase::PropertyReaderBase(
2797 const IcingDynamicTrie &trie, bool deleted, uint32_t property_id)
2798 : trie_(trie) {
2799 if (!trie.is_initialized()) {
2800 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2801 }
2802
2803 if (deleted) {
2804 bitmap_ = trie.deleted_bitmap_.get();
2805 } else if (property_id < trie.property_bitmaps_.size()) {
2806 bitmap_ = trie.property_bitmaps_[property_id].get();
2807 } else {
2808 bitmap_ = nullptr;
2809 }
2810 }
2811
PropertyReadersAll(const IcingDynamicTrie & trie)2812 IcingDynamicTrie::PropertyReadersAll::PropertyReadersAll(
2813 const IcingDynamicTrie &trie)
2814 : trie_(trie) {
2815 if (!trie.is_initialized()) {
2816 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2817 }
2818 }
2819
Exists(uint32_t property_id) const2820 bool IcingDynamicTrie::PropertyReadersAll::Exists(uint32_t property_id) const {
2821 return property_id < trie_.property_bitmaps_.size() &&
2822 trie_.property_bitmaps_[property_id];
2823 }
2824
HasProperty(uint32_t property_id,uint32_t value_index) const2825 bool IcingDynamicTrie::PropertyReadersAll::HasProperty(
2826 uint32_t property_id, uint32_t value_index) const {
2827 return property_id < trie_.property_bitmaps_.size() &&
2828 trie_.property_bitmaps_[property_id] &&
2829 trie_.property_bitmaps_[property_id]->GetBit(
2830 trie_.ValueIndexToPropertyBitmapIndex(value_index));
2831 }
2832
IsPropertyUnique(uint32_t property_id,uint32_t value_index) const2833 bool IcingDynamicTrie::PropertyReadersAll::IsPropertyUnique(
2834 uint32_t property_id, uint32_t value_index) const {
2835 uint32_t idx = trie_.ValueIndexToPropertyBitmapIndex(value_index);
2836
2837 // First check that value is set for the requested id.
2838 if (property_id >= trie_.property_bitmaps_.size() ||
2839 !trie_.property_bitmaps_[property_id] ||
2840 !trie_.property_bitmaps_[property_id]->GetBit(idx)) {
2841 return false;
2842 }
2843
2844 // Now check that the value is not set for the rest.
2845 for (size_t i = 0; i < trie_.property_bitmaps_.size(); ++i) {
2846 if (i == property_id) {
2847 continue;
2848 }
2849 if (trie_.property_bitmaps_[i] && trie_.property_bitmaps_[i]->GetBit(idx)) {
2850 return false;
2851 }
2852 }
2853 return true;
2854 }
2855
size() const2856 size_t IcingDynamicTrie::PropertyReadersAll::size() const {
2857 return trie_.property_bitmaps_.size();
2858 }
2859
ValueIndexToPropertyBitmapIndex(uint32_t value_index) const2860 uint64_t IcingDynamicTrie::ValueIndexToPropertyBitmapIndex(
2861 uint32_t value_index) const {
2862 // We know that value indices are separated by at least 1 +
2863 // value_size() bytes (for the null terminator and the value).
2864 return value_index / (value_size() + 1);
2865 }
2866
2867 // Testing hooks.
GetHeader(IcingDynamicTrieHeader * hdr) const2868 void IcingDynamicTrie::GetHeader(IcingDynamicTrieHeader *hdr) const {
2869 if (!is_initialized()) {
2870 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2871 }
2872
2873 *hdr = storage_->hdr();
2874 }
2875
SetHeader(const IcingDynamicTrieHeader & new_hdr)2876 void IcingDynamicTrie::SetHeader(const IcingDynamicTrieHeader &new_hdr) {
2877 if (!is_initialized()) {
2878 ICING_LOG(FATAL) << "DynamicTrie not initialized";
2879 }
2880
2881 storage_->hdr_.hdr = new_hdr;
2882 storage_->WriteHeader();
2883 }
2884
2885 } // namespace lib
2886 } // namespace icing
2887