xref: /aosp_15_r20/external/icing/icing/file/file-backed-proto-log.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // The implementation in this file is deprecated and replaced by
18 // portable-file-backed-proto-log.h.
19 //
20 // This deprecated implementation has been made read-only for the purposes of
21 // migration; writing and erasing this format of log is no longer supported and
22 // the methods to accomplish this have been removed.
23 //
24 // The details of this format follow below:
25 // Each proto written to the file will have a metadata written just before it.
26 // The metadata consists of
27 //   {
28 //     1 bytes of kProtoMagic;
29 //     3 bytes of the proto size
30 //     n bytes of the proto itself
31 //   }
32 // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
33 // migration method.
34 #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
35 #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
36 
37 #include <cstdint>
38 #include <memory>
39 #include <string>
40 #include <string_view>
41 
42 #include "icing/text_classifier/lib3/utils/base/statusor.h"
43 #include "icing/absl_ports/canonical_errors.h"
44 #include "icing/absl_ports/str_cat.h"
45 #include "icing/file/constants.h"
46 #include "icing/file/filesystem.h"
47 #include "icing/file/memory-mapped-file.h"
48 #include "icing/legacy/core/icing-string-util.h"
49 #include "icing/portable/gzip_stream.h"
50 #include "icing/portable/platform.h"
51 #include "icing/portable/zlib.h"
52 #include "icing/util/crc32.h"
53 #include "icing/util/data-loss.h"
54 #include "icing/util/logging.h"
55 #include "icing/util/status-macros.h"
56 #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
57 
58 namespace icing {
59 namespace lib {
60 
61 template <typename ProtoT>
62 class FileBackedProtoLog {
63  public:
64   struct Options {
65     // Whether to compress each proto before writing to the proto log.
66     bool compress;
67 
68     // Byte-size limit for each proto written to the store. This does not
69     // include the bytes needed for the metadata of each proto.
70     //
71     // NOTE: Currently, we only support protos up to 16MiB. We store the proto
72     // size in 3 bytes within the metadata.
73     //
74     // NOTE: This limit is only enforced for future writes. If the store
75     // previously had a higher limit, then reading older entries could return
76     // larger protos.
77     //
78     // NOTE: The max_proto_size is the upper limit for input protos into the
79     // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
80     // to a smaller size, ProtoLog will not accept it. Protos that result in a
81     // compressed size larger than max_proto_size are also not accepted.
82     const int32_t max_proto_size;
83 
84     // Must specify values for options.
85     Options() = delete;
86     explicit Options(bool compress_in,
87                      const int32_t max_proto_size_in = constants::kMaxProtoSize)
compressOptions88         : compress(compress_in), max_proto_size(max_proto_size_in) {}
89   };
90 
91   // Header stored at the beginning of the file before the rest of the log
92   // contents. Stores metadata on the log.
93   struct Header {
94     static constexpr int32_t kMagic = 0xf4c6f67a;
95 
96     // Holds the magic as a quick sanity check against file corruption.
97     int32_t magic = kMagic;
98 
99     // Whether to compress the protos before writing to the log.
100     bool compress = true;
101 
102     // The maximum proto size that can be written to the log.
103     int32_t max_proto_size = 0;
104 
105     // Checksum of the log elements, doesn't include the header fields.
106     uint32_t log_checksum = 0;
107 
108     // Last known good offset at which the log and its checksum were updated.
109     // If we crash between writing to the log and updating the checksum, we can
110     // try to rewind the log to this offset and verify the checksum is still
111     // valid instead of throwing away the entire log.
112     int64_t rewind_offset = sizeof(Header);
113 
114     // Must be at the end. Contains the crc checksum of the preceding fields.
115     uint32_t header_checksum = 0;
116 
CalculateHeaderChecksumHeader117     uint32_t CalculateHeaderChecksum() const {
118       Crc32 crc;
119       std::string_view header_str(reinterpret_cast<const char*>(this),
120                                   offsetof(Header, header_checksum));
121       crc.Append(header_str);
122       return crc.Get();
123     }
124   };
125 
126   struct CreateResult {
127     // A successfully initialized log.
128     std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
129 
130     // The data status after initializing from a previous state. Data loss can
131     // happen if the file is corrupted or some previously added data was
132     // unpersisted. This may be used to signal that any derived data off of the
133     // proto log may need to be regenerated.
134     DataLoss data_loss;
135 
has_data_lossCreateResult136     bool has_data_loss() {
137       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
138     }
139   };
140 
141   // Factory method to create, initialize, and return a FileBackedProtoLog. Will
142   // create the file if it doesn't exist.
143   //
144   // If on re-initialization the log detects disk corruption or some previously
145   // added data was unpersisted, the log will rewind to the last-good state. The
146   // log saves these checkpointed "good" states when PersistToDisk() is called
147   // or the log is safely destructed. If the log rewinds successfully to the
148   // last-good state, then the returned CreateResult.data_loss indicates
149   // whether it has a data loss and what kind of data loss it is (partial or
150   // complete) so that any derived data may know that it needs to be updated. If
151   // the log re-initializes successfully without any data loss,
152   // CreateResult.data_loss will be NONE.
153   //
154   // Params:
155   //   filesystem: Handles system level calls
156   //   file_path: Path of the underlying file. Directory of the file should
157   //   already exist
158   //   options: Configuration options for the proto log
159   //
160   // Returns:
161   //   FileBackedProtoLog::CreateResult on success
162   //   INVALID_ARGUMENT on an invalid option
163   //   INTERNAL_ERROR on IO error
164   static libtextclassifier3::StatusOr<CreateResult> Create(
165       const Filesystem* filesystem, const std::string& file_path,
166       const Options& options);
167 
168   // Not copyable
169   FileBackedProtoLog(const FileBackedProtoLog&) = delete;
170   FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
171 
172   // Reads out a proto located at file_offset from the file.
173   //
174   // Returns:
175   //   A proto on success
176   //   NOT_FOUND if the proto at the given offset has been erased
177   //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
178   //   INTERNAL_ERROR on IO error
179   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
180 
181   // An iterator helping to find offsets of all the protos in file.
182   // Example usage:
183   //
184   // while (iterator.Advance().ok()) {
185   //   int64_t offset = iterator.GetOffset();
186   //   // Do something
187   // }
188   class Iterator {
189    public:
190     explicit Iterator(const Filesystem& filesystem,
191                       const std::string& file_path, int64_t initial_offset,
192                       MemoryMappedFile&& mmapped_file);
193 
194     // Advances to the position of next proto whether it has been erased or not.
195     //
196     // Returns:
197     //   OK on success
198     //   OUT_OF_RANGE_ERROR if it reaches the end
199     //   INTERNAL_ERROR on IO error
200     libtextclassifier3::Status Advance();
201 
202     // Returns the file offset of current proto.
203     int64_t GetOffset();
204 
205    private:
206     static constexpr int64_t kInvalidOffset = -1;
207     // Used to read proto metadata
208     MemoryMappedFile mmapped_file_;
209     // Offset of first proto
210     int64_t initial_offset_;
211     int64_t current_offset_;
212     int64_t file_size_;
213   };
214 
215   // Returns an iterator of current proto log. The caller needs to keep the
216   // proto log unchanged while using the iterator, otherwise unexpected
217   // behaviors could happen.
218   libtextclassifier3::StatusOr<Iterator> GetIterator();
219 
220  private:
221   // Object can only be instantiated via the ::Create factory.
222   FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
223                      std::unique_ptr<Header> header);
224 
225   // Initializes a new proto log.
226   //
227   // Returns:
228   //   std::unique_ptr<CreateResult> on success
229   //   INTERNAL_ERROR on IO error
230   static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
231       const Filesystem* filesystem, const std::string& file_path,
232       const Options& options);
233 
234   // Verifies that the existing proto log is in a good state. If not in a good
235   // state, then the proto log may be truncated to the last good state and
236   // content will be lost.
237   //
238   // Returns:
239   //   std::unique_ptr<CreateResult> on success
240   //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
241   //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
242   //     instances
243   static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
244       const Filesystem* filesystem, const std::string& file_path,
245       const Options& options, int64_t file_size);
246 
247   // Takes an initial checksum and updates it with the content between `start`
248   // and `end` offsets in the file.
249   //
250   // Returns:
251   //   Crc of the content between `start`, inclusive, and `end`, exclusive.
252   //   INTERNAL_ERROR on IO error
253   //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
254   static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
255       const Filesystem* filesystem, const std::string& file_path,
256       Crc32 initial_crc, int64_t start, int64_t end);
257 
IsEmptyBuffer(const char * buffer,int size)258   static bool IsEmptyBuffer(const char* buffer, int size) {
259     return std::all_of(buffer, buffer + size,
260                        [](const char byte) { return byte == 0; });
261   }
262 
263   // Helper function to get stored proto size from the metadata.
264   // Metadata format: 8 bits magic + 24 bits size
GetProtoSize(int metadata)265   static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
266 
267   // Helper function to get stored proto magic from the metadata.
268   // Metadata format: 8 bits magic + 24 bits size
GetProtoMagic(int metadata)269   static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
270 
271   // Reads out the metadata of a proto located at file_offset from the file.
272   //
273   // Returns:
274   //   Proto's metadata on success
275   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
276   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
277   static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
278       MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
279 
280   // Magic number added in front of every proto. Used when reading out protos
281   // as a first check for corruption in each entry in the file. Even if there is
282   // a corruption, the best we can do is roll back to our last recovery point
283   // and throw away un-flushed data. We can discard/reuse this byte if needed so
284   // that we have 4 bytes to store the size of protos, and increase the size of
285   // protos we support.
286   static constexpr uint8_t kProtoMagic = 0x5C;
287 
288   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
289   // Only used on 32-bit devices
290   static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
291 
292   ScopedFd fd_;
293   const Filesystem* const filesystem_;
294   const std::string file_path_;
295   std::unique_ptr<Header> header_;
296 };
297 
298 template <typename ProtoT>
FileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header)299 FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
300                                                const std::string& file_path,
301                                                std::unique_ptr<Header> header)
302     : filesystem_(filesystem),
303       file_path_(file_path),
304       header_(std::move(header)) {
305   fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
306 }
307 
308 template <typename ProtoT>
309 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)310 FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
311                                    const std::string& file_path,
312                                    const Options& options) {
313   if (options.max_proto_size <= 0) {
314     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
315         "options.max_proto_size must be greater than 0, was %d",
316         options.max_proto_size));
317   }
318 
319   // Since we store the proto_size in 3 bytes, we can only support protos of up
320   // to 16MiB.
321   if (options.max_proto_size > constants::kMaxProtoSize) {
322     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
323         "options.max_proto_size must be under 16MiB, was %d",
324         options.max_proto_size));
325   }
326 
327   if (!filesystem->FileExists(file_path.c_str())) {
328     return InitializeNewFile(filesystem, file_path, options);
329   }
330 
331   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
332   if (file_size == Filesystem::kBadFileSize) {
333     return absl_ports::InternalError(
334         absl_ports::StrCat("Bad file size '", file_path, "'"));
335   }
336 
337   if (file_size == 0) {
338     return InitializeNewFile(filesystem, file_path, options);
339   }
340 
341   return InitializeExistingFile(filesystem, file_path, options, file_size);
342 }
343 
344 template <typename ProtoT>
345 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)346 FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
347                                               const std::string& file_path,
348                                               const Options& options) {
349   // Create the header
350   std::unique_ptr<Header> header = std::make_unique<Header>();
351   header->compress = options.compress;
352   header->max_proto_size = options.max_proto_size;
353   header->header_checksum = header->CalculateHeaderChecksum();
354 
355   if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
356     return absl_ports::InternalError(
357         absl_ports::StrCat("Failed to write header for file: ", file_path));
358   }
359 
360   CreateResult create_result = {
361       std::unique_ptr<FileBackedProtoLog<ProtoT>>(
362           new FileBackedProtoLog<ProtoT>(filesystem, file_path,
363                                          std::move(header))),
364       /*data_loss=*/DataLoss::NONE};
365 
366   return create_result;
367 }
368 
369 template <typename ProtoT>
370 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)371 FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
372                                                    const std::string& file_path,
373                                                    const Options& options,
374                                                    int64_t file_size) {
375   if (file_size < sizeof(Header)) {
376     return absl_ports::InternalError(
377         absl_ports::StrCat("File header too short for: ", file_path));
378   }
379 
380   std::unique_ptr<Header> header = std::make_unique<Header>();
381   if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
382                          /*offset=*/0)) {
383     return absl_ports::InternalError(
384         absl_ports::StrCat("Failed to read header for file: ", file_path));
385   }
386 
387   // Make sure the header is still valid before we use any of its values. This
388   // is covered by the header_checksum check below, but this is a quick check
389   // that can save us from an extra crc computation.
390   if (header->magic != Header::kMagic) {
391     return absl_ports::InternalError(
392         absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
393   }
394 
395   if (header->header_checksum != header->CalculateHeaderChecksum()) {
396     return absl_ports::InternalError(
397         absl_ports::StrCat("Invalid header checksum for: ", file_path));
398   }
399 
400   if (header->compress != options.compress) {
401     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
402         "Inconsistent compress option, expected %d, actual %d",
403         header->compress, options.compress));
404   }
405 
406   if (header->max_proto_size > options.max_proto_size) {
407     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
408         "Max proto size cannot be smaller than previous "
409         "instantiations, previous size %d, wanted size %d",
410         header->max_proto_size, options.max_proto_size));
411   }
412   header->max_proto_size = options.max_proto_size;
413 
414   DataLoss data_loss = DataLoss::NONE;
415   ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
416                          ComputeChecksum(filesystem, file_path, Crc32(),
417                                          sizeof(Header), file_size));
418 
419   // Double check that the log checksum is the same as the one that was
420   // persisted last time. If not, we start recovery logic.
421   if (header->log_checksum != calculated_log_checksum.Get()) {
422     // Need to rewind the proto log since the checksums don't match.
423     // Worst case, we have to rewind the entire log back to just the header
424     int64_t last_known_good = sizeof(Header);
425 
426     // Calculate the checksum of the log contents just up to the last rewind
427     // offset point. This will be valid if we just appended contents to the log
428     // without updating the checksum, and we can rewind back to this point
429     // safely.
430     ICING_ASSIGN_OR_RETURN(
431         calculated_log_checksum,
432         ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header),
433                         header->rewind_offset));
434     if (header->log_checksum == calculated_log_checksum.Get()) {
435       // Check if it matches our last rewind state. If so, this becomes our last
436       // good state and we can safely truncate and recover from here.
437       last_known_good = header->rewind_offset;
438       data_loss = DataLoss::PARTIAL;
439     } else {
440       // Otherwise, we're going to truncate the entire log and this resets the
441       // checksum to an empty log state.
442       header->log_checksum = 0;
443       data_loss = DataLoss::COMPLETE;
444     }
445 
446     if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
447       return absl_ports::InternalError(
448           absl_ports::StrCat("Error truncating file: ", file_path));
449     }
450 
451     ICING_LOG(WARNING) << "Truncated '" << file_path << "' to size "
452                        << last_known_good;
453   }
454 
455   CreateResult create_result = {
456       std::unique_ptr<FileBackedProtoLog<ProtoT>>(
457           new FileBackedProtoLog<ProtoT>(filesystem, file_path,
458                                          std::move(header))),
459       data_loss};
460 
461   return create_result;
462 }
463 
464 template <typename ProtoT>
ComputeChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end)465 libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
466     const Filesystem* filesystem, const std::string& file_path,
467     Crc32 initial_crc, int64_t start, int64_t end) {
468   ICING_ASSIGN_OR_RETURN(
469       MemoryMappedFile mmapped_file,
470       MemoryMappedFile::Create(*filesystem, file_path,
471                                MemoryMappedFile::Strategy::READ_ONLY));
472   Crc32 new_crc(initial_crc.Get());
473 
474   if (start < 0) {
475     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
476         "Starting checksum offset of file '%s' must be greater than 0, was "
477         "%lld",
478         file_path.c_str(), static_cast<long long>(start)));
479   }
480 
481   if (end < start) {
482     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
483         "Ending checksum offset of file '%s' must be greater than start "
484         "'%lld', was '%lld'",
485         file_path.c_str(), static_cast<long long>(start),
486         static_cast<long long>(end)));
487   }
488 
489   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
490   if (end > file_size) {
491     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
492         "Ending checksum offset of file '%s' must be within "
493         "file size of %lld, was %lld",
494         file_path.c_str(), static_cast<long long>(file_size),
495         static_cast<long long>(end)));
496   }
497 
498   Architecture architecture = GetArchitecture();
499   switch (architecture) {
500     case Architecture::BIT_64: {
501       // Don't mmap in chunks here since mmapping can be harmful on 64-bit
502       // devices where mmap/munmap calls need the mmap write semaphore, which
503       // blocks mmap/munmap/mprotect and all page faults from executing while
504       // they run. On 64-bit devices, this doesn't actually load into memory, it
505       // just makes the file faultable. So the whole file should be ok.
506       // b/185822878.
507       ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
508       auto mmap_str = std::string_view(mmapped_file.region(), end - start);
509       new_crc.Append(mmap_str);
510       break;
511     }
512     case Architecture::BIT_32:
513       [[fallthrough]];
514     case Architecture::UNKNOWN: {
515       // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
516       // much memory at once. If we're unknown, then also chunk it because we're
517       // not sure what the device can handle.
518       for (int i = start; i < end; i += kMmapChunkSize) {
519         // Don't read past the file size.
520         int next_chunk_size = kMmapChunkSize;
521         if ((i + kMmapChunkSize) >= end) {
522           next_chunk_size = end - i;
523         }
524 
525         ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
526 
527         auto mmap_str =
528             std::string_view(mmapped_file.region(), next_chunk_size);
529         new_crc.Append(mmap_str);
530       }
531       break;
532     }
533   }
534 
535   return new_crc;
536 }
537 
538 template <typename ProtoT>
ReadProto(int64_t file_offset)539 libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
540     int64_t file_offset) const {
541   int64_t file_size = filesystem_->GetFileSize(fd_.get());
542   ICING_ASSIGN_OR_RETURN(
543       MemoryMappedFile mmapped_file,
544       MemoryMappedFile::Create(*filesystem_, file_path_,
545                                MemoryMappedFile::Strategy::READ_ONLY));
546   if (file_offset >= file_size) {
547     // file_size points to the next byte to write at, so subtract one to get
548     // the inclusive, actual size of file.
549     return absl_ports::OutOfRangeError(
550         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
551                                       "out of range of the file size, %lld",
552                                       static_cast<long long>(file_offset),
553                                       static_cast<long long>(file_size - 1)));
554   }
555 
556   // Read out the metadata
557   ICING_ASSIGN_OR_RETURN(
558       int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
559 
560   // Copy out however many bytes it says the proto is
561   int stored_size = GetProtoSize(metadata);
562 
563   ICING_RETURN_IF_ERROR(
564       mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
565 
566   if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
567     return absl_ports::NotFoundError("The proto data has been erased.");
568   }
569 
570   google::protobuf::io::ArrayInputStream proto_stream(mmapped_file.mutable_region(),
571                                             stored_size);
572 
573   // Deserialize proto
574   ProtoT proto;
575   if (header_->compress) {
576     protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
577     proto.ParseFromZeroCopyStream(&decompress_stream);
578   } else {
579     proto.ParseFromZeroCopyStream(&proto_stream);
580   }
581 
582   return proto;
583 }
584 
585 template <typename ProtoT>
Iterator(const Filesystem & filesystem,const std::string & file_path,int64_t initial_offset,MemoryMappedFile && mmapped_file)586 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
587                                                const std::string& file_path,
588                                                int64_t initial_offset,
589                                                MemoryMappedFile&& mmapped_file)
590     : mmapped_file_(std::move(mmapped_file)),
591       initial_offset_(initial_offset),
592       current_offset_(kInvalidOffset),
593       file_size_(filesystem.GetFileSize(file_path.c_str())) {
594   if (file_size_ == Filesystem::kBadFileSize) {
595     // Fails all Advance() calls
596     file_size_ = 0;
597   }
598 }
599 
600 template <typename ProtoT>
Advance()601 libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
602   if (current_offset_ == kInvalidOffset) {
603     // First Advance() call
604     current_offset_ = initial_offset_;
605   } else {
606     // Jumps to the next proto position
607     ICING_ASSIGN_OR_RETURN(
608         int metadata,
609         ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
610     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
611   }
612 
613   if (current_offset_ < file_size_) {
614     return libtextclassifier3::Status::OK;
615   } else {
616     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
617         "The next proto offset, %lld, is out of file range [0, %lld)",
618         static_cast<long long>(current_offset_),
619         static_cast<long long>(file_size_)));
620   }
621 }
622 
623 template <typename ProtoT>
GetOffset()624 int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
625   return current_offset_;
626 }
627 
628 template <typename ProtoT>
629 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::Iterator>
GetIterator()630 FileBackedProtoLog<ProtoT>::GetIterator() {
631   ICING_ASSIGN_OR_RETURN(
632       MemoryMappedFile mmapped_file,
633       MemoryMappedFile::Create(*filesystem_, file_path_,
634                                MemoryMappedFile::Strategy::READ_ONLY));
635   return Iterator(*filesystem_, file_path_,
636                   /*initial_offset=*/sizeof(Header), std::move(mmapped_file));
637 }
638 
639 template <typename ProtoT>
ReadProtoMetadata(MemoryMappedFile * mmapped_file,int64_t file_offset,int64_t file_size)640 libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
641     MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
642   // Checks file_offset
643   if (file_offset >= file_size) {
644     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
645         "offset, %lld, is out of file range [0, %lld)",
646         static_cast<long long>(file_offset),
647         static_cast<long long>(file_size)));
648   }
649   int metadata;
650   int metadata_size = sizeof(metadata);
651   if (file_offset + metadata_size >= file_size) {
652     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
653         "Wrong metadata offset %lld, metadata doesn't fit in "
654         "with file range [0, %lld)",
655         static_cast<long long>(file_offset),
656         static_cast<long long>(file_size)));
657   }
658   // Reads metadata
659   ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
660   memcpy(&metadata, mmapped_file->region(), metadata_size);
661   // Checks magic number
662   uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
663   if (stored_k_proto_magic != kProtoMagic) {
664     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
665         "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
666         stored_k_proto_magic));
667   }
668   return metadata;
669 }
670 
671 }  // namespace lib
672 }  // namespace icing
673 
674 #endif  // ICING_FILE_FILE_BACKED_PROTO_LOG_H_
675