1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // File-backed log of protos with append-only writes and position based reads.
16 //
17 // There should only be one instance of a PortableFileBackedProtoLog of the same
18 // file at a time; using multiple instances at the same time may lead to
19 // undefined behavior.
20 //
21 // The entire checksum is computed on initialization to verify the contents are
22 // valid. On failure, the log will be truncated to the last verified state when
23 // PersistToDisk() was called. If the log cannot successfully restore the last
24 // state due to disk corruption or some other inconsistency, then the entire log
25 // will be lost.
26 //
// Each proto written to the file is preceded by a small metadata prefix. An
// entry in the log therefore consists of
//   {
//     1 byte of kProtoMagic;
//     3 bytes of the proto size;
//     n bytes of the proto itself
//   }
34 //
35 // All metadata is written in a portable format, encoded with htonl before
36 // writing to file and decoded with ntohl when reading from file.
37 //
// Example usage:
//   ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
//       PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
//                                                         file_path_,
//                                                         options));
//   auto proto_log = std::move(create_result.proto_log);
//
//   DocumentProto document;
//   document.set_namespace("com.google.android.example");
//   document.set_uri("www.google.com");
//
//   ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
//                              proto_log->WriteProto(document));
//   ICING_ASSERT_OK_AND_ASSIGN(DocumentProto same_document,
//                              proto_log->ReadProto(document_offset));
//   ICING_ASSERT_OK(proto_log->PersistToDisk());
52
53 #ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
54 #define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
55
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/constants.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/portable/endian.h"
#include "icing/portable/gzip_stream.h"
#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/bit-util.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
83
84 namespace icing {
85 namespace lib {
86
87 template <typename ProtoT>
88 class PortableFileBackedProtoLog {
89 public:
90 struct Options {
91 // Whether to compress each proto before writing to the proto log.
92 bool compress;
93
94 // Byte-size limit for each proto written to the store. This does not
95 // include the bytes needed for the metadata of each proto.
96 //
97 // NOTE: Currently, we only support protos up to 16MiB. We store the proto
98 // size in 3 bytes within the metadata.
99 //
100 // NOTE: This limit is only enforced for future writes. If the store
101 // previously had a higher limit, then reading older entries could return
102 // larger protos.
103 //
104 // NOTE: The max_proto_size is the upper limit for input protos into the
105 // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
106 // to a smaller size, ProtoLog will not accept it. Protos that result in a
107 // compressed size larger than max_proto_size are also not accepted.
108 const int32_t max_proto_size;
109
110 // Level of compression if enabled, NO_COMPRESSION = 0, BEST_SPEED = 1,
111 // BEST_COMPRESSION = 9
112 const int32_t compression_level;
113
114 // Must specify values for options.
115 Options() = delete;
116 explicit Options(
117 bool compress_in,
118 const int32_t max_proto_size_in = constants::kMaxProtoSize,
119 const int32_t compression_level_in = kDefaultCompressionLevel)
compressOptions120 : compress(compress_in),
121 max_proto_size(max_proto_size_in),
122 compression_level(compression_level_in) {}
123 };
124
125 // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
126 static constexpr int kDefaultCompressionLevel = 3;
127
128 // Number of bytes we reserve for the heading at the beginning of the proto
129 // log. We reserve this so the header can grow without running into the
130 // contents of the proto log, triggering an unnecessary migration of the data.
131 static constexpr int kHeaderReservedBytes = 256;
132
133 // Header stored at the beginning of the file before the rest of the log
134 // contents. Stores metadata on the log.
135 class Header {
136 public:
137 static constexpr int32_t kMagic = 0xf4c6f67a;
138
139 // We should go directly from 0 to 2 the next time we have to change the
140 // format.
141 static constexpr int32_t kFileFormatVersion = 0;
142
CalculateHeaderChecksum()143 uint32_t CalculateHeaderChecksum() const {
144 Crc32 crc;
145
146 // Get a string_view of all the fields of the Header, excluding the
147 // magic_nbytes_ and header_checksum_nbytes_
148 std::string_view header_str(
149 reinterpret_cast<const char*>(this) +
150 offsetof(Header, header_checksum_nbytes_) +
151 sizeof(header_checksum_nbytes_),
152 sizeof(Header) - sizeof(magic_nbytes_) -
153 sizeof(header_checksum_nbytes_));
154 crc.Append(header_str);
155 return crc.Get();
156 }
157
GetMagic()158 int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); }
159
SetMagic(int32_t magic_in)160 void SetMagic(int32_t magic_in) {
161 magic_nbytes_ = GHostToNetworkL(magic_in);
162 }
163
GetFileFormatVersion()164 int32_t GetFileFormatVersion() const {
165 return GNetworkToHostL(file_format_version_nbytes_);
166 }
167
SetFileFormatVersion(int32_t file_format_version_in)168 void SetFileFormatVersion(int32_t file_format_version_in) {
169 file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in);
170 }
171
GetMaxProtoSize()172 int32_t GetMaxProtoSize() const {
173 return GNetworkToHostL(max_proto_size_nbytes_);
174 }
175
SetMaxProtoSize(int32_t max_proto_size_in)176 void SetMaxProtoSize(int32_t max_proto_size_in) {
177 max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in);
178 }
179
GetLogChecksum()180 int32_t GetLogChecksum() const {
181 return GNetworkToHostL(log_checksum_nbytes_);
182 }
183
SetLogChecksum(int32_t log_checksum_in)184 void SetLogChecksum(int32_t log_checksum_in) {
185 log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in);
186 }
187
GetRewindOffset()188 int64_t GetRewindOffset() const {
189 return GNetworkToHostLL(rewind_offset_nbytes_);
190 }
191
SetRewindOffset(int64_t rewind_offset_in)192 void SetRewindOffset(int64_t rewind_offset_in) {
193 rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in);
194 }
195
GetHeaderChecksum()196 int32_t GetHeaderChecksum() const {
197 return GNetworkToHostL(header_checksum_nbytes_);
198 }
199
SetHeaderChecksum(int32_t header_checksum_in)200 void SetHeaderChecksum(int32_t header_checksum_in) {
201 header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in);
202 }
203
GetCompressFlag()204 bool GetCompressFlag() const { return GetFlag(kCompressBit); }
205
SetCompressFlag(bool compress)206 void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
207
GetDirtyFlag()208 bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
209
SetDirtyFlag(bool dirty)210 void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
211
212 private:
213 // The least-significant bit offset at which the compress flag is stored in
214 // 'flags_nbytes_'. Represents whether the protos in the log are compressed
215 // or not.
216 static constexpr int32_t kCompressBit = 0;
217
218 // The least-significant bit offset at which the dirty flag is stored in
219 // 'flags'. Represents whether the checksummed portion of the log has been
220 // modified after the last checksum was computed.
221 static constexpr int32_t kDirtyBit = 1;
222
GetFlag(int offset)223 bool GetFlag(int offset) const {
224 return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
225 }
226
SetFlag(int offset,bool value)227 void SetFlag(int offset, bool value) {
228 bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
229 }
230
231 // Holds the magic as a quick sanity check against file corruption.
232 //
233 // Field is in network-byte order.
234 int32_t magic_nbytes_ = GHostToNetworkL(kMagic);
235
236 // Must be at the beginning after kMagic. Contains the crc checksum of
237 // the following fields.
238 //
239 // Field is in network-byte order.
240 uint32_t header_checksum_nbytes_ = 0;
241
242 // Last known good offset at which the log and its checksum were updated.
243 // If we crash between writing to the log and updating the checksum, we can
244 // try to rewind the log to this offset and verify the checksum is still
245 // valid instead of throwing away the entire log.
246 //
247 // Field is in network-byte order.
248 int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes);
249
250 // Version number tracking how we serialize the file to disk. If we change
251 // how/what we write to disk, this version should be updated and this class
252 // should handle a migration.
253 //
254 // Currently at kFileFormatVersion.
255 //
256 // Field is in network-byte order.
257 int32_t file_format_version_nbytes_ = 0;
258
259 // The maximum proto size that can be written to the log.
260 //
261 // Field is in network-byte order.
262 int32_t max_proto_size_nbytes_ = 0;
263
264 // Checksum of the log elements, doesn't include the header fields.
265 //
266 // Field is in network-byte order.
267 uint32_t log_checksum_nbytes_ = 0;
268
269 // Bits are used to hold various flags.
270 // Lowest bit is whether the protos are compressed or not.
271 //
272 // Field is only 1 byte, so is byte-order agnostic.
273 uint8_t flags_ = 0;
274
275 // NOTE: New fields should *almost always* be added to the end here. Since
276 // this class may have already been written to disk, appending fields
277 // increases the chances that changes are backwards-compatible.
278 };
279 static_assert(sizeof(Header) <= kHeaderReservedBytes,
280 "Header has grown past our reserved bytes!");
281
282 struct CreateResult {
283 // A successfully initialized log.
284 std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
285
286 // The data status after initializing from a previous state. Data loss can
287 // happen if the file is corrupted or some previously added data was
288 // unpersisted. This may be used to signal that any derived data off of the
289 // proto log may need to be regenerated.
290 DataLoss data_loss = DataLoss::NONE;
291
292 // Whether the proto log had to recalculate the checksum to check its
293 // integrity. This can be avoided if no changes were made or the log was
294 // able to update its checksum before shutting down. But it may have to
295 // recalculate if it's unclear if we crashed after updating the log, but
296 // before updating our checksum.
297 bool recalculated_checksum = false;
298
has_data_lossCreateResult299 bool has_data_loss() const {
300 return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
301 }
302 };
303
304 // Factory method to create, initialize, and return a
305 // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
306 //
307 // If on re-initialization the log detects disk corruption or some previously
308 // added data was unpersisted, the log will rewind to the last-good state. The
309 // log saves these checkpointed "good" states when PersistToDisk() is called
310 // or the log is safely destructed. If the log rewinds successfully to the
311 // last-good state, then the returned CreateResult.data_loss indicates
312 // whether it has a data loss and what kind of data loss it is (partial or
313 // complete) so that any derived data may know that it needs to be updated. If
314 // the log re-initializes successfully without any data loss,
315 // CreateResult.data_loss will be NONE.
316 //
317 // Params:
318 // filesystem: Handles system level calls
319 // file_path: Path of the underlying file. Directory of the file should
320 // already exist
321 // options: Configuration options for the proto log
322 //
323 // Returns:
324 // PortableFileBackedProtoLog::CreateResult on success
325 // INVALID_ARGUMENT on an invalid option
326 // INTERNAL_ERROR on IO error
327 static libtextclassifier3::StatusOr<CreateResult> Create(
328 const Filesystem* filesystem, const std::string& file_path,
329 const Options& options);
330
331 // Not copyable
332 PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
333 PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
334 delete;
335
336 // This will update the checksum of the log as well.
337 ~PortableFileBackedProtoLog();
338
339 // Writes the serialized proto to the underlying file. Writes are applied
340 // directly to the underlying file. Users do not need to sync the file after
341 // writing.
342 //
343 // Returns:
344 // Offset of the newly appended proto in file on success
345 // INVALID_ARGUMENT if proto is too large, as decided by
346 // Options.max_proto_size
347 // INTERNAL_ERROR on IO error
348 libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
349
350 // Reads out a proto located at file_offset from the file.
351 //
352 // Returns:
353 // A proto on success
354 // NOT_FOUND if the proto at the given offset has been erased
355 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
356 // INTERNAL_ERROR on IO error
357 libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
358
359 // Erases the data of a proto located at file_offset from the file.
360 //
361 // Returns:
362 // OK on success
363 // OUT_OF_RANGE_ERROR if file_offset exceeds file size
364 // INTERNAL_ERROR on IO error
365 libtextclassifier3::Status EraseProto(int64_t file_offset);
366
367 // Calculates and returns the disk usage in bytes. Rounds up to the nearest
368 // block size.
369 //
370 // Returns:
371 // Disk usage on success
372 // INTERNAL_ERROR on IO error
373 libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
374
375 // Returns the file size of all the elements held in the log. File size is in
376 // bytes. This excludes the size of any internal metadata of the log, e.g. the
377 // log's header.
378 //
379 // Returns:
380 // File size on success
381 // INTERNAL_ERROR on IO error
382 libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
383
384 // An iterator helping to find offsets of all the protos in file.
385 // Example usage:
386 //
387 // while (iterator.Advance().ok()) {
388 // int64_t offset = iterator.GetOffset();
389 // // Do something
390 // }
class Iterator {
 public:
  // Creates an iterator over the log file referred to by `fd`.
  //
  // NOTE(review): the definition is not visible from here; presumably
  // `initial_offset` is where the first proto starts and `file_size` bounds
  // the iteration -- confirm against the implementation.
  Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset,
           int64_t file_size);

  // Advances to the position of next proto whether it has been erased or not.
  //
  // Returns:
  //   OK on success
  //   OUT_OF_RANGE_ERROR if it reaches the end
  //   INTERNAL_ERROR on IO error
  libtextclassifier3::Status Advance();

  // Returns the file offset of current proto.
  int64_t GetOffset();

 private:
  static constexpr int64_t kInvalidOffset = -1;

  // Used to read proto metadata
  const Filesystem* const filesystem_;
  // Offset of first proto
  int64_t initial_offset_;
  int64_t current_offset_;
  int64_t file_size_;
  int fd_;
};
417
418 // Returns an iterator of current proto log. The caller needs to keep the
419 // proto log unchanged while using the iterator, otherwise unexpected
420 // behaviors could happen.
421 Iterator GetIterator();
422
423 // Persists all changes since initialization or the last call to
424 // PersistToDisk(). Any changes that aren't persisted may be lost if the
425 // system fails to close safely.
426 //
427 // Example use case:
428 //
429 // Document document;
430 // document.set_namespace("com.google.android.example");
431 // document.set_uri("www.google.com");
432 //
433 // {
434 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
435 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
436 // file_path,
437 // options));
438 // auto proto_log = std::move(create_result.proto_log);
439 //
440 // int64_t document_offset = proto_log->WriteProto(document));
441 //
442 // // We lose the document here since it wasn't persisted.
443 // // *SYSTEM CRASH*
444 // }
445 //
446 // {
447 // // Can still successfully create after a crash since the log can
448 // // rewind/truncate to recover into a previously good state
449 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
450 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
451 // file_path,
452 // options));
453 // auto proto_log = std::move(create_result.proto_log);
454 //
//     // Lost the proto since we didn't PersistToDisk before the crash; the
//     // log was truncated, so the old offset is now past the end of the file.
//     proto_log->ReadProto(document_offset);  // OUT_OF_RANGE error
457 //
458 // int64_t document_offset = proto_log->WriteProto(document));
459 //
460 // // Persisted this time, so we should be ok.
461 // ICING_ASSERT_OK(proto_log->PersistToDisk());
462 // }
463 //
464 // {
465 // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
466 // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
467 // file_path,
468 // options));
469 // auto proto_log = std::move(create_result.proto_log);
470 //
471 // // SUCCESS
472 // Document same_document = proto_log->ReadProto(document_offset));
473 // }
474 //
475 // NOTE: Since all protos are already written to the file directly, this
476 // just updates the checksum and rewind position. Without these updates,
477 // future initializations will truncate the file and discard unpersisted
478 // changes.
479 //
480 // Returns:
481 // OK on success
482 // INTERNAL_ERROR on IO error
483 libtextclassifier3::Status PersistToDisk();
484
485 // Calculates the checksum of the log contents (excluding the header) and
486 // updates the header.
487 //
488 // Returns:
489 // Crc of the log content
490 // INTERNAL_ERROR on IO error
491 libtextclassifier3::StatusOr<Crc32> UpdateChecksum();
492
493 // Calculates and returns the checksum of the log contents (excluding the
494 // header). Does NOT update the header.
495 //
496 // Returns:
497 // Crc of the log content
498 // INTERNAL_ERROR on IO error
499 libtextclassifier3::StatusOr<Crc32> GetChecksum() const;
500
501 private:
502 // Object can only be instantiated via the ::Create factory.
503 PortableFileBackedProtoLog(const Filesystem* filesystem,
504 const std::string& file_path,
505 std::unique_ptr<Header> header, int64_t file_size,
506 int32_t compression_level);
507
// Initializes a new proto log.
//
// Returns:
//   CreateResult on success
//   INTERNAL_ERROR on IO error
513 static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
514 const Filesystem* filesystem, const std::string& file_path,
515 const Options& options);
516
// Verifies that the existing proto log is in a good state. If not in a good
// state, then the proto log may be truncated to the last good state and
// content will be lost.
//
// Returns:
//   CreateResult on success
//   INTERNAL_ERROR on IO error or internal inconsistencies in the file
//   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
//     instances
526 static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
527 const Filesystem* filesystem, const std::string& file_path,
528 const Options& options, int64_t file_size);
529
530 // Takes an initial checksum and updates it with the content between `start`
531 // and `end` offsets in the file.
532 //
533 // Returns:
534 // Crc of the content between `start`, inclusive, and `end`, exclusive.
535 // INTERNAL_ERROR on IO error
536 // INVALID_ARGUMENT_ERROR if start and end aren't within the file size
537 static libtextclassifier3::StatusOr<Crc32> GetPartialChecksum(
538 const Filesystem* filesystem, const std::string& file_path,
539 Crc32 initial_crc, int64_t start, int64_t end, int64_t file_size);
540
541 // Reads out the metadata of a proto located at file_offset from the fd.
542 // Metadata will be returned in host byte order endianness.
543 //
544 // Returns:
545 // Proto's metadata on success
546 // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
547 // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
548 static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
549 const Filesystem* const filesystem, int fd, int64_t file_offset,
550 int64_t file_size);
551
552 // Writes metadata of a proto to the fd. Takes in a host byte order endianness
553 // metadata and converts it into a portable metadata before writing.
554 //
555 // Returns:
556 // OK on success
557 // INTERNAL_ERROR on any IO errors
558 static libtextclassifier3::Status WriteProtoMetadata(
559 const Filesystem* filesystem, int fd, int32_t host_order_metadata);
560
// Returns true if the first `size` bytes of `buffer` are all zero.
//
// A non-positive `size` denotes an empty range, which is trivially all-zero;
// `buffer` is not dereferenced in that case.
static bool IsEmptyBuffer(const char* buffer, int size) {
  if (size <= 0) {
    // Avoid forming an invalid [buffer, buffer + size) range.
    return true;
  }
  return std::all_of(buffer, buffer + size,
                     [](const char byte) { return byte == 0; });
}
565
// Extracts the stored proto size from a host-order metadata value.
// Metadata format: 8 bits magic + 24 bits size
static int GetProtoSize(int metadata) {
  // The size occupies the low 24 bits.
  constexpr int kSizeMask = (1 << 24) - 1;
  return metadata & kSizeMask;
}
569
// Extracts the stored proto magic from a host-order metadata value.
// Metadata format: 8 bits magic + 24 bits size
static uint8_t GetProtoMagic(int metadata) {
  // The magic occupies the top 8 bits; drop the 24 size bits below it.
  const int magic_bits = metadata >> 24;
  return magic_bits;
}
573
574 // Magic number added in front of every proto. Used when reading out protos
575 // as a first check for corruption in each entry in the file. Even if there is
576 // a corruption, the best we can do is roll back to our last recovery point
577 // and throw away un-flushed data. We can discard/reuse this byte if needed so
578 // that we have 4 bytes to store the size of protos, and increase the size of
579 // protos we support.
580 static constexpr uint8_t kProtoMagic = 0x5C;
581
582 // Chunks of the file to mmap at a time, so we don't mmap the entire file.
583 // Only used on 32-bit devices
584 static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
585
586 ScopedFd fd_;
587 const Filesystem* const filesystem_;
588 const std::string file_path_;
589 std::unique_ptr<Header> header_;
590 int64_t file_size_;
591 const int32_t compression_level_;
592 };
593
594 template <typename ProtoT>
PortableFileBackedProtoLog(const Filesystem * filesystem,const std::string & file_path,std::unique_ptr<Header> header,int64_t file_size,int32_t compression_level)595 PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
596 const Filesystem* filesystem, const std::string& file_path,
597 std::unique_ptr<Header> header, int64_t file_size,
598 int32_t compression_level)
599 : filesystem_(filesystem),
600 file_path_(file_path),
601 header_(std::move(header)),
602 file_size_(file_size),
603 compression_level_(compression_level) {
604 fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
605 }
606
607 template <typename ProtoT>
~PortableFileBackedProtoLog()608 PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
609 if (!PersistToDisk().ok()) {
610 ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
611 "PortableFileBackedProtoLog: "
612 << file_path_;
613 }
614 }
615
616 template <typename ProtoT>
617 libtextclassifier3::StatusOr<
618 typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
Create(const Filesystem * filesystem,const std::string & file_path,const Options & options)619 PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
620 const std::string& file_path,
621 const Options& options) {
622 if (options.max_proto_size <= 0) {
623 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
624 "options.max_proto_size must be greater than 0, was %d",
625 options.max_proto_size));
626 }
627
628 // Since we store the proto_size in 3 bytes, we can only support protos of up
629 // to 16MiB.
630 if (options.max_proto_size > constants::kMaxProtoSize) {
631 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
632 "options.max_proto_size must be under 16MiB, was %d",
633 options.max_proto_size));
634 }
635
636 if (options.compression_level < 0 || options.compression_level > 9) {
637 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
638 "options.compression_level must be between 0 and 9 inclusive, was %d",
639 options.compression_level));
640 }
641
642 if (!filesystem->FileExists(file_path.c_str())) {
643 return InitializeNewFile(filesystem, file_path, options);
644 }
645
646 int64_t file_size = filesystem->GetFileSize(file_path.c_str());
647 if (file_size == Filesystem::kBadFileSize) {
648 return absl_ports::InternalError(
649 absl_ports::StrCat("Bad file size '", file_path, "'"));
650 }
651
652 if (file_size == 0) {
653 return InitializeNewFile(filesystem, file_path, options);
654 }
655
656 return InitializeExistingFile(filesystem, file_path, options, file_size);
657 }
658
659 template <typename ProtoT>
660 libtextclassifier3::StatusOr<
661 typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeNewFile(const Filesystem * filesystem,const std::string & file_path,const Options & options)662 PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
663 const Filesystem* filesystem, const std::string& file_path,
664 const Options& options) {
665 // Grow to the minimum reserved bytes for the header.
666 if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
667 return absl_ports::InternalError(
668 absl_ports::StrCat("Failed to initialize file size: ", file_path));
669 }
670
671 // Create the header
672 std::unique_ptr<Header> header = std::make_unique<Header>();
673 header->SetCompressFlag(options.compress);
674 header->SetMaxProtoSize(options.max_proto_size);
675 header->SetHeaderChecksum(header->CalculateHeaderChecksum());
676
677 if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
678 return absl_ports::InternalError(
679 absl_ports::StrCat("Failed to write header for file: ", file_path));
680 }
681
682 CreateResult create_result = {
683 std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
684 new PortableFileBackedProtoLog<ProtoT>(
685 filesystem, file_path, std::move(header),
686 /*file_size=*/kHeaderReservedBytes, options.compression_level)),
687 /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
688
689 return create_result;
690 }
691
692 template <typename ProtoT>
693 libtextclassifier3::StatusOr<
694 typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
InitializeExistingFile(const Filesystem * filesystem,const std::string & file_path,const Options & options,int64_t file_size)695 PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
696 const Filesystem* filesystem, const std::string& file_path,
697 const Options& options, int64_t file_size) {
698 bool header_changed = false;
699 if (file_size < kHeaderReservedBytes) {
700 return absl_ports::InternalError(
701 absl_ports::StrCat("File header too short for: ", file_path));
702 }
703
704 std::unique_ptr<Header> header = std::make_unique<Header>();
705 if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
706 /*offset=*/0)) {
707 return absl_ports::InternalError(
708 absl_ports::StrCat("Failed to read header for file: ", file_path));
709 }
710
711 // Make sure the header is still valid before we use any of its values. This
712 // is covered by the header_checksum check below, but this is a quick check
713 // that can save us from an extra crc computation.
714 if (header->GetMagic() != Header::kMagic) {
715 return absl_ports::InternalError(
716 absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
717 }
718
719 if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
720 return absl_ports::InternalError(
721 absl_ports::StrCat("Invalid header checksum for: ", file_path));
722 }
723
724 if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
725 // If this changes, we might need to handle a migration rather than throwing
726 // an error.
727 return absl_ports::InternalError(
728 absl_ports::StrCat("Invalid header file format version: ", file_path));
729 }
730
731 if (header->GetCompressFlag() != options.compress) {
732 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
733 "Inconsistent compress option, expected %d, actual %d",
734 header->GetCompressFlag(), options.compress));
735 }
736
737 int32_t existing_max_proto_size = header->GetMaxProtoSize();
738 if (existing_max_proto_size > options.max_proto_size) {
739 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
740 "Max proto size cannot be smaller than previous "
741 "instantiations, previous size %d, wanted size %d",
742 header->GetMaxProtoSize(), options.max_proto_size));
743 } else if (existing_max_proto_size < options.max_proto_size) {
744 // It's fine if our new max size is greater than our previous one. Existing
745 // data is still valid.
746 header->SetMaxProtoSize(options.max_proto_size);
747 header_changed = true;
748 }
749
750 DataLoss data_loss = DataLoss::NONE;
751
752 // If we have any documents in our tail, get rid of them since they're not in
753 // our checksum. Our checksum reflects content up to the rewind offset.
754 if (file_size > header->GetRewindOffset()) {
755 if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
756 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
757 "Failed to truncate '%s' to size %lld", file_path.data(),
758 static_cast<long long>(header->GetRewindOffset())));
759 }
760 file_size = header->GetRewindOffset();
761 data_loss = DataLoss::PARTIAL;
762 }
763
764 bool recalculated_checksum = false;
765
766 // If our dirty flag is set, that means we might have crashed in the middle of
767 // erasing a proto. This could have happened anywhere between:
768 // A. Set dirty flag to true and update header checksum
769 // B. Erase the proto
770 // C. Set dirty flag to false, update log checksum, update header checksum
771 //
772 // Scenario 1: We went down between A and B. Maybe our dirty flag is a
773 // false alarm and we can keep all our data.
774 //
775 // Scenario 2: We went down between B and C. Our data is compromised and
776 // we need to throw everything out.
777 if (header->GetDirtyFlag()) {
778 // Recompute the log's checksum to detect which scenario we're in.
779 ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
780 GetPartialChecksum(filesystem, file_path, Crc32(),
781 /*start=*/kHeaderReservedBytes,
782 /*end=*/file_size, file_size));
783
784 if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
785 // Still doesn't match, we're in Scenario 2. Throw out all our data now
786 // and initialize as a new instance.
787 ICING_ASSIGN_OR_RETURN(CreateResult create_result,
788 InitializeNewFile(filesystem, file_path, options));
789 create_result.data_loss = DataLoss::COMPLETE;
790 create_result.recalculated_checksum = true;
791 return create_result;
792 }
793 // Otherwise we're good, checksum matches our contents so continue
794 // initializing like normal.
795 recalculated_checksum = true;
796
797 // Update our header.
798 header->SetDirtyFlag(false);
799 header_changed = true;
800 }
801
802 if (header_changed) {
803 header->SetHeaderChecksum(header->CalculateHeaderChecksum());
804
805 if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
806 sizeof(Header))) {
807 return absl_ports::InternalError(
808 absl_ports::StrCat("Failed to update header to: ", file_path));
809 }
810 }
811
812 CreateResult create_result = {
813 std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
814 new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
815 std::move(header), file_size,
816 options.compression_level)),
817 data_loss, recalculated_checksum};
818
819 return create_result;
820 }
821
822 template <typename ProtoT>
823 libtextclassifier3::StatusOr<Crc32>
GetPartialChecksum(const Filesystem * filesystem,const std::string & file_path,Crc32 initial_crc,int64_t start,int64_t end,int64_t file_size)824 PortableFileBackedProtoLog<ProtoT>::GetPartialChecksum(
825 const Filesystem* filesystem, const std::string& file_path,
826 Crc32 initial_crc, int64_t start, int64_t end, int64_t file_size) {
827 ICING_ASSIGN_OR_RETURN(
828 MemoryMappedFile mmapped_file,
829 MemoryMappedFile::Create(*filesystem, file_path,
830 MemoryMappedFile::Strategy::READ_ONLY));
831 Crc32 new_crc(initial_crc.Get());
832
833 if (start < 0) {
834 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
835 "Starting checksum offset of file '%s' must be greater than 0, was "
836 "%lld",
837 file_path.c_str(), static_cast<long long>(start)));
838 }
839
840 if (end < start) {
841 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
842 "Ending checksum offset of file '%s' must be greater than start "
843 "'%lld', was '%lld'",
844 file_path.c_str(), static_cast<long long>(start),
845 static_cast<long long>(end)));
846 }
847
848 if (end > file_size) {
849 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
850 "Ending checksum offset of file '%s' must be within "
851 "file size of %lld, was %lld",
852 file_path.c_str(), static_cast<long long>(file_size),
853 static_cast<long long>(end)));
854 }
855
856 Architecture architecture = GetArchitecture();
857 switch (architecture) {
858 case Architecture::BIT_64: {
859 // Don't mmap in chunks here since mmapping can be harmful on 64-bit
860 // devices where mmap/munmap calls need the mmap write semaphore, which
861 // blocks mmap/munmap/mprotect and all page faults from executing while
862 // they run. On 64-bit devices, this doesn't actually load into memory, it
863 // just makes the file faultable. So the whole file should be ok.
864 // b/185822878.
865 ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
866 auto mmap_str = std::string_view(mmapped_file.region(), end - start);
867 new_crc.Append(mmap_str);
868 break;
869 }
870 case Architecture::BIT_32:
871 [[fallthrough]];
872 case Architecture::UNKNOWN: {
873 // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
874 // much memory at once. If we're unknown, then also chunk it because we're
875 // not sure what the device can handle.
876 for (int i = start; i < end; i += kMmapChunkSize) {
877 // Don't read past the file size.
878 int next_chunk_size = kMmapChunkSize;
879 if ((i + kMmapChunkSize) >= end) {
880 next_chunk_size = end - i;
881 }
882
883 ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
884
885 auto mmap_str =
886 std::string_view(mmapped_file.region(), next_chunk_size);
887 new_crc.Append(mmap_str);
888 }
889 break;
890 }
891 }
892
893 return new_crc;
894 }
895
896 template <typename ProtoT>
897 libtextclassifier3::StatusOr<int64_t>
WriteProto(const ProtoT & proto)898 PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
899 int64_t proto_size = proto.ByteSizeLong();
900 int32_t host_order_metadata;
901 int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
902
903 if (proto_size > header_->GetMaxProtoSize()) {
904 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
905 "proto_size, %lld, was too large to write. Max is %d",
906 static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
907 }
908
909 // At this point, we've guaranteed that proto_size is under kMaxProtoSize
910 // (see
911 // ::Create), so we can safely store it in an int.
912 int final_size = 0;
913
914 std::string proto_str;
915 google::protobuf::io::StringOutputStream proto_stream(&proto_str);
916
917 if (header_->GetCompressFlag()) {
918 protobuf_ports::GzipOutputStream::Options options;
919 options.format = protobuf_ports::GzipOutputStream::ZLIB;
920 options.compression_level = compression_level_;
921
922 protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options);
923
924 bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
925 compressing_stream.Close();
926
927 if (!success) {
928 return absl_ports::InternalError("Error compressing proto.");
929 }
930
931 final_size = proto_str.size();
932
933 // In case the compressed proto is larger than the original proto, we also
934 // can't write it.
935 if (final_size > header_->GetMaxProtoSize()) {
936 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
937 "Compressed proto size, %d, was greater than "
938 "max_proto_size, %d",
939 final_size, header_->GetMaxProtoSize()));
940 }
941 } else {
942 // Serialize the proto directly into the write buffer at an offset of the
943 // metadata.
944 proto.SerializeToZeroCopyStream(&proto_stream);
945 final_size = proto_str.size();
946 }
947
948 // 1st byte for magic, next 3 bytes for proto size.
949 host_order_metadata = (kProtoMagic << 24) | final_size;
950
951 // Actually write metadata, has to be done after we know the possibly
952 // compressed proto size
953 ICING_RETURN_IF_ERROR(
954 WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
955
956 // Write the serialized proto
957 if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
958 return absl_ports::InternalError(
959 absl_ports::StrCat("Failed to write proto to: ", file_path_));
960 }
961
962 // Update file size. The file should have grown by sizeof(Metadata) + size of
963 // the serialized proto.
964 file_size_ += sizeof(host_order_metadata) + final_size;
965 return current_position;
966 }
967
968 template <typename ProtoT>
969 libtextclassifier3::StatusOr<ProtoT>
ReadProto(int64_t file_offset)970 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
971 ICING_ASSIGN_OR_RETURN(
972 int32_t metadata,
973 ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size_));
974
975 // Copy out however many bytes it says the proto is
976 int stored_size = GetProtoSize(metadata);
977 file_offset += sizeof(metadata);
978
979 // Read the compressed proto out.
980 if (file_offset + stored_size > file_size_) {
981 return absl_ports::OutOfRangeError(
982 IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
983 "out of range of the file size, %lld",
984 static_cast<long long>(file_offset),
985 static_cast<long long>(file_size_ - 1)));
986 }
987 auto buf = std::make_unique<char[]>(stored_size);
988 if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
989 return absl_ports::InternalError("");
990 }
991
992 if (IsEmptyBuffer(buf.get(), stored_size)) {
993 return absl_ports::NotFoundError("The proto data has been erased.");
994 }
995
996 google::protobuf::io::ArrayInputStream proto_stream(buf.get(), stored_size);
997
998 // Deserialize proto
999 ProtoT proto;
1000 if (header_->GetCompressFlag()) {
1001 protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
1002 proto.ParseFromZeroCopyStream(&decompress_stream);
1003 } else {
1004 proto.ParseFromZeroCopyStream(&proto_stream);
1005 }
1006
1007 return proto;
1008 }
1009
1010 template <typename ProtoT>
EraseProto(int64_t file_offset)1011 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
1012 int64_t file_offset) {
1013 ICING_ASSIGN_OR_RETURN(
1014 int32_t metadata,
1015 ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size_));
1016 // Copy out however many bytes it says the proto is
1017 int stored_size = GetProtoSize(metadata);
1018 file_offset += sizeof(metadata);
1019 if (file_offset + stored_size > file_size_) {
1020 return absl_ports::OutOfRangeError(
1021 IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
1022 "out of range of the file size, %lld",
1023 static_cast<long long>(file_offset),
1024 static_cast<long long>(file_size_ - 1)));
1025 }
1026 auto buf = std::make_unique<char[]>(stored_size);
1027
1028 // We need to update the crc checksum if the erased area is before the
1029 // rewind position.
1030 int32_t new_crc;
1031 if (file_offset < header_->GetRewindOffset()) {
1032 // Set to "dirty" before we start writing anything.
1033 header_->SetDirtyFlag(true);
1034 header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1035 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1036 sizeof(Header))) {
1037 return absl_ports::InternalError(absl_ports::StrCat(
1038 "Failed to update dirty bit of header to: ", file_path_));
1039 }
1040
1041 // We need to calculate [original string xor 0s].
1042 // The xored string is the same as the original string because 0 xor 0 =
1043 // 0, 1 xor 0 = 1.
1044 // Read the compressed proto out.
1045 if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
1046 return absl_ports::InternalError("");
1047 }
1048 const std::string_view xored_str(buf.get(), stored_size);
1049
1050 Crc32 crc(header_->GetLogChecksum());
1051 ICING_ASSIGN_OR_RETURN(
1052 new_crc,
1053 crc.UpdateWithXor(xored_str,
1054 /*full_data_size=*/header_->GetRewindOffset() -
1055 kHeaderReservedBytes,
1056 /*position=*/file_offset - kHeaderReservedBytes));
1057 }
1058
1059 // Clear the region.
1060 memset(buf.get(), '\0', stored_size);
1061 if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
1062 return absl_ports::InternalError("");
1063 }
1064
1065 // If we cleared something in our checksummed area, we should update our
1066 // checksum and reset our dirty bit.
1067 if (file_offset < header_->GetRewindOffset()) {
1068 header_->SetDirtyFlag(false);
1069 header_->SetLogChecksum(new_crc);
1070 header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1071
1072 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1073 sizeof(Header))) {
1074 return absl_ports::InternalError(
1075 absl_ports::StrCat("Failed to update header to: ", file_path_));
1076 }
1077 }
1078
1079 return libtextclassifier3::Status::OK;
1080 }
1081
1082 template <typename ProtoT>
1083 libtextclassifier3::StatusOr<int64_t>
GetDiskUsage()1084 PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
1085 int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
1086 if (size == Filesystem::kBadFileSize) {
1087 return absl_ports::InternalError("Failed to get disk usage of proto log");
1088 }
1089 return size;
1090 }
1091
1092 template <typename ProtoT>
1093 libtextclassifier3::StatusOr<int64_t>
GetElementsFileSize()1094 PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
1095 return file_size_ - kHeaderReservedBytes;
1096 }
1097
// Builds an iterator over the protos stored in an open log file.
//
// Parameters:
//   filesystem     - stored by address and used by Advance() to read proto
//                    metadata.
//   fd             - file descriptor of the log that Advance() reads from.
//   initial_offset - file offset of the first proto; the first Advance() call
//                    moves the iterator here.
//   file_size      - size of the log; offsets at or beyond it are treated as
//                    out of range by Advance().
template <typename ProtoT>
PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
    const Filesystem& filesystem, int fd, int64_t initial_offset,
    int64_t file_size)
    : filesystem_(&filesystem),
      initial_offset_(initial_offset),
      // kInvalidOffset marks "Advance() has not been called yet".
      current_offset_(kInvalidOffset),
      file_size_(file_size),
      fd_(fd) {}
1107
1108 template <typename ProtoT>
1109 libtextclassifier3::Status
Advance()1110 PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
1111 if (current_offset_ == kInvalidOffset) {
1112 // First Advance() call
1113 current_offset_ = initial_offset_;
1114 } else {
1115 // Jumps to the next proto position
1116 ICING_ASSIGN_OR_RETURN(
1117 int32_t metadata,
1118 ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
1119 current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
1120 }
1121
1122 if (current_offset_ < file_size_) {
1123 return libtextclassifier3::Status::OK;
1124 } else {
1125 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1126 "The next proto offset, %lld, is out of file range [0, %lld)",
1127 static_cast<long long>(current_offset_),
1128 static_cast<long long>(file_size_)));
1129 }
1130 }
1131
1132 template <typename ProtoT>
GetOffset()1133 int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
1134 return current_offset_;
1135 }
1136
1137 template <typename ProtoT>
1138 typename PortableFileBackedProtoLog<ProtoT>::Iterator
GetIterator()1139 PortableFileBackedProtoLog<ProtoT>::GetIterator() {
1140 return Iterator(*filesystem_, fd_.get(),
1141 /*initial_offset=*/kHeaderReservedBytes, file_size_);
1142 }
1143
1144 template <typename ProtoT>
1145 libtextclassifier3::StatusOr<int32_t>
ReadProtoMetadata(const Filesystem * const filesystem,int fd,int64_t file_offset,int64_t file_size)1146 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
1147 const Filesystem* const filesystem, int fd, int64_t file_offset,
1148 int64_t file_size) {
1149 // Checks file_offset
1150 if (file_offset >= file_size) {
1151 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
1152 "offset, %lld, is out of file range [0, %lld)",
1153 static_cast<long long>(file_offset),
1154 static_cast<long long>(file_size)));
1155 }
1156 int32_t portable_metadata;
1157 int metadata_size = sizeof(portable_metadata);
1158 if (file_offset + metadata_size >= file_size) {
1159 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1160 "Wrong metadata offset %lld, metadata doesn't fit in "
1161 "with file range [0, %lld)",
1162 static_cast<long long>(file_offset),
1163 static_cast<long long>(file_size)));
1164 }
1165
1166 if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
1167 return absl_ports::InternalError("");
1168 }
1169
1170 // Need to switch it back to host order endianness after reading from disk.
1171 int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
1172
1173 // Checks magic number
1174 uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
1175 if (stored_k_proto_magic != kProtoMagic) {
1176 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
1177 "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
1178 stored_k_proto_magic));
1179 }
1180
1181 return host_order_metadata;
1182 }
1183
1184 template <typename ProtoT>
1185 libtextclassifier3::Status
WriteProtoMetadata(const Filesystem * filesystem,int fd,int32_t host_order_metadata)1186 PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
1187 const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
1188 // Convert it into portable endian format before writing to disk
1189 int32_t portable_metadata = GHostToNetworkL(host_order_metadata);
1190 int portable_metadata_size = sizeof(portable_metadata);
1191
1192 // Write metadata
1193 if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
1194 return absl_ports::InternalError(
1195 absl_ports::StrCat("Failed to write proto metadata."));
1196 }
1197
1198 return libtextclassifier3::Status::OK;
1199 }
1200
1201 template <typename ProtoT>
PersistToDisk()1202 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
1203 if (file_size_ == header_->GetRewindOffset()) {
1204 // No new protos appended, don't need to update the checksum.
1205 return libtextclassifier3::Status::OK;
1206 }
1207
1208 ICING_RETURN_IF_ERROR(UpdateChecksum());
1209 if (!filesystem_->DataSync(fd_.get())) {
1210 return absl_ports::InternalError(
1211 absl_ports::StrCat("Failed to sync data to disk: ", file_path_));
1212 }
1213
1214 return libtextclassifier3::Status::OK;
1215 }
1216
1217 template <typename ProtoT>
1218 libtextclassifier3::StatusOr<Crc32>
UpdateChecksum()1219 PortableFileBackedProtoLog<ProtoT>::UpdateChecksum() {
1220 if (file_size_ == header_->GetRewindOffset()) {
1221 return Crc32(header_->GetLogChecksum());
1222 }
1223 ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
1224 header_->SetLogChecksum(crc.Get());
1225 header_->SetRewindOffset(file_size_);
1226 header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
1227
1228 if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
1229 sizeof(Header))) {
1230 return absl_ports::InternalError(
1231 absl_ports::StrCat("Failed to update header to: ", file_path_));
1232 }
1233 return crc;
1234 }
1235
1236 template <typename ProtoT>
1237 libtextclassifier3::StatusOr<Crc32>
GetChecksum()1238 PortableFileBackedProtoLog<ProtoT>::GetChecksum() const {
1239 int64_t new_content_size = file_size_ - header_->GetRewindOffset();
1240 if (new_content_size == 0) {
1241 // No new protos appended, return cached checksum
1242 return Crc32(header_->GetLogChecksum());
1243 } else if (new_content_size < 0) {
1244 // File shrunk, recalculate the entire checksum.
1245 return GetPartialChecksum(filesystem_, file_path_, Crc32(),
1246 /*start=*/kHeaderReservedBytes,
1247 /*end=*/file_size_, file_size_);
1248 } else {
1249 // Append new changes to the existing checksum.
1250 return GetPartialChecksum(
1251 filesystem_, file_path_, Crc32(header_->GetLogChecksum()),
1252 /*start=*/header_->GetRewindOffset(), /*end=*/file_size_, file_size_);
1253 }
1254 }
1255
1256 } // namespace lib
1257 } // namespace icing
1258
1259 #endif // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
1260