xref: /aosp_15_r20/external/icing/icing/file/file-backed-proto.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A simple file-backed proto with an in-memory cache.
16 // WARNING: Only use this for small protos. Files storing larger protos can
17 // benefit from more sophisticated strategies like chunked reads/writes,
18 // using mmap and ideally, not even using protos.
19 //
20 // TODO(b/133793579) Consider exposing a checksum mismatch to callers.
21 
22 #ifndef ICING_FILE_FILE_BACKED_PROTO_H_
23 #define ICING_FILE_FILE_BACKED_PROTO_H_
24 
25 #include <algorithm>
26 #include <cstdint>
27 #include <memory>
28 #include <string>
29 #include <string_view>
30 
31 #include "icing/text_classifier/lib3/utils/base/status.h"
32 #include "icing/text_classifier/lib3/utils/base/statusor.h"
33 #include "icing/absl_ports/canonical_errors.h"
34 #include "icing/absl_ports/mutex.h"
35 #include "icing/absl_ports/str_cat.h"
36 #include "icing/absl_ports/thread_annotations.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/legacy/core/icing-string-util.h"
39 #include "icing/util/crc32.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42 
43 namespace icing {
44 namespace lib {
45 
46 // This class is go/thread-compatible
47 template <typename ProtoT>
48 class FileBackedProto {
49  public:
50   // Header stored at the beginning of the file before the proto.
51   struct Header {
52     static constexpr int32_t kMagic = 0x726f746f;
53 
54     // Holds the magic as a quick sanity check against file corruption.
55     int32_t magic;
56 
57     // Checksum of the serialized proto, for a more thorough check against file
58     // corruption.
59     uint32_t proto_checksum;
60   };
61 
62   // Used the specified file to read older version of the proto and store
63   // newer versions of the proto.
64   //
65   // file_path : Must be a path within in a directory that already exists.
66   FileBackedProto(const Filesystem& filesystem, std::string_view file_path);
67 
68   // Reset the internal file_path for the file backed proto.
69   // Example use:
70   //   auto file_backed_proto1 = *FileBackedProto<Proto>::Create(...);
71   //   auto file_backed_google::protobuf = *FileBackedProto<Proto>::Create(...);
72   //   filesystem.SwapFiles(file1, file2);
73   //   file_backed_proto1.SetSwappedFilepath(file2);
74   //   file_backed_google::protobuf.SetSwappedFilepath(file1);
SetSwappedFilepath(std::string_view swapped_to_file_path)75   void SetSwappedFilepath(std::string_view swapped_to_file_path) {
76     file_path_ = swapped_to_file_path;
77   }
78 
79   // Computes the checksum of the proto stored in this file and returns it.
80   // RETURNS:
81   //   - the checksum of the proto or 0 if the file is empty/non-existent
82   //   - INTERNAL_ERROR if an IO error or a corruption was encountered.
83   libtextclassifier3::StatusOr<Crc32> GetChecksum() const
84       ICING_LOCKS_EXCLUDED(mutex_);
85 
86   // Returns a reference to the proto read from the file. It
87   // internally caches the read proto so that future calls are fast.
88   //
89   // NOTE: The caller does NOT get ownership of the object returned and
90   // the returned object is only valid till a new version of the proto is
91   // written to the file.
92   //
93   // Returns NOT_FOUND if the file was empty or never written to.
94   // Returns INTERNAL_ERROR if an IO error or a corruption was encountered.
95   libtextclassifier3::StatusOr<const ProtoT*> Read() const
96       ICING_LOCKS_EXCLUDED(mutex_);
97 
98   // Writes the new version of the proto provided through to disk.
99   // Successful Write() invalidates any previously read version of the proto.
100   //
101   // Returns INTERNAL_ERROR if any IO error is encountered and will NOT
102   // invalidate any previously read versions of the proto.
103   //
104   // TODO(cassiewang) The implementation today loses old data if Write() fails.
105   // We should write to a tmp file first and rename the file to fix this.
106   // TODO(cassiewang) Change to Write(ProtoT&& proto)
107   libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto)
108       ICING_LOCKS_EXCLUDED(mutex_);
109 
110   // Disallow copy and assign.
111   FileBackedProto(const FileBackedProto&) = delete;
112   FileBackedProto& operator=(const FileBackedProto&) = delete;
113 
114  private:
115   // Internal method to handle reading the proto from disk.
116   // Requires the caller to hold an exclusive lock on mutex_.
117   libtextclassifier3::StatusOr<const ProtoT*> ReadInternal() const
118       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
119 
120   // Upper bound of file-size that is supported.
121   static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024;  // 1 MiB.
122 
123   // Used to provide reader and writer locks
124   mutable absl_ports::shared_mutex mutex_;
125 
126   const Filesystem* const filesystem_;
127   std::string file_path_;
128 
129   mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_);
130 
131   mutable std::unique_ptr<Header> cached_header_ ICING_GUARDED_BY(mutex_);
132 };
133 
134 template <typename ProtoT>
135 constexpr int32_t FileBackedProto<ProtoT>::kMaxFileSize;
136 
137 template <typename ProtoT>
FileBackedProto(const Filesystem & filesystem,const std::string_view file_path)138 FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem,
139                                          const std::string_view file_path)
140     : filesystem_(&filesystem), file_path_(file_path) {}
141 
142 template <typename ProtoT>
GetChecksum()143 libtextclassifier3::StatusOr<Crc32> FileBackedProto<ProtoT>::GetChecksum()
144     const {
145   absl_ports::unique_lock l(&mutex_);
146   if (cached_proto_ == nullptr) {
147     auto read_status = ReadInternal();
148     if (!read_status.ok()) {
149       if (absl_ports::IsNotFound(read_status.status())) {
150         // File doesn't exist. So simply return 0.
151         return Crc32();
152       }
153       return read_status.status();
154     }
155   }
156   return Crc32(cached_header_->proto_checksum);
157 }
158 
159 template <typename ProtoT>
Read()160 libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
161     const {
162   ICING_VLOG(1) << "Reading proto from file: " << file_path_;
163 
164   absl_ports::unique_lock l(&mutex_);
165 
166   return ReadInternal();
167 }
168 
169 template <typename ProtoT>
170 libtextclassifier3::StatusOr<const ProtoT*>
ReadInternal()171 FileBackedProto<ProtoT>::ReadInternal() const {
172   // Return cached proto if we've already read from disk.
173   if (cached_proto_ != nullptr) {
174     ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_;
175     return cached_proto_.get();
176   }
177 
178   int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
179   if (file_size == Filesystem::kBadFileSize || file_size == 0) {
180     return absl_ports::NotFoundError(
181         absl_ports::StrCat("Missing file: ", file_path_));
182   }
183 
184   if (file_size > kMaxFileSize) {
185     return absl_ports::InternalError(absl_ports::StrCat(
186         "File larger than expected, couldn't read: ", file_path_));
187   }
188 
189   ScopedFd fd(filesystem_->OpenForRead(file_path_.c_str()));
190   if (!fd.is_valid()) {
191     return absl_ports::InternalError(
192         absl_ports::StrCat("Unable to open file for read: ", file_path_));
193   }
194 
195   ICING_VLOG(1) << "Loading proto from  file: " << file_path_
196                 << " of size: " << file_size;
197 
198   Header header;
199   if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), /*offset=*/0)) {
200     return absl_ports::InternalError(
201         absl_ports::StrCat("Unable to read header of: ", file_path_));
202   }
203 
204   if (header.magic != Header::kMagic) {
205     return absl_ports::InternalError(
206         absl_ports::StrCat("Invalid header kMagic for: ", file_path_));
207   }
208 
209   int proto_size = file_size - sizeof(Header);
210   auto buffer = std::make_unique<uint8_t[]>(proto_size);
211   if (!filesystem_->PRead(fd.get(), buffer.get(), proto_size,
212                           /*offset=*/sizeof(Header))) {
213     return absl_ports::InternalError(
214         absl_ports::StrCat("File read failed: ", file_path_));
215   }
216 
217   std::string_view buffer_str(reinterpret_cast<const char*>(buffer.get()),
218                               proto_size);
219   Crc32 crc;
220   crc.Append(buffer_str);
221   if (header.proto_checksum != crc.Get()) {
222     return absl_ports::InternalError(
223         absl_ports::StrCat("Checksum of file does not match: ", file_path_));
224   }
225 
226   auto proto = std::make_unique<ProtoT>();
227   if (!proto->ParseFromArray(buffer.get(), proto_size)) {
228     return absl_ports::InternalError(
229         absl_ports::StrCat("Proto parse failed. File corrupted: ", file_path_));
230   }
231 
232   ICING_VLOG(1) << "Successfully read proto from file: " << file_path_;
233   cached_proto_ = std::move(proto);
234   cached_header_ = std::make_unique<Header>(std::move(header));
235   return cached_proto_.get();
236 }
237 
238 template <typename ProtoT>
Write(std::unique_ptr<ProtoT> new_proto)239 libtextclassifier3::Status FileBackedProto<ProtoT>::Write(
240     std::unique_ptr<ProtoT> new_proto) {
241   ICING_VLOG(1) << "Writing proto to file: " << file_path_;
242 
243   absl_ports::unique_lock l(&mutex_);
244 
245   const std::string new_proto_str = new_proto->SerializeAsString();
246   if (new_proto_str.size() >= kMaxFileSize) {
247     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
248         "New proto too large. size: %d; limit: %d.",
249         static_cast<int>(new_proto_str.size()), kMaxFileSize));
250   }
251 
252   if (cached_proto_ != nullptr &&
253       cached_proto_->SerializeAsString() == new_proto_str) {
254     ICING_VLOG(1) << "Skip writing proto to file as contents are identical: "
255                   << file_path_;
256     return libtextclassifier3::Status::OK;
257   }
258 
259   ScopedFd fd(filesystem_->OpenForWrite(file_path_.c_str()));
260   if (!fd.is_valid()) {
261     return absl_ports::InternalError(
262         absl_ports::StrCat("Unable to open file for write: ", file_path_));
263   }
264 
265   if (!filesystem_->Truncate(fd.get(), 0)) {
266     return absl_ports::InternalError(
267         absl_ports::StrCat("Failed to truncate file: ", file_path_));
268   }
269 
270   Header header;
271   header.magic = Header::kMagic;
272 
273   Crc32 crc;
274   crc.Append(new_proto_str);
275   header.proto_checksum = crc.Get();
276   if (!filesystem_->Write(fd.get(), &header, sizeof(Header))) {
277     return absl_ports::InternalError(
278         absl_ports::StrCat("Failed to write header to file: ", file_path_));
279   }
280 
281   if (!filesystem_->Write(fd.get(), new_proto_str.data(),
282                           new_proto_str.size())) {
283     return absl_ports::InternalError(
284         absl_ports::StrCat("Failed to write proto to file: ", file_path_));
285   }
286 
287   if (!filesystem_->DataSync(fd.get())) {
288     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
289         "Failed to sync file; filename: %s; content_size: %d ",
290         file_path_.c_str(), static_cast<int>(new_proto_str.size())));
291   }
292 
293   ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_;
294   cached_proto_ = std::move(new_proto);
295   cached_header_ = std::make_unique<Header>(std::move(header));
296   return libtextclassifier3::Status::OK;
297 }
298 
299 }  // namespace lib
300 }  // namespace icing
301 
302 #endif  // ICING_FILE_FILE_BACKED_PROTO_H_
303