xref: /aosp_15_r20/external/icing/icing/file/persistent-storage.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_FILE_PERSISTENT_STORAGE_H_
16 #define ICING_FILE_PERSISTENT_STORAGE_H_
17 
#include <cstdint>
#include <string>
#include <string_view>
#include <utility>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/util/crc32.h"
#include "icing/util/status-macros.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 // PersistentStorage: an abstract class for all persistent data structures.
34 // - It provides some common persistent file methods, e.g. PersistToDisk.
35 // - It encapsulates most of the checksum handling logics (including update and
36 //   validation).
37 //
38 // Terminology:
39 // - Crcs: checksum section
40 // - Info: (custom) information for derived class
41 // - Metadata: Crcs + Info
42 //
43 // Usually a persistent data structure will have its own custom Info and
44 // storages (single or composite storages) definition. To create a new
45 // persistent data structure via PersistentStorage:
46 // - Decide what type the working path is (single file or directory). See
47 //   working_path_ and WorkingPathType for more details.
48 // - Create a new class that inherits PersistentStorage:
49 //   - Declare custom Info and design the metadata section layout.
50 //     Usually the layout is <Crcs><Info>, and there are 2 common ways to
51 //     manage metadata section:
52 //     - Have a separate file for metadata. In this case, the new persistent
53 //       data structure contains multiple files, so working path should be used
54 //       as directory path and multiple files will be stored under it. Example:
55 //       PersistentHashMap.
56 //     - Have a single file for both metadata and storage data. In this case,
57 //       the file layout should be <Crcs><Info><Storage Data>, and
58 //       working path should be used as file path. Example: FileBackedVector.
59 //   - Handle working path file/directory creation and deletion.
60 //     PersistentStorage only provides static Discard() method to use. The
61 //     derived class should implement other logics, e.g. working path (file
62 //     /directory) creation, check condition to discard working path and start
63 //     over new file(s).
64 //   - Implement all pure virtual methods:
65 //     - PersistStoragesToDisk: persist all (composite) storages. In general,
66 //       the implementation will be calling PersistToDisk for all composite
67 //       storages.
68 //     - PersistMetadataToDisk: persist metadata, including Crcs and Info.
69 //       - If the derived class maintains a concrete Crc and (custom) Info
70 //         instance, then it should perform write/pwrite into the metadata
71 //         section.
72 //       - If the derived class uses memory-mapped region directly for metadata,
73 //         then it should call MemoryMappedFile::PersistToDisk.
74 //       - See crcs() for more details.
75 //     - GetInfoChecksum: compute the checksum for custom Info.
76 //     - GetStoragesChecksum: compute the (combined) checksum for all
77 //       (composite) storages. In general, the implementation will be calling
78 //       GetChecksum for all composite storages and XOR all checksums.
79 //     - UpdateStoragesChecksum: update the (combined) checksum for all
80 //       (composite) storages. In general, the implementation will be calling
81 //       UpdateChecksum for all composite storages and XOR all checksums.
82 //     - crcs(): provide the reference for PersistentStorage to write checksums.
83 //       The derived class can either maintain a concrete Crcs instance, or
84 //       reinterpret_cast the memory-mapped region to Crcs reference. Either
85 //       choice is fine as long as PersistMetadataToDisk flushes it to disk
86 //       correctly.
87 // - Call either InitializeNewStorage or InitializeExistingStorage when creating
88 //   and initializing an instance, depending on initializing new storage or from
89 //   existing file(s).
90 class PersistentStorage {
91  public:
92   enum class WorkingPathType {
93     kSingleFile,
94     kDirectory,
95     kDummy,
96   };
97 
98   // Crcs and Info will be written into the metadata section. Info is defined by
99   // the actual implementation of each persistent storage. Usually the Metadata
100   // layout is: <Crcs><Info>
101   struct Crcs {
102     struct ComponentCrcs {
103       uint32_t info_crc;
104       uint32_t storages_crc;
105 
106       bool operator==(const ComponentCrcs& other) const {
107         return info_crc == other.info_crc && storages_crc == other.storages_crc;
108       }
109 
GetChecksumCrcs::ComponentCrcs110       Crc32 GetChecksum() const {
111         return Crc32(std::string_view(reinterpret_cast<const char*>(this),
112                                       sizeof(ComponentCrcs)));
113       }
114     } __attribute__((packed));
115 
116     bool operator==(const Crcs& other) const {
117       return all_crc == other.all_crc && component_crcs == other.component_crcs;
118     }
119 
120     uint32_t all_crc;
121     ComponentCrcs component_crcs;
122   } __attribute__((packed));
123   static_assert(sizeof(Crcs) == 12, "");
124 
125   // Deletes working_path according to its type.
126   //
127   // Returns:
128   //   - OK on success
129   //   - INTERNAL_ERROR on I/O error
130   //   - INVALID_ARGUMENT_ERROR if working_path_type is unknown type
131   static libtextclassifier3::Status Discard(const Filesystem& filesystem,
132                                             const std::string& working_path,
133                                             WorkingPathType working_path_type);
134 
135   virtual ~PersistentStorage() = default;
136 
137   // Initializes new persistent storage. It computes the initial checksums and
138   // writes into the metadata file.
139   //
140   // Note: either InitializeNewStorage or InitializeExistingStorage should be
141   // invoked after creating a PersistentStorage instance before using, otherwise
142   // an uninitialized instance will fail to use persistent storage features,
143   // e.g. PersistToDisk, UpdateChecksums.
144   //
145   // Returns:
146   //   - OK on success or already initialized
147   //   - Any errors from GetInfoChecksum, UpdateStoragesChecksum, depending on
148   //     actual implementation
InitializeNewStorage()149   libtextclassifier3::Status InitializeNewStorage() {
150     if (is_initialized_) {
151       return libtextclassifier3::Status::OK;
152     }
153 
154     ICING_RETURN_IF_ERROR(UpdateChecksumsInternal());
155     ICING_RETURN_IF_ERROR(PersistStoragesToDisk());
156     ICING_RETURN_IF_ERROR(PersistMetadataToDisk());
157 
158     is_initialized_ = true;
159     return libtextclassifier3::Status::OK;
160   }
161 
162   // Initializes persistent storage from existing file(s).
163   //
164   // It enforces the following check(s):
165   // - Validate checksums.
166   //
167   // Note: either InitializeNewStorage or InitializeExistingStorage should be
168   // invoked after creating a PersistentStorage instance before using.
169   //
170   // Returns:
171   //   - OK on success or already initialized
172   //   - FAILED_PRECONDITION_ERROR if checksum validation fails.
173   //   - Any errors from GetInfoChecksum, GetStoragesChecksum, depending on
174   //     actual implementation
InitializeExistingStorage()175   libtextclassifier3::Status InitializeExistingStorage() {
176     if (is_initialized_) {
177       return libtextclassifier3::Status::OK;
178     }
179 
180     ICING_RETURN_IF_ERROR(ValidateChecksums());
181 
182     is_initialized_ = true;
183     return libtextclassifier3::Status::OK;
184   }
185 
186   // Flushes contents to underlying files.
187   // 1) Flushes storages.
188   // 2) Updates all checksums by new data.
189   // 3) Flushes metadata.
190   //
191   // Returns:
192   //   - OK on success
193   //   - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized
194   //   - Any errors from PersistStoragesToDisk, UpdateChecksums,
195   //     PersistMetadataToDisk, depending on actual implementation
PersistToDisk()196   libtextclassifier3::Status PersistToDisk() {
197     if (!is_initialized_) {
198       return absl_ports::FailedPreconditionError(absl_ports::StrCat(
199           "PersistentStorage ", working_path_, " not initialized"));
200     }
201 
202     ICING_RETURN_IF_ERROR(UpdateChecksumsInternal());
203     ICING_RETURN_IF_ERROR(PersistStoragesToDisk());
204     ICING_RETURN_IF_ERROR(PersistMetadataToDisk());
205     return libtextclassifier3::Status::OK;
206   }
207 
208   // Updates checksums of all components and returns the overall crc (all_crc)
209   // of the persistent storage.
210   //
211   // Returns:
212   //   - Overall crc of the persistent storage on success
213   //   - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized
214   //   - Any errors from UpdateInfoChecksum, UpdateStoragesChecksum, depending
215   //     on actual implementation
UpdateChecksums()216   libtextclassifier3::StatusOr<Crc32> UpdateChecksums() {
217     if (!is_initialized_) {
218       return absl_ports::FailedPreconditionError(absl_ports::StrCat(
219           "PersistentStorage ", working_path_, " not initialized"));
220     }
221 
222     ICING_ASSIGN_OR_RETURN(Crc32 crc, UpdateChecksumsInternal());
223     ICING_RETURN_IF_ERROR(WriteMetadata());
224     return crc;
225   }
226 
227   // Calculates and returns the overall crc (all_crc) of the persistent storage.
228   //
229   // Returns:
230   //   - Overall crc of the persistent storage on success
231   //   - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized
232   //   - Any errors from GetInfoChecksum, GetStoragesChecksum, depending on
233   //     actual implementation
GetChecksum()234   libtextclassifier3::StatusOr<Crc32> GetChecksum() const {
235     if (!is_initialized_) {
236       return absl_ports::FailedPreconditionError(absl_ports::StrCat(
237           "PersistentStorage ", working_path_, " not initialized"));
238     }
239 
240     ICING_ASSIGN_OR_RETURN(Crc32 info_crc, GetInfoChecksum());
241     ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, GetStoragesChecksum());
242     Crcs::ComponentCrcs crcs = {info_crc.Get(), storages_crc.Get()};
243     return crcs.GetChecksum();
244   }
245 
246  protected:
PersistentStorage(const Filesystem & filesystem,std::string working_path,WorkingPathType working_path_type)247   explicit PersistentStorage(const Filesystem& filesystem,
248                              std::string working_path,
249                              WorkingPathType working_path_type)
250       : filesystem_(filesystem),
251         working_path_(std::move(working_path)),
252         working_path_type_(working_path_type),
253         is_initialized_(false) {}
254 
255   // Flushes contents of metadata. The implementation should flush Crcs and Info
256   // correctly, depending on whether they're using memory-mapped regions or
257   // concrete instances in the derived class.
258   //
259   // It is valid to call this function even when is_initialized_ is false.
260   //
261   // Returns:
262   //   - OK on success
263   //   - Any other errors, depending on actual implementation
264   virtual libtextclassifier3::Status PersistMetadataToDisk() = 0;
265 
266   // Flushes contents of all storages to underlying files.
267   //
268   // It is valid to call this function even when is_initialized_ is false.
269   //
270   // Returns:
271   //   - OK on success
272   //   - Any other errors, depending on actual implementation
273   virtual libtextclassifier3::Status PersistStoragesToDisk() = 0;
274 
275   // Writes the contents of the metadata, if necessary. Unlike
276   // PersistMetadataToDisk this method does not explicitly flush the metadata to
277   // disk.
278   //
279   // Returns:
280   //   - OK on success
281   //   - Any other errors, depending on actual implementation
282   virtual libtextclassifier3::Status WriteMetadata() = 0;
283 
284   // Computes and updates all storages checksums and returns a combined checksum
285   // of all storages. If there are multiple storages, usually we XOR their
286   // checksums together to a single checksum.
287   //
288   // This function will be mainly called by UpdateChecksums.
289   //
290   // It is valid to call this function even when is_initialized_ is false.
291   //
292   // Returns:
293   //   - Crc of all storages on success
294   //   - Any other errors from depending on actual implementation
295   virtual libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() = 0;
296 
297   // Computes and returns Info checksum.
298   //
299   // This function will be mainly called by GetChecksum.
300   //
301   // It is valid to call this function even when is_initialized_ is false.
302   //
303   // Returns:
304   //   - Crc of the Info on success
305   //   - Any other errors, depending on actual implementation
306   virtual libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const = 0;
307 
308   // Computes and returns all storages checksum. If there are multiple storages,
309   // usually we XOR their checksums together to a single checksum.
310   //
311   // This function will be mainly called by GetChecksum.
312   //
313   // It is valid to call this function even when is_initialized_ is false.
314   //
315   // Returns:
316   //   - Crc of all storages on success
317   //   - Any other errors, depending on actual implementation
318   virtual libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const = 0;
319 
320   // Returns the Crcs instance reference. The derived class can either own a
321   // concrete Crcs instance, or reinterpret_cast the memory-mapped region to
322   // Crcs reference. PersistMetadataToDisk should flush it to disk correctly.
323   virtual Crcs& crcs() = 0;
324   virtual const Crcs& crcs() const = 0;
325 
326   const Filesystem& filesystem_;  // Does not own
327   // Path to the storage. It can be a single file path or a directory path
328   // depending on the implementation of the derived class.
329   //
330   // Note that the derived storage class will take full ownership and of
331   // working_path_, including creation/deletion. It is the caller's
332   // responsibility to specify correct working path and avoid mixing different
333   // persistent storages together under the same path. Also the caller has the
334   // ownership for the parent directory of working_path_, and it is responsible
335   // for parent directory creation/deletion.
336   std::string working_path_;
337   WorkingPathType working_path_type_;
338 
339   bool is_initialized_;
340 
341  private:
342   // Updates checksums of all components and returns the overall crc (all_crc)
343   // of the persistent storage. Different from UpdateChecksums, it won't check
344   // if PersistentStorage is initialized or not.
345   //
346   // Returns:
347   //   - Overall crc of the persistent storage on success
348   //   - Any errors from GetInfoChecksum, UpdateStoragesChecksum, depending on
349   //     actual implementation
UpdateChecksumsInternal()350   libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal() {
351     Crcs& crcs_ref = crcs();
352     // Compute and update storages + info checksums.
353     ICING_ASSIGN_OR_RETURN(Crc32 info_crc, GetInfoChecksum());
354     ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, UpdateStoragesChecksum());
355     if (crcs_ref.component_crcs.info_crc == info_crc.Get() &&
356         crcs_ref.component_crcs.storages_crc == storages_crc.Get()) {
357       // If info and storages crc haven't changed, then we don't have to update
358       // checksums.
359       return Crc32(crcs_ref.all_crc);
360     }
361 
362     crcs_ref.component_crcs.info_crc = info_crc.Get();
363     crcs_ref.component_crcs.storages_crc = storages_crc.Get();
364 
365     // Finally compute and update overall checksum.
366     Crc32 all_crc = crcs_ref.component_crcs.GetChecksum();
367     crcs_ref.all_crc = all_crc.Get();
368     return all_crc;
369   }
370 
371   // Validates all checksums of the persistent storage.
372   //
373   // Returns:
374   //   - OK on success
375   //   - FAILED_PRECONDITION_ERROR if any checksum is incorrect.
376   //   - Any errors from GetInfoChecksum, GetStoragesChecksum, depending on
377   //     actual implementation
ValidateChecksums()378   libtextclassifier3::Status ValidateChecksums() const {
379     const Crcs& crcs_ref = crcs();
380     if (crcs_ref.all_crc != crcs_ref.component_crcs.GetChecksum().Get()) {
381       return absl_ports::FailedPreconditionError("Invalid all crc");
382     }
383 
384     ICING_ASSIGN_OR_RETURN(Crc32 info_crc, GetInfoChecksum());
385     if (crcs_ref.component_crcs.info_crc != info_crc.Get()) {
386       return absl_ports::FailedPreconditionError("Invalid info crc");
387     }
388 
389     ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, GetStoragesChecksum());
390     if (crcs_ref.component_crcs.storages_crc != storages_crc.Get()) {
391       return absl_ports::FailedPreconditionError("Invalid storages crc");
392     }
393     return libtextclassifier3::Status::OK;
394   }
395 };
396 
397 }  // namespace lib
398 }  // namespace icing
399 
400 #endif  // ICING_FILE_PERSISTENT_STORAGE_H_
401