// xref: /aosp_15_r20/external/icing/icing/store/blob-store.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_BLOB_STORE_H_
16 #define ICING_STORE_BLOB_STORE_H_
17 
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/filesystem.h"
#include "icing/file/portable-file-backed-proto-log.h"
#include "icing/proto/blob.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/util/clock.h"
32 
33 namespace icing {
34 namespace lib {
35 
36 // Provides storage interfaces for Blobs.
37 //
38 // The BlobStore is responsible for storing blobs in a directory and for
39 // ensuring that the directory is in a consistent state.
40 //
41 // A blob is a file that is stored in the BlobStore. A blob is identified by
42 // a blob handle, which is a unique identifier for the blob.
43 //
44 // Any blob that is written to the BlobStore must be committed before it can be
45 // read. A blob can be committed only once. After a blob is committed, it is
46 // not allowed to be updated.
47 //
48 // The BlobStore is not thread-safe.
49 class BlobStore {
50  public:
51   // Builds a string representation of a blob handle.
52   // The string is used as the key in the key mapper.
53   static std::string BuildBlobHandleStr(
54       const PropertyProto::BlobHandleProto& blob_handle);
55 
56   // Factory function to create a BlobStore instance. The base directory is
57   // used to persist blobs. If a blob store was previously created with
58   // this directory, it will reload the files saved by the last instance.
59   //
60   // The callers must create the base directory before calling this function.
61   //
62   // Returns:
63   //   A BlobStore on success
64   //   FAILED_PRECONDITION_ERROR on any null pointer input
65   //   INTERNAL_ERROR on I/O error
66   static libtextclassifier3::StatusOr<BlobStore> Create(
67       const Filesystem* filesystem, std::string base_dir, const Clock* clock,
68       int64_t orphan_blob_time_to_live_ms, int32_t compression_level);
69 
70   // Gets or creates a file for write only purpose for the given blob handle.
71   // To mark the blob is completed written, CommitBlob must be called. Once
72   // CommitBlob is called, the blob is sealed and rewrite is not allowed.
73   //
74   // It is the user's responsibility to close the file descriptor after writing
75   // is done and should operate on the file descriptor after commit or remove
76   // it.
77   //
78   // Returns:
79   //   File descriptor (writable) on success
80   //   INVALID_ARGUMENT_ERROR on invalid blob handle
81   //   FAILED_PRECONDITION_ERROR on blob is already opened for write
82   //   ALREADY_EXISTS_ERROR if the blob has already been committed
83   //   INTERNAL_ERROR on IO error
84   libtextclassifier3::StatusOr<int> OpenWrite(
85       const PropertyProto::BlobHandleProto& blob_handle);
86 
87   // Removes a blob file and blob handle from the blob store.
88   //
89   // This will remove the blob on any state. No matter it's committed or not or
90   // it has reference document links or not.
91   //
92   // Returns:
93   //   INVALID_ARGUMENT_ERROR on invalid blob handle
94   //   NOT_FOUND_ERROR on blob is not found
95   //   INTERNAL_ERROR on IO error
96   libtextclassifier3::Status RemoveBlob(
97       const PropertyProto::BlobHandleProto& blob_handle);
98 
99   // Gets a file for read only purpose for the given blob handle.
100   // Will only succeed for blobs that were committed by calling CommitBlob.
101   //
102   // It is the user's responsibility to close the file descriptor after reading.
103   //
104   // Returns:
105   //   File descriptor (read only) on success
106   //   INVALID_ARGUMENT_ERROR on invalid blob handle
107   //   NOT_FOUND_ERROR on blob is not found or is not committed
108   libtextclassifier3::StatusOr<int> OpenRead(
109       const PropertyProto::BlobHandleProto& blob_handle);
110 
111   // Commits the given blob, if the blob is finished wrote via OpenWrite.
112   // Before the blob is committed, it is not visible to any reader via OpenRead.
113   // After the blob is committed, it is not allowed to rewrite or update the
114   // content.
115   //
116   // Returns:
117   //   OK on the blob is successfully committed.
118   //   ALREADY_EXISTS_ERROR on the blob is already committed, this is no op.
119   //   INVALID_ARGUMENT_ERROR on invalid blob handle or digest is mismatch with
120   //                        file content.
121   //   NOT_FOUND_ERROR on blob is not found.
122   libtextclassifier3::Status CommitBlob(
123       const PropertyProto::BlobHandleProto& blob_handle);
124 
125   // Persists the blobs to disk.
126   libtextclassifier3::Status PersistToDisk();
127 
128   // Gets the potentially optimizable blob handles.
129   //
130   // A blob will be consider as a potentially optimizable blob if it created
131   // before the orphan_blob_time_to_live_ms. And the blob should be removed if
132   // it has no reference document links to it.
133   std::unordered_set<std::string> GetPotentiallyOptimizableBlobHandles();
134 
135   // Optimize the blob store and remove dead blob files.
136   //
137   // A blob will be consider as a dead blob and removed if it meets BOTH of
138   // following conditions
139   //  1: has no reference document links to it
140   //  2: It's mature.
141   //
142   // Returns:
143   //   OK on success
144   //   INTERNAL_ERROR on IO error
145   libtextclassifier3::Status Optimize(
146       const std::unordered_set<std::string>& dead_blob_handles);
147 
148   // Calculates the StorageInfo for the Blob Store.
149   //
150   // Returns:
151   //   Vector of NamespaceBlobStorageInfoProto contains size of each namespace.
152   //   INTERNAL_ERROR on I/O error
153   libtextclassifier3::StatusOr<std::vector<NamespaceBlobStorageInfoProto>>
154   GetStorageInfo() const;
155 
156 private:
BlobStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock,int64_t orphan_blob_time_to_live_ms,int32_t compression_level,std::unique_ptr<PortableFileBackedProtoLog<BlobInfoProto>> blob_info_log,std::unordered_map<std::string,int32_t> blob_handle_to_offset,std::unordered_set<std::string> known_file_names)157   explicit BlobStore(
158       const Filesystem* filesystem, std::string base_dir, const Clock* clock,
159       int64_t orphan_blob_time_to_live_ms, int32_t compression_level,
160       std::unique_ptr<PortableFileBackedProtoLog<BlobInfoProto>> blob_info_log,
161       std::unordered_map<std::string, int32_t> blob_handle_to_offset,
162       std::unordered_set<std::string> known_file_names)
163       : filesystem_(*filesystem),
164         base_dir_(std::move(base_dir)),
165         clock_(*clock),
166         orphan_blob_time_to_live_ms_(orphan_blob_time_to_live_ms),
167         compression_level_(compression_level),
168         blob_info_log_(std::move(blob_info_log)),
169         blob_handle_to_offset_(std::move(blob_handle_to_offset)),
170         known_file_names_(std::move(known_file_names)) {}
171 
172   libtextclassifier3::StatusOr<BlobInfoProto> GetOrCreateBlobInfo(
173       const std::string& blob_handle_str,
174       const PropertyProto::BlobHandleProto& blob_handle);
175 
176   const Filesystem& filesystem_;
177   std::string base_dir_;
178   const Clock& clock_;
179   int64_t orphan_blob_time_to_live_ms_;
180   int32_t compression_level_;
181 
182   // The ground truth blob info log file, which is used to read/write/erase
183   // BlobInfoProto.
184   std::unique_ptr<PortableFileBackedProtoLog<BlobInfoProto>> blob_info_log_;
185 
186   // The map for BlobHandle string to the offset of BlobInfoProto in the
187   // BlobInfoProto log file.
188   // The keys are the Encoded CString from BlobHandleProto.
189   std::unordered_map<std::string, int32_t> blob_handle_to_offset_;
190 
191   // The set of used file names to store blobs in the blob store.
192   std::unordered_set<std::string> known_file_names_;
193 
194   bool has_mutated_ = false;
195 };
196 
197 }  // namespace lib
198 }  // namespace icing
199 
200 #endif  // ICING_STORE_BLOB_STORE_H_
201