1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // A simple file-backed proto with an in-memory cache.
16 // WARNING: Only use this for small protos. Files storing larger protos can
17 // benefit from more sophisticated strategies like chunked reads/writes,
18 // using mmap and ideally, not even using protos.
19 //
20 // TODO(b/133793579) Consider exposing a checksum mismatch to callers.
21
22 #ifndef ICING_FILE_FILE_BACKED_PROTO_H_
23 #define ICING_FILE_FILE_BACKED_PROTO_H_
24
25 #include <algorithm>
26 #include <cstdint>
27 #include <memory>
28 #include <string>
29 #include <string_view>
30
31 #include "icing/text_classifier/lib3/utils/base/status.h"
32 #include "icing/text_classifier/lib3/utils/base/statusor.h"
33 #include "icing/absl_ports/canonical_errors.h"
34 #include "icing/absl_ports/mutex.h"
35 #include "icing/absl_ports/str_cat.h"
36 #include "icing/absl_ports/thread_annotations.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/legacy/core/icing-string-util.h"
39 #include "icing/util/crc32.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42
43 namespace icing {
44 namespace lib {
45
46 // This class is go/thread-compatible
47 template <typename ProtoT>
48 class FileBackedProto {
49 public:
50 // Header stored at the beginning of the file before the proto.
51 struct Header {
52 static constexpr int32_t kMagic = 0x726f746f;
53
54 // Holds the magic as a quick sanity check against file corruption.
55 int32_t magic;
56
57 // Checksum of the serialized proto, for a more thorough check against file
58 // corruption.
59 uint32_t proto_checksum;
60 };
61
62 // Used the specified file to read older version of the proto and store
63 // newer versions of the proto.
64 //
65 // file_path : Must be a path within in a directory that already exists.
66 FileBackedProto(const Filesystem& filesystem, std::string_view file_path);
67
68 // Reset the internal file_path for the file backed proto.
69 // Example use:
70 // auto file_backed_proto1 = *FileBackedProto<Proto>::Create(...);
71 // auto file_backed_google::protobuf = *FileBackedProto<Proto>::Create(...);
72 // filesystem.SwapFiles(file1, file2);
73 // file_backed_proto1.SetSwappedFilepath(file2);
74 // file_backed_google::protobuf.SetSwappedFilepath(file1);
SetSwappedFilepath(std::string_view swapped_to_file_path)75 void SetSwappedFilepath(std::string_view swapped_to_file_path) {
76 file_path_ = swapped_to_file_path;
77 }
78
79 // Computes the checksum of the proto stored in this file and returns it.
80 // RETURNS:
81 // - the checksum of the proto or 0 if the file is empty/non-existent
82 // - INTERNAL_ERROR if an IO error or a corruption was encountered.
83 libtextclassifier3::StatusOr<Crc32> GetChecksum() const
84 ICING_LOCKS_EXCLUDED(mutex_);
85
86 // Returns a reference to the proto read from the file. It
87 // internally caches the read proto so that future calls are fast.
88 //
89 // NOTE: The caller does NOT get ownership of the object returned and
90 // the returned object is only valid till a new version of the proto is
91 // written to the file.
92 //
93 // Returns NOT_FOUND if the file was empty or never written to.
94 // Returns INTERNAL_ERROR if an IO error or a corruption was encountered.
95 libtextclassifier3::StatusOr<const ProtoT*> Read() const
96 ICING_LOCKS_EXCLUDED(mutex_);
97
98 // Writes the new version of the proto provided through to disk.
99 // Successful Write() invalidates any previously read version of the proto.
100 //
101 // Returns INTERNAL_ERROR if any IO error is encountered and will NOT
102 // invalidate any previously read versions of the proto.
103 //
104 // TODO(cassiewang) The implementation today loses old data if Write() fails.
105 // We should write to a tmp file first and rename the file to fix this.
106 // TODO(cassiewang) Change to Write(ProtoT&& proto)
107 libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto)
108 ICING_LOCKS_EXCLUDED(mutex_);
109
110 // Disallow copy and assign.
111 FileBackedProto(const FileBackedProto&) = delete;
112 FileBackedProto& operator=(const FileBackedProto&) = delete;
113
114 private:
115 // Internal method to handle reading the proto from disk.
116 // Requires the caller to hold an exclusive lock on mutex_.
117 libtextclassifier3::StatusOr<const ProtoT*> ReadInternal() const
118 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
119
120 // Upper bound of file-size that is supported.
121 static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024; // 1 MiB.
122
123 // Used to provide reader and writer locks
124 mutable absl_ports::shared_mutex mutex_;
125
126 const Filesystem* const filesystem_;
127 std::string file_path_;
128
129 mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_);
130
131 mutable std::unique_ptr<Header> cached_header_ ICING_GUARDED_BY(mutex_);
132 };
133
134 template <typename ProtoT>
135 constexpr int32_t FileBackedProto<ProtoT>::kMaxFileSize;
136
137 template <typename ProtoT>
FileBackedProto(const Filesystem & filesystem,const std::string_view file_path)138 FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem,
139 const std::string_view file_path)
140 : filesystem_(&filesystem), file_path_(file_path) {}
141
142 template <typename ProtoT>
GetChecksum()143 libtextclassifier3::StatusOr<Crc32> FileBackedProto<ProtoT>::GetChecksum()
144 const {
145 absl_ports::unique_lock l(&mutex_);
146 if (cached_proto_ == nullptr) {
147 auto read_status = ReadInternal();
148 if (!read_status.ok()) {
149 if (absl_ports::IsNotFound(read_status.status())) {
150 // File doesn't exist. So simply return 0.
151 return Crc32();
152 }
153 return read_status.status();
154 }
155 }
156 return Crc32(cached_header_->proto_checksum);
157 }
158
159 template <typename ProtoT>
Read()160 libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
161 const {
162 ICING_VLOG(1) << "Reading proto from file: " << file_path_;
163
164 absl_ports::unique_lock l(&mutex_);
165
166 return ReadInternal();
167 }
168
169 template <typename ProtoT>
170 libtextclassifier3::StatusOr<const ProtoT*>
ReadInternal()171 FileBackedProto<ProtoT>::ReadInternal() const {
172 // Return cached proto if we've already read from disk.
173 if (cached_proto_ != nullptr) {
174 ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_;
175 return cached_proto_.get();
176 }
177
178 int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
179 if (file_size == Filesystem::kBadFileSize || file_size == 0) {
180 return absl_ports::NotFoundError(
181 absl_ports::StrCat("Missing file: ", file_path_));
182 }
183
184 if (file_size > kMaxFileSize) {
185 return absl_ports::InternalError(absl_ports::StrCat(
186 "File larger than expected, couldn't read: ", file_path_));
187 }
188
189 ScopedFd fd(filesystem_->OpenForRead(file_path_.c_str()));
190 if (!fd.is_valid()) {
191 return absl_ports::InternalError(
192 absl_ports::StrCat("Unable to open file for read: ", file_path_));
193 }
194
195 ICING_VLOG(1) << "Loading proto from file: " << file_path_
196 << " of size: " << file_size;
197
198 Header header;
199 if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), /*offset=*/0)) {
200 return absl_ports::InternalError(
201 absl_ports::StrCat("Unable to read header of: ", file_path_));
202 }
203
204 if (header.magic != Header::kMagic) {
205 return absl_ports::InternalError(
206 absl_ports::StrCat("Invalid header kMagic for: ", file_path_));
207 }
208
209 int proto_size = file_size - sizeof(Header);
210 auto buffer = std::make_unique<uint8_t[]>(proto_size);
211 if (!filesystem_->PRead(fd.get(), buffer.get(), proto_size,
212 /*offset=*/sizeof(Header))) {
213 return absl_ports::InternalError(
214 absl_ports::StrCat("File read failed: ", file_path_));
215 }
216
217 std::string_view buffer_str(reinterpret_cast<const char*>(buffer.get()),
218 proto_size);
219 Crc32 crc;
220 crc.Append(buffer_str);
221 if (header.proto_checksum != crc.Get()) {
222 return absl_ports::InternalError(
223 absl_ports::StrCat("Checksum of file does not match: ", file_path_));
224 }
225
226 auto proto = std::make_unique<ProtoT>();
227 if (!proto->ParseFromArray(buffer.get(), proto_size)) {
228 return absl_ports::InternalError(
229 absl_ports::StrCat("Proto parse failed. File corrupted: ", file_path_));
230 }
231
232 ICING_VLOG(1) << "Successfully read proto from file: " << file_path_;
233 cached_proto_ = std::move(proto);
234 cached_header_ = std::make_unique<Header>(std::move(header));
235 return cached_proto_.get();
236 }
237
238 template <typename ProtoT>
Write(std::unique_ptr<ProtoT> new_proto)239 libtextclassifier3::Status FileBackedProto<ProtoT>::Write(
240 std::unique_ptr<ProtoT> new_proto) {
241 ICING_VLOG(1) << "Writing proto to file: " << file_path_;
242
243 absl_ports::unique_lock l(&mutex_);
244
245 const std::string new_proto_str = new_proto->SerializeAsString();
246 if (new_proto_str.size() >= kMaxFileSize) {
247 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
248 "New proto too large. size: %d; limit: %d.",
249 static_cast<int>(new_proto_str.size()), kMaxFileSize));
250 }
251
252 if (cached_proto_ != nullptr &&
253 cached_proto_->SerializeAsString() == new_proto_str) {
254 ICING_VLOG(1) << "Skip writing proto to file as contents are identical: "
255 << file_path_;
256 return libtextclassifier3::Status::OK;
257 }
258
259 ScopedFd fd(filesystem_->OpenForWrite(file_path_.c_str()));
260 if (!fd.is_valid()) {
261 return absl_ports::InternalError(
262 absl_ports::StrCat("Unable to open file for write: ", file_path_));
263 }
264
265 if (!filesystem_->Truncate(fd.get(), 0)) {
266 return absl_ports::InternalError(
267 absl_ports::StrCat("Failed to truncate file: ", file_path_));
268 }
269
270 Header header;
271 header.magic = Header::kMagic;
272
273 Crc32 crc;
274 crc.Append(new_proto_str);
275 header.proto_checksum = crc.Get();
276 if (!filesystem_->Write(fd.get(), &header, sizeof(Header))) {
277 return absl_ports::InternalError(
278 absl_ports::StrCat("Failed to write header to file: ", file_path_));
279 }
280
281 if (!filesystem_->Write(fd.get(), new_proto_str.data(),
282 new_proto_str.size())) {
283 return absl_ports::InternalError(
284 absl_ports::StrCat("Failed to write proto to file: ", file_path_));
285 }
286
287 if (!filesystem_->DataSync(fd.get())) {
288 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
289 "Failed to sync file; filename: %s; content_size: %d ",
290 file_path_.c_str(), static_cast<int>(new_proto_str.size())));
291 }
292
293 ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_;
294 cached_proto_ = std::move(new_proto);
295 cached_header_ = std::make_unique<Header>(std::move(header));
296 return libtextclassifier3::Status::OK;
297 }
298
299 } // namespace lib
300 } // namespace icing
301
302 #endif // ICING_FILE_FILE_BACKED_PROTO_H_
303