1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 // 5 // The representation of a DBImpl consists of a set of Versions. The 6 // newest version is called "current". Older versions may be kept 7 // around to provide a consistent view to live iterators. 8 // 9 // Each Version keeps track of a set of Table files per level. The 10 // entire set of versions is maintained in a VersionSet. 11 // 12 // Version,VersionSet are thread-compatible, but require external 13 // synchronization on all accesses. 14 15 #ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ 16 #define STORAGE_LEVELDB_DB_VERSION_SET_H_ 17 18 #include <map> 19 #include <set> 20 #include <vector> 21 22 #include "db/dbformat.h" 23 #include "db/version_edit.h" 24 #include "port/port.h" 25 #include "port/thread_annotations.h" 26 27 namespace leveldb { 28 29 namespace log { 30 class Writer; 31 } 32 33 class Compaction; 34 class Iterator; 35 class MemTable; 36 class TableBuilder; 37 class TableCache; 38 class Version; 39 class VersionSet; 40 class WritableFile; 41 42 // Return the smallest index i such that files[i]->largest >= key. 43 // Return files.size() if there is no such file. 44 // REQUIRES: "files" contains a sorted list of non-overlapping files. 45 int FindFile(const InternalKeyComparator& icmp, 46 const std::vector<FileMetaData*>& files, const Slice& key); 47 48 // Returns true iff some file in "files" overlaps the user key range 49 // [*smallest,*largest]. 50 // smallest==nullptr represents a key smaller than all keys in the DB. 51 // largest==nullptr represents a key largest than all keys in the DB. 52 // REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges 53 // in sorted order. 54 bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, 55 bool disjoint_sorted_files, 56 const std::vector<FileMetaData*>& files, 57 const Slice* smallest_user_key, 58 const Slice* largest_user_key); 59 60 class Version { 61 public: 62 // Lookup the value for key. If found, store it in *val and 63 // return OK. Else return a non-OK status. Fills *stats. 64 // REQUIRES: lock is not held 65 struct GetStats { 66 FileMetaData* seek_file; 67 int seek_file_level; 68 }; 69 70 // Append to *iters a sequence of iterators that will 71 // yield the contents of this Version when merged together. 72 // REQUIRES: This version has been saved (see VersionSet::SaveTo) 73 void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters); 74 75 Status Get(const ReadOptions&, const LookupKey& key, std::string* val, 76 GetStats* stats); 77 78 // Adds "stats" into the current state. Returns true if a new 79 // compaction may need to be triggered, false otherwise. 80 // REQUIRES: lock is held 81 bool UpdateStats(const GetStats& stats); 82 83 // Record a sample of bytes read at the specified internal key. 84 // Samples are taken approximately once every config::kReadBytesPeriod 85 // bytes. Returns true if a new compaction may need to be triggered. 86 // REQUIRES: lock is held 87 bool RecordReadSample(Slice key); 88 89 // Reference count management (so Versions do not disappear out from 90 // under live iterators) 91 void Ref(); 92 void Unref(); 93 94 void GetOverlappingInputs( 95 int level, 96 const InternalKey* begin, // nullptr means before all keys 97 const InternalKey* end, // nullptr means after all keys 98 std::vector<FileMetaData*>* inputs); 99 100 // Returns true iff some file in the specified level overlaps 101 // some part of [*smallest_user_key,*largest_user_key]. 102 // smallest_user_key==nullptr represents a key smaller than all the DB's keys. 103 // largest_user_key==nullptr represents a key largest than all the DB's keys. 104 bool OverlapInLevel(int level, const Slice* smallest_user_key, 105 const Slice* largest_user_key); 106 107 // Return the level at which we should place a new memtable compaction 108 // result that covers the range [smallest_user_key,largest_user_key]. 109 int PickLevelForMemTableOutput(const Slice& smallest_user_key, 110 const Slice& largest_user_key); 111 NumFiles(int level)112 int NumFiles(int level) const { return files_[level].size(); } 113 114 // Return a human readable string that describes this version's contents. 115 std::string DebugString() const; 116 117 private: 118 friend class Compaction; 119 friend class VersionSet; 120 121 class LevelFileNumIterator; 122 Version(VersionSet * vset)123 explicit Version(VersionSet* vset) 124 : vset_(vset), 125 next_(this), 126 prev_(this), 127 refs_(0), 128 file_to_compact_(nullptr), 129 file_to_compact_level_(-1), 130 compaction_score_(-1), 131 compaction_level_(-1) {} 132 133 Version(const Version&) = delete; 134 Version& operator=(const Version&) = delete; 135 136 ~Version(); 137 138 Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; 139 140 // Call func(arg, level, f) for every file that overlaps user_key in 141 // order from newest to oldest. If an invocation of func returns 142 // false, makes no more calls. 143 // 144 // REQUIRES: user portion of internal_key == user_key. 145 void ForEachOverlapping(Slice user_key, Slice internal_key, void* arg, 146 bool (*func)(void*, int, FileMetaData*)); 147 148 VersionSet* vset_; // VersionSet to which this Version belongs 149 Version* next_; // Next version in linked list 150 Version* prev_; // Previous version in linked list 151 int refs_; // Number of live refs to this version 152 153 // List of files per level 154 std::vector<FileMetaData*> files_[config::kNumLevels]; 155 156 // Next file to compact based on seek stats. 157 FileMetaData* file_to_compact_; 158 int file_to_compact_level_; 159 160 // Level that should be compacted next and its compaction score. 161 // Score < 1 means compaction is not strictly needed. These fields 162 // are initialized by Finalize(). 163 double compaction_score_; 164 int compaction_level_; 165 }; 166 167 class VersionSet { 168 public: 169 VersionSet(const std::string& dbname, const Options* options, 170 TableCache* table_cache, const InternalKeyComparator*); 171 VersionSet(const VersionSet&) = delete; 172 VersionSet& operator=(const VersionSet&) = delete; 173 174 ~VersionSet(); 175 176 // Apply *edit to the current version to form a new descriptor that 177 // is both saved to persistent state and installed as the new 178 // current version. Will release *mu while actually writing to the file. 179 // REQUIRES: *mu is held on entry. 180 // REQUIRES: no other thread concurrently calls LogAndApply() 181 Status LogAndApply(VersionEdit* edit, port::Mutex* mu) 182 EXCLUSIVE_LOCKS_REQUIRED(mu); 183 184 // Recover the last saved descriptor from persistent storage. 185 Status Recover(bool* save_manifest); 186 187 // Return the current version. current()188 Version* current() const { return current_; } 189 190 // Return the current manifest file number ManifestFileNumber()191 uint64_t ManifestFileNumber() const { return manifest_file_number_; } 192 193 // Allocate and return a new file number NewFileNumber()194 uint64_t NewFileNumber() { return next_file_number_++; } 195 196 // Arrange to reuse "file_number" unless a newer file number has 197 // already been allocated. 198 // REQUIRES: "file_number" was returned by a call to NewFileNumber(). ReuseFileNumber(uint64_t file_number)199 void ReuseFileNumber(uint64_t file_number) { 200 if (next_file_number_ == file_number + 1) { 201 next_file_number_ = file_number; 202 } 203 } 204 205 // Return the number of Table files at the specified level. 206 int NumLevelFiles(int level) const; 207 208 // Return the combined file size of all files at the specified level. 209 int64_t NumLevelBytes(int level) const; 210 211 // Return the last sequence number. LastSequence()212 uint64_t LastSequence() const { return last_sequence_; } 213 214 // Set the last sequence number to s. SetLastSequence(uint64_t s)215 void SetLastSequence(uint64_t s) { 216 assert(s >= last_sequence_); 217 last_sequence_ = s; 218 } 219 220 // Mark the specified file number as used. 221 void MarkFileNumberUsed(uint64_t number); 222 223 // Return the current log file number. LogNumber()224 uint64_t LogNumber() const { return log_number_; } 225 226 // Return the log file number for the log file that is currently 227 // being compacted, or zero if there is no such log file. PrevLogNumber()228 uint64_t PrevLogNumber() const { return prev_log_number_; } 229 230 // Pick level and inputs for a new compaction. 231 // Returns nullptr if there is no compaction to be done. 232 // Otherwise returns a pointer to a heap-allocated object that 233 // describes the compaction. Caller should delete the result. 234 Compaction* PickCompaction(); 235 236 // Return a compaction object for compacting the range [begin,end] in 237 // the specified level. Returns nullptr if there is nothing in that 238 // level that overlaps the specified range. Caller should delete 239 // the result. 240 Compaction* CompactRange(int level, const InternalKey* begin, 241 const InternalKey* end); 242 243 // Return the maximum overlapping data (in bytes) at next level for any 244 // file at a level >= 1. 245 int64_t MaxNextLevelOverlappingBytes(); 246 247 // Create an iterator that reads over the compaction inputs for "*c". 248 // The caller should delete the iterator when no longer needed. 249 Iterator* MakeInputIterator(Compaction* c); 250 251 // Returns true iff some level needs a compaction. NeedsCompaction()252 bool NeedsCompaction() const { 253 Version* v = current_; 254 return (v->compaction_score_ >= 1) || (v->file_to_compact_ != nullptr); 255 } 256 257 // Add all files listed in any live version to *live. 258 // May also mutate some internal state. 259 void AddLiveFiles(std::set<uint64_t>* live); 260 261 // Return the approximate offset in the database of the data for 262 // "key" as of version "v". 263 uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); 264 265 // Return a human-readable short (single-line) summary of the number 266 // of files per level. Uses *scratch as backing store. 267 struct LevelSummaryStorage { 268 char buffer[100]; 269 }; 270 const char* LevelSummary(LevelSummaryStorage* scratch) const; 271 272 private: 273 class Builder; 274 275 friend class Compaction; 276 friend class Version; 277 278 bool ReuseManifest(const std::string& dscname, const std::string& dscbase); 279 280 void Finalize(Version* v); 281 282 void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest, 283 InternalKey* largest); 284 285 void GetRange2(const std::vector<FileMetaData*>& inputs1, 286 const std::vector<FileMetaData*>& inputs2, 287 InternalKey* smallest, InternalKey* largest); 288 289 void SetupOtherInputs(Compaction* c); 290 291 // Save current contents to *log 292 Status WriteSnapshot(log::Writer* log); 293 294 void AppendVersion(Version* v); 295 296 Env* const env_; 297 const std::string dbname_; 298 const Options* const options_; 299 TableCache* const table_cache_; 300 const InternalKeyComparator icmp_; 301 uint64_t next_file_number_; 302 uint64_t manifest_file_number_; 303 uint64_t last_sequence_; 304 uint64_t log_number_; 305 uint64_t prev_log_number_; // 0 or backing store for memtable being compacted 306 307 // Opened lazily 308 WritableFile* descriptor_file_; 309 log::Writer* descriptor_log_; 310 Version dummy_versions_; // Head of circular doubly-linked list of versions. 311 Version* current_; // == dummy_versions_.prev_ 312 313 // Per-level key at which the next compaction at that level should start. 314 // Either an empty string, or a valid InternalKey. 315 std::string compact_pointer_[config::kNumLevels]; 316 }; 317 318 // A Compaction encapsulates information about a compaction. 319 class Compaction { 320 public: 321 ~Compaction(); 322 323 // Return the level that is being compacted. Inputs from "level" 324 // and "level+1" will be merged to produce a set of "level+1" files. level()325 int level() const { return level_; } 326 327 // Return the object that holds the edits to the descriptor done 328 // by this compaction. edit()329 VersionEdit* edit() { return &edit_; } 330 331 // "which" must be either 0 or 1 num_input_files(int which)332 int num_input_files(int which) const { return inputs_[which].size(); } 333 334 // Return the ith input file at "level()+which" ("which" must be 0 or 1). input(int which,int i)335 FileMetaData* input(int which, int i) const { return inputs_[which][i]; } 336 337 // Maximum size of files to build during this compaction. MaxOutputFileSize()338 uint64_t MaxOutputFileSize() const { return max_output_file_size_; } 339 340 // Is this a trivial compaction that can be implemented by just 341 // moving a single input file to the next level (no merging or splitting) 342 bool IsTrivialMove() const; 343 344 // Add all inputs to this compaction as delete operations to *edit. 345 void AddInputDeletions(VersionEdit* edit); 346 347 // Returns true if the information we have available guarantees that 348 // the compaction is producing data in "level+1" for which no data exists 349 // in levels greater than "level+1". 350 bool IsBaseLevelForKey(const Slice& user_key); 351 352 // Returns true iff we should stop building the current output 353 // before processing "internal_key". 354 bool ShouldStopBefore(const Slice& internal_key); 355 356 // Release the input version for the compaction, once the compaction 357 // is successful. 358 void ReleaseInputs(); 359 360 private: 361 friend class Version; 362 friend class VersionSet; 363 364 Compaction(const Options* options, int level); 365 366 int level_; 367 uint64_t max_output_file_size_; 368 Version* input_version_; 369 VersionEdit edit_; 370 371 // Each compaction reads inputs from "level_" and "level_+1" 372 std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs 373 374 // State used to check for number of overlapping grandparent files 375 // (parent == level_ + 1, grandparent == level_ + 2) 376 std::vector<FileMetaData*> grandparents_; 377 size_t grandparent_index_; // Index in grandparent_starts_ 378 bool seen_key_; // Some output key has been seen 379 int64_t overlapped_bytes_; // Bytes of overlap between current output 380 // and grandparent files 381 382 // State for implementing IsBaseLevelForKey 383 384 // level_ptrs_ holds indices into input_version_->levels_: our state 385 // is that we are positioned at one of the file ranges for each 386 // higher level than the ones involved in this compaction (i.e. for 387 // all L >= level_ + 2). 388 size_t level_ptrs_[config::kNumLevels]; 389 }; 390 391 } // namespace leveldb 392 393 #endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ 394