/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_

#include <algorithm>
#include <array>
#include <atomic>
#include <cstdint>
#include <deque>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/types/optional.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

class MemoryDump;

// A memory allocator that implements a 'best-fit with coalescing'
// algorithm.  This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing.  One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
class BFCAllocator : public Allocator {
 public:
  struct Options {
    // If true, the allocator grows its regions on demand instead of
    // reserving the entire memory limit up front.
    bool allow_growth = true;

    // If true, the allocator may sleep for a period of time when it can't
    // fulfill an allocation request, in the hopes that another thread will
    // free up memory in the meantime.
    //
    // If false, the allocator will never sleep, even if
    // AllocationAttributes::attr_retry_on_failure is true.
    bool allow_retry_on_failure = true;

    // Whether the allocator will deallocate free regions to avoid OOM due to
    // memory fragmentation.
    bool garbage_collection = false;

    // Controls when a chunk should be split, if its size exceeds the
    // requested allocation size.
    double fragmentation_fraction = 0;
  };
  BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator, size_t total_memory,
               const string& name, const Options& opts);
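
  // A minimal usage sketch (illustrative only; `MySubAllocator` stands in
  // for any concrete SubAllocator implementation and is not part of this
  // header):
  //
  //   BFCAllocator::Options opts;
  //   opts.allow_growth = true;
  //   BFCAllocator allocator(std::make_unique<MySubAllocator>(),
  //                          /*total_memory=*/1 << 30, "bfc_example", opts);
  //   void* p = allocator.AllocateRaw(/*alignment=*/64, /*num_bytes=*/1024);
  //   allocator.DeallocateRaw(p);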

  ~BFCAllocator() override;

  string Name() override { return name_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override;

  void DeallocateRaw(void* ptr) override;

  bool TracksAllocationSizes() const override;

  size_t RequestedSize(const void* ptr) const override;

  size_t AllocatedSize(const void* ptr) const override;

  int64_t AllocationId(const void* ptr) const override;

  absl::optional<AllocatorStats> GetStats() override;

  bool ClearStats() override;

  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }

  void SetSafeFrontier(uint64 count) override;

  AllocatorMemoryType GetMemoryType() const override;

  bool ShouldRecordOpName() const { return true; }

  MemoryDump RecordMemoryMap();

 protected:
  // This setting controls when a chunk should be split, if its size exceeds
  // the requested allocation size. It is not expected to be changed after
  // initialization.
  void SetInternalFragmentationFraction(double fraction) {
    internal_fragmentation_fraction_ = fraction;
  }

 private:
  struct Bin;

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure,
                            uint64 freed_before_count);

  void* AllocateRawInternalWithRetry(
      size_t alignment, size_t num_bytes,
      const AllocationAttributes& allocation_attr);

  void DeallocateRawInternal(void* ptr);

  // Chunks whose freed_at_count is later than the safe frontier value are kept
  // on a special list and are not subject to merging immediately upon being
  // freed.
  //
  // This function sweeps that list looking for Chunks whose timestamp is now
  // safe. When found, their freed_at_count is set to 0 and we attempt to merge
  // them with their neighbors.
  //
  // If required_bytes > 0 then this function is being called in the context of
  // a need for this many bytes that could not be satisfied without merging
  // unsafe chunks, so we go ahead and merge the unsafe chunks too, just up to
  // the point that a free chunk of required_bytes is produced.  Note that
  // unsafe merged chunks adopt the most conservative timestamp from their
  // constituents, so they're only useful for allocations not requiring a
  // particular timestamp.
  bool MergeTimestampedChunks(size_t required_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns, in constant time, the size in bytes of the largest free chunk,
  // taken from the largest bin.  Within a bin, free chunks are sorted by size
  // and then by address.
  int64_t LargestFreeChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds a TraceMe (on memory allocation and deallocation) for memory-stats
  // profiling. The chunk pointer is used to look up information such as the
  // address, chunk size, and requested size.
  void AddTraceMe(absl::string_view traceme_name, const void* ptr)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Overloaded AddTraceMe function with chunk information.
  void AddTraceMe(absl::string_view traceme_name, const void* chunk_ptr,
                  int64_t req_bytes, int64_t alloc_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // A ChunkHandle is an index into the chunks_ vector in BFCAllocator.
  // kInvalidChunkHandle means an invalid chunk.
  typedef size_t ChunkHandle;
  static constexpr ChunkHandle kInvalidChunkHandle = SIZE_MAX;

  typedef int BinNum;
  static constexpr int kInvalidBinNum = -1;
  // With kNumBins = 21, bin i holds free chunks of size in
  // [256 << i, 256 << (i + 1)); the last bin (i = 20) holds all chunks of
  // size >= 256 << 20 (256MiB).
  static constexpr int kNumBins = 21;

  // A Chunk points to a piece of memory that's either entirely free or
  // entirely in use by one user memory allocation.
  //
  // An AllocationRegion's memory is split up into one or more disjoint Chunks,
  // which together cover the whole region without gaps.  Chunks participate in
  // a doubly-linked list, and the prev/next pointers point to the physically
  // adjacent chunks.
  //
  // Since a chunk cannot be partially in use, we may need to split a free
  // chunk in order to service a user allocation.  We always merge adjacent
  // free chunks.
  //
  // Chunks contain information about whether they are in use or whether they
  // are free, and contain a pointer to the bin they are in.
  struct Chunk {
    size_t size = 0;  // Full size of buffer.

    // We sometimes give chunks that are larger than needed to reduce
    // fragmentation.  requested_size keeps track of what the client
    // actually wanted so we can understand whether our splitting
    // strategy is efficient.
    size_t requested_size = 0;

    // allocation_id is set to -1 when the chunk is not in use. It is assigned
    // a value greater than zero before the chunk is returned from
    // AllocateRaw, and this value is unique among values assigned by
    // the parent allocator.
    int64_t allocation_id = -1;
    void* ptr = nullptr;  // pointer to granted subbuffer.

    // If not kInvalidChunkHandle, the memory referred to by 'prev' directly
    // precedes the memory used by this chunk, i.e., it should start at
    // 'ptr - prev->size'.
    ChunkHandle prev = kInvalidChunkHandle;

    // If not kInvalidChunkHandle, the memory referred to by 'next' directly
    // follows the memory used by this chunk, i.e., it should start at
    // 'ptr + size'.
    ChunkHandle next = kInvalidChunkHandle;

    // What bin are we in?
    BinNum bin_num = kInvalidBinNum;

    // Optional count when this chunk was most recently made free.
    uint64 freed_at_count = 0;

    bool in_use() const { return allocation_id != -1; }

#ifdef TENSORFLOW_MEM_DEBUG
    // optional debugging info
    const char* op_name = nullptr;
    uint64 step_id = 0;
    int64 action_count = 0;
#endif

    string DebugString(BFCAllocator* a,
                       bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
      string dbg;
      strings::StrAppend(
          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
          " | in_use: ", in_use(), " | bin_num: ", bin_num);
      if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
        Chunk* p = a->ChunkFromHandle(prev);
        strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
      }
      if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
        Chunk* n = a->ChunkFromHandle(next);
        strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
      }
#ifdef TENSORFLOW_MEM_DEBUG
      strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN",
                         ", stepid: ", step_id,
                         ", last_action: ", action_count);
#endif
      return dbg;
    }
  };
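
  // Illustrative example of the chunk invariants above (addresses are
  // hypothetical): a 1024-byte region starts as a single free chunk,
  //
  //   [ptr=0x1000, size=1024, free]
  //
  // Servicing a 256-byte allocation splits it into an in-use chunk and a
  // free remainder, linked through prev/next:
  //
  //   [0x1000, 256, in_use] <-> [0x1100, 768, free]
  //
  // When the first chunk is freed again, the two physically adjacent free
  // chunks are merged back into one 1024-byte free chunk.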

  // A Bin is a collection of similar-sized free chunks.
  // Allocated chunks are never in a Bin.
  struct Bin {
    // All chunks in this bin have >= bin_size memory.
    size_t bin_size = 0;

    class ChunkComparator {
     public:
      explicit ChunkComparator(BFCAllocator* allocator)
          : allocator_(allocator) {}
      // Sort first by size and then use pointer address as a tie breaker.
      bool operator()(const ChunkHandle ha,
                      const ChunkHandle hb) const TF_NO_THREAD_SAFETY_ANALYSIS {
        const Chunk* a = allocator_->ChunkFromHandle(ha);
        const Chunk* b = allocator_->ChunkFromHandle(hb);
        if (a->size != b->size) {
          return a->size < b->size;
        }
        return a->ptr < b->ptr;
      }

     private:
      BFCAllocator* allocator_;  // The parent allocator
    };

    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
    // Set of free chunk handles within the bin, sorted by chunk size.
    // The underlying Chunks are not owned.
    FreeChunkSet free_chunks;
    Bin(BFCAllocator* allocator, size_t bs)
        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
  };
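
  // Because free_chunks is ordered by (size, address), a best-fit lookup is
  // simply a forward scan for the first chunk large enough for the request.
  // A sketch of that lookup (illustrative only; the real logic lives in
  // FindChunkPtr below and also handles freed_at_count and splitting):
  //
  //   for (ChunkHandle h : bin->free_chunks) {
  //     const Chunk* chunk = ChunkFromHandle(h);
  //     if (chunk->size >= rounded_bytes) {
  //       // Smallest adequate chunk found; use (and possibly split) it.
  //       break;
  //     }
  //   }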

  static constexpr size_t kMinAllocationBits = 8;
  static constexpr size_t kMinAllocationSize = 1 << kMinAllocationBits;

  // BFCAllocator allocates memory into a collection of disjoint
  // AllocationRegions.  Each AllocationRegion corresponds to one call to
  // SubAllocator::Alloc().  (Actually, if a subsequent call to
  // SubAllocator::Alloc() returns another region immediately adjacent to the
  // last, it will be used to extend the first AllocationRegion, not create a
  // separate one.)
  //
  // An AllocationRegion contains one or more Chunks, covering all of its
  // memory.  Its primary job is to map pointers to ChunkHandles.
  //
  // This class is thread-compatible.
  class AllocationRegion {
   public:
    AllocationRegion(void* ptr, size_t memory_size)
        : ptr_(ptr),
          memory_size_(memory_size),
          end_ptr_(
              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
      DCHECK_EQ(0, memory_size % kMinAllocationSize);
      const size_t n_handles =
          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.resize(n_handles, kInvalidChunkHandle);
    }

    AllocationRegion() = default;
    AllocationRegion(AllocationRegion&& other) { Swap(&other); }
    AllocationRegion& operator=(AllocationRegion&& other) {
      Swap(&other);
      return *this;
    }

    void* ptr() const { return ptr_; }
    void* end_ptr() const { return end_ptr_; }
    size_t memory_size() const { return memory_size_; }
    void extend(size_t size) {
      memory_size_ += size;
      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);

      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
      const size_t n_handles =
          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.resize(n_handles, kInvalidChunkHandle);
    }
    ChunkHandle get_handle(const void* p) const {
      return handles_[IndexFor(p)];
    }
    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }

   private:
    void Swap(AllocationRegion* other) {
      std::swap(ptr_, other->ptr_);
      std::swap(memory_size_, other->memory_size_);
      std::swap(end_ptr_, other->end_ptr_);
      std::swap(handles_, other->handles_);
    }

    size_t IndexFor(const void* p) const {
      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
      DCHECK_GE(p_int, base_int);
      DCHECK_LT(p_int, base_int + memory_size_);
      return static_cast<size_t>(((p_int - base_int) >> kMinAllocationBits));
    }
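
    // Worked example (hypothetical addresses): with kMinAllocationBits = 8
    // (256-byte granularity), a region based at ptr_ = 0x1000 maps
    // p = 0x1400 to handle index (0x1400 - 0x1000) >> 8 = 4.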

    // Metadata about the allocation region.
    void* ptr_ = nullptr;
    size_t memory_size_ = 0;
    void* end_ptr_ = nullptr;

    // Array of size "memory_size / kMinAllocationSize".  It is indexed by
    // (p - base) / kMinAllocationSize and contains the ChunkHandle for the
    // memory allocation represented by "p".
    std::vector<ChunkHandle> handles_;

    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
  };

  // RegionManager aggregates one or more "AllocationRegions" and provides
  // a layer of indirection from pointers to the underlying ChunkHandle,
  // allowing allocation across multiple discontiguous memory regions.
  //
  // This class is thread-compatible.
  class RegionManager {
   public:
    RegionManager() {}
    ~RegionManager() {}

    void AddAllocationRegion(void* ptr, size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
    }

    // Adds an allocation region for the given ptr and size, potentially
    // extending a region if ptr matches the end_ptr of an existing region.
    // If a region is extended, returns a pointer to the extended region so
    // that the BFC allocator can reason about chunkification.
    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
                                                  size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      // Check whether the new region can be coalesced with the preceding
      // region.
      if (entry != regions_.begin()) {
        auto preceding_region = entry - 1;
        if (preceding_region->end_ptr() == ptr) {
          if (VLOG_IS_ON(1)) {
            LOG(INFO) << "Extending region " << preceding_region->ptr()
                      << " of "
                      << strings::HumanReadableNumBytes(
                             preceding_region->memory_size())
                      << " by " << strings::HumanReadableNumBytes(memory_size);
          }
          preceding_region->extend(memory_size);
          return &*preceding_region;
        }
      }
      VLOG(1) << "Inserting new region " << ptr << " of "
              << strings::HumanReadableNumBytes(memory_size);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
      return nullptr;
    }

    std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
        std::vector<AllocationRegion>::iterator it) {
      return regions_.erase(it);
    }

    ChunkHandle get_handle(const void* p) const {
      return RegionFor(p)->get_handle(p);
    }

    void set_handle(const void* p, ChunkHandle h) {
      return MutableRegionFor(p)->set_handle(p, h);
    }
    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }

    const std::vector<AllocationRegion>& regions() const { return regions_; }

   private:
    static bool Comparator(const void* ptr, const AllocationRegion& other) {
      return ptr < other.end_ptr();
    }

    AllocationRegion* MutableRegionFor(const void* p) {
      return const_cast<AllocationRegion*>(RegionFor(p));
    }

    const AllocationRegion* RegionFor(const void* p) const {
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);

      if (entry != regions_.end()) {
        return &(*entry);
      }

      LOG(FATAL) << "Could not find Region for " << p;
      return nullptr;
    }

    std::vector<AllocationRegion> regions_;
  };

  // Returns 'bytes' rounded up to the next multiple of kMinAllocationSize.
  static size_t RoundedBytes(size_t bytes);
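
  // For example, with kMinAllocationSize = 256: RoundedBytes(1) == 256,
  // RoundedBytes(256) == 256, and RoundedBytes(300) == 512.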

  // Try to add a new memory region that can satisfy an allocation of
  // 'rounded_bytes' bytes.  Returns true on success and false on
  // failure.
  bool Extend(size_t alignment, size_t rounded_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Deallocate free regions to give the memory back to the suballocator, so
  // that we can re-allocate a larger region.  The main use scenario of this
  // function is when OOM happens but we have free regions and the sum of
  // sizes of free regions and unallocated bytes is larger than the requested
  // size, implying (external) memory fragmentation.  Returns true if any free
  // regions are found and freed; false otherwise.
  bool DeallocateFreeRegions(size_t rounded_bytes);

  // Helper function to deallocate regions.
  void DeallocateRegions(const absl::flat_hash_set<void*>& region_ptrs)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns a pointer to an underlying allocated chunk of size
  // 'rounded_bytes'.
  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
                     uint64 freed_before) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Splits the chunk specified by 'h' into two chunks, one at least
  // of size 'num_bytes'.
  void SplitChunk(ChunkHandle h, size_t num_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Merges the two chunk handles.  Requires that the chunks are
  // contiguous in their allocation.
  void Merge(ChunkHandle h, ChunkHandle h2) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds the chunk 'h' to the proper free bin.
  void InsertFreeChunkIntoBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the free chunk pointed to by 'c' from the set free_chunks.
  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
                                  const Bin::FreeChunkSet::iterator& c)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes a free chunk from the bin.
  void RemoveFreeChunkFromBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void MaybeRemoveFreeChunkFromBin(ChunkHandle h)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the chunk metadata represented by 'h'.
  void DeleteChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  string RenderOccupancy() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DumpMemoryLog(size_t num_bytes) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  MemoryDump RecordMemoryMapInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void MaybeWriteMemoryMap() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle AllocateChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DeallocateChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  Chunk* ChunkFromHandle(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  const Chunk* ChunkFromHandle(ChunkHandle h) const
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  void MarkFree(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Fragmentation is calculated as one minus the ratio of the largest free
  // chunk size to total free memory; the result is a value in [0, 1].
  double GetFragmentation() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
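
  // For example, if total free memory is 256MiB and the largest free chunk
  // is 64MiB, fragmentation is 1 - 64/256 = 0.75.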

  // Information about a Bin that is useful for debugging.
  struct BinDebugInfo {
    size_t total_bytes_in_use = 0;
    size_t total_bytes_in_bin = 0;
    size_t total_requested_bytes_in_use = 0;
    size_t total_chunks_in_use = 0;
    size_t total_chunks_in_bin = 0;
  };

  // Computes and returns a BinDebugInfo for each Bin.
  std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  AllocatorRetry retry_helper_;

  // Structures immutable after construction
  size_t memory_limit_ = 0;

  // Portable fallback that computes floor(log2(n)) by repeated shifting.
  // Assumes n > 0.
  inline int Log2FloorNonZeroSlow(uint64 n) {
    int r = 0;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r - 1;
  }

  // Returns floor(log2(n)).
  inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
    return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS) && (_WIN64)
    unsigned long index;
    _BitScanReverse64(&index, n);
    return index;
#else
    return Log2FloorNonZeroSlow(n);
#endif
  }
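
  // For example: Log2FloorNonZero(1) == 0, Log2FloorNonZero(255) == 7, and
  // Log2FloorNonZero(256) == 8.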

  // Map from bin size to Bin
  Bin* BinFromIndex(BinNum index) {
    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
  }
  size_t BinNumToSize(BinNum index) {
    return static_cast<size_t>(256) << index;
  }
  BinNum BinNumForSize(size_t bytes) {
    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
    return b;
  }
  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
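
  // Worked example of the bin mapping: for bytes = 1024,
  // v = 1024 >> 8 = 4 and Log2FloorNonZero(4) == 2, so BinNumForSize
  // returns bin 2, whose bin_size is BinNumToSize(2) = 256 << 2 = 1024.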

  char bins_space_[sizeof(Bin) * kNumBins];

  const Options opts_;

  // The size of the current region allocation.
  size_t curr_region_allocation_bytes_;

  // The total number of bytes allocated by the allocator.
  size_t total_region_allocated_bytes_ = 0;

  // An indicator that expansion of a region has hit the limits
  // of the available memory.
  bool started_backpedal_ = false;

  // Whether the allocator will coalesce adjacent sub-allocator-provided
  // AllocationRegions. This may be disabled if discrete sub-allocator
  // regions can't be treated as contiguous (e.g. if the allocation refers to
  // device-visible memory which is not adjacent to the other regions in the
  // device's address space).
  const bool coalesce_regions_;

  std::unique_ptr<SubAllocator> sub_allocator_;
  string name_;
  SharedCounter* timing_counter_ = nullptr;
  std::deque<ChunkHandle> timestamped_chunks_;

  double internal_fragmentation_fraction_ = {0.0};

  std::atomic<uint64> safe_frontier_ = {0};

  // Structures mutable after construction
  mutable mutex lock_;
  RegionManager region_manager_ TF_GUARDED_BY(lock_);

  std::vector<Chunk> chunks_ TF_GUARDED_BY(lock_);

  // Pointer to head of linked list of free Chunks
  ChunkHandle free_chunks_list_ TF_GUARDED_BY(lock_);

  // Counter containing the next unique identifier to assign to a
  // newly-created chunk.
  int64_t next_allocation_id_ TF_GUARDED_BY(lock_);

  // Stats.
  AllocatorStats stats_ TF_GUARDED_BY(lock_);
#ifdef TENSORFLOW_MEM_DEBUG
  int64 action_counter_ TF_GUARDED_BY(lock_) = 0;
#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
  int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
#endif

  friend class GPUBFCAllocatorPrivateMethodsTest;
  friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_