1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_ 17 #define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_ 18 19 #include <stdlib.h> 20 21 #include <functional> 22 #include <limits> 23 24 #include "absl/strings/string_view.h" 25 #include "absl/types/optional.h" 26 #include "tensorflow/core/framework/numeric_types.h" 27 #include "tensorflow/core/framework/type_traits.h" 28 #include "tensorflow/core/platform/logging.h" 29 #include "tensorflow/core/platform/macros.h" 30 #include "tensorflow/core/platform/numa.h" 31 #include "tensorflow/core/platform/types.h" 32 33 namespace tensorflow { 34 35 // Attributes for a single allocation call. Different calls to the same 36 // allocator could potentially have different allocation attributes. 37 struct AllocationAttributes { 38 AllocationAttributes() = default; 39 AllocationAttributesAllocationAttributes40 AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged, 41 std::function<uint64()>* freed_by_func) 42 : retry_on_failure(retry_on_failure), 43 allocation_will_be_logged(allocation_will_be_logged), 44 freed_by_func(freed_by_func) {} 45 46 // If the first attempt to allocate the memory fails, the allocation should 47 // wait and retry (with a timeout). 48 // 49 // This is usually set to true, but we may set it to false in cases where a 50 // failure has only performance impact (e.g. optional scratch space 51 // allocation). 52 bool retry_on_failure = true; 53 // If a Tensor is allocated without the following set to true, then 54 // it is logged as an unknown allocation. During execution Tensors 55 // should be allocated through the OpKernelContext which records 56 // which Op is performing the allocation, and sets this flag to 57 // true. 58 bool allocation_will_be_logged = false; 59 // EXPERIMENTAL: If provided, then evaluates to a timing count such that only 60 // a memory chunk whose freed_at_count is at this value or earlier may be 61 // returned. 62 std::function<uint64()>* freed_by_func = nullptr; // Not owned. 63 64 TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes); 65 }; 66 67 // Runtime statistics collected by an allocator. Exactly the same as 68 // stream_executor::AllocatorStats, but independently defined to preserve the 69 // mutual independence of StreamExecutor and TensorFlow. 70 struct AllocatorStats { 71 int64_t num_allocs; // Number of allocations. 72 int64_t bytes_in_use; // Number of bytes in use. 73 int64_t peak_bytes_in_use; // The peak bytes in use. 74 int64_t largest_alloc_size; // The largest single allocation seen. 75 76 // The upper limit of bytes of user allocatable device memory, if such a limit 77 // is known. 78 absl::optional<int64_t> bytes_limit; 79 80 // Stats for reserved memory usage. 81 int64_t bytes_reserved; // Number of bytes reserved. 82 int64_t peak_bytes_reserved; // The peak number of bytes reserved. 83 // The upper limit on the number bytes of reservable memory, 84 // if such a limit is known. 85 absl::optional<int64_t> bytes_reservable_limit; 86 87 int64_t largest_free_block_bytes; // Largest free block's size in heap. 88 AllocatorStatsAllocatorStats89 AllocatorStats() 90 : num_allocs(0), 91 bytes_in_use(0), 92 peak_bytes_in_use(0), 93 largest_alloc_size(0), 94 bytes_reserved(0), 95 peak_bytes_reserved(0), 96 largest_free_block_bytes(0) {} 97 98 std::string DebugString() const; 99 }; 100 101 // The type of the allocated memory. 102 enum class AllocatorMemoryType { 103 kUnknown = 0, // Memory type unknown. 104 kDevice = 1, // Memory on device. 105 kHostPageable = 2, // Memory on host and it is pagable. 106 kHostPinned = 3, // Memory on host and it is pinned. 107 }; 108 109 // Allocator is an abstract interface for allocating and deallocating 110 // device memory. 111 class Allocator { 112 public: 113 // Align to 64 byte boundary. 114 static constexpr size_t kAllocatorAlignment = 64; 115 116 virtual ~Allocator(); 117 118 // Return a string identifying this allocator 119 virtual std::string Name() = 0; 120 121 // Return an uninitialized block of memory that is "num_bytes" bytes 122 // in size. The returned pointer is guaranteed to be aligned to a 123 // multiple of "alignment" bytes. 124 // REQUIRES: "alignment" is a power of 2. 125 virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0; 126 127 // Return an uninitialized block of memory that is "num_bytes" bytes 128 // in size with specified allocation attributes. The returned pointer is 129 // guaranteed to be aligned to a multiple of "alignment" bytes. 130 // REQUIRES: "alignment" is a power of 2. AllocateRaw(size_t alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)131 virtual void* AllocateRaw(size_t alignment, size_t num_bytes, 132 const AllocationAttributes& allocation_attr) { 133 // The default behavior is to use the implementation without any allocation 134 // attributes. 135 return AllocateRaw(alignment, num_bytes); 136 } 137 138 // Deallocate a block of memory pointer to by "ptr" 139 // REQUIRES: "ptr" was previously returned by a call to AllocateRaw 140 virtual void DeallocateRaw(void* ptr) = 0; 141 142 // Returns true if this allocator tracks the sizes of allocations. 143 // RequestedSize and AllocatedSize must be overridden if 144 // TracksAllocationSizes is overridden to return true. TracksAllocationSizes()145 virtual bool TracksAllocationSizes() const { return false; } 146 147 // Returns true if this allocator allocates an opaque handle rather than the 148 // requested number of bytes. 149 // 150 // This method returns false for most allocators, but may be used by 151 // special-case allocators that track tensor usage. If this method returns 152 // true, AllocateRaw() should be invoked for all values of `num_bytes`, 153 // including 0. 154 // 155 // NOTE: It is the caller's responsibility to track whether an allocated 156 // object is a buffer or an opaque handle. In particular, when this method 157 // returns `true`, users of this allocator must not run any constructors or 158 // destructors for complex objects, since there is no backing store for the 159 // tensor in which to place their outputs. AllocatesOpaqueHandle()160 virtual bool AllocatesOpaqueHandle() const { return false; } 161 162 // Returns the user-requested size of the data allocated at 163 // 'ptr'. Note that the actual buffer allocated might be larger 164 // than requested, but this function returns the size requested by 165 // the user. 166 // 167 // REQUIRES: TracksAllocationSizes() is true. 168 // 169 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 170 // allocated by this allocator. RequestedSize(const void * ptr)171 virtual size_t RequestedSize(const void* ptr) const { 172 CHECK(false) << "allocator doesn't track sizes"; 173 return size_t(0); 174 } 175 176 // Returns the allocated size of the buffer at 'ptr' if known, 177 // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is 178 // guaranteed to be >= RequestedSize(ptr). 179 // 180 // REQUIRES: TracksAllocationSizes() is true. 181 // 182 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 183 // allocated by this allocator. AllocatedSize(const void * ptr)184 virtual size_t AllocatedSize(const void* ptr) const { 185 return RequestedSize(ptr); 186 } 187 188 // Returns either 0 or an identifier assigned to the buffer at 'ptr' 189 // when the buffer was returned by AllocateRaw. If non-zero, the 190 // identifier differs from every other ID assigned by this 191 // allocator. 192 // 193 // REQUIRES: TracksAllocationSizes() is true. 194 // 195 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 196 // allocated by this allocator. AllocationId(const void * ptr)197 virtual int64_t AllocationId(const void* ptr) const { return 0; } 198 199 // Returns the allocated size of the buffer at 'ptr' if known, 200 // otherwise returns 0. This method can be called when 201 // TracksAllocationSizes() is false, but can be extremely slow. 202 // 203 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 204 // allocated by this allocator. AllocatedSizeSlow(const void * ptr)205 virtual size_t AllocatedSizeSlow(const void* ptr) const { 206 if (TracksAllocationSizes()) { 207 return AllocatedSize(ptr); 208 } 209 return 0; 210 } 211 212 // Fills in 'stats' with statistics collected by this allocator. GetStats()213 virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; } 214 215 // If implemented, clears the internal stats except for the `in_use` fields 216 // and sets the `peak_bytes_in_use` to be equal to the `bytes_in_use`. Returns 217 // true if implemented. 218 // 219 // REQUIRES: GetStats is overridden. ClearStats()220 virtual bool ClearStats() TF_MUST_USE_RESULT { return false; } 221 SetSafeFrontier(uint64 count)222 virtual void SetSafeFrontier(uint64 count) {} 223 224 // For allocator that are stream aware, allow to specify the compute 225 // stream this allocator is used for. This can also trigger memory 226 // preallocation. SetStreamAndPreallocateMemory(void * stream)227 virtual void SetStreamAndPreallocateMemory(void* stream) {} 228 229 // Returns the type of the memory allocated by this allocator. GetMemoryType()230 virtual AllocatorMemoryType GetMemoryType() const { 231 return AllocatorMemoryType::kUnknown; 232 } 233 }; 234 235 // An implementation of Allocator that delegates all calls to another Allocator. 236 // 237 // Useful to clients who want to override part of the functionality of another 238 // allocator. 239 class AllocatorWrapper : public Allocator { 240 public: AllocatorWrapper(Allocator * wrapped)241 explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {} 242 ~AllocatorWrapper()243 ~AllocatorWrapper() override {} 244 245 // Returns the wrapped allocator to which all calls are delegated. wrapped()246 Allocator* wrapped() const { return wrapped_; } 247 Name()248 std::string Name() override { return wrapped_->Name(); } 249 AllocateRaw(size_t alignment,size_t num_bytes)250 void* AllocateRaw(size_t alignment, size_t num_bytes) override { 251 return wrapped_->AllocateRaw(alignment, num_bytes); 252 } 253 AllocateRaw(size_t alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)254 void* AllocateRaw(size_t alignment, size_t num_bytes, 255 const AllocationAttributes& allocation_attr) override { 256 return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr); 257 } 258 DeallocateRaw(void * ptr)259 void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); } 260 TracksAllocationSizes()261 bool TracksAllocationSizes() const override { 262 return wrapped_->TracksAllocationSizes(); 263 } 264 AllocatesOpaqueHandle()265 bool AllocatesOpaqueHandle() const override { 266 return wrapped_->AllocatesOpaqueHandle(); 267 } 268 RequestedSize(const void * ptr)269 size_t RequestedSize(const void* ptr) const override { 270 return wrapped_->RequestedSize(ptr); 271 } 272 AllocatedSize(const void * ptr)273 size_t AllocatedSize(const void* ptr) const override { 274 return wrapped_->AllocatedSize(ptr); 275 } 276 AllocationId(const void * ptr)277 int64_t AllocationId(const void* ptr) const override { 278 return wrapped_->AllocationId(ptr); 279 } 280 AllocatedSizeSlow(const void * ptr)281 size_t AllocatedSizeSlow(const void* ptr) const override { 282 return wrapped_->AllocatedSizeSlow(ptr); 283 } 284 GetMemoryType()285 AllocatorMemoryType GetMemoryType() const override { 286 return wrapped_->GetMemoryType(); 287 } 288 289 private: 290 Allocator* const wrapped_; 291 }; 292 293 // A tensorflow Op may need access to different kinds of memory that 294 // are not simply a function of the device to which the Op has been 295 // assigned. For example, an Op executing on a GPU may still need 296 // to allocate CPU RAM for some purpose. Internal to the tensorflow 297 // runtime we may choose to allocate CPU ram from special regions 298 // that have been prepared for higher performance in some use 299 // contexts, e.g. doing DMA with particular devices. For these 300 // reasons, the Device interface does not expose just one memory 301 // Allocator, but instead provides an accessor that takes a 302 // specification of the desired memory attributes in order to select 303 // an Allocator. 304 // 305 // Example use: 306 // // Allocator for ordinary device memory: 307 // Allocator* a = allocator(AllocatorAttributes()); 308 // ... 309 // // Allocator for CPU RAM, regardless of where Op is executing: 310 // AllocatorAttributes attr; 311 // attr.set_on_host(true); 312 // Allocator* a = allocator(attr); 313 struct AllocatorAttributes { set_on_hostAllocatorAttributes314 void set_on_host(bool v) { value |= (static_cast<int>(v)); } on_hostAllocatorAttributes315 bool on_host() const { return value & 0x1; } set_nic_compatibleAllocatorAttributes316 void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); } nic_compatibleAllocatorAttributes317 bool nic_compatible() const { return value & (0x1 << 1); } set_gpu_compatibleAllocatorAttributes318 void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); } gpu_compatibleAllocatorAttributes319 bool gpu_compatible() const { return value & (0x1 << 2); } MergeAllocatorAttributes320 void Merge(AllocatorAttributes other) { 321 value |= other.value; 322 if (scope_id != other.scope_id) { 323 CHECK(scope_id == 0 || other.scope_id == 0) 324 << "At least one scope_id should be zero to merge " 325 "AllocatorAttributes but found this.scope_id=" 326 << scope_id << " and other.scope_id=" << other.scope_id; 327 scope_id = scope_id == 0 ? other.scope_id : scope_id; 328 } 329 } 330 // Returns true if the fields set in *this is a subset of or equal to 331 // those set in other. IsEqualOrLessRestrictiveThanAllocatorAttributes332 bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const { 333 return (value | other.value) == other.value; 334 } 335 336 // NOTE: The upper 8 bits of the value are reserved for 337 // device-specific uses. Implementors of a device can interpret these 338 // upper 8 bits in device-specific ways, and ops implemented for those 339 // devices are responsible for setting those 8 bits appropriately. 340 uint32 value = 0; 341 // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to 342 // a named special-purpose allocator on the same device. 343 int32 scope_id = 0; 344 345 // Returns a human readable representation of this. 346 std::string DebugString() const; 347 }; 348 349 // Returns a trivial implementation of Allocator, which is a process singleton. 350 // Access through this function is only intended for use by restricted parts 351 // of the infrastructure. 352 Allocator* cpu_allocator_base(); 353 354 // If available, calls ProcessState::GetCPUAllocator(numa_node). 355 // If not, falls back to cpu_allocator_base(). 356 // Intended for use in contexts where ProcessState is not visible at 357 // compile time. Where ProcessState is visible, it's preferable to 358 // call it directly. 359 Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity); 360 361 // Enables AllocatorStats in the default CPU allocator implementation. By 362 // default, it's disabled. 363 void EnableCPUAllocatorStats(); 364 // Disables AllocatorStats in the default CPU allocator implementation. By 365 // default, it's disabled. 366 void DisableCPUAllocatorStats(); 367 bool CPUAllocatorStatsEnabled(); 368 369 // Enables full statistics collection in the default CPU allocator 370 // implementation. By default, it's disabled. 371 void EnableCPUAllocatorFullStats(); 372 bool CPUAllocatorFullStatsEnabled(); 373 374 // An object that does the underlying suballoc/free of memory for a higher-level 375 // allocator. The expectation is that the higher-level allocator is doing some 376 // kind of cache or pool management so that it will call SubAllocator::Alloc and 377 // Free relatively infrequently, compared to the number of times its own 378 // AllocateRaw and Free methods are called. 379 class SubAllocator { 380 public: 381 // Visitor gets called with a pointer to a memory area and its 382 // size in bytes. The index value will be numa_node for a CPU 383 // allocator and GPU id for a GPU allocator. 384 typedef std::function<void(void*, int index, size_t)> Visitor; 385 386 SubAllocator(const std::vector<Visitor>& alloc_visitors, 387 const std::vector<Visitor>& free_visitors); 388 ~SubAllocator()389 virtual ~SubAllocator() {} 390 // Allocates at least num_bytes. Returns actual number of bytes allocated in 391 // bytes_received. The caller can safely use the full bytes_received sized 392 // buffer following the returend pointer. 393 virtual void* Alloc(size_t alignment, size_t num_bytes, 394 size_t* bytes_received) = 0; 395 virtual void Free(void* ptr, size_t num_bytes) = 0; 396 397 // Returns true if the BFC allocator can safely coalesce adjacent regions 398 // returned by this allocator. 399 virtual bool SupportsCoalescing() const = 0; 400 401 // Returns the type of the memory allocated by this SubAllocator. GetMemoryType()402 virtual AllocatorMemoryType GetMemoryType() const { 403 return AllocatorMemoryType::kUnknown; 404 } 405 406 protected: 407 // Implementation of Alloc() method must call this on newly allocated 408 // value. 409 void VisitAlloc(void* ptr, int index, size_t num_bytes); 410 411 // Implementation of Free() method must call this on value to be 412 // freed immediately before deallocation. 413 void VisitFree(void* ptr, int index, size_t num_bytes); 414 415 const std::vector<Visitor> alloc_visitors_; 416 const std::vector<Visitor> free_visitors_; 417 }; 418 419 } // namespace tensorflow 420 421 #endif // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_ 422