1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ 17 #define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ 18 19 #include <functional> 20 #include <map> 21 #include <unordered_map> 22 #include <vector> 23 24 #include "tensorflow/core/framework/allocator.h" 25 #include "tensorflow/core/framework/allocator_registry.h" 26 #include "tensorflow/core/platform/mutex.h" 27 #include "tensorflow/core/platform/thread_annotations.h" 28 #include "tensorflow/core/platform/types.h" 29 #include "tensorflow/core/protobuf/config.pb.h" 30 31 namespace tensorflow { 32 33 class Allocator; 34 class PoolAllocator; 35 36 // Singleton that manages per-process state, e.g. allocation of 37 // shared resources. 38 class ProcessState : public ProcessStateInterface { 39 public: 40 static ProcessState* singleton(); 41 42 // Descriptor for memory allocation attributes, used by optional 43 // runtime correctness analysis logic. 44 struct MemDesc { 45 enum MemLoc { CPU, GPU }; 46 MemLoc loc; 47 int dev_index; 48 bool gpu_registered; 49 bool nic_registered; MemDescMemDesc50 MemDesc() 51 : loc(CPU), 52 dev_index(0), 53 gpu_registered(false), 54 nic_registered(false) {} 55 string DebugString(); 56 }; 57 58 // If NUMA Allocators are desired, call this before calling any 59 // Allocator accessor. EnableNUMA()60 void EnableNUMA() { numa_enabled_ = true; } 61 62 // Returns what we know about the memory at ptr. 63 // If we know nothing, it's called CPU 0 with no other attributes. 64 MemDesc PtrType(const void* ptr); 65 66 // Returns the one CPUAllocator used for the given numa_node. 67 // Treats numa_node == kNUMANoAffinity as numa_node == 0. 68 Allocator* GetCPUAllocator(int numa_node) override; 69 70 // Registers alloc visitor for the CPU allocator(s). 71 // REQUIRES: must be called before GetCPUAllocator. 72 void AddCPUAllocVisitor(SubAllocator::Visitor v); 73 74 // Registers free visitor for the CPU allocator(s). 75 // REQUIRES: must be called before GetCPUAllocator. 76 void AddCPUFreeVisitor(SubAllocator::Visitor v); 77 78 typedef std::unordered_map<const void*, MemDesc> MDMap; 79 80 protected: 81 ProcessState(); ~ProcessState()82 virtual ~ProcessState() {} 83 friend class GPUProcessState; 84 friend class PluggableDeviceProcessState; 85 86 // If these flags need to be runtime configurable consider adding 87 // them to ConfigProto. 88 static constexpr bool FLAGS_brain_mem_reg_gpu_dma = true; 89 static constexpr bool FLAGS_brain_gpu_record_mem_types = false; 90 91 // Helper method for unit tests to reset the ProcessState singleton by 92 // cleaning up everything. Never use in production. 93 void TestOnlyReset(); 94 95 static ProcessState* instance_; 96 bool numa_enabled_; 97 98 mutex mu_; 99 100 // Indexed by numa_node. If we want numa-specific allocators AND a 101 // non-specific allocator, maybe should index by numa_node+1. 102 std::vector<Allocator*> cpu_allocators_ TF_GUARDED_BY(mu_); 103 std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ TF_GUARDED_BY(mu_); 104 std::vector<SubAllocator::Visitor> cpu_free_visitors_ TF_GUARDED_BY(mu_); 105 106 // A cache of cpu allocators indexed by a numa node. Used as a fast path to 107 // get CPU allocator by numa node id without locking the mutex. We can't use 108 // `cpu_allocators_` storage in the lock-free path because concurrent 109 // operation can deallocate the vector storage. 110 std::atomic<int> cpu_allocators_cached_; 111 std::array<Allocator*, 8> cpu_allocators_cache_; 112 113 // Optional RecordingAllocators that wrap the corresponding 114 // Allocators for runtime attribute use analysis. 115 MDMap mem_desc_map_; 116 std::vector<Allocator*> cpu_al_ TF_GUARDED_BY(mu_); 117 }; 118 119 namespace internal { 120 class RecordingAllocator : public Allocator { 121 public: RecordingAllocator(ProcessState::MDMap * mm,Allocator * a,ProcessState::MemDesc md,mutex * mu)122 RecordingAllocator(ProcessState::MDMap* mm, Allocator* a, 123 ProcessState::MemDesc md, mutex* mu) 124 : mm_(mm), a_(a), md_(md), mu_(mu) {} 125 Name()126 string Name() override { return a_->Name(); } AllocateRaw(size_t alignment,size_t num_bytes)127 void* AllocateRaw(size_t alignment, size_t num_bytes) override { 128 void* p = a_->AllocateRaw(alignment, num_bytes); 129 mutex_lock l(*mu_); 130 (*mm_)[p] = md_; 131 return p; 132 } DeallocateRaw(void * p)133 void DeallocateRaw(void* p) override { 134 mutex_lock l(*mu_); 135 auto iter = mm_->find(p); 136 mm_->erase(iter); 137 a_->DeallocateRaw(p); 138 } TracksAllocationSizes()139 bool TracksAllocationSizes() const override { 140 return a_->TracksAllocationSizes(); 141 } RequestedSize(const void * p)142 size_t RequestedSize(const void* p) const override { 143 return a_->RequestedSize(p); 144 } AllocatedSize(const void * p)145 size_t AllocatedSize(const void* p) const override { 146 return a_->AllocatedSize(p); 147 } GetStats()148 absl::optional<AllocatorStats> GetStats() override { return a_->GetStats(); } ClearStats()149 bool ClearStats() override { return a_->ClearStats(); } 150 GetMemoryType()151 AllocatorMemoryType GetMemoryType() const override { 152 return a_->GetMemoryType(); 153 } 154 155 ProcessState::MDMap* mm_; // not owned 156 Allocator* a_; // not owned 157 ProcessState::MemDesc md_; 158 mutex* mu_; 159 }; 160 } // namespace internal 161 } // namespace tensorflow 162 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ 163