1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // A simple CPU allocator that intercepts malloc/free calls from MKL library 17 // and redirects them to Tensorflow allocator 18 19 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ 20 #define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ 21 22 #ifdef INTEL_MKL 23 24 #include <cstdlib> 25 26 #include "tensorflow/core/common_runtime/bfc_allocator.h" 27 #include "tensorflow/core/common_runtime/pool_allocator.h" 28 #include "tensorflow/core/lib/strings/numbers.h" 29 #include "tensorflow/core/lib/strings/str_util.h" 30 #include "tensorflow/core/platform/mem.h" 31 #include "tensorflow/core/platform/numa.h" 32 #include "tensorflow/core/util/env_var.h" 33 #include "tensorflow/core/util/onednn_env_vars.h" 34 #ifdef _WIN32 35 typedef unsigned int uint; 36 #endif 37 38 namespace tensorflow { 39 40 static bool mkl_small_allocator_collect_stats = false; 41 42 class MklSubAllocator : public BasicCPUAllocator { 43 public: MklSubAllocator()44 MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {} ~MklSubAllocator()45 ~MklSubAllocator() override {} 46 }; 47 48 // CPU allocator that handles small-size allocations by calling 49 // suballocator directly. Mostly, it is just a wrapper around a suballocator 50 // (that calls malloc and free directly) with support for bookkeeping. 51 class MklSmallSizeAllocator : public Allocator { 52 public: MklSmallSizeAllocator(SubAllocator * sub_allocator,size_t total_memory,const string & name)53 MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory, 54 const string& name) 55 : sub_allocator_(sub_allocator), name_(name) { 56 stats_.bytes_limit = total_memory; 57 } ~MklSmallSizeAllocator()58 ~MklSmallSizeAllocator() override {} 59 60 TF_DISALLOW_COPY_AND_ASSIGN(MklSmallSizeAllocator); 61 Name()62 inline string Name() override { return name_; } 63 AllocateRaw(size_t alignment,size_t num_bytes)64 void* AllocateRaw(size_t alignment, size_t num_bytes) override { 65 void* ptr = port::AlignedMalloc(num_bytes, alignment); 66 if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes); 67 return ptr; 68 } 69 DeallocateRaw(void * ptr)70 void DeallocateRaw(void* ptr) override { 71 if (ptr == nullptr) { 72 LOG(ERROR) << "tried to deallocate nullptr"; 73 return; 74 } 75 76 if (mkl_small_allocator_collect_stats) { 77 const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr); 78 DecrementStats(alloc_size); 79 } 80 port::AlignedFree(ptr); 81 } 82 GetStats()83 absl::optional<AllocatorStats> GetStats() override { 84 mutex_lock l(mutex_); 85 return stats_; 86 } 87 ClearStats()88 bool ClearStats() override { 89 mutex_lock l(mutex_); 90 stats_.num_allocs = 0; 91 stats_.peak_bytes_in_use = 0; 92 stats_.largest_alloc_size = 0; 93 stats_.bytes_in_use = 0; 94 stats_.bytes_limit = 0; 95 return true; 96 } 97 98 private: 99 // Increment statistics for the allocator handling small allocations. IncrementStats(size_t alloc_size)100 inline void IncrementStats(size_t alloc_size) TF_LOCKS_EXCLUDED(mutex_) { 101 mutex_lock l(mutex_); 102 ++stats_.num_allocs; 103 stats_.bytes_in_use += alloc_size; 104 stats_.peak_bytes_in_use = 105 std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); 106 stats_.largest_alloc_size = 107 std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size)); 108 } 109 110 // Decrement statistics for the allocator handling small allocations. DecrementStats(size_t dealloc_size)111 inline void DecrementStats(size_t dealloc_size) TF_LOCKS_EXCLUDED(mutex_) { 112 mutex_lock l(mutex_); 113 stats_.bytes_in_use -= dealloc_size; 114 } 115 116 SubAllocator* sub_allocator_; // Not owned by this class. 117 118 // Mutex for protecting updates to map of allocations. 119 mutable mutex mutex_; 120 121 // Allocator name 122 string name_; 123 124 // Allocator stats for small allocs 125 AllocatorStats stats_ TF_GUARDED_BY(mutex_); 126 }; 127 128 /// CPU allocator for MKL that wraps BFC allocator and intercepts 129 /// and redirects memory allocation calls from MKL. 130 class MklCPUAllocator : public Allocator { 131 public: 132 // Constructor and other standard functions 133 134 /// Environment variable that user can set to upper bound on memory allocation 135 static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES"; 136 137 /// Default upper limit on allocator size - 64GB 138 static constexpr size_t kDefaultMaxLimit = 64LL << 30; 139 MklCPUAllocator()140 MklCPUAllocator() { TF_CHECK_OK(Initialize()); } 141 ~MklCPUAllocator()142 ~MklCPUAllocator() override { 143 delete small_size_allocator_; 144 delete large_size_allocator_; 145 } 146 Initialize()147 Status Initialize() { 148 VLOG(2) << "MklCPUAllocator: In MklCPUAllocator"; 149 150 // Set upper bound on memory allocation to physical RAM available on the 151 // CPU unless explicitly specified by user 152 uint64 max_mem_bytes = kDefaultMaxLimit; 153 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) 154 max_mem_bytes = 155 (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE); 156 #endif 157 char* user_mem_bytes = getenv(kMaxLimitStr); 158 159 if (user_mem_bytes != NULL) { 160 uint64 user_val = 0; 161 if (!strings::safe_strtou64(user_mem_bytes, &user_val)) { 162 return errors::InvalidArgument("Invalid memory limit (", user_mem_bytes, 163 ") specified for MKL allocator through ", 164 kMaxLimitStr); 165 } 166 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) 167 if (user_val > max_mem_bytes) { 168 LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr 169 << "=" << user_val 170 << " greater than available physical memory: " 171 << max_mem_bytes 172 << ". This could significantly reduce performance!"; 173 } 174 #endif 175 max_mem_bytes = user_val; 176 } 177 178 VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes; 179 180 sub_allocator_ = new MklSubAllocator(); 181 182 // SubAllocator is owned by BFCAllocator, so we do not need to deallocate 183 // it in MklSmallSizeAllocator. 184 small_size_allocator_ = 185 new MklSmallSizeAllocator(sub_allocator_, max_mem_bytes, kName); 186 187 BFCAllocator::Options large_allocator_opts; 188 large_allocator_opts.allow_growth = kAllowGrowth; 189 large_size_allocator_ = 190 new BFCAllocator(absl::WrapUnique(sub_allocator_), max_mem_bytes, kName, 191 large_allocator_opts); 192 return Status::OK(); 193 } 194 Name()195 inline string Name() override { return kName; } IsSmallSizeAllocation(const void * ptr)196 inline bool IsSmallSizeAllocation(const void* ptr) const 197 TF_LOCKS_EXCLUDED(mutex_) { 198 mutex_lock l(mutex_); 199 return large_allocations_map_.find(ptr) == large_allocations_map_.end(); 200 } 201 // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held AddLargeAllocMap(void * ptr,size_t num_bytes)202 inline void AddLargeAllocMap(void* ptr, size_t num_bytes) 203 TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { 204 if (ptr != nullptr) { 205 std::pair<void*, size_t> map_val(ptr, num_bytes); 206 large_allocations_map_.insert(map_val); 207 } 208 } RemoveLargeAllocMap(void * ptr)209 inline void RemoveLargeAllocMap(void* ptr) 210 TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) { 211 auto map_iter = large_allocations_map_.find(ptr); 212 if (map_iter != large_allocations_map_.end()) { 213 large_allocations_map_.erase(map_iter); 214 } else { 215 LOG(ERROR) << "tried to deallocate invalid pointer"; 216 } 217 return; 218 } 219 AllocateRaw(size_t alignment,size_t num_bytes)220 inline void* AllocateRaw(size_t alignment, size_t num_bytes) override { 221 // If the allocation size is less than threshold, call small allocator, 222 // otherwise call large-size allocator (BFC). We found that BFC allocator 223 // does not deliver good performance for small allocations when 224 // inter_op_parallelism_threads is high. 225 if (UseSystemAlloc() || num_bytes < kSmallAllocationsThreshold) { 226 return small_size_allocator_->AllocateRaw(alignment, num_bytes); 227 } else { 228 mutex_lock l(mutex_); 229 void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes); 230 AddLargeAllocMap(ptr, num_bytes); 231 return ptr; 232 } 233 } DeallocateRaw(void * ptr)234 inline void DeallocateRaw(void* ptr) override { 235 // Check if ptr is for "small" allocation. If it is, then call Free 236 // directly. Otherwise, call BFC to handle free. 237 if (UseSystemAlloc() || IsSmallSizeAllocation(ptr)) { 238 small_size_allocator_->DeallocateRaw(ptr); 239 } else { 240 mutex_lock l(mutex_); 241 RemoveLargeAllocMap(ptr); 242 large_size_allocator_->DeallocateRaw(ptr); 243 } 244 } GetStats()245 absl::optional<AllocatorStats> GetStats() override { 246 auto s_stats = small_size_allocator_->GetStats(); 247 auto l_stats = large_size_allocator_->GetStats(); 248 249 // Combine statistics from small-size and large-size allocator. 250 mutex_lock l(mutex_); 251 stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs; 252 stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use; 253 stats_.peak_bytes_in_use = 254 l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use; 255 256 // Since small-size allocations go to MklSmallSizeAllocator, 257 // max_alloc_size from large_size_allocator would be the maximum 258 // size allocated by MklCPUAllocator. 259 stats_.largest_alloc_size = l_stats->largest_alloc_size; 260 stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit); 261 return stats_; 262 } 263 ClearStats()264 bool ClearStats() override { 265 bool stats_cleared = small_size_allocator_->ClearStats(); 266 stats_cleared &= large_size_allocator_->ClearStats(); 267 return stats_cleared; 268 } 269 270 private: 271 // Hooks provided by this allocator for memory allocation routines from MKL MallocHook(size_t size)272 static inline void* MallocHook(size_t size) { 273 VLOG(3) << "MklCPUAllocator: In MallocHook"; 274 return cpu_allocator()->AllocateRaw(kAlignment, size); 275 } 276 FreeHook(void * ptr)277 static inline void FreeHook(void* ptr) { 278 VLOG(3) << "MklCPUAllocator: In FreeHook"; 279 cpu_allocator()->DeallocateRaw(ptr); 280 } 281 CallocHook(size_t num,size_t size)282 static inline void* CallocHook(size_t num, size_t size) { 283 Status s = Status(error::Code::UNIMPLEMENTED, 284 "Unimplemented case for hooking MKL function."); 285 TF_CHECK_OK(s); // way to assert with an error message 286 return nullptr; // return a value and make static code analyzers happy 287 } 288 ReallocHook(void * ptr,size_t size)289 static inline void* ReallocHook(void* ptr, size_t size) { 290 Status s = Status(error::Code::UNIMPLEMENTED, 291 "Unimplemented case for hooking MKL function."); 292 TF_CHECK_OK(s); // way to assert with an error message 293 return nullptr; // return a value and make static code analyzers happy 294 } 295 296 // Do we allow growth in BFC Allocator 297 static const bool kAllowGrowth = true; 298 299 // Name 300 static constexpr const char* kName = "mklcpu"; 301 302 // The alignment that we need for the allocations 303 static constexpr const size_t kAlignment = 64; 304 305 Allocator* large_size_allocator_; // owned by this class 306 MklSmallSizeAllocator* small_size_allocator_; // owned by this class. 307 308 SubAllocator* sub_allocator_; // not owned by this class 309 mutable mutex mutex_; 310 AllocatorStats stats_ TF_GUARDED_BY(mutex_); 311 312 // Hash map to keep track of "BFC" allocations 313 // We do not use BFC allocator for small allocations. 314 std::unordered_map<const void*, size_t> large_allocations_map_ 315 TF_GUARDED_BY(mutex_); 316 317 // Size in bytes that defines the upper-bound for "small" allocations. 318 // Any allocation below this threshold is "small" allocation. 319 static constexpr const size_t kSmallAllocationsThreshold = 4096; 320 321 // Prevent copying and assignment 322 TF_DISALLOW_COPY_AND_ASSIGN(MklCPUAllocator); 323 }; 324 325 } // namespace tensorflow 326 327 #endif // INTEL_MKL 328 329 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ 330