/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "absl/types/optional.h"
#ifdef GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#endif  // GOOGLE_CUDA

#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

#if GOOGLE_CUDA
static std::string GetCudaErrorMessage(CUresult result) {
  const char* error;
  cuGetErrorString(result, &error);
  const char* name;
  cuGetErrorName(result, &name);
  return absl::StrCat("CUDA error: ", error ? error : "<unknown>", " (",
                      name ? name : "Unknown", ")");
}
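
// Driver API calls in this file follow the pattern `if (auto result = cuFoo(...))`:
// CUresult is an enum where CUDA_SUCCESS is 0, so the branch is taken exactly
// when a call fails, and GetCudaErrorMessage() is then used to report it.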
#endif  // GOOGLE_CUDA

void GpuCudaMallocAsyncAllocator::PrintAllocatorStatistics() {
  mutex_lock lock(lock_);

  std::map<size_t, int> size_map_histogram;
  std::vector<string> ptr_size_string;
  for (auto p : size_map_) {
    if (VLOG_IS_ON(8)) {
      ptr_size_string.push_back(
          absl::StrCat("(", absl::Hex(p.first), ",", p.second, ")"));
    }
    size_map_histogram[p.second]++;
  }
  LOG(ERROR) << "Histogram of current allocation: (allocation_size_in_bytes, "
             << "nb_allocation_of_that_size), ...;";
  for (auto p : size_map_histogram) {
    LOG(ERROR) << p.first << ", " << p.second;
  }

  VLOG(8) << "\nThe sorted list of (ptr,size):";
  VLOG(8) << absl::StrJoin(ptr_size_string, ",");

#if CUDA_VERSION >= 11030
  cuuint64_t mem_reserved_current;
  if (auto result = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, &mem_reserved_current)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  cuuint64_t mem_used_current;
  if (auto result = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_USED_MEM_CURRENT, &mem_used_current)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  cuuint64_t mem_reserved_high;
  if (auto result = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, &mem_reserved_high)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  cuuint64_t mem_used_high;
  if (auto result = cuMemPoolGetAttribute(pool_, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
                                          &mem_used_high)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  LOG(ERROR) << "CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: "
             << mem_reserved_current;
  LOG(ERROR) << "CU_MEMPOOL_ATTR_USED_MEM_CURRENT: " << mem_used_current;
  LOG(ERROR) << "CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: " << mem_reserved_high;
  LOG(ERROR) << "CU_MEMPOOL_ATTR_USED_MEM_HIGH: " << mem_used_high;
#endif
}

std::atomic<int> GpuCudaMallocAsyncAllocator::number_instantiated_(0);

GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
    PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
    bool compute_stats)
    : name_(absl::StrCat("gpu_async_", platform_device_id.value())),
      reserve_memory_(reserve_memory) {
  ++number_instantiated_;

  // Stop clang from complaining about unused private fields when
  // TF_CUDA_MALLOC_ASYNC_SUPPORTED is not defined.
  (void)reserve_memory_;

#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                           platform_device_id)
                     .ValueOrDie();
  // Initialized here as these fields only exist if compiled with a
  // recent-enough CUDA.
  pool_ = nullptr;
  cuda_stream_ = nullptr;
  int driverVersion;
  cuDriverGetVersion(&driverVersion);
  VLOG(2) << "DRIVER VERSION: " << driverVersion;
  if (driverVersion < 11020) {
    LOG(FATAL)  // Crash OK.
        << "Disable cuda_malloc_async or update your CUDA driver to a version"
        << " compatible with CUDA 11.2 or higher."
        << " We detected a version compatible with: " << driverVersion;
  }

  // Work around a CUDA 11.2 driver bug with multiple GPUs: it requires that
  // the context on GPU 0 is initialized, which isn't the case for
  // TF+Horovod.
  if (platform_device_id.value() > 0 && driverVersion < 11030) {
    CUcontext pctx;  // We lose track of it, but this is fine.
    if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
      LOG(FATAL)  // Crash OK.
          << "Failed to retain context: " << GetCudaErrorMessage(result);
  }

  se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
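  // The scoped activation above makes this device's CUDA context current on
  // this thread for the driver API calls that follow.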

  // Check that the CUDA driver is recent enough.
  if (auto status2 = cuDriverGetVersion(&driverVersion)) {
    LOG(FATAL)  // Crash OK.
        << "Error while fetching driver version: "
        << GetCudaErrorMessage(status2);
  }

  // Check that cudaMallocAsync is supported.
  int cuda_malloc_async_supported;
  if (auto status =
          cuDeviceGetAttribute(&cuda_malloc_async_supported,
                               CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
                               platform_device_id.value())) {
    LOG(FATAL)  // Crash OK.
        << "On device: " << platform_device_id.value()
        << " Current driver: " << driverVersion
        << ". Failed to get device attribute: " << GetCudaErrorMessage(status);
  }
  if (!cuda_malloc_async_supported)
    LOG(FATAL)  // Crash OK.
        << "TF_GPU_ALLOCATOR=cuda_malloc_async isn't currently supported on "
        << "GPU id " << platform_device_id.value() << ". Possible causes: "
        << "device not supported (requires SM60+), driver too old, "
        << "OS not supported, or CUDA version too old (requires CUDA 11.2+).";

  if (auto status =
          cuDeviceGetDefaultMemPool(&pool_, platform_device_id.value()))
    LOG(FATAL) <<  // Crash OK.
        "Failed to get default CUDA pool: " << GetCudaErrorMessage(status);

  VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
          << platform_device_id.value() << " with pool size of: " << pool_size
          << " this ptr: " << this;
  uint64_t pool_size_64 = pool_size;
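  // CU_MEMPOOL_ATTR_RELEASE_THRESHOLD is the amount of reserved memory the
  // pool holds on to before it tries to release memory back to the OS at
  // stream synchronization points. Setting it to the full pool size keeps the
  // pool's reservation from shrinking once it has grown.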
  if (auto status = cuMemPoolSetAttribute(
          pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &pool_size_64))
    LOG(FATAL) <<  // Crash OK.
        "Failed to set CUDA pool attribute: " << GetCudaErrorMessage(status);

  if (compute_stats) {
    stats_ = std::make_unique<AllocatorStats>();
    stats_->bytes_limit = static_cast<int64_t>(pool_size);
  }  // If not set, it means we do not compute stats.

  // If TF_DETERMINISTIC_ALLOCATOR is set, make the allocator behave
  // deterministically.
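  // Disabling opportunistic reuse and internally added dependencies (below)
  // makes the pool's reuse decisions independent of runtime stream timing, so
  // allocation addresses are reproducible across runs, at the cost of
  // potentially higher memory use.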
  bool deterministic = false;
  TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_ALLOCATOR",
                                             /*default_val=*/false,
                                             &deterministic));
  if (deterministic) {
    int disable = 0;
    if (auto status = cuMemPoolSetAttribute(
            pool_, CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, &disable)) {
      LOG(FATAL) <<  // Crash OK.
          "Failed to set CUDA pool attribute: " << GetCudaErrorMessage(status);
    }
    if (auto status = cuMemPoolSetAttribute(
            pool_, CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
            &disable)) {
      LOG(FATAL) <<  // Crash OK.
          "Failed to set CUDA pool attribute: " << GetCudaErrorMessage(status);
    }
  }

  // Set read/write access to all GPUs.
  static auto* all_pools_ = new std::vector<CUmemoryPool*>();
  static auto* all_ids_ = new std::vector<PlatformDeviceId>();
  DCHECK(all_pools_->size() == all_ids_->size());
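  // For every allocator created so far, make its pool accessible from the
  // current GPU and make the current pool accessible from its GPU, provided
  // peer access is possible between the two devices.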
  for (int i = 0; i < all_pools_->size(); ++i) {
    // Set the current pool access to the previous GPUs.
    CUmemAccessDesc map;
    map.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    map.location.id = (*all_ids_)[i].value();

    map.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    VLOG(2) << "Setting access of the current pool to location id: "
            << map.location.id;
    int canAccessPeer;
    if (auto status = cuDeviceCanAccessPeer(
            &canAccessPeer, platform_device_id.value(), map.location.id)) {
      pool_ = nullptr;
      LOG(FATAL)  // Crash OK.
          << "cuDeviceCanAccessPeer failed to determine whether GPU id "
          << map.location.id << " can access GPU id "
          << platform_device_id.value() << ": " << GetCudaErrorMessage(status);
    }
    if (canAccessPeer == 1) {
      if (auto status = cuMemPoolSetAccess(pool_, &map, 1)) {
        pool_ = nullptr;
        LOG(FATAL)  // Crash OK.
            << "Error when setting access to the pool id: " << i
            << " location id: " << map.location.id
            << " error: " << GetCudaErrorMessage(status);
      }
    }

    // Set the previous pools' access to the current GPU.
    map.location.id = platform_device_id.value();

    VLOG(2) << "Set access to the pool id: " << i
            << " location id: " << map.location.id;
    if (auto status = cuDeviceCanAccessPeer(&canAccessPeer,
                                            (*all_ids_)[i].value(),
                                            platform_device_id.value())) {
      pool_ = nullptr;
      LOG(FATAL)  // Crash OK.
          << "cuDeviceCanAccessPeer failed: " << GetCudaErrorMessage(status);
    }
    if (canAccessPeer == 1) {
      if (auto status = cuMemPoolSetAccess(*(*all_pools_)[i], &map, 1)) {
        pool_ = nullptr;
        LOG(FATAL)  // Crash OK.
            << "Error when setting access to the pool id: " << i
            << " location id: " << map.location.id
            << " error: " << GetCudaErrorMessage(status);
      }
    }
  }
  all_pools_->push_back(&pool_);
  all_ids_->push_back(platform_device_id);

  VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator PoolSize " << pool_size;
#else   // TF_CUDA_MALLOC_ASYNC_SUPPORTED
  LOG(FATAL) << "GpuCudaMallocAsyncAllocator requires CUDA 11.2+";  // Crash OK.
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}

GpuCudaMallocAsyncAllocator::~GpuCudaMallocAsyncAllocator() {}

void* GpuCudaMallocAsyncAllocator::AllocateRaw(size_t alignment,
                                               size_t num_bytes) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  CHECK(cuda_stream_ != nullptr)
      << "A stream must be added to the GpuCudaMallocAsync allocator";
  if (pool_ == nullptr) {
    LOG(FATAL)  // Crash OK.
        << "The instantiation of GpuCudaMallocAsyncAllocator failed."
        << " See previous errors.";
  }
  se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
  void* ptr = nullptr;
  if (auto result =
          cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&ptr),
                                  num_bytes, pool_, cuda_stream_)) {
    size_t free, total;
    cuMemGetInfo(&free, &total);
    LOG(ERROR) << Name() << " cuMemAllocAsync failed to allocate " << num_bytes
               << " bytes: " << GetCudaErrorMessage(result)
               << "\n Reported by CUDA: Free memory/Total memory: " << free
               << "/" << total;
    if (auto stats = GetStats())
      LOG(ERROR) << "Stats: " << stats->DebugString();

    PrintAllocatorStatistics();

    return nullptr;
  }

  // Update stats.
  if (stats_) {
    mutex_lock lock(lock_);
    ++(stats_->num_allocs);
    stats_->bytes_in_use += num_bytes;
    if (stats_->bytes_in_use > stats_->peak_bytes_in_use) {
      VLOG(9) << "New Peak memory usage of " << stats_->bytes_in_use
              << " bytes.";
    }
    stats_->peak_bytes_in_use =
        std::max(stats_->peak_bytes_in_use, stats_->bytes_in_use);
    stats_->largest_alloc_size =
        std::max<std::size_t>(stats_->largest_alloc_size, num_bytes);
    size_map_[ptr] = num_bytes;
  }
  VLOG(10) << Name() << " Allocated " << num_bytes << " at " << ptr;
  return ptr;
#else   // TF_CUDA_MALLOC_ASYNC_SUPPORTED
  return nullptr;
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}
void GpuCudaMallocAsyncAllocator::DeallocateRaw(void* ptr) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  if (ptr == nullptr) return;
  if (auto result = cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(ptr),
                                   cuda_stream_)) {
    if (result == CUDA_ERROR_DEINITIALIZED) {
      // With multiple GPUs, TF sometimes frees GPU allocations after the
      // driver has been unloaded. It is safe to ignore this error here.
      // TODO: Fix the shutdown steps in TF so this doesn't happen.
      VLOG(1) << "Ignoring CUDA error: " << GetCudaErrorMessage(result);
    } else {
      size_t free, total;
      se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
      cuMemGetInfo(&free, &total);
      LOG(ERROR) << "cudaFreeAsync failed to free " << ptr << ": "
                 << GetCudaErrorMessage(result)
                 << "\n Free memory/Total memory: " << free << "/" << total;
      if (auto stats = GetStats())
        LOG(ERROR) << "Stats: " << stats->DebugString();
    }
  }

  // Update the stats.
  if (stats_) {
    mutex_lock lock(lock_);
    DCHECK(size_map_.contains(ptr));
    size_t size = size_map_[ptr];
    stats_->bytes_in_use -= size;
    size_map_.erase(ptr);
  }

  VLOG(10) << Name() << " Freed ptr: " << ptr;
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}

bool GpuCudaMallocAsyncAllocator::TracksAllocationSizes() const {
  return static_cast<bool>(stats_);
}

size_t GpuCudaMallocAsyncAllocator::RequestedSize(const void* ptr) const {
  if (!stats_ || !ptr) return 0;
  mutex_lock l(lock_);
  return size_map_.at(ptr);
}

size_t GpuCudaMallocAsyncAllocator::AllocatedSize(const void* ptr) const {
  if (!stats_ || !ptr) return 0;
  mutex_lock l(lock_);
  return size_map_.at(ptr);
}

absl::optional<AllocatorStats> GpuCudaMallocAsyncAllocator::GetStats() {
  if (!stats_) return absl::nullopt;
  mutex_lock l(lock_);
  return *stats_;
}

bool GpuCudaMallocAsyncAllocator::ClearStats() {
  if (!stats_) return false;
  mutex_lock l(lock_);
  stats_->num_allocs = 0;
  stats_->peak_bytes_in_use = stats_->bytes_in_use;
  stats_->largest_alloc_size = 0;
  return true;
}

void GpuCudaMallocAsyncAllocator::SetStreamAndPreallocateMemory(void* stream) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  CUstream new_cuda_stream = *(static_cast<CUstream*>(stream));
  // We don't need to re-set the CUDA stream if this is the same stream.
  if (cuda_stream_ != nullptr && new_cuda_stream != cuda_stream_) {
    LOG(FATAL) <<  // Crash OK.
        "Trying to set the stream twice. This isn't supported.";
  }

  uint64_t pool_size_64 = 0;
  if (auto status = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &pool_size_64)) {
    LOG(FATAL) <<  // Crash OK.
        "Failed to get CUDA pool attribute: " << GetCudaErrorMessage(status);
  }
  cuda_stream_ = new_cuda_stream;
  int64 prealloc_size = 0;
  // TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC=-1 is a special value that
  // preallocates the total pool size.
  TF_CHECK_OK(ReadInt64FromEnvVar("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", 0,
                                  &prealloc_size));
  if (prealloc_size == -1) {
    prealloc_size = pool_size_64;
  } else if (reserve_memory_) {
    prealloc_size = pool_size_64;
  }

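  // Allocating and immediately freeing prealloc_size bytes below forces the
  // pool to reserve that much device memory up front; since the release
  // threshold was set to the pool size, the reservation is kept after the
  // free.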
  if (prealloc_size != 0) {
    void* ptr = AllocateRaw(0, prealloc_size);
    DeallocateRaw(ptr);
    VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator reserved the pool for "
            << prealloc_size << " bytes"
            << ". First ptr: " << ptr;
    ClearStats();
  }
#endif
}

}  // namespace tensorflow