/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "absl/types/optional.h"
#ifdef GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#endif  // GOOGLE_CUDA

#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

#if GOOGLE_CUDA
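// Returns a human-readable message for a CUDA driver API result, e.g.
// "CUDA error: out of memory (CUDA_ERROR_OUT_OF_MEMORY)".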
static std::string GetCudaErrorMessage(CUresult result) {
  const char* error;
  cuGetErrorString(result, &error);
  const char* name;
  cuGetErrorName(result, &name);
  return absl::StrCat("CUDA error: ", error ? error : "<unknown>", " (",
                      name ? name : "Unknown", ")");
}
#endif  // GOOGLE_CUDA

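// Logs a histogram of the currently live allocations and, when built against
// CUDA 11.3+, the pool's reserved/used memory counters. Emitted at ERROR level
// so the output is visible when an allocation failure is being diagnosed.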
void GpuCudaMallocAsyncAllocator::PrintAllocatorStatistics() {
  mutex_lock lock(lock_);

  std::map<size_t, int> size_map_histogram;
  std::vector<string> ptr_size_string;
  for (auto p : size_map_) {
    if (VLOG_IS_ON(8)) {
      ptr_size_string.push_back(
          absl::StrCat("(", absl::Hex(p.first), ",", p.second, ")"));
    }
    size_map_histogram[p.second]++;
  }
  LOG(ERROR) << "Histogram of current allocations: (allocation_size_in_bytes, "
             << "number_of_allocations_of_that_size), ...;";
  for (auto p : size_map_histogram) {
    LOG(ERROR) << p.first << ", " << p.second;
  }

  VLOG(8) << "\nThe sorted list of (ptr,size):";
  VLOG(8) << absl::StrJoin(ptr_size_string, ",");

#if CUDA_VERSION >= 11030
  cuuint64_t mem_reserved_current;
  if (auto result = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, &mem_reserved_current)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  cuuint64_t mem_used_current;
  if (auto result = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_USED_MEM_CURRENT, &mem_used_current)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  cuuint64_t mem_reserved_high;
  if (auto result = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, &mem_reserved_high)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  cuuint64_t mem_used_high;
  if (auto result = cuMemPoolGetAttribute(pool_, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
                                          &mem_used_high)) {
    LOG(ERROR) << "Error while fetching extra cudaMallocAsync pool attribute: "
               << GetCudaErrorMessage(result);
  }
  LOG(ERROR) << "CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: "
             << mem_reserved_current;
  LOG(ERROR) << "CU_MEMPOOL_ATTR_USED_MEM_CURRENT: " << mem_used_current;
  LOG(ERROR) << "CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: " << mem_reserved_high;
  LOG(ERROR) << "CU_MEMPOOL_ATTR_USED_MEM_HIGH: " << mem_used_high;
#endif
}

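// Running count of allocator instances created in this process; incremented in
// the constructor below.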
std::atomic<int> GpuCudaMallocAsyncAllocator::number_instantiated_(0);

GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
    PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
    bool compute_stats)
    : name_(absl::StrCat("gpu_async_", platform_device_id.value())),
      reserve_memory_(reserve_memory) {
  ++number_instantiated_;

  // Stop clang from complaining about unused private fields when
  // TF_CUDA_MALLOC_ASYNC_SUPPORTED is not defined.
  (void)reserve_memory_;

#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                           platform_device_id)
                     .ValueOrDie();
  // Initialized here as these fields only exist when compiled with a recent
  // enough CUDA.
  pool_ = nullptr;
  cuda_stream_ = nullptr;
  int driverVersion;
  cuDriverGetVersion(&driverVersion);
  VLOG(2) << "DRIVER VERSION: " << driverVersion;
  if (driverVersion < 11020) {
    LOG(FATAL)  // Crash OK.
        << "Disable cuda_malloc_async or update your CUDA driver to a version"
        << " compatible with CUDA 11.2 or higher."
        << " Detected driver version: " << driverVersion;
  }

  // Work around a CUDA 11.2 driver bug with multiple GPUs: the driver requires
  // the context on GPU 0 to be initialized, which isn't the case for
  // TF+Horovod.
  if (platform_device_id.value() > 0 && driverVersion < 11030) {
    CUcontext pctx;  // We lose track of it, but this is fine.
    if (auto result = cuDevicePrimaryCtxRetain(&pctx, 0))
      LOG(FATAL)  // Crash OK.
          << "Failed to retain context: " << GetCudaErrorMessage(result);
  }

  se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};

  // Fetch the driver version again, this time checking that the call succeeds.
  if (auto status2 = cuDriverGetVersion(&driverVersion)) {
    LOG(FATAL)  // Crash OK.
        << "Error while fetching driver version: "
        << GetCudaErrorMessage(status2);
  }

  // Check that cudaMallocAsync is supported.
  int cuda_malloc_async_supported;
  if (auto status =
          cuDeviceGetAttribute(&cuda_malloc_async_supported,
                               CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
                               platform_device_id.value())) {
    LOG(FATAL)  // Crash OK.
        << "On device: " << platform_device_id.value()
        << " Current driver: " << driverVersion
        << ". Failed to get device attribute: " << GetCudaErrorMessage(status);
  }
  if (!cuda_malloc_async_supported)
    LOG(FATAL)  // Crash OK.
        << "TF_GPU_ALLOCATOR=cuda_malloc_async isn't currently supported on "
        << "GPU id " << platform_device_id.value() << "."
        << " Possible causes: the device is not supported (requires SM60+), "
        << "the driver is too old, the OS is not supported, or the CUDA "
        << "version is too old (requires CUDA 11.2+).";

  if (auto status =
          cuDeviceGetDefaultMemPool(&pool_, platform_device_id.value()))
    LOG(FATAL) <<  // Crash OK.
        "Failed to get default CUDA pool: " << GetCudaErrorMessage(status);

  VLOG(1) << Name() << " CudaMallocAsync initialized on platform: "
          << platform_device_id.value() << " with pool size of: " << pool_size
          << " this ptr: " << this;
  uint64_t pool_size_64 = pool_size;
  if (auto status = cuMemPoolSetAttribute(
          pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &pool_size_64))
    LOG(FATAL) <<  // Crash OK.
        "Failed to set CUDA pool attribute: " << GetCudaErrorMessage(status);

  if (compute_stats) {
    stats_ = std::make_unique<AllocatorStats>();
    stats_->bytes_limit = static_cast<int64_t>(pool_size);
  }  // If not set, it means we do not compute stats.

  // If TF_DETERMINISTIC_ALLOCATOR is set, make the allocator behave
  // deterministically by disabling opportunistic reuse and reuse through
  // internal stream dependencies.
  bool deterministic = false;
  TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_ALLOCATOR",
                                             /*default_val=*/false,
                                             &deterministic));
  if (deterministic) {
    int disable = 0;
    if (auto status = cuMemPoolSetAttribute(
            pool_, CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, &disable)) {
      LOG(FATAL) <<  // Crash OK.
          "Failed to set CUDA pool attribute: " << GetCudaErrorMessage(status);
    }
    if (auto status = cuMemPoolSetAttribute(
            pool_, CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
            &disable)) {
      LOG(FATAL) <<  // Crash OK.
          "Failed to set CUDA pool attribute: " << GetCudaErrorMessage(status);
    }
  }

  // Set read/write access to all GPUs.
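  // all_pools_ / all_ids_ are process-wide registries of every pool and device
  // this allocator has been created for, so each newly created pool can
  // exchange peer access with all previously created ones.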
  static auto* all_pools_ = new std::vector<CUmemoryPool*>();
  static auto* all_ids_ = new std::vector<PlatformDeviceId>();
  DCHECK(all_pools_->size() == all_ids_->size());
  for (int i = 0; i < all_pools_->size(); ++i) {
    // Set the current pool access to the previous GPUs.
    CUmemAccessDesc map;
    map.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    map.location.id = (*all_ids_)[i].value();

    map.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    VLOG(2) << "Setting access of the current pool to "
            << " location id: " << map.location.id;
    int canAccessPeer;
    if (auto status = cuDeviceCanAccessPeer(
            &canAccessPeer, platform_device_id.value(), map.location.id)) {
      pool_ = nullptr;
      LOG(FATAL)  // Crash OK.
          << "cuDeviceCanAccessPeer failed to query whether GPU id "
          << map.location.id << " can access GPU id "
          << platform_device_id.value() << ": " << GetCudaErrorMessage(status);
    }
    if (canAccessPeer == 1) {
      if (auto status = cuMemPoolSetAccess(pool_, &map, 1)) {
        pool_ = nullptr;
        LOG(FATAL)  // Crash OK.
            << "Error when setting access to the pool id: " << i
            << " location id: " << map.location.id
            << " error: " << GetCudaErrorMessage(status);
      }
    }

    // Set the previous pools access to the current GPU.
    map.location.id = platform_device_id.value();

    VLOG(2) << "Set access to the pool id: " << i
            << " location id: " << map.location.id;
    if (auto status =
            cuDeviceCanAccessPeer(&canAccessPeer, (*all_ids_)[i].value(),
                                  platform_device_id.value())) {
      pool_ = nullptr;
      LOG(FATAL)  // Crash OK.
          << "cuDeviceCanAccessPeer failed: " << GetCudaErrorMessage(status);
    }
    if (canAccessPeer == 1) {
      if (auto status = cuMemPoolSetAccess(*(*all_pools_)[i], &map, 1)) {
        pool_ = nullptr;
        LOG(FATAL)  // Crash OK.
            << "Error when setting access to the pool id: " << i
            << " location id: " << map.location.id
            << " error: " << GetCudaErrorMessage(status);
      }
    }
  }
  all_pools_->push_back(&pool_);
  all_ids_->push_back(platform_device_id);

  VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator PoolSize " << pool_size;
#else  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
  LOG(FATAL) << "GpuCudaMallocAsyncAllocator requires CUDA 11.2+";  // Crash OK.
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}

GpuCudaMallocAsyncAllocator::~GpuCudaMallocAsyncAllocator() {}

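// Allocates num_bytes from the CUDA memory pool on cuda_stream_. The alignment
// argument is not used by this allocator. On failure, logs the free/total
// device memory, the allocator stats, and the allocation histogram, then
// returns nullptr.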
void* GpuCudaMallocAsyncAllocator::AllocateRaw(size_t alignment,
                                               size_t num_bytes) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  CHECK(cuda_stream_ != nullptr)
      << "A stream must be added to the GpuCudaMallocAsync allocator";
  if (pool_ == nullptr) {
    LOG(FATAL)  // Crash OK.
        << "The instantiation of GpuCudaMallocAsyncAllocator failed."
        << " See previous errors.";
  }
  se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
  void* ptr = nullptr;
  if (auto result =
          cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&ptr),
                                  num_bytes, pool_, cuda_stream_)) {
    size_t free, total;
    cuMemGetInfo(&free, &total);
    LOG(ERROR) << Name() << " cuMemAllocFromPoolAsync failed to allocate "
               << num_bytes << " bytes: " << GetCudaErrorMessage(result)
               << "\n Reported by CUDA: Free memory/Total memory: " << free
               << "/" << total;
    if (auto stats = GetStats())
      LOG(ERROR) << "Stats: " << stats->DebugString();

    PrintAllocatorStatistics();

    return nullptr;
  }

  // Update stats.
  if (stats_) {
    mutex_lock lock(lock_);
    ++(stats_->num_allocs);
    stats_->bytes_in_use += num_bytes;
    if (stats_->bytes_in_use > stats_->peak_bytes_in_use) {
      VLOG(9) << "New peak memory usage of " << stats_->bytes_in_use
              << " bytes.";
    }
    stats_->peak_bytes_in_use =
        std::max(stats_->peak_bytes_in_use, stats_->bytes_in_use);
    stats_->largest_alloc_size =
        std::max<std::size_t>(stats_->largest_alloc_size, num_bytes);
    size_map_[ptr] = num_bytes;
  }
  VLOG(10) << Name() << " Allocated " << num_bytes << " at " << ptr;
  return ptr;
#else  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
  return nullptr;
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}

void GpuCudaMallocAsyncAllocator::DeallocateRaw(void* ptr) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  if (ptr == nullptr) return;
  if (auto result = cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(ptr),
                                   cuda_stream_)) {
    if (result == CUDA_ERROR_DEINITIALIZED) {
      // With multi-GPU setups, TF sometimes frees GPU allocations after the
      // driver has already been unloaded. It is safe to ignore this error
      // here.
      // TODO: Find out how to fix the shutdown steps in TF.
      VLOG(1) << "Ignoring CUDA error: " << GetCudaErrorMessage(result);
    } else {
      size_t free, total;
      se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
      cuMemGetInfo(&free, &total);
      LOG(ERROR) << "cuMemFreeAsync failed to free " << ptr << ": "
                 << GetCudaErrorMessage(result)
                 << "\n Free memory/Total memory: " << free << "/" << total;
      if (auto stats = GetStats())
        LOG(ERROR) << "Stats: " << stats->DebugString();
    }
  }

  // Update the stats.
  if (stats_) {
    mutex_lock lock(lock_);
    DCHECK(size_map_.contains(ptr));
    size_t size = size_map_[ptr];
    stats_->bytes_in_use -= size;
    size_map_.erase(ptr);
  }

  VLOG(10) << Name() << " Freed ptr: " << ptr;
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}

bool GpuCudaMallocAsyncAllocator::TracksAllocationSizes() const {
  return static_cast<bool>(stats_);
}

size_t GpuCudaMallocAsyncAllocator::RequestedSize(const void* ptr) const {
  if (!stats_ || !ptr) return 0;
  mutex_lock l(lock_);
  return size_map_.at(ptr);
}

size_t GpuCudaMallocAsyncAllocator::AllocatedSize(const void* ptr) const {
  if (!stats_ || !ptr) return 0;
  mutex_lock l(lock_);
  return size_map_.at(ptr);
}

absl::optional<AllocatorStats> GpuCudaMallocAsyncAllocator::GetStats() {
  if (!stats_) return absl::nullopt;
  mutex_lock l(lock_);
  return *stats_;
}

bool GpuCudaMallocAsyncAllocator::ClearStats() {
  if (!stats_) return false;
  mutex_lock l(lock_);
  stats_->num_allocs = 0;
  stats_->peak_bytes_in_use = stats_->bytes_in_use;
  stats_->largest_alloc_size = 0;
  return true;
}

void GpuCudaMallocAsyncAllocator::SetStreamAndPreallocateMemory(void* stream) {
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  CUstream new_cuda_stream = *(static_cast<CUstream*>(stream));
  // Setting the same stream again is a no-op; switching to a different stream
  // is not supported.
  if (cuda_stream_ != nullptr && new_cuda_stream != cuda_stream_) {
    LOG(FATAL) <<  // Crash OK.
        "Trying to set the stream twice. This isn't supported.";
  }

  uint64_t pool_size_64 = 0;
  if (auto status = cuMemPoolGetAttribute(
          pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &pool_size_64)) {
    LOG(FATAL) <<  // Crash OK.
        "Failed to get CUDA pool attribute: " << GetCudaErrorMessage(status);
  }
  cuda_stream_ = new_cuda_stream;
  int64 prealloc_size = 0;
  // TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC=-1 is a special value that
  // preallocates the total pool size.
  TF_CHECK_OK(ReadInt64FromEnvVar("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", 0,
                                  &prealloc_size));
  if (prealloc_size == -1) {
    prealloc_size = pool_size_64;
  } else if (reserve_memory_) {
    prealloc_size = pool_size_64;
  }

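  // Warm the pool up: allocating and immediately freeing prealloc_size bytes
  // grows the pool, and because the release threshold was set to the full pool
  // size in the constructor, the freed memory stays reserved by the pool
  // instead of being returned to the driver. ClearStats() then drops this
  // warm-up allocation from the reported statistics.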
  if (prealloc_size != 0) {
    void* ptr = AllocateRaw(0, prealloc_size);
    DeallocateRaw(ptr);
    VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator reserved the pool for "
            << prealloc_size << " bytes. First ptr: " << ptr;
    ClearStats();
  }
#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
}

}  // namespace tensorflow