/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"

#include <cstdlib>
#include <cstring>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/memory/memory.h"
#include "tensorflow/core/common_runtime/device/device_host_allocator.h"
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "cuda_malloc") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMemoryGuardAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "memory_guard") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAsyncAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  auto result = allocator_env != nullptr &&
                std::strcmp(allocator_env, "cuda_malloc_async") == 0;
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  return result;
#else
  if (result)
    LOG(ERROR) << "TF_GPU_ALLOCATOR=cuda_malloc_async environment variable "
               << "found, but TensorFlow was not compiled with CUDA 11.2+.";
  return false;
#endif
}
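// The helpers above only recognize the exact TF_GPU_ALLOCATOR values
// "cuda_malloc", "memory_guard", and "cuda_malloc_async"; any other value (or
// an unset variable) falls through to the default BFC allocator selected in
// GetGPUAllocator() below. Illustrative shell usage (a sketch, not an
// exhaustive reference):
//   TF_GPU_ALLOCATOR=cuda_malloc_async python train.py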

/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
  DCHECK((!ps) || (ps == instance))
      << "Multiple calls to GPUProcessState with non-null ps";
  return instance;
}
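// Illustrative usage (a sketch; assumes the header declares a default
// `ps = nullptr` argument): callers normally fetch the process-wide instance
// with
//   GPUProcessState* gps = GPUProcessState::singleton();
// Passing a non-null `ps` is only meaningful before the first call; a later
// mismatching pointer trips the DCHECK above.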

GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
  process_state_ = ProcessState::singleton();
}

int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
  // Return the NUMA node associated with the GPU's StreamExecutor.
  se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
                               DEVICE_GPU, GPUMachineManager(), tf_device_id)
                               .ValueOrDie();
  int numa_node = se->GetDeviceDescription().numa_node();
  // bus_id must be non-negative.  If the numa_node is not known,
  // use 0.
  return numa_node >= 0 ? numa_node : 0;
}

// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
static std::unique_ptr<SubAllocator> CreateSubAllocator(
    const GPUOptions& options, PlatformDeviceId platform_device_id,
    const std::vector<SubAllocator::Visitor>& alloc_visitors,
    size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                            platform_device_id)
                      .ValueOrDie();

  // FIXME(imintz): Observed OOM issues when using the virtual memory
  // allocators. This should be reenabled when resolved.
#if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
  // Use the old allocator when unified memory is required.
  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
  if (options.per_process_gpu_memory_fraction() > 1.0 ||
      options.experimental().use_unified_memory()) {
    return new DeviceMemAllocator(executor, platform_device_id,
                                  /*use_unified_memory=*/true, alloc_visitors,
                                  {});
  } else {
    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
        executor->implementation()->GpuContextHack());

    absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
    for (const TfDeviceId tf_device_id : peer_gpu_ids) {
      PlatformDeviceId platform_device_id;
      TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id,
                                                     &platform_device_id));
      platform_peer_gpu_ids.insert(platform_device_id);
    }
    std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());

    // Adjust virtual address space to be slightly larger than the physical
    // address space in case the BFC allocator performs suboptimal garbage
    // collection.
    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
    // the va space.
    return GpuVirtualMemAllocator::Create(
               alloc_visitors, {}, *gpu_context, platform_device_id,
               /*virtual_address_space_size=*/total_bytes * 2,
               platform_peer_gpu_ids_vec)
        .ValueOrDie()
        .release();
  }
#else
  return absl::WrapUnique(
      new DeviceMemAllocator(executor, platform_device_id,
                             (options.per_process_gpu_memory_fraction() > 1.0 ||
                              options.experimental().use_unified_memory()),
                             alloc_visitors, {}));
#endif
}

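// Illustrative caller-side sketch (assumed, not taken from this file): a GPU
// device typically requests its allocator once at setup time and then
// allocates through the standard Allocator interface, e.g.
//   Allocator* a = GPUProcessState::singleton()->GetGPUAllocator(
//       options, TfDeviceId(0), /*total_bytes=*/1ULL << 30,
//       /*peer_gpu_ids=*/{});
//   void* buf = a->AllocateRaw(Allocator::kAllocatorAlignment, 4096);
//   a->DeallocateRaw(buf);
// The first call for a given TfDeviceId builds and caches the allocator;
// later calls return the cached instance.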
Allocator* GPUProcessState::GetGPUAllocator(
    const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
    const std::vector<TfDeviceId>& peer_gpu_ids) {
  CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  const string& allocator_type = options.allocator_type();
  mutex_lock lock(mu_);
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);

  if (tf_device_id.value() >= static_cast<int64_t>(gpu_allocators_.size())) {
    gpu_allocators_.resize(tf_device_id.value() + 1);
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.allocator == nullptr) {
    // Validate allocator types.
    if (!allocator_type.empty() && allocator_type != "BFC") {
      LOG(ERROR) << "Invalid allocator type: " << allocator_type;
      return nullptr;
    }

    PlatformDeviceId platform_device_id;
    TF_CHECK_OK(
        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
    int bus_id = BusIdForGPU(tf_device_id);
    DCHECK_GE(bus_id, 0);
    while (bus_id >= gpu_visitors_.size()) {
      gpu_visitors_.push_back({});
    }
    std::unique_ptr<SubAllocator> sub_allocator =
        CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
                           total_bytes, peer_gpu_ids);
    SubAllocator* sub_allocator_ptr = sub_allocator.get();

    auto gpu_bfc_allocator = absl::make_unique<GPUBFCAllocator>(
        std::move(sub_allocator), total_bytes,
        strings::StrCat("GPU_", tf_device_id.value(), "_bfc"), [&] {
          GPUBFCAllocator::Options o;
          o.allow_growth = options.allow_growth();
          o.allow_retry_on_failure =
              !options.experimental().disallow_retry_on_allocation_failure();
          o.fragmentation_fraction =
              options.experimental().internal_fragmentation_fraction();
          return o;
        }());
    Allocator* gpu_allocator = gpu_bfc_allocator.get();

    SharedCounter* timing_counter = nullptr;
    if (options.experimental().timestamped_allocator()) {
      timing_counter = new SharedCounter;
      gpu_bfc_allocator->SetTimingCounter(timing_counter);
    }

    // When TF_GPU_ALLOCATOR=memory_guard, check for memory overwrites by
    // writing distinctive patterns on both ends of allocated memory.
    if (UseCudaMemoryGuardAllocator()) {
      LOG(INFO) << "Using memory guard allocator for GPU.";
      gpu_allocator = new GPUNanResetAllocator(
          new GPUDebugAllocator(gpu_allocator, platform_device_id),
          platform_device_id);
    } else if (UseCudaMallocAllocator()) {
      LOG(INFO) << "Using CUDA malloc allocator for GPU.";
      // Pass all allocation requests straight through to cudaMalloc. This is
      // useful for memory debugging with tools such as cuda-memcheck.
      // **WARNING** It will probably not work in a multi-GPU scenario.
      gpu_bfc_allocator.reset();
      gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
    } else if (UseCudaMallocAsyncAllocator() ||
               options.experimental().use_cuda_malloc_async()) {
      LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
                << platform_device_id;
      // Pass all allocation requests straight through to cudaMallocAsync. This
      // is useful for memory debugging with tools such as compute-sanitizer.
      // **WARNING** It will probably not work in a multi-GPU scenario.
      gpu_bfc_allocator.reset();
      gpu_allocator =
          new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
    }

    Allocator* recording_allocator = nullptr;
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::GPU;
      md.dev_index = platform_device_id.value();
      md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
          &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
    }
    allocator_parts = {
        std::unique_ptr<Allocator>(gpu_allocator),
        std::unique_ptr<SharedCounter>(timing_counter),
        gpu_bfc_allocator.release(),
        sub_allocator_ptr,
        std::unique_ptr<Allocator>(recording_allocator),
    };
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return allocator_parts.recording_allocator.get();
  } else {
    return allocator_parts.allocator.get();
  }
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
                "--config=rocm.";
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
  DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);
  mutex_lock l(mu_);
  if (tf_device_id.value() >= static_cast<int64_t>(gpu_allocators_.size())) {
    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
               << " but only have " << gpu_allocators_.size();
    return nullptr;
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.counter.get() == nullptr) {
    if (allocator_parts.bfc_allocator == nullptr) {
      return nullptr;
    }
    SharedCounter* timing_counter = new SharedCounter;
    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
    allocator_parts.counter.reset(timing_counter);
  }
  return allocator_parts.counter.get();
#else
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

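// Illustrative note on sizing the pinned host pool created below (a sketch,
// not an exhaustive reference): the BFC limit comes from the
// TF_GPU_HOST_MEM_LIMIT_IN_MB environment variable and defaults to
// 1 << 16 MB (64 GB). For example, to cap pinned host memory at 4 GB one
// might launch with
//   TF_GPU_HOST_MEM_LIMIT_IN_MB=4096 python train.py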
Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
  CHECK(process_state_);
  if (!HasGPUDevice() ||
      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
    return process_state_->GetCPUAllocator(numa_node);
  }
  if (numa_node == port::kNUMANoAffinity) {
    numa_node = 0;
  }
  {
    // Here we optimize the most common use case where gpu_host_allocators_
    // have already been populated and since we're only reading
    // these vectors, we can get by with a shared lock. In the slower case,
    // we take a unique lock and populate these vectors.
    tf_shared_lock lock(mu_);

    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
        !gpu_host_allocators_.empty() &&
        gpu_host_allocators_[0].recording_allocator != nullptr) {
      return gpu_host_allocators_[0].recording_allocator.get();
    }
    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
      return gpu_host_allocators_[0].allocator.get();
    }
  }

  mutex_lock lock(mu_);
  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
  // through, since any will work.
  //
  // This search isn't super clean, and it would be nice to use a
  // better source of information about which executor to use. For
  // example, process_state could maybe save the first stream executor
  // it knows is valid.
  se::StreamExecutor* se = nullptr;
  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
    if (gpu_allocators_[i].allocator != nullptr) {
      se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                               TfDeviceId(i))
               .ValueOrDie();
      break;
    }
  }

  CHECK_NE(nullptr, se);

  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
    while (gpu_host_alloc_visitors_.size() <= numa_node) {
      gpu_host_alloc_visitors_.push_back({});
    }
    while (gpu_host_free_visitors_.size() <= numa_node) {
      gpu_host_free_visitors_.push_back({});
    }
    SubAllocator* sub_allocator = new DeviceHostAllocator(
        se, numa_node, gpu_host_alloc_visitors_[numa_node],
        gpu_host_free_visitors_[numa_node]);
    // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
    int64_t gpu_host_mem_limit_in_mb = -1;
    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                        1LL << 16 /*64GB max by default*/,
                                        &gpu_host_mem_limit_in_mb);
    if (!status.ok()) {
      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
    }
    int64_t gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);

    BFCAllocator::Options allocator_opts;
    allocator_opts.allow_growth = true;
    Allocator* allocator =
        new BFCAllocator(absl::WrapUnique(sub_allocator), gpu_host_mem_limit,
                         /*name=*/"gpu_host_bfc", allocator_opts);

    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                    std::unique_ptr<SharedCounter>(nullptr),
                                    nullptr, sub_allocator,
                                    std::unique_ptr<Allocator>(nullptr)});
    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      allocator_parts.recording_allocator.reset(
          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
                                           allocator_parts.allocator.get(), md,
                                           &mu_));
    }
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return gpu_host_allocators_[0].recording_allocator.get();
  } else {
    return gpu_host_allocators_[0].allocator.get();
  }
}

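// Illustrative sketch of registering a visitor (assumes the
// SubAllocator::Visitor signature std::function<void(void*, int, size_t)>;
// adapt as needed). Visitors must be registered before the corresponding
// allocator is first created, as the CHECKs below enforce:
//   GPUProcessState::singleton()->AddGPUAllocVisitor(
//       /*bus_id=*/0, [](void* ptr, int index, size_t num_bytes) {
//         VLOG(1) << "GPU sub-allocation at " << ptr << ", " << num_bytes
//                 << " bytes, index=" << index;
//       });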
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                         const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_allocators_.empty())  // Crash OK
      << "AddGPUAllocVisitor must be called before "
         "first call to GetGPUAllocator.";
  DCHECK_GE(bus_id, 0);
  while (bus_id >= static_cast<int64_t>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostAllocVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostAllocVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64_t>(gpu_host_alloc_visitors_.size())) {
    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostFreeVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostFreeVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64_t>(gpu_host_free_visitors_.size())) {
    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_free_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::TestOnlyReset() {
  if (process_state_) {
    process_state_->ProcessState::TestOnlyReset();
  }
  {
    mutex_lock lock(mu_);
    gpu_device_enabled_ = false;
    gpu_allocators_.clear();
    gpu_visitors_.clear();
    gpu_host_allocators_.clear();
    gpu_host_alloc_visitors_.clear();
    gpu_host_free_visitors_.clear();
  }
}

}  // namespace tensorflow