xref: /aosp_15_r20/external/tensorflow/tensorflow/core/common_runtime/gpu/gpu_process_state.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"

#include <cstdlib>
#include <cstring>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/memory/memory.h"
#include "tensorflow/core/common_runtime/device/device_host_allocator.h"
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

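// The TF_GPU_ALLOCATOR environment variable selects an alternative GPU
// allocator.  The helpers below recognize three values:
//   "cuda_malloc"       - route every allocation straight through cudaMalloc.
//   "memory_guard"      - wrap the allocator with overwrite-detection and
//                         NaN-reset debugging allocators.
//   "cuda_malloc_async" - use the cudaMallocAsync-based allocator
//                         (requires a build with CUDA 11.2+).
// Illustrative shell usage (hypothetical command line, shown only as an
// example):
//   TF_GPU_ALLOCATOR=cuda_malloc_async python train.py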
// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "cuda_malloc") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMemoryGuardAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "memory_guard") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAsyncAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  auto result = allocator_env != nullptr &&
                std::strcmp(allocator_env, "cuda_malloc_async") == 0;
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  return result;
#else
  if (result)
    LOG(ERROR) << "TF_GPU_ALLOCATOR=cuda_malloc_async environment found, "
               << "but TensorFlow was not compiled with CUDA 11.2+.";
  return false;
#endif
}

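// Process-wide singleton accessor.  The first call latches the instance: a
// non-null |ps| becomes the singleton, and every later call must pass either
// nullptr or that same pointer (enforced by the DCHECK below).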
/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
  DCHECK((!ps) || (ps == instance))
      << "Multiple calls to GPUProcessState with non-null ps";
  return instance;
}

GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
  process_state_ = ProcessState::singleton();
}

int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
  // Return the NUMA node associated with the GPU's StreamExecutor.
  se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
                               DEVICE_GPU, GPUMachineManager(), tf_device_id)
                               .ValueOrDie();
  int numa_node = se->GetDeviceDescription().numa_node();
  // bus_id must be non-negative.  If the numa_node is not known,
  // use 0.
  return numa_node >= 0 ? numa_node : 0;
}

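// Builds the SubAllocator that hands raw device memory to the BFC allocator.
// The GpuVirtualMemAllocator path below is currently disabled (#if 0) because
// of the observed OOM issues noted in the FIXME, so in practice this always
// returns a DeviceMemAllocator, switching to unified memory when the caller
// oversubscribes device memory or explicitly asks for it.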
// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
static std::unique_ptr<SubAllocator> CreateSubAllocator(
    const GPUOptions& options, PlatformDeviceId platform_device_id,
    const std::vector<SubAllocator::Visitor>& alloc_visitors,
    size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                            platform_device_id)
                      .ValueOrDie();

  // FIXME(imintz): Observed OOM issues when using the virtual memory
  // allocators. This should be reenabled when resolved.
#if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
  // Use the old allocator when unified memory is required.
  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
  if (options.per_process_gpu_memory_fraction() > 1.0 ||
      options.experimental().use_unified_memory()) {
    return new DeviceMemAllocator(executor, platform_device_id,
                                  /*use_unified_memory=*/true, alloc_visitors,
                                  {});
  } else {
    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
        executor->implementation()->GpuContextHack());

    absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
    for (const TfDeviceId tf_device_id : peer_gpu_ids) {
      PlatformDeviceId platform_device_id;
      TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
      platform_peer_gpu_ids.insert(platform_device_id);
    }
    std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());

    // Adjust virtual address space to be slightly larger than the physical
    // address space in case the BFC allocator performs suboptimal garbage
    // collection.
    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
    // the va space.
    return GpuVirtualMemAllocator::Create(
               alloc_visitors, {}, *gpu_context, platform_device_id,
               /*virtual_address_space_size=*/total_bytes * 2,
               platform_peer_gpu_ids_vec)
        .ValueOrDie()
        .release();
  }
#else
  return absl::WrapUnique(
      new DeviceMemAllocator(executor, platform_device_id,
                             (options.per_process_gpu_memory_fraction() > 1.0 ||
                              options.experimental().use_unified_memory()),
                             alloc_visitors, {}));
#endif
}

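// Returns (and lazily creates) the allocator for |tf_device_id|.  The stack
// assembled here is: CreateSubAllocator -> GPUBFCAllocator, optionally
// replaced or wrapped by the debug, cudaMalloc, or cudaMallocAsync allocators
// depending on TF_GPU_ALLOCATOR / GPUOptions, and finally wrapped in a
// RecordingAllocator when memory-type recording is enabled.  The result is
// cached per device, so the options passed on the first call win.
// Illustrative call (variable names are hypothetical):
//   Allocator* a = GPUProcessState::singleton()->GetGPUAllocator(
//       gpu_options, TfDeviceId(0), /*total_bytes=*/1 << 30,
//       /*peer_gpu_ids=*/{});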
Allocator* GPUProcessState::GetGPUAllocator(
    const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
    const std::vector<TfDeviceId>& peer_gpu_ids) {
  CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  const string& allocator_type = options.allocator_type();
  mutex_lock lock(mu_);
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);

  if (tf_device_id.value() >= static_cast<int64_t>(gpu_allocators_.size())) {
    gpu_allocators_.resize(tf_device_id.value() + 1);
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.allocator == nullptr) {
    // Validate allocator types.
    if (!allocator_type.empty() && allocator_type != "BFC") {
      LOG(ERROR) << "Invalid allocator type: " << allocator_type;
      return nullptr;
    }

    PlatformDeviceId platform_device_id;
    TF_CHECK_OK(
        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
    int bus_id = BusIdForGPU(tf_device_id);
    DCHECK_GE(bus_id, 0);
    while (bus_id >= gpu_visitors_.size()) {
      gpu_visitors_.push_back({});
    }
    std::unique_ptr<SubAllocator> sub_allocator =
        CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
                           total_bytes, peer_gpu_ids);
    SubAllocator* sub_allocator_ptr = sub_allocator.get();

    auto gpu_bfc_allocator = absl::make_unique<GPUBFCAllocator>(
        std::move(sub_allocator), total_bytes,
        strings::StrCat("GPU_", tf_device_id.value(), "_bfc"), [&] {
          GPUBFCAllocator::Options o;
          o.allow_growth = options.allow_growth();
          o.allow_retry_on_failure =
              !options.experimental().disallow_retry_on_allocation_failure();
          o.fragmentation_fraction =
              options.experimental().internal_fragmentation_fraction();
          return o;
        }());
    Allocator* gpu_allocator = gpu_bfc_allocator.get();

    SharedCounter* timing_counter = nullptr;
    if (options.experimental().timestamped_allocator()) {
      timing_counter = new SharedCounter;
      gpu_bfc_allocator->SetTimingCounter(timing_counter);
    }

    // If requested, checks for memory overwrites by writing
    // distinctive patterns on both ends of allocated memory.
    if (UseCudaMemoryGuardAllocator()) {
      LOG(INFO) << "Using memory guard allocator for GPU.";
      gpu_allocator = new GPUNanResetAllocator(
          new GPUDebugAllocator(gpu_allocator, platform_device_id),
          platform_device_id);
    } else if (UseCudaMallocAllocator()) {
      LOG(INFO) << "Using CUDA malloc allocator for GPU.";
      // Passes all allocation requests straight through to cudaMalloc;
      // useful for doing memory debugging with tools like cuda-memcheck.
      // **WARNING** probably will not work in a multi-gpu scenario.
      gpu_bfc_allocator.reset();
      gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
    } else if (UseCudaMallocAsyncAllocator() ||
               options.experimental().use_cuda_malloc_async()) {
      LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
                << platform_device_id;
      // Passes all allocation requests straight through to cudaMallocAsync;
      // useful for doing memory debugging with tools like compute-sanitizer.
      // **WARNING** probably will not work in a multi-gpu scenario.
      gpu_bfc_allocator.reset();
      gpu_allocator =
          new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
    }

    Allocator* recording_allocator = nullptr;
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::GPU;
      md.dev_index = platform_device_id.value();
      md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
          &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
    }
    allocator_parts = {
        std::unique_ptr<Allocator>(gpu_allocator),
        std::unique_ptr<SharedCounter>(timing_counter),
        gpu_bfc_allocator.release(),
        sub_allocator_ptr,
        std::unique_ptr<Allocator>(recording_allocator),
    };
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return allocator_parts.recording_allocator.get();
  } else {
    return allocator_parts.allocator.get();
  }
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
                "--config=rocm.";
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

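// Returns the SharedCounter used for timestamped allocation on the given
// device, creating it on first use.  Returns nullptr if no allocator has been
// created for |tf_device_id| yet, or if the underlying allocator is not a BFC
// allocator (e.g. when cuda_malloc or cuda_malloc_async was selected).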
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
  DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);
  mutex_lock l(mu_);
  if (tf_device_id.value() >= static_cast<int64_t>(gpu_allocators_.size())) {
    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
               << " but only have " << gpu_allocators_.size();
    return nullptr;
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.counter.get() == nullptr) {
    if (allocator_parts.bfc_allocator == nullptr) {
      return nullptr;
    }
    SharedCounter* timing_counter = new SharedCounter;
    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
    allocator_parts.counter.reset(timing_counter);
  }
  return allocator_parts.counter.get();
#else
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

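// Returns a host-side allocator whose memory is registered with the GPU
// driver (pinned memory), backed by a BFC allocator capped at
// TF_GPU_HOST_MEM_LIMIT_IN_MB megabytes (64GB by default).  Falls back to the
// plain CPU allocator when no GPU device is present or GPU DMA registration
// is disabled.  Illustrative shell usage (hypothetical command line, shown
// only as an example):
//   TF_GPU_HOST_MEM_LIMIT_IN_MB=4096 python train.py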
Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
  CHECK(process_state_);
  if (!HasGPUDevice() ||
      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
    return process_state_->GetCPUAllocator(numa_node);
  }
  if (numa_node == port::kNUMANoAffinity) {
    numa_node = 0;
  }
  {
    // Here we optimize the most common use case where gpu_host_allocators_
    // have already been populated and since we're only reading
    // these vectors, we can get by with a shared lock. In the slower case,
    // we take a unique lock and populate these vectors.
    tf_shared_lock lock(mu_);

    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
        !gpu_host_allocators_.empty() &&
        gpu_host_allocators_[0].recording_allocator != nullptr) {
      return gpu_host_allocators_[0].recording_allocator.get();
    }
    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
      return gpu_host_allocators_[0].allocator.get();
    }
  }

  mutex_lock lock(mu_);
  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
  // through, since any will work.
  //
  // This search isn't super clean, and it would be nice to use a
  // better source of information about which executor to use.  For
  // example, process_state could maybe save the first stream executor
  // it knows is valid.
  se::StreamExecutor* se = nullptr;
  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
    if (gpu_allocators_[i].allocator != nullptr) {
      se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                               TfDeviceId(i))
               .ValueOrDie();
      break;
    }
  }

  CHECK_NE(nullptr, se);

  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
    while (gpu_host_alloc_visitors_.size() <= numa_node) {
      gpu_host_alloc_visitors_.push_back({});
    }
    while (gpu_host_free_visitors_.size() <= numa_node) {
      gpu_host_free_visitors_.push_back({});
    }
    SubAllocator* sub_allocator = new DeviceHostAllocator(
        se, numa_node, gpu_host_alloc_visitors_[numa_node],
        gpu_host_free_visitors_[numa_node]);
    // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
    int64_t gpu_host_mem_limit_in_mb = -1;
    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                        1LL << 16 /*64GB max by default*/,
                                        &gpu_host_mem_limit_in_mb);
    if (!status.ok()) {
      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
    }
    int64_t gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);

    BFCAllocator::Options allocator_opts;
    allocator_opts.allow_growth = true;
    Allocator* allocator =
        new BFCAllocator(absl::WrapUnique(sub_allocator), gpu_host_mem_limit,
                         /*name=*/"gpu_host_bfc", allocator_opts);

    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                    std::unique_ptr<SharedCounter>(nullptr),
                                    nullptr, sub_allocator,
                                    std::unique_ptr<Allocator>(nullptr)});
    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      allocator_parts.recording_allocator.reset(
          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
                                           allocator_parts.allocator.get(), md,
                                           &mu_));
    }
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return gpu_host_allocators_[0].recording_allocator.get();
  } else {
    return gpu_host_allocators_[0].allocator.get();
  }
}

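// The visitor-registration hooks below must be called before the
// corresponding allocators are first created: AddGPUAllocVisitor before the
// first GetGPUAllocator call, and the two host-side hooks before the first
// GetGpuHostAllocator call (each is enforced by a CHECK).  Registered
// visitors are forwarded to the SubAllocator so they can observe the memory
// regions it obtains and releases.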
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                         const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_allocators_.empty())  // Crash OK
      << "AddGPUAllocVisitor must be called before "
         "first call to GetGPUAllocator.";
  DCHECK_GE(bus_id, 0);
  while (bus_id >= static_cast<int64_t>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostAllocVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostAllocVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64_t>(gpu_host_alloc_visitors_.size())) {
    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostFreeVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostFreeVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64_t>(gpu_host_free_visitors_.size())) {
    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_free_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

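// Test-only helper: drops every cached allocator and visitor so the next
// GetGPUAllocator / GetGpuHostAllocator call rebuilds them from scratch.
// Note that Allocator pointers handed out before the reset are destroyed
// along with the cache.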
void GPUProcessState::TestOnlyReset() {
  if (process_state_) {
    process_state_->ProcessState::TestOnlyReset();
  }
  {
    mutex_lock lock(mu_);
    gpu_device_enabled_ = false;
    gpu_allocators_.clear();
    gpu_visitors_.clear();
    gpu_host_allocators_.clear();
    gpu_host_alloc_visitors_.clear();
    gpu_host_free_visitors_.clear();
  }
}

}  // namespace tensorflow