/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.h"

#include <utility>

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_event.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/compiler/xla/stream_executor/kernel_cache_config.h"
#include "tensorflow/compiler/xla/stream_executor/lib/env.h"
#include "tensorflow/compiler/xla/stream_executor/lib/error.h"
#include "tensorflow/compiler/xla/stream_executor/lib/initialize.h"
#include "tensorflow/compiler/xla/stream_executor/lib/mathutil.h"
#include "tensorflow/compiler/xla/stream_executor/lib/numbers.h"
#include "tensorflow/compiler/xla/stream_executor/lib/path.h"
#include "tensorflow/compiler/xla/stream_executor/lib/process_state.h"
#include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/platform.h"
#include "tensorflow/compiler/xla/stream_executor/platform/logging.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"
#include "tensorflow/compiler/xla/stream_executor/plugin_registry.h"
#include "tensorflow/compiler/xla/stream_executor/stream.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_internal.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/compiler/xla/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
// can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
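//
// A minimal sketch of how another translation unit might install the hook
// (hypothetical; CompilePtxToCubin is an assumed helper, not part of this
// codebase):
//
//   extern std::function<std::string(const std::string&)> g_cubinate;
//   static const bool g_cubinate_installed = [] {
//     g_cubinate = [](const std::string& ptx) -> std::string {
//       return CompilePtxToCubin(ptx);
//     };
//     return true;
//   }();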
std::function<std::string(const std::string&)> g_cubinate;

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to libcuda functions which will honor constness.
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    std::string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  std::string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(std::string(filename)).ok()) {
    *found_filename = std::string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          std::string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static std::string GetBinaryDir(bool strip_exe) {
  std::string exe_path = port::GetExecutablePath();
  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::vector<std::string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return ::tensorflow::OkStatus();
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return ::tensorflow::OkStatus();
}

port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const std::string* kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char* cubin = spec.cuda_cubin_in_memory().bytes();
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(ptx, &module));
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    return port::InternalError("No method of loading CUDA kernel provided");
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return port::InternalError("Could not find the corresponding function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return ::tensorflow::OkStatus();
}

bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return ::tensorflow::OkStatus();
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    if (!spec.cuda_ptx_in_memory()) {
      return port::InternalError("PTX not found in spec");
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(
        LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
    *module_handle = ModuleHandle(
        const_cast<void*>(static_cast<const void*>(spec.cuda_ptx_in_memory())));
    return ::tensorflow::OkStatus();
  }
  return port::InternalError("No method of loading CUDA module provided");
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

namespace {
absl::uint128 Fingerprint128(const absl::string_view s) {
  auto fp = tensorflow::Fingerprint128(s);
  return absl::MakeUint128(fp.high64, fp.low64);
}
}  // namespace

port::StatusOr<std::shared_ptr<DeviceMemoryBase>>
GpuExecutor::CreateOrShareConstant(Stream* stream,
                                   const std::vector<uint8_t>& content) {
  absl::MutexLock lock{&shared_constants_mu_};
  // We assume all constants are uniquely identified by this hash. In the
  // (highly unlikely) event of a hash collision, the program will likely crash
  // (because the cached constant that will be returned by mistake is unlikely
  // to have the correct size).
  absl::uint128 fingerprint = Fingerprint128(absl::string_view(
      reinterpret_cast<const char*>(content.data()), content.size()));
  // Must insert nullptr first to get an iterator to the insertion point.
  auto insert_result = shared_constants_.insert(
      {fingerprint, std::weak_ptr<DeviceMemoryBase>()});
  auto it = insert_result.first;
  bool was_already_in_cache = !insert_result.second;
  std::shared_ptr<DeviceMemoryBase> shared_constant;

  if (was_already_in_cache) {
    shared_constant = it->second.lock();
  }

  if (shared_constant == nullptr) {
    // Either the constant wasn't found in the cache, or it was but its
    // weak_ptr had expired.
    DeviceMemoryBase* new_constant =
        new DeviceMemoryBase(Allocate(content.size(), /*memory_space=*/0));
    if (new_constant->opaque() == nullptr) {
      return port::InternalError(absl::StrFormat(
          "Failed to allocate %d bytes for new constant", content.size()));
    }

    port::Status status =
        stream->ThenMemcpy(new_constant, content.data(), content.size())
            .BlockHostUntilDone();
    if (!status.ok()) {
      Deallocate(new_constant);
      status.Update(port::InternalError(absl::StrFormat(
          "Memcpy to device address %p failed", new_constant->opaque())));
      return status;
    }

    // Capturing 'this' in the custom deleter means this executor must
    // outlive all shared uses of this constant.
    shared_constant = std::shared_ptr<DeviceMemoryBase>(
        new_constant, [this](DeviceMemoryBase* p) {
          Deallocate(p);
          delete p;
        });
    it->second = std::weak_ptr<DeviceMemoryBase>(shared_constant);
  }

  return shared_constant;
}
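
// The method above can be exercised roughly as follows (hypothetical call
// site; `executor`, `stream`, and `GetConstantBytes()` are assumed to exist
// and are not part of this file):
//
//   std::vector<uint8_t> bytes = GetConstantBytes();
//   auto constant_or = executor->CreateOrShareConstant(stream, bytes);
//   if (constant_or.ok()) {
//     std::shared_ptr<DeviceMemoryBase> constant = constant_or.ValueOrDie();
//     // The device buffer stays alive while any shared_ptr copy is held.
//   }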

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value;
  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
      CU_FUNC_ATTRIBUTE_NUM_REGS, *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_registers_per_thread(value);

  TF_RETURN_IF_ERROR(
      GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                  *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_shared_memory_bytes(value);
  return ::tensorflow::OkStatus();
}

port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once.  Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        cufunc, cuda_kernel->GetGpuCacheConfig()));
  }

  void** kernel_params = const_cast<void**>(args.argument_addresses().data());

  return GpuDriver::LaunchKernel(context_, kernel.name(), cufunc, block_dims.x,
                                 block_dims.y, block_dims.z, thread_dims.x,
                                 thread_dims.y, thread_dims.z,
                                 args.number_of_shared_bytes(), custream,
                                 kernel_params, nullptr /* = extra */);
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription& device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block.  If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64_t registers_per_thread,
                                    uint64_t shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64_t registers_per_thread,
                                  uint64_t shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

DeviceMemoryBase GpuExecutor::Allocate(uint64_t size, int64_t memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64_t offset_bytes,
                                uint64_t size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64_t size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
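    // For example, value 0xAB yields the replicated 32-bit pattern 0xABABABAB.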
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src,
                                            uint64_t size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64_t size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64_t size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64_t size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64_t size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64_t size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return ::tensorflow::OkStatus();
  } else {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("error recording waiting for CUDA event on stream %p",
                        stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  absl::MutexLock l(&alive_gpu_streams_mu_);
  bool out = AsGpuStream(stream)->Init();
  alive_gpu_streams_[stream->implementation()->GpuStreamHack()] = stream;
  return out;
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  absl::MutexLock l(&alive_gpu_streams_mu_);
  alive_gpu_streams_.erase(cuda_stream->GpuStreamHack());
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64_t* free, int64_t* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

bool GpuExecutor::GetSymbol(const std::string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  CHECK(static_cast<bool>(module_handle));

  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    absl::MutexLock lock{&in_memory_modules_mu_};
    auto it = gpu_binary_to_module_.find(module_handle.id());
    CHECK(it != gpu_binary_to_module_.end());
    return lookup_in_module(it->second.first);
  }

  LOG(INFO) << "Failed to find symbol: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const std::string& pci_bus_id,
                             int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);
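  // e.g. "/sys/bus/pci/devices/0000:04:00.0/numa_node" (illustrative bus ID).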

  // We have to use fopen/fread here so that the device properties can be
  // populated before InitGoogle procedure has been completed (at which point we
  // could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32_t value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int cc_major;
  int cc_minor;
  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    std::string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
    builder.set_driver_version(augmented_driver_version);
  }

  {
    std::string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64_t device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
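    // Illustrative arithmetic (hypothetical values): an 877,000 kHz memory
    // clock on a 4096-bit bus gives 2 * 877e6 Hz * 4096 bits / 8 ~= 898 GB/s.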
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    std::string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major, cc_minor);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
          .ValueOrDie());

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});