/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.h"

#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include "absl/numeric/int128.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_event.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_platform_id.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/compiler/xla/stream_executor/kernel_cache_config.h"
#include "tensorflow/compiler/xla/stream_executor/lib/env.h"
#include "tensorflow/compiler/xla/stream_executor/lib/error.h"
#include "tensorflow/compiler/xla/stream_executor/lib/initialize.h"
#include "tensorflow/compiler/xla/stream_executor/lib/mathutil.h"
#include "tensorflow/compiler/xla/stream_executor/lib/numbers.h"
#include "tensorflow/compiler/xla/stream_executor/lib/path.h"
#include "tensorflow/compiler/xla/stream_executor/lib/process_state.h"
#include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/platform.h"
#include "tensorflow/compiler/xla/stream_executor/platform/logging.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"
#include "tensorflow/compiler/xla/stream_executor/plugin_registry.h"
#include "tensorflow/compiler/xla/stream_executor/stream.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_internal.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/compiler/xla/stream_executor/timer.h"

// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
// always unwanted. This happens on Windows, which defines such a macro.
#undef ERROR

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver
// library can cause it to crash, but loading only CUBINs avoids those crashes;
// therefore, it's useful to have this hook to hack in uniform CUBIN-ation of
// PTX code.
//
// As this is an implementation-detail workaround, the usage is to declare this
// variable with extern linkage and populate it from another translation unit.
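// A minimal sketch of wiring it up from another translation unit (the
// PTX-to-CUBIN compile step here is a hypothetical placeholder, not an API
// provided by this file):
//
//   namespace stream_executor {
//   namespace gpu {
//   extern std::function<std::string(const std::string&)> g_cubinate;
//   }  // namespace gpu
//   }  // namespace stream_executor
//
//   void InstallCubinateHook() {
//     stream_executor::gpu::g_cubinate = [](const std::string& ptx) {
//       return CompilePtxToCubin(ptx);  // assumed project-specific helper
//     };
//   }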
std::function<std::string(const std::string&)> g_cubinate;

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
// for passing directly to libcuda APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// libcuda APIs, so the caller should take care to only pass the result of
// const GPU memory conversions to libcuda functions which will honor
// constness.
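// A typical call site (as used by the memcpy wrappers later in this file)
// looks like:
//
//   GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
//                                   AsCudaDevicePtr(gpu_src), size);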
static CUdeviceptr AsCudaDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return reinterpret_cast<CUdeviceptr>(gpu_mem.opaque());
}

// See description on const version above.
static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->gpu_context();
}

GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    std::string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

  std::string cc_specific =
      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found compute-capability-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find compute-capability specific file at: "
          << cc_specific;
  if (port::FileExists(std::string(filename)).ok()) {
    *found_filename = std::string(filename);
    return true;
  }

  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          std::string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static std::string GetBinaryDir(bool strip_exe) {
  std::string exe_path = port::GetExecutablePath();
  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::vector<std::string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

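// Loads `cubin` into the current context as a module, or bumps the reference
// count if the same binary is already resident. Callers below (GetKernel,
// LoadModule) hold in_memory_modules_mu_ around this call.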
port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadCubin(context_, cubin, module));
    module_refcount = 1;
    VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "CUBIN " << static_cast<const void*>(cubin)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[cubin] = {*module, module_refcount};
  return ::tensorflow::OkStatus();
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context_, ptx, module));
    VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
            << *module;
    module_refcount = 1;
  } else {
    ++module_refcount;
    VLOG(3) << "PTX " << static_cast<const void*>(ptx)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[ptx] = {*module, module_refcount};
  return ::tensorflow::OkStatus();
}

port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadModuleFromHsaco)");
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const std::string* kernelname;

  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();

  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    kernelname = &spec.cuda_cubin_in_memory().kernelname();
    const char* cubin = spec.cuda_cubin_in_memory().bytes();
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(cubin, &module));
    kernel_to_gpu_binary_[kernel] = cubin;
  } else if (spec.has_cuda_ptx_in_memory()) {
    kernelname = &spec.cuda_ptx_in_memory().kernelname();

    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    const char* ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
    if (ptx == nullptr) {
      ptx = spec.cuda_ptx_in_memory().default_text();
    }
    if (ptx == nullptr) {
      LOG(FATAL) << "Loader spec has no ptx for kernel " << *kernelname;
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromPtx(ptx, &module));
    kernel_to_gpu_binary_[kernel] = ptx;
  } else {
    return port::InternalError("No method of loading CUDA kernel provided");
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return port::InternalError("Could not find the corresponding function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the CUDA API.
  cuda_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(cuda_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return ::tensorflow::OkStatus();
}

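// Drops one reference to the module backing `gpu_binary`, unloading the module
// from the context once the count reaches zero. Returns false if no module is
// currently loaded for that binary.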
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromCuBin(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return ::tensorflow::OkStatus();
  } else if (spec.has_cuda_ptx_in_memory()) {
    if (cc_major_ == 0 && cc_minor_ == 0) {
      return port::InternalError("Compute capability not set");
    }

    if (!spec.cuda_ptx_in_memory()) {
      return port::InternalError("PTX not found in spec");
    }

    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(
        LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_ptx_in_memory())));
    return ::tensorflow::OkStatus();
  }
  return port::InternalError("No method of loading CUDA module provided");
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

namespace {
absl::uint128 Fingerprint128(const absl::string_view s) {
  auto fp = tensorflow::Fingerprint128(s);
  return absl::MakeUint128(fp.high64, fp.low64);
}
}  // namespace

port::StatusOr<std::shared_ptr<DeviceMemoryBase>>
GpuExecutor::CreateOrShareConstant(Stream* stream,
                                   const std::vector<uint8_t>& content) {
  absl::MutexLock lock{&shared_constants_mu_};
  // We assume all constants are uniquely identified by this hash. In the
  // (highly unlikely) event of a hash collision, the program will likely crash
  // (because the cached constant that will be returned by mistake is unlikely
  // to have the correct size).
  absl::uint128 fingerprint = Fingerprint128(absl::string_view(
      reinterpret_cast<const char*>(content.data()), content.size()));
  // Must insert nullptr first to get an iterator to the insertion point.
  auto insert_result = shared_constants_.insert(
      {fingerprint, std::weak_ptr<DeviceMemoryBase>()});
  auto it = insert_result.first;
  bool was_already_in_cache = !insert_result.second;
  std::shared_ptr<DeviceMemoryBase> shared_constant;

  if (was_already_in_cache) {
    shared_constant = it->second.lock();
  }

  if (shared_constant == nullptr) {
    // Either the constant wasn't found in the cache, or it was but its
    // weak_ptr had expired.
    DeviceMemoryBase* new_constant =
        new DeviceMemoryBase(Allocate(content.size(), /*memory_space=*/0));
    if (new_constant->opaque() == nullptr) {
      return port::InternalError(absl::StrFormat(
          "Failed to allocate %d bytes for new constant", content.size()));
    }

    port::Status status =
        stream->ThenMemcpy(new_constant, content.data(), content.size())
            .BlockHostUntilDone();
    if (!status.ok()) {
      Deallocate(new_constant);
      status.Update(port::InternalError(absl::StrFormat(
          "Memcpy to device address %p failed", new_constant->opaque())));
      return status;
    }

    // Capturing 'this' in the custom deleter means this executor must
    // outlive all shared uses of this constant.
    shared_constant = std::shared_ptr<DeviceMemoryBase>(
        new_constant, [this](DeviceMemoryBase* p) {
          Deallocate(p);
          delete p;
        });
    it->second = std::weak_ptr<DeviceMemoryBase>(shared_constant);
  }

  return shared_constant;
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value;
  TF_RETURN_IF_ERROR(GpuDriver::FuncGetAttribute(
      CU_FUNC_ATTRIBUTE_NUM_REGS, *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_registers_per_thread(value);

  TF_RETURN_IF_ERROR(
      GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                  *cuda_kernel->gpu_function_ptr(), &value));
  kernel_metadata->set_shared_memory_bytes(value);
  return ::tensorflow::OkStatus();
}

port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(cufunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(cufunc);
    }
  }

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        cufunc, cuda_kernel->GetGpuCacheConfig()));
  }

  void** kernel_params = const_cast<void**>(args.argument_addresses().data());

  return GpuDriver::LaunchKernel(context_, kernel.name(), cufunc, block_dims.x,
                                 block_dims.y, block_dims.z, thread_dims.x,
                                 thread_dims.y, thread_dims.z,
                                 args.number_of_shared_bytes(), custream,
                                 kernel_params, nullptr /* = extra */);
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
          << ", " << thread_dims.z << ")";

  int regs_per_thread;
  if (!kernel.metadata().registers_per_thread(&regs_per_thread)) {
    return;
  }

  int smem_per_block;
  if (!kernel.metadata().shared_memory_bytes(&smem_per_block)) {
    return;
  }

  const DeviceDescription& device_description =
      kernel.parent()->GetDeviceDescription();

  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);
  VLOG(2) << "Resident blocks per SM is " << blocks_per_sm;

  int suggested_threads =
      CompareOccupancy(&blocks_per_sm, device_description, regs_per_thread,
                       smem_per_block, thread_dims, cufunc);
  if (suggested_threads != 0) {
    VLOG(2) << "The cuda occupancy calculator recommends using "
            << suggested_threads
            << " threads per block to achieve an occupancy of " << blocks_per_sm
            << " blocks per SM.";
  }
}

// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads
// per block. If unable to compute occupancy, zero is returned.
int GpuExecutor::CalculateOccupancy(
    const DeviceDescription& device_description, uint64_t registers_per_thread,
    uint64_t shared_memory_per_block, const ThreadDim& thread_dims,
    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  return suggested_blocks;
}

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64_t registers_per_thread,
                                  uint64_t shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = cuOccupancyMaxPotentialBlockSize(
      &suggested_blocks, &suggested_threads, func, nullptr,
      shared_memory_per_block, 0);
  CHECK_EQ(err, CUDA_SUCCESS);
  if (suggested_blocks > *initial_blocks) {
    *initial_blocks = suggested_blocks;
    return suggested_threads;
  } else {
    return 0;
  }
}

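// Allocates `size` bytes of device memory; only the default memory space
// (device global memory) is supported, which the CHECK below enforces.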
DeviceMemoryBase GpuExecutor::Allocate(uint64_t size, int64_t memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64_t offset_bytes,
                                uint64_t size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64_t size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
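    // For example, value = 0xAB widens to the 32-bit pattern 0xABABABAB,
    // which is then written one word at a time.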
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src,
                                            uint64_t size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64_t size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64_t size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64_t size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64_t size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64_t size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

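// Trampoline handed to the driver's stream-callback API: runs the
// heap-allocated std::function installed by HostCallback() above, then
// deletes it.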
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return ::tensorflow::OkStatus();
  } else {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("error recording waiting for CUDA event on stream %p",
                        stream));
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  absl::MutexLock l(&alive_gpu_streams_mu_);
  bool out = AsGpuStream(stream)->Init();
  alive_gpu_streams_[stream->implementation()->GpuStreamHack()] = stream;
  return out;
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  absl::MutexLock l(&alive_gpu_streams_mu_);
  alive_gpu_streams_.erase(cuda_stream->GpuStreamHack());
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64_t* free, int64_t* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

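// Looks up `symbol_name` in the module identified by `module_handle`,
// reporting its device address and size through `mem` and `bytes`.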
bool GpuExecutor::GetSymbol(const std::string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  CHECK(static_cast<bool>(module_handle));

  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock
    absl::MutexLock lock{&in_memory_modules_mu_};
    auto it = gpu_binary_to_module_.find(module_handle.id());
    CHECK(it != gpu_binary_to_module_.end());
    return lookup_in_module(it->second.first);
  }

  LOG(INFO) << "Failed to find symbol: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const std::string& pci_bus_id,
                             int device_ordinal) {
#if defined(__APPLE__)
  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
  return 0;
#elif defined(PLATFORM_WINDOWS)
  // Windows support for NUMA is not currently implemented. Return node 0.
  return 0;
#else
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);

  // We have to use fopen/fread here so that the device properties can be
  // populated before the InitGoogle procedure has completed (at which point we
  // could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32_t value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
#endif
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int cc_major;
  int cc_minor;
  status = GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    std::string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion()));
    builder.set_driver_version(augmented_driver_version);
  }

  {
    std::string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  {
    builder.set_threads_per_block_limit(
        GpuDriver::GetDeviceAttribute(
            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device)
                             .ValueOrDie();
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device)
                             .ValueOrDie();
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64_t device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal);
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
    // lane.
    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
                                 1000 *
                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    std::string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("Compute Capability ", cc_major, ".", cc_minor));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major, cc_minor);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)
          .ValueOrDie());

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {});