/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.h"

#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <set>
#include <utility>

#include "absl/base/casts.h"
#include "absl/base/const_init.h"
#include "absl/container/inlined_vector.h"
#include "absl/debugging/leak_check.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/synchronization/notification.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/compiler/xla/stream_executor/lib/env.h"
#include "tensorflow/compiler/xla/stream_executor/lib/error.h"
#include "tensorflow/compiler/xla/stream_executor/lib/human_readable.h"
#include "tensorflow/compiler/xla/stream_executor/lib/stacktrace.h"
#include "tensorflow/compiler/xla/stream_executor/lib/static_threadlocal.h"
#include "tensorflow/compiler/xla/stream_executor/lib/threadpool.h"
#include "tensorflow/compiler/xla/stream_executor/platform/logging.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"

bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
bool FLAGS_gpuexec_cuda_device_0_only = false;

#define RETURN_IF_CUDA_RES_ERROR(expr, ...)                            \
  do {                                                                 \
    CUresult _res = (expr);                                            \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {                      \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)

#define FAIL_IF_CUDA_RES_ERROR(expr, ...)                   \
  do {                                                      \
    CUresult _res = (expr);                                 \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {           \
      LOG(FATAL) << absl::StrCat(__VA_ARGS__) << ": "       \
                 << ::stream_executor::gpu::ToString(_res); \
    }                                                       \
  } while (0)
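
// Illustrative use of the macros above (a sketch, not code from this file;
// `MemsetOrError` is a hypothetical helper): in a function that returns
// port::Status, a failed driver call becomes an error return, whereas
// FAIL_IF_CUDA_RES_ERROR aborts the process in void contexts.
//
//   port::Status MemsetOrError(CUdeviceptr ptr, uint8 value, size_t size) {
//     RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(ptr, value, size),
//                              "Failed to memset memory");
//     return ::tensorflow::OkStatus();
//   }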

// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyGpuContext = false;

namespace stream_executor {
namespace gpu {

/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
/* static */ int64_t CreatedContexts::next_id_ = 1;  // 0 means "no context"

namespace {

// Returns the current context and checks that it is in the set of CUDA contexts
// created by StreamExecutor (to ensure that the CUDA runtime didn't create a
// context behind our backs).
CUcontext CurrentContext() {
  CUcontext current = cuda::CurrentContextOrDie();
  if (current != nullptr && !CreatedContexts::Has(current)) {
    LOG(FATAL) << "current context was not created by the StreamExecutor "
                  "cuda_driver API: "
               << current
               << "; a CUDA runtime call "
                  "was likely performed without using a StreamExecutor context";
  }
  return current;
}

// CUDA driver routines may require a large amount of stack (particularly
// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
  return thread_pool;
}
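
// Sketch of the "run on the driver pool and block" pattern used by LoadPtx
// below (`DoStackHungryDriverWork` is a hypothetical callback):
//
//   absl::Notification done;
//   GetDriverExecutor()->Schedule([&] {
//     DoStackHungryDriverWork();  // runs on the pool's roomier stack
//     done.Notify();
//   });
//   done.WaitForNotification();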

}  // namespace

std::string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      LOG(FATAL) << "impossible memory space";
  }
}

namespace {

// Call cuCtxSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
  FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
                         "Synchronize fail: ", port::CurrentStackTrace());
}

struct ThreadLocalData {
  int64_t id;
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  // If this is an outermost scope, we must not assume that the CUDA context is
  // still in the state we left it in. Other code may have run on this thread
  // and changed the current context.
  if (tls->depth == 0) {
    VLOG(3) << "ScopedActivateContext switching to " << cuda_context->id();
    FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                           "Failed setting context");
    tls->depth = 1;
    tls->id = cuda_context->id();
    tls->context = cuda_context;
    to_restore_ = nullptr;
    return;
  }

  tls->depth++;
  if (tls->id == cuda_context->id()) {
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  to_restore_ = tls->context;
  // Set the context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                         "Failed setting context");
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}

ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(to_restore_->context()),
                         "Failed setting context");
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
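
// Illustrative nesting behavior (a sketch; `ctx_a` and `ctx_b` are
// hypothetical GpuContext pointers): the outermost scope always switches,
// inner scopes switch only when the context differs, and each destructor
// restores whatever its constructor displaced.
//
//   {
//     ScopedActivateContext outer(ctx_a);    // switches to ctx_a
//     {
//       ScopedActivateContext inner(ctx_b);  // switches to ctx_b
//     }                                      // restores ctx_a
//   }                                        // leaves ctx_a current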

namespace {

// Returns a stringified device number associated with pointer, primarily for
// logging purposes. Returns "?" if the device could not be successfully
// queried.
std::string CUDAPointerToDeviceString(CUdeviceptr pointer) {
  auto value = GpuDriver::GetPointerDevice(pointer);
  if (value.ok()) {
    return absl::StrCat(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query device: " << value.status();
  return "?";
}

// Returns a stringified memory space associated with pointer, primarily for
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
std::string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
  auto value = GpuDriver::GetPointerMemorySpace(pointer);
  if (value.ok()) {
    return MemorySpaceString(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query memory space: " << value.status();
  return "?";
}

// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
std::string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
  auto from_context = GpuDriver::GetPointerContext(from);
  if (!from_context.ok()) {
    LOG(ERROR) << "could not retrieve source pointer's context: "
               << from_context.status();
    return "source ptr error";
  }
  auto to_context = GpuDriver::GetPointerContext(to);
  if (!to_context.ok()) {
    LOG(ERROR) << "could not retrieve destination pointer's context: "
               << to_context.status();
    return "destination ptr error";
  }
  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
                                        to_context.ValueOrDie())
             ? "true"
             : "false";
}

// Actually performs the work of CUDA initialization. Wrapped in a one-time
// execution guard.
static port::Status InternalInit() {
  CUresult res = CUDA_ERROR_NO_DEVICE;
  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
  } else {
    res = cuInit(0 /* = flags */);
  }

  if (res == CUDA_SUCCESS) {
    return ::tensorflow::OkStatus();
  } else if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
    LOG(WARNING) << "failed call to cuInit: " << ToString(res);
  } else {
    LOG(ERROR) << "failed call to cuInit: " << ToString(res);
  }

  Diagnostician::LogDiagnosticInformation();
  return port::Status(port::error::ABORTED,
                      absl::StrCat("failed call to cuInit: ", ToString(res)));
}

}  // namespace

/* static */ port::Status GpuDriver::Init() {
  // Cached return value from calling InternalInit(), as cuInit need only be
  // called once, but GpuDriver::Init may be called many times.
  static port::Status* init_retval = [] {
    return new port::Status(InternalInit());
  }();
  return *init_retval;
}
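
// Typical call pattern (a sketch): callers invoke Init() before any other
// driver operation; only the first call pays for cuInit, and every later
// call returns the cached status.
//
//   port::Status init_status = GpuDriver::Init();
//   if (!init_status.ok()) {
//     return init_status;  // e.g. no CUDA driver is installed
//   }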

/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               CUdevice* device) {
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGet(device, device_ordinal),
                           "Failed call to cuDeviceGet");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::GetDeviceName(CUdevice device,
                                                   std::string* device_name) {
  static const size_t kCharLimit = 64;
  absl::InlinedVector<char, 4> chars(kCharLimit);
  RETURN_IF_CUDA_RES_ERROR(
      cuDeviceGetName(chars.begin(), kCharLimit - 1, device),
      "Failed to get device name");
  chars[kCharLimit - 1] = '\0';
  *device_name = chars.begin();
  return ::tensorflow::OkStatus();
}

bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");

  if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
    *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  }

  // If no flags are set the default is CU_CTX_SCHED_AUTO, which
  // in Google environments is very likely to mean SPIN.
  if (device_options.flags() & DeviceOptions::kScheduleSpin) {
    *flags |= CU_CTX_SCHED_SPIN;
  }
  if (device_options.flags() & DeviceOptions::kScheduleYield) {
    *flags |= CU_CTX_SCHED_YIELD;
  }
  if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
    *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
  }

  return true;
}
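
// For example (a sketch; `device_options` is assumed to come from the caller
// and to carry only kScheduleBlockingSync):
//
//   int flags = 0;
//   DeviceOptionsToContextFlags(device_options, &flags);
//   // flags now contains CU_CTX_SCHED_BLOCKING_SYNC.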

/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    LOG(WARNING) << "could not convert all device options into context flags";
  }

  CUresult res;
  CUcontext former_context;
  CUcontext new_context;

  unsigned int former_primary_context_flags;
  int former_primary_context_is_active;
  CHECK_EQ(CUDA_SUCCESS,
           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
                                      &former_primary_context_is_active));
  if (former_primary_context_flags != flags) {
    if (former_primary_context_is_active) {
      LOG(ERROR)
          << "The primary context is active and has a different flag set ("
          << former_primary_context_flags << ") than the desired flag set ("
          << flags << ").";
    } else {
      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    }
  }

  former_context = cuda::CurrentContextOrDie();
  res = cuDevicePrimaryCtxRetain(&new_context, device);
  if (former_context != nullptr) {
    CUdevice former_device;
    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
      if (former_device == device) {
        if (former_context == new_context) {
          VLOG(2) << "The primary context " << former_context << " for device "
                  << device
                  << " exists before initializing the StreamExecutor.";
        } else {
          LOG(WARNING) << "A non-primary context " << former_context
                       << " for device " << device
                       << " exists before initializing the StreamExecutor. The "
                       << "primary context is now " << new_context << ". We "
                       << "haven't verified StreamExecutor works with that.";
        }
      }
    } else {
      LOG(ERROR) << "Failed to get the device of the current context "
                 << former_context;
    }
  }
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));

  if (res == CUDA_SUCCESS) {
    *context = CreatedContexts::Add(new_context, device_ordinal);
    CHECK(*context != nullptr)
        << "success in this call must entail non-null result";
    VLOG(2) << "created or reused context " << new_context
            << " for this thread";
    return ::tensorflow::OkStatus();
  }

  std::string message =
      "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    uint64_t total_memory;
    if (GetDeviceTotalMemory(device, &total_memory)) {
      absl::StrAppend(&message, "; total memory reported: ", total_memory);
    } else {
      absl::StrAppend(&message, "; could not query total memory");
    }
  }

  return port::Status(port::error::INTERNAL, message);
}

/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
  if (context == nullptr) {
    return;
  }
  CUcontext former_context = CurrentContext();
  CUresult res = cuCtxSetCurrent(context->context());
  CUdevice device;
  cuCtxGetDevice(&device);
  cuCtxSetCurrent(former_context);

  res = cuDevicePrimaryCtxRelease(device);

  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
  }

  CreatedContexts::Remove(context->context());
}

/* static */ CUcontext GpuDriver::GetContextHandle(GpuContext* context) {
  return context->context();
}

/* static */ port::Status GpuDriver::FuncGetAttribute(
    CUfunction_attribute attribute, CUfunction func, int* attribute_value) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncGetAttribute(attribute_value, attribute, func),
                           "Failed to query kernel attribute: ", attribute);
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    CUfunction function, CUfunc_cache cache_config) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncSetCacheConfig(function, cache_config),
                           "Failed to set CUDA kernel cache config");
  return ::tensorflow::OkStatus();
}

/* static */ port::StatusOr<CUsharedconfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxGetSharedMemConfig(&shared_mem_config),
                           "Failed to get shared memory config");
  return shared_mem_config;
}

/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, CUsharedconfig shared_mem_config) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxSetSharedMemConfig(shared_mem_config),
                           "Failed to set shared memory config");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, absl::string_view kernel_name, CUfunction function,
    unsigned int grid_dim_x, unsigned int grid_dim_y, unsigned int grid_dim_z,
    unsigned int block_dim_x, unsigned int block_dim_y,
    unsigned int block_dim_z, unsigned int shared_mem_bytes, CUstream stream,
    void** kernel_params, void** extra) {
  ScopedActivateContext activation(context);
  VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z;
  RETURN_IF_CUDA_RES_ERROR(
      cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
                     block_dim_y, block_dim_z, shared_mem_bytes, stream,
                     kernel_params, extra),
      "Failed to launch CUDA kernel: ", kernel_name,
      " with block dimensions: ", block_dim_x, "x", block_dim_y, "x",
      block_dim_z, " and grid dimensions: ", grid_dim_x, "x", grid_dim_y, "x",
      grid_dim_z);
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               CUmodule* module) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuModuleLoadFatBinary(module, cubin_bytes),
                           "Failed to load in-memory CUBIN");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             CUmodule* module) {
  absl::Notification notification;
  port::Status ret = ::tensorflow::OkStatus();
  GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                 &notification]() {
    ScopedActivateContext activation(context);
    void* ptx_data = const_cast<char*>(ptx_contents);
    static const unsigned int kLogBufferBytesLimit = 1024;
    unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
    unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
    absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
    absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
    bool log_verbose = true;
    CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_ERROR_LOG_BUFFER,
                              CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
    // Note that the driver API wants the contents of these values to be
    // stored in an array of void*s, so we coerce them accordingly.
    void* option_values[] = {
        absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
        absl::bit_cast<void*>(error_log_buffer.data()),
        absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
        absl::bit_cast<void*>(info_log_buffer.data()),
        absl::bit_cast<void*>(uintptr_t(log_verbose))};
    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));

    CUresult res;
    {
      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
      // module loading: see http://b/13248943
      absl::LeakCheckDisabler disabler;
      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
                               option_values);
    }

    // The PTX JIT mutates the values in the option values array to reflect the
    // size of the logs it output; now that we've made the call, read the values
    // back out.
    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);

    if (res != CUDA_SUCCESS) {
      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
      // As a precaution for null termination of the API-provided value, ensure
      // that at least the last byte is null.
      error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
                                              : 0] = '\0';
      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
                 << " bytes): " << error_log_buffer.data();
      ret = port::InternalError(
          absl::StrCat("Failed to load PTX text as a module: ", ToString(res)));
      notification.Notify();
      // Return early: absl::Notification may be notified at most once, and the
      // success path below notifies again.
      return;
    }

    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
            << " bytes): " << info_log_buffer.data();
    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
            << " bytes): " << error_log_buffer.data();
    CHECK(module != nullptr);
    notification.Notify();
  });
  notification.WaitForNotification();

  return ret;
}
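
// Illustrative call (a sketch; `context` and `ptx_text` are assumed to come
// from the caller):
//
//   CUmodule module = nullptr;
//   port::Status status = GpuDriver::LoadPtx(context, ptx_text, &module);
//   if (!status.ok()) {
//     return status;  // JIT failure details are already logged above.
//   }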

/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadHsaco)");
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(location, value, size),
                           "Failed to memset memory");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD32(location, value, uint32_count),
                           "Failed to memset memory");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t uint32_count,
    CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD8Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count, CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD32Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return ::tensorflow::OkStatus();
}

/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               CUstream stream,
                                               StreamCallback callback,
                                               void* data) {
  // Note: flags param is required to be zero according to CUDA 6.0.
  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               CUmodule module,
                                               const char* kernel_name,
                                               CUfunction* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res = cuModuleGetFunction(function, module, kernel_name);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             CUmodule module,
                                             const char* symbol_name,
                                             CUdeviceptr* dptr, size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != CUDA_SUCCESS) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          CUmodule module) {
  ScopedActivateContext activated{context};
  CUresult res = cuModuleUnload(module);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}

/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
    GpuContext* context) {
  ScopedActivateContext activated{context};
  CUdevice device = -1;
  CUresult result = cuCtxGetDevice(&device);
  if (result == CUDA_SUCCESS) {
    return device;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to get device for context: ", ToString(result)));
}

/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream,
                                          int priority) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res;
  // If the priority is 0, then use the previous api to create the stream with
  // the default priority for backward compatibility. Probably there is no
  // difference in using the new api call but leaving it as is for now.
  if (priority == 0) {
    res = cuStreamCreate(stream, 0);
  } else {
    res = cuStreamCreateWithPriority(stream, 0, priority);
  }
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context "
               << context->context() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for context "
          << context->context() << " on thread";
  return true;
}

/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           CUstream* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  CUresult res = cuStreamDestroy(*stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to destroy CUDA stream for context "
               << context->context() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
            << context->context();
    *stream = nullptr;
  }
}

/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64_t bytes) {
  if (bytes == 0) {
    return nullptr;
  }

  ScopedActivateContext activated{context};
  CUdeviceptr result = 0;
  CUresult res = cuMemAlloc(&result, bytes);
  if (res != CUDA_SUCCESS) {
    // LOG(INFO) because this isn't always important to users (e.g. BFCAllocator
    // implements a retry if the first allocation fails).
    LOG(INFO) << "failed to allocate "
              << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
              << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes";
  return ptr;
}

/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
                                              void* location) {
  ScopedActivateContext activation(context);
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for context "
            << context->context();
  }
}

/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64_t bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr result = 0;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes unified memory; result: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes in unified memory";
  return ptr;
}

/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  ScopedActivateContext activation(context);
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free unified memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated unified memory at " << location << " for context "
            << context->context();
  }
}

/* static */ void* GpuDriver::HostAllocate(GpuContext* context,
                                           uint64_t bytes) {
  ScopedActivateContext activation(context);
  void* host_mem = nullptr;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}

/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemFreeHost(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error deallocating host memory at " << location << ": "
               << ToString(res);
  }
}

/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
                                          uint64_t bytes) {
  ScopedActivateContext activation(context);
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res =
      cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error registering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemHostUnregister(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error unregistering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}
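
// Illustrative pairing (a sketch; `buffer` and `size` are hypothetical
// caller-owned values): registration pins existing host memory so the GPU can
// DMA to and from it, and must be undone before the memory is freed.
//
//   if (GpuDriver::HostRegister(context, buffer, size)) {
//     // ... enqueue async copies that touch `buffer` ...
//     GpuDriver::HostUnregister(context, buffer);
//   }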

#if CUDA_VERSION >= 10020
/* static */ port::StatusOr<GpuDriver::VmemSpan>
GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64_t bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr base;
  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
                                     /*addr=*/0, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
                        bytes, ToString(res)));
  }
  return {{base, bytes}};
}

/* static */ void GpuDriver::FreeVirtualMemory(
    GpuContext* context, GpuDriver::VmemSpan reservation) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error freeing vmem reservation of size "
               << reservation.size_bytes << " at address " << reservation.base;
  }
}

/* static */ port::StatusOr<uint64_t> GpuDriver::GetMinAllocationGranularity(
    GpuDeviceHandle device) {
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device;

  size_t granularity;
  CUresult res = cuMemGetAllocationGranularity(
      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrCat(
        "failed to get min allocation granularity: ", ToString(res)));
  }
  return granularity;
}

/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
GpuDriver::CreateMemoryHandle(GpuContext* context, uint64_t bytes) {
  ScopedActivateContext activation(context);
  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    LOG(ERROR) << "Failed to get device from context: " << device.status();
    return device.status();
  }

  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device.ValueOrDie();

  CUmemGenericAllocationHandle mem_handle;
  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("failed to create memory allocation of size %d: %s",
                        bytes, ToString(res)));
  }
  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
}

/* static */ void GpuDriver::ReleaseMemoryHandle(
    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemRelease(handle.handle);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to release memory handle " << handle.handle
               << " of size " << handle.bytes << ": " << ToString(res);
  }
}

/* static */ port::Status GpuDriver::MapMemory(
    GpuContext* context, CUdeviceptr va,
    const GpuDriver::GenericMemoryHandle& handle,
    const std::vector<GpuDeviceHandle>& device_handles) {
  ScopedActivateContext activation(context);

  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    return device.status();
  }

  // NB: Zero is the only valid value for both flags and offset.
  CUresult res =
      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrFormat(
        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
  }

  std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
  for (int i = 0; i < access_descriptors.size(); ++i) {
    access_descriptors[i].location.id = device_handles[i];
    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }

  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
                       access_descriptors.size());
  if (res != CUDA_SUCCESS) {
    // Unmap the memory that we failed to set access for.
    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
      LOG(ERROR)
          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
    }
    return port::InternalError(absl::StrFormat(
        "Failed to set read/write access on memory mapped at %d: %s", va,
        ToString(res)));
  }
  return port::Status::OK();
}

/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
                                         uint64_t bytes) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemUnmap(va, bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
               << ": " << ToString(res);
  }
}

#endif
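
// How the CUDA_VERSION >= 10020 pieces above compose (a sketch; error
// handling elided, and `context`, `device`, and `bytes` are assumed to come
// from the caller): reserve a virtual range, back it with a physical
// allocation, map the two together, and tear down in reverse order.
//
//   GpuDriver::VmemSpan span =
//       GpuDriver::ReserveVirtualMemory(context, bytes).ValueOrDie();
//   GpuDriver::GenericMemoryHandle handle =
//       GpuDriver::CreateMemoryHandle(context, bytes).ValueOrDie();
//   TF_CHECK_OK(GpuDriver::MapMemory(context, span.base, handle, {device}));
//   // ... use the mapped range ...
//   GpuDriver::UnmapMemory(context, span.base, handle.bytes);
//   GpuDriver::ReleaseMemoryHandle(context, handle);
//   GpuDriver::FreeVirtualMemory(context, span);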

/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  CUevent* event) {
  if (*event == nullptr) {
    return port::Status(port::error::INVALID_ARGUMENT,
                        "input event cannot be null");
  }

  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventDestroy(*event),
                           "Error destroying CUDA event");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 CUevent event,
                                                 CUstream stream) {
  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventRecord(event, stream),
                           "Error recording CUDA event");
  return ::tensorflow::OkStatus();
}

/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
                                                            CUevent event) {
  ScopedActivateContext activated{context};
  CUresult res = cuEventQuery(event);
  if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to query event: %s", ToString(res)));
  }

  return res;
}

/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 CUevent start, CUevent stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for cuEventElapsedTime to
  // work.
  CUresult res = cuEventSynchronize(stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}
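
// Sketch of timing a stream region with the event helpers above and InitEvent
// below (`stream` is assumed from the caller; kDefault keeps timing enabled):
//
//   CUevent start = nullptr;
//   CUevent stop = nullptr;
//   TF_CHECK_OK(GpuDriver::InitEvent(context, &start,
//                                    GpuDriver::EventFlags::kDefault));
//   TF_CHECK_OK(GpuDriver::InitEvent(context, &stop,
//                                    GpuDriver::EventFlags::kDefault));
//   TF_CHECK_OK(GpuDriver::RecordEvent(context, start, stream));
//   // ... enqueue the work to be measured ...
//   TF_CHECK_OK(GpuDriver::RecordEvent(context, stop, stream));
//   float ms = 0;
//   GpuDriver::GetEventElapsedTime(context, &ms, start, stop);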

/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
                                               CUstream stream, CUevent event) {
  ScopedActivateContext activation(context);
  CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
  ScopedActivateContext activation(context);
  CUresult res = cuCtxSynchronize();
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
               << " :: " << port::CurrentStackTrace();
    return false;
  }

  return true;
}

/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
                           "Could not synchronize CUDA stream");
  return ::tensorflow::OkStatus();
}

/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
                                          CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  CUresult res = cuStreamQuery(stream);
  if (res == CUDA_SUCCESS) {
    return true;
  }

  if (res != CUDA_ERROR_NOT_READY) {
    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
  }
  return false;
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
                                                          void* host_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronously memcpy from device to host: "
                      "host dst: %p; GPU src: %p; size: %u=0x%x",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          const void* host_src,
                                                          uint64_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyHtoD(gpu_dst, host_src, size),
      absl::StrFormat(
          "failed to synchronously memcpy from host to device: GPU dst: %p;"
          " host src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64_t size) {
  ScopedActivateContext activation(context);

  CUresult result;
  // CreatedContexts::GetAnyContext() doesn't work when ptr == 0.
  // This happens when the size is 0.
  if (gpu_dst == 0 || gpu_src == 0) {
    result = cuMemcpyDtoD(gpu_dst, gpu_src, size);
  } else {
    // Any context works here.
    CUcontext dst_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
    CUcontext src_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));

    if (static_cast<void*>(dst_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
      if (tmp_context.ok()) {
        dst_context = tmp_context.ValueOrDie()->context();
      }
    }

    if (static_cast<void*>(src_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
      if (tmp_context.ok()) {
        src_context = tmp_context.ValueOrDie()->context();
      }
    }

    result = cuMemcpyPeer(gpu_dst, dst_context, gpu_src, src_context, size);
  }

  RETURN_IF_CUDA_RES_ERROR(
      result,
      absl::StrFormat(
          "failed to synchronously memcpy from device to device: GPU dst: %p; "
          "GPU src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
          size));
  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
  return ::tensorflow::OkStatus();
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
                                                   void* host_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64_t size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
        "GPU src: %p; size: %u=0x%x",
        ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
          << host_dst << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   const void* host_src,
                                                   uint64_t size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
        "host src: %p; size: %u=0x%x",
        ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
          << " on stream " << stream;
  return true;
}
AsynchronousMemcpyD2D(GpuContext * context,CUdeviceptr gpu_dst,CUdeviceptr gpu_src,uint64_t size,CUstream stream)1188 /* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
1189                                                    CUdeviceptr gpu_dst,
1190                                                    CUdeviceptr gpu_src,
1191                                                    uint64_t size,
1192                                                    CUstream stream) {
1193   ScopedActivateContext activation(context);
1194   CUresult result;
1195   // CreatedContexts::GetAnyContext() doesn't works when ptr == 0.
1196   // This happens when the size is 0.
1197   if (gpu_dst == 0 || gpu_src == 0) {
1198     result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
1199   } else {
1200     // Any context work here.
1201     CUcontext dst_context =
1202         CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
1203     CUcontext src_context =
1204         CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));
1205 
1206     if (static_cast<void*>(dst_context) == nullptr) {
1207       port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
1208       if (tmp_context.ok()) {
1209         dst_context = tmp_context.ValueOrDie()->context();
1210       }
1211     }
1212 
1213     if (static_cast<void*>(src_context) == nullptr) {
1214       port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
1215       if (tmp_context.ok()) {
1216         src_context = tmp_context.ValueOrDie()->context();
1217       }
1218     }
1219 
1220     result = cuMemcpyPeerAsync(gpu_dst, dst_context, gpu_src, src_context, size,
1221                                stream);
1222   }
1223   if (result != CUDA_SUCCESS) {
1224     LOG(ERROR) << absl::StrFormat(
1225         "failed to enqueue async memcpy from device to device: %s"
1226         "; GPU dst: %p on %s %s"
1227         "; GPU src: %p on %s %s"
1228         "; can access? %s; size: %u=0x%x",
1229         ToString(result), absl::bit_cast<void*>(gpu_dst),
1230         CUDAPointerToMemorySpaceString(gpu_dst),
1231         CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
1232         CUDAPointerToMemorySpaceString(gpu_src),
1233         CUDAPointerToDeviceString(gpu_src),
1234         CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);
1235 
1236     return false;
1237   }
1238   VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
1239   return true;
1240 }
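
// Illustrative sketch of a device-to-device copy that may cross contexts
// (pointers, size, and stream are assumed; error handling elided):
//
//   if (!GpuDriver::AsynchronousMemcpyD2D(context, gpu_dst, gpu_src, size,
//                                         stream)) {
//     // On failure, the log above already records the memory space, device,
//     // and peer-access capability for both pointers.
//   }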
1241 
1242 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
1243                                                CUevent* result,
1244                                                EventFlags flags) {
1245   int cuflags;
1246   switch (flags) {
1247     case EventFlags::kDefault:
1248       cuflags = CU_EVENT_DEFAULT;
1249       break;
1250     case EventFlags::kDisableTiming:
1251       cuflags = CU_EVENT_DISABLE_TIMING;
1252       break;
1253     default:
1254       LOG(FATAL) << "impossible event flags: " << int(flags);
1255   }
1256 
1257   ScopedActivateContext activated{context};
1258   CUresult res = cuEventCreate(result, cuflags);
1259 
1260   if (res == CUDA_SUCCESS) {
1261     return ::tensorflow::OkStatus();
1262   } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
1263     return port::Status(port::error::RESOURCE_EXHAUSTED,
1264                         "could not create CUDA event: out of device memory");
1265   } else {
1266     return port::Status(
1267         port::error::FAILED_PRECONDITION,
1268         absl::StrCat("could not create CUDA event: ", ToString(res)));
1269   }
1270 }
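
// A sketch of the usual event lifecycle built on this helper (assuming a
// valid `context` and `stream`, and that TF_RETURN_IF_ERROR is available
// in the caller):
//
//   CUevent start, stop;
//   TF_RETURN_IF_ERROR(GpuDriver::InitEvent(context, &start,
//                                           GpuDriver::EventFlags::kDefault));
//   TF_RETURN_IF_ERROR(GpuDriver::InitEvent(context, &stop,
//                                           GpuDriver::EventFlags::kDefault));
//   cuEventRecord(start, stream);
//   // ... enqueue the work to be timed ...
//   cuEventRecord(stop, stream);
//   cuEventSynchronize(stop);
//   float msec = 0.0f;
//   cuEventElapsedTime(&msec, start, stop);  // needs kDefault (timing on)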
1271 
1272 /* static */ int GpuDriver::GetDeviceCount() {
1273   int device_count = 0;
1274   CUresult res = cuDeviceGetCount(&device_count);
1275   if (res != CUDA_SUCCESS) {
1276     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
1277     return 0;
1278   }
1279 
1280   if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
1281     device_count = 1;
1282   }
1283   return device_count;
1284 }
1285 
1286 /* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
1287     CUdeviceptr pointer) {
1288   GpuContext* context = nullptr;
1289   CUresult result =
1290       cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
1291   if (result == CUDA_SUCCESS) {
1292     // For cudaMallocAsync, the context returned is null.  For now,
1293     // return UNAVAILABLE, though it is unclear how to handle this
1294     // correctly everywhere in TF.  Currently this is only used during
1295     // error handling, so everything keeps working, but TF reports a
1296     // different error than the original one.
1297     if (context == nullptr) {
1298       return port::Status(
1299           port::error::UNAVAILABLE,
1300           "Empty context returned while querying context for device pointer");
1301     }
1302     return context;
1303   }
1304 
1305   return port::Status(
1306       port::error::INTERNAL,
1307       absl::StrCat("failed to query context for device pointer: ",
1308                    ToString(result)));
1309 }
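
// Hypothetical use: recover the context that owns an arbitrary device
// pointer, e.g. while assembling an error message:
//
//   port::StatusOr<GpuContext*> ctx = GpuDriver::GetPointerContext(ptr);
//   if (ctx.ok()) {
//     CUcontext raw = ctx.ValueOrDie()->context();
//     // ... compare `raw` against the current context, log it, etc.
//   }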
1310 
1311 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1312     CUdeviceptr pointer) {
1313   unsigned int value;
1314   CUresult result =
1315       cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
1316   if (result == CUDA_SUCCESS) {
1317     switch (value) {
1318       case CU_MEMORYTYPE_DEVICE:
1319         return MemorySpace::kDevice;
1320       case CU_MEMORYTYPE_HOST:
1321         return MemorySpace::kHost;
1322       default:
1323         return port::Status(
1324             port::error::INTERNAL,
1325             absl::StrCat("unknown memory space provided by CUDA API: ", value));
1326     }
1327   }
1328 
1329   return port::Status(
1330       port::error::INTERNAL,
1331       absl::StrCat("failed to query device pointer for memory space: ",
1332                    ToString(result)));
1333 }
1334 
1335 /* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
1336                                                             CUdeviceptr* base,
1337                                                             size_t* size) {
1338   CUresult result = cuMemGetAddressRange(base, size, dptr);
1339   if (result == CUDA_SUCCESS) {
1340     return ::tensorflow::OkStatus();
1341   } else if (result == CUDA_ERROR_NOT_FOUND) {
1342     // We differentiate between "this pointer is unknown" (return here) and
1343     // "there was an internal error while performing this operation" (return
1344     // below).
1345     return port::Status(
1346         port::error::NOT_FOUND,
1347         absl::StrFormat("not a device pointer %p; %s",
1348                         reinterpret_cast<void*>(dptr), ToString(result)));
1349   }
1350 
1351   return port::Status(
1352       port::error::INTERNAL,
1353       absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1354                       reinterpret_cast<void*>(dptr), ToString(result)));
1355 }
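
// Sketch: given a pointer into the middle of an allocation, recover the
// allocation's base and extent (`interior_ptr` is assumed):
//
//   CUdeviceptr base;
//   size_t size;
//   if (GpuDriver::GetPointerAddressRange(interior_ptr, &base, &size).ok()) {
//     // interior_ptr lies within [base, base + size).
//   }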
1356 
1357 /* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
1358     CUdeviceptr pointer) {
1359   auto result = GetPointerContext(pointer);
1360   if (!result.ok()) {
1361     return result.status();
1362   }
1363 
1364   return DeviceFromContext(result.ValueOrDie());
1365 }
1366 
1367 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1368                                                           int* cc_minor,
1369                                                           CUdevice device) {
1370   *cc_major = 0;
1371   *cc_minor = 0;
1372 
1373   CUresult res = cuDeviceGetAttribute(
1374       cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
1375   if (res != CUDA_SUCCESS) {
1376     return port::Status(
1377         port::error::INTERNAL,
1378         absl::StrFormat(
1379             "failed to get compute capability major for device: %s; %d",
1380             ToString(res), device));
1381   }
1382 
1383   res = cuDeviceGetAttribute(
1384       cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
1385   if (res != CUDA_SUCCESS) {
1386     return port::Status(
1387         port::error::INTERNAL,
1388         absl::StrFormat(
1389             "failed to get compute capability minor for device: %s; %d",
1390             ToString(res), device));
1391   }
1392 
1393   return ::tensorflow::OkStatus();
1394 }
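
// Sketch: query the compute capability and gate a feature on it (the
// TF_RETURN_IF_ERROR macro and `device` are assumed):
//
//   int cc_major = 0, cc_minor = 0;
//   TF_RETURN_IF_ERROR(
//       GpuDriver::GetComputeCapability(&cc_major, &cc_minor, device));
//   if (cc_major >= 7) {
//     // e.g. Volta-or-newer paths such as tensor-core kernels.
//   }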
1395 
1396 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1397                                                       CUdevice device) {
1398   return port::Status{
1399       port::error::INTERNAL,
1400       "Feature not supported on CUDA platform (GetGpuISAVersion)"};
1401 }
1402 
1403 /* static */ port::Status GpuDriver::GetGpuGCNArchName(CUdevice, std::string*) {
1404   return port::Status{
1405       port::error::INTERNAL,
1406       "Feature not supported on CUDA platform (GetGpuGCNArchName)"};
1407 }
1408 
1409 // Helper function that converts the integer output of cuDeviceGetAttribute to
1410 // type T and wraps it in a StatusOr.
1411 template <typename T>
1412 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
1413                                             CUdevice_attribute attribute) {
1414   int value = -1;
1415   RETURN_IF_CUDA_RES_ERROR(cuDeviceGetAttribute(&value, attribute, device),
1416                            "Could not retrieve CUDA device attribute (",
1417                            attribute, ")");
1418   T converted = value;
1419   return converted;
1420 }
1421 
1422 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1423     CUdevice device) {
1424   return GetSimpleAttribute<int>(device,
1425                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
1426 }
1427 
1428 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerCore(
1429     CUdevice device) {
1430   return GetSimpleAttribute<int64_t>(
1431       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
1432 }
1433 
1434 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerBlock(
1435     CUdevice device) {
1436   return GetSimpleAttribute<int64_t>(
1437       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
1438 }
1439 
1440 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerMultiprocessor(
1441     CUdevice device) {
1442   return GetSimpleAttribute<int64_t>(
1443       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
1444 }
1445 
1446 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerBlock(
1447     CUdevice device) {
1448   return GetSimpleAttribute<int64_t>(device,
1449                                      CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
1450 }
1451 
1452 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxRegistersPerBlock(
1453     CUdevice device) {
1454   return GetSimpleAttribute<int64_t>(
1455       device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
1456 }
1457 
1458 /* static */ port::StatusOr<int64_t> GpuDriver::GetThreadsPerWarp(
1459     CUdevice device) {
1460   return GetSimpleAttribute<int64_t>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
1461 }
1462 
1463 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1464                                            CUdevice device) {
1465   int value;
1466   CUresult res =
1467       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
1468   if (res != CUDA_SUCCESS) {
1469     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1470     return false;
1471   }
1472   *x = value;
1473 
1474   res =
1475       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
1476   if (res != CUDA_SUCCESS) {
1477     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1478     return false;
1479   }
1480   *y = value;
1481 
1482   res =
1483       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
1484   if (res != CUDA_SUCCESS) {
1485     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1486     return false;
1487   }
1488   *z = value;
1489   return true;
1490 }
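
// Sketch: clamp a requested launch grid to the device limits (a plausible
// caller-side pattern; `grid_x/y/z` are assumed):
//
//   int max_x, max_y, max_z;
//   if (GpuDriver::GetGridLimits(&max_x, &max_y, &max_z, device)) {
//     grid_x = std::min(grid_x, max_x);
//     grid_y = std::min(grid_y, max_y);
//     grid_z = std::min(grid_z, max_z);
//   }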
1491 
1492 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1493   CUresult res = cuDriverGetVersion(driver_version);
1494   if (res != CUDA_SUCCESS) {
1495     LOG(ERROR) << "failed to query driver version: " << ToString(res);
1496     return false;
1497   }
1498 
1499   return true;
1500 }
1501 
1502 /* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
1503                                                  int device_ordinal) {
1504   CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
1505   if (res != CUDA_SUCCESS) {
1506     LOG(ERROR) << "failed to query device properties: " << ToString(res);
1507     return false;
1508   }
1509 
1510   return true;
1511 }
1512 
1513 /* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
1514     CUdevice_attribute attribute, CUdevice device) {
1515   int val;
1516   CUresult res = cuDeviceGetAttribute(&val, attribute, device);
1517   if (res != CUDA_SUCCESS) {
1518     return port::Status(
1519         port::error::INTERNAL,
1520         absl::StrFormat("failed to get device attribute %d for device %d: %s",
1521                         attribute, device, ToString(res)));
1522   }
1523   return val;
1524 }
1525 
1526 /* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
1527   int value = -1;
1528   CUresult res =
1529       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
1530   if (res != CUDA_SUCCESS) {
1531     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
1532     return false;
1533   }
1534 
1535   *result = value;
1536   return true;
1537 }
1538 
1539 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1540                                                  int64_t* free_out,
1541                                                  int64_t* total_out) {
1542   ScopedActivateContext activation(context);
1543   size_t free = 0;
1544   size_t total = 0;
1545   CUresult res = cuMemGetInfo(&free, &total);
1546   if (res != CUDA_SUCCESS) {
1547     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1548     return false;
1549   }
1550 
1551   *free_out = free;
1552   *total_out = total;
1553   return true;
1554 }
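
// Sketch: size an allocator pool from the reported free memory (the 90%
// head-room fraction is an assumption, not a value from this file):
//
//   int64_t free_bytes = 0, total_bytes = 0;
//   if (GpuDriver::GetDeviceMemoryInfo(context, &free_bytes, &total_bytes)) {
//     int64_t budget = static_cast<int64_t>(free_bytes * 0.9);
//     // ... reserve `budget` bytes for a memory pool, for example.
//   }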
1555 
1556 /* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
1557                                                   uint64_t* result) {
1558   size_t value = -1;
1559   CUresult res = cuDeviceTotalMem(&value, device);
1560   if (res != CUDA_SUCCESS) {
1561     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1562     return false;
1563   }
1564 
1565   *result = value;
1566   return true;
1567 }
1568 
1569 /* static */ std::string GpuDriver::GetPCIBusID(CUdevice device) {
1570   std::string pci_bus_id;
1571   static const int kBufferSize = 64;
1572   absl::InlinedVector<char, 4> chars(kBufferSize);
1573   chars[kBufferSize - 1] = '\0';
1574   CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
1575   if (res != CUDA_SUCCESS) {
1576     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1577     return pci_bus_id;
1578   }
1579   pci_bus_id = chars.begin();
1580   return pci_bus_id;
1581 }
1582 
1583 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1584                                                  GpuContext* to) {
1585   if (from == to) {
1586     return true;  // A context can always access its own memory.
1587   }
1588 
1589   auto from_device = DeviceFromContext(from);
1590   if (!from_device.ok()) {
1591     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
1592                << from_device.status();
1593     return false;
1594   }
1595   auto to_device = DeviceFromContext(to);
1596   if (!to_device.ok()) {
1597     LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
1598                << to_device.status();
1599     return false;
1600   }
1601   return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
1602 }
1603 
1604 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
1605                                                  GpuDeviceHandle to) {
1606   int can_access_peer = -1;
1607   CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
1608   if (result != CUDA_SUCCESS) {
1609     LOG(ERROR) << "failed to detect peer access capability: "
1610                << ToString(result);
1611     return false;
1612   }
1613   return can_access_peer;
1614 }
1615 
1616 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1617                                                       GpuContext* to) {
1618   if (from == to) {
1619     return ::tensorflow::OkStatus();  // A context can always access its own
1620                                       // memory.
1621   }
1622 
1623   ScopedActivateContext activated{from};
1624   CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
1625   if (result != CUDA_SUCCESS &&
1626       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
1627     return port::Status(
1628         port::error::INTERNAL,
1629         absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
1630                         to, ToString(result)));
1631   }
1632 
1633   return ::tensorflow::OkStatus();
1634 }
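
// Sketch of the usual pairing of the two calls above. Peer access is
// directional, so callers typically check and enable it both ways
// (TF_RETURN_IF_ERROR usage assumed):
//
//   if (GpuDriver::CanEnablePeerAccess(from, to)) {
//     TF_RETURN_IF_ERROR(GpuDriver::EnablePeerAccess(from, to));
//   }
//   if (GpuDriver::CanEnablePeerAccess(to, from)) {
//     TF_RETURN_IF_ERROR(GpuDriver::EnablePeerAccess(to, from));
//   }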
1635 
1636 /* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
1637     GpuContext* context, CUfunction kernel, int threads_per_block,
1638     size_t dynamic_shared_memory_bytes) {
1639   ScopedActivateContext activation(context);
1640 
1641   int max_blocks;
1642   RETURN_IF_CUDA_RES_ERROR(
1643       cuOccupancyMaxActiveBlocksPerMultiprocessor(
1644           &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes),
1645       absl::StrFormat("Failed to calculate occupancy of kernel %p", kernel));
1646   return max_blocks;
1647 }
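
// Sketch: derive a device-wide grid size from per-SM occupancy (a common
// caller-side calculation; TF_ASSIGN_OR_RETURN and the variables are
// assumed):
//
//   TF_ASSIGN_OR_RETURN(int blocks_per_sm,
//                       GpuDriver::GetMaxOccupiedBlocksPerCore(
//                           context, kernel, threads_per_block,
//                           dynamic_shared_memory_bytes));
//   TF_ASSIGN_OR_RETURN(int sm_count,
//                       GpuDriver::GetMultiprocessorCount(device));
//   int grid_size = blocks_per_sm * sm_count;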
1648 
1649 }  // namespace gpu
1650 
1651 namespace cuda {
1652 
1653 CUcontext CurrentContextOrDie() {
1654   CUcontext current = nullptr;
1655   FAIL_IF_CUDA_RES_ERROR(cuCtxGetCurrent(&current),
1656                          "Failed to query current context");
1657   return current;
1658 }
1659 
1660 }  // namespace cuda
1661 }  // namespace stream_executor
1662