/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.h"

#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <set>
#include <utility>

#include "absl/base/casts.h"
#include "absl/base/const_init.h"
#include "absl/container/inlined_vector.h"
#include "absl/debugging/leak_check.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/synchronization/notification.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/compiler/xla/stream_executor/lib/env.h"
#include "tensorflow/compiler/xla/stream_executor/lib/error.h"
#include "tensorflow/compiler/xla/stream_executor/lib/human_readable.h"
#include "tensorflow/compiler/xla/stream_executor/lib/stacktrace.h"
#include "tensorflow/compiler/xla/stream_executor/lib/static_threadlocal.h"
#include "tensorflow/compiler/xla/stream_executor/lib/threadpool.h"
#include "tensorflow/compiler/xla/stream_executor/platform/logging.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"

bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
bool FLAGS_gpuexec_cuda_device_0_only = false;

#define RETURN_IF_CUDA_RES_ERROR(expr, ...)                                  \
  do {                                                                       \
    CUresult _res = (expr);                                                  \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {                            \
      return port::InternalError(absl::StrCat(                               \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res)));       \
    }                                                                        \
  } while (0)

#define FAIL_IF_CUDA_RES_ERROR(expr, ...)                   \
  do {                                                      \
    CUresult _res = (expr);                                 \
    if (TF_PREDICT_FALSE(_res != CUDA_SUCCESS)) {           \
      LOG(FATAL) << absl::StrCat(__VA_ARGS__) << ": "       \
                 << ::stream_executor::gpu::ToString(_res); \
    }                                                       \
  } while (0)
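
// Illustrative usage sketch (not part of the original file): both macros take
// a CUresult-producing expression followed by absl::StrCat-style message
// pieces. RETURN_IF_CUDA_RES_ERROR converts a failure into a returned Status;
// FAIL_IF_CUDA_RES_ERROR crashes the process via LOG(FATAL) instead:
//
//   port::Status SynchronizeStreamOrError(CUstream stream) {
//     RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
//                              "failed to synchronize stream");
//     return ::tensorflow::OkStatus();
//   }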

// Debugging: on each push and pop of a cuda context, verify the current
// context matches the expected one.
constexpr bool kVerifyGpuContext = false;

namespace stream_executor {
namespace gpu {

/* static */ absl::Mutex CreatedContexts::mu_{absl::kConstInit};
/* static */ int64_t CreatedContexts::next_id_ = 1;  // 0 means "no context"

namespace {

// Returns the current context and checks that it is in the set of CUDA
// contexts created by StreamExecutor (to ensure that the CUDA runtime didn't
// create a context behind our backs).
CUcontext CurrentContext() {
  CUcontext current = cuda::CurrentContextOrDie();
  if (current != nullptr && !CreatedContexts::Has(current)) {
    LOG(FATAL) << "current context was not created by the StreamExecutor "
                  "cuda_driver API: "
               << current
               << "; a CUDA runtime call "
                  "was likely performed without using a StreamExecutor context";
  }
  return current;
}

// CUDA driver routines may require a large amount of stack (particularly
// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "cuda_driver", 1);
  return thread_pool;
}

}  // namespace

std::string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      LOG(FATAL) << "impossible memory space";
  }
}

namespace {

// Call cuCtxSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
  FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
                         "Synchronize fail: ", port::CurrentStackTrace());
}

struct ThreadLocalData {
  int64_t id;
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  // If this is an outermost scope, we must not assume that the CUDA context
  // has been left in the state we left it in. Other code may have run on
  // this thread and altered the context.
  if (tls->depth == 0) {
    VLOG(3) << "ScopedActivateContext switching to " << cuda_context->id();
    FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                           "Failed setting context");
    tls->depth = 1;
    tls->id = cuda_context->id();
    tls->context = cuda_context;
    to_restore_ = nullptr;
    return;
  }

  tls->depth++;
  if (tls->id == cuda_context->id()) {
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  to_restore_ = tls->context;
  // Set the context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(cuda_context->context()),
                         "Failed setting context");
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}

ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  FAIL_IF_CUDA_RES_ERROR(cuCtxSetCurrent(to_restore_->context()),
                         "Failed setting context");
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
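
// Illustrative usage sketch (assumption: the typical caller pattern seen
// throughout this file): ScopedActivateContext is an RAII guard, so driver
// calls are bracketed by constructing one on the stack:
//
//   void DoDriverWork(GpuContext* context) {
//     ScopedActivateContext activation(context);
//     // ... cu* calls issued here run against `context` ...
//   }  // the previous context (if any) is restored here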

namespace {

// Returns a stringified device number associated with pointer, primarily for
// logging purposes. Returns "?" if the device could not be successfully
// queried.
std::string CUDAPointerToDeviceString(CUdeviceptr pointer) {
  auto value = GpuDriver::GetPointerDevice(pointer);
  if (value.ok()) {
    return absl::StrCat(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query device: " << value.status();
  return "?";
}

// Returns a stringified memory space associated with pointer, primarily for
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
std::string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
  auto value = GpuDriver::GetPointerMemorySpace(pointer);
  if (value.ok()) {
    return MemorySpaceString(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query memory space: " << value.status();
  return "?";
}

// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
std::string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
  auto from_context = GpuDriver::GetPointerContext(from);
  if (!from_context.ok()) {
    LOG(ERROR) << "could not retrieve source pointer's context: "
               << from_context.status();
    return "source ptr error";
  }
  auto to_context = GpuDriver::GetPointerContext(to);
  if (!to_context.ok()) {
    LOG(ERROR) << "could not retrieve destination pointer's context: "
               << to_context.status();
    return "destination ptr error";
  }
  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
                                        to_context.ValueOrDie())
             ? "true"
             : "false";
}

// Actually performs the work of CUDA initialization. Wrapped up in one-time
// execution guard.
static port::Status InternalInit() {
  CUresult res = CUDA_ERROR_NO_DEVICE;
  if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    LOG(ERROR) << "injecting CUDA init error; initialization will fail";
  } else {
    res = cuInit(0 /* = flags */);
  }

  if (res == CUDA_SUCCESS) {
    return ::tensorflow::OkStatus();
  } else if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
    LOG(WARNING) << "failed call to cuInit: " << ToString(res);
  } else {
    LOG(ERROR) << "failed call to cuInit: " << ToString(res);
  }

  Diagnostician::LogDiagnosticInformation();
  return port::Status(port::error::ABORTED,
                      absl::StrCat("failed call to cuInit: ", ToString(res)));
}

}  // namespace

/* static */ port::Status GpuDriver::Init() {
  // Cached return value from calling InternalInit(), as cuInit need only be
  // called once, but GpuDriver::Init may be called many times.
  static port::Status* init_retval = [] {
    return new port::Status(InternalInit());
  }();
  return *init_retval;
}
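
// Illustrative usage sketch (assumption: a hypothetical caller): Init() is
// safe to call repeatedly; the underlying cuInit runs once and the Status is
// cached, so callers simply check the result each time:
//
//   port::Status status = GpuDriver::Init();
//   if (!status.ok()) {
//     LOG(ERROR) << "CUDA driver init failed: " << status;
//   }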

/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               CUdevice* device) {
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGet(device, device_ordinal),
                           "Failed call to cuDeviceGet");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::GetDeviceName(CUdevice device,
                                                   std::string* device_name) {
  static const size_t kCharLimit = 64;
  absl::InlinedVector<char, 4> chars(kCharLimit);
  RETURN_IF_CUDA_RES_ERROR(
      cuDeviceGetName(chars.begin(), kCharLimit - 1, device),
      "Failed to get device name");
  chars[kCharLimit - 1] = '\0';
  *device_name = chars.begin();
  return ::tensorflow::OkStatus();
}

bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");

  if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
    *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  }

  // If no flags are set the default is CU_CTX_SCHED_AUTO, which
  // in Google environments is very likely to mean SPIN.
  if (device_options.flags() & DeviceOptions::kScheduleSpin) {
    *flags |= CU_CTX_SCHED_SPIN;
  }
  if (device_options.flags() & DeviceOptions::kScheduleYield) {
    *flags |= CU_CTX_SCHED_YIELD;
  }
  if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
    *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
  }

  return true;
}
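
// Illustrative sketch (assumption: a hypothetical caller, and that
// DeviceOptions can be constructed directly from a flag mask): to request a
// blocking-sync context rather than the CU_CTX_SCHED_AUTO default, a caller
// would set the corresponding DeviceOptions flag before creating the context:
//
//   DeviceOptions options(DeviceOptions::kScheduleBlockingSync);
//   int flags = 0;
//   DeviceOptionsToContextFlags(options, &flags);
//   // flags now includes CU_CTX_SCHED_BLOCKING_SYNC.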

/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
  if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    LOG(WARNING) << "could not convert all device options into context flags";
  }

  CUresult res;
  CUcontext former_context;
  CUcontext new_context;

  unsigned int former_primary_context_flags;
  int former_primary_context_is_active;
  CHECK_EQ(CUDA_SUCCESS,
           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
                                      &former_primary_context_is_active));
  if (former_primary_context_flags != flags) {
    if (former_primary_context_is_active) {
      LOG(ERROR)
          << "The primary context is active and has a different flag set ("
          << former_primary_context_flags << ") than the desired flag set ("
          << flags << ").";
    } else {
      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    }
  }

  former_context = cuda::CurrentContextOrDie();
  res = cuDevicePrimaryCtxRetain(&new_context, device);
  if (former_context != nullptr) {
    CUdevice former_device;
    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
      if (former_device == device) {
        if (former_context == new_context) {
          VLOG(2) << "The primary context " << former_context << " for device "
                  << device
                  << " exists before initializing the StreamExecutor.";
        } else {
          LOG(WARNING) << "A non-primary context " << former_context
                       << " for device " << device
                       << " exists before initializing the StreamExecutor. The "
                       << "primary context is now " << new_context << ". We "
                       << "haven't verified StreamExecutor works with that.";
        }
      }
    } else {
      LOG(ERROR) << "Failed to get the device of the current context "
                 << former_context;
    }
  }
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));

  if (res == CUDA_SUCCESS) {
    *context = CreatedContexts::Add(new_context, device_ordinal);
    CHECK(*context != nullptr)
        << "success in this call must entail non-null result";
    VLOG(2) << "created or reused context " << new_context
            << " for this thread";
    return ::tensorflow::OkStatus();
  }

  std::string message =
      "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
  if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    uint64_t total_memory;
    if (GetDeviceTotalMemory(device, &total_memory)) {
      absl::StrAppend(&message, "; total memory reported: ", total_memory);
    } else {
      absl::StrAppend(&message, "; could not query total memory");
    }
  }

  return port::Status(port::error::INTERNAL, message);
}

/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
  if (context == nullptr) {
    return;
  }
  CUcontext former_context = CurrentContext();
  CUresult res = cuCtxSetCurrent(context->context());
  CUdevice device;
  cuCtxGetDevice(&device);
  cuCtxSetCurrent(former_context);

  res = cuDevicePrimaryCtxRelease(device);

  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
  }

  CreatedContexts::Remove(context->context());
}

/* static */ CUcontext GpuDriver::GetContextHandle(GpuContext* context) {
  return context->context();
}

/* static */ port::Status GpuDriver::FuncGetAttribute(
    CUfunction_attribute attribute, CUfunction func, int* attribute_value) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncGetAttribute(attribute_value, attribute, func),
                           "Failed to query kernel attribute: ", attribute);
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    CUfunction function, CUfunc_cache cache_config) {
  RETURN_IF_CUDA_RES_ERROR(cuFuncSetCacheConfig(function, cache_config),
                           "Failed to set CUDA kernel cache config");
  return ::tensorflow::OkStatus();
}

/* static */ port::StatusOr<CUsharedconfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxGetSharedMemConfig(&shared_mem_config),
                           "Failed to get shared memory config");
  return shared_mem_config;
}

/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, CUsharedconfig shared_mem_config) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuCtxSetSharedMemConfig(shared_mem_config),
                           "Failed to set shared memory config");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, absl::string_view kernel_name, CUfunction function,
    unsigned int grid_dim_x, unsigned int grid_dim_y, unsigned int grid_dim_z,
    unsigned int block_dim_x, unsigned int block_dim_y,
    unsigned int block_dim_z, unsigned int shared_mem_bytes, CUstream stream,
    void** kernel_params, void** extra) {
  ScopedActivateContext activation(context);
  VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z;
  RETURN_IF_CUDA_RES_ERROR(
      cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
                     block_dim_y, block_dim_z, shared_mem_bytes, stream,
                     kernel_params, extra),
      "Failed to launch CUDA kernel: ", kernel_name,
      " with block dimensions: ", block_dim_x, "x", block_dim_y, "x",
      block_dim_z, " and grid dimensions: ", grid_dim_x, "x", grid_dim_y, "x",
      grid_dim_z);
  return ::tensorflow::OkStatus();
}
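
// Illustrative sketch (assumption: a hypothetical caller computing a 1-D
// launch configuration): grid dimensions are typically derived from the
// element count with a ceiling division so every element is covered:
//
//   unsigned block_dim_x = 256;
//   unsigned grid_dim_x = (element_count + block_dim_x - 1) / block_dim_x;
//   TF_RETURN_IF_ERROR(GpuDriver::LaunchKernel(
//       context, "my_kernel", function, grid_dim_x, /*grid_dim_y=*/1,
//       /*grid_dim_z=*/1, block_dim_x, /*block_dim_y=*/1, /*block_dim_z=*/1,
//       /*shared_mem_bytes=*/0, stream, kernel_params, /*extra=*/nullptr));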

/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               CUmodule* module) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuModuleLoadFatBinary(module, cubin_bytes),
                           "Failed to load in-memory CUBIN");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             CUmodule* module) {
  absl::Notification notification;
  port::Status ret = ::tensorflow::OkStatus();
  GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                 &notification]() {
    ScopedActivateContext activation(context);
    void* ptx_data = const_cast<char*>(ptx_contents);
    static const unsigned int kLogBufferBytesLimit = 1024;
    unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
    unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
    absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
    absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
    bool log_verbose = true;
    CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_ERROR_LOG_BUFFER,
                              CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                              CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
    // Note that the driver API wants the contents of these values to be
    // stored in an array of void*s, so we coerce them accordingly.
    void* option_values[] = {
        absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
        absl::bit_cast<void*>(error_log_buffer.data()),
        absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
        absl::bit_cast<void*>(info_log_buffer.data()),
        absl::bit_cast<void*>(uintptr_t(log_verbose))};
    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));

    CUresult res;
    {
      // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
      // module loading: see http://b/13248943
      absl::LeakCheckDisabler disabler;
      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
                               option_values);
    }

    // The PTX JIT mutates the values in the option values array to reflect
    // the sizes of the logs it outputs; now that we've made the call, read
    // the values back out.
    error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
    info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
    CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
    CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);

    if (res != CUDA_SUCCESS) {
      LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
      // As a precaution for null termination of the API-provided value, ensure
      // that at least the last byte is null.
      error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
                                              : 0] = '\0';
      LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
                 << " bytes): " << error_log_buffer.data();
      ret = port::InternalError(
          absl::StrCat("Failed to load PTX text as a module: ", ToString(res)));
      notification.Notify();
    }

    VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
            << " bytes): " << info_log_buffer.data();
    VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
            << " bytes): " << error_log_buffer.data();
    CHECK(module != nullptr);
    notification.Notify();
  });
  notification.WaitForNotification();

  return ret;
}
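
// Illustrative usage sketch (assumption: a hypothetical caller; `ptx` is a
// NUL-terminated PTX string): a module is JIT-loaded and a kernel is then
// looked up by name with GetModuleFunction():
//
//   CUmodule module;
//   TF_RETURN_IF_ERROR(GpuDriver::LoadPtx(context, ptx, &module));
//   CUfunction function;
//   if (!GpuDriver::GetModuleFunction(context, module, "my_kernel",
//                                     &function)) {
//     return port::InternalError("kernel not found in module");
//   }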

/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               CUmodule* module) {
  return port::InternalError(
      "Feature not supported on CUDA platform (LoadHsaco)");
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD8(location, value, size),
                           "Failed to memset memory");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(cuMemsetD32(location, value, uint32_count),
                           "Failed to memset memory");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, CUdeviceptr location, uint8 value, size_t uint32_count,
    CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD8Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, CUdeviceptr location, uint32 value,
    size_t uint32_count, CUstream stream) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemsetD32Async(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return ::tensorflow::OkStatus();
}

/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               CUstream stream,
                                               StreamCallback callback,
                                               void* data) {
  // Note: flags param is required to be zero according to CUDA 6.0.
  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}
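
// Illustrative sketch (assumption: StreamCallback matches the driver's
// CUstreamCallback signature): a host callback receives the stream, the
// status of the preceding work, and the user data pointer passed above:
//
//   void CUDA_CB MyCallback(CUstream stream, CUresult status, void* data) {
//     // Runs on a driver thread once prior stream work has completed; per
//     // the CUDA documentation it must not make CUDA API calls.
//   }
//   GpuDriver::AddStreamCallback(context, stream, MyCallback, user_data);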

/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               CUmodule module,
                                               const char* kernel_name,
                                               CUfunction* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res = cuModuleGetFunction(function, module, kernel_name);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             CUmodule module,
                                             const char* symbol_name,
                                             CUdeviceptr* dptr, size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != CUDA_SUCCESS) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          CUmodule module) {
  ScopedActivateContext activated{context};
  CUresult res = cuModuleUnload(module);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}

/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
    GpuContext* context) {
  ScopedActivateContext activated{context};
  CUdevice device = -1;
  CUresult result = cuCtxGetDevice(&device);
  if (result == CUDA_SUCCESS) {
    return device;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to get device for context: ", ToString(result)));
}

/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream,
                                          int priority) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res;
  // If the priority is 0, use the older API to create the stream with the
  // default priority, for backward compatibility. There is probably no
  // difference in using the newer API call, but leave it as is for now.
  if (priority == 0) {
    res = cuStreamCreate(stream, 0);
  } else {
    res = cuStreamCreateWithPriority(stream, 0, priority);
  }
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context "
               << context->context() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for context "
          << context->context() << " on thread";
  return true;
}

/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           CUstream* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  CUresult res = cuStreamDestroy(*stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to destroy CUDA stream for context "
               << context->context() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
            << context->context();
    *stream = nullptr;
  }
}

/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64_t bytes) {
  if (bytes == 0) {
    return nullptr;
  }

  ScopedActivateContext activated{context};
  CUdeviceptr result = 0;
  CUresult res = cuMemAlloc(&result, bytes);
  if (res != CUDA_SUCCESS) {
    // LOG(INFO) because this isn't always important to users (e.g.
    // BFCAllocator implements a retry if the first allocation fails).
    LOG(INFO) << "failed to allocate "
              << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
              << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes";
  return ptr;
}

/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
                                              void* location) {
  ScopedActivateContext activation(context);
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for context "
            << context->context();
  }
}

/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64_t bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr result = 0;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes unified memory; result: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context->context()
          << " of " << bytes << " bytes in unified memory";
  return ptr;
}

/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  ScopedActivateContext activation(context);
  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free unified memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated unified memory at " << location << " for context "
            << context->context();
  }
}

/* static */ void* GpuDriver::HostAllocate(GpuContext* context,
                                           uint64_t bytes) {
  ScopedActivateContext activation(context);
  void* host_mem = nullptr;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}

/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemFreeHost(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error deallocating host memory at " << location << ": "
               << ToString(res);
  }
}

/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
                                          uint64_t bytes) {
  ScopedActivateContext activation(context);
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res =
      cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error registering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemHostUnregister(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error unregistering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}
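
// Illustrative sketch (assumption: a hypothetical caller): HostRegister pins
// an existing host buffer so that async memcpys to/from it can be truly
// asynchronous; the buffer must be unregistered before it is freed:
//
//   std::vector<uint8> staging(size);
//   if (GpuDriver::HostRegister(context, staging.data(), staging.size())) {
//     GpuDriver::AsynchronousMemcpyH2D(context, gpu_dst, staging.data(),
//                                      staging.size(), stream);
//     // ... synchronize the stream before the buffer goes out of scope ...
//     GpuDriver::HostUnregister(context, staging.data());
//   }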

#if CUDA_VERSION >= 10020
/* static */ port::StatusOr<GpuDriver::VmemSpan>
GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64_t bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr base;
  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
                                     /*addr=*/0, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
                        bytes, ToString(res)));
  }
  return {{base, bytes}};
}

/* static */ void GpuDriver::FreeVirtualMemory(
    GpuContext* context, GpuDriver::VmemSpan reservation) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error freeing vmem reservation of size "
               << reservation.size_bytes << " at address " << reservation.base;
  }
}

/* static */ port::StatusOr<uint64_t> GpuDriver::GetMinAllocationGranularity(
    GpuDeviceHandle device) {
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device;

  size_t granularity;
  CUresult res = cuMemGetAllocationGranularity(
      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrCat(
        "failed to get min allocation granularity: ", ToString(res)));
  }
  return granularity;
}
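
// Illustrative sketch (assumption: a hypothetical caller): the CUDA virtual
// memory APIs require sizes that are multiples of the allocation granularity,
// so callers typically round a requested byte count up before reserving a
// range or creating a handle:
//
//   uint64_t granularity = ...;  // from GetMinAllocationGranularity()
//   uint64_t padded = (bytes + granularity - 1) / granularity * granularity;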

/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
GpuDriver::CreateMemoryHandle(GpuContext* context, uint64_t bytes) {
  ScopedActivateContext activation(context);
  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    LOG(ERROR) << "Failed to get device from context: " << device.status();
    return device.status();
  }

  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device.ValueOrDie();

  CUmemGenericAllocationHandle mem_handle;
  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("failed to create memory allocation of size %d: %s",
                        bytes, ToString(res)));
  }
  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
}

/* static */ void GpuDriver::ReleaseMemoryHandle(
    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemRelease(handle.handle);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to release memory handle " << handle.handle
               << " of size " << handle.bytes << ": " << ToString(res);
  }
}

/* static */ port::Status GpuDriver::MapMemory(
    GpuContext* context, CUdeviceptr va,
    const GpuDriver::GenericMemoryHandle& handle,
    const std::vector<GpuDeviceHandle>& device_handles) {
  ScopedActivateContext activation(context);

  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    return device.status();
  }

  // NB: Zero is the only valid value for both flags and offset.
  CUresult res =
      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrFormat(
        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
  }

  std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
  for (int i = 0; i < access_descriptors.size(); ++i) {
    access_descriptors[i].location.id = device_handles[i];
    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }

  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
                       access_descriptors.size());
  if (res != CUDA_SUCCESS) {
    // Unmap the memory that we failed to set access for.
    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
      LOG(ERROR)
          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
    }
    return port::InternalError(absl::StrFormat(
        "Failed to set read/write access on memory mapped at %d: %s", va,
        ToString(res)));
  }
  return ::tensorflow::OkStatus();
}
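
// Illustrative sketch (assumption: a hypothetical caller) of the full CUDA
// virtual-memory workflow built from the helpers above:
//
//   GpuDriver::VmemSpan span =
//       GpuDriver::ReserveVirtualMemory(context, padded_bytes).ValueOrDie();
//   GpuDriver::GenericMemoryHandle handle =
//       GpuDriver::CreateMemoryHandle(context, padded_bytes).ValueOrDie();
//   TF_CHECK_OK(GpuDriver::MapMemory(context, span.base, handle,
//                                    device_handles));
//   // ... use the mapped range ...
//   GpuDriver::UnmapMemory(context, span.base, handle.bytes);
//   GpuDriver::ReleaseMemoryHandle(context, handle);
//   GpuDriver::FreeVirtualMemory(context, span);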

/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
                                         uint64_t bytes) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemUnmap(va, bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
               << ": " << ToString(res);
  }
}

#endif

/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  CUevent* event) {
  if (*event == nullptr) {
    return port::Status(port::error::INVALID_ARGUMENT,
                        "input event cannot be null");
  }

  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventDestroy(*event),
                           "Error destroying CUDA event");
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 CUevent event,
                                                 CUstream stream) {
  ScopedActivateContext activated{context};
  RETURN_IF_CUDA_RES_ERROR(cuEventRecord(event, stream),
                           "Error recording CUDA event");
  return ::tensorflow::OkStatus();
}

/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
                                                            CUevent event) {
  ScopedActivateContext activated{context};
  CUresult res = cuEventQuery(event);
  if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to query event: %s", ToString(res)));
  }

  return res;
}

/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 CUevent start, CUevent stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for cuEventElapsedTime to
  // work.
  CUresult res = cuEventSynchronize(stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}
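
// Illustrative sketch (assumption: a hypothetical caller timing a region of
// stream work with the event helpers in this file):
//
//   CUevent start, stop;
//   TF_CHECK_OK(GpuDriver::InitEvent(context, &start,
//                                    GpuDriver::EventFlags::kDefault));
//   TF_CHECK_OK(GpuDriver::InitEvent(context, &stop,
//                                    GpuDriver::EventFlags::kDefault));
//   TF_CHECK_OK(GpuDriver::RecordEvent(context, start, stream));
//   // ... enqueue work on `stream` ...
//   TF_CHECK_OK(GpuDriver::RecordEvent(context, stop, stream));
//   float ms = 0;
//   GpuDriver::GetEventElapsedTime(context, &ms, start, stop);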

/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
                                               CUstream stream, CUevent event) {
  ScopedActivateContext activation(context);
  CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
  ScopedActivateContext activation(context);
  CUresult res = cuCtxSynchronize();
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
               << " :: " << port::CurrentStackTrace();
    return false;
  }

  return true;
}

/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_CUDA_RES_ERROR(cuStreamSynchronize(stream),
                           "Could not synchronize CUDA stream");
  return ::tensorflow::OkStatus();
}

/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
                                          CUstream stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  CUresult res = cuStreamQuery(stream);
  if (res == CUDA_SUCCESS) {
    return true;
  }

  if (res != CUDA_ERROR_NOT_READY) {
    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
  }
  return false;
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
                                                          void* host_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronously memcpy from device to host: "
                      "host dst: %p; GPU src: %p; size: %u=0x%x",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          const void* host_src,
                                                          uint64_t size) {
  ScopedActivateContext activation(context);
  RETURN_IF_CUDA_RES_ERROR(
      cuMemcpyHtoD(gpu_dst, host_src, size),
      absl::StrFormat(
          "failed to synchronously memcpy from host to device: GPU dst: %p;"
          " host src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
                                                          CUdeviceptr gpu_dst,
                                                          CUdeviceptr gpu_src,
                                                          uint64_t size) {
  ScopedActivateContext activation(context);

  CUresult result;
  // CreatedContexts::GetAnyContext() doesn't work when ptr == 0.
  // This happens when the size is 0.
  if (gpu_dst == 0 || gpu_src == 0) {
    result = cuMemcpyDtoD(gpu_dst, gpu_src, size);
  } else {
    // Any context will work here.
    CUcontext dst_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
    CUcontext src_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));

    if (static_cast<void*>(dst_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
      if (tmp_context.ok()) {
        dst_context = tmp_context.ValueOrDie()->context();
      }
    }

    if (static_cast<void*>(src_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
      if (tmp_context.ok()) {
        src_context = tmp_context.ValueOrDie()->context();
      }
    }

    result = cuMemcpyPeer(gpu_dst, dst_context, gpu_src, src_context, size);
  }

  RETURN_IF_CUDA_RES_ERROR(
      result,
      absl::StrFormat(
          "failed to synchronously memcpy from device to device: GPU dst: %p; "
          "GPU src: %p; size: %u=0x%x",
          absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
          size));
  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
  return ::tensorflow::OkStatus();
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
                                                   void* host_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64_t size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
        "GPU src: %p; size: %u=0x%x",
        ToString(res), host_dst, absl::bit_cast<void*>(gpu_src), size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
          << host_dst << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   const void* host_src,
                                                   uint64_t size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
        "host src: %p; size: %u=0x%x",
        ToString(res), absl::bit_cast<void*>(gpu_dst), host_src, size, size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
          << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   CUdeviceptr gpu_dst,
                                                   CUdeviceptr gpu_src,
                                                   uint64_t size,
                                                   CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult result;
  // CreatedContexts::GetAnyContext() doesn't work when ptr == 0.
  // This happens when the size is 0.
  if (gpu_dst == 0 || gpu_src == 0) {
    result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  } else {
    // Any context will work here.
    CUcontext dst_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_dst));
    CUcontext src_context =
        CreatedContexts::GetAnyContext(absl::bit_cast<void*>(gpu_src));

    if (static_cast<void*>(dst_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_dst);
      if (tmp_context.ok()) {
        dst_context = tmp_context.ValueOrDie()->context();
      }
    }

    if (static_cast<void*>(src_context) == nullptr) {
      port::StatusOr<GpuContext*> tmp_context = GetPointerContext(gpu_src);
      if (tmp_context.ok()) {
        src_context = tmp_context.ValueOrDie()->context();
      }
    }

    result = cuMemcpyPeerAsync(gpu_dst, dst_context, gpu_src, src_context, size,
                               stream);
  }
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; GPU dst: %p on %s %s"
        "; GPU src: %p on %s %s"
        "; can access? %s; size: %u=0x%x",
        ToString(result), absl::bit_cast<void*>(gpu_dst),
        CUDAPointerToMemorySpaceString(gpu_dst),
        CUDAPointerToDeviceString(gpu_dst), absl::bit_cast<void*>(gpu_src),
        CUDAPointerToMemorySpaceString(gpu_src),
        CUDAPointerToDeviceString(gpu_src),
        CUDAPointersToCanAccessString(gpu_src, gpu_dst), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}

/* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
                                               CUevent* result,
                                               EventFlags flags) {
  int cuflags;
  switch (flags) {
    case EventFlags::kDefault:
      cuflags = CU_EVENT_DEFAULT;
      break;
    case EventFlags::kDisableTiming:
      cuflags = CU_EVENT_DISABLE_TIMING;
      break;
    default:
      LOG(FATAL) << "impossible event flags: " << int(flags);
  }

  ScopedActivateContext activated{context};
  CUresult res = cuEventCreate(result, cuflags);

  if (res == CUDA_SUCCESS) {
    return ::tensorflow::OkStatus();
  } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    return port::Status(port::error::RESOURCE_EXHAUSTED,
                        "could not create CUDA event: out of device memory");
  } else {
    return port::Status(
        port::error::FAILED_PRECONDITION,
        absl::StrCat("could not create CUDA event: ", ToString(res)));
  }
}

/* static */ int GpuDriver::GetDeviceCount() {
  int device_count = 0;
  CUresult res = cuDeviceGetCount(&device_count);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
    return 0;
  }

  if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
    device_count = 1;
  }
  return device_count;
}

/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
    CUdeviceptr pointer) {
  GpuContext* context = nullptr;
  CUresult result =
      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
  if (result == CUDA_SUCCESS) {
    // For cudaMallocAsync, the returned context is null; for now we report
    // "unavailable". How to handle this correctly everywhere in TF is an open
    // question. Currently this is only used during error handling, so
    // everything works, but TF surfaces a different error than the original
    // one.
    if (context == nullptr) {
      return port::Status(
          port::error::UNAVAILABLE,
          "Empty context returned while querying context for device pointer");
    }
    return context;
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to query context for device pointer: ",
                   ToString(result)));
}

/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
    CUdeviceptr pointer) {
  unsigned int value;
  CUresult result =
      cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
  if (result == CUDA_SUCCESS) {
    switch (value) {
      case CU_MEMORYTYPE_DEVICE:
        return MemorySpace::kDevice;
      case CU_MEMORYTYPE_HOST:
        return MemorySpace::kHost;
      default:
        return port::Status(
            port::error::INTERNAL,
            absl::StrCat("unknown memory space provided by CUDA API: ", value));
    }
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrCat("failed to query device pointer for memory space: ",
                   ToString(result)));
}

/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
                                                            CUdeviceptr* base,
                                                            size_t* size) {
  CUresult result = cuMemGetAddressRange(base, size, dptr);
  if (result == CUDA_SUCCESS) {
    return ::tensorflow::OkStatus();
  } else if (result == CUDA_ERROR_NOT_FOUND) {
    // We differentiate between "this pointer is unknown" (return here) and
    // "there was an internal error while performing this operation" (return
    // below).
    return port::Status(
        port::error::NOT_FOUND,
        absl::StrFormat("not a device pointer %p; %s",
                        reinterpret_cast<void*>(dptr), ToString(result)));
  }

  return port::Status(
      port::error::INTERNAL,
      absl::StrFormat("failed to get pointer info for device pointer %p; %s",
                      reinterpret_cast<void*>(dptr), ToString(result)));
}

/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
    CUdeviceptr pointer) {
  auto result = GetPointerContext(pointer);
  if (!result.ok()) {
    return result.status();
  }

  return DeviceFromContext(result.ValueOrDie());
}

/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
                                                          int* cc_minor,
                                                          CUdevice device) {
  *cc_major = 0;
  *cc_minor = 0;

  CUresult res = cuDeviceGetAttribute(
      cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat(
            "failed to get compute capability major for device: %s; %d",
            ToString(res), device));
  }

  res = cuDeviceGetAttribute(
      cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat(
            "failed to get compute capability minor for device: %s; %d",
            ToString(res), device));
  }

  return ::tensorflow::OkStatus();
}

/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                      CUdevice device) {
  return port::Status{
      port::error::INTERNAL,
      "Feature not supported on CUDA platform (GetGpuISAVersion)"};
}

/* static */ port::Status GpuDriver::GetGpuGCNArchName(CUdevice, std::string*) {
  return port::Status{
      port::error::INTERNAL,
      "Feature not supported on CUDA platform (GetGpuGCNArchName)"};
}

// Helper function that turns the integer output of cuDeviceGetAttribute to
// type T and wraps it in a StatusOr.
template <typename T>
static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                                            CUdevice_attribute attribute) {
  int value = -1;
  RETURN_IF_CUDA_RES_ERROR(cuDeviceGetAttribute(&value, attribute, device),
                           "Could not retrieve CUDA device attribute (",
                           attribute, ")");
1418 T converted = value;
1419 return converted;
1420 }
1421
/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
    CUdevice device) {
  return GetSimpleAttribute<int>(device,
                                 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
}

/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerCore(
    CUdevice device) {
  return GetSimpleAttribute<int64_t>(
      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
}

/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64_t>(
      device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
}

/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerMultiprocessor(
    CUdevice device) {
  return GetSimpleAttribute<int64_t>(
      device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
}

/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64_t>(device,
                                     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}

/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxRegistersPerBlock(
    CUdevice device) {
  return GetSimpleAttribute<int64_t>(
      device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
}

/* static */ port::StatusOr<int64_t> GpuDriver::GetThreadsPerWarp(
    CUdevice device) {
  return GetSimpleAttribute<int64_t>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
}

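// Queries the maximum grid dimensions (x, y, z) supported by `device`.
// Returns false, after logging, as soon as any of the three attribute queries
// fails; earlier outputs may already have been written at that point.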
/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
                                           CUdevice device) {
  int value;
  CUresult res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
    return false;
  }
  *x = value;

  res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
    return false;
  }
  *y = value;

  res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
    return false;
  }
  *z = value;
  return true;
}

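// Note: cuDriverGetVersion reports the version as (1000 * major + 10 * minor),
// e.g. 11020 for CUDA 11.2.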
/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
  CUresult res = cuDriverGetVersion(driver_version);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query driver version: " << ToString(res);
    return false;
  }

  return true;
}

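// Note: cuDeviceGetProperties is deprecated in the CUDA driver API in favor
// of per-attribute cuDeviceGetAttribute queries; it is presumably retained
// here for callers that still consume the aggregate CUdevprop struct.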
/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
                                                 int device_ordinal) {
  CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query device properties: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    CUdevice_attribute attribute, CUdevice device) {
  int val;
  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
  if (res != CUDA_SUCCESS) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to get device attribute %d for device %d: %s",
                        attribute, device, ToString(res)));
  }
  return val;
}

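// Queries CU_DEVICE_ATTRIBUTE_ECC_ENABLED; on success, `*result` is true iff
// ECC (error-correcting memory) is enabled on `device`.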
/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
  int value = -1;
  CUresult res =
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

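// Reports free and total device memory, in bytes, for the device backing
// `context`. cuMemGetInfo operates on the current context, hence the
// ScopedActivateContext.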
/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
                                                 int64_t* free_out,
                                                 int64_t* total_out) {
  ScopedActivateContext activation(context);
  size_t free = 0;
  size_t total = 0;
  CUresult res = cuMemGetInfo(&free, &total);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
    return false;
  }

  *free_out = free;
  *total_out = total;
  return true;
}

/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
                                                  uint64_t* result) {
  size_t value = -1;
  CUresult res = cuDeviceTotalMem(&value, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

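// Returns the PCI bus id of `device` in the form
// "[domain]:[bus]:[device].[function]", or an empty string if the query
// fails.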
/* static */ std::string GpuDriver::GetPCIBusID(CUdevice device) {
  std::string pci_bus_id;
  static const int kBufferSize = 64;
  absl::InlinedVector<char, 4> chars(kBufferSize);
  chars[kBufferSize - 1] = '\0';
  CUresult res = cuDeviceGetPCIBusId(chars.data(), kBufferSize - 1, device);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
    return pci_bus_id;
  }
  pci_bus_id = chars.data();
  return pci_bus_id;
}

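// Peer access is determined at device granularity: both contexts are resolved
// to their underlying devices, and the device-level overload below is
// consulted.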
/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
                                                 GpuContext* to) {
  if (from == to) {
    return true;  // A context can always access its own memory.
  }

  auto from_device = DeviceFromContext(from);
  if (!from_device.ok()) {
    LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
               << from_device.status();
    return false;
  }
  auto to_device = DeviceFromContext(to);
  if (!to_device.ok()) {
    LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
               << to_device.status();
    return false;
  }
  return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
}

/* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
                                                 GpuDeviceHandle to) {
  int can_access_peer = -1;
  CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to detect peer access capability: "
               << ToString(result);
    return false;
  }
  return can_access_peer;
}

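// Enables one-directional peer access from `from` to `to`.
// CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED is deliberately treated as success,
// which makes the call idempotent.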
/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
                                                      GpuContext* to) {
  if (from == to) {
    return ::tensorflow::OkStatus();  // A context can always access its own
                                      // memory.
  }

  ScopedActivateContext activated{from};
  CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
  if (result != CUDA_SUCCESS &&
      result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
    return port::Status(
        port::error::INTERNAL,
        absl::StrFormat("failed to enable peer access from %p to %p: %s", from,
                        to, ToString(result)));
  }

  return ::tensorflow::OkStatus();
}

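// Thin wrapper around cuOccupancyMaxActiveBlocksPerMultiprocessor: computes
// how many blocks of `kernel`, at the given block size and dynamic shared
// memory usage, can be resident on a single multiprocessor.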
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, CUfunction kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation(context);

  int max_blocks;
  RETURN_IF_CUDA_RES_ERROR(
      cuOccupancyMaxActiveBlocksPerMultiprocessor(
          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes),
      absl::StrFormat("Failed to calculate occupancy of kernel %p", kernel));
  return max_blocks;
}

}  // namespace gpu

namespace cuda {

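// Returns the CUcontext current on the calling thread, crashing the process
// if the query itself fails. A null return is legal and simply means that no
// context is current.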
CUcontext CurrentContextOrDie() {
  CUcontext current = nullptr;
  FAIL_IF_CUDA_RES_ERROR(cuCtxGetCurrent(&current),
                         "Failed to query current context");
  return current;
}

}  // namespace cuda
}  // namespace stream_executor