1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <stdint.h>
17 #include <stdlib.h>
18
19 #include <map>
20 #include <set>
21 #include <utility>
22
23 #include "absl/base/casts.h"
24 #include "absl/container/inlined_vector.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/synchronization/mutex.h"
28 #include "absl/synchronization/notification.h"
29 #include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
30 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
31 #include "tensorflow/stream_executor/lib/env.h"
32 #include "tensorflow/stream_executor/lib/error.h"
33 #include "tensorflow/stream_executor/lib/human_readable.h"
34 #include "tensorflow/stream_executor/lib/stacktrace.h"
35 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
36 #include "tensorflow/stream_executor/lib/threadpool.h"
37 #include "tensorflow/stream_executor/platform/logging.h"
38 #include "tensorflow/stream_executor/platform/port.h"
39 #include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
40
// Testing hook: when set, InternalInit() skips hipInit and reports failure so
// initialization error paths can be exercised.
bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
// Debugging aid: when set, hipDeviceSynchronize is called around every
// ScopedActivateContext push/pop so driver errors surface immediately.
bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
// Presumably restricts execution to device ordinal 0; not read in this file —
// TODO confirm against callers.
bool FLAGS_gpuexec_rocm_device_0_only = false;

// Evaluates |expr|; if it yields anything but hipSuccess, returns a
// port::InternalError whose message is StrCat(__VA_ARGS__, ": ", <hip error>).
#define RETURN_IF_ROCM_ERROR(expr, ...) \
 do { \
 hipError_t _res = (expr); \
 if (TF_PREDICT_FALSE(_res != hipSuccess)) { \
 return port::InternalError(absl::StrCat( \
 __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
 } \
 } while (0)

// Debugging: on each push and pop of a rocm context, verify the current device
// matches the expected one.
constexpr bool kVerifyGpuContext = false;
57
58 namespace stream_executor {
59 namespace gpu {
60
61 // GpuContext wraps the device_ordinal.
62 // Only reason we need this wrapper class is to make the GpuDriver* API
// GpuContext wraps the device_ordinal.
// Only reason we need this wrapper class is to make the GpuDriver* API
// (which is shared with the CUDA implementation) compile; no real HIP context
// object is held here.
class GpuContext {
 public:
  // Wraps the given device ordinal; performs no driver calls.
  GpuContext(const int v) : device_ordinal_(v) {}

  // Returns the device ordinal this context stands for.
  int device_ordinal() const { return device_ordinal_; }

  // Disallow copying and moving.
  GpuContext(GpuContext&&) = delete;
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(GpuContext&&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;

 private:
  const int device_ordinal_;
};
78
79 namespace {
80
81 // Formats hipError_t to output prettified values into a log stream.
82 // Error summaries taken from:
ToString(hipError_t result)83 string ToString(hipError_t result) {
84 #define OSTREAM_ROCM_ERROR(__name) \
85 case hipError##__name: \
86 return "HIP_ERROR_" #__name;
87
88 switch (result) {
89 OSTREAM_ROCM_ERROR(InvalidValue)
90 OSTREAM_ROCM_ERROR(OutOfMemory)
91 OSTREAM_ROCM_ERROR(NotInitialized)
92 OSTREAM_ROCM_ERROR(Deinitialized)
93 OSTREAM_ROCM_ERROR(NoDevice)
94 OSTREAM_ROCM_ERROR(InvalidDevice)
95 OSTREAM_ROCM_ERROR(InvalidImage)
96 OSTREAM_ROCM_ERROR(InvalidContext)
97 OSTREAM_ROCM_ERROR(InvalidHandle)
98 OSTREAM_ROCM_ERROR(NotFound)
99 OSTREAM_ROCM_ERROR(NotReady)
100 OSTREAM_ROCM_ERROR(NoBinaryForGpu)
101
102 // Encountered an uncorrectable ECC error during execution.
103 OSTREAM_ROCM_ERROR(ECCNotCorrectable)
104
105 // Load/store on an invalid address. Must reboot all context.
106 case 700:
107 return "ROCM_ERROR_ILLEGAL_ADDRESS";
108 // Passed too many / wrong arguments, too many threads for register count.
109 case 701:
110 return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
111
112 OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
113 OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
114 OSTREAM_ROCM_ERROR(Unknown) // Unknown internal error to ROCM.
115 default:
116 return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
117 }
118 }
119
120 // ROCM driver routines may require a large amount of stack (particularly
121 // hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
122 // stack-limited threads (such as those spawned by a default-argument
123 // thread::ThreadPool on some platforms), we run certain routines in this pool
124 // and wait for completion.
GetDriverExecutor()125 port::ThreadPool* GetDriverExecutor() {
126 static port::ThreadPool* thread_pool = new port::ThreadPool(
127 port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
128 return thread_pool;
129 }
130
131 } // namespace
132
MemorySpaceString(MemorySpace memory_space)133 string MemorySpaceString(MemorySpace memory_space) {
134 switch (memory_space) {
135 case MemorySpace::kHost:
136 return "host";
137 case MemorySpace::kDevice:
138 return "device";
139 default:
140 LOG(FATAL) << "impossible memory space";
141 }
142 }
143
144 // Returns the current device set in HIP. This is done by calling the
145 // HIP driver (e.g., this value is not our cached view of the current device).
CurrentDeviceOrDie()146 static int CurrentDeviceOrDie() {
147 int current = -1;
148 hipError_t result = tensorflow::wrap::hipGetDevice(¤t);
149 if (result != hipSuccess) {
150 LOG(FATAL) << "failed to query current device: " << ToString(result);
151 }
152 return current;
153 }
154
155 namespace {
156
157 // Call hipDeviceSynchronize and crash if it doesn't succeed.
SynchronizeOrDie()158 void SynchronizeOrDie() {
159 auto res = tensorflow::wrap::hipDeviceSynchronize();
160 if (res != hipSuccess) {
161 LOG(FATAL) << "Synchronize found " << ToString(res)
162 << " :: " << port::CurrentStackTrace();
163 }
164 }
165
// Per-thread state used by ScopedActivateContext:
//   current_device_ordinal - the device this thread last observed/set via HIP.
//   depth                  - nesting depth of active ScopedActivateContexts.
struct ThreadLocalData {
  int current_device_ordinal;
  int depth;
};

// One ThreadLocalData instance per thread (POD thread-local storage).
SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
172
173 } // namespace
174
// Activates |context|'s device on the calling thread for the lifetime of this
// object, tracking nesting depth in thread-local state. The device switch is
// skipped when the requested device is already current.
ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();
  // On the outermost activation, seed the cached ordinal from the driver.
  if (tls->depth == 0) {
    tls->current_device_ordinal = CurrentDeviceOrDie();
  }

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth++;

  // Remembered so the destructor can re-assert this device on exit.
  to_restore_ = context;

  // Fast path: requested device is already current — nothing to do.
  if (context->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to " << context->device_ordinal();

  // Set the device and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
  tls->current_device_ordinal = context->device_ordinal();
}
206
// Pops one activation level and re-asserts to_restore_'s device if the
// thread-local cached ordinal has drifted from it.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);

  // Fast path: the device to restore is already current — nothing to do.
  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to "
          << to_restore_->device_ordinal();

  // Set context and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
  tls->current_device_ordinal = to_restore_->device_ordinal();
}
235
236 namespace {
237
238 // Returns a stringified device number associated with pointer, primarily for
239 // logging purposes. Returns "?" if the device could not be successfully
240 // queried.
ROCMPointerToDeviceString(hipDeviceptr_t pointer)241 string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
242 auto value = GpuDriver::GetPointerDevice(pointer);
243 if (value.ok()) {
244 return absl::StrCat(value.ValueOrDie());
245 }
246 LOG(ERROR) << "could not query device: " << value.status();
247 return "?";
248 }
249
250 // Returns a stringified memory space associated with pointer, primarily for
251 // logging purposes. Returns "?" if the memory space could not be successfully
252 // queried.
ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer)253 string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
254 auto value = GpuDriver::GetPointerMemorySpace(pointer);
255 if (value.ok()) {
256 return MemorySpaceString(value.ValueOrDie());
257 }
258 LOG(ERROR) << "could not query device: " << value.status();
259 return "?";
260 }
261
262 // Returns a stringified representation of whether or not peer access is
263 // permitted between the "from" and "to" pointers' associated contexts,
264 // primarily for logging purposes. Returns "error" if an error is encountered
265 // in the process of querying.
ROCMPointersToCanAccessString(hipDeviceptr_t from,hipDeviceptr_t to)266 string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
267 hipPointerAttribute_t from_pointerAttributes;
268 hipError_t result =
269 tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
270 if (result != hipSuccess) {
271 LOG(ERROR) << "could not retrieve source pointer's device: "
272 << ToString(result);
273 return "error";
274 }
275
276 hipPointerAttribute_t to_pointerAttributes;
277 result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
278 if (result != hipSuccess) {
279 LOG(ERROR) << "could not retrieve destination pointer's device: "
280 << ToString(result);
281 return "error";
282 }
283
284 GpuContext fromCtx(from_pointerAttributes.device);
285 GpuContext toCtx(to_pointerAttributes.device);
286
287 return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
288 }
289
290 // Actually performs the work of ROCM initialization. Wrapped up in one-time
291 // execution guard.
InternalInit()292 static port::Status InternalInit() {
293 hipError_t res = hipErrorNoDevice;
294 if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
295 LOG(ERROR) << "injecting ROCM init error; initialization will fail";
296 } else {
297 res = tensorflow::wrap::hipInit(0 /* = flags */);
298 }
299
300 if (res == hipSuccess) {
301 return port::Status::OK();
302 }
303
304 LOG(ERROR) << "failed call to hipInit: " << ToString(res);
305 Diagnostician::LogDiagnosticInformation();
306 return port::Status{port::error::ABORTED,
307 absl::StrCat("failed call to hipInit: ", ToString(res))};
308 }
309
310 } // namespace
311
Init()312 /* static */ port::Status GpuDriver::Init() {
313 // Cached return value from calling InternalInit(), as hipInit need only be
314 // called once, but GpuDriver::Init may be called many times.
315 static port::Status* init_retval = [] {
316 return new port::Status(InternalInit());
317 }();
318 return *init_retval;
319 }
320
GetDevice(int device_ordinal,hipDevice_t * device)321 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
322 hipDevice_t* device) {
323 hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
324 if (res == hipSuccess) {
325 return port::Status::OK();
326 }
327
328 return port::Status{
329 port::error::INTERNAL,
330 absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
331 }
332
GetDeviceName(hipDevice_t device,string * device_name)333 /* static */ port::Status GpuDriver::GetDeviceName(hipDevice_t device,
334 string* device_name) {
335 static const size_t kCharLimit = 64;
336 absl::InlinedVector<char, 4> chars(kCharLimit);
337 RETURN_IF_ROCM_ERROR(
338 tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device),
339 "Failed to get device name");
340 chars[kCharLimit - 1] = '\0';
341 *device_name = chars.begin();
342 return port::Status::OK();
343 }
344
DeviceOptionsToContextFlags(const DeviceOptions & device_options,int * flags)345 bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
346 int* flags) {
347 static_assert(DeviceOptions::kMask == 0xf,
348 "needs update for new device options");
349 return true;
350 }
351
CreateContext(int device_ordinal,hipDevice_t device,const DeviceOptions & device_options,GpuContext ** context)352 /* static */ port::Status GpuDriver::CreateContext(
353 int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
354 GpuContext** context) {
355 // TODO(hanbinyoon): Create a real context, i.e., by calling hipCtxCreate().
356 *context = new GpuContext(device_ordinal);
357 return port::Status::OK();
358 }
DestroyContext(GpuContext * context)359 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
360 if (context == nullptr) {
361 return;
362 }
363 delete context;
364 }
365
GetContextHandle(GpuContext * context)366 /* static */ hipCtx_t GpuDriver::GetContextHandle(GpuContext* context) {
367 // TODO(hanbinyoon): Return a real context.
368 return nullptr;
369 }
370
FuncGetAttribute(hipDeviceAttribute_t attribute,hipFunction_t func,int * attribute_value)371 /* static */ port::Status GpuDriver::FuncGetAttribute(
372 hipDeviceAttribute_t attribute, hipFunction_t func, int* attribute_value) {
373 // TODO(ROCm) properly implement this feature in HIP
374 return port::Status::OK();
375 }
376
FuncSetCacheConfig(hipFunction_t function,hipFuncCache_t cache_config)377 /* static */ port::Status GpuDriver::FuncSetCacheConfig(
378 hipFunction_t function, hipFuncCache_t cache_config) {
379 RETURN_IF_ROCM_ERROR(
380 tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config),
381 "Failed to set ROCM kernel cache config.");
382 return port::Status::OK();
383 }
384
385 /* static */ port::StatusOr<hipSharedMemConfig>
ContextGetSharedMemConfig(GpuContext * context)386 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
387 hipSharedMemConfig shared_mem_config;
388 ScopedActivateContext activation{context};
389 RETURN_IF_ROCM_ERROR(
390 tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config),
391 "Failed to get shared memory config");
392 return shared_mem_config;
393 }
394
ContextSetSharedMemConfig(GpuContext * context,hipSharedMemConfig shared_mem_config)395 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
396 GpuContext* context, hipSharedMemConfig shared_mem_config) {
397 ScopedActivateContext activation{context};
398 RETURN_IF_ROCM_ERROR(
399 tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config),
400 "Failed to set ROCM device shared memory config");
401 return port::Status::OK();
402 }
403
LaunchKernel(GpuContext * context,absl::string_view kernel_name,hipFunction_t function,unsigned int grid_dim_x,unsigned int grid_dim_y,unsigned int grid_dim_z,unsigned int block_dim_x,unsigned int block_dim_y,unsigned int block_dim_z,unsigned int shared_mem_bytes,GpuStreamHandle stream,void ** kernel_params,void ** extra)404 /* static */ port::Status GpuDriver::LaunchKernel(
405 GpuContext* context, absl::string_view kernel_name, hipFunction_t function,
406 unsigned int grid_dim_x, unsigned int grid_dim_y, unsigned int grid_dim_z,
407 unsigned int block_dim_x, unsigned int block_dim_y,
408 unsigned int block_dim_z, unsigned int shared_mem_bytes,
409 GpuStreamHandle stream, void** kernel_params, void** extra) {
410 ScopedActivateContext activation{context};
411 VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
412 << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
413 << " bdx: " << block_dim_x << " bdy: " << block_dim_y
414 << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
415 RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipModuleLaunchKernel(
416 function, grid_dim_x, grid_dim_y, grid_dim_z,
417 block_dim_x, block_dim_y, block_dim_z,
418 shared_mem_bytes, stream, kernel_params, extra),
419 "Failed to launch ROCm kernel: ", kernel_name,
420 " with block dimensions: ", block_dim_x, "x",
421 block_dim_y, "x", block_dim_z);
422 VLOG(2) << "successfully launched kernel";
423 return port::Status::OK();
424 }
425
LoadPtx(GpuContext * context,const char * ptx_contents,hipModule_t * module)426 /* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
427 const char* ptx_contents,
428 hipModule_t* module) {
429 LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
430 return port::InternalError("Not Implemented");
431 }
432
LoadCubin(GpuContext * context,const char * cubin_bytes,hipModule_t * module)433 /* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
434 const char* cubin_bytes,
435 hipModule_t* module) {
436 return port::Status{port::error::INTERNAL,
437 "Feature not supported on ROCm platform (LoadCubin)"};
438 }
439
LoadHsaco(GpuContext * context,const char * hsaco_contents,hipModule_t * module)440 /* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
441 const char* hsaco_contents,
442 hipModule_t* module) {
443 absl::Notification notification;
444 port::Status ret = port::Status::OK();
445 GetDriverExecutor()->Schedule([context, hsaco_contents, module, &ret,
446 ¬ification]() {
447 ScopedActivateContext activation{context};
448 void* hsaco_data = const_cast<char*>(hsaco_contents);
449
450 hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
451
452 if (res != hipSuccess) {
453 ret = port::InternalError(
454 absl::StrCat("Failed to load HSACO: ", ToString(res)));
455 notification.Notify();
456 }
457
458 CHECK(module != nullptr);
459 notification.Notify();
460 });
461 notification.WaitForNotification();
462
463 return ret;
464 }
465
SynchronousMemsetUint8(GpuContext * context,hipDeviceptr_t location,uint8 value,size_t size)466 /* static */ port::Status GpuDriver::SynchronousMemsetUint8(
467 GpuContext* context, hipDeviceptr_t location, uint8 value, size_t size) {
468 ScopedActivateContext activation{context};
469 RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipMemsetD8(location, value, size),
470 "Failed to memset memory");
471 return port::Status::OK();
472 }
473
SynchronousMemsetUint32(GpuContext * context,hipDeviceptr_t location,uint32 value,size_t uint32_count)474 /* static */ port::Status GpuDriver::SynchronousMemsetUint32(
475 GpuContext* context, hipDeviceptr_t location, uint32 value,
476 size_t uint32_count) {
477 ScopedActivateContext activation{context};
478 void* pointer = absl::bit_cast<void*>(location);
479 RETURN_IF_ROCM_ERROR(
480 tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count),
481 "Failed to memset memory");
482 return port::Status::OK();
483 }
484
AsynchronousMemsetUint8(GpuContext * context,hipDeviceptr_t location,uint8 value,size_t uint32_count,GpuStreamHandle stream)485 /* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
486 GpuContext* context, hipDeviceptr_t location, uint8 value,
487 size_t uint32_count, GpuStreamHandle stream) {
488 ScopedActivateContext activation{context};
489 RETURN_IF_ROCM_ERROR(
490 tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream),
491 "Failed to enqueue async memset operation");
492 return port::Status::OK();
493 }
494
AsynchronousMemsetUint32(GpuContext * context,hipDeviceptr_t location,uint32 value,size_t uint32_count,GpuStreamHandle stream)495 /* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
496 GpuContext* context, hipDeviceptr_t location, uint32 value,
497 size_t uint32_count, GpuStreamHandle stream) {
498 ScopedActivateContext activation{context};
499 void* pointer = absl::bit_cast<void*>(location);
500 RETURN_IF_ROCM_ERROR(
501 tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream),
502 "Failed to enqueue async memset operation");
503 VLOG(2) << "successfully enqueued async memset operation";
504 return port::Status::OK();
505 }
506
AddStreamCallback(GpuContext * context,GpuStreamHandle stream,StreamCallback callback,void * data)507 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
508 GpuStreamHandle stream,
509 StreamCallback callback,
510 void* data) {
511 hipError_t res = tensorflow::wrap::hipStreamAddCallback(
512 stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
513 if (res != hipSuccess) {
514 LOG(ERROR) << "unable to add host callback: " << ToString(res);
515 return false;
516 }
517 return true;
518 }
519
GetModuleFunction(GpuContext * context,hipModule_t module,const char * kernel_name,hipFunction_t * function)520 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
521 hipModule_t module,
522 const char* kernel_name,
523 hipFunction_t* function) {
524 ScopedActivateContext activated{context};
525 CHECK(module != nullptr && kernel_name != nullptr);
526 hipError_t res =
527 tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
528 if (res != hipSuccess) {
529 LOG(ERROR) << "failed to get kernel \"" << kernel_name
530 << "\" from module: " << ToString(res);
531 return false;
532 }
533
534 return true;
535 }
536
GetModuleSymbol(GpuContext * context,hipModule_t module,const char * symbol_name,hipDeviceptr_t * dptr,size_t * bytes)537 /* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
538 hipModule_t module,
539 const char* symbol_name,
540 hipDeviceptr_t* dptr,
541 size_t* bytes) {
542 ScopedActivateContext activated{context};
543 CHECK(module != nullptr && symbol_name != nullptr &&
544 (dptr != nullptr || bytes != nullptr));
545 hipError_t res =
546 tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
547 if (res != hipSuccess) {
548 // symbol may not be found in the current module, but it may reside in
549 // another module.
550 VLOG(2) << "failed to get symbol \"" << symbol_name
551 << "\" from module: " << ToString(res);
552 return false;
553 }
554
555 return true;
556 }
557
UnloadModule(GpuContext * context,hipModule_t module)558 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
559 hipModule_t module) {
560 ScopedActivateContext activated{context};
561 hipError_t res = tensorflow::wrap::hipModuleUnload(module);
562 if (res != hipSuccess) {
563 LOG(ERROR) << "failed to unload module " << module
564 << "; leaking: " << ToString(res);
565 }
566 }
567
CreateStream(GpuContext * context,GpuStreamHandle * stream,int priority)568 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
569 GpuStreamHandle* stream,
570 int priority) {
571 ScopedActivateContext activated{context};
572 hipError_t res;
573 if (priority == 0) {
574 res = tensorflow::wrap::hipStreamCreateWithFlags(
575 stream, hipStreamDefault); // switch to hipStreamNonBlocking?
576 } else {
577 res = tensorflow::wrap::hipStreamCreateWithPriority(
578 stream, hipStreamDefault, priority); // switch to hipStreamNonBlocking?
579 }
580 if (res != hipSuccess) {
581 LOG(ERROR) << "could not allocate ROCM stream for device "
582 << context->device_ordinal() << ": " << ToString(res);
583 return false;
584 }
585
586 VLOG(2) << "successfully created stream " << *stream << " for device "
587 << context->device_ordinal() << " on thread";
588 return true;
589 }
590
DestroyStream(GpuContext * context,GpuStreamHandle * stream)591 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
592 GpuStreamHandle* stream) {
593 if (*stream == nullptr) {
594 return;
595 }
596
597 ScopedActivateContext activated{context};
598 hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
599 if (res != hipSuccess) {
600 LOG(ERROR) << "failed to destroy ROCM stream for device "
601 << context->device_ordinal() << ": " << ToString(res);
602 } else {
603 VLOG(2) << "successfully destroyed stream " << *stream << " for device "
604 << context->device_ordinal();
605 *stream = nullptr;
606 }
607 }
608
DeviceAllocate(GpuContext * context,uint64_t bytes)609 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
610 uint64_t bytes) {
611 ScopedActivateContext activated{context};
612 hipDeviceptr_t result = 0;
613 hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
614 if (res != hipSuccess) {
615 LOG(ERROR) << "failed to allocate "
616 << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
617 << " bytes) from device: " << ToString(res);
618 return nullptr;
619 }
620 void* ptr = reinterpret_cast<void*>(result);
621 VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
622 << " of " << bytes << " bytes";
623 return ptr;
624 }
625
DeviceDeallocate(GpuContext * context,void * location)626 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
627 void* location) {
628 ScopedActivateContext activation{context};
629 hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
630 hipError_t res = tensorflow::wrap::hipFree(pointer);
631 if (res != hipSuccess) {
632 LOG(ERROR) << "failed to free device memory at " << location
633 << "; result: " << ToString(res);
634 } else {
635 VLOG(2) << "deallocated " << location << " for device "
636 << context->device_ordinal();
637 }
638 }
639
UnifiedMemoryAllocate(GpuContext * context,uint64_t bytes)640 /* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
641 uint64_t bytes) {
642 ScopedActivateContext activated{context};
643
644 LOG(ERROR)
645 << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
646 return nullptr;
647 }
648
UnifiedMemoryDeallocate(GpuContext * context,void * location)649 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
650 void* location) {
651 LOG(ERROR)
652 << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
653 }
654
HostAllocate(GpuContext * context,uint64_t bytes)655 /* static */ void* GpuDriver::HostAllocate(GpuContext* context,
656 uint64_t bytes) {
657 ScopedActivateContext activation{context};
658 void* host_mem = nullptr;
659 // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
660 hipError_t res =
661 tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
662 if (res != hipSuccess) {
663 LOG(ERROR) << "failed to alloc " << bytes
664 << " bytes on host: " << ToString(res);
665 }
666 return host_mem;
667 }
668
HostDeallocate(GpuContext * context,void * location)669 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
670 void* location) {
671 ScopedActivateContext activation{context};
672 hipError_t res = tensorflow::wrap::hipHostFree(location);
673 if (res != hipSuccess) {
674 LOG(ERROR) << "error deallocating host memory at " << location << ": "
675 << ToString(res);
676 }
677 }
678
HostRegister(GpuContext * context,void * location,uint64_t bytes)679 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
680 uint64_t bytes) {
681 ScopedActivateContext activation{context};
682 // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
683 hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
684 hipHostRegisterPortable);
685 if (res != hipSuccess) {
686 LOG(ERROR) << "error registering host memory at " << location << ": "
687 << ToString(res);
688 return false;
689 }
690 return true;
691 }
692
HostUnregister(GpuContext * context,void * location)693 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
694 void* location) {
695 ScopedActivateContext activation{context};
696 hipError_t res = tensorflow::wrap::hipHostUnregister(location);
697 if (res != hipSuccess) {
698 LOG(ERROR) << "error unregistering host memory at " << location << ": "
699 << ToString(res);
700 return false;
701 }
702 return true;
703 }
704
DestroyEvent(GpuContext * context,GpuEventHandle * event)705 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
706 GpuEventHandle* event) {
707 if (*event == nullptr) {
708 return port::Status{port::error::INVALID_ARGUMENT,
709 "input event cannot be null"};
710 }
711
712 ScopedActivateContext activated{context};
713 hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
714 *event = nullptr;
715
716 switch (res) {
717 case hipSuccess:
718 return port::Status::OK();
719 case hipErrorDeinitialized:
720 case hipErrorNotInitialized:
721 return port::Status{
722 port::error::FAILED_PRECONDITION,
723 absl::StrFormat("error destroying ROCM event in device %d: %s",
724 context->device_ordinal(), ToString(res).c_str())};
725 default:
726 return port::Status{
727 port::error::INTERNAL,
728 absl::StrFormat("error destroying ROCM event in device %d: %s",
729 context->device_ordinal(), ToString(res).c_str())};
730 }
731 }
732
RecordEvent(GpuContext * context,GpuEventHandle event,GpuStreamHandle stream)733 /* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
734 GpuEventHandle event,
735 GpuStreamHandle stream) {
736 ScopedActivateContext activated{context};
737 hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
738 switch (res) {
739 case hipSuccess:
740 return port::Status::OK();
741 case hipErrorDeinitialized:
742 case hipErrorNotInitialized:
743 return port::Status{
744 port::error::FAILED_PRECONDITION,
745 absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
746 ToString(res).c_str())};
747 default:
748 return port::Status{
749 port::error::INVALID_ARGUMENT,
750 absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
751 ToString(res).c_str())};
752 }
753 }
754
QueryEvent(GpuContext * context,GpuEventHandle event)755 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
756 GpuContext* context, GpuEventHandle event) {
757 ScopedActivateContext activated{context};
758 hipError_t res = tensorflow::wrap::hipEventQuery(event);
759 if (res != hipSuccess && res != hipErrorNotReady) {
760 return port::Status{
761 port::error::INTERNAL,
762 absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
763 }
764
765 return res;
766 }
767
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,GpuEventHandle start,GpuEventHandle stop)768 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
769 float* elapsed_milliseconds,
770 GpuEventHandle start,
771 GpuEventHandle stop) {
772 ScopedActivateContext activated{context};
773 // The stop event must have completed in order for hipEventElapsedTime to
774 // work.
775 hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
776 if (res != hipSuccess) {
777 LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
778 return false;
779 }
780 res =
781 tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
782 if (res != hipSuccess) {
783 LOG(ERROR) << "failed to get elapsed time between events: "
784 << ToString(res);
785 return false;
786 }
787
788 return true;
789 }
790
WaitStreamOnEvent(GpuContext * context,GpuStreamHandle stream,GpuEventHandle event)791 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
792 GpuStreamHandle stream,
793 GpuEventHandle event) {
794 ScopedActivateContext activation{context};
795 hipError_t res =
796 tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
797 if (res != hipSuccess) {
798 LOG(ERROR) << "could not wait stream on event: " << ToString(res);
799 return false;
800 }
801
802 return true;
803 }
804
SynchronizeContext(GpuContext * context)805 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
806 ScopedActivateContext activation{context};
807 hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
808 if (res != hipSuccess) {
809 LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
810 << " :: " << port::CurrentStackTrace();
811 return false;
812 }
813
814 return true;
815 }
816
SynchronizeStream(GpuContext * context,GpuStreamHandle stream)817 /* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
818 GpuStreamHandle stream) {
819 ScopedActivateContext activated{context};
820 CHECK(stream != nullptr);
821 RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
822 "Could not synchronize on ROCM stream");
823 VLOG(2) << "successfully synchronized stream " << stream << " on device "
824 << context->device_ordinal();
825 return port::Status::OK();
826 }
827
IsStreamIdle(GpuContext * context,GpuStreamHandle stream)828 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
829 GpuStreamHandle stream) {
830 ScopedActivateContext activated{context};
831 CHECK(stream != nullptr);
832 hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
833 if (res == hipSuccess) {
834 return true;
835 }
836
837 if (res != hipErrorNotReady) {
838 LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
839 }
840 return false;
841 }
842
SynchronousMemcpyD2H(GpuContext * context,void * host_dst,hipDeviceptr_t gpu_src,uint64_t size)843 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
844 GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src,
845 uint64_t size) {
846 ScopedActivateContext activation{context};
847 RETURN_IF_ROCM_ERROR(
848 tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size),
849 absl::StrFormat("failed to synchronous memcpy from device to host: "
850 "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
851 host_dst, absl::bit_cast<void*>(gpu_src), size, size));
852 VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
853 << host_dst;
854 return port::Status::OK();
855 }
856
SynchronousMemcpyH2D(GpuContext * context,hipDeviceptr_t gpu_dst,const void * host_src,uint64_t size)857 /* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
858 GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
859 uint64_t size) {
860 ScopedActivateContext activation{context};
861 RETURN_IF_ROCM_ERROR(
862 tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src),
863 size),
864 absl::StrFormat(
865 "failed to synchronous memcpy from host to device: Gpu dst: %p;"
866 " host src: %p; size: %llu=0x%llx",
867 absl::bit_cast<void*>(gpu_dst), host_src, size, size));
868 VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
869 return port::Status::OK();
870 }
871
SynchronousMemcpyD2D(GpuContext * context,hipDeviceptr_t gpu_dst,hipDeviceptr_t gpu_src,uint64_t size)872 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
873 GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
874 uint64_t size) {
875 ScopedActivateContext activation{context};
876 RETURN_IF_ROCM_ERROR(
877 tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size),
878 absl::StrFormat(
879 "failed to synchronous memcpy from host to device:Gpu dst: %p; "
880 "Gpu src: %p; size: %llu=0x%llx",
881 absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
882 size));
883 VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
884 return port::Status::OK();
885 }
886
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,hipDeviceptr_t gpu_src,uint64_t size,GpuStreamHandle stream)887 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
888 void* host_dst,
889 hipDeviceptr_t gpu_src,
890 uint64_t size,
891 GpuStreamHandle stream) {
892 ScopedActivateContext activation{context};
893 hipError_t res =
894 tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
895 if (res != hipSuccess) {
896 LOG(ERROR) << absl::StrFormat(
897 "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
898 "Gpu src: %p; size: %llu=0x%llx",
899 ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
900 size);
901 return false;
902 }
903 VLOG(2) << "successfully enqueued async memcpy d2h of " << size
904 << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
905 << host_dst << " on stream " << stream;
906 return true;
907 }
908
AsynchronousMemcpyH2D(GpuContext * context,hipDeviceptr_t gpu_dst,const void * host_src,uint64_t size,GpuStreamHandle stream)909 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
910 hipDeviceptr_t gpu_dst,
911 const void* host_src,
912 uint64_t size,
913 GpuStreamHandle stream) {
914 ScopedActivateContext activation{context};
915 hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
916 gpu_dst, const_cast<void*>(host_src), size, stream);
917 if (res != hipSuccess) {
918 LOG(ERROR) << absl::StrFormat(
919 "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
920 "host src: %p; size: %llu=0x%llx",
921 ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
922 size);
923 return false;
924 }
925 VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
926 << " on stream " << stream;
927 return true;
928 }
929
AsynchronousMemcpyD2D(GpuContext * context,hipDeviceptr_t gpu_dst,hipDeviceptr_t gpu_src,uint64_t size,GpuStreamHandle stream)930 /* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
931 hipDeviceptr_t gpu_dst,
932 hipDeviceptr_t gpu_src,
933 uint64_t size,
934 GpuStreamHandle stream) {
935 ScopedActivateContext activation{context};
936 hipError_t result =
937 tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
938 if (result != hipSuccess) {
939 LOG(ERROR) << absl::StrFormat(
940 "failed to enqueue async memcpy from device to device: %s"
941 "; Gpu dst: %p on %s %s"
942 "; Gpu src: %p on %s %s"
943 "; can access? %s; size: %llu=0x%llx",
944 ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
945 ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
946 ROCMPointerToDeviceString(gpu_dst).c_str(),
947 absl::bit_cast<void*>(gpu_src),
948 ROCMPointerToMemorySpaceString(gpu_src).c_str(),
949 ROCMPointerToDeviceString(gpu_src).c_str(),
950 ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
951
952 return false;
953 }
954 VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
955 return true;
956 }
957
InitEvent(GpuContext * context,GpuEventHandle * event,EventFlags flags)958 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
959 GpuEventHandle* event,
960 EventFlags flags) {
961 int hipflags;
962 switch (flags) {
963 case EventFlags::kDefault:
964 hipflags = hipEventDefault;
965 break;
966 case EventFlags::kDisableTiming:
967 hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
968 break;
969 default:
970 LOG(FATAL) << "impossible event flags: " << int(hipflags);
971 }
972
973 ScopedActivateContext activated{context};
974 hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
975
976 if (res == hipSuccess) {
977 return port::Status::OK();
978 } else if (res == hipErrorMemoryAllocation) {
979 return port::Status{port::error::RESOURCE_EXHAUSTED,
980 "could not create ROCM event: out of device memory"};
981 } else {
982 return port::Status{
983 port::error::FAILED_PRECONDITION,
984 absl::StrCat("could not create ROCM event: ", ToString(res))};
985 }
986 }
987
GetDeviceCount()988 /* static */ int GpuDriver::GetDeviceCount() {
989 int device_count = 0;
990 hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
991 if (res != hipSuccess) {
992 LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
993 return 0;
994 }
995
996 if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
997 device_count = 1;
998 }
999 return device_count;
1000 }
1001
GetComputeCapability(int * cc_major,int * cc_minor,hipDevice_t device)1002 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1003 int* cc_minor,
1004 hipDevice_t device) {
1005 return port::Status(
1006 port::error::INTERNAL,
1007 absl::StrFormat("failed to get compute capability for device: %d "
1008 "(unsupported API on AMD Gpus)",
1009 device));
1010 }
1011
GetPointerAddressRange(hipDeviceptr_t dptr,hipDeviceptr_t * base,size_t * size)1012 /* static */ port::Status GpuDriver::GetPointerAddressRange(
1013 hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
1014 hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
1015 if (result == hipSuccess) {
1016 return port::Status::OK();
1017 } else if (result == hipErrorNotFound) {
1018 // We differentiate between "this pointer is unknown" (return here) and
1019 // "there was an internal error while performing this operation" (return
1020 // below).
1021 return port::Status{port::error::NOT_FOUND,
1022 absl::StrFormat("not a device pointer %p; %s",
1023 reinterpret_cast<void*>(dptr),
1024 ToString(result).c_str())};
1025 }
1026
1027 return port::Status{
1028 port::error::INTERNAL,
1029 absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1030 reinterpret_cast<void*>(dptr), ToString(result).c_str())};
1031 }
1032
GetPointerMemorySpace(hipDeviceptr_t pointer)1033 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1034 hipDeviceptr_t pointer) {
1035 unsigned int value;
1036 hipError_t result = hipSuccess;
1037 if (result == hipSuccess) {
1038 switch (value) {
1039 case hipMemoryTypeDevice:
1040 return MemorySpace::kDevice;
1041 case hipMemoryTypeHost:
1042 return MemorySpace::kHost;
1043 default:
1044 return port::Status{
1045 port::error::INTERNAL,
1046 absl::StrCat("unknown memory space provided by ROCM API: ", value)};
1047 }
1048 }
1049
1050 return port::Status{
1051 port::error::INTERNAL,
1052 absl::StrCat("failed to query device pointer for memory space: ",
1053 ToString(result))};
1054 }
1055
GetPointerDevice(hipDeviceptr_t pointer)1056 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
1057 hipDeviceptr_t pointer) {
1058 hipPointerAttribute_t pointerAttributes;
1059 hipError_t result =
1060 tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
1061 if (result != hipSuccess) {
1062 return port::Status{
1063 port::error::INTERNAL,
1064 absl::StrCat("failed to get device for pointer: ", ToString(result))};
1065 }
1066
1067 hipDevice_t device;
1068 result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
1069 if (result != hipSuccess) {
1070 return port::Status{
1071 port::error::INTERNAL,
1072 absl::StrCat("failed to get device for pointer: ", ToString(result))};
1073 }
1074
1075 return device;
1076 }
1077
GetGpuISAVersion(int * version,hipDevice_t device)1078 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1079 hipDevice_t device) {
1080 hipDeviceProp_t props;
1081 hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1082 if (result == hipSuccess) {
1083 *version = props.gcnArch;
1084 return port::Status::OK();
1085 }
1086 *version = 0;
1087 return port::Status{
1088 port::error::INTERNAL,
1089 absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
1090 device)};
1091 }
1092
GetGpuGCNArchName(hipDevice_t device,std::string * gcnArchName)1093 /* static */ port::Status GpuDriver::GetGpuGCNArchName(
1094 hipDevice_t device, std::string* gcnArchName) {
1095 hipDeviceProp_t props;
1096 hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1097 if (result == hipSuccess) {
1098 *gcnArchName = props.gcnArchName;
1099 return port::Status::OK();
1100 }
1101 *gcnArchName = "";
1102 return port::Status{
1103 port::error::INTERNAL,
1104 absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
1105 device)};
1106 }
1107
GetMFMASupport()1108 /* static */ port::StatusOr<bool> GpuDriver::GetMFMASupport() {
1109 hipDeviceProp_t props;
1110 int dev = 0;
1111 hipError_t result = hipGetDevice(&dev);
1112 result = tensorflow::wrap::hipGetDeviceProperties(&props, dev);
1113 if (result == hipSuccess) {
1114 std::string gcnArchName = props.gcnArchName;
1115 VLOG(1) << "GCN arch name " << gcnArchName;
1116 auto pos = gcnArchName.find(":");
1117 if (pos != string::npos) gcnArchName = gcnArchName.substr(0, pos);
1118 pos = gcnArchName.find("gfx");
1119 if (pos != string::npos) gcnArchName = gcnArchName.substr(pos + 3);
1120 VLOG(1) << "GCN arch name (stripped) " << gcnArchName;
1121 return ((gcnArchName == "908") || (gcnArchName == "909"));
1122 }
1123 return port::Status{
1124 port::error::INTERNAL,
1125 absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
1126 dev)};
1127 }
1128
1129 // Helper function that turns the integer output of hipDeviceGetAttribute to
1130 // type T and wraps it in a StatusOr.
1131 template <typename T>
GetSimpleAttribute(hipDevice_t device,hipDeviceAttribute_t attribute)1132 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
1133 hipDeviceAttribute_t attribute) {
1134 int value = -1;
1135 hipError_t result =
1136 tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
1137 if (result != hipSuccess) {
1138 return port::Status{
1139 port::error::NOT_FOUND,
1140 absl::StrCat("could not retrieve ROCM device attribute (", attribute,
1141 "): ", ToString(result))};
1142 }
1143 T converted = value;
1144 return converted;
1145 }
1146
// Returns the device's multiprocessor (compute unit) count via
// hipDeviceAttributeMultiprocessorCount.
/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
    hipDevice_t device) {
  return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
}
1151
// Returns the maximum shared (LDS) memory available per multiprocessor,
// via hipDeviceAttributeMaxSharedMemoryPerMultiprocessor.
/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerCore(
    hipDevice_t device) {
  return GetSimpleAttribute<int64_t>(
      device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
}
1157
// Returns the maximum shared (LDS) memory a single block may use, via
// hipDeviceAttributeMaxSharedMemoryPerBlock.
/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64_t>(device,
                                     hipDeviceAttributeMaxSharedMemoryPerBlock);
}
1163
// Returns the maximum number of resident threads per multiprocessor, via
// hipDeviceAttributeMaxThreadsPerMultiProcessor.
/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerMultiprocessor(
    hipDevice_t device) {
  return GetSimpleAttribute<int64_t>(
      device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
}
1169
// Returns the maximum number of threads allowed in a single block, via
// hipDeviceAttributeMaxThreadsPerBlock.
/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64_t>(device,
                                     hipDeviceAttributeMaxThreadsPerBlock);
}
1175
// Returns the maximum number of registers a single block may use, via
// hipDeviceAttributeMaxRegistersPerBlock.
/* static */ port::StatusOr<int64_t> GpuDriver::GetMaxRegistersPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64_t>(device,
                                     hipDeviceAttributeMaxRegistersPerBlock);
}
1181
// Returns the warp (wavefront) size of the device, via
// hipDeviceAttributeWarpSize.
/* static */ port::StatusOr<int64_t> GpuDriver::GetThreadsPerWarp(
    hipDevice_t device) {
  return GetSimpleAttribute<int64_t>(device, hipDeviceAttributeWarpSize);
}
1186
GetGridLimits(int * x,int * y,int * z,hipDevice_t device)1187 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1188 hipDevice_t device) {
1189 int value;
1190 hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
1191 &value, hipDeviceAttributeMaxGridDimX, device);
1192 if (res != hipSuccess) {
1193 LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1194 return false;
1195 }
1196 *x = value;
1197
1198 res = tensorflow::wrap::hipDeviceGetAttribute(
1199 &value, hipDeviceAttributeMaxGridDimY, device);
1200 if (res != hipSuccess) {
1201 LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1202 return false;
1203 }
1204 *y = value;
1205
1206 res = tensorflow::wrap::hipDeviceGetAttribute(
1207 &value, hipDeviceAttributeMaxGridDimZ, device);
1208 if (res != hipSuccess) {
1209 LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1210 return false;
1211 }
1212 *z = value;
1213 return true;
1214 }
1215
GetDriverVersion(int * driver_version)1216 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1217 hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
1218 if (res != hipSuccess) {
1219 LOG(ERROR) << "failed to query driver version: " << ToString(res);
1220 return false;
1221 }
1222
1223 return true;
1224 }
1225
GetDeviceProperties(hipDeviceProp_t * device_properties,int device_ordinal)1226 /* static */ bool GpuDriver::GetDeviceProperties(
1227 hipDeviceProp_t* device_properties, int device_ordinal) {
1228 hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
1229 device_ordinal);
1230 if (res != hipSuccess) {
1231 LOG(ERROR) << "failed to query device properties: " << ToString(res);
1232 return false;
1233 }
1234
1235 return true;
1236 }
1237
// Generic accessor for any integer-valued device attribute; thin wrapper
// over GetSimpleAttribute<int>.
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    hipDeviceAttribute_t attribute, hipDevice_t device) {
  return GetSimpleAttribute<int>(device, attribute);
}
1242
// Queries whether ECC is enabled on `device`. Currently a stub: no HIP
// query is performed, so the error branch below is unreachable.
/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
  int value = -1;
  hipError_t res = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  // NOTE(review): with the query unimplemented, `value` keeps its -1
  // sentinel, which converts to `true` here — i.e. ECC is always reported
  // as enabled. Confirm callers treat this as intended.
  *result = value;
  return true;
}
1255
GetDeviceMemoryInfo(GpuContext * context,int64_t * free_out,int64_t * total_out)1256 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1257 int64_t* free_out,
1258 int64_t* total_out) {
1259 ScopedActivateContext activation{context};
1260 size_t free = 0;
1261 size_t total = 0;
1262 hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
1263 if (res != hipSuccess) {
1264 LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1265 return false;
1266 }
1267
1268 *free_out = free;
1269 *total_out = total;
1270 return true;
1271 }
1272
GetDeviceTotalMemory(hipDevice_t device,uint64_t * result)1273 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
1274 uint64_t* result) {
1275 size_t value = -1;
1276 hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
1277 if (res != hipSuccess) {
1278 LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1279 return false;
1280 }
1281
1282 *result = value;
1283 return true;
1284 }
1285
GetPCIBusID(hipDevice_t device)1286 /* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
1287 string pci_bus_id;
1288 static const int kBufferSize = 64;
1289 absl::InlinedVector<char, 4> chars(kBufferSize);
1290 chars[kBufferSize - 1] = '\0';
1291 hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
1292 chars.begin(), kBufferSize - 1, device);
1293 if (res != hipSuccess) {
1294 LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1295 return pci_bus_id;
1296 }
1297 pci_bus_id = chars.begin();
1298 return pci_bus_id;
1299 }
1300
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1301 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1302 GpuContext* to) {
1303 if (from->device_ordinal() == to->device_ordinal()) {
1304 return true; // A device can always access its own memory.
1305 }
1306
1307 int can_access_peer = -1;
1308 hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
1309 &can_access_peer, from->device_ordinal(), to->device_ordinal());
1310 if (res != hipSuccess) {
1311 LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
1312 return false;
1313 }
1314
1315 return can_access_peer;
1316 }
1317
EnablePeerAccess(GpuContext * from,GpuContext * to)1318 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1319 GpuContext* to) {
1320 if (from->device_ordinal() == to->device_ordinal()) {
1321 return port::Status::OK(); // A device can always access its own memory.
1322 }
1323
1324 ScopedActivateContext activated{from};
1325 hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
1326 to->device_ordinal(), 0 /* = flags */);
1327 if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
1328 return port::Status{
1329 port::error::INTERNAL,
1330 absl::StrFormat("failed to enable peer access from %d to %d: %s",
1331 from->device_ordinal(), to->device_ordinal(),
1332 ToString(result).c_str())};
1333 }
1334
1335 return port::Status::OK();
1336 }
1337
// Computes how many blocks of `kernel` can be resident per multiprocessor
// given the thread count and dynamic shared memory usage. Currently a stub:
// no HIP occupancy query is performed, so the error branch is unreachable
// and the function always returns 0.
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, hipFunction_t kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation{context};

  int max_blocks = 0;
  hipError_t result = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  // NOTE(review): `result` stays hipSuccess, so callers always receive 0 —
  // confirm they tolerate a zero occupancy estimate.
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
                        kernel, ToString(result).c_str())};
  }

  return max_blocks;
}
1355
1356 } // namespace gpu
1357 } // namespace stream_executor
1358