xref: /aosp_15_r20/external/tensorflow/tensorflow/stream_executor/rocm/rocm_driver.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <stdint.h>
17 #include <stdlib.h>
18 
19 #include <map>
20 #include <set>
21 #include <utility>
22 
23 #include "absl/base/casts.h"
24 #include "absl/container/inlined_vector.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/synchronization/mutex.h"
28 #include "absl/synchronization/notification.h"
29 #include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
30 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
31 #include "tensorflow/stream_executor/lib/env.h"
32 #include "tensorflow/stream_executor/lib/error.h"
33 #include "tensorflow/stream_executor/lib/human_readable.h"
34 #include "tensorflow/stream_executor/lib/stacktrace.h"
35 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
36 #include "tensorflow/stream_executor/lib/threadpool.h"
37 #include "tensorflow/stream_executor/platform/logging.h"
38 #include "tensorflow/stream_executor/platform/port.h"
39 #include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
40 
// When true, InternalInit() skips hipInit() and reports a failure instead,
// to exercise initialization error-handling paths.
bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
// When true, ScopedActivateContext runs hipDeviceSynchronize() on both entry
// and exit (crashing on failure) -- a debugging aid for localizing
// asynchronous driver errors.
bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
// Not referenced in the code visible in this file; presumably consulted by
// device-enumeration code elsewhere.
bool FLAGS_gpuexec_rocm_device_0_only = false;
44 
// Evaluates `expr` (which must yield a hipError_t); if it is not hipSuccess,
// returns a port::InternalError whose message is the remaining arguments
// StrCat'ed together, followed by ": " and the stringified HIP error.
// Only usable inside functions returning port::Status or port::StatusOr<T>.
#define RETURN_IF_ROCM_ERROR(expr, ...)                                \
  do {                                                                 \
    hipError_t _res = (expr);                                          \
    if (TF_PREDICT_FALSE(_res != hipSuccess)) {                        \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)
53 
// Debugging: on each push and pop of a rocm context, verify the current device
// matches the expected one (enforced via CHECK_EQ inside
// ScopedActivateContext). Compile-time constant so the checks fold away.
constexpr bool kVerifyGpuContext = false;
57 
58 namespace stream_executor {
59 namespace gpu {
60 
// Thin wrapper that records a device ordinal.
// It exists only so the shared GpuDriver* API has a GpuContext type to pass
// around; ROCm has no real driver context object here yet.
class GpuContext {
 public:
  GpuContext(const int v) : ordinal_(v) {}

  int device_ordinal() const { return ordinal_; }

  // Non-copyable and non-movable: instances are identity objects owned by
  // GpuDriver::CreateContext/DestroyContext.
  GpuContext(const GpuContext&) = delete;
  GpuContext(GpuContext&&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;
  GpuContext& operator=(GpuContext&&) = delete;

 private:
  const int ordinal_;
};
78 
79 namespace {
80 
81 // Formats hipError_t to output prettified values into a log stream.
82 // Error summaries taken from:
ToString(hipError_t result)83 string ToString(hipError_t result) {
84 #define OSTREAM_ROCM_ERROR(__name) \
85   case hipError##__name:           \
86     return "HIP_ERROR_" #__name;
87 
88   switch (result) {
89     OSTREAM_ROCM_ERROR(InvalidValue)
90     OSTREAM_ROCM_ERROR(OutOfMemory)
91     OSTREAM_ROCM_ERROR(NotInitialized)
92     OSTREAM_ROCM_ERROR(Deinitialized)
93     OSTREAM_ROCM_ERROR(NoDevice)
94     OSTREAM_ROCM_ERROR(InvalidDevice)
95     OSTREAM_ROCM_ERROR(InvalidImage)
96     OSTREAM_ROCM_ERROR(InvalidContext)
97     OSTREAM_ROCM_ERROR(InvalidHandle)
98     OSTREAM_ROCM_ERROR(NotFound)
99     OSTREAM_ROCM_ERROR(NotReady)
100     OSTREAM_ROCM_ERROR(NoBinaryForGpu)
101 
102     // Encountered an uncorrectable ECC error during execution.
103     OSTREAM_ROCM_ERROR(ECCNotCorrectable)
104 
105     // Load/store on an invalid address. Must reboot all context.
106     case 700:
107       return "ROCM_ERROR_ILLEGAL_ADDRESS";
108     // Passed too many / wrong arguments, too many threads for register count.
109     case 701:
110       return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
111 
112       OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
113       OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
114       OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
115     default:
116       return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
117   }
118 }
119 
120 // ROCM driver routines may require a large amount of stack (particularly
121 // hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
122 // stack-limited threads (such as those spawned by a default-argument
123 // thread::ThreadPool on some platforms), we run certain routines in this pool
124 // and wait for completion.
GetDriverExecutor()125 port::ThreadPool* GetDriverExecutor() {
126   static port::ThreadPool* thread_pool = new port::ThreadPool(
127       port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
128   return thread_pool;
129 }
130 
131 }  // namespace
132 
MemorySpaceString(MemorySpace memory_space)133 string MemorySpaceString(MemorySpace memory_space) {
134   switch (memory_space) {
135     case MemorySpace::kHost:
136       return "host";
137     case MemorySpace::kDevice:
138       return "device";
139     default:
140       LOG(FATAL) << "impossible memory space";
141   }
142 }
143 
144 // Returns the current device set in HIP. This is done by calling the
145 // HIP driver (e.g., this value is not our cached view of the current device).
CurrentDeviceOrDie()146 static int CurrentDeviceOrDie() {
147   int current = -1;
148   hipError_t result = tensorflow::wrap::hipGetDevice(&current);
149   if (result != hipSuccess) {
150     LOG(FATAL) << "failed to query current device: " << ToString(result);
151   }
152   return current;
153 }
154 
155 namespace {
156 
157 // Call hipDeviceSynchronize and crash if it doesn't succeed.
SynchronizeOrDie()158 void SynchronizeOrDie() {
159   auto res = tensorflow::wrap::hipDeviceSynchronize();
160   if (res != hipSuccess) {
161     LOG(FATAL) << "Synchronize found " << ToString(res)
162                << " :: " << port::CurrentStackTrace();
163   }
164 }
165 
// Per-thread bookkeeping used by ScopedActivateContext.
struct ThreadLocalData {
  int current_device_ordinal;  // device this thread last saw/set as current
  int depth;  // nesting depth of live ScopedActivateContext instances
};

// Thread-local instance of the bookkeeping struct (macro-provided storage).
SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
172 
173 }  // namespace
174 
// Activates `context`'s device on the calling thread, tracking nesting depth
// in thread-local state so repeated activations of the same device avoid
// redundant hipSetDevice calls.
ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
  // Debug aid: surface pending async errors at the activation boundary.
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();
  // Outermost activation: seed the cached ordinal from the driver's actual
  // current device.
  if (tls->depth == 0) {
    tls->current_device_ordinal = CurrentDeviceOrDie();
  }

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth++;

  // Record the context whose device the destructor will re-assert on exit.
  to_restore_ = context;

  // Fast path: requested device is already current; nothing to switch.
  if (context->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to " << context->device_ordinal();

  // Set the device and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
  tls->current_device_ordinal = context->device_ordinal();
}
206 
// Unwinds one level of activation. Note this re-asserts the device of the
// context captured at construction (to_restore_), rather than restoring
// whatever device was current before construction.
ScopedActivateContext::~ScopedActivateContext() {
  // Debug aid: surface pending async errors at the deactivation boundary.
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);

  // Fast path: the captured context's device is still current.
  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to "
          << to_restore_->device_ordinal();

  // Set context and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
  tls->current_device_ordinal = to_restore_->device_ordinal();
}
235 
236 namespace {
237 
238 // Returns a stringified device number associated with pointer, primarily for
239 // logging purposes. Returns "?" if the device could not be successfully
240 // queried.
ROCMPointerToDeviceString(hipDeviceptr_t pointer)241 string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
242   auto value = GpuDriver::GetPointerDevice(pointer);
243   if (value.ok()) {
244     return absl::StrCat(value.ValueOrDie());
245   }
246   LOG(ERROR) << "could not query device: " << value.status();
247   return "?";
248 }
249 
250 // Returns a stringified memory space associated with pointer, primarily for
251 // logging purposes. Returns "?" if the memory space could not be successfully
252 // queried.
ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer)253 string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
254   auto value = GpuDriver::GetPointerMemorySpace(pointer);
255   if (value.ok()) {
256     return MemorySpaceString(value.ValueOrDie());
257   }
258   LOG(ERROR) << "could not query device: " << value.status();
259   return "?";
260 }
261 
262 // Returns a stringified representation of whether or not peer access is
263 // permitted between the "from" and "to" pointers' associated contexts,
264 // primarily for logging purposes. Returns "error" if an error is encountered
265 // in the process of querying.
ROCMPointersToCanAccessString(hipDeviceptr_t from,hipDeviceptr_t to)266 string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
267   hipPointerAttribute_t from_pointerAttributes;
268   hipError_t result =
269       tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
270   if (result != hipSuccess) {
271     LOG(ERROR) << "could not retrieve source pointer's device: "
272                << ToString(result);
273     return "error";
274   }
275 
276   hipPointerAttribute_t to_pointerAttributes;
277   result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
278   if (result != hipSuccess) {
279     LOG(ERROR) << "could not retrieve destination pointer's device: "
280                << ToString(result);
281     return "error";
282   }
283 
284   GpuContext fromCtx(from_pointerAttributes.device);
285   GpuContext toCtx(to_pointerAttributes.device);
286 
287   return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
288 }
289 
290 // Actually performs the work of ROCM initialization. Wrapped up in one-time
291 // execution guard.
InternalInit()292 static port::Status InternalInit() {
293   hipError_t res = hipErrorNoDevice;
294   if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
295     LOG(ERROR) << "injecting ROCM init error; initialization will fail";
296   } else {
297     res = tensorflow::wrap::hipInit(0 /* = flags */);
298   }
299 
300   if (res == hipSuccess) {
301     return port::Status::OK();
302   }
303 
304   LOG(ERROR) << "failed call to hipInit: " << ToString(res);
305   Diagnostician::LogDiagnosticInformation();
306   return port::Status{port::error::ABORTED,
307                       absl::StrCat("failed call to hipInit: ", ToString(res))};
308 }
309 
310 }  // namespace
311 
Init()312 /* static */ port::Status GpuDriver::Init() {
313   // Cached return value from calling InternalInit(), as hipInit need only be
314   // called once, but GpuDriver::Init may be called many times.
315   static port::Status* init_retval = [] {
316     return new port::Status(InternalInit());
317   }();
318   return *init_retval;
319 }
320 
GetDevice(int device_ordinal,hipDevice_t * device)321 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
322                                                hipDevice_t* device) {
323   hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
324   if (res == hipSuccess) {
325     return port::Status::OK();
326   }
327 
328   return port::Status{
329       port::error::INTERNAL,
330       absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
331 }
332 
GetDeviceName(hipDevice_t device,string * device_name)333 /* static */ port::Status GpuDriver::GetDeviceName(hipDevice_t device,
334                                                    string* device_name) {
335   static const size_t kCharLimit = 64;
336   absl::InlinedVector<char, 4> chars(kCharLimit);
337   RETURN_IF_ROCM_ERROR(
338       tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device),
339       "Failed to get device name");
340   chars[kCharLimit - 1] = '\0';
341   *device_name = chars.begin();
342   return port::Status::OK();
343 }
344 
// Translates DeviceOptions into HIP context-creation flags. No flags are
// derived yet (*flags is left untouched); the static_assert forces this
// function to be revisited whenever DeviceOptions gains a new option bit.
// Always returns true (success).
bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");
  return true;
}
351 
CreateContext(int device_ordinal,hipDevice_t device,const DeviceOptions & device_options,GpuContext ** context)352 /* static */ port::Status GpuDriver::CreateContext(
353     int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
354     GpuContext** context) {
355   // TODO(hanbinyoon): Create a real context, i.e., by calling hipCtxCreate().
356   *context = new GpuContext(device_ordinal);
357   return port::Status::OK();
358 }
DestroyContext(GpuContext * context)359 /* static */ void GpuDriver::DestroyContext(GpuContext* context) {
360   if (context == nullptr) {
361     return;
362   }
363   delete context;
364 }
365 
GetContextHandle(GpuContext * context)366 /* static */ hipCtx_t GpuDriver::GetContextHandle(GpuContext* context) {
367   // TODO(hanbinyoon): Return a real context.
368   return nullptr;
369 }
370 
FuncGetAttribute(hipDeviceAttribute_t attribute,hipFunction_t func,int * attribute_value)371 /* static */ port::Status GpuDriver::FuncGetAttribute(
372     hipDeviceAttribute_t attribute, hipFunction_t func, int* attribute_value) {
373   // TODO(ROCm) properly implement this feature in HIP
374   return port::Status::OK();
375 }
376 
FuncSetCacheConfig(hipFunction_t function,hipFuncCache_t cache_config)377 /* static */ port::Status GpuDriver::FuncSetCacheConfig(
378     hipFunction_t function, hipFuncCache_t cache_config) {
379   RETURN_IF_ROCM_ERROR(
380       tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config),
381       "Failed to set ROCM kernel cache config.");
382   return port::Status::OK();
383 }
384 
385 /* static */ port::StatusOr<hipSharedMemConfig>
ContextGetSharedMemConfig(GpuContext * context)386 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
387   hipSharedMemConfig shared_mem_config;
388   ScopedActivateContext activation{context};
389   RETURN_IF_ROCM_ERROR(
390       tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config),
391       "Failed to get shared memory config");
392   return shared_mem_config;
393 }
394 
ContextSetSharedMemConfig(GpuContext * context,hipSharedMemConfig shared_mem_config)395 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
396     GpuContext* context, hipSharedMemConfig shared_mem_config) {
397   ScopedActivateContext activation{context};
398   RETURN_IF_ROCM_ERROR(
399       tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config),
400       "Failed to set ROCM device shared memory config");
401   return port::Status::OK();
402 }
403 
LaunchKernel(GpuContext * context,absl::string_view kernel_name,hipFunction_t function,unsigned int grid_dim_x,unsigned int grid_dim_y,unsigned int grid_dim_z,unsigned int block_dim_x,unsigned int block_dim_y,unsigned int block_dim_z,unsigned int shared_mem_bytes,GpuStreamHandle stream,void ** kernel_params,void ** extra)404 /* static */ port::Status GpuDriver::LaunchKernel(
405     GpuContext* context, absl::string_view kernel_name, hipFunction_t function,
406     unsigned int grid_dim_x, unsigned int grid_dim_y, unsigned int grid_dim_z,
407     unsigned int block_dim_x, unsigned int block_dim_y,
408     unsigned int block_dim_z, unsigned int shared_mem_bytes,
409     GpuStreamHandle stream, void** kernel_params, void** extra) {
410   ScopedActivateContext activation{context};
411   VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
412           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
413           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
414           << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
415   RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipModuleLaunchKernel(
416                            function, grid_dim_x, grid_dim_y, grid_dim_z,
417                            block_dim_x, block_dim_y, block_dim_z,
418                            shared_mem_bytes, stream, kernel_params, extra),
419                        "Failed to launch ROCm kernel: ", kernel_name,
420                        " with block dimensions: ", block_dim_x, "x",
421                        block_dim_y, "x", block_dim_z);
422   VLOG(2) << "successfully launched kernel";
423   return port::Status::OK();
424 }
425 
LoadPtx(GpuContext * context,const char * ptx_contents,hipModule_t * module)426 /* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
427                                              const char* ptx_contents,
428                                              hipModule_t* module) {
429   LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
430   return port::InternalError("Not Implemented");
431 }
432 
LoadCubin(GpuContext * context,const char * cubin_bytes,hipModule_t * module)433 /* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
434                                                const char* cubin_bytes,
435                                                hipModule_t* module) {
436   return port::Status{port::error::INTERNAL,
437                       "Feature not supported on ROCm platform (LoadCubin)"};
438 }
439 
LoadHsaco(GpuContext * context,const char * hsaco_contents,hipModule_t * module)440 /* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
441                                                const char* hsaco_contents,
442                                                hipModule_t* module) {
443   absl::Notification notification;
444   port::Status ret = port::Status::OK();
445   GetDriverExecutor()->Schedule([context, hsaco_contents, module, &ret,
446                                  &notification]() {
447     ScopedActivateContext activation{context};
448     void* hsaco_data = const_cast<char*>(hsaco_contents);
449 
450     hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
451 
452     if (res != hipSuccess) {
453       ret = port::InternalError(
454           absl::StrCat("Failed to load HSACO: ", ToString(res)));
455       notification.Notify();
456     }
457 
458     CHECK(module != nullptr);
459     notification.Notify();
460   });
461   notification.WaitForNotification();
462 
463   return ret;
464 }
465 
SynchronousMemsetUint8(GpuContext * context,hipDeviceptr_t location,uint8 value,size_t size)466 /* static */ port::Status GpuDriver::SynchronousMemsetUint8(
467     GpuContext* context, hipDeviceptr_t location, uint8 value, size_t size) {
468   ScopedActivateContext activation{context};
469   RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipMemsetD8(location, value, size),
470                        "Failed to memset memory");
471   return port::Status::OK();
472 }
473 
SynchronousMemsetUint32(GpuContext * context,hipDeviceptr_t location,uint32 value,size_t uint32_count)474 /* static */ port::Status GpuDriver::SynchronousMemsetUint32(
475     GpuContext* context, hipDeviceptr_t location, uint32 value,
476     size_t uint32_count) {
477   ScopedActivateContext activation{context};
478   void* pointer = absl::bit_cast<void*>(location);
479   RETURN_IF_ROCM_ERROR(
480       tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count),
481       "Failed to memset memory");
482   return port::Status::OK();
483 }
484 
AsynchronousMemsetUint8(GpuContext * context,hipDeviceptr_t location,uint8 value,size_t uint32_count,GpuStreamHandle stream)485 /* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
486     GpuContext* context, hipDeviceptr_t location, uint8 value,
487     size_t uint32_count, GpuStreamHandle stream) {
488   ScopedActivateContext activation{context};
489   RETURN_IF_ROCM_ERROR(
490       tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream),
491       "Failed to enqueue async memset operation");
492   return port::Status::OK();
493 }
494 
AsynchronousMemsetUint32(GpuContext * context,hipDeviceptr_t location,uint32 value,size_t uint32_count,GpuStreamHandle stream)495 /* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
496     GpuContext* context, hipDeviceptr_t location, uint32 value,
497     size_t uint32_count, GpuStreamHandle stream) {
498   ScopedActivateContext activation{context};
499   void* pointer = absl::bit_cast<void*>(location);
500   RETURN_IF_ROCM_ERROR(
501       tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream),
502       "Failed to enqueue async memset operation");
503   VLOG(2) << "successfully enqueued async memset operation";
504   return port::Status::OK();
505 }
506 
AddStreamCallback(GpuContext * context,GpuStreamHandle stream,StreamCallback callback,void * data)507 /* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
508                                                GpuStreamHandle stream,
509                                                StreamCallback callback,
510                                                void* data) {
511   hipError_t res = tensorflow::wrap::hipStreamAddCallback(
512       stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
513   if (res != hipSuccess) {
514     LOG(ERROR) << "unable to add host callback: " << ToString(res);
515     return false;
516   }
517   return true;
518 }
519 
GetModuleFunction(GpuContext * context,hipModule_t module,const char * kernel_name,hipFunction_t * function)520 /* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
521                                                hipModule_t module,
522                                                const char* kernel_name,
523                                                hipFunction_t* function) {
524   ScopedActivateContext activated{context};
525   CHECK(module != nullptr && kernel_name != nullptr);
526   hipError_t res =
527       tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
528   if (res != hipSuccess) {
529     LOG(ERROR) << "failed to get kernel \"" << kernel_name
530                << "\" from module: " << ToString(res);
531     return false;
532   }
533 
534   return true;
535 }
536 
GetModuleSymbol(GpuContext * context,hipModule_t module,const char * symbol_name,hipDeviceptr_t * dptr,size_t * bytes)537 /* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
538                                              hipModule_t module,
539                                              const char* symbol_name,
540                                              hipDeviceptr_t* dptr,
541                                              size_t* bytes) {
542   ScopedActivateContext activated{context};
543   CHECK(module != nullptr && symbol_name != nullptr &&
544         (dptr != nullptr || bytes != nullptr));
545   hipError_t res =
546       tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
547   if (res != hipSuccess) {
548     // symbol may not be found in the current module, but it may reside in
549     // another module.
550     VLOG(2) << "failed to get symbol \"" << symbol_name
551             << "\" from module: " << ToString(res);
552     return false;
553   }
554 
555   return true;
556 }
557 
UnloadModule(GpuContext * context,hipModule_t module)558 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
559                                           hipModule_t module) {
560   ScopedActivateContext activated{context};
561   hipError_t res = tensorflow::wrap::hipModuleUnload(module);
562   if (res != hipSuccess) {
563     LOG(ERROR) << "failed to unload module " << module
564                << "; leaking: " << ToString(res);
565   }
566 }
567 
CreateStream(GpuContext * context,GpuStreamHandle * stream,int priority)568 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
569                                           GpuStreamHandle* stream,
570                                           int priority) {
571   ScopedActivateContext activated{context};
572   hipError_t res;
573   if (priority == 0) {
574     res = tensorflow::wrap::hipStreamCreateWithFlags(
575         stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
576   } else {
577     res = tensorflow::wrap::hipStreamCreateWithPriority(
578         stream, hipStreamDefault, priority);  // switch to hipStreamNonBlocking?
579   }
580   if (res != hipSuccess) {
581     LOG(ERROR) << "could not allocate ROCM stream for device "
582                << context->device_ordinal() << ": " << ToString(res);
583     return false;
584   }
585 
586   VLOG(2) << "successfully created stream " << *stream << " for device "
587           << context->device_ordinal() << " on thread";
588   return true;
589 }
590 
DestroyStream(GpuContext * context,GpuStreamHandle * stream)591 /* static */ void GpuDriver::DestroyStream(GpuContext* context,
592                                            GpuStreamHandle* stream) {
593   if (*stream == nullptr) {
594     return;
595   }
596 
597   ScopedActivateContext activated{context};
598   hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
599   if (res != hipSuccess) {
600     LOG(ERROR) << "failed to destroy ROCM stream for device "
601                << context->device_ordinal() << ": " << ToString(res);
602   } else {
603     VLOG(2) << "successfully destroyed stream " << *stream << " for device "
604             << context->device_ordinal();
605     *stream = nullptr;
606   }
607 }
608 
DeviceAllocate(GpuContext * context,uint64_t bytes)609 /* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
610                                              uint64_t bytes) {
611   ScopedActivateContext activated{context};
612   hipDeviceptr_t result = 0;
613   hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
614   if (res != hipSuccess) {
615     LOG(ERROR) << "failed to allocate "
616                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
617                << " bytes) from device: " << ToString(res);
618     return nullptr;
619   }
620   void* ptr = reinterpret_cast<void*>(result);
621   VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
622           << " of " << bytes << " bytes";
623   return ptr;
624 }
625 
DeviceDeallocate(GpuContext * context,void * location)626 /* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
627                                               void* location) {
628   ScopedActivateContext activation{context};
629   hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
630   hipError_t res = tensorflow::wrap::hipFree(pointer);
631   if (res != hipSuccess) {
632     LOG(ERROR) << "failed to free device memory at " << location
633                << "; result: " << ToString(res);
634   } else {
635     VLOG(2) << "deallocated " << location << " for device "
636             << context->device_ordinal();
637   }
638 }
639 
UnifiedMemoryAllocate(GpuContext * context,uint64_t bytes)640 /* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
641                                                     uint64_t bytes) {
642   ScopedActivateContext activated{context};
643 
644   LOG(ERROR)
645       << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
646   return nullptr;
647 }
648 
UnifiedMemoryDeallocate(GpuContext * context,void * location)649 /* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
650                                                      void* location) {
651   LOG(ERROR)
652       << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
653 }
654 
HostAllocate(GpuContext * context,uint64_t bytes)655 /* static */ void* GpuDriver::HostAllocate(GpuContext* context,
656                                            uint64_t bytes) {
657   ScopedActivateContext activation{context};
658   void* host_mem = nullptr;
659   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
660   hipError_t res =
661       tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
662   if (res != hipSuccess) {
663     LOG(ERROR) << "failed to alloc " << bytes
664                << " bytes on host: " << ToString(res);
665   }
666   return host_mem;
667 }
668 
HostDeallocate(GpuContext * context,void * location)669 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
670                                             void* location) {
671   ScopedActivateContext activation{context};
672   hipError_t res = tensorflow::wrap::hipHostFree(location);
673   if (res != hipSuccess) {
674     LOG(ERROR) << "error deallocating host memory at " << location << ": "
675                << ToString(res);
676   }
677 }
678 
HostRegister(GpuContext * context,void * location,uint64_t bytes)679 /* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
680                                           uint64_t bytes) {
681   ScopedActivateContext activation{context};
682   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
683   hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
684                                                      hipHostRegisterPortable);
685   if (res != hipSuccess) {
686     LOG(ERROR) << "error registering host memory at " << location << ": "
687                << ToString(res);
688     return false;
689   }
690   return true;
691 }
692 
HostUnregister(GpuContext * context,void * location)693 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
694                                             void* location) {
695   ScopedActivateContext activation{context};
696   hipError_t res = tensorflow::wrap::hipHostUnregister(location);
697   if (res != hipSuccess) {
698     LOG(ERROR) << "error unregistering host memory at " << location << ": "
699                << ToString(res);
700     return false;
701   }
702   return true;
703 }
704 
DestroyEvent(GpuContext * context,GpuEventHandle * event)705 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
706                                                   GpuEventHandle* event) {
707   if (*event == nullptr) {
708     return port::Status{port::error::INVALID_ARGUMENT,
709                         "input event cannot be null"};
710   }
711 
712   ScopedActivateContext activated{context};
713   hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
714   *event = nullptr;
715 
716   switch (res) {
717     case hipSuccess:
718       return port::Status::OK();
719     case hipErrorDeinitialized:
720     case hipErrorNotInitialized:
721       return port::Status{
722           port::error::FAILED_PRECONDITION,
723           absl::StrFormat("error destroying ROCM event in device %d: %s",
724                           context->device_ordinal(), ToString(res).c_str())};
725     default:
726       return port::Status{
727           port::error::INTERNAL,
728           absl::StrFormat("error destroying ROCM event in device %d: %s",
729                           context->device_ordinal(), ToString(res).c_str())};
730   }
731 }
732 
RecordEvent(GpuContext * context,GpuEventHandle event,GpuStreamHandle stream)733 /* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
734                                                  GpuEventHandle event,
735                                                  GpuStreamHandle stream) {
736   ScopedActivateContext activated{context};
737   hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
738   switch (res) {
739     case hipSuccess:
740       return port::Status::OK();
741     case hipErrorDeinitialized:
742     case hipErrorNotInitialized:
743       return port::Status{
744           port::error::FAILED_PRECONDITION,
745           absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
746                           ToString(res).c_str())};
747     default:
748       return port::Status{
749           port::error::INVALID_ARGUMENT,
750           absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
751                           ToString(res).c_str())};
752   }
753 }
754 
QueryEvent(GpuContext * context,GpuEventHandle event)755 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
756     GpuContext* context, GpuEventHandle event) {
757   ScopedActivateContext activated{context};
758   hipError_t res = tensorflow::wrap::hipEventQuery(event);
759   if (res != hipSuccess && res != hipErrorNotReady) {
760     return port::Status{
761         port::error::INTERNAL,
762         absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
763   }
764 
765   return res;
766 }
767 
GetEventElapsedTime(GpuContext * context,float * elapsed_milliseconds,GpuEventHandle start,GpuEventHandle stop)768 /* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
769                                                  float* elapsed_milliseconds,
770                                                  GpuEventHandle start,
771                                                  GpuEventHandle stop) {
772   ScopedActivateContext activated{context};
773   // The stop event must have completed in order for hipEventElapsedTime to
774   // work.
775   hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
776   if (res != hipSuccess) {
777     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
778     return false;
779   }
780   res =
781       tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
782   if (res != hipSuccess) {
783     LOG(ERROR) << "failed to get elapsed time between events: "
784                << ToString(res);
785     return false;
786   }
787 
788   return true;
789 }
790 
WaitStreamOnEvent(GpuContext * context,GpuStreamHandle stream,GpuEventHandle event)791 /* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
792                                                GpuStreamHandle stream,
793                                                GpuEventHandle event) {
794   ScopedActivateContext activation{context};
795   hipError_t res =
796       tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
797   if (res != hipSuccess) {
798     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
799     return false;
800   }
801 
802   return true;
803 }
804 
SynchronizeContext(GpuContext * context)805 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
806   ScopedActivateContext activation{context};
807   hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
808   if (res != hipSuccess) {
809     LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
810                << " :: " << port::CurrentStackTrace();
811     return false;
812   }
813 
814   return true;
815 }
816 
// Blocks the calling host thread until every operation previously enqueued
// on |stream| has completed. Returns an error status if the underlying
// hipStreamSynchronize call fails (RETURN_IF_ROCM_ERROR returns early with
// an error status in that case).
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
                       "Could not synchronize on ROCM stream");
  VLOG(2) << "successfully synchronized stream " << stream << " on device "
          << context->device_ordinal();
  return port::Status::OK();
}
827 
IsStreamIdle(GpuContext * context,GpuStreamHandle stream)828 /* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
829                                           GpuStreamHandle stream) {
830   ScopedActivateContext activated{context};
831   CHECK(stream != nullptr);
832   hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
833   if (res == hipSuccess) {
834     return true;
835   }
836 
837   if (res != hipErrorNotReady) {
838     LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
839   }
840   return false;
841 }
842 
// Synchronously copies |size| bytes from device memory |gpu_src| into host
// memory |host_dst|. Returns OK on success; on failure RETURN_IF_ROCM_ERROR
// returns early with an error status carrying the formatted context string.
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src,
    uint64_t size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host: "
                      "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}
856 
// Synchronously copies |size| bytes from host memory |host_src| into device
// memory |gpu_dst|. Returns OK on success; on failure RETURN_IF_ROCM_ERROR
// returns early with an error status carrying the formatted context string.
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
    uint64_t size) {
  ScopedActivateContext activation{context};
  // const_cast is required because the HIP API takes a non-const source
  // pointer; the source is not modified.
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src),
                                      size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: Gpu dst: %p;"
          " host src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}
871 
SynchronousMemcpyD2D(GpuContext * context,hipDeviceptr_t gpu_dst,hipDeviceptr_t gpu_src,uint64_t size)872 /* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
873     GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
874     uint64_t size) {
875   ScopedActivateContext activation{context};
876   RETURN_IF_ROCM_ERROR(
877       tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size),
878       absl::StrFormat(
879           "failed to synchronous memcpy from host to device:Gpu dst: %p; "
880           "Gpu src: %p; size: %llu=0x%llx",
881           absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
882           size));
883   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
884   return port::Status::OK();
885 }
886 
AsynchronousMemcpyD2H(GpuContext * context,void * host_dst,hipDeviceptr_t gpu_src,uint64_t size,GpuStreamHandle stream)887 /* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
888                                                    void* host_dst,
889                                                    hipDeviceptr_t gpu_src,
890                                                    uint64_t size,
891                                                    GpuStreamHandle stream) {
892   ScopedActivateContext activation{context};
893   hipError_t res =
894       tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
895   if (res != hipSuccess) {
896     LOG(ERROR) << absl::StrFormat(
897         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
898         "Gpu src: %p; size: %llu=0x%llx",
899         ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
900         size);
901     return false;
902   }
903   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
904           << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
905           << host_dst << " on stream " << stream;
906   return true;
907 }
908 
AsynchronousMemcpyH2D(GpuContext * context,hipDeviceptr_t gpu_dst,const void * host_src,uint64_t size,GpuStreamHandle stream)909 /* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
910                                                    hipDeviceptr_t gpu_dst,
911                                                    const void* host_src,
912                                                    uint64_t size,
913                                                    GpuStreamHandle stream) {
914   ScopedActivateContext activation{context};
915   hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
916       gpu_dst, const_cast<void*>(host_src), size, stream);
917   if (res != hipSuccess) {
918     LOG(ERROR) << absl::StrFormat(
919         "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
920         "host src: %p; size: %llu=0x%llx",
921         ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
922         size);
923     return false;
924   }
925   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
926           << " on stream " << stream;
927   return true;
928 }
929 
// Enqueues an asynchronous device-to-device copy of |size| bytes on |stream|.
// Returns true if the copy was successfully enqueued (not necessarily
// completed).
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   hipDeviceptr_t gpu_dst,
                                                   hipDeviceptr_t gpu_src,
                                                   uint64_t size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t result =
      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  if (result != hipSuccess) {
    // On failure, gather rich diagnostics about both pointers (memory space,
    // owning device, and peer-access capability) to make cross-device copy
    // failures debuggable.
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; Gpu dst: %p on %s %s"
        "; Gpu src: %p on %s %s"
        "; can access? %s; size: %llu=0x%llx",
        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
        ROCMPointerToDeviceString(gpu_dst).c_str(),
        absl::bit_cast<void*>(gpu_src),
        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
        ROCMPointerToDeviceString(gpu_src).c_str(),
        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}
957 
InitEvent(GpuContext * context,GpuEventHandle * event,EventFlags flags)958 /* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
959                                                GpuEventHandle* event,
960                                                EventFlags flags) {
961   int hipflags;
962   switch (flags) {
963     case EventFlags::kDefault:
964       hipflags = hipEventDefault;
965       break;
966     case EventFlags::kDisableTiming:
967       hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
968       break;
969     default:
970       LOG(FATAL) << "impossible event flags: " << int(hipflags);
971   }
972 
973   ScopedActivateContext activated{context};
974   hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
975 
976   if (res == hipSuccess) {
977     return port::Status::OK();
978   } else if (res == hipErrorMemoryAllocation) {
979     return port::Status{port::error::RESOURCE_EXHAUSTED,
980                         "could not create ROCM event: out of device memory"};
981   } else {
982     return port::Status{
983         port::error::FAILED_PRECONDITION,
984         absl::StrCat("could not create ROCM event: ", ToString(res))};
985   }
986 }
987 
GetDeviceCount()988 /* static */ int GpuDriver::GetDeviceCount() {
989   int device_count = 0;
990   hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
991   if (res != hipSuccess) {
992     LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
993     return 0;
994   }
995 
996   if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
997     device_count = 1;
998   }
999   return device_count;
1000 }
1001 
GetComputeCapability(int * cc_major,int * cc_minor,hipDevice_t device)1002 /* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
1003                                                           int* cc_minor,
1004                                                           hipDevice_t device) {
1005   return port::Status(
1006       port::error::INTERNAL,
1007       absl::StrFormat("failed to get compute capability for device: %d "
1008                       "(unsupported API on AMD Gpus)",
1009                       device));
1010 }
1011 
GetPointerAddressRange(hipDeviceptr_t dptr,hipDeviceptr_t * base,size_t * size)1012 /* static */ port::Status GpuDriver::GetPointerAddressRange(
1013     hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
1014   hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
1015   if (result == hipSuccess) {
1016     return port::Status::OK();
1017   } else if (result == hipErrorNotFound) {
1018     // We differentiate between "this pointer is unknown" (return here) and
1019     // "there was an internal error while performing this operation" (return
1020     // below).
1021     return port::Status{port::error::NOT_FOUND,
1022                         absl::StrFormat("not a device pointer %p; %s",
1023                                         reinterpret_cast<void*>(dptr),
1024                                         ToString(result).c_str())};
1025   }
1026 
1027   return port::Status{
1028       port::error::INTERNAL,
1029       absl::StrFormat("failed to get pointer into for device pointer %p; %s",
1030                       reinterpret_cast<void*>(dptr), ToString(result).c_str())};
1031 }
1032 
GetPointerMemorySpace(hipDeviceptr_t pointer)1033 /* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
1034     hipDeviceptr_t pointer) {
1035   unsigned int value;
1036   hipError_t result = hipSuccess;
1037   if (result == hipSuccess) {
1038     switch (value) {
1039       case hipMemoryTypeDevice:
1040         return MemorySpace::kDevice;
1041       case hipMemoryTypeHost:
1042         return MemorySpace::kHost;
1043       default:
1044         return port::Status{
1045             port::error::INTERNAL,
1046             absl::StrCat("unknown memory space provided by ROCM API: ", value)};
1047     }
1048   }
1049 
1050   return port::Status{
1051       port::error::INTERNAL,
1052       absl::StrCat("failed to query device pointer for memory space: ",
1053                    ToString(result))};
1054 }
1055 
GetPointerDevice(hipDeviceptr_t pointer)1056 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
1057     hipDeviceptr_t pointer) {
1058   hipPointerAttribute_t pointerAttributes;
1059   hipError_t result =
1060       tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
1061   if (result != hipSuccess) {
1062     return port::Status{
1063         port::error::INTERNAL,
1064         absl::StrCat("failed to get device for pointer: ", ToString(result))};
1065   }
1066 
1067   hipDevice_t device;
1068   result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
1069   if (result != hipSuccess) {
1070     return port::Status{
1071         port::error::INTERNAL,
1072         absl::StrCat("failed to get device for pointer: ", ToString(result))};
1073   }
1074 
1075   return device;
1076 }
1077 
GetGpuISAVersion(int * version,hipDevice_t device)1078 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
1079                                                       hipDevice_t device) {
1080   hipDeviceProp_t props;
1081   hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1082   if (result == hipSuccess) {
1083     *version = props.gcnArch;
1084     return port::Status::OK();
1085   }
1086   *version = 0;
1087   return port::Status{
1088       port::error::INTERNAL,
1089       absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
1090                       device)};
1091 }
1092 
GetGpuGCNArchName(hipDevice_t device,std::string * gcnArchName)1093 /* static */ port::Status GpuDriver::GetGpuGCNArchName(
1094     hipDevice_t device, std::string* gcnArchName) {
1095   hipDeviceProp_t props;
1096   hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
1097   if (result == hipSuccess) {
1098     *gcnArchName = props.gcnArchName;
1099     return port::Status::OK();
1100   }
1101   *gcnArchName = "";
1102   return port::Status{
1103       port::error::INTERNAL,
1104       absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
1105                       device)};
1106 }
1107 
GetMFMASupport()1108 /* static */ port::StatusOr<bool> GpuDriver::GetMFMASupport() {
1109   hipDeviceProp_t props;
1110   int dev = 0;
1111   hipError_t result = hipGetDevice(&dev);
1112   result = tensorflow::wrap::hipGetDeviceProperties(&props, dev);
1113   if (result == hipSuccess) {
1114     std::string gcnArchName = props.gcnArchName;
1115     VLOG(1) << "GCN arch name " << gcnArchName;
1116     auto pos = gcnArchName.find(":");
1117     if (pos != string::npos) gcnArchName = gcnArchName.substr(0, pos);
1118     pos = gcnArchName.find("gfx");
1119     if (pos != string::npos) gcnArchName = gcnArchName.substr(pos + 3);
1120     VLOG(1) << "GCN arch name (stripped) " << gcnArchName;
1121     return ((gcnArchName == "908") || (gcnArchName == "909"));
1122   }
1123   return port::Status{
1124       port::error::INTERNAL,
1125       absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
1126                       dev)};
1127 }
1128 
1129 // Helper function that turns the integer output of hipDeviceGetAttribute to
1130 // type T and wraps it in a StatusOr.
1131 template <typename T>
GetSimpleAttribute(hipDevice_t device,hipDeviceAttribute_t attribute)1132 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
1133                                             hipDeviceAttribute_t attribute) {
1134   int value = -1;
1135   hipError_t result =
1136       tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
1137   if (result != hipSuccess) {
1138     return port::Status{
1139         port::error::NOT_FOUND,
1140         absl::StrCat("could not retrieve ROCM device attribute (", attribute,
1141                      "): ", ToString(result))};
1142   }
1143   T converted = value;
1144   return converted;
1145 }
1146 
GetMultiprocessorCount(hipDevice_t device)1147 /* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
1148     hipDevice_t device) {
1149   return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
1150 }
1151 
GetMaxSharedMemoryPerCore(hipDevice_t device)1152 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerCore(
1153     hipDevice_t device) {
1154   return GetSimpleAttribute<int64_t>(
1155       device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
1156 }
1157 
GetMaxSharedMemoryPerBlock(hipDevice_t device)1158 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxSharedMemoryPerBlock(
1159     hipDevice_t device) {
1160   return GetSimpleAttribute<int64_t>(device,
1161                                      hipDeviceAttributeMaxSharedMemoryPerBlock);
1162 }
1163 
GetMaxThreadsPerMultiprocessor(hipDevice_t device)1164 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerMultiprocessor(
1165     hipDevice_t device) {
1166   return GetSimpleAttribute<int64_t>(
1167       device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
1168 }
1169 
GetMaxThreadsPerBlock(hipDevice_t device)1170 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxThreadsPerBlock(
1171     hipDevice_t device) {
1172   return GetSimpleAttribute<int64_t>(device,
1173                                      hipDeviceAttributeMaxThreadsPerBlock);
1174 }
1175 
GetMaxRegistersPerBlock(hipDevice_t device)1176 /* static */ port::StatusOr<int64_t> GpuDriver::GetMaxRegistersPerBlock(
1177     hipDevice_t device) {
1178   return GetSimpleAttribute<int64_t>(device,
1179                                      hipDeviceAttributeMaxRegistersPerBlock);
1180 }
1181 
GetThreadsPerWarp(hipDevice_t device)1182 /* static */ port::StatusOr<int64_t> GpuDriver::GetThreadsPerWarp(
1183     hipDevice_t device) {
1184   return GetSimpleAttribute<int64_t>(device, hipDeviceAttributeWarpSize);
1185 }
1186 
GetGridLimits(int * x,int * y,int * z,hipDevice_t device)1187 /* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
1188                                            hipDevice_t device) {
1189   int value;
1190   hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
1191       &value, hipDeviceAttributeMaxGridDimX, device);
1192   if (res != hipSuccess) {
1193     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
1194     return false;
1195   }
1196   *x = value;
1197 
1198   res = tensorflow::wrap::hipDeviceGetAttribute(
1199       &value, hipDeviceAttributeMaxGridDimY, device);
1200   if (res != hipSuccess) {
1201     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
1202     return false;
1203   }
1204   *y = value;
1205 
1206   res = tensorflow::wrap::hipDeviceGetAttribute(
1207       &value, hipDeviceAttributeMaxGridDimZ, device);
1208   if (res != hipSuccess) {
1209     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
1210     return false;
1211   }
1212   *z = value;
1213   return true;
1214 }
1215 
GetDriverVersion(int * driver_version)1216 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
1217   hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
1218   if (res != hipSuccess) {
1219     LOG(ERROR) << "failed to query driver version: " << ToString(res);
1220     return false;
1221   }
1222 
1223   return true;
1224 }
1225 
GetDeviceProperties(hipDeviceProp_t * device_properties,int device_ordinal)1226 /* static */ bool GpuDriver::GetDeviceProperties(
1227     hipDeviceProp_t* device_properties, int device_ordinal) {
1228   hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
1229                                                             device_ordinal);
1230   if (res != hipSuccess) {
1231     LOG(ERROR) << "failed to query device properties: " << ToString(res);
1232     return false;
1233   }
1234 
1235   return true;
1236 }
1237 
// Generic accessor: queries an arbitrary integer-valued |attribute| of
// |device| via the GetSimpleAttribute helper above.
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    hipDeviceAttribute_t attribute, hipDevice_t device) {
  return GetSimpleAttribute<int>(device, attribute);
}
1242 
// Reports whether ECC is enabled on |device|.
// NOTE(review): ECC querying is not implemented in HIP yet (see TODO below).
// |res| is hard-wired to hipSuccess, so the error branch is dead and
// |*result| is assigned from the sentinel -1, i.e. this currently always
// reports true regardless of the actual ECC state -- confirm intent before
// relying on it.
/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
  int value = -1;
  hipError_t res = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}
1255 
GetDeviceMemoryInfo(GpuContext * context,int64_t * free_out,int64_t * total_out)1256 /* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
1257                                                  int64_t* free_out,
1258                                                  int64_t* total_out) {
1259   ScopedActivateContext activation{context};
1260   size_t free = 0;
1261   size_t total = 0;
1262   hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
1263   if (res != hipSuccess) {
1264     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
1265     return false;
1266   }
1267 
1268   *free_out = free;
1269   *total_out = total;
1270   return true;
1271 }
1272 
GetDeviceTotalMemory(hipDevice_t device,uint64_t * result)1273 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
1274                                                   uint64_t* result) {
1275   size_t value = -1;
1276   hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
1277   if (res != hipSuccess) {
1278     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
1279     return false;
1280   }
1281 
1282   *result = value;
1283   return true;
1284 }
1285 
GetPCIBusID(hipDevice_t device)1286 /* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
1287   string pci_bus_id;
1288   static const int kBufferSize = 64;
1289   absl::InlinedVector<char, 4> chars(kBufferSize);
1290   chars[kBufferSize - 1] = '\0';
1291   hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
1292       chars.begin(), kBufferSize - 1, device);
1293   if (res != hipSuccess) {
1294     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
1295     return pci_bus_id;
1296   }
1297   pci_bus_id = chars.begin();
1298   return pci_bus_id;
1299 }
1300 
CanEnablePeerAccess(GpuContext * from,GpuContext * to)1301 /* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
1302                                                  GpuContext* to) {
1303   if (from->device_ordinal() == to->device_ordinal()) {
1304     return true;  // A device can always access its own memory.
1305   }
1306 
1307   int can_access_peer = -1;
1308   hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
1309       &can_access_peer, from->device_ordinal(), to->device_ordinal());
1310   if (res != hipSuccess) {
1311     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
1312     return false;
1313   }
1314 
1315   return can_access_peer;
1316 }
1317 
EnablePeerAccess(GpuContext * from,GpuContext * to)1318 /* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
1319                                                       GpuContext* to) {
1320   if (from->device_ordinal() == to->device_ordinal()) {
1321     return port::Status::OK();  // A device can always access its own memory.
1322   }
1323 
1324   ScopedActivateContext activated{from};
1325   hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
1326       to->device_ordinal(), 0 /* = flags */);
1327   if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
1328     return port::Status{
1329         port::error::INTERNAL,
1330         absl::StrFormat("failed to enable peer access from %d to %d: %s",
1331                         from->device_ordinal(), to->device_ordinal(),
1332                         ToString(result).c_str())};
1333   }
1334 
1335   return port::Status::OK();
1336 }
1337 
// Computes the maximum number of blocks of |kernel| that can be resident on
// one multiprocessor, given |threads_per_block| and the dynamic shared
// memory requirement.
// NOTE(review): occupancy calculation is not implemented for ROCm yet (see
// TODO below); |result| is hard-wired to hipSuccess, so the error branch is
// dead and this currently always returns 0 without querying the device.
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, hipFunction_t kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation{context};

  int max_blocks = 0;
  hipError_t result = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
                        kernel, ToString(result).c_str())};
  }

  return max_blocks;
}
1355 
1356 }  // namespace gpu
1357 }  // namespace stream_executor
1358