xref: /aosp_15_r20/external/tensorflow/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <limits.h>  // for PATH_MAX
#include <unistd.h>

#include "absl/base/casts.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"  // for absl::StrSplit
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
#error \
    "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
#endif

#ifdef __ROCM_RUNTIME_H__
#error \
    "ROCM runtime being included into ROCM GPU executor; should be driver only."
#endif

namespace stream_executor {
namespace gpu {

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal ROCM
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a librocm device pointer datatype, suitable
// for passing directly to librocm APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// librocm APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to librocm functions which will honor constness.
static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
}

// See description on const version above.
static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsROCmDevicePtr(*gpu_mem);
}

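// Returns the GpuContext that backs the GpuExecutor which owns "stream".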
static GpuContext* GetGpuContext(Stream* stream) {
  return static_cast<GpuExecutor*>(stream->parent()->implementation())
      ->gpu_context();
}

GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
  CHECK(rocm_exec != nullptr);
  return rocm_exec->gpu_context();
}

GpuExecutor::~GpuExecutor() {
  for (auto& it : disk_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  for (auto& it : in_memory_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  absl::MutexLock lock{&in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

port::StatusOr<std::shared_ptr<DeviceMemoryBase>>
GpuExecutor::CreateOrShareConstant(Stream* stream,
                                   const std::vector<uint8_t>& content) {
  return port::UnimplementedError("Not implemented for ROCm");
}

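// Drops one reference on the module backing "gpu_binary". When the refcount
// reaches zero, the module is unloaded from the device and removed from the
// in-memory module bookkeeping. Returns false if the binary was never loaded.
// Callers are expected to hold in_memory_modules_mu_.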
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded HSACO module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading HSACO module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
    const char* mem_it = nullptr;
    for (auto x : in_memory_modules_) {
      if (x.second == module) mem_it = x.first;
    }
    if (mem_it != nullptr) in_memory_modules_.erase(mem_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetGpuISAVersion(&version_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string* found_filename) const {
  LOG(FATAL) << "Feature not supported on ROCM platform "
                "(FindOnDiskForComputeCapability)";
  return false;
}

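// Looks on disk for a file named "<filename>.cc<version><canonical_suffix>"
// that matches this executor's AMDGPU ISA version, falling back to the plain
// "filename" if only that exists. On success, sets *found_filename and
// returns true.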
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  if (version_ == 0) {
    return false;
  }

  string cc_specific =
      absl::StrCat(filename, ".cc", version_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
          << cc_specific;
  if (port::FileExists(string(filename)).ok()) {
    *found_filename = string(filename);
    return true;
  }

  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
  PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    std::vector<string> components = absl::StrSplit(exe_path, '/');
    components.pop_back();
    return absl::StrJoin(components, "/");
  }
  return exe_path;
}

port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
  hipModule_t module = nullptr;
  const string* kernelname;

  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
  bool has_cubin = spec.has_cuda_cubin_on_disk();
  if (has_cubin) {
    on_disk_spec = &spec.cuda_cubin_on_disk();
  }

  if (on_disk_spec != nullptr) {
    return port::InternalError(
        "Loading ROCM kernel from disk is not supported");
  } else if (spec.has_cuda_cubin_in_memory()) {
    kernelname = &spec.cuda_cubin_in_memory().kernelname();

    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
    absl::MutexLock lock{&in_memory_modules_mu_};
    module = in_memory_modules_[hsaco];

    if (module == nullptr) {
      TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, &module));
    }
    kernel_to_gpu_binary_[kernel] = hsaco;
  } else {
    return port::InternalError("No method of loading ROCM kernel provided");
  }

  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    rocm_kernel->gpu_function_ptr())) {
    return port::InternalError("Failed getting module function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the ROCM API.
  rocm_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}

port::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value = 0;
  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_registers_per_thread(value);

  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}

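// Launches "kernel" on "stream": resolves the hipFunction_t behind the
// kernel, optionally logs occupancy information (only at VLOG level 2+),
// applies any preferred cache configuration, packs the argument pointers into
// a HIP_LAUNCH_PARAM buffer, and hands the launch off to
// GpuDriver::LaunchKernel.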
port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once.  Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(hipfunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(hipfunc);
    }
  }

  if (rocm_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        hipfunc, rocm_kernel->GetGpuCacheConfig()));
  }

  // Prepare the kernel arguments. KernelArgsArrayBase stores pointers to the
  // arguments, so dereference them here before packing them into the launch
  // config below.
  std::vector<void*> kernargs;
  KernelArgIterator iter = args.arg_iterator();
  while (iter.has_next()) {
    KernelArg arg = iter.next();
    VLOG(2) << "*(arg.address): "
            << reinterpret_cast<void*>(
                   *static_cast<const uint64_t*>(arg.address));
    kernargs.push_back(
        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
  }

  size_t size = sizeof(void*) * kernargs.size();
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

  return GpuDriver::LaunchKernel(
      GetGpuContext(stream), kernel.name(), hipfunc, block_dims.x, block_dims.y,
      block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
      args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config);
}

int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64_t registers_per_thread,
                                    uint64_t shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
  return 0;
}

int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64_t registers_per_thread,
                                  uint64_t shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
  return 0;
}

port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                                     ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the HSACO binary as
  // ModuleHandle::id().
  hipModule_t hip_module = nullptr;
  // TODO(ROCm): Need a generic term instead of cubin/cuda/ptx
  if (spec.has_cuda_cubin_in_memory()) {
    absl::MutexLock lock{&in_memory_modules_mu_};
    TF_RETURN_IF_ERROR(LoadModuleFromHsaco(
        reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
        &hip_module));
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return port::Status::OK();
  } else {
    return port::InternalError("No HSACO binary found");
  }
}

port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
}

port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx,
                                            hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
}

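// Loads the HSACO blob pointed to by "hsaco" as a hipModule_t, re-using the
// existing module if the same blob was loaded before; a per-binary refcount
// is kept in gpu_binary_to_module_. Expects the caller to hold
// in_memory_modules_mu_ (see LoadModule above).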
port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              hipModule_t* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, module));
    module_refcount = 1;
    in_memory_modules_[hsaco] = *module;
    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
  return port::Status::OK();
}

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  // TODO(ROCm) implement this feature in HIP
}

DeviceMemoryBase GpuExecutor::Allocate(uint64_t size, int64_t memory_space) {
  CHECK_EQ(memory_space, 0);
  return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
}

void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64_t offset_bytes,
                                uint64_t size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64_t size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
                                             uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsROCmDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                           0x0, size);
}

port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
                                            int value, uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // hipMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsROCmDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                           value, size);
}

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src,
                                            uint64_t size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64_t size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsROCmDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64_t size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
                                         AsROCmDevicePtr(gpu_src), size);
}

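// Enqueues an asynchronous zero-fill of "size" bytes at "location" on
// "stream", using 32-bit stores when both the address and the size are
// 4-byte aligned and byte-wide stores otherwise.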
port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                                  uint64_t size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                                 uint8 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsROCmDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64_t size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsROCmDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64_t size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64_t size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
                                          AsROCmDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

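// Wraps "callback" in a heap-allocated std::function that logs a warning if
// the callback returns a non-OK status, then enqueues it on "stream" via
// InternalHostCallback, which runs and deletes the wrapper.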
bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
                                                    hipError_t status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("error waiting for ROCM event on stream %p", stream)};
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  absl::MutexLock l(&alive_gpu_streams_mu_);
  bool out = AsGpuStream(stream)->Init();
  alive_gpu_streams_[stream->implementation()->GpuStreamHack()] = stream;
  return out;
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* rocm_stream = AsGpuStream(stream);
  absl::MutexLock l(&alive_gpu_streams_mu_);
  alive_gpu_streams_.erase(rocm_stream->GpuStreamHack());
  if (!rocm_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  rocm_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
}

bool GpuExecutor::DeviceMemoryUsage(int64_t* free, int64_t* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

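// Resolves "symbol_name" to a device address and size: first in the module
// identified by "module_handle" (when one is provided), then in every other
// loaded module. Returns false if no loaded module defines the symbol.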
bool GpuExecutor::GetSymbol(const string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  absl::MutexLock lock{&in_memory_modules_mu_};
  if (static_cast<bool>(module_handle)) {
    auto it = gpu_binary_to_module_.find(module_handle.id());
    CHECK(it != gpu_binary_to_module_.end());
    if (GpuDriver::GetModuleSymbol(
            context_, it->second.first, symbol_name.c_str(),
            reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
      return true;
    }
  }

  for (auto& it : gpu_binary_to_module_) {
    if (GpuDriver::GetModuleSymbol(
            context_, it.second.first, symbol_name.c_str(),
            reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
      return true;
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
  VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
  static const int kUnknownNumaNode = -1;

  if (pci_bus_id.empty()) {
    LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
    return kUnknownNumaNode;
  }

  std::string filename =
      absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);

  // We have to use fopen/fread here so that the device properties can be
  // populated before the InitGoogle procedure has completed (at which point we
  // could use the file::* utilities).
  FILE* file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
    LOG(INFO) << "could not open file to read NUMA node: " << filename
              << "\nYour kernel may have been built without NUMA support.";
    return kUnknownNumaNode;
  }

  std::string content;
  char buf[32];
  size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
  buf[did_read] = '\0';
  content = buf;

  int32_t value;
  if (port::safe_strto32(content, &value)) {
    if (value < 0) {  // See http://b/18228951 for details on this path.
      LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
                << value
                << "), but there must be at least one NUMA node"
                   ", so returning NUMA node zero";
      fclose(file);
      return 0;
    }
    fclose(file);
    return value;
  }

  LOG(WARNING)
      << "could not convert SysFS file contents to integral NUMA node value: "
      << content;

  fclose(file);
  return kUnknownNumaNode;
}

port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  int version;
  status = GpuDriver::GetGpuISAVersion(&version, device);
  if (!status.ok()) {
    return status;
  }

  std::string gcn_arch_name;
  status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lowercase the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);

    // mem_bandwidth = 2 * mem_bus_width_in_bytes * mem_clock_rate_in_hz
    int64_t memory_bandwidth = 2 * (int64_t(prop.memoryBusWidth) / 8) *
                               (int64_t(prop.memoryClockRate) * 1000);
    builder.set_memory_bandwidth(memory_bandwidth);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64_t device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: ", gcn_arch_name));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_compute_capability(gcn_arch_name);

  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  builder.set_registers_per_core_limit(64 * 1024);

  return builder.Build();
}

}  // namespace gpu

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});