1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <unistd.h>
17
18 #include "absl/base/casts.h"
19 #include "absl/strings/ascii.h"
20 #include "absl/strings/str_cat.h"
21 #include "absl/strings/str_format.h"
22 #include "absl/strings/str_join.h"
23 #include "tensorflow/stream_executor/gpu/gpu_driver.h"
24 #include "tensorflow/stream_executor/gpu/gpu_event.h"
25 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
26 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
27 #include "tensorflow/stream_executor/gpu/gpu_timer.h"
28 #include "tensorflow/stream_executor/kernel_cache_config.h"
29 #include "tensorflow/stream_executor/lib/env.h"
30 #include "tensorflow/stream_executor/lib/error.h"
31 #include "tensorflow/stream_executor/lib/initialize.h"
32 #include "tensorflow/stream_executor/lib/mathutil.h"
33 #include "tensorflow/stream_executor/lib/numbers.h"
34 #include "tensorflow/stream_executor/lib/path.h"
35 #include "tensorflow/stream_executor/lib/process_state.h"
36 #include "tensorflow/stream_executor/lib/statusor.h"
37 #include "tensorflow/stream_executor/platform.h"
38 #include "tensorflow/stream_executor/platform/dso_loader.h"
39 #include "tensorflow/stream_executor/platform/logging.h"
40 #include "tensorflow/stream_executor/platform/port.h"
41 #include "tensorflow/stream_executor/plugin_registry.h"
42 #include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
43 #include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
44 #include "tensorflow/stream_executor/stream.h"
45 #include "tensorflow/stream_executor/stream_executor_internal.h"
46 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
47 #include "tensorflow/stream_executor/timer.h"
48
49 #ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
50 #error \
51 "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
52 #endif
53
54 #ifdef __ROCM_RUNTIME_H__
55 #error \
56 "ROCM runtime being included into ROCM GPU executor; should be driver only."
57 #endif
58
59 namespace stream_executor {
60 namespace gpu {
61
AsGpuEvent(Event * event)62 static GpuEvent* AsGpuEvent(Event* event) {
63 DCHECK(event != nullptr);
64 return static_cast<GpuEvent*>(event->implementation());
65 }
66
67 // Given a platform-independent timer datatype, returns the internal ROCM
68 // platform implementation pointer.
AsGpuTimer(Timer * timer)69 static GpuTimer* AsGpuTimer(Timer* timer) {
70 DCHECK(timer != nullptr);
71 return static_cast<GpuTimer*>(timer->implementation());
72 }
73
74 // Given const GPU memory, returns a librocm device pointer datatype, suitable
75 // for passing directly to librocm APIs.
76 //
77 // N.B. we must lose constness in order to pass a suitable type to the existing
78 // librocm APIs, so the caller should take care to only pass the result of const
79 // GPU memory conversions to librocm functions which will honor constness.
// Strips constness from the opaque device pointer so it can be handed to
// librocm APIs; see the caveat in the comment above — callers must only use
// the result with functions that honor constness.
static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
}
83
84 // See description on const version above.
// Mutable-pointer overload; delegates to the const-reference version above.
static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsROCmDevicePtr(*gpu_mem);
}
88
GetGpuContext(Stream * stream)89 static GpuContext* GetGpuContext(Stream* stream) {
90 return static_cast<GpuExecutor*>(stream->parent()->implementation())
91 ->gpu_context();
92 }
93
ExtractGpuContext(GpuExecutor * rocm_exec)94 GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
95 CHECK(rocm_exec != nullptr);
96 return rocm_exec->gpu_context();
97 }
98
// Tears down the executor: modules are unloaded before the context that owns
// them is destroyed. The trailing CHECKs enforce that all kernels/modules were
// released by their owners before destruction.
GpuExecutor::~GpuExecutor() {
  for (auto& it : disk_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  for (auto& it : in_memory_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}
UnloadModule(ModuleHandle module_handle)112 bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
113 const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
114 absl::MutexLock lock{&in_memory_modules_mu_};
115 return UnloadGpuBinary(gpu_binary);
116 }
117
// Constant-sharing is not implemented for the ROCm backend; always returns
// UNIMPLEMENTED.
port::StatusOr<std::shared_ptr<DeviceMemoryBase>>
GpuExecutor::CreateOrShareConstant(Stream* stream,
                                   const std::vector<uint8_t>& content) {
  return port::UnimplementedError("Not implemented for ROCm");
}
123
// Decrements the refcount on the module keyed by `gpu_binary` and unloads it
// when the count reaches zero, also dropping the in_memory_modules_ entry that
// maps the HSACO bytes to that module. Returns false if no module is loaded
// for `gpu_binary`.
// NOTE(review): callers (UnloadModule, UnloadKernel) hold
// in_memory_modules_mu_ around this call; this function assumes the lock is
// already held.
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded HSACO module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading HSACO module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
    // Find and erase the reverse mapping (HSACO bytes -> module), if present.
    const char* mem_it = nullptr;
    for (auto x : in_memory_modules_) {
      if (x.second == module) mem_it = x.first;
    }
    if (mem_it != nullptr) in_memory_modules_.erase(mem_it);
  }
  return true;
}
145
// Releases the GPU binary backing `kernel` (if any was recorded by GetKernel)
// and removes the kernel's bookkeeping entry. A kernel that was never loaded
// is a no-op.
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  absl::MutexLock lock{&in_memory_modules_mu_};
  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
            << " has never been loaded.";
    return;  // We've never seen this kernel.
  }
  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
          << " has loaded GPU code " << gpu_binary_it->second;
  UnloadGpuBinary(gpu_binary_it->second);
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}
161
Init(int device_ordinal,DeviceOptions device_options)162 port::Status GpuExecutor::Init(int device_ordinal,
163 DeviceOptions device_options) {
164 device_ordinal_ = device_ordinal;
165
166 auto status = GpuDriver::Init();
167 if (!status.ok()) {
168 return status;
169 }
170
171 status = GpuDriver::GetDevice(device_ordinal_, &device_);
172 if (!status.ok()) {
173 return status;
174 }
175
176 status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
177 &context_);
178 if (!status.ok()) {
179 return status;
180 }
181
182 return GpuDriver::GetGpuISAVersion(&version_, device_);
183 }
184
// Compute-capability-based file lookup is a CUDA concept; unsupported on ROCm
// (LOG(FATAL) aborts, the return is unreachable).
bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string* found_filename) const {
  LOG(FATAL) << "Feature not supported on ROCM platform "
                "(FindOnDiskForComputeCapability)";
  return false;
}
192
// Looks on disk for an ISA-version-specific variant of `filename`
// ("<filename>.cc<version><canonical_suffix>"), falling back to the plain
// filename. Returns true and sets *found_filename when a file exists; returns
// false if version_ is unknown (0) or neither file exists.
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  if (version_ == 0) {
    return false;
  }

  string cc_specific =
      absl::StrCat(filename, ".cc", version_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
          << cc_specific;
  if (port::FileExists(string(filename)).ok()) {
    *found_filename = string(filename);
    return true;
  }

  return false;
}
218
219 // Returns the path to the running executable.
220 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
221 // Arg: strip_exe: if true, remove the name of the executable itself from the
222 // returned string. Example: calling this from /usr/bin/foo
223 // would return /usr/bin.
GetBinaryDir(bool strip_exe)224 static string GetBinaryDir(bool strip_exe) {
225 char exe_path[PATH_MAX] = {0};
226 PCHECK(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1) != -1);
227 // Make sure it's null-terminated:
228 exe_path[sizeof(exe_path) - 1] = 0;
229
230 if (strip_exe) {
231 // The exe is the last component of the path, so remove one component.
232 string ret = exe_path;
233 std::vector<string> components = absl::StrSplit(exe_path, '/');
234 components.pop_back();
235 return absl::StrJoin(components, "/");
236 }
237 return exe_path;
238 }
239
// Resolves `spec` into a loaded kernel: loads (or reuses) the HSACO module
// held in memory, records the kernel->binary mapping for later unload, then
// looks up the kernel function in the module and fills in metadata/arity.
// On-disk specs are rejected for ROCm.
port::Status GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                                    KernelBase* kernel) {
  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
  hipModule_t module = nullptr;
  const string* kernelname;

  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
  bool has_cubin = spec.has_cuda_cubin_on_disk();
  if (has_cubin) {
    on_disk_spec = &spec.cuda_cubin_on_disk();
  }

  if (on_disk_spec != nullptr) {
    return port::InternalError(
        "Loading ROCM kernel from disk is not supported");
  } else if (spec.has_cuda_cubin_in_memory()) {
    kernelname = &spec.cuda_cubin_in_memory().kernelname();

    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
    absl::MutexLock lock{&in_memory_modules_mu_};
    // operator[] default-inserts nullptr on first sight of this HSACO, which
    // the branch below treats as "not yet loaded".
    module = in_memory_modules_[hsaco];

    if (module == nullptr) {
      TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, &module));
    }
    kernel_to_gpu_binary_[kernel] = hsaco;
  } else {
    return port::InternalError("No method of loading ROCM kernel provided");
  }

  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    rocm_kernel->gpu_function_ptr())) {
    return port::InternalError("Failed getting module function");
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the ROCM API.
  rocm_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  TF_RETURN_IF_ERROR(GetKernelMetadata(rocm_kernel, &kernel_metadata));
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return port::Status::OK();
}
286
// Fills kernel metadata. Register and shared-memory introspection are not yet
// implemented in HIP, so both fields are reported as zero placeholders.
port::Status GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
                                            KernelMetadata* kernel_metadata) {
  int value = 0;
  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_registers_per_thread(value);

  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_shared_memory_bytes(value);
  return port::Status::OK();
}
297
// Launches `kernel` on `stream` with the given grid/block dimensions. Kernel
// arguments are packed into a flat buffer and passed via the
// HIP_LAUNCH_PARAM_* extra-params mechanism rather than as an argv array.
port::Status GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const KernelBase& kernel,
                                 const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    absl::MutexLock lock(&launched_kernels_mu_);
    if (!launched_kernels_.count(hipfunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(hipfunc);
    }
  }

  if (rocm_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    TF_RETURN_IF_ERROR(GpuDriver::FuncSetCacheConfig(
        hipfunc, rocm_kernel->GetGpuCacheConfig()));
  }

  // prepare kernargs
  // KernelArgsArrayBase keeps the pointer of arguments
  // deference them here
  // NOTE(review): each argument is read through a const uint64_t*, i.e. this
  // assumes every kernel argument occupies 8 bytes — confirm against the
  // KernelArgsArray packing convention.
  std::vector<void*> kernargs;
  KernelArgIterator iter = args.arg_iterator();
  while (iter.has_next()) {
    KernelArg arg = iter.next();
    VLOG(2) << "*(arg.address): "
            << reinterpret_cast<void*>(
                   *static_cast<const uint64_t*>(arg.address));
    kernargs.push_back(
        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
  }

  size_t size = sizeof(void*) * kernargs.size();
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

  return GpuDriver::LaunchKernel(
      GetGpuContext(stream), kernel.name(), hipfunc, block_dims.x, block_dims.y,
      block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
      args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config);
}
349
// Occupancy calculation is not supported on ROCm (LOG(FATAL) aborts; the
// return is unreachable).
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64_t registers_per_thread,
                                    uint64_t shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
  return 0;
}
358
// Occupancy comparison is not supported on ROCm (LOG(FATAL) aborts; the
// return is unreachable).
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64_t registers_per_thread,
                                  uint64_t shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
  return 0;
}
368
LoadModule(const MultiModuleLoaderSpec & spec,ModuleHandle * module_handle)369 port::Status GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
370 ModuleHandle* module_handle) {
371 // In GpuExecutor we store the pointer to the HSACO binary as
372 // ModuleHandle::id().
373 hipModule_t hip_module = nullptr;
374 // TODO(ROCm): Need generic term instead of cubin/cuda/ptx
375 if (spec.has_cuda_cubin_in_memory()) {
376 absl::MutexLock lock{&in_memory_modules_mu_};
377 TF_RETURN_IF_ERROR(LoadModuleFromHsaco(
378 reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
379 &hip_module));
380 *module_handle = ModuleHandle(const_cast<void*>(
381 static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
382 return port::Status::OK();
383 } else {
384 return port::InternalError("No HASCO binary found");
385 }
386 }
387
// CUDA cubin loading has no meaning on ROCm; LOG(FATAL) aborts, which is why
// no return statement is needed.
port::Status GpuExecutor::LoadModuleFromCuBin(const char* cubin,
                                              hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
}
392
// PTX loading has no meaning on ROCm; LOG(FATAL) aborts, which is why no
// return statement is needed.
port::Status GpuExecutor::LoadModuleFromPtx(const char* ptx,
                                            hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
}
397
// Loads the HSACO binary at `hsaco` (or bumps its refcount if already loaded)
// and returns the module via *module.
// NOTE(review): callers (LoadModule, GetKernel) hold in_memory_modules_mu_
// around this call; this function assumes the lock is already held.
port::Status GpuExecutor::LoadModuleFromHsaco(const char* hsaco,
                                              hipModule_t* module) {
  uint64_t module_refcount;
  // operator[] default-inserts {nullptr, 0} on first sight of this binary.
  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];

  if (*module == nullptr) {
    TF_RETURN_IF_ERROR(GpuDriver::LoadHsaco(context_, hsaco, module));
    module_refcount = 1;
    in_memory_modules_[hsaco] = *module;
    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
            << " is already loaded as module " << *module;
  }
  // Write back the (possibly new) module and updated refcount.
  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
  return port::Status::OK();
}
417
418 // This is a non-essential operation; if there's a failure, proceed without
419 // logging an error. It's nearly certain that in case of failures, we'd never
420 // get here in the first place; these are very low-impact routines.
// Intentionally a no-op: occupancy introspection is not yet implemented in
// HIP (see the TODO below).
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  // TODO(ROCm) implement this feature in HIP
}
426
Allocate(uint64_t size,int64_t memory_space)427 DeviceMemoryBase GpuExecutor::Allocate(uint64_t size, int64_t memory_space) {
428 CHECK_EQ(memory_space, 0);
429 return DeviceMemoryBase(GpuDriver::DeviceAllocate(context_, size), size);
430 }
431
GetSubBuffer(DeviceMemoryBase * mem,uint64_t offset_bytes,uint64_t size_bytes)432 void* GpuExecutor::GetSubBuffer(DeviceMemoryBase* mem, uint64_t offset_bytes,
433 uint64_t size_bytes) {
434 // offset and size are in bytes, so char* works as the pointer type.
435 return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
436 }
437
// Frees the device memory backing `mem` via the driver.
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  GpuDriver::DeviceDeallocate(context_, mem->opaque());
}
441
// Pins `size` bytes of host memory at `location` for fast DMA. Null/empty
// registrations are warned about but still forwarded to the driver.
bool GpuExecutor::HostMemoryRegister(void* location, uint64_t size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}
450
// Unpins host memory previously registered with HostMemoryRegister.
bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}
455
// Blocks until all outstanding work in this executor's context completes.
bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}
459
SynchronousMemZero(DeviceMemoryBase * location,uint64_t size)460 port::Status GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location,
461 uint64_t size) {
462 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
463 size % 4 == 0) {
464 return GpuDriver::SynchronousMemsetUint32(
465 context_, AsROCmDevicePtr(location), 0x0, size / 4);
466 }
467 return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
468 0x0, size);
469 }
470
SynchronousMemSet(DeviceMemoryBase * location,int value,uint64_t size)471 port::Status GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location,
472 int value, uint64_t size) {
473 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
474 size % 4 == 0) {
475 // hipMemset reinterprets "value" as a uint8.
476 uint8 byte_value = static_cast<uint8>(value);
477 uint32 pattern = (byte_value << 24) | (byte_value << 16) |
478 (byte_value << 8) | byte_value;
479 return GpuDriver::SynchronousMemsetUint32(
480 context_, AsROCmDevicePtr(location), pattern, size / 4);
481 }
482 return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
483 value, size);
484 }
485
SynchronousMemcpy(DeviceMemoryBase * gpu_dst,const void * host_src,uint64_t size)486 port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
487 const void* host_src,
488 uint64_t size) {
489 return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
490 host_src, size);
491 }
492
SynchronousMemcpy(void * host_dst,const DeviceMemoryBase & gpu_src,uint64_t size)493 port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
494 const DeviceMemoryBase& gpu_src,
495 uint64_t size) {
496 return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
497 AsROCmDevicePtr(gpu_src), size);
498 }
499
SynchronousMemcpyDeviceToDevice(DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64_t size)500 port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
501 DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64_t size) {
502 return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
503 AsROCmDevicePtr(gpu_src), size);
504 }
505
MemZero(Stream * stream,DeviceMemoryBase * location,uint64_t size)506 port::Status GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
507 uint64_t size) {
508 if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
509 size % 4 == 0) {
510 return Memset32(stream, location, 0x0, size);
511 } else {
512 return Memset(stream, location, 0x0, size);
513 }
514 }
515
Memset(Stream * stream,DeviceMemoryBase * location,uint8 pattern,uint64_t size)516 port::Status GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
517 uint8 pattern, uint64_t size) {
518 VLOG(2) << "enqueueing memset8 operation onto stream " << stream
519 << " at location " << location << " with size " << size
520 << " and pattern " << std::hex << pattern;
521 return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
522 pattern, size,
523 AsGpuStreamValue(stream));
524 }
525
// Enqueues a 32-bit-wide memset on `stream`; requires a 4-byte-aligned
// destination and size (enforced by the CHECK).
port::Status GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                                   uint32 pattern, uint64_t size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsROCmDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}
537
Memcpy(Stream * stream,void * host_dst,const DeviceMemoryBase & gpu_src,uint64_t size)538 bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
539 const DeviceMemoryBase& gpu_src, uint64_t size) {
540 return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
541 AsROCmDevicePtr(gpu_src), size,
542 AsGpuStreamValue(stream));
543 }
544
Memcpy(Stream * stream,DeviceMemoryBase * gpu_dst,const void * host_src,uint64_t size)545 bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
546 const void* host_src, uint64_t size) {
547 return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
548 host_src, size,
549 AsGpuStreamValue(stream));
550 }
551
MemcpyDeviceToDevice(Stream * stream,DeviceMemoryBase * gpu_dst,const DeviceMemoryBase & gpu_src,uint64_t size)552 bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
553 DeviceMemoryBase* gpu_dst,
554 const DeviceMemoryBase& gpu_src,
555 uint64_t size) {
556 return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
557 AsROCmDevicePtr(gpu_src), size,
558 AsGpuStreamValue(stream));
559 }
560
HostCallback(Stream * stream,std::function<port::Status ()> callback)561 bool GpuExecutor::HostCallback(Stream* stream,
562 std::function<port::Status()> callback) {
563 auto callback_ptr = new std::function<void()>([callback]() {
564 port::Status s = callback();
565 if (!s.ok()) {
566 LOG(WARNING) << "Host callback failed: " << s;
567 }
568 });
569 return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
570 InternalHostCallback, callback_ptr);
571 }
572
InternalHostCallback(GpuStreamHandle stream,hipError_t status,void * data)573 /* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
574 hipError_t status,
575 void* data) {
576 std::function<void()>* callback =
577 reinterpret_cast<std::function<void()>*>(data);
578 (*callback)();
579 delete callback;
580 }
581
// Initializes the platform-specific event backing `event`.
port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}
585
// Destroys the platform-specific event backing `event`.
port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}
589
// Records `event` on `stream`.
port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}
593
WaitForEvent(Stream * stream,Event * event)594 port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
595 if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
596 AsGpuEvent(event)->gpu_event())) {
597 return port::Status::OK();
598 } else {
599 return port::Status{
600 port::error::INTERNAL,
601 absl::StrFormat("error recording waiting for ROCM event on stream %p",
602 stream)};
603 }
604 }
605
// Non-blocking query of `event`'s completion status.
Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}
609
AllocateStream(Stream * stream)610 bool GpuExecutor::AllocateStream(Stream* stream) {
611 absl::MutexLock l(&alive_gpu_streams_mu_);
612 bool out = AsGpuStream(stream)->Init();
613 alive_gpu_streams_[stream->implementation()->GpuStreamHack()] = stream;
614 return out;
615 }
616
// Unregisters and destroys the platform stream backing `stream`. Deallocating
// a stream with pending work is logged but not prevented.
void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* rocm_stream = AsGpuStream(stream);
  absl::MutexLock l(&alive_gpu_streams_mu_);
  alive_gpu_streams_.erase(rocm_stream->GpuStreamHack());
  if (!rocm_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  rocm_stream->Destroy();
}
626
// Initializes the platform timer backing `timer`.
bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}
630
// Destroys the platform timer backing `timer`.
void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}
634
CreateStreamDependency(Stream * dependent,Stream * other)635 bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
636 GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
637 bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
638 AsGpuStreamValue(other))
639 .ok();
640 if (!ok) {
641 LOG(ERROR) << "failed to record completion event; "
642 "therefore, failed to create inter-stream dependency";
643 return false;
644 }
645
646 return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
647 other_completed_event);
648 }
649
// Starts `timer` on `stream`.
bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}
653
// Stops `timer` on `stream`.
bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}
657
// Blocks the host until all work enqueued on `stream` has completed.
port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}
661
CreateBlas()662 blas::BlasSupport* GpuExecutor::CreateBlas() {
663 PluginRegistry* registry = PluginRegistry::Instance();
664 port::StatusOr<PluginRegistry::BlasFactory> status =
665 registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
666 plugin_config_.blas());
667 if (!status.ok()) {
668 LOG(ERROR) << "Unable to retrieve BLAS factory: "
669 << status.status().error_message();
670 return nullptr;
671 }
672
673 return status.ValueOrDie()(this);
674 }
675
CreateDnn()676 dnn::DnnSupport* GpuExecutor::CreateDnn() {
677 PluginRegistry* registry = PluginRegistry::Instance();
678 port::StatusOr<PluginRegistry::DnnFactory> status =
679 registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
680 plugin_config_.dnn());
681 if (!status.ok()) {
682 LOG(ERROR) << "Unable to retrieve DNN factory: "
683 << status.status().error_message();
684 return nullptr;
685 }
686
687 return status.ValueOrDie()(this);
688 }
689
CreateFft()690 fft::FftSupport* GpuExecutor::CreateFft() {
691 PluginRegistry* registry = PluginRegistry::Instance();
692 port::StatusOr<PluginRegistry::FftFactory> status =
693 registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
694 plugin_config_.fft());
695 if (!status.ok()) {
696 LOG(ERROR) << "Unable to retrieve FFT factory: "
697 << status.status().error_message();
698 return nullptr;
699 }
700
701 return status.ValueOrDie()(this);
702 }
703
CreateRng()704 rng::RngSupport* GpuExecutor::CreateRng() {
705 PluginRegistry* registry = PluginRegistry::Instance();
706 port::StatusOr<PluginRegistry::RngFactory> status =
707 registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
708 plugin_config_.rng());
709 if (!status.ok()) {
710 LOG(ERROR) << "Unable to retrieve RNG factory: "
711 << status.status().error_message();
712 return nullptr;
713 }
714
715 return status.ValueOrDie()(this);
716 }
717
718 // TODO(rspringer): Remove in b/18544742.
// DNN support is unconditionally advertised on this platform.
bool GpuExecutor::SupportsDnn() const { return true; }
720
CanEnablePeerAccessTo(StreamExecutorInterface * other)721 bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
722 GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
723 return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
724 }
725
EnablePeerAccessTo(StreamExecutorInterface * other)726 port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
727 GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
728 return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
729 }
730
// Reports free/total device memory in bytes via the driver.
bool GpuExecutor::DeviceMemoryUsage(int64_t* free, int64_t* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
734
GetSymbol(const string & symbol_name,ModuleHandle module_handle,void ** mem,size_t * bytes)735 bool GpuExecutor::GetSymbol(const string& symbol_name,
736 ModuleHandle module_handle, void** mem,
737 size_t* bytes) {
738 absl::MutexLock lock{&in_memory_modules_mu_};
739 if (static_cast<bool>(module_handle)) {
740 auto it = gpu_binary_to_module_.find(module_handle.id());
741 CHECK(it != gpu_binary_to_module_.end());
742 if (GpuDriver::GetModuleSymbol(
743 context_, it->second.first, symbol_name.c_str(),
744 reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
745 return true;
746 }
747 }
748
749 for (auto& it : gpu_binary_to_module_) {
750 if (GpuDriver::GetModuleSymbol(
751 context_, it.second.first, symbol_name.c_str(),
752 reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
753 return true;
754 }
755 }
756
757 LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
758 return false;
759 }
760
FillBlockDimLimit(GpuDeviceHandle device,BlockDim * block_dim_limit)761 bool FillBlockDimLimit(GpuDeviceHandle device, BlockDim* block_dim_limit) {
762 // The BlockDim name is a mismatch against these GRID_DIM_* queries because
763 // we use BlockDims to express the dimensions of blocks within a grid
764 // (as opposed to ThreadDim which expresses the dimensions of threads
765 // within a block).
766 int x, y, z;
767 if (!GpuDriver::GetGridLimits(&x, &y, &z, device)) {
768 return false;
769 }
770
771 block_dim_limit->x = x;
772 block_dim_limit->y = y;
773 block_dim_limit->z = z;
774 return true;
775 }
776
SupportsBlas() const777 bool GpuExecutor::SupportsBlas() const { return true; }
778
SupportsFft() const779 bool GpuExecutor::SupportsFft() const { return true; }
780
SupportsRng() const781 bool GpuExecutor::SupportsRng() const { return true; }
782
783 std::unique_ptr<internal::EventInterface>
CreateEventImplementation()784 GpuExecutor::CreateEventImplementation() {
785 return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
786 }
787
788 std::unique_ptr<internal::KernelInterface>
CreateKernelImplementation()789 GpuExecutor::CreateKernelImplementation() {
790 return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
791 }
792
793 std::unique_ptr<internal::StreamInterface>
GetStreamImplementation()794 GpuExecutor::GetStreamImplementation() {
795 return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
796 }
797
798 std::unique_ptr<internal::TimerInterface>
GetTimerImplementation()799 GpuExecutor::GetTimerImplementation() {
800 return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
801 }
802
GpuContextHack()803 void* GpuExecutor::GpuContextHack() { return context_; }
804
gpu_context()805 GpuContext* GpuExecutor::gpu_context() { return context_; }
806
807 // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
808 // of SysFS. Returns -1 if it cannot.
809 //
810 // For anything more complicated/prod-focused than this, you'll likely want to
811 // turn to gsys' topology modeling.
TryToReadNumaNode(const string & pci_bus_id,int device_ordinal)812 static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
813 VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
814 static const int kUnknownNumaNode = -1;
815
816 if (pci_bus_id.empty()) {
817 LOG(INFO) << "no PCI bus ID for device ordinal: " << device_ordinal;
818 return kUnknownNumaNode;
819 }
820
821 std::string filename =
822 absl::StrFormat("/sys/bus/pci/devices/%s/numa_node", pci_bus_id);
823
824 // We have to use fopen/fread here so that the device properties can be
825 // populated before InitGoogle procedure has been completed (at which point we
826 // could use the file::* utilities).
827 FILE* file = fopen(filename.c_str(), "r");
828 if (file == nullptr) {
829 LOG(INFO) << "could not open file to read NUMA node: " << filename
830 << "\nYour kernel may have been built without NUMA support.";
831 return kUnknownNumaNode;
832 }
833
834 std::string content;
835 char buf[32];
836 size_t did_read = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, file);
837 buf[did_read] = '\0';
838 content = buf;
839
840 int32_t value;
841 if (port::safe_strto32(content, &value)) {
842 if (value < 0) { // See http://b/18228951 for details on this path.
843 LOG(INFO) << "successful NUMA node read from SysFS had negative value ("
844 << value
845 << "), but there must be at least one NUMA node"
846 ", so returning NUMA node zero";
847 fclose(file);
848 return 0;
849 }
850 fclose(file);
851 return value;
852 }
853
854 LOG(WARNING)
855 << "could not convert SysFS file contents to integral NUMA node value: "
856 << content;
857
858 fclose(file);
859 return kUnknownNumaNode;
860 }
861
// Builds a DeviceDescription for the ROCm device at `device_ordinal` by
// querying the driver for each attribute. Fails fast (returns the error
// status) only for the device handle, ISA version, GCN arch name, and device
// name queries; most other queries are best-effort and leave defaults in
// place on failure.
port::StatusOr<std::unique_ptr<DeviceDescription>>
GpuExecutor::CreateDeviceDescription(int device_ordinal) {
  GpuDeviceHandle device;
  auto status = GpuDriver::GetDevice(device_ordinal, &device);
  if (!status.ok()) {
    return status;
  }

  // NOTE(review): `version` is queried (so an unsupported ISA surfaces as an
  // error here) but its value is not used below; only gcn_arch_name feeds the
  // builder.
  int version;
  status = GpuDriver::GetGpuISAVersion(&version, device);
  if (!status.ok()) {
    return status;
  }

  std::string gcn_arch_name;
  status = GpuDriver::GetGpuGCNArchName(device, &gcn_arch_name);
  if (!status.ok()) {
    return status;
  }

  internal::DeviceDescriptionBuilder builder;

  {
    // Best-effort: on failure the reported driver version is "0 (...)".
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device);

    // Lower the hex characters to match sysfs.
    pci_bus_id = absl::AsciiStrToLower(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    // TryToReadNumaNode returns -1 when it cannot determine the node.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal);
    builder.set_numa_node(numa_node);
  }

  // Best-effort: thread limits, clock rate and memory bandwidth are only set
  // when the property query succeeds.
  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    // clockRate appears to be in kHz (hence / 1e6 to get GHz) — presumably
    // per the HIP device-properties docs; confirm against hipDeviceProp_t.
    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);

    // mem_bandwidth = 2 * mem_bus_width_in_bytes * mem_clock_rate_in_hz
    // (memoryBusWidth in bits / 8 -> bytes; memoryClockRate * 1000 assumes
    // kHz -> Hz; the factor 2 assumes DDR — confirm against HIP docs).
    int64_t memory_bandwidth = 2 * (int64_t(prop.memoryBusWidth) / 8) *
                               (int64_t(prop.memoryClockRate) * 1000);
    builder.set_memory_bandwidth(memory_bandwidth);
  }

  {
    // Best-effort: defaults to "ECC disabled" if the query fails.
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    // Initialized to all-ones (uint64_t wrap of -1) as a sentinel; only
    // overwritten when the query succeeds.
    uint64_t device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    // NOTE(review): FillBlockDimLimit's return value is ignored; on failure
    // block_dim_limit is default-initialized — intentional best-effort?
    BlockDim block_dim_limit;
    FillBlockDimLimit(device, &block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    TF_RETURN_IF_ERROR(GpuDriver::GetDeviceName(device, &device_name));
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: ", gcn_arch_name));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_compute_capability(gcn_arch_name);

  // The following queries crash (ValueOrDie) if the driver call fails rather
  // than propagating a status.
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device).ValueOrDie());
  // Hard-coded register-file size rather than a driver query.
  builder.set_registers_per_core_limit(64 * 1024);

  return builder.Build();
}
975
976 } // namespace gpu
977
978 } // namespace stream_executor
979
// Registers an empty module initializer named `rocm_gpu_executor` —
// presumably so other module initializers can declare an ordering dependency
// on this translation unit being linked in; confirm against the initializer
// framework's usage.
REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {});
981