/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <memory>
#include <set>
#include <type_traits>
#include <unordered_map>

#include "absl/base/thread_annotations.h"
#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/xla/stream_executor/event.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/compiler/xla/stream_executor/lib/status.h"
#include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/platform.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_internal.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/core/platform/fingerprint.h"

namespace stream_executor {

class StreamExecutor;

namespace gpu {

// Pointer-to-implementation object type with virtual destruction for any
// XLA-specific data hanging off of the GpuExecutor.
class XLAInterface {
 public:
  // Default constructor for the abstract interface.
  explicit XLAInterface() {}

  // Default destructor for the abstract interface.
  virtual ~XLAInterface() {}
};

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
  // Helper classes to attach type-erased state to the GpuExecutor. Currently,
  // we just need to support some XLA-specific state.
  class Object {
    struct Concept {
      virtual ~Concept() {}
    };
    template <typename T>
    struct Model : Concept {
      explicit Model(StreamExecutor* se) : object(se) {}
      T object;
    };

   public:
    template <typename T>
    T* getOrCreate(StreamExecutor* se) {
      absl::MutexLock l(&mu_);
      if (!object_) {
        object_ = std::make_unique<Model<T>>(se);
      }
      return &(dynamic_cast<Model<T>*>(object_.get())->object);
    }

   private:
    absl::Mutex mu_;
    std::unique_ptr<Concept> object_ ABSL_GUARDED_BY(mu_);
  };

 public:
  // plugin_config specifies which plugin implementations (e.g. BLAS, DNN, FFT,
  // RNG) this executor should use; this executor targets the CUDA platform.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  port::Status GetKernel(const MultiKernelLoaderSpec& spec,
                         KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  port::Status LoadModule(const MultiModuleLoaderSpec& spec,
                          ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  // Allocates and initializes a new constant on the device with the given
  // content. Or, if a buffer with identical content is already on-device,
  // returns a pointer to that buffer with shared ownership.
  port::StatusOr<std::shared_ptr<DeviceMemoryBase>> CreateOrShareConstant(
      Stream* stream, const std::vector<uint8_t>& content) override;

  port::Status Launch(Stream* stream, const ThreadDim& thread_dims,
                      const BlockDim& block_dims, const KernelBase& k,
                      const KernelArgsArrayBase& args) override;

  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64_t registers_per_thread,
                         uint64_t shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64_t registers_per_thread,
                       uint64_t shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  DeviceMemoryBase Allocate(uint64_t size, int64_t memory_space) override;

  void* GetSubBuffer(DeviceMemoryBase* mem, uint64_t offset_bytes,
                     uint64_t size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64_t size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64_t size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }

  bool HostMemoryRegister(void* location, uint64_t size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  port::Status SynchronousMemZero(DeviceMemoryBase* location,
                                  uint64_t size) override;

  port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
                                 uint64_t size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64_t size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64_t size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64_t size) override;

  port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
                       uint64_t size) override;
  port::Status Memset(Stream* stream, DeviceMemoryBase* location,
                      uint8 pattern, uint64_t size) override;
  port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
                        uint32 pattern, uint64_t size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64_t size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64_t size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64_t size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool DeviceMemoryUsage(int64_t* free, int64_t* total) const override;

  // Searches for the symbol in the given module and returns a device pointer
  // and size. Returns false if the symbol does not exist. 'module_handle' must
  // not be null.
  bool GetSymbol(const std::string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;

  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
      const override {
    return CreateDeviceDescription(device_ordinal_);
  }

  static port::StatusOr<std::unique_ptr<DeviceDescription>>
  CreateDeviceDescription(int device_ordinal);

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

  GpuContext* gpu_context();

  // Provides a type-erased way of attaching arbitrary XLA-specific state to
  // the GpuExecutor. XLA-based execution uses this method to attach
  // per-stream-executor XLA objects (like the Infeed and Outfeed managers) to
  // the stream executor, so that their lifetimes are tied to the lifetime of
  // the stream executor for which they were allocated. This simplifies memory
  // management compared to having these objects reside on the side and then
  // either leaking or requiring callbacks that the SE destructors invoke to
  // deallocate any side state associated with that SE object.
  template <typename T>
  T* getOrCreateXLAState(StreamExecutor* se) {
    return xla_state_.getOrCreate<T>(se);
  }
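
  // Illustrative sketch only (not part of this interface): XLA-side code
  // holding a StreamExecutor* can attach lazily constructed, executor-lifetime
  // state like so. `XlaFooState` is a hypothetical type whose constructor
  // takes the owning StreamExecutor*:
  //
  //   GpuExecutor* gpu_exec = ExtractGpuExecutor(stream_exec);
  //   XlaFooState* state =
  //       gpu_exec->getOrCreateXLAState<XlaFooState>(stream_exec);
  //
  // Repeated calls with the same type return the same object; it is destroyed
  // together with the executor.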

  Stream* FindAllocatedStream(void* gpu_stream) override {
    absl::MutexLock lock(&alive_gpu_streams_mu_);
    auto it = alive_gpu_streams_.find(gpu_stream);
    if (it == alive_gpu_streams_.end()) {
      return nullptr;
    }
    return it->second;
  }

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions;
  // i.e. looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is
  // present if we're on a compute capability 3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      std::string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               std::string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //       as a std::function<void()>. Allocated/initialized inside
  //       HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  port::Status GetKernelMetadata(GpuKernel* cuda_kernel,
                                 KernelMetadata* kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it
  // might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  port::Status LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
  // (supported on CUDA only)
  port::Status LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  port::Status LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void* gpu_binary)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Guards the on-disk-module mapping.
  absl::Mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we check this mapping first before retrieving the
  // module again.
  std::map<std::string, GpuModuleHandle> disk_modules_
      ABSL_GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  absl::Mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      ABSL_GUARDED_BY(in_memory_modules_mu_);

  absl::Mutex shared_constants_mu_;
  // On-device constants that can be shared between multiple executables. A
  // pointer for a given constant will expire when no executables require use
  // of that constant anymore.
  std::map<const absl::uint128, std::weak_ptr<DeviceMemoryBase>>
      shared_constants_ ABSL_GUARDED_BY(shared_constants_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      ABSL_GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64_t>>
      gpu_binary_to_module_ ABSL_GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  absl::Mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      ABSL_GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  // Type-erased XLA-specific state attached to GpuExecutor.
  Object xla_state_;

  absl::Mutex alive_gpu_streams_mu_;

  // Lookup map for alive streams, from raw stream pointers.
  absl::flat_hash_map<void*, Stream*> alive_gpu_streams_
      ABSL_GUARDED_BY(alive_gpu_streams_mu_);

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

inline GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}
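
// Illustrative use only (hypothetical surrounding code): given a
// StreamExecutor* created for the CUDA/ROCm platform, the GPU-specific
// implementation and its driver context can be reached via:
//
//   GpuExecutor* gpu_exec = ExtractGpuExecutor(stream_exec);
//   GpuContext* ctx = gpu_exec->gpu_context();
//
// The static_cast assumes the executor's implementation really is a
// GpuExecutor; callers on other platforms must not use this helper.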

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_