/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/container/flat_hash_map.h"
#include "absl/numeric/int128.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/compiler/xla/stream_executor/event.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/compiler/xla/stream_executor/lib/status.h"
#include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/platform.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_internal.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/core/platform/fingerprint.h"

namespace stream_executor {

class StreamExecutor;

namespace gpu {

// Pointer-to-implementation object type with virtual destruction for any
// XLA-specific data hanging off of the GpuExecutor.
class XLAInterface {
 public:
  // Default constructor for the abstract interface.
  explicit XLAInterface() {}

  // Default destructor for the abstract interface.
  virtual ~XLAInterface() {}
};

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
  // Helper class to attach type-erased state to the GpuExecutor. Currently,
  // we just need to support some XLA-specific state.
  class Object {
    struct Concept {
      virtual ~Concept() {}
    };
    template <typename T>
    struct Model : Concept {
      explicit Model(StreamExecutor* se) : object(se) {}
      T object;
    };

   public:
    template <typename T>
    T* getOrCreate(StreamExecutor* se) {
      absl::MutexLock l(&mu_);
      if (!object_) {
        object_ = std::make_unique<Model<T>>(se);
      }
      return &(dynamic_cast<Model<T>*>(object_.get())->object);
    }

   private:
    absl::Mutex mu_;
    std::unique_ptr<Concept> object_ ABSL_GUARDED_BY(mu_);
  };

 public:
  // plugin_config specifies the plugin implementations (e.g. BLAS, DNN, FFT,
  // RNG) to associate with this executor.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  port::Status GetKernel(const MultiKernelLoaderSpec& spec,
                         KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  port::Status LoadModule(const MultiModuleLoaderSpec& spec,
                          ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  // Allocates and initializes a new constant on the device with the given
  // content. Or, if a buffer with identical content is already on-device,
  // returns a pointer to that buffer with shared ownership.
  port::StatusOr<std::shared_ptr<DeviceMemoryBase>> CreateOrShareConstant(
      Stream* stream, const std::vector<uint8_t>& content) override;
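  //
  // A minimal usage sketch (hypothetical call site; `executor` and `stream`
  // are assumed to exist):
  //
  //   std::vector<uint8_t> bytes = {1, 2, 3, 4};
  //   auto shared = executor->CreateOrShareConstant(stream, bytes);
  //   // A later call with identical content shares ownership of the same
  //   // device allocation rather than allocating a new buffer.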

  port::Status Launch(Stream* stream, const ThreadDim& thread_dims,
                      const BlockDim& block_dims, const KernelBase& k,
                      const KernelArgsArrayBase& args) override;
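  //
  // Kernel launches normally flow through StreamExecutor::Launch or
  // Stream::ThenLaunch rather than calling this override directly. A rough
  // sketch (hypothetical, with a TypedKernel already loaded via GetKernel):
  //
  //   stream->ThenLaunch(ThreadDim(256), BlockDim(num_blocks), kernel,
  //                      device_arg);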

  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64_t registers_per_thread,
                         uint64_t shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64_t registers_per_thread,
                       uint64_t shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  DeviceMemoryBase Allocate(uint64_t size, int64_t memory_space) override;

  void* GetSubBuffer(DeviceMemoryBase* mem, uint64_t offset_bytes,
                     uint64_t size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64_t size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64_t size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }
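  //
  // Sketch (hypothetical size): buffers allocated here are page-locked by the
  // driver, making them suitable staging areas for async H2D/D2H copies.
  //
  //   void* pinned = executor->HostMemoryAllocate(1 << 20);  // 1 MiB
  //   // ... use as a copy staging buffer ...
  //   executor->HostMemoryDeallocate(pinned);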

  bool HostMemoryRegister(void* location, uint64_t size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  port::Status SynchronousMemZero(DeviceMemoryBase* location,
                                  uint64_t size) override;

  port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
                                 uint64_t size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64_t size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64_t size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64_t size) override;

  port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
                       uint64_t size) override;
  port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
                      uint64_t size) override;
  port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
                        uint32 pattern, uint64_t size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64_t size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64_t size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64_t size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool DeviceMemoryUsage(int64_t* free, int64_t* total) const override;

  // Searches for the symbol in the given module and returns a device pointer
  // and size. Returns false if the symbol does not exist. 'module_handle' must
  // not be null.
  bool GetSymbol(const std::string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;
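  //
  // Usage sketch (hypothetical symbol name; `module_handle` from LoadModule):
  //
  //   void* ptr = nullptr;
  //   size_t bytes = 0;
  //   if (executor->GetSymbol("my_device_global", module_handle, &ptr,
  //                           &bytes)) {
  //     DeviceMemoryBase mem(ptr, bytes);  // non-owning view of the symbol
  //   }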

  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
      const override {
    return CreateDeviceDescription(device_ordinal_);
  }

  static port::StatusOr<std::unique_ptr<DeviceDescription>>
  CreateDeviceDescription(int device_ordinal);

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

  GpuContext* gpu_context();

  // Provides a type-erased way of attaching arbitrary XLA-specific state to
  // the GpuExecutor. XLA-based execution uses this method to attach
  // per-StreamExecutor XLA-specific objects (like the Infeed and Outfeed
  // managers) to the stream executor, so that their lifetimes are tied to the
  // lifetime of the stream executor for which the object was allocated. This
  // simplifies memory management compared to keeping these objects on the
  // side and then either leaking them or implementing callbacks that the SE
  // destructors call to deallocate any side state associated with that SE
  // object.
  template <typename T>
  T* getOrCreateXLAState(StreamExecutor* se) {
    return xla_state_.getOrCreate<T>(se);
  }
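  //
  // Usage sketch (hypothetical state type; T must be constructible from a
  // StreamExecutor*, since that is what Object::Model passes through):
  //
  //   struct XlaGpuState {
  //     explicit XlaGpuState(StreamExecutor* se) : parent(se) {}
  //     StreamExecutor* parent;
  //   };
  //   XlaGpuState* state = gpu_executor->getOrCreateXLAState<XlaGpuState>(se);
  //   // Repeated calls return the same instance for this executor.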

  Stream* FindAllocatedStream(void* gpu_stream) override {
    absl::MutexLock lock(&alive_gpu_streams_mu_);
    auto it = alive_gpu_streams_.find(gpu_stream);
    if (it == alive_gpu_streams_.end()) {
      return nullptr;
    }
    return it->second;
  }

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions;
  // i.e. looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is
  // present if we're on a compute capability 3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      std::string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               std::string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: The user-provided callback passed to HostCallback() above, captured
  //       as a std::function<void()>. Allocated/initialized inside
  //       HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  port::Status GetKernelMetadata(GpuKernel* cuda_kernel,
                                 KernelMetadata* kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it
  // might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  port::Status LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
  // (supported on CUDA only)
  port::Status LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  port::Status LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void* gpu_binary)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Guards the on-disk-module mapping.
  absl::Mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we attempt to hit this mapping first before
  // retrieving it again.
  std::map<std::string, GpuModuleHandle> disk_modules_
      ABSL_GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  absl::Mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      ABSL_GUARDED_BY(in_memory_modules_mu_);

  absl::Mutex shared_constants_mu_;
  // On-device constants that can be shared between multiple executables. A
  // pointer for a given constant will expire when no executable requires use
  // of that constant anymore.
  std::map<const absl::uint128, std::weak_ptr<DeviceMemoryBase>>
      shared_constants_ ABSL_GUARDED_BY(shared_constants_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      ABSL_GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {GPU module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64_t>>
      gpu_binary_to_module_ ABSL_GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  absl::Mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      ABSL_GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  // Type-erased XLA-specific state attached to the GpuExecutor.
  Object xla_state_;

  absl::Mutex alive_gpu_streams_mu_;

  // Lookup map for alive streams, from raw stream pointers.
  absl::flat_hash_map<void*, Stream*> alive_gpu_streams_
      ABSL_GUARDED_BY(alive_gpu_streams_mu_);

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

inline GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}
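// Usage sketch (hypothetical call site): recover the platform-specific
// executor from a platform-agnostic StreamExecutor, e.g. in CUDA- or
// ROCm-only code paths. Only valid when the StreamExecutor is backed by a
// GpuExecutor; the static_cast above is unchecked.
//
//   GpuExecutor* gpu_exec = ExtractGpuExecutor(stream->parent());
//   GpuContext* ctx = gpu_exec->gpu_context();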

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_