#pragma once

// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName

#ifdef USE_VULKAN_API

#include <ATen/native/vulkan/api/Context.h>
#include <ATen/native/vulkan/api/Types.h>

namespace at {
namespace native {
namespace vulkan {

struct LastAccess {
  api::PipelineStageFlags stage;
  api::MemoryAccessFlags access;

  LastAccess()
      : stage{api::PipelineStage::NO_STAGE},
        access{api::MemoryAccessType::NONE} {}

  LastAccess(
      api::PipelineStageFlags stage_flags,
      api::MemoryAccessFlags access_flags)
      : stage{stage_flags}, access{access_flags} {}
};
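
// Illustrative sketch (not part of the API itself): after a compute shader
// writes to the underlying resource, the storage's last access could be
// recorded as
//
//   last_access_ = LastAccess(
//       api::PipelineStage::COMPUTE, api::MemoryAccessType::WRITE);
//
// so that a subsequent use knows which source stage and access flags a memory
// barrier must wait on.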

class vTensorStorage final {
 public:
  // Do not allow empty vTensorStorage construction
  vTensorStorage() = default;

  vTensorStorage(
      api::Context* context,
      const api::StorageType storage_type,
      const api::GPUMemoryLayout gpu_memory_layout,
      const std::vector<int64_t>& sizes,
      const api::ScalarType dtype,
      const bool allocate_memory = true);

  vTensorStorage(const vTensorStorage&) = delete;
  vTensorStorage& operator=(const vTensorStorage&) = delete;

  vTensorStorage(vTensorStorage&&) = default;
  vTensorStorage& operator=(vTensorStorage&&) = delete;

  ~vTensorStorage();

  friend class vTensor;

 private:
  // Context
  api::Context* context_{};

  api::StorageType storage_type_;

  // Resource sizings
  api::utils::uvec3 extents_{};
  int64_t buffer_length_{};

  // Image Texture
  mutable api::VulkanImage image_;
  mutable api::VulkanBuffer buffer_;

  // Last Access - used to insert memory barriers
  LastAccess last_access_;

 private:
  // Registers underlying memory for cleanup
  void flush();

  // Memory barrier insertion
  void transition(
      api::PipelineBarrier&,
      const api::PipelineStageFlags,
      const api::MemoryAccessFlags);
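
  // Illustrative sketch (not the actual implementation): transition() can use
  // last_access_ to decide whether a barrier is required before the next use.
  // As a rule of thumb, a barrier is needed whenever the previous access or
  // the requested access involves a write, e.g.
  //
  //   const bool prev_written =
  //       (last_access_.access & api::MemoryAccessType::WRITE) != 0;
  //   const bool needs_barrier =
  //       prev_written || (new_access & api::MemoryAccessType::WRITE) != 0;
  //
  // where new_access is the requested api::MemoryAccessFlags.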

  // Validation
  void verify() const;

 public:
  inline VkFormat texture_format() {
    return image_.format();
  }

  void discard_and_reallocate(
      const std::vector<int64_t>& gpu_sizes,
      const api::GPUMemoryLayout gpu_memory_layout,
      const api::ScalarType dtype);
};

class vTensor final {
 public:
  // Do not allow empty vTensor construction
  vTensor() = default;

  // Default constructor
  vTensor(
      api::Context* context,
      const std::vector<int64_t>& sizes,
      const api::ScalarType dtype,
      const api::StorageType storage_type = api::StorageType::TEXTURE_3D,
      const api::GPUMemoryLayout memory_layout =
          api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
      const bool allocate_memory = true);

  // Default constructor for quantized vTensor
  vTensor(
      api::Context* const context,
      const std::vector<int64_t>& sizes,
      double q_scale,
      int64_t q_zero_point,
      const api::ScalarType dtype,
      const api::StorageType storage_type = api::StorageType::TEXTURE_3D,
      const api::GPUMemoryLayout memory_layout =
          api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
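
  // Illustrative usage sketch (assumes a valid api::Context* named `context`;
  // the scalar type enum values shown are illustrative):
  //
  //   // Regular float tensor backed by a 3D texture, channels-packed.
  //   vTensor v_out(context, {1, 3, 32, 32}, api::ScalarType::Float);
  //
  //   // Quantized tensor carrying its scale and zero point.
  //   vTensor v_quant(
  //       context, {1, 3, 32, 32}, 0.5, 10, api::ScalarType::QUInt8);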

  // Copy Constructor and Assignment; ideally copying would be disabled
  // (see the reasoning for move assignment below) but it is required for
  // compatibility with OpaqueTensorImpl
  vTensor(const vTensor& other) = default;
  vTensor& operator=(const vTensor& other) = default;

  // Move Constructor and Assignment
  vTensor(vTensor&& other) = default;
  vTensor& operator=(vTensor&& other) = default;

  // Used for passing buffer sizes and strides data to shaders
  struct BufferMetadata {
    api::utils::uvec4 sizes;
    api::utils::uvec4 strides;
    uint32_t ndim;
    uint32_t buffer_length;
  };

 private:
  // Tensor Options
  api::ScalarType dtype_;

  // GPU specific memory layout qualifier
  api::GPUMemoryLayout memory_layout_;

  // Sizes and Strides
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Storage Dimensions. When stored on the GPU, one dimension will be aligned
  // to the next multiple of 4 in order to take advantage of vec4 data types.
  std::vector<int64_t> gpu_sizes_;
  std::vector<int64_t> gpu_strides_;

  // The extents that correspond to the tensor's size metadata. Note that this
  // may not be the same as the extents of the underlying image texture because
  // vTensor can be virtually resized via virtual_resize() which will cause it
  // to be interpreted as a tensor with a different size.
  api::utils::uvec3 virtual_extents_;

  // A Vulkan uniform buffer containing sizes and strides of the GPU buffer that
  // can be passed into a shader.
  api::UniformParamsBuffer metadata_uniform_;

  // A Vulkan uniform buffer containing the tensor sizes that can be passed into
  // a shader.
  std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_uniform_;

  // A Vulkan uniform buffer containing the GPU tensor sizes that can be passed
  // into a shader. GPU sizes refers to the sizes of the tensor after padding
  // has been applied to one dimension to align it to the next multiple of 4.
  std::shared_ptr<api::UniformParamsBuffer> gpu_sizes_uniform_;

  // A Vulkan uniform buffer containing the image extents of the underlying
  // image texture that can be passed into a shader.
  std::shared_ptr<api::UniformParamsBuffer> extents_uniform_;

  // Quantization params
  bool is_quantized_{false};
  double q_scale_{1.0f};
  int64_t q_zero_point_{0u};

  // Even at the cost of a heap allocation plus the resulting negative impact
  // on cache locality due to the subsequent pointer chasing, it is still
  // critical to share the view across vTensor implementations to minimize
  // programmer errors. Ideally this class should have been made movable only,
  // and non-copyable - something we cannot do unfortunately due to the inner
  // workings of at::TensorImpl requiring copy semantics in
  // at::TensorImpl::release_resources() to function as expected. Now that this
  // class is made copyable though, a new door to a whole new class of bugs is
  // opened, in that there is now a chance that two [shallow] copies have their
  // vTensorStorage objects go out of sync as a result of an operation being
  // performed on one shallow copy that is not reflected in the other.
  // Technically, if the programmer is very careful, it is possible to avoid
  // this trap and not pay the cost of indirection, but the resulting bugs of
  // missing memory barriers will be so frustrating to hunt down for those
  // unfamiliar with the internal mechanics of this class that I decided to
  // take the performance penalty of this extra layer of indirection in favor
  // of making this class easier to use.
  std::shared_ptr<vTensorStorage> view_;

 public:
  /*
   Texture Access
  */

  inline api::StorageType storage_type() const {
    return view_->storage_type_;
  }

  inline api::VulkanImage& image() const& {
    return view_->image_;
  }

  api::VulkanImage& image(api::PipelineBarrier&, const api::PipelineStageFlags)
      const&;

  api::VulkanImage& image(
      api::PipelineBarrier&,
      const api::PipelineStageFlags,
      const api::MemoryAccessFlags) &;

  inline api::VulkanBuffer& buffer() const& {
    return view_->buffer_;
  }

  api::VulkanBuffer& buffer(
      api::PipelineBarrier&,
      const api::PipelineStageFlags) const&;

  api::VulkanBuffer& buffer(
      api::PipelineBarrier&,
      const api::PipelineStageFlags,
      const api::MemoryAccessFlags) &;
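
  // Illustrative usage sketch (variable names are placeholders): the
  // barrier-taking overloads declare how the resource is about to be accessed
  // so that the required memory barriers can be recorded into the given
  // api::PipelineBarrier before the compute job is submitted.
  //
  //   api::PipelineBarrier pipeline_barrier{};
  //   api::VulkanImage& out_image = v_output.image(
  //       pipeline_barrier,
  //       api::PipelineStage::COMPUTE,
  //       api::MemoryAccessType::WRITE);
  //   api::VulkanImage& in_image = v_input.image(
  //       pipeline_barrier,
  //       api::PipelineStage::COMPUTE);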

  /*
    Metadata
  */

  inline const api::utils::uvec3& extents() const {
    return view_->extents_;
  }

  /*
   * Extract an `api::ScalarType` from the TensorOptions member
   */
  inline api::ScalarType dtype() const {
    return dtype_;
  }

  /*
   * Get an `api::ScalarType` that corresponds to the image format of the
   * texture
   */
  inline api::ScalarType texture_dtype() const {
    return api::element_scalartype(view_->texture_format());
  }

  inline api::GPUMemoryLayout gpu_memory_layout() const {
    return memory_layout_;
  }

  inline uint32_t gpu_memory_layout_as_uint() const {
    return static_cast<uint32_t>(memory_layout_);
  }

  inline const std::vector<int64_t>& sizes() const {
    return sizes_;
  }

  inline const std::vector<int64_t>& strides() const {
    return strides_;
  }

  inline const std::vector<int64_t>& gpu_sizes() const {
    return gpu_sizes_;
  }

  inline const std::vector<int64_t>& gpu_strides() const {
    return gpu_strides_;
  }

  inline const api::utils::uvec3& virtual_extents() const {
    return virtual_extents_;
  }

  /*
   * Get a uniform buffer containing sizes and strides information of the GPU
   * buffer
   */
  api::VulkanBuffer& buffer_metadata();

  /*
   * Get a uniform buffer object containing the tensor sizes to use in a compute
   * shader. Note that the UBO will be created the first time this function is
   * called.
   */
  std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_ubo();

  /*
   * Get a uniform buffer object containing the tensor GPU sizes to use in a
   * compute shader. Note that the UBO will be created the first time this
   * function is called.
   */
  std::shared_ptr<api::UniformParamsBuffer> gpu_sizes_ubo();

  /*
   * Get a uniform buffer object containing the image extents to use in a
   * compute shader. Note that the UBO will be created the first time this
   * function is called.
   */
  std::shared_ptr<api::UniformParamsBuffer> extents_ubo();
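
  // Illustrative usage sketch (names are placeholders): each *_ubo() getter
  // lazily creates its uniform buffer on first use and hands back the same
  // object on subsequent calls, so the underlying buffer can be bound as a
  // shader argument, e.g.
  //
  //   api::VulkanBuffer& extents_params = v_output.extents_ubo()->buffer();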

  /*
   * Constructs a BufferMetadata struct based on the original sizes and strides
   * to pass into a shader.
   */
  BufferMetadata get_cpu_buffer_metadata() const;

  inline void set_is_quantized() {
    is_quantized_ = true;
  }

  inline bool is_quantized() const {
    return is_quantized_;
  }

  inline void set_scale(const double q_scale) {
    q_scale_ = q_scale;
  }

  inline double get_scale() const {
    return q_scale_;
  }

  inline float get_scale_float() const {
    return api::utils::safe_downcast<float>(q_scale_);
  }

  inline void set_zero_point(const int64_t q_zero_point) {
    q_zero_point_ = q_zero_point;
  }

  inline int64_t get_zero_point() const {
    return q_zero_point_;
  }

  inline int32_t get_zero_point_int32() const {
    return api::utils::safe_downcast<int32_t>(q_zero_point_);
  }

  inline size_t numel() const {
    return api::utils::multiply_integers(sizes());
  }

  inline size_t nbytes() const {
    return api::element_size(dtype()) * numel();
  }

  /*
   * Returns numel but based on gpu_sizes_ instead of sizes_
   */
  inline size_t gpu_numel() const {
    return api::utils::multiply_integers(gpu_sizes_);
  }

  /*
   * Returns nbytes but based on gpu_sizes_ instead of sizes_
   */
  inline VkDeviceSize gpu_nbytes() const {
    return api::element_size(dtype()) * gpu_numel();
  }
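
  // Worked example (assuming the channels dimension is the one padded for the
  // TENSOR_CHANNELS_PACKED layout): for sizes_ = {2, 3, 5, 5} the padded
  // gpu_sizes_ would be {2, 4, 5, 5}, so
  //   numel()     = 2 * 3 * 5 * 5 = 150
  //   gpu_numel() = 2 * 4 * 5 * 5 = 200
  // and with a 4-byte dtype such as Float, nbytes() = 600 while
  // gpu_nbytes() = 800.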

  /*
   * Return the VmaAllocationCreateInfo of the underlying resource
   */
  VmaAllocationCreateInfo get_allocation_create_info() const;

  /*
   * Return the VkMemoryRequirements of the underlying resource
   */
  VkMemoryRequirements get_memory_requirements() const;

  /*
   * Binds the underlying resource to the given memory allocation
   */
  void bind_allocation(const api::MemoryAllocation& allocation);

 private:
  /*
   * Update the size metadata of the vTensor to be new sizes. Should not be used
   * directly; use reallocate() or virtual_resize() instead.
   */
  void update_size_metadata(const std::vector<int64_t>& new_sizes);

 public:
  /*
   * Discard the underlying VkImage or VkBuffer and re-allocate based on new
   * tensor sizes
   */
  void reallocate(const std::vector<int64_t>& new_sizes);

  /*
   * Perform a virtual resize of the vTensor by modifying the size metadata that
   * gets used in compute shaders. This allows the shader to treat the
   * underlying resource as if it were a different size.
   */
  void virtual_resize(const std::vector<int64_t>& new_sizes);
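
  // Illustrative usage sketch (names are placeholders): reallocate() replaces
  // the backing VkImage/VkBuffer to fit the new sizes, whereas virtual_resize()
  // only rewrites the size metadata seen by shaders and leaves the allocation
  // untouched.
  //
  //   v_tmp.reallocate({1, 8, 64, 64});     // new backing resource
  //   v_tmp.virtual_resize({1, 8, 32, 32}); // reinterpret without reallocating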
};

void add_buffer_barrier(
    api::PipelineBarrier&,
    const api::VulkanBuffer&,
    const api::PipelineStageFlags,
    const api::MemoryAccessFlags,
    const api::PipelineStageFlags,
    const api::MemoryAccessFlags);
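
// Illustrative usage sketch (parameter roles are inferred from the ordering
// above and are an assumption; names are placeholders): append a buffer
// barrier between a compute-stage write and a subsequent compute-stage read.
//
//   api::PipelineBarrier pipeline_barrier{};
//   add_buffer_barrier(
//       pipeline_barrier,
//       staging_buffer,
//       api::PipelineStage::COMPUTE,
//       api::MemoryAccessType::WRITE,
//       api::PipelineStage::COMPUTE,
//       api::MemoryAccessType::READ);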

} // namespace vulkan
} // namespace native
} // namespace at

#endif /* USE_VULKAN_API */