/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName

#include <executorch/backends/vulkan/runtime/api/Context.h>

#include <executorch/backends/vulkan/runtime/api/containers/ParamsBuffer.h>

#include <executorch/backends/vulkan/runtime/utils/StorageUtils.h>

namespace vkcompute {
namespace api {

/*
 * Given the number of dimensions of a tensor and its packed dim (the WHCN
 * index of the dimension with a stride of 1, which is determined by the
 * tensor's GPUMemoryLayout), produce a dim order vector that matches the
 * memory layout. The produced dim order vector will be in NCHW dimension
 * order.
 */
std::vector<int64_t> calculate_dim_order(
    const size_t ndim,
    const int32_t packed_dim);
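
// Illustrative sketch (not part of the API): for a 4D tensor whose packed dim
// is the channels dim (packed_dim = 2 in WHCN terms), the expected result is a
// channels-last dim order:
//
//   const auto dim_order = calculate_dim_order(/*ndim=*/4, /*packed_dim=*/2);
//   // expected: {0, 2, 3, 1}, i.e. N, H, W precede C and C has a stride of 1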

/*
 * Given the sizes of a tensor and its dim order (both in NCHW dimension
 * order), calculate the strides of the tensor.
 */
std::vector<int64_t> calculate_strides(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order);

std::vector<int64_t> unsqueeze_strides(
    const std::vector<int64_t>& strides,
    const int64_t numel);
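
// Worked example (illustrative): for a contiguous NCHW tensor with
// sizes = {2, 3, 4, 5} and dim_order = {0, 1, 2, 3}, calculate_strides() would
// be expected to return {60, 20, 5, 1}. unsqueeze_strides() then extends the
// stride vector so that its length is a multiple of 4; see the comments on the
// unsqueezed_strides_ member of vTensor below for how the extra entries are
// filled.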

/*
 * When stored on the GPU, tensor data is stored using texels (i.e. a vector of
 * 4 scalar values) in order to take advantage of the GPU's native vectorization
 * capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4
 * types.
 *
 * To accommodate these vectorized types, the sizes of a tensor will be modified
 * for GPU storage in the following ways:
 *
 *   1. The dimensionality of the tensor will be padded to a multiple of 4.
 *   2. The size of the packed dimension will be padded to a multiple of 4.
 *
 * The "packed dimension" is determined by the tensor's utils::GPUMemoryLayout.
 */
std::vector<int64_t> calculate_padded_sizes(
    const std::vector<int64_t>& sizes,
    const int32_t packed_dim);
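
// Worked example (illustrative): given sizes = {3, 5, 7} (CHW) and
// packed_dim = 0 (the width dim in WHCN terms), the dimensionality is padded
// from 3 to 4 and the width is padded from 7 to 8, so the expected result is
// {1, 3, 5, 8}.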

/*
 * Calculate the image extents required of a texture backed tensor.
 */
utils::uvec3 calculate_image_extents(
    const std::vector<int64_t>& padded_sizes,
    const std::vector<int64_t>& axis_map,
    const int32_t packed_dim);
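
// Illustrative sketch (assuming the default axis map, where the width, height,
// and channels dims map to the texture's X, Y, and Z axes and batches are
// concatenated along the channels dim): for a channels-packed tensor with
// padded sizes {N, C, H, W}, the expected extents are roughly
// {W, H, N * (C / 4)}, since 4 channel values are packed into each texel.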

struct LastAccess {
  vkapi::PipelineStageFlags stage;
  vkapi::MemoryAccessFlags access;

  LastAccess()
      : stage{vkapi::PipelineStage::NO_STAGE},
        access{vkapi::MemoryAccessType::NONE} {}

  LastAccess(
      vkapi::PipelineStageFlags stage_flags,
      vkapi::MemoryAccessFlags access_flags)
      : stage{stage_flags}, access{access_flags} {}
};

class vTensorStorage final {
 public:
  // Do not allow empty vTensorStorage construction
  vTensorStorage() = default;

  vTensorStorage(
      Context* context,
      const utils::StorageType storage_type,
      const std::vector<int64_t>& axis_map,
      const int32_t packed_dim,
      const std::vector<int64_t>& padded_sizes,
      const vkapi::ScalarType dtype,
      const bool allocate_memory = true);

  vTensorStorage(Context* const context, const vkapi::VulkanImage& image);

 protected:
  /*
   * This allows for creation of tensors that use the same underlying storage
   * as another tensor. Note that this functionality is currently enabled for
   * tensors that have buffer storage only. The created tensor will not have
   * ownership of the underlying VkBuffer. This constructor is marked protected
   * because this behaviour is unsafe, since the original tensor may be
   * destroyed before the copy is destroyed.
   */
  vTensorStorage(vTensorStorage& other, const int64_t buffer_offset = 0);

 public:
  // To discourage creating copies, the assignment operator is still deleted.
  vTensorStorage& operator=(const vTensorStorage& other) = delete;

  vTensorStorage(vTensorStorage&& other) = default;
  vTensorStorage& operator=(vTensorStorage&& other) = default;

  ~vTensorStorage();

  friend class vTensor;

 private:
  // Context
  Context* context_{};

  utils::StorageType storage_type_;

  // Resource sizings
  utils::uvec3 image_extents_{};
  int64_t buffer_length_{};
  int64_t buffer_offset_{};

  // GPU Storage
  mutable vkapi::VulkanImage image_;
  mutable vkapi::VulkanBuffer buffer_;

  // Last Access - used to insert memory barriers
  LastAccess last_access_;
  // Indicates whether copies of this vTensorStorage have been made
  bool has_copies_;

 private:
  // Registers underlying memory for cleanup
  void flush();

  // Memory barrier insertion
  void transition(
      vkapi::PipelineBarrier&,
      const vkapi::PipelineStageFlags,
      const vkapi::MemoryAccessFlags);

  // Validation
  void verify() const;

 public:
  inline VkFormat texture_format() {
    return image_.format();
  }

  /*
   * Check if the underlying resource is a copy of another resource
   */
  bool is_copy() const;

  /*
   * Used for checking if this vTensorStorage is a copy of another instance
   */
  bool is_copy_of(const vTensorStorage& other) const;
};

class vTensor final {
  struct TextureLimits {
    // Alignment is required to conform with Vulkan specification; a 3 or 4
    // component vector with components of size N must have base alignment of
    // 4N.
    alignas(16) utils::ivec3 limits;
  };

 public:
  explicit vTensor(
      Context* context,
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type = utils::kTexture3D,
      const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked,
      const bool allocate_memory = true);
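
  // Illustrative usage sketch (not part of this header). `context` and the
  // dtype constant are assumed to be provided by the surrounding runtime:
  //
  //   api::vTensor tensor(
  //       context,
  //       /*sizes=*/{1, 3, 224, 224},
  //       vkapi::kFloat,
  //       utils::kTexture3D,
  //       utils::kChannelsPacked);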

  vTensor(const vTensor& other) = delete;

  explicit vTensor(
      Context* context,
      const vkapi::VulkanImage& image,
      const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked);

  /*
   * This constructor allows for the creation of a vTensor that references the
   * same buffer resource as another vTensor, with the same sizes and strides
   * metadata. The created vTensor will not own the underlying resource. This is
   * only applicable for buffer backed tensors at the moment.
   *
   * Once created, the sizes and strides of the aliased vTensor can be changed
   * using the `virtual_reconfigure` member function.
   */
  vTensor(vTensor& other);

  /*
   * This constructor allows for the creation of a vTensor that references the
   * same buffer resource as another vTensor, but with different sizes and
   * strides metadata. The created vTensor will not own the underlying
   * resource. This is only applicable for buffer backed tensors at the moment.
   *
   * Note that dim order is used as the source of truth regarding the strides,
   * and the new strides are computed from the new sizes and new dim order.
   * Thus only the dim order is provided as an argument to this function.
   *
   * The offset_numel argument allows the aliased tensor's memory region to
   * begin at an offset of N elements from the start of the original tensor's
   * buffer.
   */
  vTensor(
      vTensor& other,
      const std::vector<int64_t>& sizes,
      const std::vector<int64_t>& dim_order,
      const int64_t offset_numel = 0);
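
  // Illustrative sketch (names are hypothetical): create a buffer-backed view
  // that reinterprets part of `base`'s storage, starting 8 elements into the
  // original buffer:
  //
  //   api::vTensor view(
  //       base,
  //       /*sizes=*/{2, 4},
  //       /*dim_order=*/{0, 1},
  //       /*offset_numel=*/8);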

  // To discourage making copies, the copy assignment operator is still deleted
  vTensor& operator=(const vTensor& other) = delete;

  vTensor(vTensor&& other) = default;
  vTensor& operator=(vTensor&& other) = default;

 private:
  /*
   * "Core" tensor metadata. They are the minimum amount of information required
   * to construct a tensor.
   */

  // Whether the tensor has elements of type float, int, etc.
  vkapi::ScalarType dtype_;
  // sizes of the tensor in NCHW dimension order
  std::vector<int64_t> sizes_;
  // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
  // width, 1 for height, etc.). For texture backed tensors, this describes
  // which dimension is packed along a texel. For buffer backed tensors, this
  // describes which dimension has a stride of 1 (i.e. is last in the dim
  // order).
  int32_t packed_dim_;

  /*
   * "Layout" metadata. These describe in further detail how tensor data is
   * laid out in memory. However, they are considered secondary to the "core"
   * metadata members above because defaults can be assumed based on a given
   * memory layout. When permuting the tensor without performing a copy, these
   * metadata members are the ones that will be changed. All other metadata is
   * derived from a combination of sizes, memory layout, and the below members.
   */

  // dim order of the tensor; dimension indices are in NCHW dimension order
  // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger
  // strides precede the dims with smaller strides in the dim order. The last
  // dim is always the fastest moving dim with a stride of 1.
  std::vector<int64_t> dim_order_;
  // Describes which axis of an image texture each dimension of the tensor maps
  // to. The axis mapping allows texture based tensors to be permuted and
  // transposed without modifying the underlying texture storage. For a more in
  // depth explanation of axis mapping, see the `default_axis_map()`
  // function.
  std::vector<int64_t> axis_map_;

  /*
   * The below can be considered "layout" metadata as well, but are derived from
   * the above data members.
   */

  // strides of the tensor in NCHW dimension order
  std::vector<int64_t> strides_;
  // Contains the number of elements in the tensor according to the canonical
  // sizes.
  size_t numel_;

  /*
   * The below metadata members are derived from the above, and are typically
   * used to pass tensor metadata to compute shaders.
   */

  // padded sizes of the tensor in NCHW dimension order. See the
  // calculate_padded_sizes() function for more context. Note that padded sizes
  // are only used for texture storage, and not for buffer storage.
  std::vector<int64_t> padded_sizes_;
  // Contains the strides of the tensor, with the dimensionality padded to the
  // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max.
  std::vector<int64_t> unsqueezed_strides_;
  // Contains the number of elements in the tensor according to the padded
  // sizes.
  size_t padded_numel_;
  // See the comments documenting logical_limits() for more context.
  TextureLimits logical_limits_;

  /*
   * Utility GPU buffers that can be passed to shaders in order to convey tensor
   * metadata. These buffers will be initialized the first time they are
   * accessed via the corresponding *_ubo() function, and their contents will be
   * updated whenever virtual_resize() is called.
   *
   * Refer to the comments for the corresponding *_ubo() functions for more
   * context about the data contained in each buffer.
   */
  ParamsBuffer sizes_uniform_;
  ParamsBuffer strides_uniform_;
  ParamsBuffer numel_uniform_;
  ParamsBuffer logical_limits_uniform_;

  vTensorStorage storage_;

 public:
  /*
   Texture Access
  */

  inline vkapi::VulkanImage& image() const& {
    return storage_.image_;
  }

  vkapi::VulkanImage& image(
      vkapi::PipelineBarrier&,
      const vkapi::PipelineStageFlags) &;

  vkapi::VulkanImage& image(
      vkapi::PipelineBarrier&,
      const vkapi::PipelineStageFlags,
      const vkapi::MemoryAccessFlags) &;

  inline vkapi::VulkanBuffer& buffer() const& {
    return storage_.buffer_;
  }

  vkapi::VulkanBuffer& buffer(
      vkapi::PipelineBarrier&,
      const vkapi::PipelineStageFlags) &;

  vkapi::VulkanBuffer& buffer(
      vkapi::PipelineBarrier&,
      const vkapi::PipelineStageFlags,
      const vkapi::MemoryAccessFlags) &;

  /*
    Metadata
  */

  inline utils::StorageType storage_type() const {
    return storage_.storage_type_;
  }

  inline bool has_buffer_storage() const {
    return storage_.storage_type_ == utils::kBuffer;
  }

 private:
  void set_logical_limits(const utils::uvec3& image_extents);

 public:
  /*
   * The logical limits of the tensor are derived from the image extents of the
   * image texture used to store the tensor, but with two key differences.
   *
   * First, the image extents are permuted according to the axis map. This
   * makes it so that the first element of the logical limit is the limit of the
   * texture axis corresponding to the width dimension of the tensor, the next
   * element is the limit of the texture axis corresponding to the height
   * dimension, and the last element is the limit of the texture axis that
   * corresponds to the channels dimension of the tensor.
   *
   * Second, the logical limits may use smaller extents than the actual image
   * extents of the image texture. This is due to dynamic shape; if the tensor's
   * `virtual_resize()` function is called, then the logical limits will reflect
   * the extents that would be needed to support a tensor with the updated sizes
   * instead of the original sizes.
   */
  inline const utils::ivec3& logical_limits() const {
    return logical_limits_.limits;
  }
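
  // Illustrative example: for a channels-packed 4D tensor with sizes
  // {N, C, H, W} stored with the default axis map, the logical limits would be
  // expected to be roughly {W, H, N * ceil(C / 4)}.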

  /*
   * Return the `vkapi::ScalarType` of the tensor's elements (i.e. the dtype).
   */
  inline vkapi::ScalarType dtype() const {
    return dtype_;
  }

  /*
   * Provide a "best guess" of a memory layout that can be used to construct a
   * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this
   * tensor. In some scenarios, the exact layout of the tensor may not be
   * replicable because `virtual_*()` functions were called after construction;
   * however, this function will provide a memory layout that will produce the
   * same `packed_dim_` as this tensor.
   */
  utils::GPUMemoryLayout estimate_memory_layout() const;

  inline int32_t packed_dim() const {
    return packed_dim_;
  }

  /*
   * Returns, as an int32_t, the WHCN index of the dimension along which batches
   * are concatenated.
   */
  inline int32_t concat_dim() const {
    return utils::safe_downcast<int32_t>(axis_map_.at(3));
  }

  inline const std::vector<int64_t>& sizes() const {
    return sizes_;
  }

  inline const int64_t size(size_t dim) const {
    return sizes().at(dim);
  }

  inline const int64_t dim() const {
    return sizes_.size();
  }

  inline const std::vector<int64_t>& dim_order() const {
    return dim_order_;
  }

  inline const std::vector<int64_t>& axis_map() const {
    return axis_map_;
  }

  /*
   * Returns the values of the axis map and the packed dimension packed into a
   * single int32_t, such that it can be used as a specialization constant in a
   * compute shader. This allows for the SPIR-V to bytecode compilation to
   * perform compile-time unfolding on the axis map. Each element of the axis
   * map and the value of the packed dimension take up 4 bits in the packed
   * int32_t.
   */
  inline int32_t hashed_layout() const {
    return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) +
        (axis_map_.at(3) << 12) + (packed_dim_ << 16);
  }
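
  // Worked example (derived from the formula above): with axis_map_ =
  // {0, 1, 2, 2} (a standard axis map with batches concatenated along
  // channels) and packed_dim_ = 2, the hashed layout is
  //   0 + (1 << 4) + (2 << 8) + (2 << 12) + (2 << 16) = 0x22210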

  /*
   * Return true if the tensor's axis map is {0, 1, 2, concat_dim}. This means
   * that the width dim is mapped to the width axis of the texture, the height
   * dim is mapped to the height axis of the texture, and the channels dim is
   * mapped to the depth axis of the texture.
   */
  inline bool has_standard_axis_map() const {
    return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2;
  }

  inline const std::vector<int64_t>& strides() const {
    return strides_;
  }

  inline const std::vector<int64_t>& unsqueezed_strides() const {
    return unsqueezed_strides_;
  }

  /*
   * Returns a GPU buffer containing the sizes of the tensor in WHCN order.
   * Note that dimensions that are not present in the tensor's sizes are set to
   * a size of 1.
   */
  const vkapi::BufferBindInfo sizes_ubo();

  /*
   * Returns a GPU buffer containing the strides of the tensor in WHCN order.
   * Note that the strides are extended to a dimensionality that is a multiple
   * of 4, thus dimensions that are not present in the tensor's sizes are set to
   * have a stride equal to the stride of the "slowest moving" dimension.
   */
  const vkapi::BufferBindInfo strides_ubo();

  /*
   * Returns a GPU buffer containing the logical limits of the tensor. See the
   * comments for logical_limits() for more context.
   */
  const vkapi::BufferBindInfo logical_limits_ubo();

  /*
   * Returns a GPU buffer containing the number of elements in the buffer used
   * to store the tensor.
   */
  const vkapi::BufferBindInfo numel_ubo();

  inline size_t numel() const {
    return numel_;
  }

  inline size_t nbytes() const {
    return element_size(dtype()) * numel();
  }

  /*
   * Returns numel but based on padded_sizes_ instead of sizes_
   */
  inline size_t padded_numel() const {
    return padded_numel_;
  }

  size_t staging_buffer_numel() const;

  inline size_t staging_buffer_nbytes() const {
    return element_size(dtype()) * staging_buffer_numel();
  }

  /*
   * Return the VmaAllocationCreateInfo of the underlying resource
   */
  VmaAllocationCreateInfo get_allocation_create_info() const;

  /*
   * Return the VkMemoryRequirements of the underlying resource
   */
  VkMemoryRequirements get_memory_requirements() const;

  /*
   * Binds the underlying resource to the given memory allocation
   */
  void bind_allocation(const vkapi::Allocation& allocation);

 private:
  /*
   * Assuming sizes, dim order, or axis mapping was modified, recompute all
   * derived metadata and update metadata UBOs with new values.
   */
  void update_metadata();

  /*
   * Check that tensor sizes are valid given the current storage resource's
   * limits.
   */
  void check_sizes(const std::vector<int64_t>& sizes) const;

 public:
  /*
   * Change how the tensor should be interpreted by compute shaders via updating
   * the size and dim order of the tensor. The new sizes and dim order may have
   * different dimensionality than the current dimensionality of the tensor.
   *
   * This function can only be used for buffer-backed tensors, since texture
   * backed tensors cannot change dimensionality or memory layout.
   *
   * TODO(ssjia): delete this API. prefer functions such as virtual_transpose
   * instead.
   */
  void virtual_reconfigure(
      const std::vector<int64_t>& new_sizes,
      const std::vector<int64_t>& new_dim_order);

  /*
   * Set all metadata of this tensor to match the metadata of another tensor.
   */
  void virtual_clone(const vTensor& other);

  /*
   * Perform a virtual resize of the vTensor by modifying the size metadata that
   * gets used in compute shaders. This allows the shader to treat the
   * underlying resource as if it were a different size. The new sizes cannot
   * modify the dimensionality of the tensor.
   */
  void virtual_resize(const std::vector<int64_t>& new_sizes);

  /*
   * Transpose the tensor in-place by updating its metadata.
   */
  void virtual_transpose(const int64_t dim0, const int64_t dim1);
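
  // Illustrative expected behavior: for a buffer-backed tensor with sizes
  // {2, 3, 8}, calling virtual_transpose(0, 2) would update the sizes to
  // {8, 3, 2} and adjust the dim order / strides so that the same underlying
  // buffer is reinterpreted as the transposed tensor, without moving any data.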

  /*
   * Check if this vTensor instance is a view of another vTensor instance
   */
  inline bool is_view_of(const vTensor& other) const {
    return storage_.is_copy_of(other.storage_);
  }
};

} // namespace api
} // namespace vkcompute