1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. 7 */ 8 9 #pragma once 10 11 // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName 12 13 #include <executorch/backends/vulkan/runtime/api/Context.h> 14 15 #include <executorch/backends/vulkan/runtime/api/containers/ParamsBuffer.h> 16 17 #include <executorch/backends/vulkan/runtime/utils/StorageUtils.h> 18 19 namespace vkcompute { 20 namespace api { 21 22 /* 23 * Given a GPUMemoryLayout value, produce a dim order vector that matches the 24 * given memory layout. The produced dim order vector will be in the NCHW 25 * dimension order 26 */ 27 std::vector<int64_t> calculate_dim_order( 28 const size_t ndim, 29 const int32_t packed_dim); 30 31 /* 32 * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) 33 * dimension order, calculate the strides of the tensor. 34 */ 35 std::vector<int64_t> calculate_strides( 36 const std::vector<int64_t>& sizes, 37 const std::vector<int64_t>& dim_order); 38 39 std::vector<int64_t> unsqueeze_strides( 40 const std::vector<int64_t>& strides, 41 const int64_t numel); 42 43 /* 44 * When stored on the GPU, tensor data is stored using texels (i.e. a vector of 45 * 4 scalar values) in order to take advantage of the GPU's native vectorization 46 * capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4 47 * types. 48 * 49 * To accommodate these vectorized types, the sizes of a tensor will be modified 50 * for GPU storage in the following ways: 51 * 52 * 1. The dimensionality of the tensor will be padded to a multiple of 4. 53 * 2. The size of the packed dimension will be padded to a multiple of 4. 54 * 55 * The "packed dimension" is determined based on the utils::GPUMemoryLayout 56 * argument. 57 */ 58 std::vector<int64_t> calculate_padded_sizes( 59 const std::vector<int64_t>& sizes, 60 const int32_t packed_dim); 61 62 /* 63 * Calculate the image extents required of a texture backed tensor. 64 */ 65 utils::uvec3 calculate_image_extents( 66 const std::vector<int64_t>& padded_sizes, 67 const std::vector<int64_t>& axis_map, 68 const int32_t packed_dim); 69 70 struct LastAccess { 71 vkapi::PipelineStageFlags stage; 72 vkapi::MemoryAccessFlags access; 73 LastAccessLastAccess74 LastAccess() 75 : stage{vkapi::PipelineStage::NO_STAGE}, 76 access{vkapi::MemoryAccessType::NONE} {} 77 LastAccessLastAccess78 LastAccess( 79 vkapi::PipelineStageFlags stage_flags, 80 vkapi::MemoryAccessFlags access_flags) 81 : stage{stage_flags}, access{access_flags} {} 82 }; 83 84 class vTensorStorage final { 85 public: 86 // Do not allow empty vTensorStorage construction 87 vTensorStorage() = default; 88 89 vTensorStorage( 90 Context* context, 91 const utils::StorageType storage_type, 92 const std::vector<int64_t>& axis_map, 93 const int32_t packed_dim, 94 const std::vector<int64_t>& padded_sizes, 95 const vkapi::ScalarType dtype, 96 const bool allocate_memory = true); 97 98 vTensorStorage(Context* const context, const vkapi::VulkanImage& image); 99 100 protected: 101 /* 102 * This allows for creation of tensors that use the same underlying storage 103 * as another tensor. Note that this functionality is currently enabled for 104 * tensors that have buffer storage only. The created tensor will not have 105 * ownership of the underlying VkBuffer. This constructor is marked protected 106 * because this behaviour is unsafe, since the original tensor may be 107 * destroyed before the copy is destroyed. 108 */ 109 vTensorStorage(vTensorStorage& other, const int64_t buffer_offset = 0); 110 111 public: 112 // To discourage creating copies, the assignment operator is still deleted. 113 vTensorStorage& operator=(const vTensorStorage& other) = delete; 114 115 vTensorStorage(vTensorStorage&& other) = default; 116 vTensorStorage& operator=(vTensorStorage&& other) = default; 117 118 ~vTensorStorage(); 119 120 friend class vTensor; 121 122 private: 123 // Context 124 Context* context_{}; 125 126 utils::StorageType storage_type_; 127 128 // Resource sizings 129 utils::uvec3 image_extents_{}; 130 int64_t buffer_length_{}; 131 int64_t buffer_offset_{}; 132 133 // GPU Storage 134 mutable vkapi::VulkanImage image_; 135 mutable vkapi::VulkanBuffer buffer_; 136 137 // Last Access - used to insert memory barriers 138 LastAccess last_access_; 139 // Indicates whether copies of this vTensorStorage have been made 140 bool has_copies_; 141 142 private: 143 // Registers underlying memory for cleanup 144 void flush(); 145 146 // Memory barrier insertion 147 void transition( 148 vkapi::PipelineBarrier&, 149 const vkapi::PipelineStageFlags, 150 const vkapi::MemoryAccessFlags); 151 152 // Validation 153 void verify() const; 154 155 public: texture_format()156 inline VkFormat texture_format() { 157 return image_.format(); 158 } 159 160 /* 161 * Check if the underlying resource is a copy of another resource 162 */ 163 bool is_copy() const; 164 165 /* 166 * Used for checking if this vTensorStorage is a copy of another instance 167 */ 168 bool is_copy_of(const vTensorStorage& other) const; 169 }; 170 171 class vTensor final { 172 struct TextureLimits { 173 // Alignment is required to conform with Vulkan specification; a 3 or 4 174 // component vector with components of size N must have base alignment of 175 // 4N. 176 alignas(16) utils::ivec3 limits; 177 }; 178 179 public: 180 explicit vTensor( 181 Context* context, 182 const std::vector<int64_t>& sizes, 183 const vkapi::ScalarType dtype, 184 const utils::StorageType storage_type = utils::kTexture3D, 185 const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, 186 const bool allocate_memory = true); 187 188 vTensor(const vTensor& other) = delete; 189 190 explicit vTensor( 191 Context* context, 192 const vkapi::VulkanImage& image, 193 const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked); 194 195 /* 196 * This constructor allows for the creation of a vTensor that references the 197 * same buffer resource of another vTensor, with the same sizes and strides 198 * metadata. The created vTensor will not own the underlying resource. This is 199 * only applicable for buffer backed tensors at the moment. 200 * 201 * Once created, the sizes and strides of the aliased vTensor can be changed 202 * using the `virtual_reconfigure` member function. 203 */ 204 vTensor(vTensor& other); 205 206 /* 207 * This constructor allows for the creation of a vTensor that references the 208 * same buffer resource of another vTensor, but with different sizes and 209 * strides metatdata. The created vTensor will not own the underlying 210 * resource. This is only applicable for buffer backed tensors at the moment. 211 * 212 * Note that dim order is used as the source of truth regarding the strides, 213 * and the new strides are computed from the new sizes and new dim order. 214 * Thus only the dim order is provided as an argument to this function. 215 * 216 * The offset_numel argument allows the aliased tensor's memory region to 217 * begin at an offset of N elements from the start of the original tensor's 218 * buffer. 219 */ 220 vTensor( 221 vTensor& other, 222 const std::vector<int64_t>& sizes, 223 const std::vector<int64_t>& dim_order, 224 const int64_t offset_numel = 0); 225 226 // To discourage making copies, the copy assignment operator is still deleted 227 vTensor& operator=(const vTensor& other) = delete; 228 229 vTensor(vTensor&& other) = default; 230 vTensor& operator=(vTensor&& other) = default; 231 232 private: 233 /* 234 * "Core" tensor metadata. They are the minimum amount of information required 235 * to construct a tensor. 236 */ 237 238 // Whether the tensor has elements of type float, int, etc. 239 vkapi::ScalarType dtype_; 240 // sizes of the tensor in NCHW dimension order 241 std::vector<int64_t> sizes_; 242 // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for 243 // width, 1 for height, etc.). For texture backed tensors, this describes 244 // which dimension is packed along a texel. For buffer backed tensors, this 245 // describes which dimension has a stride of 1 (i.e. is last in the dim 246 // order). 247 int32_t packed_dim_; 248 249 /* 250 * "Layout" metadata. These describe with further detail how tensor data is 251 * laid out in memory. However, they are considered secondary to the "core" 252 * metadata members above because defaults can be assumed based on a given 253 * memory layout. When permuting the tensor without performing a copy, these 254 * metadata members are the ones that will be changed. All other metadata is 255 * derived from a combination of sizes, memory layout, and the below members. 256 */ 257 258 // dim order of the tensor; dimension indices are in NCHW dimension order 259 // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger 260 // strides precede the dims with smaller strides in the dim order. The last 261 // dim is always the fastest moving dim with a stride of 1. 262 std::vector<int64_t> dim_order_; 263 // Describes which axis of an image texture each dimension of the tensor maps 264 // to. The axis mapping allows texture based tensors to be permuted and 265 // transposed without modifying the underlying texture storage. For a more in 266 // depth explanation of axis mapping, see the `default_axis_map()` 267 // function. 268 std::vector<int64_t> axis_map_; 269 270 /* 271 * The below can be consider "layout" metadata as well, but are derived from 272 * the above data members. 273 */ 274 275 // strides of the tensor in NCHW dimension order 276 std::vector<int64_t> strides_; 277 // Contains the number of elements in the tensor according to the canonical 278 // sizes. 279 size_t numel_; 280 281 /* 282 * The below metadata members are derived from the above, and are typically 283 * to i.e. pass tensor metadata to compute shaders. 284 */ 285 286 // padded sizes of the tensor in NCHW dimension order. See the 287 // calculate_padded_sizes() function for more context. Note that padded sizes 288 // are only used for texture storage, and not for buffer storage. 289 std::vector<int64_t> padded_sizes_; 290 // Contains the strides of the tensor, with the dimensionality padded to the 291 // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. 292 std::vector<int64_t> unsqueezed_strides_; 293 // Contains the number of elements in the tensor according to the padded 294 // sizes. 295 size_t padded_numel_; 296 // See the comments documenting logical_limits() for more context. 297 TextureLimits logical_limits_; 298 299 /* 300 * Utility GPU buffers that can be passed to shaders in order to convey tensor 301 * metadata. These buffers will be initialized the first time they are 302 * accessed via the corresponding *_ubo() function, and their contents will be 303 * updated whenever virtual_resize() is called. 304 * 305 * Refer to the comments for the corresponding *_ubo() functions for more 306 * context about the data contained in each buffer. 307 */ 308 ParamsBuffer sizes_uniform_; 309 ParamsBuffer strides_uniform_; 310 ParamsBuffer numel_uniform_; 311 ParamsBuffer logical_limits_uniform_; 312 313 vTensorStorage storage_; 314 315 public: 316 /* 317 Texture Access 318 */ 319 image()320 inline vkapi::VulkanImage& image() const& { 321 return storage_.image_; 322 } 323 324 vkapi::VulkanImage& image( 325 vkapi::PipelineBarrier&, 326 const vkapi::PipelineStageFlags) &; 327 328 vkapi::VulkanImage& image( 329 vkapi::PipelineBarrier&, 330 const vkapi::PipelineStageFlags, 331 const vkapi::MemoryAccessFlags) &; 332 buffer()333 inline vkapi::VulkanBuffer& buffer() const& { 334 return storage_.buffer_; 335 } 336 337 vkapi::VulkanBuffer& buffer( 338 vkapi::PipelineBarrier&, 339 const vkapi::PipelineStageFlags) &; 340 341 vkapi::VulkanBuffer& buffer( 342 vkapi::PipelineBarrier&, 343 const vkapi::PipelineStageFlags, 344 const vkapi::MemoryAccessFlags) &; 345 346 /* 347 Metadata 348 */ 349 storage_type()350 inline utils::StorageType storage_type() const { 351 return storage_.storage_type_; 352 } 353 has_buffer_storage()354 inline bool has_buffer_storage() const { 355 return storage_.storage_type_ == utils::kBuffer; 356 } 357 358 private: 359 void set_logical_limits(const utils::uvec3& image_extents); 360 361 public: 362 /* 363 * The logical limits of the tensor are derived from the image extents of the 364 * image texture used to store the tensor, but with two key differences. 365 * 366 * First, the image extents are permuted according to the axis map. This 367 * makes it so that the first element of the logical limit is the limit of the 368 * texture axis corresponding to the width dimension of the tensor, the next 369 * element is the limit of the texture axis corresponding to the height 370 * dimension and the last element is the limit of the texture axis that 371 * corresponds to the channels dimension of the tensor. 372 * 373 * Second, the logical limits may use smaller extents than the actual image 374 * extents of the image texture. This is due to dynamic shape; if the tensor's 375 * `virtual_resize()` function is called, then the logical limits will reflect 376 * the extents that would be needed to support a tensor with the updated sizes 377 * instead of the original sizes. 378 */ logical_limits()379 inline const utils::ivec3& logical_limits() const { 380 return logical_limits_.limits; 381 } 382 383 /* 384 * Extract an `vkapi::ScalarType` from the TensorOptions member 385 */ dtype()386 inline vkapi::ScalarType dtype() const { 387 return dtype_; 388 } 389 390 /* 391 * Provide a "best guess" of a memory layout that can be used to construct a 392 * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this 393 * tensor. In some scenarios, the exact layout of the tensor may not be able 394 * to be replicated due to calling `virtual_*()` functions after construction; 395 * however, this function will provide a memory layout that will produce the 396 * same `packed_dim_` as this tensor. 397 */ 398 utils::GPUMemoryLayout estimate_memory_layout() const; 399 packed_dim()400 inline int32_t packed_dim() const { 401 return packed_dim_; 402 } 403 404 /* 405 * Returns the WHCN index of the dimension that is used to concatenate batches 406 * as an int32_t. 407 */ concat_dim()408 inline int32_t concat_dim() const { 409 return utils::safe_downcast<int32_t>(axis_map_.at(3)); 410 } 411 sizes()412 inline const std::vector<int64_t>& sizes() const { 413 return sizes_; 414 } 415 size(size_t dim)416 inline const int64_t size(size_t dim) const { 417 return sizes().at(dim); 418 } 419 dim()420 inline const int64_t dim() const { 421 return sizes_.size(); 422 } 423 dim_order()424 inline const std::vector<int64_t>& dim_order() const { 425 return dim_order_; 426 } 427 axis_map()428 inline const std::vector<int64_t>& axis_map() const { 429 return axis_map_; 430 } 431 432 /* 433 * Returns a single int32_t that contains the values of the axis map and the 434 * packed dimension packed into a single int32_t, such that it can be used as 435 * a specialization constant in a compute shader. This allows for the SPIR-V 436 * to bytecode compilation to perform compile-time unfolding on the axis map. 437 * Each element of the axis map and the value of the packed dimension take up 438 * 4 bits in the packed int32_t. 439 */ hashed_layout()440 inline int32_t hashed_layout() const { 441 return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + 442 (axis_map_.at(3) << 12) + (packed_dim_ << 16); 443 } 444 445 /* 446 * Return true if the tensor's axis map is {0, 1, 2, concat_dim}. This means 447 * that the width dim is mapped to the width axis of the texture, the height 448 * dim is mapped to the height axis of the texture, the channels dim is mapped 449 * to the depth axis of the texture. 450 */ has_standard_axis_map()451 inline bool has_standard_axis_map() const { 452 return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; 453 } 454 strides()455 inline const std::vector<int64_t>& strides() const { 456 return strides_; 457 } 458 unsqueezed_strides()459 inline const std::vector<int64_t>& unsqueezed_strides() const { 460 return unsqueezed_strides_; 461 } 462 463 /* 464 * Returns a GPU buffer containing the sizes of the tensor in WHCN order. 465 * Note that dimensions that are not present in the tensor's sizes are set to 466 * a size of 1. 467 */ 468 const vkapi::BufferBindInfo sizes_ubo(); 469 470 /* 471 * Returns a GPU buffer containing the strides of the tensor in WHCN order. 472 * Note that the strides are extended to a dimensionality that is a multiple 473 * of 4, thus dimensions that are not present in the tensor's sizes are set to 474 * have a stride equal to the stride of the "slowest moving" dimension. 475 */ 476 const vkapi::BufferBindInfo strides_ubo(); 477 478 /* 479 * Returns a GPU buffer containing the logical limits of the tensor. See the 480 * comments for logical_limits() for more context. 481 */ 482 const vkapi::BufferBindInfo logical_limits_ubo(); 483 484 /* 485 * Returns the number of elements in the buffer used to store the tensor. 486 */ 487 const vkapi::BufferBindInfo numel_ubo(); 488 numel()489 inline size_t numel() const { 490 return numel_; 491 } 492 nbytes()493 inline size_t nbytes() const { 494 return element_size(dtype()) * numel(); 495 } 496 497 /* 498 * Returns numel but based on padded_sizes_ instead of sizes_ 499 */ padded_numel()500 inline size_t padded_numel() const { 501 return padded_numel_; 502 } 503 504 size_t staging_buffer_numel() const; 505 staging_buffer_nbytes()506 inline size_t staging_buffer_nbytes() const { 507 return element_size(dtype()) * staging_buffer_numel(); 508 } 509 510 /* 511 * Return the VmaAllocationCreateInfo of the underlying resource 512 */ 513 VmaAllocationCreateInfo get_allocation_create_info() const; 514 515 /* 516 * Return the VkMemoryRequirements of the underlying resource 517 */ 518 VkMemoryRequirements get_memory_requirements() const; 519 520 /* 521 * Binds the underlying resource to the given memory allocation 522 */ 523 void bind_allocation(const vkapi::Allocation& allocation); 524 525 private: 526 /* 527 * Assuming sizes, dim order, or axis mapping was modified, recompute all 528 * derived metadata and update metadata UBO with new values. 529 */ 530 void update_metadata(); 531 532 /* 533 * Check that tensor sizes are valid given the current storage resource's 534 * limits. 535 */ 536 void check_sizes(const std::vector<int64_t>& sizes) const; 537 538 public: 539 /* 540 * Change how the tensor should be interpreted by compute shaders via updating 541 * the size and dim order of the tensor. The new sizes and dim order may have 542 * different dimensionality than the current dimensionality of the tensor. 543 * 544 * This function can only be used for buffer-backed tensors, since texture 545 * backed buffers cannot change dimensionality or memory layout. 546 * 547 * TODO(ssjia): delete this API. prefer functions such as virtual_transpose 548 * instead. 549 */ 550 void virtual_reconfigure( 551 const std::vector<int64_t>& new_sizes, 552 const std::vector<int64_t>& new_dim_order); 553 554 /* 555 * Set all metadata of this tensor to match the metadata of another tensor. 556 */ 557 void virtual_clone(const vTensor& other); 558 559 /* 560 * Perform a virtual resize of the vTensor by modifying the size metadata that 561 * gets used in compute shaders. This allows the shader to treat the 562 * underlying resource as if it were a different size. The new sizes cannot 563 * modify the dimensionality of the tensor. 564 */ 565 void virtual_resize(const std::vector<int64_t>& new_sizes); 566 567 /* 568 * Transpose the tensor in-place by updating its metadata. 569 */ 570 void virtual_transpose(const int64_t dim0, const int64_t dim1); 571 572 /* 573 * Check if this vTensor instance is a view of another vTensor instance 574 */ is_view_of(const vTensor & other)575 inline bool is_view_of(const vTensor& other) const { 576 return storage_.is_copy_of(other.storage_); 577 } 578 }; 579 580 } // namespace api 581 } // namespace vkcompute 582