#pragma once

// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName

#ifdef USE_VULKAN_API

#include <ATen/native/vulkan/api/Context.h>
#include <ATen/native/vulkan/api/Types.h>

namespace at {
namespace native {
namespace vulkan {

struct LastAccess {
  api::PipelineStageFlags stage;
  api::MemoryAccessFlags access;

  LastAccess()
      : stage{api::PipelineStage::NO_STAGE},
        access{api::MemoryAccessType::NONE} {}

  LastAccess(
      api::PipelineStageFlags stage_flags,
      api::MemoryAccessFlags access_flags)
      : stage{stage_flags}, access{access_flags} {}
};

class vTensorStorage final {
 public:
  // Do not allow empty vTensorStorage construction
  vTensorStorage() = default;

  vTensorStorage(
      api::Context* context,
      const api::StorageType storage_type,
      const api::GPUMemoryLayout gpu_memory_layout,
      const std::vector<int64_t>& sizes,
      const api::ScalarType dtype,
      const bool allocate_memory = true);

  vTensorStorage(const vTensorStorage&) = delete;
  vTensorStorage& operator=(const vTensorStorage&) = delete;

  vTensorStorage(vTensorStorage&&) = default;
  vTensorStorage& operator=(vTensorStorage&&) = delete;

  ~vTensorStorage();

  friend class vTensor;

 private:
  // Context
  api::Context* context_{};

  api::StorageType storage_type_;

  // Resource sizings
  api::utils::uvec3 extents_{};
  int64_t buffer_length_{};

  // Image Texture
  mutable api::VulkanImage image_;
  mutable api::VulkanBuffer buffer_;

  // Last Access - used to insert memory barriers
  LastAccess last_access_;

 private:
  // Registers underlying memory for cleanup
  void flush();

  // Memory barrier insertion
  void transition(
      api::PipelineBarrier&,
      const api::PipelineStageFlags,
      const api::MemoryAccessFlags);

  // Validation
  void verify() const;

 public:
  inline VkFormat texture_format() {
    return image_.format();
  }

  void discard_and_reallocate(
      const std::vector<int64_t>& gpu_sizes,
      const api::GPUMemoryLayout gpu_memory_layout,
      const api::ScalarType dtype);
};
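// Informal sketch of the hazard tracking that LastAccess enables. This is
// illustrative commentary only; the authoritative logic lives in
// vTensorStorage::transition(), and the exact flag handling there may differ.
//
//   // A compute shader wrote the image in an earlier dispatch:
//   LastAccess prev{
//       api::PipelineStage::COMPUTE, api::MemoryAccessType::WRITE};
//
//   // A later read of the same resource must be ordered after that write, so
//   // transition() records a barrier from {prev.stage, prev.access} to the
//   // new {stage, access} pair in the api::PipelineBarrier it is given, and
//   // then overwrites last_access_ with the new stage and access flags.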
class vTensor final {
 public:
  // Do not allow empty vTensor construction
  vTensor() = default;

  // Main constructor
  vTensor(
      api::Context* context,
      const std::vector<int64_t>& sizes,
      const api::ScalarType dtype,
      const api::StorageType storage_type = api::StorageType::TEXTURE_3D,
      const api::GPUMemoryLayout memory_layout =
          api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
      const bool allocate_memory = true);

  // Constructor for a quantized vTensor
  vTensor(
      api::Context* const context,
      const std::vector<int64_t>& sizes,
      double q_scale,
      int64_t q_zero_point,
      const api::ScalarType dtype,
      const api::StorageType storage_type = api::StorageType::TEXTURE_3D,
      const api::GPUMemoryLayout memory_layout =
          api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);

  // Copy Constructor and Assignment; ideally copying would be disabled
  // (see the reasoning attached to the view_ member below) but it is required
  // for compatibility with OpaqueTensorImpl
  vTensor(const vTensor& other) = default;
  vTensor& operator=(const vTensor& other) = default;

  // Move Constructor and Assignment
  vTensor(vTensor&& other) = default;
  vTensor& operator=(vTensor&& other) = default;

  // Used for passing buffer sizes and strides data to shaders
  struct BufferMetadata {
    api::utils::uvec4 sizes;
    api::utils::uvec4 strides;
    uint32_t ndim;
    uint32_t buffer_length;
  };
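  // Illustrative sketch (not part of this class's API surface): the struct
  // above mirrors what a compute shader expects as a uniform block, so a
  // typical flow is to obtain it from get_cpu_buffer_metadata() (declared
  // below) and wrap it in an api::UniformParamsBuffer for binding. The
  // `tensor` and `context` names here are assumed to exist in the caller.
  //
  //   vTensor::BufferMetadata meta = tensor.get_cpu_buffer_metadata();
  //   api::UniformParamsBuffer meta_params(context, meta);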
 private:
  // Tensor Options
  api::ScalarType dtype_;

  // GPU specific memory layout qualifier
  api::GPUMemoryLayout memory_layout_;

  // Sizes and Strides
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Storage Dimensions. When stored on the GPU, one dimension will be aligned
  // to the next multiple of 4 in order to take advantage of vec4 data types.
  std::vector<int64_t> gpu_sizes_;
  std::vector<int64_t> gpu_strides_;

  // The extents that correspond to the tensor's size metadata. Note that this
  // may not be the same as the extents of the underlying image texture because
  // vTensor can be virtually resized via virtual_resize() which will cause it
  // to be interpreted as a tensor with a different size.
  api::utils::uvec3 virtual_extents_;

  // A Vulkan uniform buffer containing sizes and strides of the GPU buffer
  // that can be passed into a shader.
  api::UniformParamsBuffer metadata_uniform_;

  // A Vulkan uniform buffer containing the tensor sizes that can be passed
  // into a shader.
  std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_uniform_;

  // A Vulkan uniform buffer containing the GPU tensor sizes that can be passed
  // into a shader. GPU sizes refers to the sizes of the tensor after padding
  // has been applied to one dimension to align it to the next multiple of 4.
  std::shared_ptr<api::UniformParamsBuffer> gpu_sizes_uniform_;

  // A Vulkan uniform buffer containing the image extents of the underlying
  // image texture that can be passed into a shader.
  std::shared_ptr<api::UniformParamsBuffer> extents_uniform_;

  // Quantization params
  bool is_quantized_{false};
  double q_scale_{1.0f};
  int64_t q_zero_point_{0u};

  // Even at the cost of a heap allocation plus the resulting negative impact
  // on cache locality due to the subsequent pointer chasing, it is still
  // critical to share the view across vTensor implementations to minimize
  // programmer errors. Ideally this class should have been made movable only
  // and non-copyable - something we cannot do unfortunately due to the inner
  // workings of at::TensorImpl requiring copy semantics in
  // at::TensorImpl::release_resources() to function as expected. Now that this
  // class is copyable, however, a whole new class of bugs becomes possible:
  // two [shallow] copies can have their vTensorStorage objects go out of sync
  // if an operation performed on one copy is not reflected in the other.
  // Technically, if the programmer is very careful, it is possible to avoid
  // this trap and not pay the cost of indirection, but the resulting bugs of
  // missing memory barriers will be so frustrating to hunt down for those
  // unfamiliar with the internal mechanics of this class that I decided to
  // take the performance penalty of this extra layer of indirection in favor
  // of making this class easier to use.
  std::shared_ptr<vTensorStorage> view_;

 public:
  /*
    Texture Access
  */

  inline api::StorageType storage_type() const {
    return view_->storage_type_;
  }

  inline api::VulkanImage& image() const& {
    return view_->image_;
  }

  api::VulkanImage& image(api::PipelineBarrier&, const api::PipelineStageFlags)
      const&;

  api::VulkanImage& image(
      api::PipelineBarrier&,
      const api::PipelineStageFlags,
      const api::MemoryAccessFlags) &;

  inline api::VulkanBuffer& buffer() const& {
    return view_->buffer_;
  }

  api::VulkanBuffer& buffer(
      api::PipelineBarrier&,
      const api::PipelineStageFlags) const&;

  api::VulkanBuffer& buffer(
      api::PipelineBarrier&,
      const api::PipelineStageFlags,
      const api::MemoryAccessFlags) &;

  /*
    Metadata
  */

  inline const api::utils::uvec3& extents() const {
    return view_->extents_;
  }

  /*
   * Extract an `api::ScalarType` from the TensorOptions member
   */
  inline api::ScalarType dtype() const {
    return dtype_;
  }

  /*
   * Get an `api::ScalarType` that corresponds to the image format of the
   * texture
   */
  inline api::ScalarType texture_dtype() const {
    return api::element_scalartype(view_->texture_format());
  }

  inline api::GPUMemoryLayout gpu_memory_layout() const {
    return memory_layout_;
  }

  inline uint32_t gpu_memory_layout_as_uint() const {
    return static_cast<uint32_t>(memory_layout_);
  }

  inline const std::vector<int64_t>& sizes() const {
    return sizes_;
  }

  inline const std::vector<int64_t>& strides() const {
    return strides_;
  }

  inline const std::vector<int64_t>& gpu_sizes() const {
    return gpu_sizes_;
  }

  inline const std::vector<int64_t>& gpu_strides() const {
    return gpu_strides_;
  }

  inline const api::utils::uvec3& virtual_extents() const {
    return virtual_extents_;
  }

  /*
   * Get a uniform buffer containing sizes and strides information of the GPU
   * buffer
   */
  api::VulkanBuffer& buffer_metadata();

  /*
   * Get a uniform buffer object containing the tensor sizes to use in a
   * compute shader. Note that the UBO will be created the first time this
   * function is called.
   */
  std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_ubo();

  /*
   * Get a uniform buffer object containing the tensor GPU sizes to use in a
   * compute shader. Note that the UBO will be created the first time this
   * function is called.
   */
  std::shared_ptr<api::UniformParamsBuffer> gpu_sizes_ubo();

  /*
   * Get a uniform buffer object containing the image extents to use in a
   * compute shader. Note that the UBO will be created the first time this
   * function is called.
   */
  std::shared_ptr<api::UniformParamsBuffer> extents_ubo();

  /*
   * Constructs a BufferMetadata struct based on the original sizes and strides
   * to pass into a shader.
   */
  BufferMetadata get_cpu_buffer_metadata() const;

  inline void set_is_quantized() {
    is_quantized_ = true;
  }

  inline bool is_quantized() const {
    return is_quantized_;
  }

  inline void set_scale(const double q_scale) {
    q_scale_ = q_scale;
  }

  inline double get_scale() const {
    return q_scale_;
  }

  inline float get_scale_float() const {
    return api::utils::safe_downcast<float>(q_scale_);
  }

  inline void set_zero_point(const int64_t q_zero_point) {
    q_zero_point_ = q_zero_point;
  }

  inline int64_t get_zero_point() const {
    return q_zero_point_;
  }

  inline int32_t get_zero_point_int32() const {
    return api::utils::safe_downcast<int32_t>(q_zero_point_);
  }

  inline size_t numel() const {
    return api::utils::multiply_integers(sizes());
  }

  inline size_t nbytes() const {
    return api::element_size(dtype()) * numel();
  }

  /*
   * Returns numel but based on gpu_sizes_ instead of sizes_
   */
  inline size_t gpu_numel() const {
    return api::utils::multiply_integers(gpu_sizes_);
  }

  /*
   * Returns nbytes but based on gpu_sizes_ instead of sizes_
   */
  inline VkDeviceSize gpu_nbytes() const {
    return api::element_size(dtype()) * gpu_numel();
  }

  /*
   * Return the VmaAllocationCreateInfo of the underlying resource
   */
  VmaAllocationCreateInfo get_allocation_create_info() const;

  /*
   * Return the VkMemoryRequirements of the underlying resource
   */
  VkMemoryRequirements get_memory_requirements() const;

  /*
   * Binds the underlying resource to the given memory allocation
   */
  void bind_allocation(const api::MemoryAllocation& allocation);

 private:
  /*
   * Update the size metadata of the vTensor to be new sizes. Should not be
   * used directly; reallocate() or virtual_resize() should be used instead.
   */
  void update_size_metadata(const std::vector<int64_t>& new_sizes);

 public:
  /*
   * Discard the underlying VkImage or VkBuffer and re-allocate based on new
   * tensor sizes
   */
  void reallocate(const std::vector<int64_t>& new_sizes);

  /*
   * Perform a virtual resize of the vTensor by modifying the size metadata
   * that gets used in compute shaders. This allows the shader to treat the
   * underlying resource as if it were a different size.
   */
  void virtual_resize(const std::vector<int64_t>& new_sizes);
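  // Illustrative sketch of the difference between the two resizing paths
  // above; `tensor` is assumed to be an existing vTensor and the sizes are
  // arbitrary example values.
  //
  //   // Discards the old VkImage/VkBuffer and allocates storage that fits
  //   // the new sizes; existing contents are not preserved.
  //   tensor.reallocate({4, 16, 32, 32});
  //
  //   // Keeps the existing allocation but updates the size metadata consumed
  //   // by compute shaders, so the same resource is interpreted as a tensor
  //   // of the new size. The new sizes are expected to fit within the
  //   // originally allocated extents.
  //   tensor.virtual_resize({2, 16, 32, 32});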
};

void add_buffer_barrier(
    api::PipelineBarrier&,
    const api::VulkanBuffer&,
    const api::PipelineStageFlags,
    const api::MemoryAccessFlags,
    const api::PipelineStageFlags,
    const api::MemoryAccessFlags);

} // namespace vulkan
} // namespace native
} // namespace at

#endif /* USE_VULKAN_API */