/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName

#include
#include
#include
#include
#include
#include
#include
#include

namespace vkcompute {

// Define valid scalar types that the Value class can
// accept
template <typename T>
struct is_valid_scalar_type : std::false_type {};

template <>
struct is_valid_scalar_type<int64_t> : std::true_type {};

template <>
struct is_valid_scalar_type<double> : std::true_type {};

template <>
struct is_valid_scalar_type<bool> : std::true_type {};

//
// Guarded Pointer Classes
//

class ComputeGraph;

#define DECL_VALUE_PTR_CLASS(classname, ctype)                         \
  class classname final {                                              \
    ComputeGraph* const graph_;                                        \
    ctype* ptr_;                                                       \
                                                                       \
   public:                                                             \
    explicit classname(ComputeGraph* const graph, const ValueRef idx); \
    ctype* operator->() const;                                         \
    ctype& operator*() const;                                          \
    ~classname();                                                      \
  };

DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor)
DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef)
DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer)
DECL_VALUE_PTR_CLASS(IntListPtr, std::vector<int64_t>)
DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector<double>)
DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector<bool>)
DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector<ValueRef>)
DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt);

#undef DECL_VALUE_PTR_CLASS

//
// TmpTensor
//

/*
 * This struct is used to recycle the memory of temporary tensors that are
 * created during the execution of a node. Upon construction, this struct will
 * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance
 * to see if any shared objects are available; if not, a new one is created. A
 * tensor value is then added to the `ComputeGraph` instance with the requested
 * specifications. Upon destruction, the shared object index of the temporary
 * tensor is returned to `tmp_shared_object_idxs_`.
 *
 * Note that instances of this struct can be used as if they were `ValueRef`
 * due to the implementation of a custom casting operator.
 *
 * This struct should only be used to create tensors whose lifetimes exist only
 * in a well defined scope (i.e. within a function).
 */
struct TmpTensor {
  ComputeGraph* graph_p;
  int64_t sobj_idx;
  ValueRef vref;

  //
  // Match all available overloads of `add_tensor`
  //

  TmpTensor(
      ComputeGraph* const graph_ptr,
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type,
      const utils::GPUMemoryLayout memory_layout);

  TmpTensor(
      ComputeGraph* const graph_ptr,
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type);

  TmpTensor(
      ComputeGraph* const graph_ptr,
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::GPUMemoryLayout memory_layout);

  TmpTensor(
      ComputeGraph* const graph_ptr,
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype);

  // No copy construction or assignment
  TmpTensor(TmpTensor& other) = delete;
  TmpTensor& operator=(TmpTensor& other) = delete;

  // No move construction or assignment
  TmpTensor(TmpTensor&& other) = delete;
  TmpTensor& operator=(TmpTensor&& other) = delete;

  // Custom cast to ValueRef
  operator ValueRef() const {
    return vref;
  };

  ~TmpTensor();

 private:
  // Helper function to get the first available shared object index or request
  // a new one to be created.
  int64_t get_sobj_idx();
};
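// Illustrative sketch of the intended `TmpTensor` usage pattern described
// above. The function name and its arguments are hypothetical; only the
// TmpTensor constructors and the implicit ValueRef conversion come from this
// header:
//
//   void add_some_op_node(ComputeGraph& graph, const ValueRef out) {
//     // Allocates (or recycles) a shared object for the temporary tensor
//     TmpTensor scratch(&graph, graph.sizes_of(out), graph.dtype_of(out));
//     // `scratch` converts implicitly to ValueRef, so it can be passed
//     // anywhere a ValueRef is expected while this scope is alive
//     ValueRef scratch_ref = scratch;
//     (void)scratch_ref;
//   } // destructor returns the shared object index to tmp_shared_object_idxs_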
//
// ComputeGraph
//

/*
 * This is the core data structure used to execute Vulkan models in graph mode.
 * As opposed to ATen/eager mode, where a command buffer is encoded for every
 * inference (since ops are executed with the model), in graph mode the ops
 * that compose the model are intended to be parsed only once, upon which a
 * command buffer will be encoded. Model inference will then execute the
 * cached command buffer without needing to encode a new one.
 */
class ComputeGraph final {
 public:
  explicit ComputeGraph(GraphConfig config);

  ComputeGraph(ComputeGraph&&) = default;
  ComputeGraph& operator=(ComputeGraph&&) = default;

  ~ComputeGraph();

 private:
  GraphConfig config_;
  vkapi::DescriptorPoolConfig prepack_descriptor_counts_;
  vkapi::DescriptorPoolConfig execute_descriptor_counts_;

  std::unique_ptr<api::Context> context_;
  std::vector<SharedObject> shared_objects_;
  // This stack is used by `TmpTensor` instances to recycle shared objects
  // for temporary tensors. See the comments of `TmpTensor` for more details
  std::stack<int64_t> tmp_shared_object_idxs_;

  std::vector<Value> values_;
  std::vector<api::ParamsBuffer> param_ubos_;

  std::vector<std::unique_ptr<PrepackNode>> prepack_nodes_;
  std::vector<std::unique_ptr<ExecuteNode>> execute_nodes_;

  std::vector<IOValueRef> inputs_;
  std::vector<IOValueRef> outputs_;

 protected:
  size_t values_in_use_ = 0;

 public:
  //
  // Accessors
  //

  inline api::Context* context() {
    return context_.get();
  }

  inline std::vector<IOValueRef>& inputs() {
    return inputs_;
  }

  inline std::vector<IOValueRef>& outputs() {
    return outputs_;
  }

  inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
    return prepack_nodes_;
  }

  inline std::vector<std::unique_ptr<ExecuteNode>>& execute_nodes() {
    return execute_nodes_;
  }

  inline GraphConfig& graphconfig() {
    return config_;
  }

  //
  // Value Extraction
  //

#define GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ptr_type, short_name, type_name) \
  inline ptr_type get_##short_name(const ValueRef idx) {                   \
    return ptr_type(this, idx);                                            \
  }                                                                        \
  inline bool val_is_##short_name(const ValueRef idx) {                    \
    return values_.at(idx).is##type_name();                                \
  }

  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(vTensorPtr, tensor, Tensor)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(TensorRefPtr, tref, TensorRef)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(StagingPtr, staging, Staging)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(IntListPtr, int_list, IntList)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList)
  GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt);

#undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS

#define GET_AND_CHECK_VAL_AS_TYPE_FNS(ctype, short_name, type_name) \
  inline ctype get_##short_name(const ValueRef idx) {               \
    return values_.at(idx).to##type_name();                         \
  }                                                                 \
  inline bool val_is_##short_name(const ValueRef idx) {             \
    return values_.at(idx).is##type_name();                         \
  }

  GET_AND_CHECK_VAL_AS_TYPE_FNS(int64_t, int, Int)
  GET_AND_CHECK_VAL_AS_TYPE_FNS(double, double, Double)
  GET_AND_CHECK_VAL_AS_TYPE_FNS(bool, bool, Bool)
  GET_AND_CHECK_VAL_AS_TYPE_FNS(std::string, string, String)

#undef GET_AND_CHECK_VAL_AS_TYPE_FNS

  inline bool val_is_none(const ValueRef idx) {
    return idx == kDummyValueRef ? true : values_.at(idx).isNone();
  }

  inline TypeTag get_val_type(const ValueRef idx) {
    return values_.at(idx).type();
  }
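  // For reference, each instantiation above generates a typed getter and a
  // type-check predicate. For example, GET_AND_CHECK_VAL_AS_TYPE_FNS(int64_t,
  // int, Int) expands to the equivalent of:
  //
  //   inline int64_t get_int(const ValueRef idx) {
  //     return values_.at(idx).toInt();
  //   }
  //   inline bool val_is_int(const ValueRef idx) {
  //     return values_.at(idx).isInt();
  //   }
  //
  // and GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(vTensorPtr, tensor, Tensor)
  // similarly produces get_tensor(idx), which returns a guarded vTensorPtr,
  // and val_is_tensor(idx).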
  //
  // Tensor Properties Accessors
  //

  std::vector<int64_t> sizes_of(const ValueRef idx) const;

  /*
   * Returns the size of the tensor at `idx` along the specified dimension.
   * Negative indexing is allowed.
   */
  template <typename T>
  T size_at(const int64_t dim, const ValueRef idx) const {
    const Value& val = values_.at(idx);
    if (val.isTensor()) {
      return static_cast<T>(utils::val_at(dim, val.toConstTensor().sizes()));
    } else if (val.isTensorRef()) {
      return static_cast<T>(utils::val_at(dim, val.toConstTensorRef().sizes));
    }
    VK_THROW("Could not get sizes of value with type ", val.type());
  }

  int64_t dim_of(const ValueRef idx) const;

  std::vector<int64_t> dim_order_of(const ValueRef idx) const;

  std::vector<int64_t> strides_of(const ValueRef idx) const;

  vkapi::ScalarType dtype_of(const ValueRef idx) const;

  inline const utils::ivec3& logical_limits_of(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().logical_limits();
  }

  inline int32_t numel_of(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().numel();
  }

  inline utils::StorageType storage_type_of(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().storage_type();
  }

  inline bool is_buffer_storage(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().has_buffer_storage();
  }

  inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base)
      const {
    return values_.at(maybe_view)
        .toConstTensor()
        .is_view_of(values_.at(base).toConstTensor());
  }

  inline utils::GPUMemoryLayout estimate_memory_layout_of(
      const ValueRef idx) const {
    return values_.at(idx).toConstTensor().estimate_memory_layout();
  }

  inline int32_t hashed_layout_of(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().hashed_layout();
  }

  inline int32_t packed_dim_of(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().packed_dim();
  }

  inline int32_t concat_dim_of(const ValueRef idx) const {
    return values_.at(idx).toConstTensor().concat_dim();
  }

  inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) {
    return values_.at(idx).toTensor().sizes_ubo();
  }

  inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) {
    return values_.at(idx).toTensor().strides_ubo();
  }

  inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) {
    return values_.at(idx).toTensor().numel_ubo();
  }

  inline bool has_standard_axis_map(const ValueRef idx) {
    return values_.at(idx).toTensor().has_standard_axis_map();
  }

  inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) {
    return values_.at(idx).toTensor().logical_limits_ubo();
  }

  //
  // Scalar Value Extraction
  //

  template <typename T>
  T extract_scalar(const ValueRef idx) {
    Value& value = values_.at(idx);
    if (value.isInt()) {
      return static_cast<T>(value.toInt());
    }
    if (value.isDouble()) {
      return static_cast<T>(value.toDouble());
    }
    if (value.isBool()) {
      return static_cast<T>(value.toBool());
    }
    VK_THROW("Cannot extract scalar from Value with type ", value.type());
  }

  template <typename T>
  std::optional<T> extract_optional_scalar(const ValueRef idx) {
    if (val_is_none(idx)) {
      return ::std::nullopt;
    } else {
      return extract_scalar<T>(idx);
    }
  }

  std::string extract_string(const ValueRef idx) {
    return values_.at(idx).toString();
  }

  template <
      typename T,
      typename std::enable_if<
          std::is_integral<T>::value && std::is_signed<T>::value,
          int>::type = 0>
  T extract_whcn_dim(const ValueRef idx, const int64_t ndim) {
    T dim = extract_scalar<T>(idx);
    // Normalize dim to account for negative indexing
    dim = (dim % ndim + ndim) % ndim;
    // Assume original value is NCHW ordering, obtain the WHCN ordering
    return ndim - 1 - dim;
  }
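  // Worked example of the NCHW -> WHCN conversion performed by
  // extract_whcn_dim above, for a 4-dimensional tensor (ndim = 4). The dim
  // values shown are illustrative inputs, not part of the API:
  //
  //   dim = -1 (the W dim in NCHW)  ->  normalized to 3  ->  4 - 1 - 3 = 0
  //   dim =  1 (the C dim in NCHW)  ->  normalized to 1  ->  4 - 1 - 1 = 2
  //
  // i.e. the returned value indexes the same logical dimension, but counted
  // from the fastest-moving (width) dimension first.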
  //
  // Utility functions
  //

  /*
   * Returns a suggested storage type (i.e. buffer or texture) that can be used
   * to construct `api::vTensor`s. The storage type is typically determined by
   * the GPU reported by the Vulkan context, unless a storage type override is
   * defined in the graph configuration. Some GPU architectures work better
   * with buffer storage, and others with texture storage. Currently, only
   * texture storage is supported.
   */
  utils::StorageType suggested_storage_type();

  /*
   * Returns a suggested memory layout (i.e. channels, width, or height packed)
   * that can be used to construct `api::vTensor`s. The memory layout impacts
   * which dimension will be treated as the vectorized dimension. For texture
   * storage, elements along the vectorized dimension are packed into texels.
   * The suggested memory layout is determined based on the sizes of the
   * tensor, unless a memory layout override is defined in the graph
   * configuration.
   */
  utils::GPUMemoryLayout suggested_memory_layout(
      const std::vector<int64_t>& sizes);

  //
  // Graph Building
  //

 private:
  void check_no_active_value_ptrs();

 public:
  /*
   * Add a `api::vTensor` value to the graph with the specified properties.
   * There are various convenience overloads of this function that may be used
   * instead.
   */
  ValueRef add_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type,
      const utils::GPUMemoryLayout memory_layout,
      const int64_t shared_object_idx = -1);

  /*
   * Add a `api::vTensor` value to the graph with the specified properties. The
   * suggested memory layout will be used to construct the `api::vTensor`.
   */
  ValueRef add_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type,
      const int64_t shared_object_idx = -1);

  /*
   * Add a `api::vTensor` value to the graph with the specified properties. The
   * suggested storage type will be used to construct the `api::vTensor`.
   */
  ValueRef add_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::GPUMemoryLayout memory_layout,
      const int64_t shared_object_idx = -1);

  /*
   * Add a `api::vTensor` value to the graph with the specified properties. The
   * suggested storage type and memory layout will be used to construct the
   * `api::vTensor`.
   */
  ValueRef add_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const int64_t shared_object_idx = -1);

  /*
   * Add a `api::vTensor` value to the graph with the specified image.
   */
  ValueRef add_tensor(const vkapi::VulkanImage& image);

  /*
   * Add a `api::vTensor` value to the graph with the properties of `vref`.
   */
  ValueRef add_tensor_like(
      const ValueRef vref,
      const utils::StorageType storage_type,
      const utils::GPUMemoryLayout memory_layout);

  /*
   * Add a `api::vTensor` value to the graph with the properties of `vref`. The
   * suggested storage type will be used to construct the `api::vTensor`.
   */
  ValueRef add_tensor_like(
      const ValueRef vref,
      const utils::GPUMemoryLayout memory_layout);

  /*
   * Use the copy constructor of `api::vTensor` to create a "view" of the
   * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for
   * more details.
   */
  ValueRef add_tensor_view(const ValueRef vref);

  /*
   * Use the copy constructor of `api::vTensor` to create a "view" of the
   * `vTensor` value at `vref` with different sizes and dim order. See the copy
   * constructor of `api::vTensor` for more details.
   */
  ValueRef add_tensor_view(
      const ValueRef vref,
      const std::vector<int64_t>& sizes,
      const std::vector<int64_t>& dim_order,
      const size_t offset_numel = 0);
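  // Illustrative sketch of how the add_tensor overloads above are typically
  // combined with the suggested_* helpers. The sizes are hypothetical, and
  // `vkapi::kFloat` is assumed here to name the float scalar type:
  //
  //   GraphConfig config;
  //   ComputeGraph graph(config);
  //   std::vector<int64_t> sizes = {1, 64, 64, 4};
  //   // Storage type and memory layout fall back to the suggested values
  //   ValueRef a = graph.add_tensor(sizes, vkapi::kFloat);
  //   // Explicit memory layout; storage type falls back to the suggested one
  //   ValueRef b = graph.add_tensor(
  //       sizes, vkapi::kFloat, graph.suggested_memory_layout(sizes));
  //   // Two tensors sharing backing memory via the same shared object index
  //   ValueRef c = graph.add_tensor(sizes, vkapi::kFloat, /*shared_object_idx=*/0);
  //   ValueRef d = graph.add_tensor(sizes, vkapi::kFloat, /*shared_object_idx=*/0);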
  /*
   * Add a `TensorRef` value to the graph with the specified properties. A
   * `TensorRef` is a reference to a `api::vTensor` whose data is stored in an
   * external CPU buffer.
   */
  ValueRef add_tensorref(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const void* const data);

  /*
   * Add a staging buffer to the graph. Staging buffers are data buffers that
   * use memory that is visible to both the CPU and GPU, and are therefore used
   * as an intermediary when transferring data between the CPU and GPU.
   */
  ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel);

  ValueRef add_none();

  template <typename T>
  typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
  add_scalar(T value);

  template <typename T>
  typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
  add_scalar_list(std::vector<T>&& value);

  ValueRef add_value_list(std::vector<ValueRef>&& value);

  ValueRef add_string(std::string&& str);

  ValueRef add_symint(const int32_t val);

  ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
  ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);

  template <typename Block>
  vkapi::BufferBindInfo create_params_buffer(const Block& data) {
    param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data));
    return vkapi::BufferBindInfo(param_ubos_.back().buffer());
  }

  /*
   * Given a ValueRef, do the following depending on the type of the Value:
   * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object
   *   backing the SymInt.
   * - If it is a regular Int, create a new ParamsBuffer using the integer
   *   value and return the BufferBindInfo of the created ParamsBuffer.
   */
  vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx);

  void set_symint(const ValueRef idx, const int32_t val);

  int32_t read_symint(const ValueRef idx);

  inline void set_val_as_input(const ValueRef idx) {
    inputs_.push_back({idx, kDummyValueRef});
  }

  inline void set_val_as_output(const ValueRef idx) {
    outputs_.push_back({idx, kDummyValueRef});
  }
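  // Illustrative sketch of the parameter UBO helpers above. The `Params`
  // struct and the integer values are hypothetical and not part of this
  // header:
  //
  //   struct Params {
  //     int32_t ntexels;
  //   };
  //
  //   // Creates a ParamsBuffer in param_ubos_ and returns its bind info
  //   vkapi::BufferBindInfo params_ubo =
  //       graph.create_params_buffer(Params{64});
  //
  //   // For a SymInt value, this returns the bind info of the ParamsBuffer
  //   // backing the SymInt, so the bound integer can later be updated via
  //   // set_symint(); for a plain Int, it creates a new ParamsBuffer.
  //   ValueRef len = graph.add_symint(8);
  //   vkapi::BufferBindInfo len_ubo = graph.get_or_create_int_param_buffer(len);
  //   graph.set_symint(len, 16); // shaders bound to len_ubo observe the update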
  /*
   * Convenience function to add an input tensor along with its staging buffer
   */
  inline IOValueRef add_input_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const int64_t shared_object_idx = -1) {
    ValueRef t = add_tensor(sizes, dtype, shared_object_idx);
    ValueRef staging = set_input_tensor(t);
    return {t, staging};
  }

  /*
   * Convenience function to add an input tensor with a specific memory layout
   * along with its staging buffer
   */
  inline IOValueRef add_input_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::GPUMemoryLayout memory_layout,
      const int64_t shared_object_idx = -1) {
    ValueRef t = add_tensor(sizes, dtype, memory_layout, shared_object_idx);
    ValueRef staging = set_input_tensor(t);
    return {t, staging};
  }

  /*
   * Convenience function to add an input tensor with a specific storage type
   * along with its staging buffer
   */
  inline IOValueRef add_input_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type,
      const int64_t shared_object_idx = -1) {
    ValueRef t = add_tensor(sizes, dtype, storage_type, shared_object_idx);
    ValueRef staging = set_input_tensor(t);
    return {t, staging};
  }

  /*
   * Add an input tensor with the specified properties along with its staging
   * buffer.
   */
  inline IOValueRef add_input_tensor(
      const std::vector<int64_t>& sizes,
      const vkapi::ScalarType dtype,
      const utils::StorageType storage_type,
      const utils::GPUMemoryLayout memory_layout,
      const int64_t shared_object_idx = -1) {
    ValueRef t = add_tensor(
        sizes, dtype, storage_type, memory_layout, shared_object_idx);
    ValueRef staging = set_input_tensor(t);
    return {t, staging};
  }

  SharedObject& get_shared_object(const int64_t idx);

  //
  // Graph Preparation
  //

  void update_descriptor_counts(
      const vkapi::ShaderInfo& shader_info,
      bool execute);

  void prepare();

  //
  // Dispatch Utilities
  //

  /*
   * Create a global workgroup size for a given `api::vTensor` value assuming
   * that every shader invocation calculates one texel element of the output
   * tensor.
   *
   * For tensors that use texture storage, the image extents of the
   * `api::vTensor` will be used to set the global workgroup size.
   *
   * For tensors that use buffer storage, the number of texels in the texel
   * buffer will be used to set the x component of the global workgroup size.
   * All other components will be set to 1 (i.e. {ntexels, 1, 1} will be
   * returned).
   */
  utils::uvec3 create_global_wg_size(const ValueRef idx);

  /*
   * Suggest a local workgroup size for a given global workgroup size.
   *
   * The local workgroup size will be formed to try to minimize the number of
   * inactive invocations.
   *
   * Currently, the local workgroup size is hard-coded to contain a total of 64
   * shader invocations. In the future, this value can be configured.
   */
  utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size);

  /*
   * Convenience function to suggest a local workgroup size for a given
   * `api::vTensor` value, assuming that every shader invocation calculates one
   * texel element of the output tensor.
   */
  utils::uvec3 create_local_wg_size(const ValueRef idx);

  //
  // Input/Output
  //

  void copy_into_staging(
      const ValueRef idx,
      const void* data,
      const size_t numel);
  void copy_from_staging(const ValueRef idx, void* data, const size_t numel);

  //
  // Graph Prepacking
  //

  void encode_prepack();
  void prepack() const;

  //
  // Graph Execution
  //

  void encode_execute();
  void execute() const;

  //
  // Dynamic Shape support
  //

  void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
  void propagate_resize();

  //
  // Miscellaneous Utilities
  //

  inline bool int16_shader_types_enabled() const {
    return context_->adapter_ptr()->supports_int16_shader_types();
  }

  /*
   * Check whether the GPU supports 8 bit buffers.
   */
  inline bool int8_buffers_enabled() const {
    return context_->adapter_ptr()->has_full_int8_buffers_support();
  }

  //
  // Debug support (implemented in Logging.cpp)
  //

  void print_readable();

  //
  // Friend classes
  //

  friend class vTensorPtr;
  friend class TensorRefPtr;
  friend class StagingPtr;
  friend class IntListPtr;
  friend class DoubleListPtr;
  friend class BoolListPtr;
  friend class ValueListPtr;
  friend class SymIntPtr;

  friend struct TmpTensor;
};

template <typename T>
inline typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
ComputeGraph::add_scalar(T value) {
  ValueRef idx(static_cast<int>(values_.size()));
  check_no_active_value_ptrs();
  values_.emplace_back(value);
  return idx;
}

template <typename T>
inline typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
ComputeGraph::add_scalar_list(std::vector<T>&& value) {
  ValueRef idx(static_cast<int>(values_.size()));
  check_no_active_value_ptrs();
  values_.emplace_back(std::move(value));
  return idx;
}

} // namespace vkcompute
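// Rough end-to-end sketch of the graph lifecycle declared above, based only on
// the declarations in this header. The sizes, input data, `vkapi::kFloat`, and
// the IOValueRef field names (`value`, `staging`) are assumptions for
// illustration, and the exact ordering requirements are not specified here:
//
//   GraphConfig config;
//   vkcompute::ComputeGraph graph(config);
//
//   std::vector<int64_t> sizes = {1, 4, 4};
//   IOValueRef in = graph.add_input_tensor(sizes, vkapi::kFloat);
//   ValueRef out = graph.add_tensor(sizes, vkapi::kFloat);
//   ValueRef out_staging = graph.set_output_tensor(out);
//   // ... add prepack/execute nodes that read `in.value` and write `out` ...
//
//   graph.prepare();
//   graph.encode_prepack();
//   graph.prepack();
//   graph.encode_execute();
//
//   std::vector<float> input_data(graph.numel_of(in.value), 1.0f);
//   graph.copy_into_staging(in.staging, input_data.data(), input_data.size());
//   graph.execute();
//
//   std::vector<float> output_data(graph.numel_of(out));
//   graph.copy_from_staging(
//       out_staging, output_data.data(), output_data.size());
//
//   // Dynamic shapes: update an input's sizes and propagate through the graph
//   graph.resize_input(0, {1, 2, 2});
//   graph.propagate_resize();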