/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/DispatchNode.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

namespace vkcompute {

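// Records an execute node that dispatches the nchw_to_tensor shader to copy
// NCHW data from a staging buffer into the packed GPU representation of
// out_tensor. Buffer-backed tensors bind sizes, strides, and numel UBOs;
// texture-backed tensors bind only the sizes UBO.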
void add_staging_to_tensor_node(
    ComputeGraph& graph,
    const ValueRef in_staging,
    const ValueRef out_tensor) {
  VK_CHECK_COND(graph.val_is_staging(in_staging));

  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
      *graph.get_tensor(out_tensor), graph.int8_buffers_enabled());

  vkapi::ParamsBindList ubos;
  if (graph.is_buffer_storage(out_tensor)) {
    ubos.append(
        {graph.sizes_ubo(out_tensor),
         graph.strides_ubo(out_tensor),
         graph.numel_ubo(out_tensor)});
  } else {
    ubos.append({graph.sizes_ubo(out_tensor)});
  }

  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      shader,
      graph.create_global_wg_size(out_tensor),
      graph.create_local_wg_size(out_tensor),
      // Input and Outputs
      {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}},
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {graph.hashed_layout_of(out_tensor)},
      // Resizing Logic
      nullptr,
      {}));
}

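// Kernel name prefix of the image_to_nchw shader variants that pack four
// 8-bit values into a single 32-bit word per thread (the "nobitw8buffer"
// variants used when dedicated int8 buffers are not available).
// is_bitw8_shader() below checks whether a shader's kernel name carries this
// prefix.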
const std::string kBitw8PrefixStr = "bitw8_image_to_nchw_nobitw8buffer";

bool is_bitw8_shader(const vkapi::ShaderInfo& shader) {
  const auto size = kBitw8PrefixStr.size();
  const std::string& shader_prefix_str = shader.kernel_name.substr(0, size);
  return shader_prefix_str == kBitw8PrefixStr;
}

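// Records an execute node that dispatches the tensor_to_nchw shader to copy a
// GPU tensor into a staging buffer in NCHW order. The bitw8 shader variants
// override the global work group size; see the comment below.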
void add_tensor_to_staging_node(
    ComputeGraph& graph,
    const ValueRef in_tensor,
    const ValueRef out_staging) {
  VK_CHECK_COND(graph.val_is_staging(out_staging));

  vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
      *graph.get_tensor(in_tensor), graph.int8_buffers_enabled());

  utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);

  vkapi::ParamsBindList ubos;
  if (graph.is_buffer_storage(in_tensor)) {
    ubos.append(
        {graph.sizes_ubo(in_tensor),
         graph.strides_ubo(in_tensor),
         graph.numel_ubo(in_tensor)});
  } else {
    ubos.append({graph.sizes_ubo(in_tensor)});
  }

  // Normally, the image_to_nchw shader is structured so that each thread reads
  // one texel from the input texture and writes each component of the texel
  // into the corresponding location in the output buffer. However, the bitw8
  // variants are structured slightly differently in that each thread writes
  // out a complete 32-bit integer (containing 4 packed 8-bit integers) into
  // the output buffer. Therefore, the global work group size for these shaders
  // is the number of elements in the output buffer divided by 4, as opposed to
  // the extents of the input texture.
  if (is_bitw8_shader(shader)) {
    uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
    global_wg_size = {buffer_len, 1, 1};
    ubos.append({graph.numel_ubo(in_tensor)});
  }

  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      shader,
      global_wg_size,
      graph.create_local_wg_size(global_wg_size),
      // Input and Outputs
      {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}},
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {graph.hashed_layout_of(in_tensor)}));
}

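// Records a prepack node that runs the nchw_to_tensor shader to pack the data
// held by a TensorRef (tensor_data) into the GPU tensor.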
void add_prepack_standard_node(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef tensor) {
  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
      *graph.get_tensor(tensor), graph.int8_buffers_enabled());

  vkapi::ParamsBindList ubos;
  if (graph.is_buffer_storage(tensor)) {
    ubos.append(
        {graph.sizes_ubo(tensor),
         graph.strides_ubo(tensor),
         graph.numel_ubo(tensor)});
  } else {
    ubos.append({graph.sizes_ubo(tensor)});
  }

  graph.prepack_nodes().emplace_back(new PrepackNode(
      graph,
      shader,
      graph.create_global_wg_size(tensor),
      graph.create_local_wg_size(tensor),
      // Input and Outputs
      tensor_data,
      tensor,
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {graph.hashed_layout_of(tensor)}));
}

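// Creates a GPU tensor with the requested storage type and memory layout and
// records a standard prepack node to fill it from tensor_data. If passthrough
// is true and tensor_data already refers to a tensor, it is returned as-is.
// Typical usage (sketch): pack constant weight data into a buffer-backed,
// width-packed tensor, e.g.
//   ValueRef w = prepack_standard(
//       graph, w_data, utils::kBuffer, utils::kWidthPacked, /*passthrough=*/false);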
ValueRef prepack_standard(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const utils::StorageType storage_type,
    const utils::GPUMemoryLayout layout,
    const bool passthrough) {
  if (passthrough && graph.val_is_tensor(tensor_data)) {
    return tensor_data;
  }
  VK_CHECK_COND(graph.val_is_tref(tensor_data));
  ValueRef tensor = graph.add_tensor_like(tensor_data, storage_type, layout);
  add_prepack_standard_node(graph, tensor_data, tensor);
  return tensor;
}

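// Same as prepack_standard, but takes the storage type and (estimated) memory
// layout from an existing tensor to_copy.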
ValueRef prepack_standard_like(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef to_copy,
    const bool passthrough) {
  VK_CHECK_COND(graph.val_is_tensor(to_copy));
  return prepack_standard(
      graph,
      tensor_data,
      graph.storage_type_of(to_copy),
      graph.estimate_memory_layout_of(to_copy),
      passthrough);
}

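// Records a prepack node that copies tensor_data into the buffer-backed
// tensor with a direct buffer_to_buffer shader, i.e. without repacking.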
void add_prepack_direct_copy_buffer_node(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef tensor) {
  std::string kernel_name = "buffer_to_buffer";
  add_dtype_suffix(kernel_name, graph.dtype_of(tensor_data));
  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

  vkapi::ParamsBindList ubos;
  ubos.append({graph.numel_ubo(tensor)});

  graph.prepack_nodes().emplace_back(new PrepackNode(
      graph,
      shader,
      graph.create_global_wg_size(tensor),
      graph.create_local_wg_size(tensor),
      // Input and Outputs
      tensor_data,
      tensor,
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {}));
}

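// Creates a width-packed, buffer-backed tensor and records a direct
// buffer-to-buffer copy prepack node to fill it from tensor_data.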
ValueRef prepack_direct_copy_buffer(
    ComputeGraph& graph,
    const ValueRef tensor_data) {
  VK_CHECK_COND(graph.val_is_tref(tensor_data));
  ValueRef tensor =
      graph.add_tensor_like(tensor_data, utils::kBuffer, utils::kWidthPacked);
  add_prepack_direct_copy_buffer_node(graph, tensor_data, tensor);
  return tensor;
}

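// Handler for the et_vk.prepack.default operator: args[0] is the source
// tensor data (TensorRef) and args[1] is the destination tensor.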
void prepack_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_prepack_standard_node(graph, args[0], args[1]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(et_vk.prepack.default, prepack_op);
}

} // namespace vkcompute