/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/DispatchNode.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

namespace vkcompute {

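// Record a DispatchNode that copies data from a staging buffer into a tensor,
// using the nchw_to_tensor shader selected for the output tensor. Buffer-backed
// tensors bind sizes, strides, and numel UBOs; texture-backed tensors bind only
// sizes.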
void add_staging_to_tensor_node(
    ComputeGraph& graph,
    const ValueRef in_staging,
    const ValueRef out_tensor) {
  VK_CHECK_COND(graph.val_is_staging(in_staging));

  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
      *graph.get_tensor(out_tensor), graph.int8_buffers_enabled());

  vkapi::ParamsBindList ubos;
  if (graph.is_buffer_storage(out_tensor)) {
    ubos.append(
        {graph.sizes_ubo(out_tensor),
         graph.strides_ubo(out_tensor),
         graph.numel_ubo(out_tensor)});
  } else {
    ubos.append({graph.sizes_ubo(out_tensor)});
  }

  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      shader,
      graph.create_global_wg_size(out_tensor),
      graph.create_local_wg_size(out_tensor),
      // Input and Outputs
      {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}},
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {graph.hashed_layout_of(out_tensor)},
      // Resizing Logic
      nullptr,
      {}));
}

const std::string kBitw8PrefixStr = "bitw8_image_to_nchw_nobitw8buffer";

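// Returns true if the shader's kernel name starts with the
// bitw8_image_to_nchw_nobitw8buffer prefix.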
bool is_bitw8_shader(const vkapi::ShaderInfo& shader) {
  const auto size = kBitw8PrefixStr.size();
  const std::string& shader_prefix_str = shader.kernel_name.substr(0, size);
  return shader_prefix_str == kBitw8PrefixStr;
}

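// Record a DispatchNode that copies data from a tensor into a staging buffer,
// using the tensor_to_nchw shader selected for the input tensor. bitw8 shaders
// use a custom global work group size (see the comment inside).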
void add_tensor_to_staging_node(
    ComputeGraph& graph,
    const ValueRef in_tensor,
    const ValueRef out_staging) {
  VK_CHECK_COND(graph.val_is_staging(out_staging));

  vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
      *graph.get_tensor(in_tensor), graph.int8_buffers_enabled());

  utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);

  vkapi::ParamsBindList ubos;
  if (graph.is_buffer_storage(in_tensor)) {
    ubos.append(
        {graph.sizes_ubo(in_tensor),
         graph.strides_ubo(in_tensor),
         graph.numel_ubo(in_tensor)});
  } else {
    ubos.append({graph.sizes_ubo(in_tensor)});
  }

  // Normally, the image_to_nchw shader is structured so that each thread reads
  // one texel from the input texture and writes each component of the texel
  // into the corresponding location in the output buffer. However, this shader
  // is structured slightly differently in that each thread writes out a
  // complete 32 bit integer (containing 4 packed 8-bit integers) into the
  // output buffer. Therefore, the global work group size for this shader will
  // be the number of elements in the output buffer divided by 4, as opposed to
  // the extents of the input texture.
  if (is_bitw8_shader(shader)) {
    uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
    global_wg_size = {buffer_len, 1, 1};
    ubos.append({graph.numel_ubo(in_tensor)});
  }

  graph.execute_nodes().emplace_back(new DispatchNode(
      graph,
      shader,
      global_wg_size,
      graph.create_local_wg_size(global_wg_size),
      // Input and Outputs
      {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}},
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {graph.hashed_layout_of(in_tensor)}));
}

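// Record a PrepackNode that packs the data held by a TensorRef (tensor_data)
// into the given GPU tensor using the standard nchw_to_tensor shader.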
void add_prepack_standard_node(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef tensor) {
  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
      *graph.get_tensor(tensor), graph.int8_buffers_enabled());

  vkapi::ParamsBindList ubos;
  if (graph.is_buffer_storage(tensor)) {
    ubos.append(
        {graph.sizes_ubo(tensor),
         graph.strides_ubo(tensor),
         graph.numel_ubo(tensor)});
  } else {
    ubos.append({graph.sizes_ubo(tensor)});
  }

  graph.prepack_nodes().emplace_back(new PrepackNode(
      graph,
      shader,
      graph.create_global_wg_size(tensor),
      graph.create_local_wg_size(tensor),
      // Input and Outputs
      tensor_data,
      tensor,
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {graph.hashed_layout_of(tensor)}));
}

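// Create a GPU tensor with the requested storage type and memory layout from a
// TensorRef and schedule a standard prepack node to fill it. If passthrough is
// true and tensor_data already refers to a tensor, it is returned as-is.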
ValueRef prepack_standard(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const utils::StorageType storage_type,
    const utils::GPUMemoryLayout layout,
    const bool passthrough) {
  if (passthrough && graph.val_is_tensor(tensor_data)) {
    return tensor_data;
  }
  VK_CHECK_COND(graph.val_is_tref(tensor_data));
  ValueRef tensor = graph.add_tensor_like(tensor_data, storage_type, layout);
  add_prepack_standard_node(graph, tensor_data, tensor);
  return tensor;
}

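// Same as prepack_standard, but the storage type and memory layout are
// inferred from an existing tensor (to_copy).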
ValueRef prepack_standard_like(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef to_copy,
    const bool passthrough) {
  VK_CHECK_COND(graph.val_is_tensor(to_copy));
  return prepack_standard(
      graph,
      tensor_data,
      graph.storage_type_of(to_copy),
      graph.estimate_memory_layout_of(to_copy),
      passthrough);
}

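// Record a PrepackNode that copies the data held by a TensorRef into a
// buffer-backed tensor with a plain buffer_to_buffer shader (no repacking).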
void add_prepack_direct_copy_buffer_node(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef tensor) {
  std::string kernel_name = "buffer_to_buffer";
  add_dtype_suffix(kernel_name, graph.dtype_of(tensor_data));
  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

  vkapi::ParamsBindList ubos;
  ubos.append({graph.numel_ubo(tensor)});

  graph.prepack_nodes().emplace_back(new PrepackNode(
      graph,
      shader,
      graph.create_global_wg_size(tensor),
      graph.create_local_wg_size(tensor),
      // Input and Outputs
      tensor_data,
      tensor,
      // Parameter Buffers
      ubos,
      // Specialization Constants
      {}));
}

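// Create a width-packed, buffer-backed tensor from a TensorRef and schedule a
// direct buffer-to-buffer copy to fill it.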
ValueRef prepack_direct_copy_buffer(
    ComputeGraph& graph,
    const ValueRef tensor_data) {
  VK_CHECK_COND(graph.val_is_tref(tensor_data));
  ValueRef tensor =
      graph.add_tensor_like(tensor_data, utils::kBuffer, utils::kWidthPacked);
  add_prepack_direct_copy_buffer_node(graph, tensor_data, tensor);
  return tensor;
}

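// Operator entry point for et_vk.prepack.default; args[0] is the TensorRef
// input and args[1] is the output tensor.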
void prepack_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_prepack_standard_node(graph, args[0], args[1]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(et_vk.prepack.default, prepack_op);
}

} // namespace vkcompute