/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplateDepthwiseConv2d.h"

#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId                      id,
                                                     const ArgumentPack<ITensorInfo> &tensors,
                                                     const Attributes                &attributes,
                                                     const Settings                  &settings)
    : IGpuTemplateComponentWriter{ id, tensors },
      _src{},
      _weight{},
      _bias{},
      _dst{},
      _attributes{ attributes },
      _settings{ settings }
{
    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
    {
        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
    }
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
}

std::string ClTemplateDepthwiseConv2d::get_name() const
{
    return "depthwise_conv2d";
}

std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    constexpr int height_idx = 2; // Data Layout is NHWC

    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src)            {{src}}
// IN_1(wei)            {{weight}}
)_";

    if(_bias != nullptr && _bias->has_valid_id())
    {
        code += R"_(
// IN_2(bia)            {{bias}}
)_";
    }

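    // Note: the _I* defines below parameterise the generated tile sizes, and
    // _IBOUNDARY_CHECK disables out-of-bounds checking on the input load only
    // for 1x1 kernels with zero left/top padding and M0 == 1; every other
    // configuration keeps the check enabled.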
    code += R"_(
// OUT(dst, accum)      {{dst}}

TILE(uint, M0, 1, g_dst_indirect_y);

{
#define _IWEI_WIDTH {{WEI_WIDTH}}
#define _IWEI_HEIGHT {{WEI_HEIGHT}}
#define _IDST_WIDTH {{arg_dst}}_w
#define _IDST_HEIGHT {{arg_dst}}_h
#define _IM0_A M0_A
#define _IN0_A N0_A
#define _IM0_B _IWEI_WIDTH
#define _IN0_B N0
#define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1)))
)_";

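    // get_window() collapses the window on DimZ, so g_ind_2 runs over
    // batch * height; split it back into the output row (yo) and batch (bout).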
    code += R"_(
    const int yo = g_ind_2 % {{arg_dst}}_h;
    const int bout = g_ind_2 / {{arg_dst}}_h;
)_";

    code += R"_(

    int xi = g_ind_1 * {{STRIDE_X}};
    int yi = yo * {{STRIDE_Y}};
    xi -= {{PAD_LEFT}};
    yi -= {{PAD_TOP}};

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        {{dst}}[i].v = 0;
    })
)_";

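    // Small kernel heights are fully unrolled with LOOP_UNROLLING; taller
    // kernels (height >= 5) fall back to a plain loop, presumably to keep the
    // unrolled code size in check.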
    if(_weight->dimension(height_idx) < 5)
    {
        code += R"_(
    LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
)_";
    }
    else
    {
        code += R"_(
    for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
)_";
    }

    code += R"_(
    {
        TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a);

        LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
        {
            a[i].v = 0;
        })

        T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a);

        TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b);

        T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b);

        LOOP_UNROLLING(int, m0, 0, 1, M0,
        {
            LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
            {
)_";

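    // Multiply-accumulate along the kernel width: output column m0 consumes
    // input columns [m0, m0 + _IWEI_WIDTH). fma() is preferred when the
    // settings report it as available.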
    if(!_settings.is_fma_available())
    {
        code += R"_(
                {{dst}}[m0].v += a[xk + m0].v * b[xk].v;
)_";
    }
    else
    {
        code += R"_(
                {{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v);
)_";
    }

    code += R"_(
            })
        })
    }
)_";

    if(_weight->dimension(height_idx) < 5)
    {
        code += R"_(
    )
)_";
    }

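    // The optional bias is loaded once per output channel (g_ind_0) and
    // broadcast-added along the X dimension of the accumulator tile.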
    if(_bias != nullptr && _bias->has_valid_id())
    {
        code += R"_(
        TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});

        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}});

        T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}});
)_";
    }

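    // Flatten the (x, y, batch) output coordinates into the indirect Y index
    // used by the store: x + y * W + batch * W * H, with x clamped to the
    // last valid column of the destination.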
    code += R"_(
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
        g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
        g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
    })
}
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
)_";

    return code;
}

void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
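    // Source and weights may be exported to OpenCL images when the settings
    // request it, which can improve memory throughput on some GPUs; plain
    // buffers are used otherwise.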
    const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ?
                                                       GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
                                                       GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;

    vtable.declare_variable(
        comp_group,
        _src,
        GpuKernelArgumentInfo(input_type),
        "src");

    const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
                                                        GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
                                                        GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;

    vtable.declare_variable(
        comp_group,
        _weight,
        GpuKernelArgumentInfo(weight_type),
        "weight");

    if(_bias != nullptr && _bias->has_valid_id()) // optional bias
    {
        vtable.declare_variable(
            comp_group,
            _bias,
            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
            "bias");
    }
    vtable.declare_variable(
        comp_group,
        _dst,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "dst");
}

TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    TagLUT lut{};

    // Arguments and global shared variables
    lut["src"]    = vtable.get_variable(_src);
    lut["weight"] = vtable.get_variable(_weight);

    if(_bias != nullptr && _bias->has_valid_id()) // optional bias
    {
        lut["bias"]          = vtable.get_variable(_bias);
        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
    }
    lut["dst"] = vtable.get_variable(_dst);

    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
    lut["arg_dst"]          = dst_argument.uniq_name;
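    // {{arg_dst}} names the component group's shared destination tensor; its
    // width and height drive the indirect-Y addressing emitted in
    // get_component_code().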

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["ACC_DATA_TYPE"]  = _src->data_type();
    lut["SRC_DATA_TYPE"]  = _src->data_type();
    lut["WEI_DATA_TYPE"]  = _weight->data_type();

    switch(vtable.get_variable(_src).kernel_argument_info.type)
    {
        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
            lut["SRC_TENSOR_TYPE"] = "IMAGE";
            break;
        default:
            lut["SRC_TENSOR_TYPE"] = "BUFFER";
            break;
    }

    switch(vtable.get_variable(_weight).kernel_argument_info.type)
    {
        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
            lut["WEI_TENSOR_TYPE"] = "IMAGE";
            break;
        default:
            lut["WEI_TENSOR_TYPE"] = "BUFFER";
            break;
    }

    // Data Layout is NHWC
    constexpr int width_idx  = 1;
    constexpr int height_idx = 2;

    lut["WEI_WIDTH"]  = _weight->dimension(width_idx);
    lut["WEI_HEIGHT"] = _weight->dimension(height_idx);

    lut["STRIDE_X"] = _attributes.stride().x();
    lut["STRIDE_Y"] = _attributes.stride().y();

    lut["PAD_LEFT"] = _attributes.pad().left;
    lut["PAD_TOP"]  = _attributes.pad().top;

    lut["DILATION_X"] = _attributes.dilation().x();
    lut["DILATION_Y"] = _attributes.dilation().y();

    lut["DEPTH_MULTIPLIER"] = _attributes.depth_multiplier();

    return lut;
}

CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    constexpr unsigned int width_idx = 1; // Data Layout is NHWC

    const unsigned int n0               = _settings.n0();
    const unsigned int m0               = _settings.m0();
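    // Computing M0 adjacent output columns with a kernel of width W touches
    // W + M0 - 1 input columns, hence M0_A. With a depth multiplier > 1 the
    // N0 output channels no longer map one-to-one onto consecutive input
    // channels, so the input is loaded one element at a time along N (N0_A = 1).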
    const unsigned int m0_a             = _weight->dimension(width_idx) + m0 - 1;
    const unsigned int n0_a             = _attributes.depth_multiplier() > 1 ? 1 : n0;
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;

    CLBuildOptions build_opts{};

    if(_settings.fast_relaxed_math())
    {
        build_opts.add_option("-cl-fast-relaxed-math");
    }
    else
    {
        // -cl-fast-relaxed-math also implies -cl-finite-math-only and -cl-unsafe-math-optimizations.
        // To avoid assuming finite math, add only -cl-unsafe-math-optimizations here.
        build_opts.add_option("-cl-unsafe-math-optimizations");
    }

    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DN0_A=" + support::cpp11::to_string(n0_a));
    build_opts.add_option("-DM0_A=" + support::cpp11::to_string(m0_a));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplateDepthwiseConv2d::get_config_id() const
{
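    // The config id concatenates the src/dst shapes and the source data type;
    // it is typically used to tell kernel configurations apart, e.g. for tuning.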
    std::string config_id{};

    config_id += support::cpp11::to_string(_src->dimension(0));
    config_id += "_";
    config_id += support::cpp11::to_string(_src->dimension(1));
    config_id += "_";
    config_id += support::cpp11::to_string(_src->dimension(2));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(0));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(1));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(2));
    config_id += "_";
    config_id += string_from_data_type(_src->data_type());

    return config_id;
}

std::set<std::string> ClTemplateDepthwiseConv2d::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
}

Window ClTemplateDepthwiseConv2d::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    Window win = calculate_max_window(*_dst, Steps(_settings.n0(), _settings.m0()));
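    // Collapse batch and height onto DimZ so a single dimension walks
    // batch * height; the generated kernel splits g_ind_2 back into (bout, yo).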
    return win.collapse(win, Window::DimZ);
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
378