/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplatePool2d.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
namespace
{
// Shape indices for the NHWC data layout
constexpr static int32_t batch_idx   = 3;
constexpr static int32_t height_idx  = 2;
constexpr static int32_t width_idx   = 1;
constexpr static int32_t channel_idx = 0;
} // namespace
ClTemplatePool2d::ClTemplatePool2d(ComponentId                      id,
                                   const ArgumentPack<ITensorInfo> &tensors,
                                   const Attributes                &attributes,
                                   const Settings                  &settings)
    : IGpuTemplateComponentWriter{ id, tensors },
      _src{},
      _dst{},
      _attributes{ attributes },
      _settings{ settings }
{
    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
}

std::string ClTemplatePool2d::get_name() const
{
    return "pool2d";
}

std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    // Use the optimized kernel when the pool size is 2x2
    if(_attributes.pool_size() == Size2D(2, 2))
    {
        return get_2x2_kernel_code();
    }
    else
    {
        return get_MxN_kernel_code();
    }
}

std::string ClTemplatePool2d::get_MxN_kernel_code() const
{
    const auto pool_type          = _attributes.pool_type();
    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;

    // Define pool op macro
    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
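
    // A note on the two macro variants (derived from the line above): for AVG, POOL_OP only
    // accumulates a sum and the division by the filter size happens once after the loops;
    // for MAX it reduces with fmax and needs no post-processing.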

    // Kernel start
    // Note: If C is not a multiple of N0, we shift back by PARTIAL_N0 elements so that get_global_id(0) == 0 computes the leftover elements
    // Note: If C is less than N0, N0 is shrunk to the closest smaller valid N0. This adjustment is performed on the host side
    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src) {{src}}
// OUT(dst, accum) {{dst}}

{
    const int idx_out_c = g_ind_0;
    const int idx_out_w = g_ind_1;
)_";

    // Add macro for POOL_OP
    code += "\n" + pool_op + "\n";

    // If batch size != 1, the batch size dimension is collapsed over the height dimension
    code += R"_(
    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
)_";

    // Define common variables
    code += R"_(
    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;

    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;

    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
    res0 = {{INITIAL_VALUE}};

    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};

    const int pool_x_s = max((int)0, -idx_in_w);
    const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
    const int pool_y_s = max((int)0, -idx_in_h);
    const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
)_";

    // Determine the filter size, depending on whether padding is excluded from the average
    if(_attributes.exclude_padding())
    {
        code += R"_(
    const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
)_";
    }
    else
    {
        code += R"_(
    const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
)_";
    }
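
    // For example (illustrative): an average 3x3 pool at the top-left corner with 1-pixel
    // padding covers only 2x2 = 4 in-bounds elements. With exclude_padding the sum is later
    // divided by 4; otherwise it is divided by the full 9, weighting border outputs differently.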

    // Loop over the pool size
    // Global pooling: the pool covers the entire input plane
    if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
    {
        // Begin loop
        code += R"_(
    // Global pooling path
    for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
    {
    #pragma unroll 8
        for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
        {
            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
            data0;
)_";
    }
    else // Local pooling size
    {
        code += R"_(
    for(int y = pool_y_s; y < pool_y_e; ++y)
    {
    #pragma unroll 8
        for(int x = pool_x_s; x < pool_x_e; ++x)
        {
            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
            data0;
)_";
    } // end else

    // Inside the loop: use a 32-bit accumulator if mixed precision is enabled,
    // then close the pooling loops
    if(fp_mixed_precision)
    {
        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
        code += R"_(
            data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
            res0 = POOL_OP(res0, data0);
        }
    }
)_";
    }
    else // Load data, compute result and close the loops
    {
        code += R"_(
            data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
            res0 = POOL_OP(res0, data0);
        }
    }
)_";
    }

    // For average pooling only, divide the accumulated result by the filter size
    if(pool_type == PoolingType::AVG)
    {
        code += R"_(
    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
)_";
    }

    // If mixed precision, convert the result to DATA_TYPE before storing; then end the kernel
    if(fp_mixed_precision)
    {
        code += R"_(
    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }
    else
    {
        // Store data
        code += R"_(
    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }

    code += R"_(
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
}
)_";

    return code;
}

std::string ClTemplatePool2d::get_2x2_kernel_code() const
{
    const auto pool_type          = _attributes.pool_type();
    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
    std::string pool_op           = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";

    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src) {{src}}
// OUT(dst, accum) {{dst}}

#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)

{
    const int idx_out_c = g_ind_0;
    const int idx_out_w = g_ind_1;
)_";

    // Add pool op macro
    code += "\n" + pool_op + "\n";

    // If batch size != 1, the batch size dimension is collapsed over the height dimension
    code += R"_(
    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
)_";

    code += R"_(
    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};

    __global unsigned char *in_base_ptr  = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;

    const int pool_x_s = max((int)0, -idx_in_w);
    const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
    const int pool_y_s = max((int)0, -idx_in_h);
    const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);

    const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
    const int x0 = pool_x_s + idx_in_w;
    const int y0 = pool_y_s + idx_in_h;
    const int x1 = pool_x_e - 1 + idx_in_w;
    const int y1 = pool_y_e - 1 + idx_in_h;

    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
)_";

    if(fp_mixed_precision)
    {
        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
        code += R"_(
    data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
    data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
    data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
    data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
)_";
    }
    else
    {
        code += R"_(
    data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z));
    data1 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z));
    data2 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z));
    data3 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z));
)_";
    }

    if(pool_type != PoolingType::MAX)
    {
        // Invalidate the loaded values if the x or y coordinate was clamped (out of bounds)
        code += R"_(
    if(filter_size != 4)
    {
        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);

        data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_s));
        data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_s));
        data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_e));
        data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_e));
    }
)_";
    }
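
    // Why this is needed (derived from the clamping above): when the window is clamped,
    // x0 == x1 (or y0 == y1), so some of data0..data3 alias the same in-bounds element.
    // Replacing the aliased lanes with {{INITIAL_VALUE}} (0 for AVG) keeps them from being
    // counted twice in the sum; MAX skips this because fmax(x, x) == x is harmless.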

    code += R"_(
    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
    res0 = data0;
    res0 = POOL_OP(res0, data1);
    res0 = POOL_OP(res0, data2);
    res0 = POOL_OP(res0, data3);
)_";

    // For average pooling, divide the result accordingly
    if(pool_type == PoolingType::AVG)
    {
        if(_attributes.exclude_padding())
        {
            code += R"_(
    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
)_";
        }
        else
        {
            code += R"_(
    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
)_";
        }
    }

    // Store the result
    if(fp_mixed_precision)
    {
        code += R"_(
    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }
    else
    {
        code += R"_(
    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }

    code += R"_(
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
}
#undef SELECT_TYPE
)_";

    return code;
}

void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        comp_group,
        _src,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "src");

    vtable.declare_variable(
        comp_group,
        _dst,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "dst");
}

TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    TagLUT lut{};
    // Arguments and global shared variables
    lut["src"] = vtable.get_variable(_src);
    lut["dst"] = vtable.get_variable(_dst);

    // Local build options
    lut["meta_kernel_id"] = id();

    // Retrieve relevant data
    const auto padding                = _attributes.pad();
    const auto stride                 = _attributes.stride();
    const auto pool_size              = _attributes.pool_size();
    const auto data_type              = _src->data_type();
    const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;

    // Pool-specific parameters
    lut["STRIDE_X"]    = stride.x();
    lut["STRIDE_Y"]    = stride.y();
    lut["PAD_X"]       = padding.left;
    lut["PAD_Y"]       = padding.top;
    lut["POOL_SIZE_X"] = pool_size.width;
    lut["POOL_SIZE_Y"] = pool_size.height;

    // Data types and variables
    lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
    lut["DATA_TYPE"]     = get_cl_type_from_data_type(data_type);
    lut["SRC_WIDTH"]     = _src->dimension(width_idx);
    lut["SRC_HEIGHT"]    = _src->dimension(height_idx);
    lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? float_to_string_with_full_precision(std::numeric_limits<float>::lowest()) : std::string("0");
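
    // Note on the initial accumulator value (derived from the line above): MAX must start
    // from the lowest representable float so that any real input wins the first fmax,
    // while AVG starts from 0 because it accumulates a sum.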

    // Tensor-specific data
    lut["DST_HEIGHT"] = _dst->dimension(height_idx);

    return lut;
}

CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
{
    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0               = root_window.x().step();
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;

    CLBuildOptions build_opts{};
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplatePool2d::get_config_id() const
{
    const DataType   data_type   = _src->data_type();
    const DataLayout data_layout = _src->data_layout();

    std::string config_id{};
    config_id += "pooling_layer_2d_";
    config_id += lower_string(string_from_data_type(data_type));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(data_layout));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(channel_idx));

    return config_id;
}

std::set<std::string> ClTemplatePool2d::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
}

Window ClTemplatePool2d::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
    const auto         output_shape = _dst->tensor_shape();
    const unsigned int vec_size     = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));

    // Create and configure the kernel window
    auto win = calculate_max_window(output_shape, Steps(vec_size));
    win      = win.collapse_if_possible(win, Window::DimZ); // Collapse the window over the batch dimension
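
    // Illustrative note (derived from the vec_size line above): an F32 destination prefers a
    // 2-element step per work-item and F16 a 4-element step; adjust_vec_size then shrinks
    // this when the channel count is smaller, so a vector store never runs past dimension 0.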
    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute