/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/CL/kernels/CLReductionOperationKernel.h"

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"

namespace arm_compute
{
namespace
{
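// Validates data type, channel count, reduction axis and operation before the
// kernel is configured; returns an error Status describing the first violation.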
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
    if(input->num_channels() == 1)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
    }
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
    ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8)
                                && (input->data_type() != DataType::QASYMM8_SIGNED));
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");

    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
    }

    return Status{};
}
} // namespace

CLReductionOperationKernel::CLReductionOperationKernel()
    : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
{
    _type = CLKernelType::ELEMENTWISE;
}

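// Convenience overload: configure using the default compile context held by the CLKernelLibrary.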
void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}

void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));

    auto padding_info = get_padding_info({ input, output });

    _input          = input;
    _output         = output;
    _reduction_axis = axis;
    _op             = op;

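    // Auto-initialize the output if empty: the reduced axis is kept with dimension 1 (keep_dims = true).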
    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));

    // Set build options
    CLBuildOptions build_opts;
    DataType       data_type = input->info()->data_type();
    std::string    data_type_promoted{};

    if(is_data_type_quantized(data_type))
    {
        data_type_promoted = "int";
    }
    else
    {
        data_type_promoted = get_cl_type_from_data_type(data_type);
    }

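    // Vector width: 16 elements per work-item where possible; quantized X-axis
    // reductions fall back to a scalar (vec_size = 1) access pattern.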
    const unsigned int width             = input->info()->dimension(0) * input->info()->num_channels();
    unsigned int       vec_size          = (is_data_type_quantized(input->info()->data_type()) && (axis == 0)) ? 1 : 16;
    vec_size                             = adjust_vec_size(vec_size, width);
    const unsigned int vec_size_leftover = width % vec_size;

    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
    build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
    build_opts.add_option_if(is_data_type_float(data_type), "-DFLOAT_DATA_TYPE");
    build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
    build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
    build_opts.add_option_if(op == ReductionOperation::SUM, "-DSUM");
    build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
    build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
    build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
    build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
    build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));

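    // Map the operation to the OPERATION macro consumed by the OpenCL kernel;
    // MIN and MAX are fully handled by the -DMIN/-DMAX defines above.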
    switch(op)
    {
        case ReductionOperation::SUM_SQUARE:
            build_opts.add_option("-DOPERATION=square_sum");
            break;
        case ReductionOperation::SUM:
        case ReductionOperation::MEAN_SUM:
            build_opts.add_option("-DOPERATION=sum");
            break;
        case ReductionOperation::MIN:
        case ReductionOperation::MAX:
            break;
        case ReductionOperation::PROD:
            build_opts.add_option("-DOPERATION=product");
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported reduction operation");
    }

    // Create kernel
    std::string kernel_axis_name;
    const bool  is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);

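    // Select the kernel variant for the requested axis; X-axis reductions that
    // cannot run in parallel (see needs_serialized_reduction) use the
    // "non_parallel_x" variant.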
    switch(axis)
    {
        case 0:
        {
            build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(width));
            kernel_axis_name = ((is_serial_op) ? "non_parallel_x" : "x");
        }
        break;
        case 1:
            build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
            kernel_axis_name = "y";
            break;
        case 2:
            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
            kernel_axis_name = "z";
            break;
        case 3:
            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
            build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
            kernel_axis_name = "w";
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
    _kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());

    // Configure kernel window
    Window win = calculate_max_window(*input->info(), Steps(vec_size));
    // Stretch the X dimension by the number of channels so multi-channel elements are fully traversed
    win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
    ICLKernel::configure_internal(win);

    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
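
// Usage sketch (illustrative only, not part of the original file). It assumes
// the standard Arm Compute Library runtime setup; applications normally go
// through the CLReductionOperation function rather than this kernel directly:
//
//   CLScheduler::get().default_init();
//   CLTensor input, output;
//   input.allocator()->init(TensorInfo(TensorShape(128U, 16U), 1, DataType::F32));
//   CLReductionOperationKernel kernel;
//   kernel.configure(&input, &output, 0U, ReductionOperation::SUM); // reduce along X
//   input.allocator()->allocate();
//   output.allocator()->allocate(); // output info auto-initialized by configure()
//   CLScheduler::get().enqueue(kernel);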

Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
    return Status{};
}

void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

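    // The dispatch strategy depends on the reduction axis: X-axis reductions
    // either run serially (one work-item per row) or as a single collapsed
    // parallel pass, while the other axes slide fixed-size slices over the input.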
    const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
    switch(_reduction_axis)
    {
        case 0:
        {
            // Parallel reduction is used only for non-quantized types
            if(is_serial_op)
            {
                // Get first input and output slices
                Window window_in{ window };
                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));

                Window out_window{ window };
                out_window.set(Window::DimX, Window::Dimension(0, 0, 0));

                Window in_slice  = window_in.first_slice_window_1D();
                Window out_slice = out_window.first_slice_window_1D();

                do
                {
                    unsigned int idx = 0;
                    add_1D_tensor_argument(idx, _input, in_slice);
                    add_1D_tensor_argument(idx, _output, out_slice);
                    enqueue(queue, *this, in_slice);
                }
                while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
            }
            else
            {
                // Collapse the execution window and derive the output window with the reduced X dimension cleared
                bool   has_collapsed = true;
                Window window_in     = window.collapse_if_possible(window, 2, &has_collapsed);
                ARM_COMPUTE_ERROR_ON(!has_collapsed);

                Window window_out = window_in;
                window_out.set(0, Window::Dimension());

                unsigned int idx = 0;
                add_3D_tensor_argument(idx, _input, window_in);
                add_3D_tensor_argument(idx, _output, window_out);
                enqueue(queue, *this, window_in);
            }
        }
        break;
        case 1:
        {
            // Get first input and output slices
            Window window_in{ window };
            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
            Window in_slice  = window_in.first_slice_window_2D();
            Window out_slice = window.first_slice_window_2D();

            do
            {
                unsigned int idx = 0;
                add_2D_tensor_argument(idx, _input, in_slice);
                add_2D_tensor_argument(idx, _output, out_slice);
                enqueue(queue, *this, in_slice);
            }
            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
        }
        break;
        case 2:
        {
            // Get first input and output slices
            Window window_in{ window };
            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
            Window in_slice  = window_in.first_slice_window_3D();
            Window out_slice = window.first_slice_window_3D();

            do
            {
                unsigned int idx = 0;
                add_3D_tensor_argument(idx, _input, in_slice);
                add_3D_tensor_argument(idx, _output, out_slice);
                enqueue(queue, *this, in_slice);
            }
            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
        }
        break;
        case 3:
        {
            // Get first input and output slices
            Window window_in{ window };
            window_in.set(3, Window::Dimension(0, 1, 1));
            Window in_slice  = window_in.first_slice_window_4D();
            Window out_slice = window.first_slice_window_4D();

            do
            {
                unsigned int idx = 0;
                add_4D_tensor_argument(idx, _input, in_slice);
                add_4D_tensor_argument(idx, _output, out_slice);
                enqueue(queue, *this, in_slice);
            }
            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
        }
        break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
}
} // namespace arm_compute