/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
namespace
{
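// Infer the dst tensor shape from src, wei and the depthwise conv2d attributes,
// and initialize dst accordingly if it has not been configured yet.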
void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes)
{
    if(dst->total_size() == 0U)
    {
        const PadStrideInfo pad_stride_info(attributes.stride().x(),
                                            attributes.stride().y(),
                                            attributes.pad().left,
                                            attributes.pad().right,
                                            attributes.pad().top,
                                            attributes.pad().bottom,
                                            attributes.dimension_rounding_type());

        const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() };
        const TensorShape     shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);

        auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
    }
}

/* A helper method to reduce the duplication in dst tensor initialization
 * when calling validate()
 */
Status is_supported_op_helper(const GpuWorkloadContext        &context,
                              const ITensorInfo               *src,
                              const ITensorInfo               *wei,
                              const ITensorInfo               *bia,
                              const ITensorInfo               *dst,
                              const DepthwiseConv2dAttributes &attributes)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);

    TensorInfo         dst_info_to_validate;
    const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;

    if(dst != nullptr)
    {
        dst_info_to_validate_ptr = dst;
    }

    calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);

    // Check support level
    // Data type
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
    // Data layout
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);

    const GpuTarget gpu_target = context.gpu_target();

    if(context.gpu_language() == GpuLanguage::OpenCL)
    {
        const CLCompileContext *cl_compile_ctx = context.cl_compile_context();
        ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);

        // Validate Depthwise Conv2d Component
        {
            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
            auto       settings   = ClComponentDepthwiseConv2d::Settings();

            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
                                                 attributes.pad().right,
                                                 attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);

            // Get the depthwise convolution compute parameters
            auto                       t        = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
            const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
            settings.fast_relaxed_math(
                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
                && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16));

            // FMA instructions are not available on Midgard GPUs
            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
            .m0(dwc_info.m0)
            .n0(dwc_info.n0)
            .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
            .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);

            ArgumentPack<ITensorInfo> arguments;
            arguments.add_const_tensor(ACL_SRC_0, src);
            arguments.add_const_tensor(ACL_SRC_1, wei);
            arguments.add_const_tensor(ACL_SRC_2, bia);
            arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
            ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
    }

    return Status{};
}

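// Depthwise conv2d is registered as a "complex" operator for fusion purposes,
// which constrains how it can be combined with other operators in a workload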
constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
} // namespace

Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext        &context,
                                           const ITensorInfo               *src,
                                           const ITensorInfo               *wei,
                                           const ITensorInfo               *bia,
                                           const DepthwiseConv2dAttributes &attributes)
{
    return is_supported_op_helper(context, src, wei, bia, nullptr, attributes);
}

Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch         &sketch,
                                       const ITensorInfo               *src,
                                       const ITensorInfo               *wei,
                                       const ITensorInfo               *bia,
                                       const DepthwiseConv2dAttributes &attributes)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
    ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());

    if(bia != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
    }

    // Refer to GpuConv2d::validate_op() for the id-validity of this TensorInfo object
    TensorInfo dst_info_to_validate;

    // Auto initialize dst tensor info
    calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);

    // Perform fusion test
    // Pack tensor infos
    ArgumentPack<ITensorInfo> tensors;
    tensors.add_const_tensor(ACL_SRC_0, src);
    tensors.add_const_tensor(ACL_SRC_1, wei);
    tensors.add_const_tensor(ACL_SRC_2, bia);
    tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
    const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
                                    "Operator fusion test failed. This operator cannot be fused into the workload");

    // Check if the configuration is supported
    return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
}

ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch               &sketch,
                                           ITensorInfo                     *src,
                                           ITensorInfo                     *wei,
                                           ITensorInfo                     *bia,
                                           const DepthwiseConv2dAttributes &attributes)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei);
    ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
    ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, src, wei, bia, attributes));

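    // dst is a virtual tensor: an intermediate owned by the sketch rather than a
    // user-provided tensor, materialized only when connected to a workload output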
    ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
    ARM_COMPUTE_ERROR_ON_NULLPTR(dst);

    calculate_and_init_dst_if_empty(dst, src, wei, attributes);

    // Translate into components and add to component graph
    GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
    const auto              *sketch_ctx = sketch.implementation().context();
    const GpuTarget          gpu_target = sketch_ctx->gpu_target();

    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
    {
        ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());

        // Add Depthwise Conv2d Component
        {
            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
            auto       settings   = ClComponentDepthwiseConv2d::Settings();

            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
                                                 attributes.pad().right,
                                                 attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);

            // Get the depthwise convolution compute parameters
            auto                       t        = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
            const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());

            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
            .m0(dwc_info.m0)
            .n0(dwc_info.n0)
            .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
            .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);

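            // When a tensor is exported as an OpenCL image, its padding must be
            // updated so the row pitch meets the device's image pitch alignment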
            if(settings.export_input_to_cl_image())
            {
                arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src);
            }

            if(settings.export_weights_to_cl_image())
            {
                arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
            }

            ArgumentPack<ITensorInfo> arguments;
            arguments.add_const_tensor(ACL_SRC_0, src);
            arguments.add_const_tensor(ACL_SRC_1, wei);
            arguments.add_const_tensor(ACL_SRC_2, bia);
            arguments.add_const_tensor(ACL_DST_0, dst);
            comp_graph.add_new_component<ClComponentDepthwiseConv2d>(properties, arguments, attributes, settings);
        }
    }
    else
    {
        ARM_COMPUTE_ERROR("Unimplemented Gpu language");
    }

    // Set up fusion test by adding to the Operator Group
    // Note this has to be performed after all the components have been successfully added to the component graph

    // Pack tensor infos
    ArgumentPack<ITensorInfo> tensors;
    tensors.add_const_tensor(ACL_SRC_0, src);
    tensors.add_const_tensor(ACL_SRC_1, wei);
    tensors.add_const_tensor(ACL_SRC_2, bia);
    tensors.add_const_tensor(ACL_DST_0, dst);

    const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
    sketch.implementation().operator_group().add_operator(op);

    return dst;
}
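
/* A minimal usage sketch (illustrative only, not part of this file). It assumes
 * an OpenCL-backed context and this release's sketch API; the shapes, attribute
 * values and variable names below are examples, not requirements:
 *
 *   using namespace arm_compute;
 *   using namespace arm_compute::experimental::dynamic_fusion;
 *
 *   CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
 *   GpuWorkloadContext context{ &cl_compile_ctx };
 *   GpuWorkloadSketch  sketch{ &context };
 *
 *   // NHWC FP32 tensors: e.g. 32 channels over a 16x16 spatial grid, 3x3 weights
 *   TensorInfo src_info = sketch.create_tensor_info(TensorShape(32U, 16U, 16U), 1, DataType::F32, DataLayout::NHWC);
 *   TensorInfo wei_info = sketch.create_tensor_info(TensorShape(32U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC);
 *
 *   DepthwiseConv2dAttributes attributes{};
 *   attributes.stride(Size2D(1, 1)).pad(Padding2D(1, 1, 1, 1)).depth_multiplier(1);
 *
 *   if(bool(GpuDepthwiseConv2d::is_supported_op(context, &src_info, &wei_info, nullptr, attributes)))
 *   {
 *       ITensorInfo *dst_info = GpuDepthwiseConv2d::create_op(sketch, &src_info, &wei_info, nullptr, attributes);
 *       // dst_info is a virtual tensor owned by the sketch; connect it to an
 *       // output operator before running the workload
 *   }
 */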

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute