xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32 
33 #include "depthwise_implementation_constraints.hpp"
34 
35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
37 #if defined(ARM_COMPUTE_ENABLE_SME2)
38 #include "kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp"
39 #include "kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp"
40 #include "kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp"
42 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
43 #include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
44 #include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
45 #include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
46 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
47 #include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
48 #include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
49 #include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
50 #include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
51 #include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
52 #endif  // defined(__aarch64__)
53 
54 #include <cstdint>
55 
56 using arm_gemm::Requantize32;
57 
58 namespace arm_conv {
59 namespace depthwise {
60 
61 static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
62 #if defined(__aarch64__)
63 #if defined(ARM_COMPUTE_ENABLE_SVE)
64 #if defined(ARM_COMPUTE_ENABLE_SME2)
65   {
66     DepthwiseMethod::PLANAR,
67     "sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za",
68     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
69                              is_supported<sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za>,
70                              has_no_channel_multiplier,
71                              qp_has_no_left_shift),
72     nullptr,
__anonfe55368c0102() 73     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
74       auto strat = new sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
75       return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
76     },
77   },
78   {
79     DepthwiseMethod::PLANAR,
80     "sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za",
81     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
82                              is_supported<sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za>,
83                              has_no_channel_multiplier,
84                              qp_has_no_left_shift),
85     nullptr,
__anonfe55368c0202() 86     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
87       auto strat = new sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
88       return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
89     },
90   },
91   {
92     DepthwiseMethod::PLANAR,
93     "sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za",
94     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
95                              is_supported<sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za>,
96                              has_no_channel_multiplier,
97                              qp_has_no_left_shift),
98     nullptr,
__anonfe55368c0302() 99     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
100       auto strat = new sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
101       return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
102     },
103   },
104   {
105     DepthwiseMethod::PLANAR,
106     "sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za",
107     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
108                              is_supported<sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za>,
109                              has_no_channel_multiplier,
110                              qp_has_no_left_shift),
111     nullptr,
__anonfe55368c0402() 112     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
113       auto strat = new sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
114       return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
115     },
116   },
117 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
118   {
119     DepthwiseMethod::DEPTHFIRST,
120     "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
121     constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
122                              has_no_channel_multiplier,
123                              qp_has_no_left_shift,
124                              cpu_has_sve2),
125     nullptr,
__anonfe55368c0502() 126     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
127       auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
128       return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
129     },
130   },
131   {
132     DepthwiseMethod::DEPTHFIRST,
133     "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
134     constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
135                              has_no_channel_multiplier,
136                              qp_has_no_left_shift,
137                              cpu_has_sve2),
138     nullptr,
__anonfe55368c0602() 139     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
140       auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
141       return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
142     },
143   },
144   {
145     DepthwiseMethod::DEPTHFIRST,
146     "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
147     constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
148                              has_no_channel_multiplier,
149                              qp_has_no_left_shift,
150                              cpu_has_sve2),
151     nullptr,
__anonfe55368c0702() 152     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
153       auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
154       return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
155     },
156   },
157 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
158   {
159     DepthwiseMethod::DEPTHFIRST,
160     "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
161     constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
162                              has_no_channel_multiplier,
163                              qp_has_no_left_shift),
164     nullptr,
__anonfe55368c0802() 165     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
166       auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
167       return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
168     },
169   },
170   {
171     DepthwiseMethod::DEPTHFIRST,
172     "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
173     constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
174                              has_no_channel_multiplier,
175                              qp_has_no_left_shift),
176     nullptr,
__anonfe55368c0902() 177     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
178       auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
179       return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
180     },
181   },
182   {
183     DepthwiseMethod::DEPTHFIRST,
184     "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
185     constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
186                              has_no_channel_multiplier,
187                              qp_has_no_left_shift),
188     nullptr,
__anonfe55368c0a02() 189     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
190       auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
191       return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
192     },
193   },
194   {
195     DepthwiseMethod::DEPTHFIRST,
196     "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
197     constraint<Requantize32>(has_no_channel_multiplier),
198     nullptr,
__anonfe55368c0b02() 199     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
200       auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
201       auto strat = new GenericDepthfirstStrategy<uint8_t, int8_t>(kernel, 3, 3, args);
202       return new DepthwiseDepthfirstGeneric<uint8_t, int8_t>(strat, args, qp);
203     },
204   },
205   {
206     DepthwiseMethod::DEPTHFIRST,
207     "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
208     constraint<Requantize32>(has_channel_multiplier),
209     nullptr,
__anonfe55368c0c02() 210     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
211       auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
212       auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t, int8_t>(kern, args);
213       return new DepthwiseDepthfirstMultiplier<uint8_t, int8_t, uint8_t, int32_t, true>(strat, args, qp);
214     },
215   },
216 #endif  // defined(__aarch64__)
217   { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
218 };
219 
220 template <>
depthwise_implementation_list()221 const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
222 {
223   return depthwise_u8q_methods;
224 }
225 
226 template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
227 template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
228 
229 }  // namespace depthwise
230 }  // namespace arm_conv
231