/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "arm_gemm_local.hpp"

#include "depthwise_implementation.hpp"
#include "depthwise_depthfirst.hpp"
#include "depthwise_depthfirst_generic.hpp"
#include "depthwise_depthfirst_multiplier.hpp"
#include "depthwise_planar.hpp"

#include "depthwise_implementation_constraints.hpp"

#if defined(__aarch64__)
#if defined(ARM_COMPUTE_ENABLE_SVE)
#if defined(ARM_COMPUTE_ENABLE_SME2)
#include "kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp"
#include "kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp"
#include "kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp"
#include "kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp"
#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
#endif  // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
#endif  // defined(__aarch64__)

#include <cstdint>
#include <vector>

using arm_gemm::Requantize32;

58 namespace arm_conv {
59 namespace depthwise {
60
61 static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
62 #if defined(__aarch64__)
63 #if defined(ARM_COMPUTE_ENABLE_SVE)
64 #if defined(ARM_COMPUTE_ENABLE_SME2)
65 {
66 DepthwiseMethod::PLANAR,
67 "sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za",
68 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
69 is_supported<sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za>,
70 has_no_channel_multiplier,
71 qp_has_no_left_shift),
72 nullptr,
__anonfe55368c0102() 73 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
74 auto strat = new sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
75 return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
76 },
77 },
78 {
79 DepthwiseMethod::PLANAR,
80 "sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za",
81 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
82 is_supported<sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za>,
83 has_no_channel_multiplier,
84 qp_has_no_left_shift),
85 nullptr,
__anonfe55368c0202() 86 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
87 auto strat = new sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
88 return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
89 },
90 },
91 {
92 DepthwiseMethod::PLANAR,
93 "sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za",
94 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
95 is_supported<sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za>,
96 has_no_channel_multiplier,
97 qp_has_no_left_shift),
98 nullptr,
__anonfe55368c0302() 99 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
100 auto strat = new sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
101 return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
102 },
103 },
104 {
105 DepthwiseMethod::PLANAR,
106 "sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za",
107 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
108 is_supported<sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za>,
109 has_no_channel_multiplier,
110 qp_has_no_left_shift),
111 nullptr,
__anonfe55368c0402() 112 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
113 auto strat = new sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
114 return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
115 },
116 },
117 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
118 {
119 DepthwiseMethod::DEPTHFIRST,
120 "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
121 constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
122 has_no_channel_multiplier,
123 qp_has_no_left_shift,
124 cpu_has_sve2),
125 nullptr,
__anonfe55368c0502() 126 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
127 auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
128 return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
129 },
130 },
131 {
132 DepthwiseMethod::DEPTHFIRST,
133 "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
134 constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
135 has_no_channel_multiplier,
136 qp_has_no_left_shift,
137 cpu_has_sve2),
138 nullptr,
__anonfe55368c0602() 139 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
140 auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
141 return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
142 },
143 },
144 {
145 DepthwiseMethod::DEPTHFIRST,
146 "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
147 constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
148 has_no_channel_multiplier,
149 qp_has_no_left_shift,
150 cpu_has_sve2),
151 nullptr,
__anonfe55368c0702() 152 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
153 auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
154 return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
155 },
156 },
157 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
158 {
159 DepthwiseMethod::DEPTHFIRST,
160 "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
161 constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
162 has_no_channel_multiplier,
163 qp_has_no_left_shift),
164 nullptr,
__anonfe55368c0802() 165 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
166 auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
167 return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
168 },
169 },
170 {
171 DepthwiseMethod::DEPTHFIRST,
172 "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
173 constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
174 has_no_channel_multiplier,
175 qp_has_no_left_shift),
176 nullptr,
__anonfe55368c0902() 177 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
178 auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
179 return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
180 },
181 },
182 {
183 DepthwiseMethod::DEPTHFIRST,
184 "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
185 constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
186 has_no_channel_multiplier,
187 qp_has_no_left_shift),
188 nullptr,
__anonfe55368c0a02() 189 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
190 auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
191 return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
192 },
193 },
194 {
195 DepthwiseMethod::DEPTHFIRST,
196 "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
197 constraint<Requantize32>(has_no_channel_multiplier),
198 nullptr,
__anonfe55368c0b02() 199 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
200 auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
201 auto strat = new GenericDepthfirstStrategy<uint8_t, int8_t>(kernel, 3, 3, args);
202 return new DepthwiseDepthfirstGeneric<uint8_t, int8_t>(strat, args, qp);
203 },
204 },
205 {
206 DepthwiseMethod::DEPTHFIRST,
207 "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
208 constraint<Requantize32>(has_channel_multiplier),
209 nullptr,
__anonfe55368c0c02() 210 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
211 auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
212 auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t, int8_t>(kern, args);
213 return new DepthwiseDepthfirstMultiplier<uint8_t, int8_t, uint8_t, int32_t, true>(strat, args, qp);
214 },
215 },
216 #endif // defined(__aarch64__)
217 { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
218 };
219
220 template <>
depthwise_implementation_list()221 const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
222 {
223 return depthwise_u8q_methods;
224 }
225
226 template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
227 template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
228
229 } // namespace depthwise
230 } // namespace arm_conv
231