/*
 * Copyright (c) 2017 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <arm_neon.h>

#include "../../asmlib.hpp"

namespace arm_gemm {

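// 4x4 blocked unsigned 8-bit GEMM kernel.  Apanel holds groups of 4 rows
// of A and Bpanel holds groups of 4 columns of B, both packed so each is
// read sequentially.  Each inner-loop iteration consumes 16 values of K,
// so K is expected to be a multiple of 16 (any remainder is dropped by
// the division below).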
void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
    const uint8_t *a_ptr = Apanel;
    uint32_t *c_ptr = Cpanel;
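    // Each block processed by the assembly below covers 16 values of K.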
    K /= 16;

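    // Loop over the row blocks of the A panel.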
    for (int yb=0; yb<ablocks; yb++) {
        const uint8_t *a_ptr0 = a_ptr;
        const uint8_t *b_ptr = Bpanel;

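        // Loop over the column blocks of the B panel; the same A block is
        // multiplied by each B block in turn.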
        for (int xb=0; xb<bblocks; xb++) {
            a_ptr = a_ptr0;

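            // The first block of 16 is handled by the pipelined prologue
            // below, leaving K-1 iterations for the main loop.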
            int k = K-1;

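            // Pin the B values to v4-v7 so the inline assembly can refer
            // to them by name.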
            register uint8x16_t b0 asm("v4");
            register uint8x16_t b1 asm("v5");
            register uint8x16_t b2 asm("v6");
            register uint8x16_t b3 asm("v7");

            __asm __volatile (
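                // Prologue: zero the sixteen accumulators (v16-v31, one
                // per output element) while loading the first A and B
                // blocks and prefetching ahead.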
52 "movi v16.4s, #0x0\n"
53 "ldr q0, [%[a_ptr]]\n"
54 "movi v17.4s, #0x0\n"
55 "ldr %q[b0], [%[b_ptr]]\n"
56 "movi v18.4s, #0x0\n"
57 "ldr %q[b1], [%[b_ptr], #16]\n"
58 "movi v19.4s, #0x0\n"
59 "ldr %q[b2], [%[b_ptr], #32]\n"
60 "movi v20.4s, #0x0\n"
61 "ldr %q[b3], [%[b_ptr], #48]\n"
62 "movi v21.4s, #0x0\n"
63 "ldr q1, [%[a_ptr], #16]\n"
64 "movi v22.4s, #0x0\n"
65 "ldr q2, [%[a_ptr], #32]\n"
66 "movi v23.4s, #0x0\n"
67 "ldr q3, [%[a_ptr], #48]\n"
68 "movi v24.4s, #0x0\n"
69 ASM_PREFETCH("[%[b_ptr], #64]")
70 "movi v25.4s, #0x0\n"
71 ASM_PREFETCH("[%[a_ptr], #64]")
72 "movi v26.4s, #0x0\n"
73 ASM_PREFETCH("[%[b_ptr], #128]")
74 "movi v27.4s, #0x0\n"
75 ASM_PREFETCH("[%[a_ptr], #128]")
76 "movi v28.4s, #0x0\n"
77 ASM_PREFETCH("[%[b_ptr], #192]")
78 "movi v29.4s, #0x0\n"
79 ASM_PREFETCH("[%[a_ptr], #192]")
80 "movi v30.4s, #0x0\n"
81 ASM_PREFETCH("[%[b_ptr], #256]")
82 "movi v31.4s, #0x0\n"
83 ASM_PREFETCH("[%[a_ptr], #256]")
84
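                // Start the first widening multiplies (row 0 of A against
                // all four B columns) and advance both panel pointers.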
85 "umull v12.8h, v0.8b, %[b0].8b\n"
86 "add %[a_ptr], %[a_ptr], #64\n"
87 "umull v13.8h, v0.8b, %[b1].8b\n"
88 "umull v14.8h, v0.8b, %[b2].8b\n"
89 "add %[b_ptr], %[b_ptr], #64\n"
90 "umull v15.8h, v0.8b, %[b3].8b\n"
91
92 // Skip loop if we are doing zero iterations of it.
93 "cbz %w[k], 2f\n"
94
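                // Main loop: each iteration accumulates one block of 16
                // values of K into all 16 accumulators, via widening
                // multiplies (umull/umull2) and pairwise accumulation
                // (uadalp).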
95 "1:\n"
96 "uadalp v16.4s, v12.8h\n"
97 "umull2 v12.8h, v0.16b, %[b0].16b\n"
98 "uadalp v17.4s, v13.8h\n"
99 "umull2 v13.8h, v0.16b, %[b1].16b\n"
100 "uadalp v18.4s, v14.8h\n"
101 "umull2 v14.8h, v0.16b, %[b2].16b\n"
102 "uadalp v19.4s, v15.8h\n"
103 "umull2 v15.8h, v0.16b, %[b3].16b\n"
104 "ldr q0, [%[a_ptr]]\n"
105
106 "uadalp v16.4s, v12.8h\n"
107 "umull v12.8h, v1.8b, %[b0].8b\n"
108 "uadalp v17.4s, v13.8h\n"
109 "umull v13.8h, v1.8b, %[b1].8b\n"
110 "subs %w[k], %w[k], #1\n"
111 "uadalp v18.4s, v14.8h\n"
112 "umull v14.8h, v1.8b, %[b2].8b\n"
113 "uadalp v19.4s, v15.8h\n"
114 "umull v15.8h, v1.8b, %[b3].8b\n"
115
116 "uadalp v20.4s, v12.8h\n"
117 "umull2 v12.8h, v1.16b, %[b0].16b\n"
118 "uadalp v21.4s, v13.8h\n"
119 "umull2 v13.8h, v1.16b, %[b1].16b\n"
120 ASM_PREFETCH("[%[a_ptr], #256]")
121 "uadalp v22.4s, v14.8h\n"
122 "umull2 v14.8h, v1.16b, %[b2].16b\n"
123 "uadalp v23.4s, v15.8h\n"
124 "umull2 v15.8h, v1.16b, %[b3].16b\n"
125 "ldr q1, [%[a_ptr], #16]\n"
126
127 "uadalp v20.4s, v12.8h\n"
128 "umull v12.8h, v2.8b, %[b0].8b\n"
129 "uadalp v21.4s, v13.8h\n"
130 "umull v13.8h, v2.8b, %[b1].8b\n"
131 ASM_PREFETCH("[%[b_ptr], #256]")
132 "uadalp v22.4s, v14.8h\n"
133 "umull v14.8h, v2.8b, %[b2].8b\n"
134 "uadalp v23.4s, v15.8h\n"
135 "umull v15.8h, v2.8b, %[b3].8b\n"
136
137 "uadalp v24.4s, v12.8h\n"
138 "umull2 v12.8h, v2.16b, %[b0].16b\n"
139 "uadalp v25.4s, v13.8h\n"
140 "umull2 v13.8h, v2.16b, %[b1].16b\n"
141 "uadalp v26.4s, v14.8h\n"
142 "umull2 v14.8h, v2.16b, %[b2].16b\n"
143 "uadalp v27.4s, v15.8h\n"
144 "umull2 v15.8h, v2.16b, %[b3].16b\n"
145 "ldr q2, [%[a_ptr], #32]\n"
146
147 "uadalp v24.4s, v12.8h\n"
148 "umull v12.8h, v3.8b, %[b0].8b\n"
149 "uadalp v25.4s, v13.8h\n"
150 "umull v13.8h, v3.8b, %[b1].8b\n"
151 "uadalp v26.4s, v14.8h\n"
152 "umull v14.8h, v3.8b, %[b2].8b\n"
153 "uadalp v27.4s, v15.8h\n"
154 "umull v15.8h, v3.8b, %[b3].8b\n"
155
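                // Finish row 3 of A; the B registers are reloaded here
                // ready for the next iteration.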
156 "uadalp v28.4s, v12.8h\n"
157 "umull2 v12.8h, v3.16b, %[b0].16b\n"
158 "ldr %q[b0], [%[b_ptr]]\n"
159 "uadalp v29.4s, v13.8h\n"
160 "umull2 v13.8h, v3.16b, %[b1].16b\n"
161 "ldr %q[b1], [%[b_ptr], #16]\n"
162 "uadalp v30.4s, v14.8h\n"
163 "umull2 v14.8h, v3.16b, %[b2].16b\n"
164 "ldr %q[b2], [%[b_ptr], #32]\n"
165 "uadalp v31.4s, v15.8h\n"
166 "umull2 v15.8h, v3.16b, %[b3].16b\n"
167 "ldr %q[b3], [%[b_ptr], #48]\n"
168
169 "uadalp v28.4s, v12.8h\n"
170 "umull v12.8h, v0.8b, %[b0].8b\n"
171 "add %[b_ptr], %[b_ptr], #64\n"
172 "uadalp v29.4s, v13.8h\n"
173 "umull v13.8h, v0.8b, %[b1].8b\n"
174 "ldr q3, [%[a_ptr], #48]\n"
175 "uadalp v30.4s, v14.8h\n"
176 "umull v14.8h, v0.8b, %[b2].8b\n"
177 "add %[a_ptr], %[a_ptr], #64\n"
178 "uadalp v31.4s, v15.8h\n"
179 "umull v15.8h, v0.8b, %[b3].8b\n"
180 "bne 1b\n"
181
                // Loop tail (also the branch target when K==1): finish the
                // last block of 16 values of K, with no more data to load.
                "2:\n"
184 "uadalp v16.4s, v12.8h\n"
185 "umull2 v12.8h, v0.16b, %[b0].16b\n"
186 "uadalp v17.4s, v13.8h\n"
187 "umull2 v13.8h, v0.16b, %[b1].16b\n"
188 "uadalp v18.4s, v14.8h\n"
189 "umull2 v14.8h, v0.16b, %[b2].16b\n"
190 "uadalp v19.4s, v15.8h\n"
191 "umull2 v15.8h, v0.16b, %[b3].16b\n"
192
193 "uadalp v16.4s, v12.8h\n"
194 "umull v12.8h, v1.8b, %[b0].8b\n"
195 "uadalp v17.4s, v13.8h\n"
196 "umull v13.8h, v1.8b, %[b1].8b\n"
197 "uadalp v18.4s, v14.8h\n"
198 "umull v14.8h, v1.8b, %[b2].8b\n"
199 "uadalp v19.4s, v15.8h\n"
200 "umull v15.8h, v1.8b, %[b3].8b\n"
201
202 "uadalp v20.4s, v12.8h\n"
203 "umull2 v12.8h, v1.16b, %[b0].16b\n"
204 "uadalp v21.4s, v13.8h\n"
205 "umull2 v13.8h, v1.16b, %[b1].16b\n"
206 "uadalp v22.4s, v14.8h\n"
207 "umull2 v14.8h, v1.16b, %[b2].16b\n"
208 "uadalp v23.4s, v15.8h\n"
209 "umull2 v15.8h, v1.16b, %[b3].16b\n"
210
211 "uadalp v20.4s, v12.8h\n"
212 "umull v12.8h, v2.8b, %[b0].8b\n"
213 "uadalp v21.4s, v13.8h\n"
214 "umull v13.8h, v2.8b, %[b1].8b\n"
215 "uadalp v22.4s, v14.8h\n"
216 "umull v14.8h, v2.8b, %[b2].8b\n"
217 "uadalp v23.4s, v15.8h\n"
218 "umull v15.8h, v2.8b, %[b3].8b\n"
219
220 "uadalp v24.4s, v12.8h\n"
221 "umull2 v12.8h, v2.16b, %[b0].16b\n"
222 "uadalp v25.4s, v13.8h\n"
223 "umull2 v13.8h, v2.16b, %[b1].16b\n"
224 "uadalp v26.4s, v14.8h\n"
225 "umull2 v14.8h, v2.16b, %[b2].16b\n"
226 "uadalp v27.4s, v15.8h\n"
227 "umull2 v15.8h, v2.16b, %[b3].16b\n"
228
229 "uadalp v24.4s, v12.8h\n"
230 "umull v12.8h, v3.8b, %[b0].8b\n"
231 "uadalp v25.4s, v13.8h\n"
232 "umull v13.8h, v3.8b, %[b1].8b\n"
233 "uadalp v26.4s, v14.8h\n"
234 "umull v14.8h, v3.8b, %[b2].8b\n"
235 "uadalp v27.4s, v15.8h\n"
236 "umull v15.8h, v3.8b, %[b3].8b\n"
237
238 "uadalp v28.4s, v12.8h\n"
239 "umull2 v12.8h, v3.16b, %[b0].16b\n"
240 "uadalp v29.4s, v13.8h\n"
241 "umull2 v13.8h, v3.16b, %[b1].16b\n"
242 "uadalp v30.4s, v14.8h\n"
243 "umull2 v14.8h, v3.16b, %[b2].16b\n"
244 "uadalp v31.4s, v15.8h\n"
245 "umull2 v15.8h, v3.16b, %[b3].16b\n"
246
247 "uadalp v28.4s, v12.8h\n"
248 "uadalp v29.4s, v13.8h\n"
249 "uadalp v30.4s, v14.8h\n"
250 "uadalp v31.4s, v15.8h\n"
251
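                // Reduction: each accumulator holds four partial sums for
                // one output element.  Two rounds of pairwise addition
                // (addp) collapse each group of four accumulators into a
                // single vector of four final 32-bit results.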
252 "addp v16.4s, v16.4s, v17.4s\n"
253 "addp v17.4s, v18.4s, v19.4s\n"
254 "addp v18.4s, v20.4s, v21.4s\n"
255 "addp v19.4s, v22.4s, v23.4s\n"
256 "addp v20.4s, v24.4s, v25.4s\n"
257 "addp v21.4s, v26.4s, v27.4s\n"
258 "addp v22.4s, v28.4s, v29.4s\n"
259 "addp v23.4s, v30.4s, v31.4s\n"
260
261 "addp v16.4s, v16.4s, v17.4s\n"
262 "addp v17.4s, v18.4s, v19.4s\n"
263 "addp v18.4s, v20.4s, v21.4s\n"
264 "addp v19.4s, v22.4s, v23.4s\n"
265
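                // Store the finished 4x4 block of 32-bit results and
                // advance the output pointer.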
266 "str q16, [%[c_ptr]]\n"
267 "str q17, [%[c_ptr], #16]\n"
268 "str q18, [%[c_ptr], #32]\n"
269 "str q19, [%[c_ptr], #48]\n"
270 "add %[c_ptr], %[c_ptr], #64\n"
271
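                // All pointers, the loop counter and the B registers are
                // both read and updated by the assembly block.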
                :
                [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
                [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
                [k] "+r" (k)
                :
                : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
                "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__