xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2017 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef __aarch64__
25 
26 #include <arm_neon.h>
27 
28 #include "../../asmlib.hpp"
29 
30 namespace arm_gemm {
31 
// Unsigned 8-bit GEMM inner kernel producing 4x4 blocks of 32-bit results.
//
// Parameters:
//   Apanel  - start of the A data.  Every depth step reads 64 bytes from it
//             as four 16-byte vectors (offsets 0, 16, 32, 48).
//   Bpanel  - start of the B data, read in the same 4 x 16-byte pattern.  It
//             is re-read from the beginning for every A block.
//   Cpanel  - output.  Each (A block, B block) pair writes 16 consecutive
//             uint32_t values (64 bytes); pairs are written in the order
//             "all B blocks for A block 0, then all B blocks for A block 1...".
//   ablocks - number of A blocks (outer loop count).
//   bblocks - number of B blocks (inner loop count).
//   K       - depth in bytes; it is divided by 16 to get the step count.
//
// Within one output block, element [4*i + j] is the sum over the whole depth
// of the byte-wise products of A vector i and B vector j.
//
// NOTE(review): the code always processes at least one 16-byte depth step and
// derives the loop count as K/16 - 1, so it appears to require K >= 16 (and
// presumably a multiple of 16, since the remainder is discarded) - confirm
// against the callers.
void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
    const uint8_t *a_ptr = Apanel;
    uint32_t *c_ptr = Cpanel;
    // From here on K counts 16-byte depth steps rather than bytes.
    K /= 16;

    for (int yb=0; yb<ablocks; yb++) {
        // Remember where this A block starts: it is replayed for every B block.
        const uint8_t *a_ptr0 = a_ptr;
        const uint8_t *b_ptr = Bpanel;

        for (int xb=0; xb<bblocks; xb++) {
            a_ptr = a_ptr0;

            // The final depth step is handled by the tail after label 2, so
            // the main loop runs one time fewer than the number of steps.
            int k = K-1;

            // The B vectors are pinned to v4-v7 and passed as operands; they
            // are loaded inside the asm before being read.
            register uint8x16_t b0  asm("v4");
            register uint8x16_t b1  asm("v5");
            register uint8x16_t b2  asm("v6");
            register uint8x16_t b3  asm("v7");

            __asm __volatile (
                // Zero the sixteen 4x32-bit accumulators (v16-v31) while
                // loading the first depth step of A (q0-q3) and B (b0-b3).
                "movi	v16.4s, #0x0\n"
                "ldr	q0, [%[a_ptr]]\n"
                "movi	v17.4s, #0x0\n"
                "ldr	%q[b0], [%[b_ptr]]\n"
                "movi	v18.4s, #0x0\n"
                "ldr	%q[b1], [%[b_ptr], #16]\n"
                "movi	v19.4s, #0x0\n"
                "ldr	%q[b2], [%[b_ptr], #32]\n"
                "movi	v20.4s, #0x0\n"
                "ldr	%q[b3], [%[b_ptr], #48]\n"
                "movi	v21.4s, #0x0\n"
                "ldr	q1, [%[a_ptr], #16]\n"
                "movi	v22.4s, #0x0\n"
                "ldr	q2, [%[a_ptr], #32]\n"
                "movi	v23.4s, #0x0\n"
                "ldr	q3, [%[a_ptr], #48]\n"
                "movi	v24.4s, #0x0\n"
                // ASM_PREFETCH comes from asmlib.hpp; presumably it emits a
                // prefetch hint for the given address.
                ASM_PREFETCH("[%[b_ptr], #64]")
                "movi	v25.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #64]")
                "movi	v26.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #128]")
                "movi	v27.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #128]")
                "movi	v28.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #192]")
                "movi	v29.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #192]")
                "movi	v30.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #256]")
                "movi	v31.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #256]")

                // Prime the pipeline: 16-bit products of the low 8 bytes of
                // A vector 0 with each B vector go to v12-v15.  Both pointers
                // step on to the next depth step.
                "umull	v12.8h, v0.8b, %[b0].8b\n"
                "add	%[a_ptr], %[a_ptr], #64\n"
                "umull	v13.8h, v0.8b, %[b1].8b\n"
                "umull	v14.8h, v0.8b, %[b2].8b\n"
                "add	%[b_ptr], %[b_ptr], #64\n"
                "umull	v15.8h, v0.8b, %[b3].8b\n"

                // Skip loop if we are doing zero iterations of it.
                "cbz	%w[k], 2f\n"

                // Main loop: each uadalp folds the previous products into a
                // 32-bit accumulator while the paired umull/umull2 produces
                // the next set.  v16-v19 belong to A vector 0, v20-v23 to
                // vector 1, v24-v27 to vector 2 and v28-v31 to vector 3.
                // Loads for the next depth step are interleaved as soon as a
                // register has been fully consumed.
                "1:\n"
                "uadalp	v16.4s, v12.8h\n"
                "umull2	v12.8h, v0.16b, %[b0].16b\n"
                "uadalp	v17.4s, v13.8h\n"
                "umull2	v13.8h, v0.16b, %[b1].16b\n"
                "uadalp	v18.4s, v14.8h\n"
                "umull2	v14.8h, v0.16b, %[b2].16b\n"
                "uadalp	v19.4s, v15.8h\n"
                "umull2	v15.8h, v0.16b, %[b3].16b\n"
                "ldr 	q0, [%[a_ptr]]\n"

                "uadalp	v16.4s, v12.8h\n"
                "umull	v12.8h, v1.8b, %[b0].8b\n"
                "uadalp	v17.4s, v13.8h\n"
                "umull	v13.8h, v1.8b, %[b1].8b\n"
                // Sets the flags tested by the "bne" at the bottom of the
                // loop; nothing in between modifies them.
                "subs	%w[k], %w[k], #1\n"
                "uadalp	v18.4s, v14.8h\n"
                "umull	v14.8h, v1.8b, %[b2].8b\n"
                "uadalp	v19.4s, v15.8h\n"
                "umull	v15.8h, v1.8b, %[b3].8b\n"

                "uadalp	v20.4s, v12.8h\n"
                "umull2	v12.8h, v1.16b, %[b0].16b\n"
                "uadalp	v21.4s, v13.8h\n"
                "umull2	v13.8h, v1.16b, %[b1].16b\n"
                ASM_PREFETCH("[%[a_ptr], #256]")
                "uadalp	v22.4s, v14.8h\n"
                "umull2	v14.8h, v1.16b, %[b2].16b\n"
                "uadalp	v23.4s, v15.8h\n"
                "umull2	v15.8h, v1.16b, %[b3].16b\n"
                "ldr 	q1, [%[a_ptr], #16]\n"

                "uadalp	v20.4s, v12.8h\n"
                "umull	v12.8h, v2.8b, %[b0].8b\n"
                "uadalp	v21.4s, v13.8h\n"
                "umull	v13.8h, v2.8b, %[b1].8b\n"
                ASM_PREFETCH("[%[b_ptr], #256]")
                "uadalp	v22.4s, v14.8h\n"
                "umull	v14.8h, v2.8b, %[b2].8b\n"
                "uadalp	v23.4s, v15.8h\n"
                "umull	v15.8h, v2.8b, %[b3].8b\n"

                "uadalp	v24.4s, v12.8h\n"
                "umull2	v12.8h, v2.16b, %[b0].16b\n"
                "uadalp	v25.4s, v13.8h\n"
                "umull2	v13.8h, v2.16b, %[b1].16b\n"
                "uadalp	v26.4s, v14.8h\n"
                "umull2	v14.8h, v2.16b, %[b2].16b\n"
                "uadalp	v27.4s, v15.8h\n"
                "umull2	v15.8h, v2.16b, %[b3].16b\n"
                "ldr	q2, [%[a_ptr], #32]\n"

                "uadalp	v24.4s, v12.8h\n"
                "umull	v12.8h, v3.8b, %[b0].8b\n"
                "uadalp	v25.4s, v13.8h\n"
                "umull	v13.8h, v3.8b, %[b1].8b\n"
                "uadalp	v26.4s, v14.8h\n"
                "umull	v14.8h, v3.8b, %[b2].8b\n"
                "uadalp	v27.4s, v15.8h\n"
                "umull	v15.8h, v3.8b, %[b3].8b\n"

                // Each B register is reloaded immediately after its last use
                // in this depth step.
                "uadalp	v28.4s, v12.8h\n"
                "umull2	v12.8h, v3.16b, %[b0].16b\n"
                "ldr	%q[b0], [%[b_ptr]]\n"
                "uadalp	v29.4s, v13.8h\n"
                "umull2	v13.8h, v3.16b, %[b1].16b\n"
                "ldr	%q[b1], [%[b_ptr], #16]\n"
                "uadalp	v30.4s, v14.8h\n"
                "umull2	v14.8h, v3.16b, %[b2].16b\n"
                "ldr	%q[b2], [%[b_ptr], #32]\n"
                "uadalp	v31.4s, v15.8h\n"
                "umull2	v15.8h, v3.16b, %[b3].16b\n"
                "ldr	%q[b3], [%[b_ptr], #48]\n"

                // Start the next depth step (same role as the priming umulls
                // before the loop) while finishing the current one.
                "uadalp	v28.4s, v12.8h\n"
                "umull	v12.8h, v0.8b, %[b0].8b\n"
                "add	%[b_ptr], %[b_ptr], #64\n"
                "uadalp	v29.4s, v13.8h\n"
                "umull	v13.8h, v0.8b, %[b1].8b\n"
                "ldr	q3, [%[a_ptr], #48]\n"
                "uadalp	v30.4s, v14.8h\n"
                "umull	v14.8h, v0.8b, %[b2].8b\n"
                "add	%[a_ptr], %[a_ptr], #64\n"
                "uadalp	v31.4s, v15.8h\n"
                "umull	v15.8h, v0.8b, %[b3].8b\n"
                "bne	1b\n"

                // Branch target
                // Tail: the same arithmetic as one loop iteration for the
                // final depth step, without loads or pointer updates.
                "2:\n"
                "uadalp	v16.4s, v12.8h\n"
                "umull2	v12.8h, v0.16b, %[b0].16b\n"
                "uadalp	v17.4s, v13.8h\n"
                "umull2	v13.8h, v0.16b, %[b1].16b\n"
                "uadalp	v18.4s, v14.8h\n"
                "umull2	v14.8h, v0.16b, %[b2].16b\n"
                "uadalp	v19.4s, v15.8h\n"
                "umull2	v15.8h, v0.16b, %[b3].16b\n"

                "uadalp	v16.4s, v12.8h\n"
                "umull	v12.8h, v1.8b, %[b0].8b\n"
                "uadalp	v17.4s, v13.8h\n"
                "umull	v13.8h, v1.8b, %[b1].8b\n"
                "uadalp	v18.4s, v14.8h\n"
                "umull	v14.8h, v1.8b, %[b2].8b\n"
                "uadalp	v19.4s, v15.8h\n"
                "umull	v15.8h, v1.8b, %[b3].8b\n"

                "uadalp	v20.4s, v12.8h\n"
                "umull2	v12.8h, v1.16b, %[b0].16b\n"
                "uadalp	v21.4s, v13.8h\n"
                "umull2	v13.8h, v1.16b, %[b1].16b\n"
                "uadalp	v22.4s, v14.8h\n"
                "umull2	v14.8h, v1.16b, %[b2].16b\n"
                "uadalp	v23.4s, v15.8h\n"
                "umull2	v15.8h, v1.16b, %[b3].16b\n"

                "uadalp	v20.4s, v12.8h\n"
                "umull	v12.8h, v2.8b, %[b0].8b\n"
                "uadalp	v21.4s, v13.8h\n"
                "umull	v13.8h, v2.8b, %[b1].8b\n"
                "uadalp	v22.4s, v14.8h\n"
                "umull	v14.8h, v2.8b, %[b2].8b\n"
                "uadalp	v23.4s, v15.8h\n"
                "umull	v15.8h, v2.8b, %[b3].8b\n"

                "uadalp	v24.4s, v12.8h\n"
                "umull2	v12.8h, v2.16b, %[b0].16b\n"
                "uadalp	v25.4s, v13.8h\n"
                "umull2	v13.8h, v2.16b, %[b1].16b\n"
                "uadalp	v26.4s, v14.8h\n"
                "umull2	v14.8h, v2.16b, %[b2].16b\n"
                "uadalp	v27.4s, v15.8h\n"
                "umull2	v15.8h, v2.16b, %[b3].16b\n"

                "uadalp	v24.4s, v12.8h\n"
                "umull	v12.8h, v3.8b, %[b0].8b\n"
                "uadalp	v25.4s, v13.8h\n"
                "umull	v13.8h, v3.8b, %[b1].8b\n"
                "uadalp	v26.4s, v14.8h\n"
                "umull	v14.8h, v3.8b, %[b2].8b\n"
                "uadalp	v27.4s, v15.8h\n"
                "umull	v15.8h, v3.8b, %[b3].8b\n"

                "uadalp	v28.4s, v12.8h\n"
                "umull2	v12.8h, v3.16b, %[b0].16b\n"
                "uadalp	v29.4s, v13.8h\n"
                "umull2	v13.8h, v3.16b, %[b1].16b\n"
                "uadalp	v30.4s, v14.8h\n"
                "umull2	v14.8h, v3.16b, %[b2].16b\n"
                "uadalp	v31.4s, v15.8h\n"
                "umull2	v15.8h, v3.16b, %[b3].16b\n"

                "uadalp	v28.4s, v12.8h\n"
                "uadalp	v29.4s, v13.8h\n"
                "uadalp	v30.4s, v14.8h\n"
                "uadalp	v31.4s, v15.8h\n"

                // Two rounds of pairwise adds collapse each accumulator's four
                // lanes into one value, leaving the 16 results in v16-v19.
                "addp	v16.4s, v16.4s, v17.4s\n"
                "addp	v17.4s, v18.4s, v19.4s\n"
                "addp	v18.4s, v20.4s, v21.4s\n"
                "addp	v19.4s, v22.4s, v23.4s\n"
                "addp	v20.4s, v24.4s, v25.4s\n"
                "addp	v21.4s, v26.4s, v27.4s\n"
                "addp	v22.4s, v28.4s, v29.4s\n"
                "addp	v23.4s, v30.4s, v31.4s\n"

                "addp	v16.4s, v16.4s, v17.4s\n"
                "addp	v17.4s, v18.4s, v19.4s\n"
                "addp	v18.4s, v20.4s, v21.4s\n"
                "addp	v19.4s, v22.4s, v23.4s\n"

                // Write the 4x4 block and advance to the next output block.
                "str	q16, [%[c_ptr]]\n"
                "str	q17, [%[c_ptr], #16]\n"
                "str	q18, [%[c_ptr], #32]\n"
                "str	q19, [%[c_ptr], #48]\n"
                "add	%[c_ptr], %[c_ptr], #64\n"

            :
              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
              [k] "+r" (k)
            :
            // "memory" is required: the asm loads through a_ptr/b_ptr and
            // stores through c_ptr, and none of that memory is described by
            // an operand, so the compiler must not cache or reorder accesses
            // to it across this statement.
            // x20/x21 are not referenced in the template visible here; they
            // are kept in the list as a conservative clobber.
            : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
              "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc", "memory");
        }
    }
}
282 
283 } // namespace arm_gemm
284 
285 #endif // __aarch64__
286