xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #pragma once
25 
26 #ifdef __aarch64__
27 
28 template<>
MergeResults(uint32_t * out,const uint32_t * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const uint32_t * bias,Activation,bool append)29 void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
30 {
31     const uint32_t *inptr = in;
32     uint32_t nullbias[12];
33 
34 
35     if (!append && !bias)
36     {
37         memset(nullbias, 0, (12 * sizeof(uint32_t)));
38     }
39 
40     for (int y=y0; y<ymax; y+=8)
41     {
42         uint32_t *outptr0 = out + (y * ldout) + x0;
43         uint32_t *outptr1 = outptr0 + ldout;
44         uint32_t *outptr2 = outptr1 + ldout;
45         uint32_t *outptr3 = outptr2 + ldout;
46         uint32_t *outptr4 = outptr3 + ldout;
47         uint32_t *outptr5 = outptr4 + ldout;
48         uint32_t *outptr6 = outptr5 + ldout;
49         uint32_t *outptr7 = outptr6 + ldout;
50 
51         const int height = ymax - y;
52 
53         for (int i=x0; i<xmax; i+=12)
54         {
55             if (append)
56             {
57                 switch(height)
58                 {
59                 case 1:
60                     {
61                         if ((i+11) >= xmax)
62                         {
63                             for (int xi=0; xi<11; xi++)
64                             {
65                                 if ((i+xi) < xmax)
66                                 {
67                                     *outptr0 += inptr[xi];
68                                     outptr0++;
69                                 }
70                             }
71                             inptr += 96;
72                         } else {
73                             /* Optimized routine to accumulate an entire 12-wide block into the output (out += in) */
74                             __asm __volatile (
75                                 "ldr q2, [%[outptr0]]\n"
76                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
77                                 "ldr q10, [%[inptr]]\n"
78                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
79                                 "ldr q3, [%[outptr0], #0x10]\n"
80                                 "ldr q11, [%[inptr], #0x10]\n"
81                                 "add v10.4s, v10.4s, v2.4s\n"
82                                 "ldr q4, [%[outptr0], #0x20]\n"
83                                 "ldr q12, [%[inptr], #0x20]\n"
84                                 "add %[inptr], %[inptr], #0x180\n"
85                                 "add v11.4s, v11.4s, v3.4s\n"
86                                 "str q10, [%[outptr0]]\n"
87                                 "add v12.4s, v12.4s, v4.4s\n"
88                                 "str q11, [%[outptr0], #0x10]\n"
89                                 "str q12, [%[outptr0], #0x20]\n"
90                                 "add %[outptr0], %[outptr0], #0x30\n"
91                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
92                               [inptr] "+r" (inptr)
93                             :
94                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
95                             );
96                         }
97                     }
98                     break;
99 
100                 case 2:
101                     {
102                         if ((i+11) >= xmax)
103                         {
104                             for (int xi=0; xi<11; xi++)
105                             {
106                                 if ((i+xi) < xmax)
107                                 {
108                                     *outptr0 += inptr[xi];
109                                     outptr0++;
110                                     *outptr1 += inptr[xi + 12];
111                                     outptr1++;
112                                 }
113                             }
114                             inptr += 96;
115                         } else {
116                             /* Optimized routine to accumulate an entire 12-wide block into the output (out += in) */
117                             __asm __volatile (
118                                 "ldr q2, [%[outptr0]]\n"
119                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
120                                 "ldr q10, [%[inptr]]\n"
121                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
122                                 "ldr q3, [%[outptr0], #0x10]\n"
123                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
124                                 "add v10.4s, v10.4s, v2.4s\n"
125                                 "ldr q11, [%[inptr], #0x10]\n"
126                                 "ldr q4, [%[outptr0], #0x20]\n"
127                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
128                                 "ldr q12, [%[inptr], #0x20]\n"
129                                 "add v11.4s, v11.4s, v3.4s\n"
130                                 "str q10, [%[outptr0]]\n"
131                                 "ldr q5, [%[outptr1]]\n"
132                                 "ldr q13, [%[inptr], #0x30]\n"
133                                 "add v12.4s, v12.4s, v4.4s\n"
134                                 "str q11, [%[outptr0], #0x10]\n"
135                                 "ldr q6, [%[outptr1], #0x10]\n"
136                                 "ldr q14, [%[inptr], #0x40]\n"
137                                 "add v13.4s, v13.4s, v5.4s\n"
138                                 "str q12, [%[outptr0], #0x20]\n"
139                                 "ldr q7, [%[outptr1], #0x20]\n"
140                                 "add %[outptr0], %[outptr0], #0x30\n"
141                                 "add v14.4s, v14.4s, v6.4s\n"
142                                 "str q13, [%[outptr1]]\n"
143                                 "ldr q15, [%[inptr], #0x50]\n"
144                                 "add %[inptr], %[inptr], #0x180\n"
145                                 "str q14, [%[outptr1], #0x10]\n"
146                                 "add v15.4s, v15.4s, v7.4s\n"
147                                 "str q15, [%[outptr1], #0x20]\n"
148                                 "add %[outptr1], %[outptr1], #0x30\n"
149                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
150                               [inptr] "+r" (inptr)
151                             :
152                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
153                             );
154                         }
155                     }
156                     break;
157 
158                 case 3:
159                     {
160                         if ((i+11) >= xmax)
161                         {
162                             for (int xi=0; xi<11; xi++)
163                             {
164                                 if ((i+xi) < xmax)
165                                 {
166                                     *outptr0 += inptr[xi];
167                                     outptr0++;
168                                     *outptr1 += inptr[xi + 12];
169                                     outptr1++;
170                                     *outptr2 += inptr[xi + 24];
171                                     outptr2++;
172                                 }
173                             }
174                             inptr += 96;
175                         } else {
176                             /* Optimized routine to accumulate an entire 12-wide block into the output (out += in) */
177                             __asm __volatile (
178                                 "ldr q2, [%[outptr0]]\n"
179                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
180                                 "ldr q10, [%[inptr]]\n"
181                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
182                                 "ldr q3, [%[outptr0], #0x10]\n"
183                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
184                                 "add v10.4s, v10.4s, v2.4s\n"
185                                 "ldr q11, [%[inptr], #0x10]\n"
186                                 "ldr q4, [%[outptr0], #0x20]\n"
187                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
188                                 "ldr q12, [%[inptr], #0x20]\n"
189                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
190                                 "add v11.4s, v11.4s, v3.4s\n"
191                                 "str q10, [%[outptr0]]\n"
192                                 "ldr q5, [%[outptr1]]\n"
193                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
194                                 "add v12.4s, v12.4s, v4.4s\n"
195                                 "str q11, [%[outptr0], #0x10]\n"
196                                 "ldr q13, [%[inptr], #0x30]\n"
197                                 "ldr q6, [%[outptr1], #0x10]\n"
198                                 "ldr q14, [%[inptr], #0x40]\n"
199                                 "str q12, [%[outptr0], #0x20]\n"
200                                 "add %[outptr0], %[outptr0], #0x30\n"
201                                 "add v13.4s, v13.4s, v5.4s\n"
202                                 "ldr q7, [%[outptr1], #0x20]\n"
203                                 "add v14.4s, v14.4s, v6.4s\n"
204                                 "ldr q15, [%[inptr], #0x50]\n"
205                                 "ldr q8, [%[outptr2]]\n"
206                                 "ldr q16, [%[inptr], #0x60]\n"
207                                 "str q13, [%[outptr1]]\n"
208                                 "add v15.4s, v15.4s, v7.4s\n"
209                                 "ldr q9, [%[outptr2], #0x10]\n"
210                                 "ldr q17, [%[inptr], #0x70]\n"
211                                 "add v16.4s, v16.4s, v8.4s\n"
212                                 "str q14, [%[outptr1], #0x10]\n"
213                                 "ldr q2, [%[outptr2], #0x20]\n"
214                                 "ldr q10, [%[inptr], #0x80]\n"
215                                 "add %[inptr], %[inptr], #0x180\n"
216                                 "add v17.4s, v17.4s, v9.4s\n"
217                                 "str q15, [%[outptr1], #0x20]\n"
218                                 "add %[outptr1], %[outptr1], #0x30\n"
219                                 "add v10.4s, v10.4s, v2.4s\n"
220                                 "str q16, [%[outptr2]]\n"
221                                 "str q17, [%[outptr2], #0x10]\n"
222                                 "str q10, [%[outptr2], #0x20]\n"
223                                 "add %[outptr2], %[outptr2], #0x30\n"
224                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
225                               [inptr] "+r" (inptr)
226                             :
227                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
228                             );
229                         }
230                     }
231                     break;
232 
233                 case 4:
234                     {
235                         if ((i+11) >= xmax)
236                         {
237                             for (int xi=0; xi<11; xi++)
238                             {
239                                 if ((i+xi) < xmax)
240                                 {
241                                     *outptr0 += inptr[xi];
242                                     outptr0++;
243                                     *outptr1 += inptr[xi + 12];
244                                     outptr1++;
245                                     *outptr2 += inptr[xi + 24];
246                                     outptr2++;
247                                     *outptr3 += inptr[xi + 36];
248                                     outptr3++;
249                                 }
250                             }
251                             inptr += 96;
252                         } else {
253                             /* Optimized routine to accumulate an entire 12-wide block into the output (out += in) */
254                             __asm __volatile (
255                                 "ldr q2, [%[outptr0]]\n"
256                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
257                                 "ldr q10, [%[inptr]]\n"
258                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
259                                 "ldr q3, [%[outptr0], #0x10]\n"
260                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
261                                 "add v10.4s, v10.4s, v2.4s\n"
262                                 "ldr q11, [%[inptr], #0x10]\n"
263                                 "ldr q4, [%[outptr0], #0x20]\n"
264                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
265                                 "ldr q12, [%[inptr], #0x20]\n"
266                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
267                                 "add v11.4s, v11.4s, v3.4s\n"
268                                 "str q10, [%[outptr0]]\n"
269                                 "ldr q5, [%[outptr1]]\n"
270                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
271                                 "add v12.4s, v12.4s, v4.4s\n"
272                                 "str q11, [%[outptr0], #0x10]\n"
273                                 "ldr q13, [%[inptr], #0x30]\n"
274                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
275                                 "ldr q6, [%[outptr1], #0x10]\n"
276                                 "str q12, [%[outptr0], #0x20]\n"
277                                 "add %[outptr0], %[outptr0], #0x30\n"
278                                 "add v13.4s, v13.4s, v5.4s\n"
279                                 "ldr q14, [%[inptr], #0x40]\n"
280                                 "ldr q7, [%[outptr1], #0x20]\n"
281                                 "ldr q15, [%[inptr], #0x50]\n"
282                                 "ldr q8, [%[outptr2]]\n"
283                                 "add v14.4s, v14.4s, v6.4s\n"
284                                 "str q13, [%[outptr1]]\n"
285                                 "ldr q16, [%[inptr], #0x60]\n"
286                                 "add v15.4s, v15.4s, v7.4s\n"
287                                 "ldr q9, [%[outptr2], #0x10]\n"
288                                 "ldr q17, [%[inptr], #0x70]\n"
289                                 "str q14, [%[outptr1], #0x10]\n"
290                                 "add v16.4s, v16.4s, v8.4s\n"
291                                 "ldr q2, [%[outptr2], #0x20]\n"
292                                 "ldr q10, [%[inptr], #0x80]\n"
293                                 "add v17.4s, v17.4s, v9.4s\n"
294                                 "str q15, [%[outptr1], #0x20]\n"
295                                 "ldr q3, [%[outptr3]]\n"
296                                 "add %[outptr1], %[outptr1], #0x30\n"
297                                 "add v10.4s, v10.4s, v2.4s\n"
298                                 "str q16, [%[outptr2]]\n"
299                                 "ldr q11, [%[inptr], #0x90]\n"
300                                 "ldr q4, [%[outptr3], #0x10]\n"
301                                 "ldr q12, [%[inptr], #0xa0]\n"
302                                 "str q17, [%[outptr2], #0x10]\n"
303                                 "add v11.4s, v11.4s, v3.4s\n"
304                                 "ldr q5, [%[outptr3], #0x20]\n"
305                                 "ldr q13, [%[inptr], #0xb0]\n"
306                                 "add %[inptr], %[inptr], #0x180\n"
307                                 "add v12.4s, v12.4s, v4.4s\n"
308                                 "str q10, [%[outptr2], #0x20]\n"
309                                 "add %[outptr2], %[outptr2], #0x30\n"
310                                 "add v13.4s, v13.4s, v5.4s\n"
311                                 "str q11, [%[outptr3]]\n"
312                                 "str q12, [%[outptr3], #0x10]\n"
313                                 "str q13, [%[outptr3], #0x20]\n"
314                                 "add %[outptr3], %[outptr3], #0x30\n"
315                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
316                               [inptr] "+r" (inptr)
317                             :
318                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
319                             );
320                         }
321                     }
322                     break;
323 
324                 case 5:
325                     {
326                         if ((i+11) >= xmax)
327                         {
328                             for (int xi=0; xi<11; xi++)
329                             {
330                                 if ((i+xi) < xmax)
331                                 {
332                                     *outptr0 += inptr[xi];
333                                     outptr0++;
334                                     *outptr1 += inptr[xi + 12];
335                                     outptr1++;
336                                     *outptr2 += inptr[xi + 24];
337                                     outptr2++;
338                                     *outptr3 += inptr[xi + 36];
339                                     outptr3++;
340                                     *outptr4 += inptr[xi + 48];
341                                     outptr4++;
342                                 }
343                             }
344                             inptr += 96;
345                         } else {
346                             /* Optimized routine to accumulate an entire 12-wide block into the output (out += in) */
347                             __asm __volatile (
348                                 "ldr q2, [%[outptr0]]\n"
349                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
350                                 "ldr q10, [%[inptr]]\n"
351                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
352                                 "ldr q3, [%[outptr0], #0x10]\n"
353                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
354                                 "add v10.4s, v10.4s, v2.4s\n"
355                                 "ldr q11, [%[inptr], #0x10]\n"
356                                 "ldr q4, [%[outptr0], #0x20]\n"
357                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
358                                 "ldr q12, [%[inptr], #0x20]\n"
359                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
360                                 "add v11.4s, v11.4s, v3.4s\n"
361                                 "str q10, [%[outptr0]]\n"
362                                 "ldr q5, [%[outptr1]]\n"
363                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
364                                 "add v12.4s, v12.4s, v4.4s\n"
365                                 "str q11, [%[outptr0], #0x10]\n"
366                                 "ldr q13, [%[inptr], #0x30]\n"
367                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
368                                 "ldr q6, [%[outptr1], #0x10]\n"
369                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
370                                 "add v13.4s, v13.4s, v5.4s\n"
371                                 "str q12, [%[outptr0], #0x20]\n"
372                                 "ldr q14, [%[inptr], #0x40]\n"
373                                 "add %[outptr0], %[outptr0], #0x30\n"
374                                 "ldr q7, [%[outptr1], #0x20]\n"
375                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
376                                 "add v14.4s, v14.4s, v6.4s\n"
377                                 "str q13, [%[outptr1]]\n"
378                                 "ldr q15, [%[inptr], #0x50]\n"
379                                 "ldr q8, [%[outptr2]]\n"
380                                 "ldr q16, [%[inptr], #0x60]\n"
381                                 "str q14, [%[outptr1], #0x10]\n"
382                                 "add v15.4s, v15.4s, v7.4s\n"
383                                 "ldr q9, [%[outptr2], #0x10]\n"
384                                 "ldr q17, [%[inptr], #0x70]\n"
385                                 "add v16.4s, v16.4s, v8.4s\n"
386                                 "ldr q2, [%[outptr2], #0x20]\n"
387                                 "ldr q10, [%[inptr], #0x80]\n"
388                                 "str q15, [%[outptr1], #0x20]\n"
389                                 "add %[outptr1], %[outptr1], #0x30\n"
390                                 "add v17.4s, v17.4s, v9.4s\n"
391                                 "ldr q3, [%[outptr3]]\n"
392                                 "add v10.4s, v10.4s, v2.4s\n"
393                                 "str q16, [%[outptr2]]\n"
394                                 "ldr q11, [%[inptr], #0x90]\n"
395                                 "ldr q4, [%[outptr3], #0x10]\n"
396                                 "ldr q12, [%[inptr], #0xa0]\n"
397                                 "str q17, [%[outptr2], #0x10]\n"
398                                 "add v11.4s, v11.4s, v3.4s\n"
399                                 "ldr q5, [%[outptr3], #0x20]\n"
400                                 "ldr q13, [%[inptr], #0xb0]\n"
401                                 "add v12.4s, v12.4s, v4.4s\n"
402                                 "str q10, [%[outptr2], #0x20]\n"
403                                 "ldr q6, [%[outptr4]]\n"
404                                 "add %[outptr2], %[outptr2], #0x30\n"
405                                 "add v13.4s, v13.4s, v5.4s\n"
406                                 "str q11, [%[outptr3]]\n"
407                                 "ldr q14, [%[inptr], #0xc0]\n"
408                                 "ldr q7, [%[outptr4], #0x10]\n"
409                                 "ldr q15, [%[inptr], #0xd0]\n"
410                                 "str q12, [%[outptr3], #0x10]\n"
411                                 "add v14.4s, v14.4s, v6.4s\n"
412                                 "ldr q8, [%[outptr4], #0x20]\n"
413                                 "ldr q16, [%[inptr], #0xe0]\n"
414                                 "add %[inptr], %[inptr], #0x180\n"
415                                 "add v15.4s, v15.4s, v7.4s\n"
416                                 "str q13, [%[outptr3], #0x20]\n"
417                                 "add %[outptr3], %[outptr3], #0x30\n"
418                                 "add v16.4s, v16.4s, v8.4s\n"
419                                 "str q14, [%[outptr4]]\n"
420                                 "str q15, [%[outptr4], #0x10]\n"
421                                 "str q16, [%[outptr4], #0x20]\n"
422                                 "add %[outptr4], %[outptr4], #0x30\n"
423                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
424                               [inptr] "+r" (inptr)
425                             :
426                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
427                             );
428                         }
429                     }
430                     break;
431 
432                 case 6:
433                     {
434                         if ((i+11) >= xmax)
435                         {
436                             for (int xi=0; xi<11; xi++)
437                             {
438                                 if ((i+xi) < xmax)
439                                 {
440                                     *outptr0 += inptr[xi];
441                                     outptr0++;
442                                     *outptr1 += inptr[xi + 12];
443                                     outptr1++;
444                                     *outptr2 += inptr[xi + 24];
445                                     outptr2++;
446                                     *outptr3 += inptr[xi + 36];
447                                     outptr3++;
448                                     *outptr4 += inptr[xi + 48];
449                                     outptr4++;
450                                     *outptr5 += inptr[xi + 60];
451                                     outptr5++;
452                                 }
453                             }
454                             inptr += 96;
455                         } else {
456                             /* Optimized routine to accumulate an entire 12-wide block into the output (out += in) */
457                             __asm __volatile (
458                                 "ldr q2, [%[outptr0]]\n"
459                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
460                                 "ldr q10, [%[inptr]]\n"
461                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
462                                 "ldr q3, [%[outptr0], #0x10]\n"
463                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
464                                 "add v10.4s, v10.4s, v2.4s\n"
465                                 "ldr q11, [%[inptr], #0x10]\n"
466                                 "ldr q4, [%[outptr0], #0x20]\n"
467                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
468                                 "ldr q12, [%[inptr], #0x20]\n"
469                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
470                                 "add v11.4s, v11.4s, v3.4s\n"
471                                 "str q10, [%[outptr0]]\n"
472                                 "ldr q5, [%[outptr1]]\n"
473                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
474                                 "add v12.4s, v12.4s, v4.4s\n"
475                                 "str q11, [%[outptr0], #0x10]\n"
476                                 "ldr q13, [%[inptr], #0x30]\n"
477                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
478                                 "ldr q6, [%[outptr1], #0x10]\n"
479                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
480                                 "add v13.4s, v13.4s, v5.4s\n"
481                                 "str q12, [%[outptr0], #0x20]\n"
482                                 "ldr q14, [%[inptr], #0x40]\n"
483                                 "add %[outptr0], %[outptr0], #0x30\n"
484                                 "ldr q7, [%[outptr1], #0x20]\n"
485                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
486                                 "add v14.4s, v14.4s, v6.4s\n"
487                                 "str q13, [%[outptr1]]\n"
488                                 "ldr q15, [%[inptr], #0x50]\n"
489                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
490                                 "ldr q8, [%[outptr2]]\n"
491                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
492                                 "add v15.4s, v15.4s, v7.4s\n"
493                                 "str q14, [%[outptr1], #0x10]\n"
494                                 "ldr q16, [%[inptr], #0x60]\n"
495                                 "ldr q9, [%[outptr2], #0x10]\n"
496                                 "ldr q17, [%[inptr], #0x70]\n"
497                                 "str q15, [%[outptr1], #0x20]\n"
498                                 "add %[outptr1], %[outptr1], #0x30\n"
499                                 "add v16.4s, v16.4s, v8.4s\n"
500                                 "ldr q2, [%[outptr2], #0x20]\n"
501                                 "add v17.4s, v17.4s, v9.4s\n"
502                                 "ldr q10, [%[inptr], #0x80]\n"
503                                 "ldr q3, [%[outptr3]]\n"
504                                 "ldr q11, [%[inptr], #0x90]\n"
505                                 "str q16, [%[outptr2]]\n"
506                                 "add v10.4s, v10.4s, v2.4s\n"
507                                 "ldr q4, [%[outptr3], #0x10]\n"
508                                 "ldr q12, [%[inptr], #0xa0]\n"
509                                 "add v11.4s, v11.4s, v3.4s\n"
510                                 "str q17, [%[outptr2], #0x10]\n"
511                                 "ldr q5, [%[outptr3], #0x20]\n"
512                                 "ldr q13, [%[inptr], #0xb0]\n"
513                                 "add v12.4s, v12.4s, v4.4s\n"
514                                 "str q10, [%[outptr2], #0x20]\n"
515                                 "ldr q6, [%[outptr4]]\n"
516                                 "add %[outptr2], %[outptr2], #0x30\n"
517                                 "add v13.4s, v13.4s, v5.4s\n"
518                                 "str q11, [%[outptr3]]\n"
519                                 "ldr q14, [%[inptr], #0xc0]\n"
520                                 "ldr q7, [%[outptr4], #0x10]\n"
521                                 "ldr q15, [%[inptr], #0xd0]\n"
522                                 "str q12, [%[outptr3], #0x10]\n"
523                                 "add v14.4s, v14.4s, v6.4s\n"
524                                 "ldr q8, [%[outptr4], #0x20]\n"
525                                 "ldr q16, [%[inptr], #0xe0]\n"
526                                 "add v15.4s, v15.4s, v7.4s\n"
527                                 "str q13, [%[outptr3], #0x20]\n"
528                                 "ldr q9, [%[outptr5]]\n"
529                                 "add %[outptr3], %[outptr3], #0x30\n"
530                                 "add v16.4s, v16.4s, v8.4s\n"
531                                 "str q14, [%[outptr4]]\n"
532                                 "ldr q17, [%[inptr], #0xf0]\n"
533                                 "ldr q2, [%[outptr5], #0x10]\n"
534                                 "ldr q10, [%[inptr], #0x100]\n"
535                                 "str q15, [%[outptr4], #0x10]\n"
536                                 "add v17.4s, v17.4s, v9.4s\n"
537                                 "ldr q3, [%[outptr5], #0x20]\n"
538                                 "ldr q11, [%[inptr], #0x110]\n"
539                                 "add %[inptr], %[inptr], #0x180\n"
540                                 "add v10.4s, v10.4s, v2.4s\n"
541                                 "str q16, [%[outptr4], #0x20]\n"
542                                 "add %[outptr4], %[outptr4], #0x30\n"
543                                 "add v11.4s, v11.4s, v3.4s\n"
544                                 "str q17, [%[outptr5]]\n"
545                                 "str q10, [%[outptr5], #0x10]\n"
546                                 "str q11, [%[outptr5], #0x20]\n"
547                                 "add %[outptr5], %[outptr5], #0x30\n"
548                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
549                               [inptr] "+r" (inptr)
550                             :
551                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
552                             );
553                         }
554                     }
555                     break;
556 
557                 case 7:
558                     {
559                         if ((i+11) >= xmax)
560                         {
561                             for (int xi=0; xi<11; xi++)
562                             {
563                                 if ((i+xi) < xmax)
564                                 {
565                                     *outptr0 += inptr[xi];
566                                     outptr0++;
567                                     *outptr1 += inptr[xi + 12];
568                                     outptr1++;
569                                     *outptr2 += inptr[xi + 24];
570                                     outptr2++;
571                                     *outptr3 += inptr[xi + 36];
572                                     outptr3++;
573                                     *outptr4 += inptr[xi + 48];
574                                     outptr4++;
575                                     *outptr5 += inptr[xi + 60];
576                                     outptr5++;
577                                     *outptr6 += inptr[xi + 72];
578                                     outptr6++;
579                                 }
580                             }
581                             inptr += 96;
582                         } else {
583                             /* Optimized routine for a full 12-column block: adds the input rows to the values already in the output rows. */
584                             __asm __volatile (
585                                 "ldr q2, [%[outptr0]]\n"
586                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
587                                 "ldr q10, [%[inptr]]\n"
588                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
589                                 "ldr q3, [%[outptr0], #0x10]\n"
590                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
591                                 "add v10.4s, v10.4s, v2.4s\n"
592                                 "ldr q11, [%[inptr], #0x10]\n"
593                                 "ldr q4, [%[outptr0], #0x20]\n"
594                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
595                                 "ldr q12, [%[inptr], #0x20]\n"
596                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
597                                 "add v11.4s, v11.4s, v3.4s\n"
598                                 "str q10, [%[outptr0]]\n"
599                                 "ldr q5, [%[outptr1]]\n"
600                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
601                                 "add v12.4s, v12.4s, v4.4s\n"
602                                 "str q11, [%[outptr0], #0x10]\n"
603                                 "ldr q13, [%[inptr], #0x30]\n"
604                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
605                                 "ldr q6, [%[outptr1], #0x10]\n"
606                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
607                                 "add v13.4s, v13.4s, v5.4s\n"
608                                 "str q12, [%[outptr0], #0x20]\n"
609                                 "ldr q14, [%[inptr], #0x40]\n"
610                                 "add %[outptr0], %[outptr0], #0x30\n"
611                                 "ldr q7, [%[outptr1], #0x20]\n"
612                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
613                                 "add v14.4s, v14.4s, v6.4s\n"
614                                 "str q13, [%[outptr1]]\n"
615                                 "ldr q15, [%[inptr], #0x50]\n"
616                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
617                                 "ldr q8, [%[outptr2]]\n"
618                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
619                                 "add v15.4s, v15.4s, v7.4s\n"
620                                 "str q14, [%[outptr1], #0x10]\n"
621                                 "ldr q16, [%[inptr], #0x60]\n"
622                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
623                                 "ldr q9, [%[outptr2], #0x10]\n"
624                                 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
625                                 "add v16.4s, v16.4s, v8.4s\n"
626                                 "str q15, [%[outptr1], #0x20]\n"
627                                 "ldr q17, [%[inptr], #0x70]\n"
628                                 "add %[outptr1], %[outptr1], #0x30\n"
629                                 "ldr q2, [%[outptr2], #0x20]\n"
630                                 "str q16, [%[outptr2]]\n"
631                                 "add v17.4s, v17.4s, v9.4s\n"
632                                 "ldr q10, [%[inptr], #0x80]\n"
633                                 "ldr q3, [%[outptr3]]\n"
634                                 "ldr q11, [%[inptr], #0x90]\n"
635                                 "ldr q4, [%[outptr3], #0x10]\n"
636                                 "add v10.4s, v10.4s, v2.4s\n"
637                                 "str q17, [%[outptr2], #0x10]\n"
638                                 "ldr q12, [%[inptr], #0xa0]\n"
639                                 "add v11.4s, v11.4s, v3.4s\n"
640                                 "ldr q5, [%[outptr3], #0x20]\n"
641                                 "ldr q13, [%[inptr], #0xb0]\n"
642                                 "str q10, [%[outptr2], #0x20]\n"
643                                 "add %[outptr2], %[outptr2], #0x30\n"
644                                 "add v12.4s, v12.4s, v4.4s\n"
645                                 "ldr q6, [%[outptr4]]\n"
646                                 "add v13.4s, v13.4s, v5.4s\n"
647                                 "str q11, [%[outptr3]]\n"
648                                 "ldr q14, [%[inptr], #0xc0]\n"
649                                 "ldr q7, [%[outptr4], #0x10]\n"
650                                 "ldr q15, [%[inptr], #0xd0]\n"
651                                 "str q12, [%[outptr3], #0x10]\n"
652                                 "add v14.4s, v14.4s, v6.4s\n"
653                                 "ldr q8, [%[outptr4], #0x20]\n"
654                                 "ldr q16, [%[inptr], #0xe0]\n"
655                                 "add v15.4s, v15.4s, v7.4s\n"
656                                 "str q13, [%[outptr3], #0x20]\n"
657                                 "ldr q9, [%[outptr5]]\n"
658                                 "add %[outptr3], %[outptr3], #0x30\n"
659                                 "add v16.4s, v16.4s, v8.4s\n"
660                                 "str q14, [%[outptr4]]\n"
661                                 "ldr q17, [%[inptr], #0xf0]\n"
662                                 "ldr q2, [%[outptr5], #0x10]\n"
663                                 "ldr q10, [%[inptr], #0x100]\n"
664                                 "str q15, [%[outptr4], #0x10]\n"
665                                 "add v17.4s, v17.4s, v9.4s\n"
666                                 "ldr q3, [%[outptr5], #0x20]\n"
667                                 "ldr q11, [%[inptr], #0x110]\n"
668                                 "add v10.4s, v10.4s, v2.4s\n"
669                                 "str q16, [%[outptr4], #0x20]\n"
670                                 "ldr q4, [%[outptr6]]\n"
671                                 "add %[outptr4], %[outptr4], #0x30\n"
672                                 "add v11.4s, v11.4s, v3.4s\n"
673                                 "str q17, [%[outptr5]]\n"
674                                 "ldr q12, [%[inptr], #0x120]\n"
675                                 "ldr q5, [%[outptr6], #0x10]\n"
676                                 "ldr q13, [%[inptr], #0x130]\n"
677                                 "str q10, [%[outptr5], #0x10]\n"
678                                 "add v12.4s, v12.4s, v4.4s\n"
679                                 "ldr q6, [%[outptr6], #0x20]\n"
680                                 "ldr q14, [%[inptr], #0x140]\n"
681                                 "add %[inptr], %[inptr], #0x180\n"
682                                 "add v13.4s, v13.4s, v5.4s\n"
683                                 "str q11, [%[outptr5], #0x20]\n"
684                                 "add %[outptr5], %[outptr5], #0x30\n"
685                                 "add v14.4s, v14.4s, v6.4s\n"
686                                 "str q12, [%[outptr6]]\n"
687                                 "str q13, [%[outptr6], #0x10]\n"
688                                 "str q14, [%[outptr6], #0x20]\n"
689                                 "add %[outptr6], %[outptr6], #0x30\n"
690                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
691                               [inptr] "+r" (inptr)
692                             :
693                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
694                             );
695                         }
696                     }
697                     break;
698 
699                 default:
700                 case 8:
701                     {
702                         if ((i+11) >= xmax)
703                         {
704                             for (int xi=0; xi<11; xi++)
705                             {
706                                 if ((i+xi) < xmax)
707                                 {
708                                     *outptr0 += inptr[xi];
709                                     outptr0++;
710                                     *outptr1 += inptr[xi + 12];
711                                     outptr1++;
712                                     *outptr2 += inptr[xi + 24];
713                                     outptr2++;
714                                     *outptr3 += inptr[xi + 36];
715                                     outptr3++;
716                                     *outptr4 += inptr[xi + 48];
717                                     outptr4++;
718                                     *outptr5 += inptr[xi + 60];
719                                     outptr5++;
720                                     *outptr6 += inptr[xi + 72];
721                                     outptr6++;
722                                     *outptr7 += inptr[xi + 84];
723                                     outptr7++;
724                                 }
725                             }
726                             inptr += 96;
727                         } else {
728                             /* Optimized routine for a full 12-column block: adds the input rows to the values already in the output rows. */
729                             __asm __volatile (
730                                 "ldr q2, [%[outptr0]]\n"
731                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
732                                 "ldr q10, [%[inptr]]\n"
733                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
734                                 "ldr q3, [%[outptr0], #0x10]\n"
735                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
736                                 "add v10.4s, v10.4s, v2.4s\n"
737                                 "ldr q11, [%[inptr], #0x10]\n"
738                                 "ldr q4, [%[outptr0], #0x20]\n"
739                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
740                                 "ldr q12, [%[inptr], #0x20]\n"
741                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
742                                 "add v11.4s, v11.4s, v3.4s\n"
743                                 "str q10, [%[outptr0]]\n"
744                                 "ldr q5, [%[outptr1]]\n"
745                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
746                                 "add v12.4s, v12.4s, v4.4s\n"
747                                 "str q11, [%[outptr0], #0x10]\n"
748                                 "ldr q13, [%[inptr], #0x30]\n"
749                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
750                                 "ldr q6, [%[outptr1], #0x10]\n"
751                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
752                                 "add v13.4s, v13.4s, v5.4s\n"
753                                 "str q12, [%[outptr0], #0x20]\n"
754                                 "ldr q14, [%[inptr], #0x40]\n"
755                                 "add %[outptr0], %[outptr0], #0x30\n"
756                                 "ldr q7, [%[outptr1], #0x20]\n"
757                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
758                                 "add v14.4s, v14.4s, v6.4s\n"
759                                 "str q13, [%[outptr1]]\n"
760                                 "ldr q15, [%[inptr], #0x50]\n"
761                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
762                                 "ldr q8, [%[outptr2]]\n"
763                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
764                                 "add v15.4s, v15.4s, v7.4s\n"
765                                 "str q14, [%[outptr1], #0x10]\n"
766                                 "ldr q16, [%[inptr], #0x60]\n"
767                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
768                                 "ldr q9, [%[outptr2], #0x10]\n"
769                                 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
770                                 "add v16.4s, v16.4s, v8.4s\n"
771                                 "str q15, [%[outptr1], #0x20]\n"
772                                 "ldr q17, [%[inptr], #0x70]\n"
773                                 "add %[outptr1], %[outptr1], #0x30\n"
774                                 "ldr q2, [%[outptr2], #0x20]\n"
775                                 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
776                                 "add v17.4s, v17.4s, v9.4s\n"
777                                 "str q16, [%[outptr2]]\n"
778                                 "ldr q10, [%[inptr], #0x80]\n"
779                                 "ldr q3, [%[outptr3]]\n"
780                                 "ldr q11, [%[inptr], #0x90]\n"
781                                 "str q17, [%[outptr2], #0x10]\n"
782                                 "add v10.4s, v10.4s, v2.4s\n"
783                                 "ldr q4, [%[outptr3], #0x10]\n"
784                                 "ldr q12, [%[inptr], #0xa0]\n"
785                                 "add v11.4s, v11.4s, v3.4s\n"
786                                 "ldr q5, [%[outptr3], #0x20]\n"
787                                 "ldr q13, [%[inptr], #0xb0]\n"
788                                 "str q10, [%[outptr2], #0x20]\n"
789                                 "add %[outptr2], %[outptr2], #0x30\n"
790                                 "add v12.4s, v12.4s, v4.4s\n"
791                                 "ldr q6, [%[outptr4]]\n"
792                                 "add v13.4s, v13.4s, v5.4s\n"
793                                 "str q11, [%[outptr3]]\n"
794                                 "ldr q14, [%[inptr], #0xc0]\n"
795                                 "ldr q7, [%[outptr4], #0x10]\n"
796                                 "ldr q15, [%[inptr], #0xd0]\n"
797                                 "str q12, [%[outptr3], #0x10]\n"
798                                 "add v14.4s, v14.4s, v6.4s\n"
799                                 "ldr q8, [%[outptr4], #0x20]\n"
800                                 "ldr q16, [%[inptr], #0xe0]\n"
801                                 "add v15.4s, v15.4s, v7.4s\n"
802                                 "str q13, [%[outptr3], #0x20]\n"
803                                 "ldr q9, [%[outptr5]]\n"
804                                 "add %[outptr3], %[outptr3], #0x30\n"
805                                 "add v16.4s, v16.4s, v8.4s\n"
806                                 "str q14, [%[outptr4]]\n"
807                                 "ldr q17, [%[inptr], #0xf0]\n"
808                                 "ldr q2, [%[outptr5], #0x10]\n"
809                                 "ldr q10, [%[inptr], #0x100]\n"
810                                 "str q15, [%[outptr4], #0x10]\n"
811                                 "add v17.4s, v17.4s, v9.4s\n"
812                                 "ldr q3, [%[outptr5], #0x20]\n"
813                                 "ldr q11, [%[inptr], #0x110]\n"
814                                 "add v10.4s, v10.4s, v2.4s\n"
815                                 "str q16, [%[outptr4], #0x20]\n"
816                                 "ldr q4, [%[outptr6]]\n"
817                                 "add %[outptr4], %[outptr4], #0x30\n"
818                                 "add v11.4s, v11.4s, v3.4s\n"
819                                 "str q17, [%[outptr5]]\n"
820                                 "ldr q12, [%[inptr], #0x120]\n"
821                                 "ldr q5, [%[outptr6], #0x10]\n"
822                                 "ldr q13, [%[inptr], #0x130]\n"
823                                 "str q10, [%[outptr5], #0x10]\n"
824                                 "add v12.4s, v12.4s, v4.4s\n"
825                                 "ldr q6, [%[outptr6], #0x20]\n"
826                                 "ldr q14, [%[inptr], #0x140]\n"
827                                 "add v13.4s, v13.4s, v5.4s\n"
828                                 "str q11, [%[outptr5], #0x20]\n"
829                                 "ldr q7, [%[outptr7]]\n"
830                                 "add %[outptr5], %[outptr5], #0x30\n"
831                                 "add v14.4s, v14.4s, v6.4s\n"
832                                 "str q12, [%[outptr6]]\n"
833                                 "ldr q15, [%[inptr], #0x150]\n"
834                                 "ldr q8, [%[outptr7], #0x10]\n"
835                                 "ldr q16, [%[inptr], #0x160]\n"
836                                 "str q13, [%[outptr6], #0x10]\n"
837                                 "add v15.4s, v15.4s, v7.4s\n"
838                                 "ldr q9, [%[outptr7], #0x20]\n"
839                                 "ldr q17, [%[inptr], #0x170]\n"
840                                 "add %[inptr], %[inptr], #0x180\n"
841                                 "add v16.4s, v16.4s, v8.4s\n"
842                                 "str q14, [%[outptr6], #0x20]\n"
843                                 "add %[outptr6], %[outptr6], #0x30\n"
844                                 "add v17.4s, v17.4s, v9.4s\n"
845                                 "str q15, [%[outptr7]]\n"
846                                 "str q16, [%[outptr7], #0x10]\n"
847                                 "str q17, [%[outptr7], #0x20]\n"
848                                 "add %[outptr7], %[outptr7], #0x30\n"
849                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
850                               [inptr] "+r" (inptr)
851                             :
852                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
853                             );
854                         }
855                     }
856                     break;
857 
858 
859                 }
860             }
861             else
862             {
863                 const uint32_t *biasptr = bias ? bias + i : nullbias;
864 
865                 switch(height)
866                 {
867                 case 1:
868                     {
869                         if ((i+11) >= xmax)
870                         {
871                             for (int xi=0; xi<11; xi++)
872                             {
873                                 if ((i+xi) < xmax)
874                                 {
875                                     *outptr0 = biasptr[xi] + inptr[xi];
876                                     outptr0++;
877                                 }
878                             }
879                             inptr += 96;
880                         } else {
881                             /* Optimized routine for a full 12-column block: stores input plus bias to the output row. */
882                             __asm __volatile (
883                                 "ldr q2, [%[biasptr]]\n"
884                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
885                                 "ldr q3, [%[biasptr], #0x10]\n"
886                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
887                                 "ldr q4, [%[biasptr], #0x20]\n"
888                                 "ldr q13, [%[inptr]]\n"
889                                 "ldr q14, [%[inptr], #0x10]\n"
890                                 "ldr q15, [%[inptr], #0x20]\n"
891                                 "add %[inptr], %[inptr], #0x180\n"
892                                 "add v13.4s, v13.4s, v2.4s\n"
893                                 "add v14.4s, v14.4s, v3.4s\n"
894                                 "add v15.4s, v15.4s, v4.4s\n"
895                                 "str q13, [%[outptr0]]\n"
896                                 "str q14, [%[outptr0], #0x10]\n"
897                                 "str q15, [%[outptr0], #0x20]\n"
898                                 "add %[outptr0], %[outptr0], #0x30\n"
899                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
900                               [inptr] "+r" (inptr)
901                             : [biasptr] "r" (biasptr)
902                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
903                             );
904                         }
905                     }
906                     break;
907 
908                 case 2:
909                     {
910                         if ((i+11) >= xmax)
911                         {
912                             for (int xi=0; xi<11; xi++)
913                             {
914                                 if ((i+xi) < xmax)
915                                 {
916                                     *outptr0 = biasptr[xi] + inptr[xi];
917                                     outptr0++;
918                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
919                                     outptr1++;
920                                 }
921                             }
922                             inptr += 96;
923                         } else {
924                             /* Optimized routine for a full 12-column block: stores input plus bias to each output row (same bias for every row). */
925                             __asm __volatile (
926                                 "ldr q2, [%[biasptr]]\n"
927                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
928                                 "ldr q3, [%[biasptr], #0x10]\n"
929                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
930                                 "ldr q4, [%[biasptr], #0x20]\n"
931                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
932                                 "ldr q13, [%[inptr]]\n"
933                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
934                                 "ldr q14, [%[inptr], #0x10]\n"
935                                 "ldr q15, [%[inptr], #0x20]\n"
936                                 "add v13.4s, v13.4s, v2.4s\n"
937                                 "ldr q16, [%[inptr], #0x30]\n"
938                                 "ldr q17, [%[inptr], #0x40]\n"
939                                 "add v14.4s, v14.4s, v3.4s\n"
940                                 "ldr q18, [%[inptr], #0x50]\n"
941                                 "add v15.4s, v15.4s, v4.4s\n"
942                                 "str q13, [%[outptr0]]\n"
943                                 "add v16.4s, v16.4s, v2.4s\n"
944                                 "add %[inptr], %[inptr], #0x180\n"
945                                 "add v17.4s, v17.4s, v3.4s\n"
946                                 "str q14, [%[outptr0], #0x10]\n"
947                                 "add v18.4s, v18.4s, v4.4s\n"
948                                 "str q15, [%[outptr0], #0x20]\n"
949                                 "add %[outptr0], %[outptr0], #0x30\n"
950                                 "str q16, [%[outptr1]]\n"
951                                 "str q17, [%[outptr1], #0x10]\n"
952                                 "str q18, [%[outptr1], #0x20]\n"
953                                 "add %[outptr1], %[outptr1], #0x30\n"
954                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
955                               [inptr] "+r" (inptr)
956                             : [biasptr] "r" (biasptr)
957                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
958                             );
959                         }
960                     }
961                     break;
962 
963                 case 3:
964                     {
965                         if ((i+11) >= xmax)
966                         {
967                             for (int xi=0; xi<11; xi++)
968                             {
969                                 if ((i+xi) < xmax)
970                                 {
971                                     *outptr0 = biasptr[xi] + inptr[xi];
972                                     outptr0++;
973                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
974                                     outptr1++;
975                                     *outptr2 = biasptr[xi] + inptr[xi + 24];
976                                     outptr2++;
977                                 }
978                             }
979                             inptr += 96;
980                         } else {
981                             /* Optimized routine for a full 12-column block: stores input plus bias to each output row (same bias for every row). */
982                             __asm __volatile (
983                                 "ldr q2, [%[biasptr]]\n"
984                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
985                                 "ldr q3, [%[biasptr], #0x10]\n"
986                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
987                                 "ldr q4, [%[biasptr], #0x20]\n"
988                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
989                                 "ldr q13, [%[inptr]]\n"
990                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
991                                 "ldr q14, [%[inptr], #0x10]\n"
992                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
993                                 "add v13.4s, v13.4s, v2.4s\n"
994                                 "ldr q15, [%[inptr], #0x20]\n"
995                                 "ldr q16, [%[inptr], #0x30]\n"
996                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
997                                 "add v14.4s, v14.4s, v3.4s\n"
998                                 "str q13, [%[outptr0]]\n"
999                                 "add v15.4s, v15.4s, v4.4s\n"
1000                                 "ldr q17, [%[inptr], #0x40]\n"
1001                                 "add v16.4s, v16.4s, v2.4s\n"
1002                                 "ldr q18, [%[inptr], #0x50]\n"
1003                                 "ldr q19, [%[inptr], #0x60]\n"
1004                                 "str q14, [%[outptr0], #0x10]\n"
1005                                 "add v17.4s, v17.4s, v3.4s\n"
1006                                 "ldr q20, [%[inptr], #0x70]\n"
1007                                 "add v18.4s, v18.4s, v4.4s\n"
1008                                 "ldr q13, [%[inptr], #0x80]\n"
1009                                 "add v19.4s, v19.4s, v2.4s\n"
1010                                 "str q15, [%[outptr0], #0x20]\n"
1011                                 "add %[outptr0], %[outptr0], #0x30\n"
1012                                 "add v20.4s, v20.4s, v3.4s\n"
1013                                 "add %[inptr], %[inptr], #0x180\n"
1014                                 "add v13.4s, v13.4s, v4.4s\n"
1015                                 "str q16, [%[outptr1]]\n"
1016                                 "str q17, [%[outptr1], #0x10]\n"
1017                                 "str q18, [%[outptr1], #0x20]\n"
1018                                 "add %[outptr1], %[outptr1], #0x30\n"
1019                                 "str q19, [%[outptr2]]\n"
1020                                 "str q20, [%[outptr2], #0x10]\n"
1021                                 "str q13, [%[outptr2], #0x20]\n"
1022                                 "add %[outptr2], %[outptr2], #0x30\n"
1023                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1024                               [inptr] "+r" (inptr)
1025                             : [biasptr] "r" (biasptr)
1026                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1027                             );
1028                         }
1029                     }
1030                     break;
1031 
1032                 case 4:
1033                     {
1034                         if ((i+11) >= xmax)
1035                         {
1036                             for (int xi=0; xi<11; xi++)
1037                             {
1038                                 if ((i+xi) < xmax)
1039                                 {
1040                                     *outptr0 = biasptr[xi] + inptr[xi];
1041                                     outptr0++;
1042                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
1043                                     outptr1++;
1044                                     *outptr2 = biasptr[xi] + inptr[xi + 24];
1045                                     outptr2++;
1046                                     *outptr3 = biasptr[xi] + inptr[xi + 36];
1047                                     outptr3++;
1048                                 }
1049                             }
1050                             inptr += 96;
1051                         } else {
1052                             /* Optimized routine to copy an entire block */
1053                             __asm __volatile (
1054                                 "ldr q2, [%[biasptr]]\n"
1055                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1056                                 "ldr q3, [%[biasptr], #0x10]\n"
1057                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1058                                 "ldr q4, [%[biasptr], #0x20]\n"
1059                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1060                                 "ldr q13, [%[inptr]]\n"
1061                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1062                                 "ldr q14, [%[inptr], #0x10]\n"
1063                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1064                                 "add v13.4s, v13.4s, v2.4s\n"
1065                                 "ldr q15, [%[inptr], #0x20]\n"
1066                                 "ldr q16, [%[inptr], #0x30]\n"
1067                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1068                                 "add v14.4s, v14.4s, v3.4s\n"
1069                                 "str q13, [%[outptr0]]\n"
1070                                 "add v15.4s, v15.4s, v4.4s\n"
1071                                 "ldr q17, [%[inptr], #0x40]\n"
1072                                 "add v16.4s, v16.4s, v2.4s\n"
1073                                 "ldr q18, [%[inptr], #0x50]\n"
1074                                 "ldr q19, [%[inptr], #0x60]\n"
1075                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1076                                 "add v17.4s, v17.4s, v3.4s\n"
1077                                 "str q14, [%[outptr0], #0x10]\n"
1078                                 "add v18.4s, v18.4s, v4.4s\n"
1079                                 "ldr q20, [%[inptr], #0x70]\n"
1080                                 "add v19.4s, v19.4s, v2.4s\n"
1081                                 "ldr q13, [%[inptr], #0x80]\n"
1082                                 "ldr q14, [%[inptr], #0x90]\n"
1083                                 "str q15, [%[outptr0], #0x20]\n"
1084                                 "add %[outptr0], %[outptr0], #0x30\n"
1085                                 "add v20.4s, v20.4s, v3.4s\n"
1086                                 "ldr q15, [%[inptr], #0xa0]\n"
1087                                 "add v13.4s, v13.4s, v4.4s\n"
1088                                 "str q16, [%[outptr1]]\n"
1089                                 "add v14.4s, v14.4s, v2.4s\n"
1090                                 "ldr q16, [%[inptr], #0xb0]\n"
1091                                 "add %[inptr], %[inptr], #0x180\n"
1092                                 "add v15.4s, v15.4s, v3.4s\n"
1093                                 "str q17, [%[outptr1], #0x10]\n"
1094                                 "add v16.4s, v16.4s, v4.4s\n"
1095                                 "str q18, [%[outptr1], #0x20]\n"
1096                                 "add %[outptr1], %[outptr1], #0x30\n"
1097                                 "str q19, [%[outptr2]]\n"
1098                                 "str q20, [%[outptr2], #0x10]\n"
1099                                 "str q13, [%[outptr2], #0x20]\n"
1100                                 "add %[outptr2], %[outptr2], #0x30\n"
1101                                 "str q14, [%[outptr3]]\n"
1102                                 "str q15, [%[outptr3], #0x10]\n"
1103                                 "str q16, [%[outptr3], #0x20]\n"
1104                                 "add %[outptr3], %[outptr3], #0x30\n"
1105                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1106                               [inptr] "+r" (inptr)
1107                             : [biasptr] "r" (biasptr)
1108                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1109                             );
1110                         }
1111                     }
1112                     break;
1113 
1114                 case 5:
1115                     {
1116                         if ((i+11) >= xmax)
1117                         {
1118                             for (int xi=0; xi<11; xi++)
1119                             {
1120                                 if ((i+xi) < xmax)
1121                                 {
1122                                     *outptr0 = biasptr[xi] + inptr[xi];
1123                                     outptr0++;
1124                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
1125                                     outptr1++;
1126                                     *outptr2 = biasptr[xi] + inptr[xi + 24];
1127                                     outptr2++;
1128                                     *outptr3 = biasptr[xi] + inptr[xi + 36];
1129                                     outptr3++;
1130                                     *outptr4 = biasptr[xi] + inptr[xi + 48];
1131                                     outptr4++;
1132                                 }
1133                             }
1134                             inptr += 96;
1135                         } else {
1136                             /* Optimized routine to copy an entire block */
1137                             __asm __volatile (
1138                                 "ldr q2, [%[biasptr]]\n"
1139                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1140                                 "ldr q3, [%[biasptr], #0x10]\n"
1141                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1142                                 "ldr q4, [%[biasptr], #0x20]\n"
1143                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1144                                 "ldr q13, [%[inptr]]\n"
1145                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1146                                 "ldr q14, [%[inptr], #0x10]\n"
1147                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1148                                 "add v13.4s, v13.4s, v2.4s\n"
1149                                 "ldr q15, [%[inptr], #0x20]\n"
1150                                 "ldr q16, [%[inptr], #0x30]\n"
1151                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1152                                 "add v14.4s, v14.4s, v3.4s\n"
1153                                 "str q13, [%[outptr0]]\n"
1154                                 "add v15.4s, v15.4s, v4.4s\n"
1155                                 "ldr q17, [%[inptr], #0x40]\n"
1156                                 "add v16.4s, v16.4s, v2.4s\n"
1157                                 "ldr q18, [%[inptr], #0x50]\n"
1158                                 "ldr q19, [%[inptr], #0x60]\n"
1159                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1160                                 "add v17.4s, v17.4s, v3.4s\n"
1161                                 "str q14, [%[outptr0], #0x10]\n"
1162                                 "add v18.4s, v18.4s, v4.4s\n"
1163                                 "ldr q20, [%[inptr], #0x70]\n"
1164                                 "add v19.4s, v19.4s, v2.4s\n"
1165                                 "ldr q13, [%[inptr], #0x80]\n"
1166                                 "ldr q14, [%[inptr], #0x90]\n"
1167                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1168                                 "add v20.4s, v20.4s, v3.4s\n"
1169                                 "str q15, [%[outptr0], #0x20]\n"
1170                                 "add v13.4s, v13.4s, v4.4s\n"
1171                                 "ldr q15, [%[inptr], #0xa0]\n"
1172                                 "add v14.4s, v14.4s, v2.4s\n"
1173                                 "add %[outptr0], %[outptr0], #0x30\n"
1174                                 "str q16, [%[outptr1]]\n"
1175                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1176                                 "add v15.4s, v15.4s, v3.4s\n"
1177                                 "ldr q16, [%[inptr], #0xb0]\n"
1178                                 "str q17, [%[outptr1], #0x10]\n"
1179                                 "ldr q17, [%[inptr], #0xc0]\n"
1180                                 "add v16.4s, v16.4s, v4.4s\n"
1181                                 "str q18, [%[outptr1], #0x20]\n"
1182                                 "add %[outptr1], %[outptr1], #0x30\n"
1183                                 "add v17.4s, v17.4s, v2.4s\n"
1184                                 "ldr q18, [%[inptr], #0xd0]\n"
1185                                 "str q19, [%[outptr2]]\n"
1186                                 "ldr q19, [%[inptr], #0xe0]\n"
1187                                 "add %[inptr], %[inptr], #0x180\n"
1188                                 "add v18.4s, v18.4s, v3.4s\n"
1189                                 "str q20, [%[outptr2], #0x10]\n"
1190                                 "add v19.4s, v19.4s, v4.4s\n"
1191                                 "str q13, [%[outptr2], #0x20]\n"
1192                                 "add %[outptr2], %[outptr2], #0x30\n"
1193                                 "str q14, [%[outptr3]]\n"
1194                                 "str q15, [%[outptr3], #0x10]\n"
1195                                 "str q16, [%[outptr3], #0x20]\n"
1196                                 "add %[outptr3], %[outptr3], #0x30\n"
1197                                 "str q17, [%[outptr4]]\n"
1198                                 "str q18, [%[outptr4], #0x10]\n"
1199                                 "str q19, [%[outptr4], #0x20]\n"
1200                                 "add %[outptr4], %[outptr4], #0x30\n"
1201                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1202                               [inptr] "+r" (inptr)
1203                             : [biasptr] "r" (biasptr)
1204                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1205                             );
1206                         }
1207                     }
1208                     break;
1209 
1210                 case 6:
1211                     {
1212                         if ((i+11) >= xmax)
1213                         {
1214                             for (int xi=0; xi<11; xi++)
1215                             {
1216                                 if ((i+xi) < xmax)
1217                                 {
1218                                     *outptr0 = biasptr[xi] + inptr[xi];
1219                                     outptr0++;
1220                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
1221                                     outptr1++;
1222                                     *outptr2 = biasptr[xi] + inptr[xi + 24];
1223                                     outptr2++;
1224                                     *outptr3 = biasptr[xi] + inptr[xi + 36];
1225                                     outptr3++;
1226                                     *outptr4 = biasptr[xi] + inptr[xi + 48];
1227                                     outptr4++;
1228                                     *outptr5 = biasptr[xi] + inptr[xi + 60];
1229                                     outptr5++;
1230                                 }
1231                             }
1232                             inptr += 96;
1233                         } else {
1234                             /* Optimized routine to copy an entire block */
1235                             __asm __volatile (
1236                                 "ldr q2, [%[biasptr]]\n"
1237                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1238                                 "ldr q3, [%[biasptr], #0x10]\n"
1239                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1240                                 "ldr q4, [%[biasptr], #0x20]\n"
1241                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1242                                 "ldr q13, [%[inptr]]\n"
1243                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1244                                 "ldr q14, [%[inptr], #0x10]\n"
1245                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1246                                 "add v13.4s, v13.4s, v2.4s\n"
1247                                 "ldr q15, [%[inptr], #0x20]\n"
1248                                 "ldr q16, [%[inptr], #0x30]\n"
1249                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1250                                 "add v14.4s, v14.4s, v3.4s\n"
1251                                 "str q13, [%[outptr0]]\n"
1252                                 "add v15.4s, v15.4s, v4.4s\n"
1253                                 "ldr q17, [%[inptr], #0x40]\n"
1254                                 "add v16.4s, v16.4s, v2.4s\n"
1255                                 "ldr q18, [%[inptr], #0x50]\n"
1256                                 "ldr q19, [%[inptr], #0x60]\n"
1257                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1258                                 "add v17.4s, v17.4s, v3.4s\n"
1259                                 "str q14, [%[outptr0], #0x10]\n"
1260                                 "add v18.4s, v18.4s, v4.4s\n"
1261                                 "ldr q20, [%[inptr], #0x70]\n"
1262                                 "add v19.4s, v19.4s, v2.4s\n"
1263                                 "ldr q13, [%[inptr], #0x80]\n"
1264                                 "ldr q14, [%[inptr], #0x90]\n"
1265                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1266                                 "add v20.4s, v20.4s, v3.4s\n"
1267                                 "str q15, [%[outptr0], #0x20]\n"
1268                                 "add v13.4s, v13.4s, v4.4s\n"
1269                                 "ldr q15, [%[inptr], #0xa0]\n"
1270                                 "add v14.4s, v14.4s, v2.4s\n"
1271                                 "add %[outptr0], %[outptr0], #0x30\n"
1272                                 "str q16, [%[outptr1]]\n"
1273                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1274                                 "add v15.4s, v15.4s, v3.4s\n"
1275                                 "ldr q16, [%[inptr], #0xb0]\n"
1276                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1277                                 "str q17, [%[outptr1], #0x10]\n"
1278                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1279                                 "add v16.4s, v16.4s, v4.4s\n"
1280                                 "ldr q17, [%[inptr], #0xc0]\n"
1281                                 "str q18, [%[outptr1], #0x20]\n"
1282                                 "add %[outptr1], %[outptr1], #0x30\n"
1283                                 "add v17.4s, v17.4s, v2.4s\n"
1284                                 "ldr q18, [%[inptr], #0xd0]\n"
1285                                 "str q19, [%[outptr2]]\n"
1286                                 "ldr q19, [%[inptr], #0xe0]\n"
1287                                 "add v18.4s, v18.4s, v3.4s\n"
1288                                 "str q20, [%[outptr2], #0x10]\n"
1289                                 "add v19.4s, v19.4s, v4.4s\n"
1290                                 "ldr q20, [%[inptr], #0xf0]\n"
1291                                 "str q13, [%[outptr2], #0x20]\n"
1292                                 "add %[outptr2], %[outptr2], #0x30\n"
1293                                 "add v20.4s, v20.4s, v2.4s\n"
1294                                 "ldr q13, [%[inptr], #0x100]\n"
1295                                 "str q14, [%[outptr3]]\n"
1296                                 "ldr q14, [%[inptr], #0x110]\n"
1297                                 "add %[inptr], %[inptr], #0x180\n"
1298                                 "add v13.4s, v13.4s, v3.4s\n"
1299                                 "str q15, [%[outptr3], #0x10]\n"
1300                                 "add v14.4s, v14.4s, v4.4s\n"
1301                                 "str q16, [%[outptr3], #0x20]\n"
1302                                 "add %[outptr3], %[outptr3], #0x30\n"
1303                                 "str q17, [%[outptr4]]\n"
1304                                 "str q18, [%[outptr4], #0x10]\n"
1305                                 "str q19, [%[outptr4], #0x20]\n"
1306                                 "add %[outptr4], %[outptr4], #0x30\n"
1307                                 "str q20, [%[outptr5]]\n"
1308                                 "str q13, [%[outptr5], #0x10]\n"
1309                                 "str q14, [%[outptr5], #0x20]\n"
1310                                 "add %[outptr5], %[outptr5], #0x30\n"
1311                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1312                               [inptr] "+r" (inptr)
1313                             : [biasptr] "r" (biasptr)
1314                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1315                             );
1316                         }
1317                     }
1318                     break;
1319 
1320                 case 7:
1321                     {
1322                         if ((i+11) >= xmax)
1323                         {
1324                             for (int xi=0; xi<11; xi++)
1325                             {
1326                                 if ((i+xi) < xmax)
1327                                 {
1328                                     *outptr0 = biasptr[xi] + inptr[xi];
1329                                     outptr0++;
1330                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
1331                                     outptr1++;
1332                                     *outptr2 = biasptr[xi] + inptr[xi + 24];
1333                                     outptr2++;
1334                                     *outptr3 = biasptr[xi] + inptr[xi + 36];
1335                                     outptr3++;
1336                                     *outptr4 = biasptr[xi] + inptr[xi + 48];
1337                                     outptr4++;
1338                                     *outptr5 = biasptr[xi] + inptr[xi + 60];
1339                                     outptr5++;
1340                                     *outptr6 = biasptr[xi] + inptr[xi + 72];
1341                                     outptr6++;
1342                                 }
1343                             }
1344                             inptr += 96;
1345                         } else {
1346                             /* Optimized routine to copy an entire block */
1347                             __asm __volatile (
1348                                 "ldr q2, [%[biasptr]]\n"
1349                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1350                                 "ldr q3, [%[biasptr], #0x10]\n"
1351                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1352                                 "ldr q4, [%[biasptr], #0x20]\n"
1353                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1354                                 "ldr q13, [%[inptr]]\n"
1355                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1356                                 "ldr q14, [%[inptr], #0x10]\n"
1357                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1358                                 "add v13.4s, v13.4s, v2.4s\n"
1359                                 "ldr q15, [%[inptr], #0x20]\n"
1360                                 "ldr q16, [%[inptr], #0x30]\n"
1361                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1362                                 "add v14.4s, v14.4s, v3.4s\n"
1363                                 "str q13, [%[outptr0]]\n"
1364                                 "add v15.4s, v15.4s, v4.4s\n"
1365                                 "ldr q17, [%[inptr], #0x40]\n"
1366                                 "add v16.4s, v16.4s, v2.4s\n"
1367                                 "ldr q18, [%[inptr], #0x50]\n"
1368                                 "ldr q19, [%[inptr], #0x60]\n"
1369                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1370                                 "add v17.4s, v17.4s, v3.4s\n"
1371                                 "str q14, [%[outptr0], #0x10]\n"
1372                                 "add v18.4s, v18.4s, v4.4s\n"
1373                                 "ldr q20, [%[inptr], #0x70]\n"
1374                                 "add v19.4s, v19.4s, v2.4s\n"
1375                                 "ldr q13, [%[inptr], #0x80]\n"
1376                                 "ldr q14, [%[inptr], #0x90]\n"
1377                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1378                                 "add v20.4s, v20.4s, v3.4s\n"
1379                                 "str q15, [%[outptr0], #0x20]\n"
1380                                 "add v13.4s, v13.4s, v4.4s\n"
1381                                 "ldr q15, [%[inptr], #0xa0]\n"
1382                                 "add v14.4s, v14.4s, v2.4s\n"
1383                                 "add %[outptr0], %[outptr0], #0x30\n"
1384                                 "str q16, [%[outptr1]]\n"
1385                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1386                                 "add v15.4s, v15.4s, v3.4s\n"
1387                                 "ldr q16, [%[inptr], #0xb0]\n"
1388                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1389                                 "str q17, [%[outptr1], #0x10]\n"
1390                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1391                                 "add v16.4s, v16.4s, v4.4s\n"
1392                                 "ldr q17, [%[inptr], #0xc0]\n"
1393                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1394                                 "str q18, [%[outptr1], #0x20]\n"
1395                                 "add %[outptr1], %[outptr1], #0x30\n"
1396                                 "add v17.4s, v17.4s, v2.4s\n"
1397                                 "ldr q18, [%[inptr], #0xd0]\n"
1398                                 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1399                                 "str q19, [%[outptr2]]\n"
1400                                 "ldr q19, [%[inptr], #0xe0]\n"
1401                                 "add v18.4s, v18.4s, v3.4s\n"
1402                                 "str q20, [%[outptr2], #0x10]\n"
1403                                 "add v19.4s, v19.4s, v4.4s\n"
1404                                 "ldr q20, [%[inptr], #0xf0]\n"
1405                                 "str q13, [%[outptr2], #0x20]\n"
1406                                 "add %[outptr2], %[outptr2], #0x30\n"
1407                                 "add v20.4s, v20.4s, v2.4s\n"
1408                                 "ldr q13, [%[inptr], #0x100]\n"
1409                                 "str q14, [%[outptr3]]\n"
1410                                 "ldr q14, [%[inptr], #0x110]\n"
1411                                 "add v13.4s, v13.4s, v3.4s\n"
1412                                 "str q15, [%[outptr3], #0x10]\n"
1413                                 "add v14.4s, v14.4s, v4.4s\n"
1414                                 "ldr q15, [%[inptr], #0x120]\n"
1415                                 "str q16, [%[outptr3], #0x20]\n"
1416                                 "add %[outptr3], %[outptr3], #0x30\n"
1417                                 "add v15.4s, v15.4s, v2.4s\n"
1418                                 "ldr q16, [%[inptr], #0x130]\n"
1419                                 "str q17, [%[outptr4]]\n"
1420                                 "ldr q17, [%[inptr], #0x140]\n"
1421                                 "add %[inptr], %[inptr], #0x180\n"
1422                                 "add v16.4s, v16.4s, v3.4s\n"
1423                                 "str q18, [%[outptr4], #0x10]\n"
1424                                 "add v17.4s, v17.4s, v4.4s\n"
1425                                 "str q19, [%[outptr4], #0x20]\n"
1426                                 "add %[outptr4], %[outptr4], #0x30\n"
1427                                 "str q20, [%[outptr5]]\n"
1428                                 "str q13, [%[outptr5], #0x10]\n"
1429                                 "str q14, [%[outptr5], #0x20]\n"
1430                                 "add %[outptr5], %[outptr5], #0x30\n"
1431                                 "str q15, [%[outptr6]]\n"
1432                                 "str q16, [%[outptr6], #0x10]\n"
1433                                 "str q17, [%[outptr6], #0x20]\n"
1434                                 "add %[outptr6], %[outptr6], #0x30\n"
1435                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1436                               [inptr] "+r" (inptr)
1437                             : [biasptr] "r" (biasptr)
1438                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1439                             );
1440                         }
1441                     }
1442                     break;
1443 
1444                 default:
1445                 case 8:
1446                     {
1447                         if ((i+11) >= xmax)
1448                         {
1449                             for (int xi=0; xi<11; xi++)
1450                             {
1451                                 if ((i+xi) < xmax)
1452                                 {
1453                                     *outptr0 = biasptr[xi] + inptr[xi];
1454                                     outptr0++;
1455                                     *outptr1 = biasptr[xi] + inptr[xi + 12];
1456                                     outptr1++;
1457                                     *outptr2 = biasptr[xi] + inptr[xi + 24];
1458                                     outptr2++;
1459                                     *outptr3 = biasptr[xi] + inptr[xi + 36];
1460                                     outptr3++;
1461                                     *outptr4 = biasptr[xi] + inptr[xi + 48];
1462                                     outptr4++;
1463                                     *outptr5 = biasptr[xi] + inptr[xi + 60];
1464                                     outptr5++;
1465                                     *outptr6 = biasptr[xi] + inptr[xi + 72];
1466                                     outptr6++;
1467                                     *outptr7 = biasptr[xi] + inptr[xi + 84];
1468                                     outptr7++;
1469                                 }
1470                             }
1471                             inptr += 96;
1472                         } else {
1473                             /* Optimized routine to copy an entire block */
1474                             __asm __volatile (
1475                                 "ldr q2, [%[biasptr]]\n"
1476                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1477                                 "ldr q3, [%[biasptr], #0x10]\n"
1478                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1479                                 "ldr q4, [%[biasptr], #0x20]\n"
1480                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1481                                 "ldr q13, [%[inptr]]\n"
1482                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1483                                 "ldr q14, [%[inptr], #0x10]\n"
1484                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1485                                 "add v13.4s, v13.4s, v2.4s\n"
1486                                 "ldr q15, [%[inptr], #0x20]\n"
1487                                 "ldr q16, [%[inptr], #0x30]\n"
1488                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1489                                 "add v14.4s, v14.4s, v3.4s\n"
1490                                 "str q13, [%[outptr0]]\n"
1491                                 "add v15.4s, v15.4s, v4.4s\n"
1492                                 "ldr q17, [%[inptr], #0x40]\n"
1493                                 "add v16.4s, v16.4s, v2.4s\n"
1494                                 "ldr q18, [%[inptr], #0x50]\n"
1495                                 "ldr q19, [%[inptr], #0x60]\n"
1496                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1497                                 "add v17.4s, v17.4s, v3.4s\n"
1498                                 "str q14, [%[outptr0], #0x10]\n"
1499                                 "add v18.4s, v18.4s, v4.4s\n"
1500                                 "ldr q20, [%[inptr], #0x70]\n"
1501                                 "add v19.4s, v19.4s, v2.4s\n"
1502                                 "ldr q13, [%[inptr], #0x80]\n"
1503                                 "ldr q14, [%[inptr], #0x90]\n"
1504                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1505                                 "add v20.4s, v20.4s, v3.4s\n"
1506                                 "str q15, [%[outptr0], #0x20]\n"
1507                                 "add v13.4s, v13.4s, v4.4s\n"
1508                                 "ldr q15, [%[inptr], #0xa0]\n"
1509                                 "add v14.4s, v14.4s, v2.4s\n"
1510                                 "add %[outptr0], %[outptr0], #0x30\n"
1511                                 "str q16, [%[outptr1]]\n"
1512                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1513                                 "add v15.4s, v15.4s, v3.4s\n"
1514                                 "ldr q16, [%[inptr], #0xb0]\n"
1515                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1516                                 "str q17, [%[outptr1], #0x10]\n"
1517                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1518                                 "add v16.4s, v16.4s, v4.4s\n"
1519                                 "ldr q17, [%[inptr], #0xc0]\n"
1520                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1521                                 "str q18, [%[outptr1], #0x20]\n"
1522                                 "add %[outptr1], %[outptr1], #0x30\n"
1523                                 "add v17.4s, v17.4s, v2.4s\n"
1524                                 "ldr q18, [%[inptr], #0xd0]\n"
1525                                 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1526                                 "str q19, [%[outptr2]]\n"
1527                                 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1528                                 "add v18.4s, v18.4s, v3.4s\n"
1529                                 "ldr q19, [%[inptr], #0xe0]\n"
1530                                 "str q20, [%[outptr2], #0x10]\n"
1531                                 "ldr q20, [%[inptr], #0xf0]\n"
1532                                 "add v19.4s, v19.4s, v4.4s\n"
1533                                 "str q13, [%[outptr2], #0x20]\n"
1534                                 "add %[outptr2], %[outptr2], #0x30\n"
1535                                 "add v20.4s, v20.4s, v2.4s\n"
1536                                 "ldr q13, [%[inptr], #0x100]\n"
1537                                 "str q14, [%[outptr3]]\n"
1538                                 "ldr q14, [%[inptr], #0x110]\n"
1539                                 "add v13.4s, v13.4s, v3.4s\n"
1540                                 "str q15, [%[outptr3], #0x10]\n"
1541                                 "add v14.4s, v14.4s, v4.4s\n"
1542                                 "ldr q15, [%[inptr], #0x120]\n"
1543                                 "str q16, [%[outptr3], #0x20]\n"
1544                                 "add %[outptr3], %[outptr3], #0x30\n"
1545                                 "add v15.4s, v15.4s, v2.4s\n"
1546                                 "ldr q16, [%[inptr], #0x130]\n"
1547                                 "str q17, [%[outptr4]]\n"
1548                                 "ldr q17, [%[inptr], #0x140]\n"
1549                                 "add v16.4s, v16.4s, v3.4s\n"
1550                                 "str q18, [%[outptr4], #0x10]\n"
1551                                 "add v17.4s, v17.4s, v4.4s\n"
1552                                 "ldr q18, [%[inptr], #0x150]\n"
1553                                 "str q19, [%[outptr4], #0x20]\n"
1554                                 "add %[outptr4], %[outptr4], #0x30\n"
1555                                 "add v18.4s, v18.4s, v2.4s\n"
1556                                 "ldr q19, [%[inptr], #0x160]\n"
1557                                 "str q20, [%[outptr5]]\n"
1558                                 "ldr q20, [%[inptr], #0x170]\n"
1559                                 "add %[inptr], %[inptr], #0x180\n"
1560                                 "add v19.4s, v19.4s, v3.4s\n"
1561                                 "str q13, [%[outptr5], #0x10]\n"
1562                                 "add v20.4s, v20.4s, v4.4s\n"
1563                                 "str q14, [%[outptr5], #0x20]\n"
1564                                 "add %[outptr5], %[outptr5], #0x30\n"
1565                                 "str q15, [%[outptr6]]\n"
1566                                 "str q16, [%[outptr6], #0x10]\n"
1567                                 "str q17, [%[outptr6], #0x20]\n"
1568                                 "add %[outptr6], %[outptr6], #0x30\n"
1569                                 "str q18, [%[outptr7]]\n"
1570                                 "str q19, [%[outptr7], #0x10]\n"
1571                                 "str q20, [%[outptr7], #0x20]\n"
1572                                 "add %[outptr7], %[outptr7], #0x30\n"
1573                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1574                               [inptr] "+r" (inptr)
1575                             : [biasptr] "r" (biasptr)
1576                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1577                             );
1578                         }
1579                     }
1580                     break;
1581 
1582 
1583                 }
1584             }
1585         }
1586     }
1587 }
1588 
1589 #endif // __aarch64__
1590