xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2020,2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #pragma once
25 
26 #ifdef ARM_COMPUTE_ENABLE_SVE
27 
28 template<>
MergeResults(uint32_t * out,const uint32_t * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const uint32_t * bias,Activation,bool append)29 void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
30 {
31     const uint32_t *inptr = in;
32     uint32_t nullbias[192];
33 
34 
35     if (!append && !bias)
36     {
37         memset(nullbias, 0, (3 * get_vector_length<uint32_t>() * sizeof(uint32_t)));
38     }
39 
40     for (int y=y0; y<ymax; y+=8)
41     {
42         uint32_t *outptr0 = out + (y * ldout) + x0;
43         uint32_t *outptr1 = outptr0 + ldout;
44         uint32_t *outptr2 = outptr1 + ldout;
45         uint32_t *outptr3 = outptr2 + ldout;
46         uint32_t *outptr4 = outptr3 + ldout;
47         uint32_t *outptr5 = outptr4 + ldout;
48         uint32_t *outptr6 = outptr5 + ldout;
49         uint32_t *outptr7 = outptr6 + ldout;
50 
51         const int height = ymax - y;
52 
53         for (int i=x0; i<xmax; i+=(3 * get_vector_length<uint32_t>()))
54         {
55             if (append)
56             {
57                 switch(height)
58                 {
59                 case 1:
60                     {
61                         long w = xmax - i;
62                         long p = 0;
63                         /* Optimized routine to copy an entire block */
64                         __asm __volatile (
65                             "addvl x8, %[inptr], #16\n"
66                             "whilelt p0.s, %[p], %[w]\n"
67                             "incw %[p], all, mul #1\n"
68                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
69                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
70                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
71                             "whilelt p1.s, %[p], %[w]\n"
72                             "ld1w z10.s, p0/z, [%[inptr]]\n"
73                             "incw %[p], all, mul #1\n"
74                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
75                             "add z10.s, z10.s, z2.s\n"
76                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
77                             "whilelt p2.s, %[p], %[w]\n"
78                             "add z11.s, z11.s, z3.s\n"
79                             "st1w z10.s, p0, [%[outptr0]]\n"
80                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
81                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
82                             "addvl %[inptr], %[inptr], #24\n"
83                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
84                             "add z12.s, z12.s, z4.s\n"
85                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
86                             "addvl %[outptr0], %[outptr0], #3\n"
87                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
88                           [inptr] "+r" (inptr), [p] "+r" (p)
89                         : [w] "r" (w)
90                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
91                         );
92                     }
93                     break;
94 
95                 case 2:
96                     {
97                         long w = xmax - i;
98                         long p = 0;
99                         /* Optimized routine to copy an entire block */
100                         __asm __volatile (
101                             "addvl x8, %[inptr], #16\n"
102                             "whilelt p0.s, %[p], %[w]\n"
103                             "incw %[p], all, mul #1\n"
104                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
105                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
106                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
107                             "whilelt p1.s, %[p], %[w]\n"
108                             "ld1w z10.s, p0/z, [%[inptr]]\n"
109                             "incw %[p], all, mul #1\n"
110                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
111                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
112                             "add z10.s, z10.s, z2.s\n"
113                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
114                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
115                             "whilelt p2.s, %[p], %[w]\n"
116                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
117                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
118                             "add z11.s, z11.s, z3.s\n"
119                             "st1w z10.s, p0, [%[outptr0]]\n"
120                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
121                             "add z13.s, z13.s, z5.s\n"
122                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
123                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
124                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
125                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
126                             "add z12.s, z12.s, z4.s\n"
127                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
128                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
129                             "addvl %[inptr], %[inptr], #24\n"
130                             "add z14.s, z14.s, z6.s\n"
131                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
132                             "addvl %[outptr0], %[outptr0], #3\n"
133                             "add z15.s, z15.s, z7.s\n"
134                             "st1w z13.s, p0, [%[outptr1]]\n"
135                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
136                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
137                             "addvl %[outptr1], %[outptr1], #3\n"
138                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
139                           [inptr] "+r" (inptr), [p] "+r" (p)
140                         : [w] "r" (w)
141                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
142                         );
143                     }
144                     break;
145 
146                 case 3:
147                     {
148                         long w = xmax - i;
149                         long p = 0;
150                         /* Optimized routine to copy an entire block */
151                         __asm __volatile (
152                             "addvl x8, %[inptr], #16\n"
153                             "whilelt p0.s, %[p], %[w]\n"
154                             "incw %[p], all, mul #1\n"
155                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
156                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
157                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
158                             "whilelt p1.s, %[p], %[w]\n"
159                             "ld1w z10.s, p0/z, [%[inptr]]\n"
160                             "incw %[p], all, mul #1\n"
161                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
162                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
163                             "add z10.s, z10.s, z2.s\n"
164                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
165                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
166                             "whilelt p2.s, %[p], %[w]\n"
167                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
168                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
169                             "add z11.s, z11.s, z3.s\n"
170                             "st1w z10.s, p0, [%[outptr0]]\n"
171                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
172                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
173                             "add z13.s, z13.s, z5.s\n"
174                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
175                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
176                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
177                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
178                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
179                             "add z12.s, z12.s, z4.s\n"
180                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
181                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
182                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
183                             "add z14.s, z14.s, z6.s\n"
184                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
185                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
186                             "addvl %[outptr0], %[outptr0], #3\n"
187                             "add z15.s, z15.s, z7.s\n"
188                             "st1w z13.s, p0, [%[outptr1]]\n"
189                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
190                             "add z16.s, z16.s, z8.s\n"
191                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
192                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
193                             "addvl %[inptr], %[inptr], #24\n"
194                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
195                             "add z17.s, z17.s, z9.s\n"
196                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
197                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
198                             "addvl %[outptr1], %[outptr1], #3\n"
199                             "add z10.s, z10.s, z2.s\n"
200                             "st1w z16.s, p0, [%[outptr2]]\n"
201                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
202                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
203                             "addvl %[outptr2], %[outptr2], #3\n"
204                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
205                           [inptr] "+r" (inptr), [p] "+r" (p)
206                         : [w] "r" (w)
207                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
208                         );
209                     }
210                     break;
211 
212                 case 4:
213                     {
214                         long w = xmax - i;
215                         long p = 0;
216                         /* Optimized routine to copy an entire block */
217                         __asm __volatile (
218                             "addvl x8, %[inptr], #16\n"
219                             "whilelt p0.s, %[p], %[w]\n"
220                             "incw %[p], all, mul #1\n"
221                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
222                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
223                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
224                             "whilelt p1.s, %[p], %[w]\n"
225                             "ld1w z10.s, p0/z, [%[inptr]]\n"
226                             "incw %[p], all, mul #1\n"
227                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
228                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
229                             "add z10.s, z10.s, z2.s\n"
230                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
231                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
232                             "whilelt p2.s, %[p], %[w]\n"
233                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
234                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
235                             "add z11.s, z11.s, z3.s\n"
236                             "st1w z10.s, p0, [%[outptr0]]\n"
237                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
238                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
239                             "add z13.s, z13.s, z5.s\n"
240                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
241                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
242                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
243                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
244                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
245                             "add z12.s, z12.s, z4.s\n"
246                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
247                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
248                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
249                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
250                             "add z14.s, z14.s, z6.s\n"
251                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
252                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
253                             "addvl %[outptr0], %[outptr0], #3\n"
254                             "add z15.s, z15.s, z7.s\n"
255                             "st1w z13.s, p0, [%[outptr1]]\n"
256                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
257                             "add z16.s, z16.s, z8.s\n"
258                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
259                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
260                             "addvl %[inptr], %[inptr], #24\n"
261                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
262                             "add z17.s, z17.s, z9.s\n"
263                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
264                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
265                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
266                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
267                             "addvl %[outptr1], %[outptr1], #3\n"
268                             "add z10.s, z10.s, z2.s\n"
269                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
270                             "add z11.s, z11.s, z3.s\n"
271                             "st1w z16.s, p0, [%[outptr2]]\n"
272                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
273                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
274                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
275                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
276                             "add z12.s, z12.s, z4.s\n"
277                             "add z13.s, z13.s, z5.s\n"
278                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
279                             "addvl %[outptr2], %[outptr2], #3\n"
280                             "st1w z11.s, p0, [%[outptr3]]\n"
281                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
282                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
283                             "addvl %[outptr3], %[outptr3], #3\n"
284                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
285                           [inptr] "+r" (inptr), [p] "+r" (p)
286                         : [w] "r" (w)
287                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
288                         );
289                     }
290                     break;
291 
292                 case 5:
293                     {
294                         long w = xmax - i;
295                         long p = 0;
296                         /* Optimized routine to copy an entire block */
297                         __asm __volatile (
298                             "addvl x8, %[inptr], #16\n"
299                             "whilelt p0.s, %[p], %[w]\n"
300                             "incw %[p], all, mul #1\n"
301                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
302                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
303                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
304                             "whilelt p1.s, %[p], %[w]\n"
305                             "ld1w z10.s, p0/z, [%[inptr]]\n"
306                             "incw %[p], all, mul #1\n"
307                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
308                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
309                             "add z10.s, z10.s, z2.s\n"
310                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
311                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
312                             "whilelt p2.s, %[p], %[w]\n"
313                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
314                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
315                             "add z11.s, z11.s, z3.s\n"
316                             "st1w z10.s, p0, [%[outptr0]]\n"
317                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
318                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
319                             "add z13.s, z13.s, z5.s\n"
320                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
321                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
322                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
323                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
324                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
325                             "add z12.s, z12.s, z4.s\n"
326                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
327                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
328                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
329                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
330                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
331                             "add z14.s, z14.s, z6.s\n"
332                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
333                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
334                             "addvl %[outptr0], %[outptr0], #3\n"
335                             "add z15.s, z15.s, z7.s\n"
336                             "st1w z13.s, p0, [%[outptr1]]\n"
337                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
338                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
339                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
340                             "addvl %[inptr], %[inptr], #24\n"
341                             "add z16.s, z16.s, z8.s\n"
342                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
343                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
344                             "add z17.s, z17.s, z9.s\n"
345                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
346                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
347                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
348                             "addvl %[outptr1], %[outptr1], #3\n"
349                             "add z10.s, z10.s, z2.s\n"
350                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
351                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
352                             "st1w z16.s, p0, [%[outptr2]]\n"
353                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
354                             "add z11.s, z11.s, z3.s\n"
355                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
356                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
357                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
358                             "add z12.s, z12.s, z4.s\n"
359                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
360                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
361                             "add z13.s, z13.s, z5.s\n"
362                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
363                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
364                             "addvl %[outptr2], %[outptr2], #3\n"
365                             "add z14.s, z14.s, z6.s\n"
366                             "st1w z11.s, p0, [%[outptr3]]\n"
367                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
368                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
369                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
370                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
371                             "add z15.s, z15.s, z7.s\n"
372                             "add z16.s, z16.s, z8.s\n"
373                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
374                             "addvl %[outptr3], %[outptr3], #3\n"
375                             "st1w z14.s, p0, [%[outptr4]]\n"
376                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
377                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
378                             "addvl %[outptr4], %[outptr4], #3\n"
379                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
380                           [inptr] "+r" (inptr), [p] "+r" (p)
381                         : [w] "r" (w)
382                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
383                         );
384                     }
385                     break;
386 
387                 case 6:
388                     {
389                         long w = xmax - i;
390                         long p = 0;
391                         /* Optimized routine to copy an entire block */
392                         __asm __volatile (
393                             "addvl x8, %[inptr], #16\n"
394                             "whilelt p0.s, %[p], %[w]\n"
395                             "incw %[p], all, mul #1\n"
396                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
397                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
398                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
399                             "whilelt p1.s, %[p], %[w]\n"
400                             "ld1w z10.s, p0/z, [%[inptr]]\n"
401                             "incw %[p], all, mul #1\n"
402                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
403                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
404                             "add z10.s, z10.s, z2.s\n"
405                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
406                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
407                             "whilelt p2.s, %[p], %[w]\n"
408                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
409                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
410                             "add z11.s, z11.s, z3.s\n"
411                             "st1w z10.s, p0, [%[outptr0]]\n"
412                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
413                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
414                             "add z13.s, z13.s, z5.s\n"
415                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
416                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
417                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
418                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
419                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
420                             "add z12.s, z12.s, z4.s\n"
421                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
422                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
423                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
424                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
425                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
426                             "add z14.s, z14.s, z6.s\n"
427                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
428                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
429                             "addvl %[outptr0], %[outptr0], #3\n"
430                             "add z15.s, z15.s, z7.s\n"
431                             "st1w z13.s, p0, [%[outptr1]]\n"
432                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
433                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
434                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
435                             "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
436                             "add z16.s, z16.s, z8.s\n"
437                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
438                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
439                             "addvl %[inptr], %[inptr], #24\n"
440                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
441                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
442                             "addvl %[outptr1], %[outptr1], #3\n"
443                             "add z17.s, z17.s, z9.s\n"
444                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
445                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
446                             "st1w z16.s, p0, [%[outptr2]]\n"
447                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
448                             "add z10.s, z10.s, z2.s\n"
449                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
450                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
451                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
452                             "add z11.s, z11.s, z3.s\n"
453                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
454                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
455                             "add z12.s, z12.s, z4.s\n"
456                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
457                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
458                             "addvl %[outptr2], %[outptr2], #3\n"
459                             "add z13.s, z13.s, z5.s\n"
460                             "st1w z11.s, p0, [%[outptr3]]\n"
461                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
462                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
463                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
464                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
465                             "add z14.s, z14.s, z6.s\n"
466                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
467                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
468                             "add z15.s, z15.s, z7.s\n"
469                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
470                             "ld1w z9.s, p0/z, [%[outptr5]]\n"
471                             "addvl %[outptr3], %[outptr3], #3\n"
472                             "add z16.s, z16.s, z8.s\n"
473                             "st1w z14.s, p0, [%[outptr4]]\n"
474                             "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
475                             "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
476                             "ld1w z10.s, p1/z, [x8]\n"
477                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
478                             "add z17.s, z17.s, z9.s\n"
479                             "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
480                             "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
481                             "add z10.s, z10.s, z2.s\n"
482                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
483                             "addvl %[outptr4], %[outptr4], #3\n"
484                             "add z11.s, z11.s, z3.s\n"
485                             "st1w z17.s, p0, [%[outptr5]]\n"
486                             "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
487                             "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
488                             "addvl %[outptr5], %[outptr5], #3\n"
489                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
490                           [inptr] "+r" (inptr), [p] "+r" (p)
491                         : [w] "r" (w)
492                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
493                         );
494                     }
495                     break;
496 
497                 case 7:
498                     {
499                         long w = xmax - i;
500                         long p = 0;
501                         /* Optimized routine to accumulate into an entire block: out += in, three predicated vectors per row */
502                         __asm __volatile (
503                             "addvl x8, %[inptr], #16\n"
504                             "whilelt p0.s, %[p], %[w]\n"
505                             "incw %[p], all, mul #1\n"
506                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
507                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
508                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
509                             "whilelt p1.s, %[p], %[w]\n"
510                             "ld1w z10.s, p0/z, [%[inptr]]\n"
511                             "incw %[p], all, mul #1\n"
512                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
513                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
514                             "add z10.s, z10.s, z2.s\n"
515                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
516                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
517                             "whilelt p2.s, %[p], %[w]\n"
518                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
519                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
520                             "add z11.s, z11.s, z3.s\n"
521                             "st1w z10.s, p0, [%[outptr0]]\n"
522                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
523                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
524                             "add z13.s, z13.s, z5.s\n"
525                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
526                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
527                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
528                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
529                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
530                             "add z12.s, z12.s, z4.s\n"
531                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
532                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
533                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
534                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
535                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
536                             "add z14.s, z14.s, z6.s\n"
537                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
538                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
539                             "addvl %[outptr0], %[outptr0], #3\n"
540                             "add z15.s, z15.s, z7.s\n"
541                             "st1w z13.s, p0, [%[outptr1]]\n"
542                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
543                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
544                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
545                             "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
546                             "add z16.s, z16.s, z8.s\n"
547                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
548                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
549                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
550                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
551                             "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
552                             "add z17.s, z17.s, z9.s\n"
553                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
554                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
555                             "addvl %[outptr1], %[outptr1], #3\n"
556                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
557                             "addvl %[inptr], %[inptr], #24\n"
558                             "add z10.s, z10.s, z2.s\n"
559                             "st1w z16.s, p0, [%[outptr2]]\n"
560                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
561                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
562                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
563                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
564                             "add z11.s, z11.s, z3.s\n"
565                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
566                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
567                             "add z12.s, z12.s, z4.s\n"
568                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
569                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
570                             "addvl %[outptr2], %[outptr2], #3\n"
571                             "add z13.s, z13.s, z5.s\n"
572                             "st1w z11.s, p0, [%[outptr3]]\n"
573                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
574                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
575                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
576                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
577                             "add z14.s, z14.s, z6.s\n"
578                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
579                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
580                             "add z15.s, z15.s, z7.s\n"
581                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
582                             "ld1w z9.s, p0/z, [%[outptr5]]\n"
583                             "addvl %[outptr3], %[outptr3], #3\n"
584                             "add z16.s, z16.s, z8.s\n"
585                             "st1w z14.s, p0, [%[outptr4]]\n"
586                             "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
587                             "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
588                             "ld1w z10.s, p1/z, [x8]\n"
589                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
590                             "add z17.s, z17.s, z9.s\n"
591                             "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
592                             "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
593                             "add z10.s, z10.s, z2.s\n"
594                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
595                             "ld1w z4.s, p0/z, [%[outptr6]]\n"
596                             "addvl %[outptr4], %[outptr4], #3\n"
597                             "add z11.s, z11.s, z3.s\n"
598                             "st1w z17.s, p0, [%[outptr5]]\n"
599                             "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
600                             "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
601                             "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
602                             "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
603                             "add z12.s, z12.s, z4.s\n"
604                             "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
605                             "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
606                             "add z13.s, z13.s, z5.s\n"
607                             "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
608                             "addvl %[outptr5], %[outptr5], #3\n"
609                             "add z14.s, z14.s, z6.s\n"
610                             "st1w z12.s, p0, [%[outptr6]]\n"
611                             "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
612                             "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
613                             "addvl %[outptr6], %[outptr6], #3\n"
614                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
615                           [inptr] "+r" (inptr), [p] "+r" (p)
616                         : [w] "r" (w)
617                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
618                         );
619                     }
620                     break;
621 
622                 default:
623                 case 8:
624                     {
625                         long w = xmax - i;
626                         long p = 0;
627                         /* Optimized routine to accumulate into an entire block: out += in, three predicated vectors per row */
628                         __asm __volatile (
629                             "addvl x8, %[inptr], #16\n"
630                             "whilelt p0.s, %[p], %[w]\n"
631                             "incw %[p], all, mul #1\n"
632                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
633                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
634                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
635                             "whilelt p1.s, %[p], %[w]\n"
636                             "ld1w z10.s, p0/z, [%[inptr]]\n"
637                             "incw %[p], all, mul #1\n"
638                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
639                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
640                             "add z10.s, z10.s, z2.s\n"
641                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
642                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
643                             "whilelt p2.s, %[p], %[w]\n"
644                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
645                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
646                             "add z11.s, z11.s, z3.s\n"
647                             "st1w z10.s, p0, [%[outptr0]]\n"
648                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
649                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
650                             "add z13.s, z13.s, z5.s\n"
651                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
652                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
653                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
654                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
655                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
656                             "add z12.s, z12.s, z4.s\n"
657                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
658                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
659                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
660                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
661                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
662                             "add z14.s, z14.s, z6.s\n"
663                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
664                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
665                             "addvl %[outptr0], %[outptr0], #3\n"
666                             "add z15.s, z15.s, z7.s\n"
667                             "st1w z13.s, p0, [%[outptr1]]\n"
668                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
669                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
670                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
671                             "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
672                             "add z16.s, z16.s, z8.s\n"
673                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
674                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
675                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
676                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
677                             "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
678                             "add z17.s, z17.s, z9.s\n"
679                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
680                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
681                             "addvl %[outptr1], %[outptr1], #3\n"
682                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
683                             "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
684                             "add z10.s, z10.s, z2.s\n"
685                             "st1w z16.s, p0, [%[outptr2]]\n"
686                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
687                             "addvl %[inptr], %[inptr], #24\n"
688                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
689                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
690                             "add z11.s, z11.s, z3.s\n"
691                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
692                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
693                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
694                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
695                             "addvl %[outptr2], %[outptr2], #3\n"
696                             "add z12.s, z12.s, z4.s\n"
697                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
698                             "add z13.s, z13.s, z5.s\n"
699                             "st1w z11.s, p0, [%[outptr3]]\n"
700                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
701                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
702                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
703                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
704                             "add z14.s, z14.s, z6.s\n"
705                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
706                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
707                             "add z15.s, z15.s, z7.s\n"
708                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
709                             "ld1w z9.s, p0/z, [%[outptr5]]\n"
710                             "addvl %[outptr3], %[outptr3], #3\n"
711                             "add z16.s, z16.s, z8.s\n"
712                             "st1w z14.s, p0, [%[outptr4]]\n"
713                             "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
714                             "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
715                             "ld1w z10.s, p1/z, [x8]\n"
716                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
717                             "add z17.s, z17.s, z9.s\n"
718                             "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
719                             "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
720                             "add z10.s, z10.s, z2.s\n"
721                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
722                             "ld1w z4.s, p0/z, [%[outptr6]]\n"
723                             "addvl %[outptr4], %[outptr4], #3\n"
724                             "add z11.s, z11.s, z3.s\n"
725                             "st1w z17.s, p0, [%[outptr5]]\n"
726                             "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
727                             "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
728                             "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
729                             "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
730                             "add z12.s, z12.s, z4.s\n"
731                             "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
732                             "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
733                             "add z13.s, z13.s, z5.s\n"
734                             "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
735                             "ld1w z7.s, p0/z, [%[outptr7]]\n"
736                             "addvl %[outptr5], %[outptr5], #3\n"
737                             "add z14.s, z14.s, z6.s\n"
738                             "st1w z12.s, p0, [%[outptr6]]\n"
739                             "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n"
740                             "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n"
741                             "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n"
742                             "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
743                             "add z15.s, z15.s, z7.s\n"
744                             "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n"
745                             "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n"
746                             "add z16.s, z16.s, z8.s\n"
747                             "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
748                             "addvl %[outptr6], %[outptr6], #3\n"
749                             "add z17.s, z17.s, z9.s\n"
750                             "st1w z15.s, p0, [%[outptr7]]\n"
751                             "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n"
752                             "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n"
753                             "addvl %[outptr7], %[outptr7], #3\n"
754                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
755                           [inptr] "+r" (inptr), [p] "+r" (p)
756                         : [w] "r" (w)
757                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
758                         );
759                     }
760                     break;
761 
762 
763                 }
764             }
765             else
766             {
767                 const uint32_t *biasptr = bias ? bias + i : nullbias;
768 
769                 switch(height)
770                 {
771                 case 1:
772                     {
773                         long w = xmax - i;
774                         long p = 0;
775                         /* Optimized routine to write an entire block with bias added: out = in + bias, three predicated vectors per row */
776                         __asm __volatile (
777                             "addvl x8, %[inptr], #16\n"
778                             "whilelt p0.s, %[p], %[w]\n"
779                             "incw %[p], all, mul #1\n"
780                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
781                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
782                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
783                             "whilelt p1.s, %[p], %[w]\n"
784                             "ld1w z13.s, p0/z, [%[inptr]]\n"
785                             "incw %[p], all, mul #1\n"
786                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
787                             "add z13.s, z13.s, z2.s\n"
788                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
789                             "whilelt p2.s, %[p], %[w]\n"
790                             "add z14.s, z14.s, z3.s\n"
791                             "st1w z13.s, p0, [%[outptr0]]\n"
792                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
793                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
794                             "addvl %[inptr], %[inptr], #24\n"
795                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
796                             "add z15.s, z15.s, z4.s\n"
797                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
798                             "addvl %[outptr0], %[outptr0], #3\n"
799                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
800                           [inptr] "+r" (inptr), [p] "+r" (p)
801                         : [w] "r" (w), [biasptr] "r" (biasptr)
802                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
803                         );
804                     }
805                     break;
806 
807                 case 2:
808                     {
809                         long w = xmax - i;
810                         long p = 0;
811                         /* Optimized routine to write an entire block with bias added: out = in + bias, three predicated vectors per row */
812                         __asm __volatile (
813                             "addvl x8, %[inptr], #16\n"
814                             "whilelt p0.s, %[p], %[w]\n"
815                             "incw %[p], all, mul #1\n"
816                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
817                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
818                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
819                             "whilelt p1.s, %[p], %[w]\n"
820                             "ld1w z13.s, p0/z, [%[inptr]]\n"
821                             "incw %[p], all, mul #1\n"
822                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
823                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
824                             "add z13.s, z13.s, z2.s\n"
825                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
826                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
827                             "whilelt p2.s, %[p], %[w]\n"
828                             "add z16.s, z16.s, z2.s\n"
829                             "st1w z13.s, p0, [%[outptr0]]\n"
830                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
831                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
832                             "add z14.s, z14.s, z3.s\n"
833                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
834                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
835                             "add z17.s, z17.s, z3.s\n"
836                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
837                             "addvl %[inptr], %[inptr], #24\n"
838                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
839                             "add z15.s, z15.s, z4.s\n"
840                             "add z18.s, z18.s, z4.s\n"
841                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
842                             "addvl %[outptr0], %[outptr0], #3\n"
843                             "st1w z16.s, p0, [%[outptr1]]\n"
844                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
845                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
846                             "addvl %[outptr1], %[outptr1], #3\n"
847                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
848                           [inptr] "+r" (inptr), [p] "+r" (p)
849                         : [w] "r" (w), [biasptr] "r" (biasptr)
850                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
851                         );
852                     }
853                     break;
854 
855                 case 3:
856                     {
857                         long w = xmax - i;
858                         long p = 0;
859                         /* Optimized routine to write an entire block with bias added: out = in + bias, three predicated vectors per row */
860                         __asm __volatile (
861                             "addvl x8, %[inptr], #16\n"
862                             "whilelt p0.s, %[p], %[w]\n"
863                             "incw %[p], all, mul #1\n"
864                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
865                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
866                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
867                             "whilelt p1.s, %[p], %[w]\n"
868                             "ld1w z13.s, p0/z, [%[inptr]]\n"
869                             "incw %[p], all, mul #1\n"
870                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
871                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
872                             "add z13.s, z13.s, z2.s\n"
873                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
874                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
875                             "whilelt p2.s, %[p], %[w]\n"
876                             "add z16.s, z16.s, z2.s\n"
877                             "st1w z13.s, p0, [%[outptr0]]\n"
878                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
879                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
880                             "add z14.s, z14.s, z3.s\n"
881                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
882                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
883                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
884                             "add z17.s, z17.s, z3.s\n"
885                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
886                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
887                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
888                             "add z15.s, z15.s, z4.s\n"
889                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
890                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
891                             "addvl %[inptr], %[inptr], #24\n"
892                             "add z18.s, z18.s, z4.s\n"
893                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
894                             "add z19.s, z19.s, z2.s\n"
895                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
896                             "add z20.s, z20.s, z3.s\n"
897                             "addvl %[outptr0], %[outptr0], #3\n"
898                             "st1w z16.s, p0, [%[outptr1]]\n"
899                             "add z13.s, z13.s, z4.s\n"
900                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
901                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
902                             "addvl %[outptr1], %[outptr1], #3\n"
903                             "st1w z19.s, p0, [%[outptr2]]\n"
904                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
905                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
906                             "addvl %[outptr2], %[outptr2], #3\n"
907                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
908                           [inptr] "+r" (inptr), [p] "+r" (p)
909                         : [w] "r" (w), [biasptr] "r" (biasptr)
910                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
911                         );
912                     }
913                     break;
914 
915                 case 4:
916                     {
917                         long w = xmax - i;
918                         long p = 0;
919                         /* Optimized routine to write an entire block with bias added: out = in + bias, three predicated vectors per row */
920                         __asm __volatile (
921                             "addvl x8, %[inptr], #16\n"
922                             "whilelt p0.s, %[p], %[w]\n"
923                             "incw %[p], all, mul #1\n"
924                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
925                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
926                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
927                             "whilelt p1.s, %[p], %[w]\n"
928                             "ld1w z13.s, p0/z, [%[inptr]]\n"
929                             "incw %[p], all, mul #1\n"
930                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
931                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
932                             "add z13.s, z13.s, z2.s\n"
933                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
934                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
935                             "whilelt p2.s, %[p], %[w]\n"
936                             "add z16.s, z16.s, z2.s\n"
937                             "st1w z13.s, p0, [%[outptr0]]\n"
938                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
939                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
940                             "add z14.s, z14.s, z3.s\n"
941                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
942                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
943                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
944                             "add z17.s, z17.s, z3.s\n"
945                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
946                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
947                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
948                             "add z15.s, z15.s, z4.s\n"
949                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
950                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
951                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
952                             "add z18.s, z18.s, z4.s\n"
953                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
954                             "add z19.s, z19.s, z2.s\n"
955                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
956                             "add z20.s, z20.s, z3.s\n"
957                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
958                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
959                             "addvl %[outptr0], %[outptr0], #3\n"
960                             "add z13.s, z13.s, z4.s\n"
961                             "st1w z16.s, p0, [%[outptr1]]\n"
962                             "add z14.s, z14.s, z2.s\n"
963                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
964                             "add z15.s, z15.s, z3.s\n"
965                             "addvl %[inptr], %[inptr], #24\n"
966                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
967                             "add z16.s, z16.s, z4.s\n"
968                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
969                             "addvl %[outptr1], %[outptr1], #3\n"
970                             "st1w z19.s, p0, [%[outptr2]]\n"
971                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
972                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
973                             "addvl %[outptr2], %[outptr2], #3\n"
974                             "st1w z14.s, p0, [%[outptr3]]\n"
975                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
976                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
977                             "addvl %[outptr3], %[outptr3], #3\n"
978                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
979                           [inptr] "+r" (inptr), [p] "+r" (p)
980                         : [w] "r" (w), [biasptr] "r" (biasptr)
981                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
982                         );
983                     }
984                     break;
985 
986                 case 5:
987                     {
988                         long w = xmax - i;
989                         long p = 0;
990                         /* Optimized routine to copy an entire block */
991                         __asm __volatile (
992                             "addvl x8, %[inptr], #16\n"
993                             "whilelt p0.s, %[p], %[w]\n"
994                             "incw %[p], all, mul #1\n"
995                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
996                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
997                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
998                             "whilelt p1.s, %[p], %[w]\n"
999                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1000                             "incw %[p], all, mul #1\n"
1001                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1002                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1003                             "add z13.s, z13.s, z2.s\n"
1004                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1005                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1006                             "whilelt p2.s, %[p], %[w]\n"
1007                             "add z16.s, z16.s, z2.s\n"
1008                             "st1w z13.s, p0, [%[outptr0]]\n"
1009                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1010                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1011                             "add z14.s, z14.s, z3.s\n"
1012                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1013                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1014                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1015                             "add z17.s, z17.s, z3.s\n"
1016                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1017                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1018                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1019                             "add z15.s, z15.s, z4.s\n"
1020                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1021                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1022                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1023                             "add z18.s, z18.s, z4.s\n"
1024                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1025                             "add z19.s, z19.s, z2.s\n"
1026                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1027                             "add z20.s, z20.s, z3.s\n"
1028                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1029                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1030                             "addvl %[outptr0], %[outptr0], #3\n"
1031                             "add z13.s, z13.s, z4.s\n"
1032                             "st1w z16.s, p0, [%[outptr1]]\n"
1033                             "add z14.s, z14.s, z2.s\n"
1034                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1035                             "add z15.s, z15.s, z3.s\n"
1036                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1037                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1038                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1039                             "add z16.s, z16.s, z4.s\n"
1040                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1041                             "addvl %[inptr], %[inptr], #24\n"
1042                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1043                             "addvl %[outptr1], %[outptr1], #3\n"
1044                             "add z17.s, z17.s, z2.s\n"
1045                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1046                             "st1w z19.s, p0, [%[outptr2]]\n"
1047                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1048                             "add z18.s, z18.s, z3.s\n"
1049                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1050                             "add z19.s, z19.s, z4.s\n"
1051                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1052                             "addvl %[outptr2], %[outptr2], #3\n"
1053                             "st1w z14.s, p0, [%[outptr3]]\n"
1054                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1055                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1056                             "addvl %[outptr3], %[outptr3], #3\n"
1057                             "st1w z17.s, p0, [%[outptr4]]\n"
1058                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1059                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1060                             "addvl %[outptr4], %[outptr4], #3\n"
1061                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1062                           [inptr] "+r" (inptr), [p] "+r" (p)
1063                         : [w] "r" (w), [biasptr] "r" (biasptr)
1064                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1065                         );
1066                     }
1067                     break;
1068 
1069                 case 6:
1070                     {
1071                         long w = xmax - i;
1072                         long p = 0;
1073                         /* Optimized routine to copy an entire block */
1074                         __asm __volatile (
1075                             "addvl x8, %[inptr], #16\n"
1076                             "whilelt p0.s, %[p], %[w]\n"
1077                             "incw %[p], all, mul #1\n"
1078                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1079                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1080                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1081                             "whilelt p1.s, %[p], %[w]\n"
1082                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1083                             "incw %[p], all, mul #1\n"
1084                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1085                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1086                             "add z13.s, z13.s, z2.s\n"
1087                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1088                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1089                             "whilelt p2.s, %[p], %[w]\n"
1090                             "add z16.s, z16.s, z2.s\n"
1091                             "st1w z13.s, p0, [%[outptr0]]\n"
1092                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1093                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1094                             "add z14.s, z14.s, z3.s\n"
1095                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1096                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1097                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1098                             "add z17.s, z17.s, z3.s\n"
1099                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1100                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1101                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1102                             "add z15.s, z15.s, z4.s\n"
1103                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1104                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1105                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1106                             "add z18.s, z18.s, z4.s\n"
1107                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1108                             "add z19.s, z19.s, z2.s\n"
1109                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1110                             "add z20.s, z20.s, z3.s\n"
1111                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1112                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1113                             "addvl %[outptr0], %[outptr0], #3\n"
1114                             "add z13.s, z13.s, z4.s\n"
1115                             "st1w z16.s, p0, [%[outptr1]]\n"
1116                             "add z14.s, z14.s, z2.s\n"
1117                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1118                             "add z15.s, z15.s, z3.s\n"
1119                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1120                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1121                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1122                             "add z16.s, z16.s, z4.s\n"
1123                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1124                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1125                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1126                             "addvl %[outptr1], %[outptr1], #3\n"
1127                             "add z17.s, z17.s, z2.s\n"
1128                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1129                             "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1130                             "st1w z19.s, p0, [%[outptr2]]\n"
1131                             "addvl %[inptr], %[inptr], #24\n"
1132                             "add z18.s, z18.s, z3.s\n"
1133                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1134                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1135                             "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1136                             "add z19.s, z19.s, z4.s\n"
1137                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1138                             "addvl %[outptr2], %[outptr2], #3\n"
1139                             "add z20.s, z20.s, z2.s\n"
1140                             "ld1w z13.s, p1/z, [x8]\n"
1141                             "st1w z14.s, p0, [%[outptr3]]\n"
1142                             "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1143                             "add z13.s, z13.s, z3.s\n"
1144                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1145                             "add z14.s, z14.s, z4.s\n"
1146                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1147                             "addvl %[outptr3], %[outptr3], #3\n"
1148                             "st1w z17.s, p0, [%[outptr4]]\n"
1149                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1150                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1151                             "addvl %[outptr4], %[outptr4], #3\n"
1152                             "st1w z20.s, p0, [%[outptr5]]\n"
1153                             "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1154                             "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1155                             "addvl %[outptr5], %[outptr5], #3\n"
1156                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1157                           [inptr] "+r" (inptr), [p] "+r" (p)
1158                         : [w] "r" (w), [biasptr] "r" (biasptr)
1159                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1160                         );
1161                     }
1162                     break;
1163 
1164                 case 7:
1165                     {
1166                         long w = xmax - i;
1167                         long p = 0;
1168                         /* Optimized routine to copy an entire block */
1169                         __asm __volatile (
1170                             "addvl x8, %[inptr], #16\n"
1171                             "whilelt p0.s, %[p], %[w]\n"
1172                             "incw %[p], all, mul #1\n"
1173                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1174                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1175                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1176                             "whilelt p1.s, %[p], %[w]\n"
1177                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1178                             "incw %[p], all, mul #1\n"
1179                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1180                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1181                             "add z13.s, z13.s, z2.s\n"
1182                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1183                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1184                             "whilelt p2.s, %[p], %[w]\n"
1185                             "add z16.s, z16.s, z2.s\n"
1186                             "st1w z13.s, p0, [%[outptr0]]\n"
1187                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1188                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1189                             "add z14.s, z14.s, z3.s\n"
1190                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1191                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1192                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1193                             "add z17.s, z17.s, z3.s\n"
1194                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1195                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1196                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1197                             "add z15.s, z15.s, z4.s\n"
1198                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1199                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1200                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1201                             "add z18.s, z18.s, z4.s\n"
1202                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1203                             "add z19.s, z19.s, z2.s\n"
1204                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1205                             "add z20.s, z20.s, z3.s\n"
1206                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1207                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1208                             "addvl %[outptr0], %[outptr0], #3\n"
1209                             "add z13.s, z13.s, z4.s\n"
1210                             "st1w z16.s, p0, [%[outptr1]]\n"
1211                             "add z14.s, z14.s, z2.s\n"
1212                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1213                             "add z15.s, z15.s, z3.s\n"
1214                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1215                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1216                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1217                             "add z16.s, z16.s, z4.s\n"
1218                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1219                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1220                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1221                             "addvl %[outptr1], %[outptr1], #3\n"
1222                             "add z17.s, z17.s, z2.s\n"
1223                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1224                             "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1225                             "st1w z19.s, p0, [%[outptr2]]\n"
1226                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1227                             "add z18.s, z18.s, z3.s\n"
1228                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1229                             "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1230                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1231                             "addvl %[inptr], %[inptr], #24\n"
1232                             "add z19.s, z19.s, z4.s\n"
1233                             "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1234                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1235                             "addvl %[outptr2], %[outptr2], #3\n"
1236                             "add z20.s, z20.s, z2.s\n"
1237                             "ld1w z13.s, p1/z, [x8]\n"
1238                             "st1w z14.s, p0, [%[outptr3]]\n"
1239                             "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1240                             "add z13.s, z13.s, z3.s\n"
1241                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1242                             "add z14.s, z14.s, z4.s\n"
1243                             "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1244                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1245                             "addvl %[outptr3], %[outptr3], #3\n"
1246                             "add z15.s, z15.s, z2.s\n"
1247                             "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1248                             "st1w z17.s, p0, [%[outptr4]]\n"
1249                             "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1250                             "add z16.s, z16.s, z3.s\n"
1251                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1252                             "add z17.s, z17.s, z4.s\n"
1253                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1254                             "addvl %[outptr4], %[outptr4], #3\n"
1255                             "st1w z20.s, p0, [%[outptr5]]\n"
1256                             "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1257                             "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1258                             "addvl %[outptr5], %[outptr5], #3\n"
1259                             "st1w z15.s, p0, [%[outptr6]]\n"
1260                             "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1261                             "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1262                             "addvl %[outptr6], %[outptr6], #3\n"
1263                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1264                           [inptr] "+r" (inptr), [p] "+r" (p)
1265                         : [w] "r" (w), [biasptr] "r" (biasptr)
1266                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1267                         );
1268                     }
1269                     break;
1270 
1271                 default:
1272                 case 8:
1273                     {
1274                         long w = xmax - i;
1275                         long p = 0;
1276                         /* Optimized routine to copy an entire block */
1277                         __asm __volatile (
1278                             "addvl x8, %[inptr], #16\n"
1279                             "whilelt p0.s, %[p], %[w]\n"
1280                             "incw %[p], all, mul #1\n"
1281                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1282                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1283                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1284                             "whilelt p1.s, %[p], %[w]\n"
1285                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1286                             "incw %[p], all, mul #1\n"
1287                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1288                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1289                             "add z13.s, z13.s, z2.s\n"
1290                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1291                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1292                             "whilelt p2.s, %[p], %[w]\n"
1293                             "add z16.s, z16.s, z2.s\n"
1294                             "st1w z13.s, p0, [%[outptr0]]\n"
1295                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1296                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1297                             "add z14.s, z14.s, z3.s\n"
1298                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1299                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1300                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1301                             "add z17.s, z17.s, z3.s\n"
1302                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1303                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1304                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1305                             "add z15.s, z15.s, z4.s\n"
1306                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1307                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1308                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1309                             "add z18.s, z18.s, z4.s\n"
1310                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1311                             "add z19.s, z19.s, z2.s\n"
1312                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1313                             "add z20.s, z20.s, z3.s\n"
1314                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1315                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1316                             "addvl %[outptr0], %[outptr0], #3\n"
1317                             "add z13.s, z13.s, z4.s\n"
1318                             "st1w z16.s, p0, [%[outptr1]]\n"
1319                             "add z14.s, z14.s, z2.s\n"
1320                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1321                             "add z15.s, z15.s, z3.s\n"
1322                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1323                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1324                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1325                             "add z16.s, z16.s, z4.s\n"
1326                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1327                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1328                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1329                             "addvl %[outptr1], %[outptr1], #3\n"
1330                             "add z17.s, z17.s, z2.s\n"
1331                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1332                             "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1333                             "st1w z19.s, p0, [%[outptr2]]\n"
1334                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1335                             "add z18.s, z18.s, z3.s\n"
1336                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1337                             "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1338                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1339                             "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1340                             "add z19.s, z19.s, z4.s\n"
1341                             "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1342                             "addvl %[inptr], %[inptr], #24\n"
1343                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1344                             "addvl %[outptr2], %[outptr2], #3\n"
1345                             "add z20.s, z20.s, z2.s\n"
1346                             "ld1w z13.s, p1/z, [x8]\n"
1347                             "st1w z14.s, p0, [%[outptr3]]\n"
1348                             "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1349                             "add z13.s, z13.s, z3.s\n"
1350                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1351                             "add z14.s, z14.s, z4.s\n"
1352                             "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1353                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1354                             "addvl %[outptr3], %[outptr3], #3\n"
1355                             "add z15.s, z15.s, z2.s\n"
1356                             "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1357                             "st1w z17.s, p0, [%[outptr4]]\n"
1358                             "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1359                             "add z16.s, z16.s, z3.s\n"
1360                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1361                             "add z17.s, z17.s, z4.s\n"
1362                             "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
1363                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1364                             "addvl %[outptr4], %[outptr4], #3\n"
1365                             "add z18.s, z18.s, z2.s\n"
1366                             "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
1367                             "st1w z20.s, p0, [%[outptr5]]\n"
1368                             "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
1369                             "add z19.s, z19.s, z3.s\n"
1370                             "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1371                             "add z20.s, z20.s, z4.s\n"
1372                             "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1373                             "addvl %[outptr5], %[outptr5], #3\n"
1374                             "st1w z15.s, p0, [%[outptr6]]\n"
1375                             "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1376                             "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1377                             "addvl %[outptr6], %[outptr6], #3\n"
1378                             "st1w z18.s, p0, [%[outptr7]]\n"
1379                             "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n"
1380                             "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n"
1381                             "addvl %[outptr7], %[outptr7], #3\n"
1382                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1383                           [inptr] "+r" (inptr), [p] "+r" (p)
1384                         : [w] "r" (w), [biasptr] "r" (biasptr)
1385                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1386                         );
1387                     }
1388                     break;
1389 
1390 
1391                 }
1392             }
1393         }
1394     }
1395 }
1396 
1397 #endif // ARM_COMPUTE_ENABLE_SVE
1398