xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2020,2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #pragma once
25 
26 #ifdef ARM_COMPUTE_ENABLE_SVE
27 
28 template<>
MergeResults(float * out,const float * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const float * bias,Activation act,bool append)29 void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float *bias, Activation act, bool append)
30 {
31     const float *inptr = in;
32     float nullbias[192];
33     float minval = - std::numeric_limits<float>::infinity();
34     float maxval =   std::numeric_limits<float>::infinity();
35 
36     switch(act.type)
37     {
38         default:
39         case Activation::Type::None:
40             break;
41         case Activation::Type::BoundedReLU:
42             maxval = static_cast<float>(act.param1);
43             /* fall through */
44         case Activation::Type::ReLU:
45             minval = 0.0f;
46             break;
47     }
48 
49     if (!append && !bias)
50     {
51         memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float)));
52     }
53 
54     for (int y=y0; y<ymax; y+=8)
55     {
56         float *outptr0 = out + (y * ldout) + x0;
57         float *outptr1 = outptr0 + ldout;
58         float *outptr2 = outptr1 + ldout;
59         float *outptr3 = outptr2 + ldout;
60         float *outptr4 = outptr3 + ldout;
61         float *outptr5 = outptr4 + ldout;
62         float *outptr6 = outptr5 + ldout;
63         float *outptr7 = outptr6 + ldout;
64 
65         const int height = ymax - y;
66 
67         for (int i=x0; i<xmax; i+=(3 * get_vector_length<float>()))
68         {
69             if (append)
70             {
71                 switch(height)
72                 {
73                 case 1:
74                     {
75                         long w = xmax - i;
76                         long p = 0;
77                         /* Optimized routine: add this input block to the existing output (append mode), clamp to [minval, maxval] and store it back */
78                         __asm __volatile (
79                             "mov z0.s, %s[maxval]\n"
80                             "addvl x8, %[inptr], #16\n"
81                             "mov z1.s, %s[minval]\n"
82                             "whilelt p0.s, %[p], %[w]\n"
83                             "incw %[p], all, mul #1\n"
84                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
85                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
86                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
87                             "whilelt p1.s, %[p], %[w]\n"
88                             "ld1w z10.s, p0/z, [%[inptr]]\n"
89                             "incw %[p], all, mul #1\n"
90                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
91                             "fadd z10.s, z10.s, z2.s\n"
92                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
93                             "whilelt p2.s, %[p], %[w]\n"
94                             "fmin z10.s, p0/m, z10.s, z0.s\n"
95                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
96                             "fadd z11.s, z11.s, z3.s\n"
97                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
98                             "addvl %[inptr], %[inptr], #24\n"
99                             "fmax z10.s, p0/m, z10.s, z1.s\n"
100                             "fmin z11.s, p1/m, z11.s, z0.s\n"
101                             "fadd z12.s, z12.s, z4.s\n"
102                             "st1w z10.s, p0, [%[outptr0]]\n"
103                             "fmax z11.s, p1/m, z11.s, z1.s\n"
104                             "fmin z12.s, p2/m, z12.s, z0.s\n"
105                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
106                             "fmax z12.s, p2/m, z12.s, z1.s\n"
107                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
108                             "addvl %[outptr0], %[outptr0], #3\n"
109                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
110                           [inptr] "+r" (inptr), [p] "+r" (p)
111                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
112                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
113                         );
114                     }
115                     break;
116 
117                 case 2:
118                     {
119                         long w = xmax - i;
120                         long p = 0;
121                         /* Optimized routine: add this input block to the existing output (append mode), clamp to [minval, maxval] and store it back */
122                         __asm __volatile (
123                             "mov z0.s, %s[maxval]\n"
124                             "addvl x8, %[inptr], #16\n"
125                             "mov z1.s, %s[minval]\n"
126                             "whilelt p0.s, %[p], %[w]\n"
127                             "incw %[p], all, mul #1\n"
128                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
129                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
130                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
131                             "whilelt p1.s, %[p], %[w]\n"
132                             "ld1w z10.s, p0/z, [%[inptr]]\n"
133                             "incw %[p], all, mul #1\n"
134                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
135                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
136                             "fadd z10.s, z10.s, z2.s\n"
137                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
138                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
139                             "whilelt p2.s, %[p], %[w]\n"
140                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
141                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
142                             "fmin z10.s, p0/m, z10.s, z0.s\n"
143                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
144                             "fadd z11.s, z11.s, z3.s\n"
145                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
146                             "fadd z13.s, z13.s, z5.s\n"
147                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
148                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
149                             "fmax z10.s, p0/m, z10.s, z1.s\n"
150                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
151                             "fmin z11.s, p1/m, z11.s, z0.s\n"
152                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
153                             "fadd z12.s, z12.s, z4.s\n"
154                             "addvl %[inptr], %[inptr], #24\n"
155                             "fmin z13.s, p0/m, z13.s, z0.s\n"
156                             "st1w z10.s, p0, [%[outptr0]]\n"
157                             "fmax z11.s, p1/m, z11.s, z1.s\n"
158                             "fmin z12.s, p2/m, z12.s, z0.s\n"
159                             "fadd z14.s, z14.s, z6.s\n"
160                             "fmax z13.s, p0/m, z13.s, z1.s\n"
161                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
162                             "fadd z15.s, z15.s, z7.s\n"
163                             "fmax z12.s, p2/m, z12.s, z1.s\n"
164                             "fmin z14.s, p1/m, z14.s, z0.s\n"
165                             "fmin z15.s, p2/m, z15.s, z0.s\n"
166                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
167                             "addvl %[outptr0], %[outptr0], #3\n"
168                             "fmax z14.s, p1/m, z14.s, z1.s\n"
169                             "fmax z15.s, p2/m, z15.s, z1.s\n"
170                             "st1w z13.s, p0, [%[outptr1]]\n"
171                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
172                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
173                             "addvl %[outptr1], %[outptr1], #3\n"
174                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
175                           [inptr] "+r" (inptr), [p] "+r" (p)
176                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
177                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
178                         );
179                     }
180                     break;
181 
182                 case 3:
183                     {
184                         long w = xmax - i;
185                         long p = 0;
186                         /* Optimized routine: add this input block to the existing output (append mode), clamp to [minval, maxval] and store it back */
187                         __asm __volatile (
188                             "mov z0.s, %s[maxval]\n"
189                             "addvl x8, %[inptr], #16\n"
190                             "mov z1.s, %s[minval]\n"
191                             "whilelt p0.s, %[p], %[w]\n"
192                             "incw %[p], all, mul #1\n"
193                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
194                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
195                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
196                             "whilelt p1.s, %[p], %[w]\n"
197                             "ld1w z10.s, p0/z, [%[inptr]]\n"
198                             "incw %[p], all, mul #1\n"
199                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
200                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
201                             "fadd z10.s, z10.s, z2.s\n"
202                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
203                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
204                             "whilelt p2.s, %[p], %[w]\n"
205                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
206                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
207                             "fmin z10.s, p0/m, z10.s, z0.s\n"
208                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
209                             "fadd z11.s, z11.s, z3.s\n"
210                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
211                             "fadd z13.s, z13.s, z5.s\n"
212                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
213                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
214                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
215                             "fmax z10.s, p0/m, z10.s, z1.s\n"
216                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
217                             "fmin z11.s, p1/m, z11.s, z0.s\n"
218                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
219                             "fadd z12.s, z12.s, z4.s\n"
220                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
221                             "fmin z13.s, p0/m, z13.s, z0.s\n"
222                             "st1w z10.s, p0, [%[outptr0]]\n"
223                             "fadd z14.s, z14.s, z6.s\n"
224                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
225                             "fmax z11.s, p1/m, z11.s, z1.s\n"
226                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
227                             "fmin z12.s, p2/m, z12.s, z0.s\n"
228                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
229                             "fmax z13.s, p0/m, z13.s, z1.s\n"
230                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
231                             "fmin z14.s, p1/m, z14.s, z0.s\n"
232                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
233                             "fadd z15.s, z15.s, z7.s\n"
234                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
235                             "fmax z12.s, p2/m, z12.s, z1.s\n"
236                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
237                             "fmax z14.s, p1/m, z14.s, z1.s\n"
238                             "addvl %[inptr], %[inptr], #24\n"
239                             "fmin z15.s, p2/m, z15.s, z0.s\n"
240                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
241                             "fadd z16.s, z16.s, z8.s\n"
242                             "addvl %[outptr0], %[outptr0], #3\n"
243                             "fadd z17.s, z17.s, z9.s\n"
244                             "st1w z13.s, p0, [%[outptr1]]\n"
245                             "fmax z15.s, p2/m, z15.s, z1.s\n"
246                             "fmin z16.s, p0/m, z16.s, z0.s\n"
247                             "fadd z10.s, z10.s, z2.s\n"
248                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
249                             "fmin z17.s, p1/m, z17.s, z0.s\n"
250                             "fmax z16.s, p0/m, z16.s, z1.s\n"
251                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
252                             "fmin z10.s, p2/m, z10.s, z0.s\n"
253                             "addvl %[outptr1], %[outptr1], #3\n"
254                             "fmax z17.s, p1/m, z17.s, z1.s\n"
255                             "st1w z16.s, p0, [%[outptr2]]\n"
256                             "fmax z10.s, p2/m, z10.s, z1.s\n"
257                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
258                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
259                             "addvl %[outptr2], %[outptr2], #3\n"
260                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
261                           [inptr] "+r" (inptr), [p] "+r" (p)
262                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
263                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
264                         );
265                     }
266                     break;
267 
268                 case 4:
269                     {
270                         long w = xmax - i;
271                         long p = 0;
272                         /* Optimized routine: add this input block to the existing output (append mode), clamp to [minval, maxval] and store it back */
273                         __asm __volatile (
274                             "mov z0.s, %s[maxval]\n"
275                             "addvl x8, %[inptr], #16\n"
276                             "mov z1.s, %s[minval]\n"
277                             "whilelt p0.s, %[p], %[w]\n"
278                             "incw %[p], all, mul #1\n"
279                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
280                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
281                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
282                             "whilelt p1.s, %[p], %[w]\n"
283                             "ld1w z10.s, p0/z, [%[inptr]]\n"
284                             "incw %[p], all, mul #1\n"
285                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
286                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
287                             "fadd z10.s, z10.s, z2.s\n"
288                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
289                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
290                             "whilelt p2.s, %[p], %[w]\n"
291                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
292                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
293                             "fmin z10.s, p0/m, z10.s, z0.s\n"
294                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
295                             "fadd z11.s, z11.s, z3.s\n"
296                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
297                             "fadd z13.s, z13.s, z5.s\n"
298                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
299                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
300                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
301                             "fmax z10.s, p0/m, z10.s, z1.s\n"
302                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
303                             "fmin z11.s, p1/m, z11.s, z0.s\n"
304                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
305                             "fadd z12.s, z12.s, z4.s\n"
306                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
307                             "fmin z13.s, p0/m, z13.s, z0.s\n"
308                             "st1w z10.s, p0, [%[outptr0]]\n"
309                             "fadd z14.s, z14.s, z6.s\n"
310                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
311                             "fmax z11.s, p1/m, z11.s, z1.s\n"
312                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
313                             "fmin z12.s, p2/m, z12.s, z0.s\n"
314                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
315                             "fmax z13.s, p0/m, z13.s, z1.s\n"
316                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
317                             "fmin z14.s, p1/m, z14.s, z0.s\n"
318                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
319                             "fadd z15.s, z15.s, z7.s\n"
320                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
321                             "fmax z12.s, p2/m, z12.s, z1.s\n"
322                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
323                             "fadd z16.s, z16.s, z8.s\n"
324                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
325                             "fmax z14.s, p1/m, z14.s, z1.s\n"
326                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
327                             "fmin z15.s, p2/m, z15.s, z0.s\n"
328                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
329                             "fadd z17.s, z17.s, z9.s\n"
330                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
331                             "fmin z16.s, p0/m, z16.s, z0.s\n"
332                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
333                             "fadd z10.s, z10.s, z2.s\n"
334                             "st1w z13.s, p0, [%[outptr1]]\n"
335                             "fmax z15.s, p2/m, z15.s, z1.s\n"
336                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
337                             "fmin z17.s, p1/m, z17.s, z0.s\n"
338                             "addvl %[outptr0], %[outptr0], #3\n"
339                             "fmax z16.s, p0/m, z16.s, z1.s\n"
340                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
341                             "fmin z10.s, p2/m, z10.s, z0.s\n"
342                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
343                             "fmax z17.s, p1/m, z17.s, z1.s\n"
344                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
345                             "fadd z11.s, z11.s, z3.s\n"
346                             "addvl %[outptr1], %[outptr1], #3\n"
347                             "fmax z10.s, p2/m, z10.s, z1.s\n"
348                             "st1w z16.s, p0, [%[outptr2]]\n"
349                             "fadd z12.s, z12.s, z4.s\n"
350                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
351                             "fmin z11.s, p0/m, z11.s, z0.s\n"
352                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
353                             "fadd z13.s, z13.s, z5.s\n"
354                             "addvl %[inptr], %[inptr], #24\n"
355                             "fmin z12.s, p1/m, z12.s, z0.s\n"
356                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
357                             "fmax z11.s, p0/m, z11.s, z1.s\n"
358                             "addvl %[outptr2], %[outptr2], #3\n"
359                             "fmin z13.s, p2/m, z13.s, z0.s\n"
360                             "fmax z12.s, p1/m, z12.s, z1.s\n"
361                             "st1w z11.s, p0, [%[outptr3]]\n"
362                             "fmax z13.s, p2/m, z13.s, z1.s\n"
363                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
364                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
365                             "addvl %[outptr3], %[outptr3], #3\n"
366                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
367                           [inptr] "+r" (inptr), [p] "+r" (p)
368                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
369                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
370                         );
371                     }
372                     break;
373 
374                 case 5:
375                     {
376                         long w = xmax - i;
377                         long p = 0;
378                         /* Optimized routine: add this input block to the existing output (append mode), clamp to [minval, maxval] and store it back */
379                         __asm __volatile (
380                             "mov z0.s, %s[maxval]\n"
381                             "addvl x8, %[inptr], #16\n"
382                             "mov z1.s, %s[minval]\n"
383                             "whilelt p0.s, %[p], %[w]\n"
384                             "incw %[p], all, mul #1\n"
385                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
386                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
387                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
388                             "whilelt p1.s, %[p], %[w]\n"
389                             "ld1w z10.s, p0/z, [%[inptr]]\n"
390                             "incw %[p], all, mul #1\n"
391                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
392                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
393                             "fadd z10.s, z10.s, z2.s\n"
394                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
395                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
396                             "whilelt p2.s, %[p], %[w]\n"
397                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
398                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
399                             "fmin z10.s, p0/m, z10.s, z0.s\n"
400                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
401                             "fadd z11.s, z11.s, z3.s\n"
402                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
403                             "fadd z13.s, z13.s, z5.s\n"
404                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
405                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
406                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
407                             "fmax z10.s, p0/m, z10.s, z1.s\n"
408                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
409                             "fmin z11.s, p1/m, z11.s, z0.s\n"
410                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
411                             "fadd z12.s, z12.s, z4.s\n"
412                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
413                             "fmin z13.s, p0/m, z13.s, z0.s\n"
414                             "st1w z10.s, p0, [%[outptr0]]\n"
415                             "fadd z14.s, z14.s, z6.s\n"
416                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
417                             "fmax z11.s, p1/m, z11.s, z1.s\n"
418                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
419                             "fmin z12.s, p2/m, z12.s, z0.s\n"
420                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
421                             "fmax z13.s, p0/m, z13.s, z1.s\n"
422                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
423                             "fmin z14.s, p1/m, z14.s, z0.s\n"
424                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
425                             "fadd z15.s, z15.s, z7.s\n"
426                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
427                             "fmax z12.s, p2/m, z12.s, z1.s\n"
428                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
429                             "fadd z16.s, z16.s, z8.s\n"
430                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
431                             "fmax z14.s, p1/m, z14.s, z1.s\n"
432                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
433                             "fmin z15.s, p2/m, z15.s, z0.s\n"
434                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
435                             "fadd z17.s, z17.s, z9.s\n"
436                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
437                             "fmin z16.s, p0/m, z16.s, z0.s\n"
438                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
439                             "fadd z10.s, z10.s, z2.s\n"
440                             "st1w z13.s, p0, [%[outptr1]]\n"
441                             "fmax z15.s, p2/m, z15.s, z1.s\n"
442                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
443                             "fmin z17.s, p1/m, z17.s, z0.s\n"
444                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
445                             "fmax z16.s, p0/m, z16.s, z1.s\n"
446                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
447                             "fmin z10.s, p2/m, z10.s, z0.s\n"
448                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
449                             "fadd z11.s, z11.s, z3.s\n"
450                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
451                             "fmax z17.s, p1/m, z17.s, z1.s\n"
452                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
453                             "fadd z12.s, z12.s, z4.s\n"
454                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
455                             "fmax z10.s, p2/m, z10.s, z1.s\n"
456                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
457                             "fmin z11.s, p0/m, z11.s, z0.s\n"
458                             "st1w z16.s, p0, [%[outptr2]]\n"
459                             "fadd z13.s, z13.s, z5.s\n"
460                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
461                             "fmin z12.s, p1/m, z12.s, z0.s\n"
462                             "addvl %[outptr0], %[outptr0], #3\n"
463                             "fmax z11.s, p0/m, z11.s, z1.s\n"
464                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
465                             "fmin z13.s, p2/m, z13.s, z0.s\n"
466                             "addvl %[outptr1], %[outptr1], #3\n"
467                             "fmax z12.s, p1/m, z12.s, z1.s\n"
468                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
469                             "fadd z14.s, z14.s, z6.s\n"
470                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
471                             "fmax z13.s, p2/m, z13.s, z1.s\n"
472                             "st1w z11.s, p0, [%[outptr3]]\n"
473                             "fadd z15.s, z15.s, z7.s\n"
474                             "addvl %[outptr2], %[outptr2], #3\n"
475                             "fmin z14.s, p0/m, z14.s, z0.s\n"
476                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
477                             "fadd z16.s, z16.s, z8.s\n"
478                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
479                             "fmin z15.s, p1/m, z15.s, z0.s\n"
480                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
481                             "fmax z14.s, p0/m, z14.s, z1.s\n"
482                             "addvl %[outptr3], %[outptr3], #3\n"
483                             "fmin z16.s, p2/m, z16.s, z0.s\n"
484                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
485                             "fmax z15.s, p1/m, z15.s, z1.s\n"
486                             "st1w z14.s, p0, [%[outptr4]]\n"
487                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
488                             "fmax z16.s, p2/m, z16.s, z1.s\n"
489                             "addvl %[inptr], %[inptr], #24\n"
490                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
491                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
492                             "addvl %[outptr4], %[outptr4], #3\n"
493                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
494                           [inptr] "+r" (inptr), [p] "+r" (p)
495                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
496                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
497                         );
498                     }
499                     break;
500 
501                 case 6:
502                     {
503                         long w = xmax - i;
504                         long p = 0;
505                         /* Optimized routine to copy an entire block */
506                         __asm __volatile (
507                             "mov z0.s, %s[maxval]\n"
508                             "addvl x8, %[inptr], #16\n"
509                             "mov z1.s, %s[minval]\n"
510                             "whilelt p0.s, %[p], %[w]\n"
511                             "incw %[p], all, mul #1\n"
512                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
513                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
514                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
515                             "whilelt p1.s, %[p], %[w]\n"
516                             "ld1w z10.s, p0/z, [%[inptr]]\n"
517                             "incw %[p], all, mul #1\n"
518                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
519                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
520                             "fadd z10.s, z10.s, z2.s\n"
521                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
522                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
523                             "whilelt p2.s, %[p], %[w]\n"
524                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
525                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
526                             "fmin z10.s, p0/m, z10.s, z0.s\n"
527                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
528                             "fadd z11.s, z11.s, z3.s\n"
529                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
530                             "fadd z13.s, z13.s, z5.s\n"
531                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
532                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
533                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
534                             "fmax z10.s, p0/m, z10.s, z1.s\n"
535                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
536                             "fmin z11.s, p1/m, z11.s, z0.s\n"
537                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
538                             "fadd z12.s, z12.s, z4.s\n"
539                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
540                             "fmin z13.s, p0/m, z13.s, z0.s\n"
541                             "st1w z10.s, p0, [%[outptr0]]\n"
542                             "fadd z14.s, z14.s, z6.s\n"
543                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
544                             "fmax z11.s, p1/m, z11.s, z1.s\n"
545                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
546                             "fmin z12.s, p2/m, z12.s, z0.s\n"
547                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
548                             "fmax z13.s, p0/m, z13.s, z1.s\n"
549                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
550                             "fmin z14.s, p1/m, z14.s, z0.s\n"
551                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
552                             "fadd z15.s, z15.s, z7.s\n"
553                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
554                             "fmax z12.s, p2/m, z12.s, z1.s\n"
555                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
556                             "fadd z16.s, z16.s, z8.s\n"
557                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
558                             "fmax z14.s, p1/m, z14.s, z1.s\n"
559                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
560                             "fmin z15.s, p2/m, z15.s, z0.s\n"
561                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
562                             "fadd z17.s, z17.s, z9.s\n"
563                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
564                             "fmin z16.s, p0/m, z16.s, z0.s\n"
565                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
566                             "fadd z10.s, z10.s, z2.s\n"
567                             "st1w z13.s, p0, [%[outptr1]]\n"
568                             "fmax z15.s, p2/m, z15.s, z1.s\n"
569                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
570                             "fmin z17.s, p1/m, z17.s, z0.s\n"
571                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
572                             "fmax z16.s, p0/m, z16.s, z1.s\n"
573                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
574                             "fmin z10.s, p2/m, z10.s, z0.s\n"
575                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
576                             "fadd z11.s, z11.s, z3.s\n"
577                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
578                             "fmax z17.s, p1/m, z17.s, z1.s\n"
579                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
580                             "fadd z12.s, z12.s, z4.s\n"
581                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
582                             "fmax z10.s, p2/m, z10.s, z1.s\n"
583                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
584                             "fmin z11.s, p0/m, z11.s, z0.s\n"
585                             "st1w z16.s, p0, [%[outptr2]]\n"
586                             "fadd z13.s, z13.s, z5.s\n"
587                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
588                             "fmin z12.s, p1/m, z12.s, z0.s\n"
589                             "ld1w z9.s, p0/z, [%[outptr5]]\n"
590                             "fadd z14.s, z14.s, z6.s\n"
591                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
592                             "fmax z11.s, p0/m, z11.s, z1.s\n"
593                             "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
594                             "fmin z13.s, p2/m, z13.s, z0.s\n"
595                             "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
596                             "fmax z12.s, p1/m, z12.s, z1.s\n"
597                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
598                             "fmin z14.s, p0/m, z14.s, z0.s\n"
599                             "ld1w z10.s, p1/z, [x8]\n"
600                             "fadd z15.s, z15.s, z7.s\n"
601                             "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
602                             "fmax z13.s, p2/m, z13.s, z1.s\n"
603                             "st1w z11.s, p0, [%[outptr3]]\n"
604                             "fadd z16.s, z16.s, z8.s\n"
605                             "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
606                             "fmax z14.s, p0/m, z14.s, z1.s\n"
607                             "addvl %[outptr0], %[outptr0], #3\n"
608                             "fmin z15.s, p1/m, z15.s, z0.s\n"
609                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
610                             "fmin z16.s, p2/m, z16.s, z0.s\n"
611                             "addvl %[outptr1], %[outptr1], #3\n"
612                             "fadd z17.s, z17.s, z9.s\n"
613                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
614                             "fmax z15.s, p1/m, z15.s, z1.s\n"
615                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
616                             "fmax z16.s, p2/m, z16.s, z1.s\n"
617                             "st1w z14.s, p0, [%[outptr4]]\n"
618                             "fmin z17.s, p0/m, z17.s, z0.s\n"
619                             "addvl %[outptr2], %[outptr2], #3\n"
620                             "fadd z10.s, z10.s, z2.s\n"
621                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
622                             "fadd z11.s, z11.s, z3.s\n"
623                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
624                             "fmax z17.s, p0/m, z17.s, z1.s\n"
625                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
626                             "fmin z10.s, p1/m, z10.s, z0.s\n"
627                             "addvl %[outptr3], %[outptr3], #3\n"
628                             "fmin z11.s, p2/m, z11.s, z0.s\n"
629                             "st1w z17.s, p0, [%[outptr5]]\n"
630                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
631                             "fmax z10.s, p1/m, z10.s, z1.s\n"
632                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
633                             "fmax z11.s, p2/m, z11.s, z1.s\n"
634                             "addvl %[outptr4], %[outptr4], #3\n"
635                             "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
636                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
637                             "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
638                             "addvl %[inptr], %[inptr], #24\n"
639                             "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
640                             "addvl %[outptr5], %[outptr5], #3\n"
641                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
642                           [inptr] "+r" (inptr), [p] "+r" (p)
643                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
644                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
645                         );
646                     }
647                     break;
648 
649                 case 7:
650                     {
651                         long w = xmax - i;
652                         long p = 0;
653                         /* Optimized routine to copy an entire block */
654                         __asm __volatile (
655                             "mov z0.s, %s[maxval]\n"
656                             "addvl x8, %[inptr], #16\n"
657                             "mov z1.s, %s[minval]\n"
658                             "whilelt p0.s, %[p], %[w]\n"
659                             "incw %[p], all, mul #1\n"
660                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
661                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
662                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
663                             "whilelt p1.s, %[p], %[w]\n"
664                             "ld1w z10.s, p0/z, [%[inptr]]\n"
665                             "incw %[p], all, mul #1\n"
666                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
667                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
668                             "fadd z10.s, z10.s, z2.s\n"
669                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
670                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
671                             "whilelt p2.s, %[p], %[w]\n"
672                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
673                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
674                             "fmin z10.s, p0/m, z10.s, z0.s\n"
675                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
676                             "fadd z11.s, z11.s, z3.s\n"
677                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
678                             "fadd z13.s, z13.s, z5.s\n"
679                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
680                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
681                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
682                             "fmax z10.s, p0/m, z10.s, z1.s\n"
683                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
684                             "fmin z11.s, p1/m, z11.s, z0.s\n"
685                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
686                             "fadd z12.s, z12.s, z4.s\n"
687                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
688                             "fmin z13.s, p0/m, z13.s, z0.s\n"
689                             "st1w z10.s, p0, [%[outptr0]]\n"
690                             "fadd z14.s, z14.s, z6.s\n"
691                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
692                             "fmax z11.s, p1/m, z11.s, z1.s\n"
693                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
694                             "fmin z12.s, p2/m, z12.s, z0.s\n"
695                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
696                             "fmax z13.s, p0/m, z13.s, z1.s\n"
697                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
698                             "fmin z14.s, p1/m, z14.s, z0.s\n"
699                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
700                             "fadd z15.s, z15.s, z7.s\n"
701                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
702                             "fmax z12.s, p2/m, z12.s, z1.s\n"
703                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
704                             "fadd z16.s, z16.s, z8.s\n"
705                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
706                             "fmax z14.s, p1/m, z14.s, z1.s\n"
707                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
708                             "fmin z15.s, p2/m, z15.s, z0.s\n"
709                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
710                             "fadd z17.s, z17.s, z9.s\n"
711                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
712                             "fmin z16.s, p0/m, z16.s, z0.s\n"
713                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
714                             "fadd z10.s, z10.s, z2.s\n"
715                             "st1w z13.s, p0, [%[outptr1]]\n"
716                             "fmax z15.s, p2/m, z15.s, z1.s\n"
717                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
718                             "fmin z17.s, p1/m, z17.s, z0.s\n"
719                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
720                             "fmax z16.s, p0/m, z16.s, z1.s\n"
721                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
722                             "fmin z10.s, p2/m, z10.s, z0.s\n"
723                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
724                             "fadd z11.s, z11.s, z3.s\n"
725                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
726                             "fmax z17.s, p1/m, z17.s, z1.s\n"
727                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
728                             "fadd z12.s, z12.s, z4.s\n"
729                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
730                             "fmax z10.s, p2/m, z10.s, z1.s\n"
731                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
732                             "fmin z11.s, p0/m, z11.s, z0.s\n"
733                             "st1w z16.s, p0, [%[outptr2]]\n"
734                             "fadd z13.s, z13.s, z5.s\n"
735                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
736                             "fmin z12.s, p1/m, z12.s, z0.s\n"
737                             "ld1w z9.s, p0/z, [%[outptr5]]\n"
738                             "fadd z14.s, z14.s, z6.s\n"
739                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
740                             "fmax z11.s, p0/m, z11.s, z1.s\n"
741                             "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
742                             "fmin z13.s, p2/m, z13.s, z0.s\n"
743                             "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
744                             "fmax z12.s, p1/m, z12.s, z1.s\n"
745                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
746                             "fmin z14.s, p0/m, z14.s, z0.s\n"
747                             "ld1w z10.s, p1/z, [x8]\n"
748                             "fadd z15.s, z15.s, z7.s\n"
749                             "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
750                             "fmax z13.s, p2/m, z13.s, z1.s\n"
751                             "st1w z11.s, p0, [%[outptr3]]\n"
752                             "fadd z16.s, z16.s, z8.s\n"
753                             "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
754                             "fmax z14.s, p0/m, z14.s, z1.s\n"
755                             "ld1w z4.s, p0/z, [%[outptr6]]\n"
756                             "fmin z15.s, p1/m, z15.s, z0.s\n"
757                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
758                             "fadd z17.s, z17.s, z9.s\n"
759                             "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
760                             "fmin z16.s, p2/m, z16.s, z0.s\n"
761                             "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
762                             "fadd z10.s, z10.s, z2.s\n"
763                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
764                             "fmax z15.s, p1/m, z15.s, z1.s\n"
765                             "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
766                             "fmin z17.s, p0/m, z17.s, z0.s\n"
767                             "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
768                             "fmax z16.s, p2/m, z16.s, z1.s\n"
769                             "st1w z14.s, p0, [%[outptr4]]\n"
770                             "fmin z10.s, p1/m, z10.s, z0.s\n"
771                             "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
772                             "fadd z11.s, z11.s, z3.s\n"
773                             "addvl %[outptr0], %[outptr0], #3\n"
774                             "fmax z17.s, p0/m, z17.s, z1.s\n"
775                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
776                             "fmax z10.s, p1/m, z10.s, z1.s\n"
777                             "addvl %[outptr1], %[outptr1], #3\n"
778                             "fmin z11.s, p2/m, z11.s, z0.s\n"
779                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
780                             "fadd z12.s, z12.s, z4.s\n"
781                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
782                             "fadd z13.s, z13.s, z5.s\n"
783                             "st1w z17.s, p0, [%[outptr5]]\n"
784                             "fmax z11.s, p2/m, z11.s, z1.s\n"
785                             "addvl %[outptr2], %[outptr2], #3\n"
786                             "fmin z12.s, p0/m, z12.s, z0.s\n"
787                             "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
788                             "fmin z13.s, p1/m, z13.s, z0.s\n"
789                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
790                             "fadd z14.s, z14.s, z6.s\n"
791                             "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
792                             "fmax z12.s, p0/m, z12.s, z1.s\n"
793                             "addvl %[outptr3], %[outptr3], #3\n"
794                             "fmax z13.s, p1/m, z13.s, z1.s\n"
795                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
796                             "fmin z14.s, p2/m, z14.s, z0.s\n"
797                             "st1w z12.s, p0, [%[outptr6]]\n"
798                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
799                             "addvl %[outptr4], %[outptr4], #3\n"
800                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
801                             "fmax z14.s, p2/m, z14.s, z1.s\n"
802                             "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
803                             "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
804                             "addvl %[outptr5], %[outptr5], #3\n"
805                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
806                             "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
807                             "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
808                             "addvl %[outptr6], %[outptr6], #3\n"
809                             "addvl %[inptr], %[inptr], #24\n"
810                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
811                           [inptr] "+r" (inptr), [p] "+r" (p)
812                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
813                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
814                         );
815                     }
816                     break;
817 
818                 default:
819                 case 8:
820                     {
821                         long w = xmax - i;
822                         long p = 0;
823                         /* Optimized routine to copy an entire block */
824                         __asm __volatile (
825                             "mov z0.s, %s[maxval]\n"
826                             "addvl x8, %[inptr], #16\n"
827                             "mov z1.s, %s[minval]\n"
828                             "whilelt p0.s, %[p], %[w]\n"
829                             "incw %[p], all, mul #1\n"
830                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
831                             "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
832                             "ld1w z2.s, p0/z, [%[outptr0]]\n"
833                             "whilelt p1.s, %[p], %[w]\n"
834                             "ld1w z10.s, p0/z, [%[inptr]]\n"
835                             "incw %[p], all, mul #1\n"
836                             "ld1w z5.s, p0/z, [%[outptr1]]\n"
837                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
838                             "fadd z10.s, z10.s, z2.s\n"
839                             "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
840                             "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
841                             "whilelt p2.s, %[p], %[w]\n"
842                             "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
843                             "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
844                             "fmin z10.s, p0/m, z10.s, z0.s\n"
845                             "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
846                             "fadd z11.s, z11.s, z3.s\n"
847                             "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
848                             "fadd z13.s, z13.s, z5.s\n"
849                             "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
850                             "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
851                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
852                             "fmax z10.s, p0/m, z10.s, z1.s\n"
853                             "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
854                             "fmin z11.s, p1/m, z11.s, z0.s\n"
855                             "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
856                             "fadd z12.s, z12.s, z4.s\n"
857                             "ld1w z8.s, p0/z, [%[outptr2]]\n"
858                             "fmin z13.s, p0/m, z13.s, z0.s\n"
859                             "st1w z10.s, p0, [%[outptr0]]\n"
860                             "fadd z14.s, z14.s, z6.s\n"
861                             "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
862                             "fmax z11.s, p1/m, z11.s, z1.s\n"
863                             "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
864                             "fmin z12.s, p2/m, z12.s, z0.s\n"
865                             "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
866                             "fmax z13.s, p0/m, z13.s, z1.s\n"
867                             "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
868                             "fmin z14.s, p1/m, z14.s, z0.s\n"
869                             "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
870                             "fadd z15.s, z15.s, z7.s\n"
871                             "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
872                             "fmax z12.s, p2/m, z12.s, z1.s\n"
873                             "ld1w z3.s, p0/z, [%[outptr3]]\n"
874                             "fadd z16.s, z16.s, z8.s\n"
875                             "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
876                             "fmax z14.s, p1/m, z14.s, z1.s\n"
877                             "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
878                             "fmin z15.s, p2/m, z15.s, z0.s\n"
879                             "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
880                             "fadd z17.s, z17.s, z9.s\n"
881                             "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
882                             "fmin z16.s, p0/m, z16.s, z0.s\n"
883                             "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
884                             "fadd z10.s, z10.s, z2.s\n"
885                             "st1w z13.s, p0, [%[outptr1]]\n"
886                             "fmax z15.s, p2/m, z15.s, z1.s\n"
887                             "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
888                             "fmin z17.s, p1/m, z17.s, z0.s\n"
889                             "ld1w z6.s, p0/z, [%[outptr4]]\n"
890                             "fmax z16.s, p0/m, z16.s, z1.s\n"
891                             "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
892                             "fmin z10.s, p2/m, z10.s, z0.s\n"
893                             "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
894                             "fadd z11.s, z11.s, z3.s\n"
895                             "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
896                             "fmax z17.s, p1/m, z17.s, z1.s\n"
897                             "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
898                             "fadd z12.s, z12.s, z4.s\n"
899                             "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
900                             "fmax z10.s, p2/m, z10.s, z1.s\n"
901                             "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
902                             "fmin z11.s, p0/m, z11.s, z0.s\n"
903                             "st1w z16.s, p0, [%[outptr2]]\n"
904                             "fadd z13.s, z13.s, z5.s\n"
905                             "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
906                             "fmin z12.s, p1/m, z12.s, z0.s\n"
907                             "ld1w z9.s, p0/z, [%[outptr5]]\n"
908                             "fadd z14.s, z14.s, z6.s\n"
909                             "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
910                             "fmax z11.s, p0/m, z11.s, z1.s\n"
911                             "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
912                             "fmin z13.s, p2/m, z13.s, z0.s\n"
913                             "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
914                             "fmax z12.s, p1/m, z12.s, z1.s\n"
915                             "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
916                             "fmin z14.s, p0/m, z14.s, z0.s\n"
917                             "ld1w z10.s, p1/z, [x8]\n"
918                             "fadd z15.s, z15.s, z7.s\n"
919                             "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
920                             "fmax z13.s, p2/m, z13.s, z1.s\n"
921                             "st1w z11.s, p0, [%[outptr3]]\n"
922                             "fadd z16.s, z16.s, z8.s\n"
923                             "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
924                             "fmax z14.s, p0/m, z14.s, z1.s\n"
925                             "ld1w z4.s, p0/z, [%[outptr6]]\n"
926                             "fmin z15.s, p1/m, z15.s, z0.s\n"
927                             "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
928                             "fadd z17.s, z17.s, z9.s\n"
929                             "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
930                             "fmin z16.s, p2/m, z16.s, z0.s\n"
931                             "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
932                             "fadd z10.s, z10.s, z2.s\n"
933                             "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
934                             "fmax z15.s, p1/m, z15.s, z1.s\n"
935                             "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
936                             "fmin z17.s, p0/m, z17.s, z0.s\n"
937                             "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
938                             "fmax z16.s, p2/m, z16.s, z1.s\n"
939                             "st1w z14.s, p0, [%[outptr4]]\n"
940                             "fmin z10.s, p1/m, z10.s, z0.s\n"
941                             "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
942                             "fadd z11.s, z11.s, z3.s\n"
943                             "ld1w z7.s, p0/z, [%[outptr7]]\n"
944                             "fmax z17.s, p0/m, z17.s, z1.s\n"
945                             "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
946                             "fadd z12.s, z12.s, z4.s\n"
947                             "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n"
948                             "fmax z10.s, p1/m, z10.s, z1.s\n"
949                             "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n"
950                             "fmin z11.s, p2/m, z11.s, z0.s\n"
951                             "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
952                             "fadd z13.s, z13.s, z5.s\n"
953                             "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n"
954                             "fmin z12.s, p0/m, z12.s, z0.s\n"
955                             "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n"
956                             "fadd z14.s, z14.s, z6.s\n"
957                             "st1w z17.s, p0, [%[outptr5]]\n"
958                             "fmax z11.s, p2/m, z11.s, z1.s\n"
959                             "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n"
960                             "fmin z13.s, p1/m, z13.s, z0.s\n"
961                             "addvl %[outptr0], %[outptr0], #3\n"
962                             "fmax z12.s, p0/m, z12.s, z1.s\n"
963                             "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
964                             "fmin z14.s, p2/m, z14.s, z0.s\n"
965                             "addvl %[outptr1], %[outptr1], #3\n"
966                             "fmax z13.s, p1/m, z13.s, z1.s\n"
967                             "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
968                             "fadd z15.s, z15.s, z7.s\n"
969                             "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
970                             "fmax z14.s, p2/m, z14.s, z1.s\n"
971                             "st1w z12.s, p0, [%[outptr6]]\n"
972                             "fadd z16.s, z16.s, z8.s\n"
973                             "addvl %[outptr2], %[outptr2], #3\n"
974                             "fmin z15.s, p0/m, z15.s, z0.s\n"
975                             "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
976                             "fadd z17.s, z17.s, z9.s\n"
977                             "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
978                             "fmin z16.s, p1/m, z16.s, z0.s\n"
979                             "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
980                             "fmax z15.s, p0/m, z15.s, z1.s\n"
981                             "addvl %[outptr3], %[outptr3], #3\n"
982                             "fmin z17.s, p2/m, z17.s, z0.s\n"
983                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
984                             "fmax z16.s, p1/m, z16.s, z1.s\n"
985                             "st1w z15.s, p0, [%[outptr7]]\n"
986                             "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
987                             "fmax z17.s, p2/m, z17.s, z1.s\n"
988                             "addvl %[outptr4], %[outptr4], #3\n"
989                             "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n"
990                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
991                             "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
992                             "addvl %[outptr5], %[outptr5], #3\n"
993                             "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n"
994                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
995                             "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
996                             "addvl %[outptr6], %[outptr6], #3\n"
997                             "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
998                             "addvl %[outptr7], %[outptr7], #3\n"
999                             "addvl %[inptr], %[inptr], #24\n"
1000                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1001                           [inptr] "+r" (inptr), [p] "+r" (p)
1002                         : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
1003                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1004                         );
1005                     }
1006                     break;
1007 
1008 
1009                 }
1010             }
1011             else
1012             {
1013                 const float *biasptr = bias ? bias + i : nullbias;
1014 
1015                 switch(height)
1016                 {
1017                 case 1:
1018                     {
                             /*
                              * Bias path, height == 1: merge a single row.
                              * For each of up to three vectors of 32-bit lanes:
                              *     out0 = max(min(in + bias, maxval), minval)
                              * w is the column count from i up to xmax and bounds the whilelt
                              * predicates p0-p2; p is the lane counter that each incw steps by
                              * one vector of words before the next predicate is formed.
                              * Registers: z0/z1 = maxval/minval broadcasts, z2-z4 = bias,
                              * z13-z15 = row 0 data.
                              * All eight outptrs are declared as in/out operands, although only
                              * outptr0 is referenced by this variant.
                              */
1019                         long w = xmax - i;
1020                         long p = 0;
1021                         /* Optimized routine to merge one block: add bias, clamp, store */
1022                         __asm __volatile (
                                 /* Broadcast the clamp bounds. x8 = inptr + 16 vectors is set up here
                                    but is not referenced again in this height-1 variant. */
1023                             "mov z0.s, %s[maxval]\n"
1024                             "addvl x8, %[inptr], #16\n"
1025                             "mov z1.s, %s[minval]\n"
1026                             "whilelt p0.s, %[p], %[w]\n"
1027                             "incw %[p], all, mul #1\n"
1028                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1029                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1030                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1031                             "whilelt p1.s, %[p], %[w]\n"
1032                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1033                             "incw %[p], all, mul #1\n"
1034                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1035                             "fadd z13.s, z13.s, z2.s\n"
1036                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1037                             "whilelt p2.s, %[p], %[w]\n"
1038                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1039                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1040                             "fadd z14.s, z14.s, z3.s\n"
1041                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
                                 /* The input pointer always steps by 24 vectors (presumably the full
                                    3 x 8 tile of MergeResults<3, 8, true>), although only the first
                                    three vectors were read here. */
1042                             "addvl %[inptr], %[inptr], #24\n"
1043                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1044                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1045                             "fadd z15.s, z15.s, z4.s\n"
1046                             "st1w z13.s, p0, [%[outptr0]]\n"
1047                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1048                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1049                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1050                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1051                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
                                 /* Move the row pointer past the three vectors just written. */
1052                             "addvl %[outptr0], %[outptr0], #3\n"
1053                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1054                           [inptr] "+r" (inptr), [p] "+r" (p)
1055                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1056                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1057                         );
1058                     }
1059                     break;
1060 
1061                 case 2:
1062                     {
                             /*
                              * Bias path, height == 2: merge two rows.
                              * For each row r in {0, 1} and each of up to three vectors of
                              * 32-bit lanes:
                              *     out[r] = max(min(in[r] + bias, maxval), minval)
                              * The same three bias vectors (z2-z4) are added to both rows.
                              * Row 0 is read from input vectors 0-2 into z13-z15 and written to
                              * outptr0; row 1 is read from input vectors 3-5 into z16-z18 and
                              * written to outptr1.
                              * w is the column count from i up to xmax and bounds the whilelt
                              * predicates p0-p2; p is the lane counter stepped by incw.
                              */
1063                         long w = xmax - i;
1064                         long p = 0;
1065                         /* Optimized routine to merge one block: add bias, clamp, store */
1066                         __asm __volatile (
                                 /* z0/z1 hold the clamp bounds in every lane. x8 = inptr + 16 vectors
                                    is set up but not referenced again in this height-2 variant. */
1067                             "mov z0.s, %s[maxval]\n"
1068                             "addvl x8, %[inptr], #16\n"
1069                             "mov z1.s, %s[minval]\n"
1070                             "whilelt p0.s, %[p], %[w]\n"
1071                             "incw %[p], all, mul #1\n"
1072                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1073                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1074                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1075                             "whilelt p1.s, %[p], %[w]\n"
1076                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1077                             "incw %[p], all, mul #1\n"
1078                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1079                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1080                             "fadd z13.s, z13.s, z2.s\n"
1081                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1082                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1083                             "whilelt p2.s, %[p], %[w]\n"
1084                             "fadd z16.s, z16.s, z2.s\n"
1085                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1086                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1087                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1088                             "fadd z14.s, z14.s, z3.s\n"
1089                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1090                             "fmin z16.s, p0/m, z16.s, z0.s\n"
1091                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1092                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1093                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1094                             "fadd z17.s, z17.s, z3.s\n"
                                 /* All six input loads are done; step the input by 24 vectors
                                    (presumably the full 3 x 8 tile of MergeResults<3, 8, true>). */
1095                             "addvl %[inptr], %[inptr], #24\n"
1096                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1097                             "st1w z13.s, p0, [%[outptr0]]\n"
1098                             "fadd z15.s, z15.s, z4.s\n"
1099                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1100                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1101                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1102                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1103                             "fadd z18.s, z18.s, z4.s\n"
1104                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1105                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1106                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1107                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1108                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1109                             "addvl %[outptr0], %[outptr0], #3\n"
1110                             "fmax z18.s, p2/m, z18.s, z1.s\n"
                                 /* Row 1 results go out last, then outptr1 steps past them. */
1111                             "st1w z16.s, p0, [%[outptr1]]\n"
1112                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1113                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1114                             "addvl %[outptr1], %[outptr1], #3\n"
1115                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1116                           [inptr] "+r" (inptr), [p] "+r" (p)
1117                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1118                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1119                         );
1120                     }
1121                     break;
1122 
1123                 case 3:
1124                     {
                             /*
                              * Bias path, height == 3: merge three rows.
                              * For each row r in {0, 1, 2} and each of up to three vectors of
                              * 32-bit lanes:
                              *     out[r] = max(min(in[r] + bias, maxval), minval)
                              * The same three bias vectors (z2-z4) are added to every row.
                              * Input vector 3*r + v holds row r, vector v:
                              *   row 0: vectors 0-2 -> z13-z15        -> outptr0
                              *   row 1: vectors 3-5 -> z16-z18        -> outptr1
                              *   row 2: vectors 6-8 -> z19, z20, z13  -> outptr2
                              * Vector 8 is addressed as [x8, #-8, MUL VL] with x8 = inptr + 16
                              * vectors (presumably because the ld1w immediate offset only reaches
                              * -8..7), and z13 is reloaded for it only after row 0's first vector
                              * has been stored.
                              * w is the column count from i up to xmax and bounds the whilelt
                              * predicates p0-p2; p is the lane counter stepped by incw.
                              */
1125                         long w = xmax - i;
1126                         long p = 0;
1127                         /* Optimized routine to merge one block: add bias, clamp, store */
1128                         __asm __volatile (
                                 /* z0/z1 hold the clamp bounds in every lane; x8 is a second base
                                    pointer 16 vectors beyond inptr. */
1129                             "mov z0.s, %s[maxval]\n"
1130                             "addvl x8, %[inptr], #16\n"
1131                             "mov z1.s, %s[minval]\n"
1132                             "whilelt p0.s, %[p], %[w]\n"
1133                             "incw %[p], all, mul #1\n"
1134                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1135                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1136                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1137                             "whilelt p1.s, %[p], %[w]\n"
1138                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1139                             "incw %[p], all, mul #1\n"
1140                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1141                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1142                             "fadd z13.s, z13.s, z2.s\n"
1143                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1144                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1145                             "whilelt p2.s, %[p], %[w]\n"
1146                             "fadd z16.s, z16.s, z2.s\n"
1147                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1148                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1149                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1150                             "fadd z14.s, z14.s, z3.s\n"
1151                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1152                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1153                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1154                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1155                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1156                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1157                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1158                             "fadd z15.s, z15.s, z4.s\n"
1159                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1160                             "fmin z16.s, p0/m, z16.s, z0.s\n"
                                 /* Row 0, vector 0 is stored, which frees z13 for row 2, vector 2
                                    (input vector 8 = x8 - 8 vectors). */
1161                             "st1w z13.s, p0, [%[outptr0]]\n"
1162                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1163                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1164                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1165                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1166                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1167                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1168                             "fadd z17.s, z17.s, z3.s\n"
                                 /* All nine input loads are done; step the input by 24 vectors
                                    (presumably the full 3 x 8 tile of MergeResults<3, 8, true>). */
1169                             "addvl %[inptr], %[inptr], #24\n"
1170                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1171                             "fadd z18.s, z18.s, z4.s\n"
1172                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1173                             "fadd z19.s, z19.s, z2.s\n"
1174                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1175                             "fadd z20.s, z20.s, z3.s\n"
1176                             "addvl %[outptr0], %[outptr0], #3\n"
1177                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1178                             "st1w z16.s, p0, [%[outptr1]]\n"
1179                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1180                             "fmin z19.s, p0/m, z19.s, z0.s\n"
1181                             "fmin z20.s, p1/m, z20.s, z0.s\n"
1182                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1183                             "fadd z13.s, z13.s, z4.s\n"
1184                             "fmax z18.s, p2/m, z18.s, z1.s\n"
1185                             "fmax z19.s, p0/m, z19.s, z1.s\n"
1186                             "fmax z20.s, p1/m, z20.s, z1.s\n"
1187                             "fmin z13.s, p2/m, z13.s, z0.s\n"
1188                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1189                             "addvl %[outptr1], %[outptr1], #3\n"
1190                             "fmax z13.s, p2/m, z13.s, z1.s\n"
                                 /* Row 2 results go out last, then outptr2 steps past them. */
1191                             "st1w z19.s, p0, [%[outptr2]]\n"
1192                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1193                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1194                             "addvl %[outptr2], %[outptr2], #3\n"
1195                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1196                           [inptr] "+r" (inptr), [p] "+r" (p)
1197                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1198                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1199                         );
1200                     }
1201                     break;
1202 
1203                 case 4:
1204                     {
                             /*
                              * Bias path, height == 4: merge four rows.
                              * For each row r in {0..3} and each of up to three vectors of
                              * 32-bit lanes:
                              *     out[r] = max(min(in[r] + bias, maxval), minval)
                              * The same three bias vectors (z2-z4) are added to every row.
                              * Input vector 3*r + v holds row r, vector v:
                              *   row 0: vectors 0-2  -> z13-z15        -> outptr0
                              *   row 1: vectors 3-5  -> z16-z18        -> outptr1
                              *   row 2: vectors 6-8  -> z19, z20, z13  -> outptr2
                              *   row 3: vectors 9-11 -> z14, z15, z16  -> outptr3
                              * Vectors 8-11 are addressed relative to x8 = inptr + 16 vectors
                              * (offsets #-8 .. #-5), presumably because the ld1w immediate offset
                              * only reaches -8..7. z13-z16 are each reloaded only after their
                              * earlier row result has been stored.
                              * w is the column count from i up to xmax and bounds the whilelt
                              * predicates p0-p2; p is the lane counter stepped by incw.
                              */
1205                         long w = xmax - i;
1206                         long p = 0;
1207                         /* Optimized routine to merge one block: add bias, clamp, store */
1208                         __asm __volatile (
                                 /* z0/z1 hold the clamp bounds in every lane; x8 is a second base
                                    pointer 16 vectors beyond inptr. */
1209                             "mov z0.s, %s[maxval]\n"
1210                             "addvl x8, %[inptr], #16\n"
1211                             "mov z1.s, %s[minval]\n"
1212                             "whilelt p0.s, %[p], %[w]\n"
1213                             "incw %[p], all, mul #1\n"
1214                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1215                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1216                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1217                             "whilelt p1.s, %[p], %[w]\n"
1218                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1219                             "incw %[p], all, mul #1\n"
1220                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1221                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1222                             "fadd z13.s, z13.s, z2.s\n"
1223                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1224                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1225                             "whilelt p2.s, %[p], %[w]\n"
1226                             "fadd z16.s, z16.s, z2.s\n"
1227                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1228                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1229                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1230                             "fadd z14.s, z14.s, z3.s\n"
1231                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1232                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1233                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1234                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1235                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1236                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1237                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1238                             "fadd z15.s, z15.s, z4.s\n"
1239                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1240                             "fmin z16.s, p0/m, z16.s, z0.s\n"
                                 /* From here on each store of a row 0/1 result is followed by a
                                    reload of the same register with row 2/3 data via x8. */
1241                             "st1w z13.s, p0, [%[outptr0]]\n"
1242                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1243                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1244                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1245                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1246                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1247                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1248                             "fadd z17.s, z17.s, z3.s\n"
1249                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1250                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1251                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1252                             "fadd z18.s, z18.s, z4.s\n"
                                 /* Every inptr-relative load is done, so the input steps by 24 vectors
                                    (presumably the full 3 x 8 tile of MergeResults<3, 8, true>). The
                                    remaining loads use x8, which was captured before this update. */
1253                             "addvl %[inptr], %[inptr], #24\n"
1254                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1255                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1256                             "fadd z19.s, z19.s, z2.s\n"
1257                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1258                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1259                             "addvl %[outptr0], %[outptr0], #3\n"
1260                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1261                             "st1w z16.s, p0, [%[outptr1]]\n"
1262                             "fmin z19.s, p0/m, z19.s, z0.s\n"
1263                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1264                             "fmax z18.s, p2/m, z18.s, z1.s\n"
1265                             "fadd z20.s, z20.s, z3.s\n"
1266                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1267                             "fadd z13.s, z13.s, z4.s\n"
1268                             "fmax z19.s, p0/m, z19.s, z1.s\n"
1269                             "fadd z14.s, z14.s, z2.s\n"
1270                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1271                             "fmin z20.s, p1/m, z20.s, z0.s\n"
1272                             "addvl %[outptr1], %[outptr1], #3\n"
1273                             "fmin z13.s, p2/m, z13.s, z0.s\n"
1274                             "st1w z19.s, p0, [%[outptr2]]\n"
1275                             "fmin z14.s, p0/m, z14.s, z0.s\n"
1276                             "fmax z20.s, p1/m, z20.s, z1.s\n"
1277                             "fadd z15.s, z15.s, z3.s\n"
1278                             "fmax z13.s, p2/m, z13.s, z1.s\n"
1279                             "fmax z14.s, p0/m, z14.s, z1.s\n"
1280                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1281                             "fadd z16.s, z16.s, z4.s\n"
1282                             "fmin z15.s, p1/m, z15.s, z0.s\n"
1283                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1284                             "addvl %[outptr2], %[outptr2], #3\n"
1285                             "fmax z15.s, p1/m, z15.s, z1.s\n"
1286                             "fmin z16.s, p2/m, z16.s, z0.s\n"
                                 /* Row 3 results go out last, then outptr3 steps past them. */
1287                             "st1w z14.s, p0, [%[outptr3]]\n"
1288                             "fmax z16.s, p2/m, z16.s, z1.s\n"
1289                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1290                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1291                             "addvl %[outptr3], %[outptr3], #3\n"
1292                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1293                           [inptr] "+r" (inptr), [p] "+r" (p)
1294                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1295                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1296                         );
1297                     }
1298                     break;
1299 
1300                 case 5:
1301                     {
                             /*
                              * Bias path, height == 5: merge five rows.
                              * For each row r in {0..4} and each of up to three vectors of
                              * 32-bit lanes:
                              *     out[r] = max(min(in[r] + bias, maxval), minval)
                              * The same three bias vectors (z2-z4) are added to every row.
                              * Input vector 3*r + v holds row r, vector v:
                              *   row 0: vectors 0-2   -> z13-z15        -> outptr0
                              *   row 1: vectors 3-5   -> z16-z18        -> outptr1
                              *   row 2: vectors 6-8   -> z19, z20, z13  -> outptr2
                              *   row 3: vectors 9-11  -> z14, z15, z16  -> outptr3
                              *   row 4: vectors 12-14 -> z17, z18, z19  -> outptr4
                              * Vectors 8-14 are addressed relative to x8 = inptr + 16 vectors
                              * (offsets #-8 .. #-2), presumably because the ld1w immediate offset
                              * only reaches -8..7. Each data register is reloaded only after its
                              * earlier row result has been stored.
                              * w is the column count from i up to xmax and bounds the whilelt
                              * predicates p0-p2; p is the lane counter stepped by incw.
                              */
1302                         long w = xmax - i;
1303                         long p = 0;
1304                         /* Optimized routine to merge one block: add bias, clamp, store */
1305                         __asm __volatile (
                                 /* z0/z1 hold the clamp bounds in every lane; x8 is a second base
                                    pointer 16 vectors beyond inptr. */
1306                             "mov z0.s, %s[maxval]\n"
1307                             "addvl x8, %[inptr], #16\n"
1308                             "mov z1.s, %s[minval]\n"
1309                             "whilelt p0.s, %[p], %[w]\n"
1310                             "incw %[p], all, mul #1\n"
1311                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1312                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1313                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1314                             "whilelt p1.s, %[p], %[w]\n"
1315                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1316                             "incw %[p], all, mul #1\n"
1317                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1318                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1319                             "fadd z13.s, z13.s, z2.s\n"
1320                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1321                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1322                             "whilelt p2.s, %[p], %[w]\n"
1323                             "fadd z16.s, z16.s, z2.s\n"
1324                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1325                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1326                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1327                             "fadd z14.s, z14.s, z3.s\n"
1328                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1329                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1330                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1331                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1332                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1333                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1334                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1335                             "fadd z15.s, z15.s, z4.s\n"
1336                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1337                             "fmin z16.s, p0/m, z16.s, z0.s\n"
                                 /* From here on each store of an earlier row's result is followed by a
                                    reload of the same register with later-row data via x8. */
1338                             "st1w z13.s, p0, [%[outptr0]]\n"
1339                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1340                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1341                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1342                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1343                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1344                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1345                             "fadd z17.s, z17.s, z3.s\n"
1346                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1347                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1348                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1349                             "fadd z18.s, z18.s, z4.s\n"
1350                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1351                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1352                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1353                             "fadd z19.s, z19.s, z2.s\n"
1354                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1355                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1356                             "addvl %[outptr0], %[outptr0], #3\n"
1357                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1358                             "st1w z16.s, p0, [%[outptr1]]\n"
1359                             "fmin z19.s, p0/m, z19.s, z0.s\n"
1360                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1361                             "fmax z18.s, p2/m, z18.s, z1.s\n"
1362                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1363                             "fadd z20.s, z20.s, z3.s\n"
1364                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1365                             "fmax z19.s, p0/m, z19.s, z1.s\n"
1366                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1367                             "fadd z13.s, z13.s, z4.s\n"
                                 /* Every inptr-relative access is done, so the input steps by 24 vectors
                                    (presumably the full 3 x 8 tile of MergeResults<3, 8, true>). The
                                    remaining loads use x8, which was captured before this update. */
1368                             "addvl %[inptr], %[inptr], #24\n"
1369                             "fmin z20.s, p1/m, z20.s, z0.s\n"
1370                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1371                             "fadd z14.s, z14.s, z2.s\n"
1372                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1373                             "fmin z13.s, p2/m, z13.s, z0.s\n"
1374                             "addvl %[outptr1], %[outptr1], #3\n"
1375                             "fmax z20.s, p1/m, z20.s, z1.s\n"
1376                             "st1w z19.s, p0, [%[outptr2]]\n"
1377                             "fmin z14.s, p0/m, z14.s, z0.s\n"
1378                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1379                             "fmax z13.s, p2/m, z13.s, z1.s\n"
1380                             "fadd z15.s, z15.s, z3.s\n"
1381                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1382                             "fadd z16.s, z16.s, z4.s\n"
1383                             "fmax z14.s, p0/m, z14.s, z1.s\n"
1384                             "fadd z17.s, z17.s, z2.s\n"
1385                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1386                             "fmin z15.s, p1/m, z15.s, z0.s\n"
1387                             "addvl %[outptr2], %[outptr2], #3\n"
1388                             "fmin z16.s, p2/m, z16.s, z0.s\n"
1389                             "st1w z14.s, p0, [%[outptr3]]\n"
1390                             "fmin z17.s, p0/m, z17.s, z0.s\n"
1391                             "fmax z15.s, p1/m, z15.s, z1.s\n"
1392                             "fadd z18.s, z18.s, z3.s\n"
1393                             "fmax z16.s, p2/m, z16.s, z1.s\n"
1394                             "fmax z17.s, p0/m, z17.s, z1.s\n"
1395                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1396                             "fadd z19.s, z19.s, z4.s\n"
1397                             "fmin z18.s, p1/m, z18.s, z0.s\n"
1398                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1399                             "addvl %[outptr3], %[outptr3], #3\n"
1400                             "fmax z18.s, p1/m, z18.s, z1.s\n"
1401                             "fmin z19.s, p2/m, z19.s, z0.s\n"
                                 /* Row 4 results go out last, then outptr4 steps past them. */
1402                             "st1w z17.s, p0, [%[outptr4]]\n"
1403                             "fmax z19.s, p2/m, z19.s, z1.s\n"
1404                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1405                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1406                             "addvl %[outptr4], %[outptr4], #3\n"
1407                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1408                           [inptr] "+r" (inptr), [p] "+r" (p)
1409                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1410                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1411                         );
1412                     }
1413                     break;
1414 
1415                 case 6:
1416                     {
1417                         long w = xmax - i;
1418                         long p = 0;
1419                         /* Optimized routine to copy an entire block */
1420                         __asm __volatile (
1421                             "mov z0.s, %s[maxval]\n"
1422                             "addvl x8, %[inptr], #16\n"
1423                             "mov z1.s, %s[minval]\n"
1424                             "whilelt p0.s, %[p], %[w]\n"
1425                             "incw %[p], all, mul #1\n"
1426                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1427                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1428                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1429                             "whilelt p1.s, %[p], %[w]\n"
1430                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1431                             "incw %[p], all, mul #1\n"
1432                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1433                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1434                             "fadd z13.s, z13.s, z2.s\n"
1435                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1436                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1437                             "whilelt p2.s, %[p], %[w]\n"
1438                             "fadd z16.s, z16.s, z2.s\n"
1439                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1440                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1441                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1442                             "fadd z14.s, z14.s, z3.s\n"
1443                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1444                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1445                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1446                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1447                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1448                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1449                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1450                             "fadd z15.s, z15.s, z4.s\n"
1451                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1452                             "fmin z16.s, p0/m, z16.s, z0.s\n"
1453                             "st1w z13.s, p0, [%[outptr0]]\n"
1454                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1455                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1456                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1457                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1458                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1459                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1460                             "fadd z17.s, z17.s, z3.s\n"
1461                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1462                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1463                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1464                             "fadd z18.s, z18.s, z4.s\n"
1465                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1466                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1467                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1468                             "fadd z19.s, z19.s, z2.s\n"
1469                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1470                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1471                             "addvl %[outptr0], %[outptr0], #3\n"
1472                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1473                             "st1w z16.s, p0, [%[outptr1]]\n"
1474                             "fmin z19.s, p0/m, z19.s, z0.s\n"
1475                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1476                             "fmax z18.s, p2/m, z18.s, z1.s\n"
1477                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1478                             "fadd z20.s, z20.s, z3.s\n"
1479                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1480                             "fmax z19.s, p0/m, z19.s, z1.s\n"
1481                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1482                             "fadd z13.s, z13.s, z4.s\n"
1483                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1484                             "fmin z20.s, p1/m, z20.s, z0.s\n"
1485                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1486                             "fadd z14.s, z14.s, z2.s\n"
1487                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1488                             "fmin z13.s, p2/m, z13.s, z0.s\n"
1489                             "addvl %[outptr1], %[outptr1], #3\n"
1490                             "fmax z20.s, p1/m, z20.s, z1.s\n"
1491                             "st1w z19.s, p0, [%[outptr2]]\n"
1492                             "fmin z14.s, p0/m, z14.s, z0.s\n"
1493                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1494                             "fmax z13.s, p2/m, z13.s, z1.s\n"
1495                             "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1496                             "fadd z15.s, z15.s, z3.s\n"
1497                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1498                             "fmax z14.s, p0/m, z14.s, z1.s\n"
1499                             "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1500                             "fadd z16.s, z16.s, z4.s\n"
1501                             "addvl %[inptr], %[inptr], #24\n"
1502                             "fmin z15.s, p1/m, z15.s, z0.s\n"
1503                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1504                             "fadd z17.s, z17.s, z2.s\n"
1505                             "ld1w z13.s, p1/z, [x8]\n"
1506                             "fmin z16.s, p2/m, z16.s, z0.s\n"
1507                             "addvl %[outptr2], %[outptr2], #3\n"
1508                             "fmax z15.s, p1/m, z15.s, z1.s\n"
1509                             "st1w z14.s, p0, [%[outptr3]]\n"
1510                             "fmin z17.s, p0/m, z17.s, z0.s\n"
1511                             "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1512                             "fmax z16.s, p2/m, z16.s, z1.s\n"
1513                             "fadd z18.s, z18.s, z3.s\n"
1514                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1515                             "fadd z19.s, z19.s, z4.s\n"
1516                             "fmax z17.s, p0/m, z17.s, z1.s\n"
1517                             "fadd z20.s, z20.s, z2.s\n"
1518                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1519                             "fmin z18.s, p1/m, z18.s, z0.s\n"
1520                             "addvl %[outptr3], %[outptr3], #3\n"
1521                             "fmin z19.s, p2/m, z19.s, z0.s\n"
1522                             "st1w z17.s, p0, [%[outptr4]]\n"
1523                             "fmin z20.s, p0/m, z20.s, z0.s\n"
1524                             "fmax z18.s, p1/m, z18.s, z1.s\n"
1525                             "fadd z13.s, z13.s, z3.s\n"
1526                             "fmax z19.s, p2/m, z19.s, z1.s\n"
1527                             "fmax z20.s, p0/m, z20.s, z1.s\n"
1528                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1529                             "fadd z14.s, z14.s, z4.s\n"
1530                             "fmin z13.s, p1/m, z13.s, z0.s\n"
1531                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1532                             "addvl %[outptr4], %[outptr4], #3\n"
1533                             "fmax z13.s, p1/m, z13.s, z1.s\n"
1534                             "fmin z14.s, p2/m, z14.s, z0.s\n"
1535                             "st1w z20.s, p0, [%[outptr5]]\n"
1536                             "fmax z14.s, p2/m, z14.s, z1.s\n"
1537                             "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1538                             "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1539                             "addvl %[outptr5], %[outptr5], #3\n"
1540                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1541                           [inptr] "+r" (inptr), [p] "+r" (p)
1542                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1543                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1544                         );
1545                     }
1546                     break;
1547 
1548                 case 7:
1549                     {
1550                         long w = xmax - i;
1551                         long p = 0;
1552                         /* Merge a 7-row block: add the bias to each input row, clamp to [minval, maxval] and store up to 3 predicated vectors per row */
1553                         __asm __volatile (
1554                             "mov z0.s, %s[maxval]\n"
1555                             "addvl x8, %[inptr], #16\n"
1556                             "mov z1.s, %s[minval]\n"
1557                             "whilelt p0.s, %[p], %[w]\n"
1558                             "incw %[p], all, mul #1\n"
1559                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1560                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1561                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1562                             "whilelt p1.s, %[p], %[w]\n"
1563                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1564                             "incw %[p], all, mul #1\n"
1565                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1566                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1567                             "fadd z13.s, z13.s, z2.s\n"
1568                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1569                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1570                             "whilelt p2.s, %[p], %[w]\n"
1571                             "fadd z16.s, z16.s, z2.s\n"
1572                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1573                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1574                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1575                             "fadd z14.s, z14.s, z3.s\n"
1576                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1577                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1578                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1579                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1580                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1581                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1582                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1583                             "fadd z15.s, z15.s, z4.s\n"
1584                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1585                             "fmin z16.s, p0/m, z16.s, z0.s\n"
1586                             "st1w z13.s, p0, [%[outptr0]]\n"
1587                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1588                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1589                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1590                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1591                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1592                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1593                             "fadd z17.s, z17.s, z3.s\n"
1594                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1595                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1596                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1597                             "fadd z18.s, z18.s, z4.s\n"
1598                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1599                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1600                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1601                             "fadd z19.s, z19.s, z2.s\n"
1602                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1603                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1604                             "addvl %[outptr0], %[outptr0], #3\n"
1605                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1606                             "st1w z16.s, p0, [%[outptr1]]\n"
1607                             "fmin z19.s, p0/m, z19.s, z0.s\n"
1608                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1609                             "fmax z18.s, p2/m, z18.s, z1.s\n"
1610                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1611                             "fadd z20.s, z20.s, z3.s\n"
1612                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1613                             "fmax z19.s, p0/m, z19.s, z1.s\n"
1614                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1615                             "fadd z13.s, z13.s, z4.s\n"
1616                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1617                             "fmin z20.s, p1/m, z20.s, z0.s\n"
1618                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1619                             "fadd z14.s, z14.s, z2.s\n"
1620                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1621                             "fmin z13.s, p2/m, z13.s, z0.s\n"
1622                             "addvl %[outptr1], %[outptr1], #3\n"
1623                             "fmax z20.s, p1/m, z20.s, z1.s\n"
1624                             "st1w z19.s, p0, [%[outptr2]]\n"
1625                             "fmin z14.s, p0/m, z14.s, z0.s\n"
1626                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1627                             "fmax z13.s, p2/m, z13.s, z1.s\n"
1628                             "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1629                             "fadd z15.s, z15.s, z3.s\n"
1630                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1631                             "fmax z14.s, p0/m, z14.s, z1.s\n"
1632                             "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1633                             "fadd z16.s, z16.s, z4.s\n"
1634                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1635                             "fmin z15.s, p1/m, z15.s, z0.s\n"
1636                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1637                             "fadd z17.s, z17.s, z2.s\n"
1638                             "ld1w z13.s, p1/z, [x8]\n"
1639                             "fmin z16.s, p2/m, z16.s, z0.s\n"
1640                             "addvl %[outptr2], %[outptr2], #3\n"
1641                             "fmax z15.s, p1/m, z15.s, z1.s\n"
1642                             "st1w z14.s, p0, [%[outptr3]]\n"
1643                             "fmin z17.s, p0/m, z17.s, z0.s\n"
1644                             "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1645                             "fmax z16.s, p2/m, z16.s, z1.s\n"
1646                             "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1647                             "fadd z18.s, z18.s, z3.s\n"
1648                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1649                             "fmax z17.s, p0/m, z17.s, z1.s\n"
1650                             "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1651                             "fadd z19.s, z19.s, z4.s\n"
1652                             "addvl %[inptr], %[inptr], #24\n"
1653                             "fmin z18.s, p1/m, z18.s, z0.s\n"
1654                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1655                             "fadd z20.s, z20.s, z2.s\n"
1656                             "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1657                             "fmin z19.s, p2/m, z19.s, z0.s\n"
1658                             "addvl %[outptr3], %[outptr3], #3\n"
1659                             "fmax z18.s, p1/m, z18.s, z1.s\n"
1660                             "st1w z17.s, p0, [%[outptr4]]\n"
1661                             "fmin z20.s, p0/m, z20.s, z0.s\n"
1662                             "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1663                             "fmax z19.s, p2/m, z19.s, z1.s\n"
1664                             "fadd z13.s, z13.s, z3.s\n"
1665                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1666                             "fadd z14.s, z14.s, z4.s\n"
1667                             "fmax z20.s, p0/m, z20.s, z1.s\n"
1668                             "fadd z15.s, z15.s, z2.s\n"
1669                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1670                             "fmin z13.s, p1/m, z13.s, z0.s\n"
1671                             "addvl %[outptr4], %[outptr4], #3\n"
1672                             "fmin z14.s, p2/m, z14.s, z0.s\n"
1673                             "st1w z20.s, p0, [%[outptr5]]\n"
1674                             "fmin z15.s, p0/m, z15.s, z0.s\n"
1675                             "fmax z13.s, p1/m, z13.s, z1.s\n"
1676                             "fadd z16.s, z16.s, z3.s\n"
1677                             "fmax z14.s, p2/m, z14.s, z1.s\n"
1678                             "fmax z15.s, p0/m, z15.s, z1.s\n"
1679                             "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1680                             "fadd z17.s, z17.s, z4.s\n"
1681                             "fmin z16.s, p1/m, z16.s, z0.s\n"
1682                             "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1683                             "addvl %[outptr5], %[outptr5], #3\n"
1684                             "fmax z16.s, p1/m, z16.s, z1.s\n"
1685                             "fmin z17.s, p2/m, z17.s, z0.s\n"
1686                             "st1w z15.s, p0, [%[outptr6]]\n"
1687                             "fmax z17.s, p2/m, z17.s, z1.s\n"
1688                             "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1689                             "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1690                             "addvl %[outptr6], %[outptr6], #3\n"
1691                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1692                           [inptr] "+r" (inptr), [p] "+r" (p)
1693                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1694                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1695                         );
1696                     }
1697                     break;
1698 
1699                 default:
1700                 case 8:
1701                     {
1702                         long w = xmax - i;
1703                         long p = 0;
1704                         /* Merge a full 8-row block: add the bias to each input row, clamp to [minval, maxval] and store up to 3 predicated vectors per row */
1705                         __asm __volatile (
1706                             "mov z0.s, %s[maxval]\n"
1707                             "addvl x8, %[inptr], #16\n"
1708                             "mov z1.s, %s[minval]\n"
1709                             "whilelt p0.s, %[p], %[w]\n"
1710                             "incw %[p], all, mul #1\n"
1711                             "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1712                             "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1713                             "ld1w z2.s, p0/z, [%[biasptr]]\n"
1714                             "whilelt p1.s, %[p], %[w]\n"
1715                             "ld1w z13.s, p0/z, [%[inptr]]\n"
1716                             "incw %[p], all, mul #1\n"
1717                             "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1718                             "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1719                             "fadd z13.s, z13.s, z2.s\n"
1720                             "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1721                             "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1722                             "whilelt p2.s, %[p], %[w]\n"
1723                             "fadd z16.s, z16.s, z2.s\n"
1724                             "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1725                             "fmin z13.s, p0/m, z13.s, z0.s\n"
1726                             "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1727                             "fadd z14.s, z14.s, z3.s\n"
1728                             "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1729                             "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1730                             "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1731                             "fmax z13.s, p0/m, z13.s, z1.s\n"
1732                             "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1733                             "fmin z14.s, p1/m, z14.s, z0.s\n"
1734                             "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1735                             "fadd z15.s, z15.s, z4.s\n"
1736                             "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1737                             "fmin z16.s, p0/m, z16.s, z0.s\n"
1738                             "st1w z13.s, p0, [%[outptr0]]\n"
1739                             "fmax z14.s, p1/m, z14.s, z1.s\n"
1740                             "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1741                             "fmin z15.s, p2/m, z15.s, z0.s\n"
1742                             "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1743                             "fmax z16.s, p0/m, z16.s, z1.s\n"
1744                             "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1745                             "fadd z17.s, z17.s, z3.s\n"
1746                             "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1747                             "fmax z15.s, p2/m, z15.s, z1.s\n"
1748                             "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1749                             "fadd z18.s, z18.s, z4.s\n"
1750                             "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1751                             "fmin z17.s, p1/m, z17.s, z0.s\n"
1752                             "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1753                             "fadd z19.s, z19.s, z2.s\n"
1754                             "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1755                             "fmin z18.s, p2/m, z18.s, z0.s\n"
1756                             "addvl %[outptr0], %[outptr0], #3\n"
1757                             "fmax z17.s, p1/m, z17.s, z1.s\n"
1758                             "st1w z16.s, p0, [%[outptr1]]\n"
1759                             "fmin z19.s, p0/m, z19.s, z0.s\n"
1760                             "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1761                             "fmax z18.s, p2/m, z18.s, z1.s\n"
1762                             "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1763                             "fadd z20.s, z20.s, z3.s\n"
1764                             "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1765                             "fmax z19.s, p0/m, z19.s, z1.s\n"
1766                             "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1767                             "fadd z13.s, z13.s, z4.s\n"
1768                             "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1769                             "fmin z20.s, p1/m, z20.s, z0.s\n"
1770                             "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1771                             "fadd z14.s, z14.s, z2.s\n"
1772                             "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1773                             "fmin z13.s, p2/m, z13.s, z0.s\n"
1774                             "addvl %[outptr1], %[outptr1], #3\n"
1775                             "fmax z20.s, p1/m, z20.s, z1.s\n"
1776                             "st1w z19.s, p0, [%[outptr2]]\n"
1777                             "fmin z14.s, p0/m, z14.s, z0.s\n"
1778                             "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1779                             "fmax z13.s, p2/m, z13.s, z1.s\n"
1780                             "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1781                             "fadd z15.s, z15.s, z3.s\n"
1782                             "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1783                             "fmax z14.s, p0/m, z14.s, z1.s\n"
1784                             "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1785                             "fadd z16.s, z16.s, z4.s\n"
1786                             "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1787                             "fmin z15.s, p1/m, z15.s, z0.s\n"
1788                             "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1789                             "fadd z17.s, z17.s, z2.s\n"
1790                             "ld1w z13.s, p1/z, [x8]\n"
1791                             "fmin z16.s, p2/m, z16.s, z0.s\n"
1792                             "addvl %[outptr2], %[outptr2], #3\n"
1793                             "fmax z15.s, p1/m, z15.s, z1.s\n"
1794                             "st1w z14.s, p0, [%[outptr3]]\n"
1795                             "fmin z17.s, p0/m, z17.s, z0.s\n"
1796                             "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1797                             "fmax z16.s, p2/m, z16.s, z1.s\n"
1798                             "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1799                             "fadd z18.s, z18.s, z3.s\n"
1800                             "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1801                             "fmax z17.s, p0/m, z17.s, z1.s\n"
1802                             "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1803                             "fadd z19.s, z19.s, z4.s\n"
1804                             "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1805                             "fmin z18.s, p1/m, z18.s, z0.s\n"
1806                             "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1807                             "fadd z20.s, z20.s, z2.s\n"
1808                             "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1809                             "fmin z19.s, p2/m, z19.s, z0.s\n"
1810                             "addvl %[outptr3], %[outptr3], #3\n"
1811                             "fmax z18.s, p1/m, z18.s, z1.s\n"
1812                             "st1w z17.s, p0, [%[outptr4]]\n"
1813                             "fmin z20.s, p0/m, z20.s, z0.s\n"
1814                             "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1815                             "fmax z19.s, p2/m, z19.s, z1.s\n"
1816                             "addvl %[inptr], %[inptr], #24\n"
1817                             "fadd z13.s, z13.s, z3.s\n"
1818                             "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1819                             "fmax z20.s, p0/m, z20.s, z1.s\n"
1820                             "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
1821                             "fadd z14.s, z14.s, z4.s\n"
1822                             "fadd z15.s, z15.s, z2.s\n"
1823                             "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1824                             "fmin z13.s, p1/m, z13.s, z0.s\n"
1825                             "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
1826                             "fadd z16.s, z16.s, z3.s\n"
1827                             "addvl %[outptr4], %[outptr4], #3\n"
1828                             "fmin z14.s, p2/m, z14.s, z0.s\n"
1829                             "st1w z20.s, p0, [%[outptr5]]\n"
1830                             "fmax z13.s, p1/m, z13.s, z1.s\n"
1831                             "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
1832                             "fmin z15.s, p0/m, z15.s, z0.s\n"
1833                             "fmin z16.s, p1/m, z16.s, z0.s\n"
1834                             "fmax z14.s, p2/m, z14.s, z1.s\n"
1835                             "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1836                             "fadd z17.s, z17.s, z4.s\n"
1837                             "fmax z15.s, p0/m, z15.s, z1.s\n"
1838                             "fmax z16.s, p1/m, z16.s, z1.s\n"
1839                             "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1840                             "fadd z18.s, z18.s, z2.s\n"
1841                             "addvl %[outptr5], %[outptr5], #3\n"
1842                             "fmin z17.s, p2/m, z17.s, z0.s\n"
1843                             "st1w z15.s, p0, [%[outptr6]]\n"
1844                             "fadd z19.s, z19.s, z3.s\n"
1845                             "fmin z18.s, p0/m, z18.s, z0.s\n"
1846                             "fadd z20.s, z20.s, z4.s\n"
1847                             "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1848                             "fmax z17.s, p2/m, z17.s, z1.s\n"
1849                             "fmin z19.s, p1/m, z19.s, z0.s\n"
1850                             "fmax z18.s, p0/m, z18.s, z1.s\n"
1851                             "fmin z20.s, p2/m, z20.s, z0.s\n"
1852                             "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1853                             "addvl %[outptr6], %[outptr6], #3\n"
1854                             "fmax z19.s, p1/m, z19.s, z1.s\n"
1855                             "fmax z20.s, p2/m, z20.s, z1.s\n"
1856                             "st1w z18.s, p0, [%[outptr7]]\n"
1857                             "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n"
1858                             "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n"
1859                             "addvl %[outptr7], %[outptr7], #3\n"
1860                         : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1861                           [inptr] "+r" (inptr), [p] "+r" (p)
1862                         : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1863                         : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1864                         );
1865                     }
1866                     break;
1867 
1868 
1869                 }
1870             }
1871         }
1872     }
1873 }
1874 
1875 #endif // ARM_COMPUTE_ENABLE_SVE
1876