// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
$assert NR % 8 == 0
$assert 8 <= NR <= 16
$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"
$assert DUP in ["DUP", "LD1R", "LD2R", "LD4R"]
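//
// Template for QS8/QC8 IGEMM micro-kernels that multiply pairs of K elements
// ("c2") with VMULL.S8, optionally chained with VMLAL.S8 when MLA is set.
// DUP/LD1R/LD2R/LD4R select how duplicated 16-bit pairs of A are materialized.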
#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gemm.h>
$if REQUANTIZATION == "FP32" and ARMV8:
  #include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>

$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if REQUANTIZATION == "FP32" and ARMV8 else "neon")
$PARAMS_UNION = "xnn_%s_conv_minmax_params" % DATATYPE.lower()
$ISA = "neonv8" if ARMV8 else "neon"
void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}c2__${ISA}_${"mlal" if MLA else "mull"}_${DUP.lower()}(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= ${MR});
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (${MR} * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

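  // The kernel reads A in pairs of int8 elements ("c2"), so round KC up to a multiple of 2.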
  kc = round_up_po2(kc, 2 * sizeof(int8_t));
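  // Set up MR output row pointers; rows beyond the valid mr are redirected to the previous row so their stores stay in bounds.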
  int8_t* c0 = c;
  $for M in range(1, MR):
    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
    $if M % 2 == 0:
      if XNN_UNPREDICTABLE(mr <= ${M}) {
        c${M} = c${M-1};
      }
    $elif M + 1 == MR:
      if XNN_UNPREDICTABLE(mr != ${M+1}) {
        c${M} = c${M-1};
      }
    $else:
      if XNN_UNPREDICTABLE(mr < ${M+1}) {
        c${M} = c${M-1};
      }

  do {
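    // Initialize the accumulators from the int32 bias values that lead the packed weights, then replicate row 0 into the remaining rows.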
    $for N in range(0, NR, 4):
      int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
    $for M in range(1, MR):
      $for N in range(0, NR, 4):
        int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};

    size_t p = ks;
    do {
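      // Fetch the next MR A row pointers from the indirection buffer; a_offset applies only to real rows, never to the shared zero buffer.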
      $for M in range(MR):
        const int8_t* restrict a${M} = a[${M}];
        if XNN_UNPREDICTABLE(a${M} != zero) {
          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
        }
      a += ${MR};

      size_t k = kc;

      $if MLA:
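        // Main loop: consume 16 bytes of K per iteration, pairing each VMULL.S8 with a VMLAL.S8 before the widening add into the int32 accumulators.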
        while (k >= 16 * sizeof(int8_t)) {
          $for M in range(MR):
            $if DUP == "LD4R":
              const int16x4x4_t va${M}x0 = vld4_dup_s16((const void*)a${M}); a${M} += 8;
              const int16x4x4_t va${M}x1 = vld4_dup_s16((const void*)a${M}); a${M} += 8;
            $elif DUP == "LD2R":
              const int16x4x2_t va${M}0x0 = vld2_dup_s16((const void*)a${M});
              const int16x4x2_t va${M}1x0 = vld2_dup_s16((const void*)(a${M} + 4)); a${M} += 8;
              const int16x4x2_t va${M}0x1 = vld2_dup_s16((const void*)a${M});
              const int16x4x2_t va${M}1x1 = vld2_dup_s16((const void*)(a${M} + 4)); a${M} += 8;
            $elif DUP == "LD1R":
              const int16x4_t va${M}0x0 = vld1_dup_s16((const void*)a${M});
              const int16x4_t va${M}1x0 = vld1_dup_s16((const void*)(a${M} + 2));
              const int16x4_t va${M}2x0 = vld1_dup_s16((const void*)(a${M} + 4));
              const int16x4_t va${M}3x0 = vld1_dup_s16((const void*)(a${M} + 6)); a${M} += 8;
              const int16x4_t va${M}0x1 = vld1_dup_s16((const void*)a${M});
              const int16x4_t va${M}1x1 = vld1_dup_s16((const void*)(a${M} + 2));
              const int16x4_t va${M}2x1 = vld1_dup_s16((const void*)(a${M} + 4));
              const int16x4_t va${M}3x1 = vld1_dup_s16((const void*)(a${M} + 6)); a${M} += 8;
            $else:
              const int8x8_t va${M}x0 = vld1_s8(a${M}); a${M} += 8;
              const int8x8_t va${M}x1 = vld1_s8(a${M}); a${M} += 8;

          $for K in range(4):
            $for N in range(0, NR, 4):
              const int8x8_t vb${ABC[N:N+4]}c${K}x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

          $for K in range(4):
            $for M in range(MR):
              $if DUP == "LD4R":
                const int8x8_t va${M}c${K}x0 = vreinterpret_s8_s16(va${M}x0.val[${K}]);
                const int8x8_t va${M}c${K}x1 = vreinterpret_s8_s16(va${M}x1.val[${K}]);
              $elif DUP == "LD2R":
                const int8x8_t va${M}c${K}x0 = vreinterpret_s8_s16(va${M}${int(K/2)}x0.val[${K%2}]);
                const int8x8_t va${M}c${K}x1 = vreinterpret_s8_s16(va${M}${int(K/2)}x1.val[${K%2}]);
              $elif DUP == "LD1R":
                const int8x8_t va${M}c${K}x0 = vreinterpret_s8_s16(va${M}${K}x0);
                const int8x8_t va${M}c${K}x1 = vreinterpret_s8_s16(va${M}${K}x1);
              $else:
                const int8x8_t va${M}c${K}x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x0), ${K}));
                const int8x8_t va${M}c${K}x1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x1), ${K}));

            $for N in range(0, NR, 4):
              $for M in range(MR):
                int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}x0, va${M}c${K}x0);
              const int8x8_t vb${ABC[N:N+4]}c${K}x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
              $for M in range(MR):
                vprod${M}x${ABC[N:N+4]}c${K} = vmlal_s8(vprod${M}x${ABC[N:N+4]}c${K}, vb${ABC[N:N+4]}c${K}x1, va${M}c${K}x1);
              $for M in range(MR):
                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});

          k -= 16 * sizeof(int8_t);
        }

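      // Process 8 bytes of K with VMULL.S8 only; after the 16-byte main loop (MLAL variants) at most one such block remains, hence "if" instead of "while".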
      ${"if" if MLA else "while"} (k >= 8 * sizeof(int8_t)) {
        $for M in range(MR):
          $if DUP == "LD4R":
            const int16x4x4_t va${M} = vld4_dup_s16((const void*)a${M}); a${M} += 8;
          $elif DUP == "LD2R":
            const int16x4x2_t va${M}0 = vld2_dup_s16((const void*)a${M});
            const int16x4x2_t va${M}1 = vld2_dup_s16((const void*)(a${M} + 4)); a${M} += 8;
          $elif DUP == "LD1R":
            const int16x4_t va${M}0 = vld1_dup_s16((const void*)a${M});
            const int16x4_t va${M}1 = vld1_dup_s16((const void*)(a${M} + 2));
            const int16x4_t va${M}2 = vld1_dup_s16((const void*)(a${M} + 4));
            const int16x4_t va${M}3 = vld1_dup_s16((const void*)(a${M} + 6)); a${M} += 8;
          $else:
            const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;

        $for K in range(4):
          $for N in range(0, NR, 4):
            const int8x8_t vb${ABC[N:N+4]}c${K} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

        $for K in range(4):
          $for M in range(MR):
            $if DUP == "LD4R":
              const int8x8_t va${M}c${K} = vreinterpret_s8_s16(va${M}.val[${K}]);
            $elif DUP == "LD2R":
              const int8x8_t va${M}c${K} = vreinterpret_s8_s16(va${M}${int(K/2)}.val[${K%2}]);
            $elif DUP == "LD1R":
              const int8x8_t va${M}c${K} = vreinterpret_s8_s16(va${M}${K});
            $else:
              const int8x8_t va${M}c${K} = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), ${K}));

          $for N in range(0, NR, 4):
            $for M in range(MR):
              const int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}, va${M}c${K});
            $for M in range(MR):
              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});

        k -= 8 * sizeof(int8_t);
      }

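      // KC remainder: kc was rounded up to a multiple of 2, so 2, 4, or 6 bytes may remain here; lanes c0..c2 of a single 8-byte A load are duplicated as needed.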
      if XNN_UNLIKELY(k != 0) {
        $for M in range(MR):
          const int8x8_t va${M} = vld1_s8(a${M}); a${M} = (const int8_t*) ((uintptr_t) a${M} + k);

        $for N in range(0, NR, 4):
          const int8x8_t vb${ABC[N:N+4]}c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

        $for M in range(MR):
          const int8x8_t va${M}c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 0));
          $for N in range(0, NR, 4):
            const int16x8_t vprod${M}x${ABC[N:N+4]}c0 = vmull_s8(vb${ABC[N:N+4]}c0, va${M}c0);
            vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c0);

        if (k > 2 * sizeof(int8_t)) {
          $for N in range(0, NR, 4):
            const int8x8_t vb${ABC[N:N+4]}c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

          $for M in range(MR):
            const int8x8_t va${M}c1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 1));
            $for N in range(0, NR, 4):
              const int16x8_t vprod${M}x${ABC[N:N+4]}c1 = vmull_s8(vb${ABC[N:N+4]}c1, va${M}c1);
              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c1);

          if (k > 4 * sizeof(int8_t)) {
            $for N in range(0, NR, 4):
              const int8x8_t vb${ABC[N:N+4]}c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

            $for M in range(MR):
              const int8x8_t va${M}c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 2));
              $for N in range(0, NR, 4):
                const int16x8_t vprod${M}x${ABC[N:N+4]}c2 = vmull_s8(vb${ABC[N:N+4]}c2, va${M}c2);
                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c2);
          }
        }
      }
      p -= ${MR} * sizeof(void*);
    } while (p != 0);

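    // Requantization: RNDNU applies a saturating pre-shift, a saturating doubling multiply-high, and a rounding post-shift; FP32 scales in float and rounds back to int32 (vcvtnq on ARMv8, the magic-bias addition otherwise).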
    $if REQUANTIZATION == "RNDNU":
      const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_pre_shift);
      const int32x4_t vmultiplier = vld1q_dup_s32(&params->${PARAMS_STRUCT}.multiplier);
      const int32x4_t vright_post_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_post_shift);

      $for M in range(MR):
        $for N in range(0, NR, 4):
          vacc${M}x${ABC[N:N+4]} = vqshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_pre_shift);

      $for M in range(MR):
        $for N in range(0, NR, 4):
          vacc${M}x${ABC[N:N+4]} = vqdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);

      $for M in range(MR):
        $for N in range(0, NR, 4):
          vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_post_shift);
    $elif REQUANTIZATION == "FP32":
      $for M in range(MR):
        $for N in range(0, NR, 4):
          float32x4_t vfpacc${M}x${ABC[N:N+4]} = vcvtq_f32_s32(vacc${M}x${ABC[N:N+4]});

      $if CHANNELWISE:
        $for N in range(0, NR, 4):
          const float32x4_t vscale${ABC[N:N+4]} = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
          $for M in range(MR):
            vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale${ABC[N:N+4]});
      $else:
        const float32x4_t vscale = vld1q_dup_f32(&params->${PARAMS_STRUCT}.scale);
        $for M in range(MR):
          $for N in range(0, NR, 4):
            vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale);

      $if ARMV8:
        $for M in range(MR):
          $for N in range(0, NR, 4):
            vacc${M}x${ABC[N:N+4]} = vcvtnq_s32_f32(vfpacc${M}x${ABC[N:N+4]});
      $else:
        const float32x4_t vmagic_bias = vld1q_dup_f32(&params->${PARAMS_STRUCT}.magic_bias);
        $for M in range(MR):
          $for N in range(0, NR, 4):
            vacc${M}x${ABC[N:N+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${M}x${ABC[N:N+4]}, vmagic_bias));

        const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->${PARAMS_STRUCT}.magic_bias_less_output_zero_point);
        $for M in range(MR):
          $for N in range(0, NR, 4):
            vacc${M}x${ABC[N:N+4]} = vqsubq_s32(vacc${M}x${ABC[N:N+4]}, vmagic_bias_less_output_zero_point);

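    // Saturating narrow int32 -> int16 -> int8, adding the output zero point unless the magic-bias path already folded it in; ARM64 uses the *_high forms to narrow in place.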
    $if REQUANTIZATION != "FP32" or ARMV8:
      const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->${PARAMS_STRUCT}.output_zero_point);
#if XNN_ARCH_ARM64
    $for M in range(MR):
      $for N in range(0, NR, 8):
        int16x8_t vacc${M}x${ABC[N:N+8]} = vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]});

    $if REQUANTIZATION != "FP32" or ARMV8:
      $for M in range(MR):
        $for N in range(0, NR, 8):
          vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vacc${M}x${ABC[N:N+8]}, voutput_zero_point);

    $for M in range(MR):
      $for N in range(0, NR, 16):
        $if N + 8 < NR:
          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
        $elif M % 2 == 1:
          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
        $elif M + 1 == MR:
          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
#else
    $for M in range(MR):
      $for N in range(0, NR, 8):
        int16x8_t vacc${M}x${ABC[N:N+8]} = vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]}));

    $if REQUANTIZATION != "FP32" or ARMV8:
      $for M in range(MR):
        $for N in range(0, NR, 8):
          vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vacc${M}x${ABC[N:N+8]}, voutput_zero_point);

    $for M in range(MR):
      $for N in range(0, NR, 16):
        $if N + 8 < NR:
          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
        $elif M % 2 == 1:
          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
        $elif M + 1 == MR:
          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
#endif

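    // Clamp the narrowed results to the requested [output_min, output_max] range.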
    $if NR == 8 and MR == 1:
      const int8x8_t voutput_min = vld1_dup_s8(&params->${PARAMS_STRUCT}.output_min);
    $else:
      const int8x16_t voutput_min = vld1q_dup_s8(&params->${PARAMS_STRUCT}.output_min);
    $for M in range(MR):
      $for N in range(0, NR, 16):
        $if N + 8 < NR:
          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
        $elif M % 2 == 1:
          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
        $elif M + 1 == MR:
          $if NR == 8 and MR == 1:
            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
          $else:
            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));

    $if NR == 8 and MR == 1:
      const int8x8_t voutput_max = vld1_dup_s8(&params->${PARAMS_STRUCT}.output_max);
    $else:
      const int8x16_t voutput_max = vld1q_dup_s8(&params->${PARAMS_STRUCT}.output_max);
    $for M in range(MR):
      $for N in range(0, NR, 16):
        $if N + 8 < NR:
          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
        $elif M % 2 == 1:
          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
        $elif M + 1 == MR:
          $if NR == 8 and MR == 1:
            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
          $else:
            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));

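    // Full-tile store: write NR columns per row (rows in reverse order), advance the C pointers by cn_stride, and rewind the indirection pointer by ks for the next tile. Otherwise fall through to the nc remainder path, which stores 8, 4, 2, and 1 columns as needed.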
    if (nc >= ${NR}) {
      $for M in reversed(range(MR)):
        $for N in range(0, NR, 16):
          $if N + 8 < NR:
            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
          $elif M % 2 == 1:
            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
          $elif M + 1 == MR:
            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});

      $for M in reversed(range(MR)):
        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);

      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= ${NR};
    } else {
      $if NR == 16:
        $for M in reversed(range(MR)):
          $if M % 2 == 1:
            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
          $elif M + 1 == MR:
            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
        if (nc & 8) {
          $for M in reversed(range(MR)):
            $if M % 2 == 1:
              vst1_s8(c${M}, vget_high_s8(vout${M-1}x01234567_${M}x01234567)); c${M} += 8;
              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x01234567_${M}x01234567)); c${M-1} += 8;
            $elif M + 1 == MR:
              vst1_s8(c${M}, vout${M}x01234567); c${M} += 8;
          $for M in reversed(range(MR)):
            $if M % 2 == 1:
              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
            $elif M + 1 == MR:
              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
        }
      if (nc & 4) {
        $for M in reversed(range(MR)):
          $if M % 2 == 1:
            vst1q_lane_u32((void*) c${M}, vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
            vst1q_lane_u32((void*) c${M-1}, vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
          $elif M + 1 == MR:
            vst1_lane_u32((void*) c${M}, vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
        $for M in reversed(range(MR)):
          $if M % 2 == 1:
            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
          $elif M + 1 == MR:
            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
      }
      if (nc & 2) {
        $for M in reversed(range(MR)):
          $if M % 2 == 1:
            vst1q_lane_u16((void*) c${M}, vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
            vst1q_lane_u16((void*) c${M-1}, vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
          $elif M + 1 == MR:
            vst1_lane_u16((void*) c${M}, vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
        $for M in reversed(range(MR)):
          $if M % 2 == 1:
            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
          $elif M + 1 == MR:
            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
      }
      if (nc & 1) {
        $for M in reversed(range(MR)):
          $if M % 2 == 1:
            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
          $elif M + 1 == MR:
            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}