xref: /aosp_15_r20/external/XNNPACK/src/x32-transposec/wasmsimd.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5$from itertools import chain
6$import math
// Template parameters: IN_PTRS and OUT_PTRS select how input/output rows are
// addressed, and SIZE is the element width in bits. The values derived below:
//   TILE_SIZE - number of SIZE-bit elements in one 128-bit vector; the kernel
//               works on TILE_SIZE x TILE_SIZE tiles.
//   NUM_ITERS - log2(TILE_SIZE), the number of shuffle stages per tile.
//   LO_PERM   - shuffle lane list pairing lane i of the first operand with
//               lane i of the second operand for the lower half of the lanes.
//   HI_PERM   - the same pairing for the upper half of the lanes.
7$assert IN_PTRS in ["MULTI", "REUSE"]
8$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
9$assert SIZE in [8, 16, 32]
10$TILE_SIZE = int(128/SIZE)
11$NUM_ITERS = int(math.log2(TILE_SIZE))
12$LO_PERM=str(list(chain.from_iterable((i, i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1]
13$HI_PERM=str(list(chain.from_iterable(((TILE_SIZE>>1)+i, (TILE_SIZE>>1)+i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1]
14
15#include <wasm_simd128.h>
16
17#include <assert.h>
18
19#include <xnnpack/common.h>
20#include <xnnpack/math.h>
21#include <xnnpack/transpose.h>
22
// Transpose micro-kernel for WAsm SIMD, processing the block in square tiles
// of one 128-bit vector per row.
//
// Each tile is loaded as one vector per input row, passed through a sequence
// of interleaving shuffles, and stored as one vector per output row.
//
//   input         - first element of the block to read.
//   output        - first element of the block to write.
//   input_stride  - distance in bytes between consecutive input rows; must be
//                   at least block_width elements (asserted below).
//   output_stride - distance in bytes between consecutive output rows; must be
//                   at least block_height elements (asserted below).
//   block_width   - number of elements per input row (output row count).
//   block_height  - number of input rows (elements per output row).
//
// NOTE(review): the XNN_OOB_READS annotation and the full-vector loads in the
// remainder path suggest the kernel may read past the end of the input rows;
// confirm the exact contract against the macro's definition.
23void xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_wasmsimd(
24    const uint${SIZE}_t* input,
25    uint${SIZE}_t* output,
26    size_t input_stride,
27    size_t output_stride,
28    size_t block_width,
29    size_t block_height) XNN_OOB_READS
30{
31  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
32  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
33
  // Tile geometry, plus the byte adjustments applied to the pointers at the end
  // of every column strip. input_reset undoes the row advance made by the
  // full-tile loop and steps one tile width to the right; output_reset is added
  // to the output pointer(s) to move on to the next group of output rows.
34  const size_t tile_height = ${TILE_SIZE};
35  const size_t tile_width = ${TILE_SIZE};
36  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
37  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
38  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
39  $if IN_PTRS == "MULTI":
40    const size_t input_offset = tile_height * input_stride;
41  $if OUT_PTRS in ["MOV", "DEC"]:
42    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
43  $else:
44    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
45
46  $if IN_PTRS == "MULTI":
47    const uint${SIZE}_t* i0 = input;
48    $for N in range(1, TILE_SIZE):
49      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
50  $else:
51    const uint${SIZE}_t* i0 = input;
52  $if OUT_PTRS == "MULTI":
53    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
54    $for N in range(1, TILE_SIZE):
55      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
56  $elif OUT_PTRS == "SWITCH":
57    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
58  $else:
    // Start one tile row-width before the output: the full-tile loop adds
    // oN_offset (which includes tile_hbytes) before its first store.
59    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
60  $if OUT_PTRS != "MULTI":
61    const size_t minus_output_stride = -output_stride;
62
63  do {
    // One iteration handles a strip of up to tile_width input columns, i.e. up
    // to tile_width output rows. Output rows beyond the remaining block_width
    // are either aliased to row 0 (multi-pointer variant) or avoided by
    // starting from row rem, the last valid row (single-pointer variants).
64    $if OUT_PTRS == "MULTI":
65      if XNN_UNPREDICTABLE(block_width < 2) {
66        o1 = o0;
67      }
68      $for N in range(2, TILE_SIZE, 2):
69        if XNN_UNPREDICTABLE(block_width <= ${N}) {
70          o${N} = o0;
71        }
72        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
73          o${N+1} = o0;
74        }
75    $elif OUT_PTRS in ["MOV", "DEC"]:
76      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
77      const size_t oN_stride = rem * output_stride;
78      const size_t oN_offset = oN_stride + tile_hbytes;
79    $else:
80      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
81      const size_t oN_stride = rem * output_stride;
    // Full tiles: load one vector per input row, run the shuffle stages, then
    // store one vector per output row from the highest row down to row 0.
82    size_t bh = block_height;
83    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
84      $for N in range(TILE_SIZE):
85        $if IN_PTRS == "REUSE":
86          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i0);
87          i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
88        $else:
89          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N});
90          i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
91
92      $for M in range(NUM_ITERS):
93        $for N in range(TILE_SIZE >> 1):
94          const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM});
95          const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM});
96
97      $if OUT_PTRS == "SWITCH":
98        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
99        switch (rem) {
100          $for N in reversed(range(2, TILE_SIZE)):
101            case ${N}:
102              wasm_v128_store(oN, v0_${N});
103              oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
104          case 1:
105            wasm_v128_store(oN, v0_1);
106          case 0:
107            wasm_v128_store(o, v0_0);
108            o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
109            break;
110          default:
111            XNN_UNREACHABLE;
112        }
113      $elif OUT_PTRS in ["MOV", "DEC"]:
114        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
115        wasm_v128_store(o, v0_${TILE_SIZE-1});
116        $if OUT_PTRS == "MOV":
117          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
118        $for N in reversed(range(2, TILE_SIZE-1, 2)):
119          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
120            $if OUT_PTRS == "MOV":
121              o = oN;
122            $else:
123              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
124          }
125          wasm_v128_store(o, v0_${N});
126          $if OUT_PTRS == "MOV":
127            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
128          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
129            $if OUT_PTRS == "MOV":
130              o = oN;
131            $else:
132              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
133          }
134          wasm_v128_store(o, v0_${N-1});
135          $if OUT_PTRS == "MOV":
136            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
137        if XNN_UNPREDICTABLE(block_width > 1) {
138          $if OUT_PTRS == "MOV":
139            o = oN;
140          $else:
141            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
142        }
143        wasm_v128_store(o, v0_0);
144      $else:
145        $for N in reversed(range(TILE_SIZE)):
146          wasm_v128_store(o${N}, v0_${N});
147          o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
148    }
149    $if OUT_PTRS in ["MOV", "DEC"]:
150      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
151
    // Remainder: fewer than a full tile of input rows is left. Row pointers at
    // or beyond bh are redirected to a row that is still inside the block, so
    // every load stays within valid rows; the surplus lanes are never stored.
152    if (bh != 0) {
153      $if IN_PTRS == "REUSE":
154        const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0);
155        $for N in range(1, TILE_SIZE - 1, 2):
156          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
157          if XNN_UNPREDICTABLE(bh < ${N+1}) {
158            i${N} = i${N-1};
159          }
160          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N});
161          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
162          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
163            i${N+1} = i${N};
164          }
165          const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1});
166      $else:
167        const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0);
168        $for N in range(1, TILE_SIZE - 1, 2):
169          if XNN_UNPREDICTABLE(bh < ${N+1}) {
170            i${N} = i0;
171          }
172          const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N});
173          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
174            i${N+1} = i0;
175          }
176          const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1});
      // The last row of a partial tile is never valid (bh is below the tile
      // height here), so it is filled with zeros instead of being loaded.
177      const v128_t v${NUM_ITERS}_${TILE_SIZE-1} = wasm_v128_xor(v${NUM_ITERS}_0, v${NUM_ITERS}_0);
178
179      $for M in range(NUM_ITERS-1):
180        $for N in range(TILE_SIZE >> 1):
181          const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM});
182          const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM});
183
184      $for N in range(TILE_SIZE >> 1):
185        v128_t v0_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${LO_PERM});
186        v128_t v0_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${HI_PERM});
187
      // The final-stage vectors are non-const because the partial stores below
      // consume them piecewise: each set bit of bh stores the lowest 64, 32, 16
      // or 8 bits of every vector and then moves the next bits into lane 0.
188      if (bh & ${TILE_SIZE>>1}) {
189        $if OUT_PTRS == "SWITCH":
190          uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
191          switch (rem) {
192            $for N in reversed(range(2, TILE_SIZE)):
193              case ${N}:
194                *((double*) oN) = wasm_f64x2_extract_lane(v0_${N}, 0);
195                oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
196            case 1:
197              *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0);
198            case 0:
199              $if NUM_ITERS > 1:
200                *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
201                o += ${TILE_SIZE>>1};
202              $else:
203                *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
204              break;
205            default:
206              XNN_UNREACHABLE;
207          }
208        $elif OUT_PTRS in ["MOV", "DEC"]:
209          o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
210          *((double*) o) = wasm_f64x2_extract_lane(v0_${TILE_SIZE-1}, 0);
211          $if OUT_PTRS == "MOV":
212            uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
213          $for N in reversed(range(2, TILE_SIZE, 2)):
214            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
215              $if OUT_PTRS == "MOV":
216                o = oN;
217              $else:
218                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
219            }
220            *((double*) o) = wasm_f64x2_extract_lane(v0_${N}, 0);
221            $if OUT_PTRS == "MOV":
222              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
223            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
224              $if OUT_PTRS == "MOV":
225                o = oN;
226              $else:
227                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
228            }
229            *((double*) o) = wasm_f64x2_extract_lane(v0_${N-1}, 0);
230            $if OUT_PTRS == "MOV":
231              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
232          if XNN_UNPREDICTABLE(block_width > 1) {
233            $if OUT_PTRS == "MOV":
234              o = oN;
235            $else:
236              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
237          }
238          $if NUM_ITERS > 1:
239            *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
240            o += ${TILE_SIZE>>1};
241          $else:
242            *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
243        $else:
244          $for N in reversed(range(TILE_SIZE)):
245            $if NUM_ITERS>1:
246              *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0);
247              o${N} += ${TILE_SIZE>>1};
248            $else:
249              *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0);
250        $if NUM_ITERS > 1:
251          $for N in range(TILE_SIZE):
252            v0_${N} = wasm_v64x2_shuffle(v0_${N}, v0_${N}, 1, 1);
253      }
254
255      $if NUM_ITERS>1:
256        if (bh & ${TILE_SIZE>>2}) {
257          $if OUT_PTRS == "SWITCH":
258            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
259            switch (rem) {
260              $for N in reversed(range(2, TILE_SIZE)):
261                case ${N}:
262                  *((float*) oN) = wasm_f32x4_extract_lane(v0_${N}, 0);
263                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
264              case 1:
265                *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0);
266              case 0:
267                *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
268                $if SIZE < 32:
269                  o += ${TILE_SIZE>>2};
270                break;
271              default:
272                XNN_UNREACHABLE;
273            }
274          $elif OUT_PTRS in ["MOV", "DEC"]:
275            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
276            *((float*) o) = wasm_f32x4_extract_lane(v0_${TILE_SIZE-1}, 0);
277            $if OUT_PTRS == "MOV":
278              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
279            $for N in reversed(range(2, TILE_SIZE, 2)):
280              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
281                $if OUT_PTRS == "MOV":
282                  o = oN;
283                $else:
284                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
285              }
286              *((float*) o) = wasm_f32x4_extract_lane(v0_${N}, 0);
287              $if OUT_PTRS == "MOV":
288                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
289              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
290                $if OUT_PTRS == "MOV":
291                  o = oN;
292                $else:
293                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
294              }
295              *((float*) o) = wasm_f32x4_extract_lane(v0_${N-1}, 0);
296              $if OUT_PTRS == "MOV":
297                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
298            if XNN_UNPREDICTABLE(block_width > 1) {
299              $if OUT_PTRS == "MOV":
300                o = oN;
301              $else:
302                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
303            }
304            *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
305            $if SIZE < 32:
306              o += ${TILE_SIZE>>2};
307          $else:
308            $for N in reversed(range(TILE_SIZE)):
309              *((float*) o${N}) = wasm_f32x4_extract_lane(v0_${N}, 0);
310              $if SIZE < 32:
311                o${N} += ${TILE_SIZE>>2};
312          $if NUM_ITERS > 2:
313            $for N in range(TILE_SIZE):
314              v0_${N} = wasm_u64x2_shr(v0_${N}, 32);
315        }
316      $if NUM_ITERS>2:
317        if (bh & ${TILE_SIZE>>3}) {
318          $if OUT_PTRS == "SWITCH":
319            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
320            switch (rem) {
321              $for N in reversed(range(2, TILE_SIZE)):
322                case ${N}:
323                  $if SIZE == 16:
324                    *oN = wasm_i16x8_extract_lane(v0_${N}, 0);
325                    oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
326                  $else:
327                    *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_${N}, 0);
328                    oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
329              case 1:
330                $if SIZE == 16:
331                  *oN = wasm_i16x8_extract_lane(v0_1, 0);
332                $else:
333                  *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_1, 0);
334              case 0:
335                $if SIZE == 16:
336                  *o = wasm_i16x8_extract_lane(v0_0, 0);
337                $else:
338                  *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
339                  o += ${TILE_SIZE>>3};
340                break;
341              default:
342                XNN_UNREACHABLE;
343            }
344          $elif OUT_PTRS in ["MOV", "DEC"]:
345            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
346            $if SIZE == 16:
347              *o = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0);
348            $else:
349              *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0);
350            $if OUT_PTRS == "MOV":
351              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
352            $for N in reversed(range(2, TILE_SIZE, 2)):
353              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
354                $if OUT_PTRS == "MOV":
355                  o = oN;
356                $else:
357                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
358              }
359              $if SIZE == 16:
360                *o = wasm_i16x8_extract_lane(v0_${N}, 0);
361              $else:
362                *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N}, 0);
363              $if OUT_PTRS == "MOV":
364                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
365              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
366                $if OUT_PTRS == "MOV":
367                  o = oN;
368                $else:
369                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
370              }
371              $if SIZE == 16:
372                *o = wasm_i16x8_extract_lane(v0_${N-1}, 0);
373              $else:
374                *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N-1}, 0);
375              $if OUT_PTRS == "MOV":
376                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
377            if XNN_UNPREDICTABLE(block_width > 1) {
378              $if OUT_PTRS == "MOV":
379                o = oN;
380              $else:
381                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
382            }
383            $if SIZE == 16:
384              *o = wasm_i16x8_extract_lane(v0_0, 0);
385            $else:
386              *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
387              o += ${TILE_SIZE>>3};
388          $else:
389            $for N in reversed(range(TILE_SIZE)):
390              $if SIZE == 16:
391                *o${N} = wasm_i16x8_extract_lane(v0_${N}, 0);
392              $else:
393                *((uint16_t*) o${N}) = wasm_i16x8_extract_lane(v0_${N}, 0);
394                o${N} += ${TILE_SIZE>>3};
395          $if NUM_ITERS>3:
396            $for N in range(TILE_SIZE):
397              v0_${N} = wasm_u32x4_shr(v0_${N}, 16);
398        }
399      $if SIZE == 8:
400        if (bh & 1) {
401          $if OUT_PTRS == "SWITCH":
402            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
403            switch (rem) {
404              $for N in reversed(range(2, TILE_SIZE)):
405                case ${N}:
406                  *oN = wasm_i8x16_extract_lane(v0_${N}, 0);
407                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
408              case 1:
409                *oN = wasm_i8x16_extract_lane(v0_1, 0);
410              case 0:
411                *o = wasm_i8x16_extract_lane(v0_0, 0);
412                break;
413              default:
414                XNN_UNREACHABLE;
415            }
416          $elif OUT_PTRS in ["MOV", "DEC"]:
417            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
418            *o = wasm_i8x16_extract_lane(v0_${TILE_SIZE-1}, 0);
419            $if OUT_PTRS == "MOV":
420              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
421            $for N in reversed(range(2, TILE_SIZE, 2)):
422              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
423                $if OUT_PTRS == "MOV":
424                  o = oN;
425                $else:
426                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
427              }
428              *o = wasm_i8x16_extract_lane(v0_${N}, 0);
429              $if OUT_PTRS == "MOV":
430                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
431              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
432                $if OUT_PTRS == "MOV":
433                  o = oN;
434                $else:
435                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
436              }
437              *o = wasm_i8x16_extract_lane(v0_${N-1}, 0);
438              $if OUT_PTRS == "MOV":
439                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
440            if XNN_UNPREDICTABLE(block_width > 1) {
441              $if OUT_PTRS == "MOV":
442                o = oN;
443              $else:
444                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
445            }
446            *o = wasm_i8x16_extract_lane(v0_0, 0);
447          $else:
448            $for N in reversed(range(TILE_SIZE)):
449              *o${N} = wasm_i8x16_extract_lane(v0_${N}, 0);
450        }
451    }
452
    // Rewind to the top of the block and move on to the next strip of columns.
    // NOTE(review): doz presumably subtracts and clamps at zero, which is what
    // lets the loop terminate on a partial final strip; confirm in math.h.
453    $if IN_PTRS == "MULTI":
454      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
455      $for N in range(1, TILE_SIZE):
456        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
457    $else:
458      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
459    $if OUT_PTRS == "MULTI":
460      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
461      $for N in range(1, TILE_SIZE):
462        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
463    $else:
464      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
465    block_width = doz(block_width, tile_width);
466  } while (block_width != 0);
467}
468