1// Copyright 2021 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5$from itertools import chain 6$import math 7$assert IN_PTRS in ["MULTI", "REUSE"] 8$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"] 9$assert SIZE in [8, 16, 32] 10$TILE_SIZE = int(128/SIZE) 11$NUM_ITERS = int(math.log2(TILE_SIZE)) 12$LO_PERM=str(list(chain.from_iterable((i, i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1] 13$HI_PERM=str(list(chain.from_iterable(((TILE_SIZE>>1)+i, (TILE_SIZE>>1)+i+TILE_SIZE) for i in range((TILE_SIZE>>1)))))[1:-1] 14 15#include <wasm_simd128.h> 16 17#include <assert.h> 18 19#include <xnnpack/common.h> 20#include <xnnpack/math.h> 21#include <xnnpack/transpose.h> 22 23void xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_wasmsimd( 24 const uint${SIZE}_t* input, 25 uint${SIZE}_t* output, 26 size_t input_stride, 27 size_t output_stride, 28 size_t block_width, 29 size_t block_height) XNN_OOB_READS 30{ 31 assert(output_stride >= block_height * sizeof(uint${SIZE}_t)); 32 assert(input_stride >= block_width * sizeof(uint${SIZE}_t)); 33 34 const size_t tile_height = ${TILE_SIZE}; 35 const size_t tile_width = ${TILE_SIZE}; 36 const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t); 37 const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t); 38 const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; 39 $if IN_PTRS == "MULTI": 40 const size_t input_offset = tile_height * input_stride; 41 $if OUT_PTRS in ["MOV", "DEC"]: 42 const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes; 43 $else: 44 const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t); 45 46 $if IN_PTRS == "MULTI": 47 const uint${SIZE}_t* i0 = input; 48 $for N in range(1, TILE_SIZE): 49 const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 50 $else: 51 const uint${SIZE}_t* i0 = input; 52 $if OUT_PTRS == "MULTI": 53 uint${SIZE}_t* o0 = (uint${SIZE}_t*) output; 54 $for N in range(1, TILE_SIZE): 55 uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride); 56 $elif OUT_PTRS == "SWITCH": 57 uint${SIZE}_t* o = (uint${SIZE}_t*) output; 58 $else: 59 uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes); 60 $if OUT_PTRS != "MULTI": 61 const size_t minus_output_stride = -output_stride; 62 63 do { 64 $if OUT_PTRS == "MULTI": 65 if XNN_UNPREDICTABLE(block_width < 2) { 66 o1 = o0; 67 } 68 $for N in range(2, TILE_SIZE, 2): 69 if XNN_UNPREDICTABLE(block_width <= ${N}) { 70 o${N} = o0; 71 } 72 if XNN_UNPREDICTABLE(block_width < ${N+2}) { 73 o${N+1} = o0; 74 } 75 $elif OUT_PTRS in ["MOV", "DEC"]: 76 const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 77 const size_t oN_stride = rem * output_stride; 78 const size_t oN_offset = oN_stride + tile_hbytes; 79 $else: 80 const size_t rem = min(block_width - 1, ${TILE_SIZE-1}); 81 const size_t oN_stride = rem * output_stride; 82 size_t bh = block_height; 83 for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) { 84 $for N in range(TILE_SIZE): 85 $if IN_PTRS == "REUSE": 86 const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i0); 87 i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride); 88 $else: 89 const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N}); 90 i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset); 91 92 $for M in range(NUM_ITERS): 93 $for N in range(TILE_SIZE >> 1): 94 const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM}); 95 const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM}); 96 97 $if OUT_PTRS == "SWITCH": 98 uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 99 switch (rem) { 100 $for N in reversed(range(2, TILE_SIZE)): 101 case ${N}: 102 wasm_v128_store(oN, v0_${N}); 103 oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 104 case 1: 105 wasm_v128_store(oN, v0_1); 106 case 0: 107 wasm_v128_store(o, v0_0); 108 o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 109 break; 110 default: 111 XNN_UNREACHABLE; 112 } 113 $elif OUT_PTRS in ["MOV", "DEC"]: 114 o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset); 115 wasm_v128_store(o, v0_${TILE_SIZE-1}); 116 $if OUT_PTRS == "MOV": 117 uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 118 $for N in reversed(range(2, TILE_SIZE-1, 2)): 119 if XNN_UNPREDICTABLE(block_width > ${N+1}) { 120 $if OUT_PTRS == "MOV": 121 o = oN; 122 $else: 123 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 124 } 125 wasm_v128_store(o, v0_${N}); 126 $if OUT_PTRS == "MOV": 127 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 128 if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 129 $if OUT_PTRS == "MOV": 130 o = oN; 131 $else: 132 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 133 } 134 wasm_v128_store(o, v0_${N-1}); 135 $if OUT_PTRS == "MOV": 136 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 137 if XNN_UNPREDICTABLE(block_width > 1) { 138 $if OUT_PTRS == "MOV": 139 o = oN; 140 $else: 141 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 142 } 143 wasm_v128_store(o, v0_0); 144 $else: 145 $for N in reversed(range(TILE_SIZE)): 146 wasm_v128_store(o${N}, v0_${N}); 147 o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes); 148 } 149 $if OUT_PTRS in ["MOV", "DEC"]: 150 o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); 151 152 if (bh != 0) { 153 $if IN_PTRS == "REUSE": 154 const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0); 155 $for N in range(1, TILE_SIZE - 1, 2): 156 const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 157 if XNN_UNPREDICTABLE(bh < ${N+1}) { 158 i${N} = i${N-1}; 159 } 160 const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N}); 161 const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride); 162 if XNN_UNPREDICTABLE(bh <= ${N+1}) { 163 i${N+1} = i${N}; 164 } 165 const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1}); 166 $else: 167 const v128_t v${NUM_ITERS}_0 = wasm_v128_load(i0); 168 $for N in range(1, TILE_SIZE - 1, 2): 169 if XNN_UNPREDICTABLE(bh < ${N+1}) { 170 i${N} = i0; 171 } 172 const v128_t v${NUM_ITERS}_${N} = wasm_v128_load(i${N}); 173 if XNN_UNPREDICTABLE(bh <= ${N+1}) { 174 i${N+1} = i0; 175 } 176 const v128_t v${NUM_ITERS}_${N+1} = wasm_v128_load(i${N+1}); 177 const v128_t v${NUM_ITERS}_${TILE_SIZE-1} = wasm_v128_xor(v${NUM_ITERS}_0, v${NUM_ITERS}_0); 178 179 $for M in range(NUM_ITERS-1): 180 $for N in range(TILE_SIZE >> 1): 181 const v128_t v${NUM_ITERS-M-1}_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${LO_PERM}); 182 const v128_t v${NUM_ITERS-M-1}_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v${NUM_ITERS-M}_${N}, v${NUM_ITERS-M}_${N+int(TILE_SIZE/2)}, ${HI_PERM}); 183 184 $for N in range(TILE_SIZE >> 1): 185 v128_t v0_${2*N} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${LO_PERM}); 186 v128_t v0_${2*N+1} = wasm_v${SIZE}x${TILE_SIZE}_shuffle(v1_${N}, v1_${N+int(TILE_SIZE/2)}, ${HI_PERM}); 187 188 if (bh & ${TILE_SIZE>>1}) { 189 $if OUT_PTRS == "SWITCH": 190 uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 191 switch (rem) { 192 $for N in reversed(range(2, TILE_SIZE)): 193 case ${N}: 194 *((double*) oN) = wasm_f64x2_extract_lane(v0_${N}, 0); 195 oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 196 case 1: 197 *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0); 198 case 0: 199 $if NUM_ITERS > 1: 200 *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 201 o += ${TILE_SIZE>>1}; 202 $else: 203 *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 204 break; 205 default: 206 XNN_UNREACHABLE; 207 } 208 $elif OUT_PTRS in ["MOV", "DEC"]: 209 o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 210 *((double*) o) = wasm_f64x2_extract_lane(v0_${TILE_SIZE-1}, 0); 211 $if OUT_PTRS == "MOV": 212 uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 213 $for N in reversed(range(2, TILE_SIZE, 2)): 214 if XNN_UNPREDICTABLE(block_width > ${N+1}) { 215 $if OUT_PTRS == "MOV": 216 o = oN; 217 $else: 218 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 219 } 220 *((double*) o) = wasm_f64x2_extract_lane(v0_${N}, 0); 221 $if OUT_PTRS == "MOV": 222 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 223 if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 224 $if OUT_PTRS == "MOV": 225 o = oN; 226 $else: 227 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 228 } 229 *((double*) o) = wasm_f64x2_extract_lane(v0_${N-1}, 0); 230 $if OUT_PTRS == "MOV": 231 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 232 if XNN_UNPREDICTABLE(block_width > 1) { 233 $if OUT_PTRS == "MOV": 234 o = oN; 235 $else: 236 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 237 } 238 $if NUM_ITERS > 1: 239 *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 240 o += ${TILE_SIZE>>1}; 241 $else: 242 *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0); 243 $else: 244 $for N in reversed(range(TILE_SIZE)): 245 $if NUM_ITERS>1: 246 *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0); 247 o${N} += ${TILE_SIZE>>1}; 248 $else: 249 *((double*) o${N}) = wasm_f64x2_extract_lane(v0_${N}, 0); 250 $if NUM_ITERS > 1: 251 $for N in range(TILE_SIZE): 252 v0_${N} = wasm_v64x2_shuffle(v0_${N}, v0_${N}, 1, 1); 253 } 254 255 $if NUM_ITERS>1: 256 if (bh & ${TILE_SIZE>>2}) { 257 $if OUT_PTRS == "SWITCH": 258 uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 259 switch (rem) { 260 $for N in reversed(range(2, TILE_SIZE)): 261 case ${N}: 262 *((float*) oN) = wasm_f32x4_extract_lane(v0_${N}, 0); 263 oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 264 case 1: 265 *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0); 266 case 0: 267 *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0); 268 $if SIZE < 32: 269 o += ${TILE_SIZE>>2}; 270 break; 271 default: 272 XNN_UNREACHABLE; 273 } 274 $elif OUT_PTRS in ["MOV", "DEC"]: 275 o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 276 *((float*) o) = wasm_f32x4_extract_lane(v0_${TILE_SIZE-1}, 0); 277 $if OUT_PTRS == "MOV": 278 uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 279 $for N in reversed(range(2, TILE_SIZE, 2)): 280 if XNN_UNPREDICTABLE(block_width > ${N+1}) { 281 $if OUT_PTRS == "MOV": 282 o = oN; 283 $else: 284 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 285 } 286 *((float*) o) = wasm_f32x4_extract_lane(v0_${N}, 0); 287 $if OUT_PTRS == "MOV": 288 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 289 if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 290 $if OUT_PTRS == "MOV": 291 o = oN; 292 $else: 293 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 294 } 295 *((float*) o) = wasm_f32x4_extract_lane(v0_${N-1}, 0); 296 $if OUT_PTRS == "MOV": 297 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 298 if XNN_UNPREDICTABLE(block_width > 1) { 299 $if OUT_PTRS == "MOV": 300 o = oN; 301 $else: 302 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 303 } 304 *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0); 305 $if SIZE < 32: 306 o += ${TILE_SIZE>>2}; 307 $else: 308 $for N in reversed(range(TILE_SIZE)): 309 *((float*) o${N}) = wasm_f32x4_extract_lane(v0_${N}, 0); 310 $if SIZE < 32: 311 o${N} += ${TILE_SIZE>>2}; 312 $if NUM_ITERS > 2: 313 $for N in range(TILE_SIZE): 314 v0_${N} = wasm_u64x2_shr(v0_${N}, 32); 315 } 316 $if NUM_ITERS>2: 317 if (bh & ${TILE_SIZE>>3}) { 318 $if OUT_PTRS == "SWITCH": 319 uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 320 switch (rem) { 321 $for N in reversed(range(2, TILE_SIZE)): 322 case ${N}: 323 $if SIZE == 16: 324 *oN = wasm_i16x8_extract_lane(v0_${N}, 0); 325 oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 326 $else: 327 *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_${N}, 0); 328 oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 329 case 1: 330 $if SIZE == 16: 331 *oN = wasm_i16x8_extract_lane(v0_1, 0); 332 $else: 333 *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_1, 0); 334 case 0: 335 $if SIZE == 16: 336 *o = wasm_i16x8_extract_lane(v0_0, 0); 337 $else: 338 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0); 339 o += ${TILE_SIZE>>3}; 340 break; 341 default: 342 XNN_UNREACHABLE; 343 } 344 $elif OUT_PTRS in ["MOV", "DEC"]: 345 o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 346 $if SIZE == 16: 347 *o = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0); 348 $else: 349 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${TILE_SIZE-1}, 0); 350 $if OUT_PTRS == "MOV": 351 uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 352 $for N in reversed(range(2, TILE_SIZE, 2)): 353 if XNN_UNPREDICTABLE(block_width > ${N+1}) { 354 $if OUT_PTRS == "MOV": 355 o = oN; 356 $else: 357 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 358 } 359 $if SIZE == 16: 360 *o = wasm_i16x8_extract_lane(v0_${N}, 0); 361 $else: 362 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N}, 0); 363 $if OUT_PTRS == "MOV": 364 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 365 if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 366 $if OUT_PTRS == "MOV": 367 o = oN; 368 $else: 369 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 370 } 371 $if SIZE == 16: 372 *o = wasm_i16x8_extract_lane(v0_${N-1}, 0); 373 $else: 374 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_${N-1}, 0); 375 $if OUT_PTRS == "MOV": 376 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 377 if XNN_UNPREDICTABLE(block_width > 1) { 378 $if OUT_PTRS == "MOV": 379 o = oN; 380 $else: 381 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 382 } 383 $if SIZE == 16: 384 *o = wasm_i16x8_extract_lane(v0_0, 0); 385 $else: 386 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0); 387 o += ${TILE_SIZE>>3}; 388 $else: 389 $for N in reversed(range(TILE_SIZE)): 390 $if SIZE == 16: 391 *o${N} = wasm_i16x8_extract_lane(v0_${N}, 0); 392 $else: 393 *((uint16_t*) o${N}) = wasm_i16x8_extract_lane(v0_${N}, 0); 394 o${N} += ${TILE_SIZE>>3}; 395 $if NUM_ITERS>3: 396 $for N in range(TILE_SIZE): 397 v0_${N} = wasm_u32x4_shr(v0_${N}, 16); 398 } 399 $if SIZE == 8: 400 if (bh & 1) { 401 $if OUT_PTRS == "SWITCH": 402 uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 403 switch (rem) { 404 $for N in reversed(range(2, TILE_SIZE)): 405 case ${N}: 406 *oN = wasm_i8x16_extract_lane(v0_${N}, 0); 407 oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); 408 case 1: 409 *oN = wasm_i8x16_extract_lane(v0_1, 0); 410 case 0: 411 *o = wasm_i8x16_extract_lane(v0_0, 0); 412 break; 413 default: 414 XNN_UNREACHABLE; 415 } 416 $elif OUT_PTRS in ["MOV", "DEC"]: 417 o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); 418 *o = wasm_i8x16_extract_lane(v0_${TILE_SIZE-1}, 0); 419 $if OUT_PTRS == "MOV": 420 uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 421 $for N in reversed(range(2, TILE_SIZE, 2)): 422 if XNN_UNPREDICTABLE(block_width > ${N+1}) { 423 $if OUT_PTRS == "MOV": 424 o = oN; 425 $else: 426 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 427 } 428 *o = wasm_i8x16_extract_lane(v0_${N}, 0); 429 $if OUT_PTRS == "MOV": 430 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 431 if XNN_UNPREDICTABLE(block_width >= ${N+1}) { 432 $if OUT_PTRS == "MOV": 433 o = oN; 434 $else: 435 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 436 } 437 *o = wasm_i8x16_extract_lane(v0_${N-1}, 0); 438 $if OUT_PTRS == "MOV": 439 oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 440 if XNN_UNPREDICTABLE(block_width > 1) { 441 $if OUT_PTRS == "MOV": 442 o = oN; 443 $else: 444 o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); 445 } 446 *o = wasm_i8x16_extract_lane(v0_0, 0); 447 $else: 448 $for N in reversed(range(TILE_SIZE)): 449 *o${N} = wasm_i8x16_extract_lane(v0_${N}, 0); 450 } 451 } 452 453 $if IN_PTRS == "MULTI": 454 i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 455 $for N in range(1, TILE_SIZE): 456 i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); 457 $else: 458 i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); 459 $if OUT_PTRS == "MULTI": 460 o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset); 461 $for N in range(1, TILE_SIZE): 462 o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset); 463 $else: 464 o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset); 465 block_width = doz(block_width, tile_width); 466 } while (block_width != 0); 467} 468