xref: /aosp_15_r20/external/XNNPACK/src/x16-transposec/gen/8x8-multi-switch-wasmsimd.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9 
10 #include <wasm_simd128.h>
11 
12 #include <assert.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/transpose.h>
17 
xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd(const uint16_t * input,uint16_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)18 void xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd(
19     const uint16_t* input,
20     uint16_t* output,
21     size_t input_stride,
22     size_t output_stride,
23     size_t block_width,
24     size_t block_height) XNN_OOB_READS
25 {
26   assert(output_stride >= block_height * sizeof(uint16_t));
27   assert(input_stride >= block_width * sizeof(uint16_t));
28 
29   const size_t tile_height = 8;
30   const size_t tile_width = 8;
31   const size_t tile_hbytes = tile_height * sizeof(uint16_t);
32   const size_t tile_wbytes = tile_width * sizeof(uint16_t);
33   const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
34   const size_t input_offset = tile_height * input_stride;
35   const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t);
36 
37   const uint16_t* i0 = input;
38   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
39   const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
40   const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
41   const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
42   const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
43   const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
44   const uint16_t* i7 = (const uint16_t*) ((uintptr_t) i6 + input_stride);
45   uint16_t* o = (uint16_t*) output;
46   const size_t minus_output_stride = -output_stride;
47 
48   do {
49     const size_t rem = min(block_width - 1, 7);
50     const size_t oN_stride = rem * output_stride;
51     size_t bh = block_height;
52     for (; bh >= 8; bh -= 8) {
53       const v128_t v3_0 = wasm_v128_load(i0);
54       i0 = (uint16_t*) ((uintptr_t) i0 + input_offset);
55       const v128_t v3_1 = wasm_v128_load(i1);
56       i1 = (uint16_t*) ((uintptr_t) i1 + input_offset);
57       const v128_t v3_2 = wasm_v128_load(i2);
58       i2 = (uint16_t*) ((uintptr_t) i2 + input_offset);
59       const v128_t v3_3 = wasm_v128_load(i3);
60       i3 = (uint16_t*) ((uintptr_t) i3 + input_offset);
61       const v128_t v3_4 = wasm_v128_load(i4);
62       i4 = (uint16_t*) ((uintptr_t) i4 + input_offset);
63       const v128_t v3_5 = wasm_v128_load(i5);
64       i5 = (uint16_t*) ((uintptr_t) i5 + input_offset);
65       const v128_t v3_6 = wasm_v128_load(i6);
66       i6 = (uint16_t*) ((uintptr_t) i6 + input_offset);
67       const v128_t v3_7 = wasm_v128_load(i7);
68       i7 = (uint16_t*) ((uintptr_t) i7 + input_offset);
69 
70       const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11);
71       const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15);
72       const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11);
73       const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15);
74       const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11);
75       const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15);
76       const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11);
77       const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15);
78       const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11);
79       const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15);
80       const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11);
81       const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15);
82       const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11);
83       const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15);
84       const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11);
85       const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15);
86       const v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11);
87       const v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15);
88       const v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11);
89       const v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15);
90       const v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11);
91       const v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15);
92       const v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11);
93       const v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15);
94 
95       uint16_t *oN = (uint16_t*) ((uintptr_t) o + oN_stride);
96       switch (rem) {
97         case 7:
98           wasm_v128_store(oN, v0_7);
99           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
100         case 6:
101           wasm_v128_store(oN, v0_6);
102           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
103         case 5:
104           wasm_v128_store(oN, v0_5);
105           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
106         case 4:
107           wasm_v128_store(oN, v0_4);
108           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
109         case 3:
110           wasm_v128_store(oN, v0_3);
111           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
112         case 2:
113           wasm_v128_store(oN, v0_2);
114           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
115         case 1:
116           wasm_v128_store(oN, v0_1);
117         case 0:
118           wasm_v128_store(o, v0_0);
119           o = (uint16_t*) ((uintptr_t) o + tile_hbytes);
120           break;
121         default:
122           XNN_UNREACHABLE;
123       }
124     }
125 
126     if (bh != 0) {
127       const v128_t v3_0 = wasm_v128_load(i0);
128       if XNN_UNPREDICTABLE(bh < 2) {
129         i1 = i0;
130       }
131       const v128_t v3_1 = wasm_v128_load(i1);
132       if XNN_UNPREDICTABLE(bh <= 2) {
133         i2 = i0;
134       }
135       const v128_t v3_2 = wasm_v128_load(i2);
136       if XNN_UNPREDICTABLE(bh < 4) {
137         i3 = i0;
138       }
139       const v128_t v3_3 = wasm_v128_load(i3);
140       if XNN_UNPREDICTABLE(bh <= 4) {
141         i4 = i0;
142       }
143       const v128_t v3_4 = wasm_v128_load(i4);
144       if XNN_UNPREDICTABLE(bh < 6) {
145         i5 = i0;
146       }
147       const v128_t v3_5 = wasm_v128_load(i5);
148       if XNN_UNPREDICTABLE(bh <= 6) {
149         i6 = i0;
150       }
151       const v128_t v3_6 = wasm_v128_load(i6);
152       const v128_t v3_7 = wasm_v128_xor(v3_0, v3_0);
153 
154       const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11);
155       const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15);
156       const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11);
157       const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15);
158       const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11);
159       const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15);
160       const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11);
161       const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15);
162       const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11);
163       const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15);
164       const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11);
165       const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15);
166       const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11);
167       const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15);
168       const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11);
169       const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15);
170 
171       v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11);
172       v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15);
173       v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11);
174       v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15);
175       v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11);
176       v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15);
177       v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11);
178       v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15);
179 
180       if (bh & 4) {
181         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
182         switch (rem) {
183           case 7:
184             *((double*) oN) = wasm_f64x2_extract_lane(v0_7, 0);
185             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
186           case 6:
187             *((double*) oN) = wasm_f64x2_extract_lane(v0_6, 0);
188             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
189           case 5:
190             *((double*) oN) = wasm_f64x2_extract_lane(v0_5, 0);
191             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
192           case 4:
193             *((double*) oN) = wasm_f64x2_extract_lane(v0_4, 0);
194             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
195           case 3:
196             *((double*) oN) = wasm_f64x2_extract_lane(v0_3, 0);
197             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
198           case 2:
199             *((double*) oN) = wasm_f64x2_extract_lane(v0_2, 0);
200             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
201           case 1:
202             *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0);
203           case 0:
204             *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
205             o += 4;
206             break;
207           default:
208             XNN_UNREACHABLE;
209         }
210         v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
211         v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
212         v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
213         v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
214         v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
215         v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
216         v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
217         v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
218       }
219 
220       if (bh & 2) {
221         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
222         switch (rem) {
223           case 7:
224             *((float*) oN) = wasm_f32x4_extract_lane(v0_7, 0);
225             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
226           case 6:
227             *((float*) oN) = wasm_f32x4_extract_lane(v0_6, 0);
228             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
229           case 5:
230             *((float*) oN) = wasm_f32x4_extract_lane(v0_5, 0);
231             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
232           case 4:
233             *((float*) oN) = wasm_f32x4_extract_lane(v0_4, 0);
234             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
235           case 3:
236             *((float*) oN) = wasm_f32x4_extract_lane(v0_3, 0);
237             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
238           case 2:
239             *((float*) oN) = wasm_f32x4_extract_lane(v0_2, 0);
240             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
241           case 1:
242             *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0);
243           case 0:
244             *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
245             o += 2;
246             break;
247           default:
248             XNN_UNREACHABLE;
249         }
250         v0_0 = wasm_u64x2_shr(v0_0, 32);
251         v0_1 = wasm_u64x2_shr(v0_1, 32);
252         v0_2 = wasm_u64x2_shr(v0_2, 32);
253         v0_3 = wasm_u64x2_shr(v0_3, 32);
254         v0_4 = wasm_u64x2_shr(v0_4, 32);
255         v0_5 = wasm_u64x2_shr(v0_5, 32);
256         v0_6 = wasm_u64x2_shr(v0_6, 32);
257         v0_7 = wasm_u64x2_shr(v0_7, 32);
258       }
259       if (bh & 1) {
260         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
261         switch (rem) {
262           case 7:
263             *oN = wasm_i16x8_extract_lane(v0_7, 0);
264             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
265           case 6:
266             *oN = wasm_i16x8_extract_lane(v0_6, 0);
267             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
268           case 5:
269             *oN = wasm_i16x8_extract_lane(v0_5, 0);
270             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
271           case 4:
272             *oN = wasm_i16x8_extract_lane(v0_4, 0);
273             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
274           case 3:
275             *oN = wasm_i16x8_extract_lane(v0_3, 0);
276             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
277           case 2:
278             *oN = wasm_i16x8_extract_lane(v0_2, 0);
279             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
280           case 1:
281             *oN = wasm_i16x8_extract_lane(v0_1, 0);
282           case 0:
283             *o = wasm_i16x8_extract_lane(v0_0, 0);
284             break;
285           default:
286             XNN_UNREACHABLE;
287         }
288       }
289     }
290 
291     i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
292     i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
293     i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
294     i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
295     i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
296     i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
297     i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
298     i7 = (const uint16_t*) ((uintptr_t) i6 + input_stride);
299     o = (uint16_t*) ((uintptr_t) o + output_reset);
300     block_width = doz(block_width, tile_width);
301   } while (block_width != 0);
302 }
303