xref: /aosp_15_r20/external/XNNPACK/src/x16-transposec/gen/8x8-reuse-switch-wasmsimd.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9 
10 #include <wasm_simd128.h>
11 
12 #include <assert.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/transpose.h>
17 
xnn_x16_transposec_ukernel__8x8_reuse_switch_wasmsimd(const uint16_t * input,uint16_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)18 void xnn_x16_transposec_ukernel__8x8_reuse_switch_wasmsimd(
19     const uint16_t* input,
20     uint16_t* output,
21     size_t input_stride,
22     size_t output_stride,
23     size_t block_width,
24     size_t block_height) XNN_OOB_READS
25 {
26   assert(output_stride >= block_height * sizeof(uint16_t));
27   assert(input_stride >= block_width * sizeof(uint16_t));
28 
29   const size_t tile_height = 8;
30   const size_t tile_width = 8;
31   const size_t tile_hbytes = tile_height * sizeof(uint16_t);
32   const size_t tile_wbytes = tile_width * sizeof(uint16_t);
33   const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
34   const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t);
35 
36   const uint16_t* i0 = input;
37   uint16_t* o = (uint16_t*) output;
38   const size_t minus_output_stride = -output_stride;
39 
40   do {
41     const size_t rem = min(block_width - 1, 7);
42     const size_t oN_stride = rem * output_stride;
43     size_t bh = block_height;
44     for (; bh >= 8; bh -= 8) {
45       const v128_t v3_0 = wasm_v128_load(i0);
46       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
47       const v128_t v3_1 = wasm_v128_load(i0);
48       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
49       const v128_t v3_2 = wasm_v128_load(i0);
50       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
51       const v128_t v3_3 = wasm_v128_load(i0);
52       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
53       const v128_t v3_4 = wasm_v128_load(i0);
54       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
55       const v128_t v3_5 = wasm_v128_load(i0);
56       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
57       const v128_t v3_6 = wasm_v128_load(i0);
58       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
59       const v128_t v3_7 = wasm_v128_load(i0);
60       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
61 
62       const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11);
63       const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15);
64       const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11);
65       const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15);
66       const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11);
67       const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15);
68       const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11);
69       const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15);
70       const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11);
71       const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15);
72       const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11);
73       const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15);
74       const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11);
75       const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15);
76       const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11);
77       const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15);
78       const v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11);
79       const v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15);
80       const v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11);
81       const v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15);
82       const v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11);
83       const v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15);
84       const v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11);
85       const v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15);
86 
87       uint16_t *oN = (uint16_t*) ((uintptr_t) o + oN_stride);
88       switch (rem) {
89         case 7:
90           wasm_v128_store(oN, v0_7);
91           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
92         case 6:
93           wasm_v128_store(oN, v0_6);
94           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
95         case 5:
96           wasm_v128_store(oN, v0_5);
97           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
98         case 4:
99           wasm_v128_store(oN, v0_4);
100           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
101         case 3:
102           wasm_v128_store(oN, v0_3);
103           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
104         case 2:
105           wasm_v128_store(oN, v0_2);
106           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
107         case 1:
108           wasm_v128_store(oN, v0_1);
109         case 0:
110           wasm_v128_store(o, v0_0);
111           o = (uint16_t*) ((uintptr_t) o + tile_hbytes);
112           break;
113         default:
114           XNN_UNREACHABLE;
115       }
116     }
117 
118     if (bh != 0) {
119       const v128_t v3_0 = wasm_v128_load(i0);
120       const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
121       if XNN_UNPREDICTABLE(bh < 2) {
122         i1 = i0;
123       }
124       const v128_t v3_1 = wasm_v128_load(i1);
125       const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
126       if XNN_UNPREDICTABLE(bh <= 2) {
127         i2 = i1;
128       }
129       const v128_t v3_2 = wasm_v128_load(i2);
130       const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
131       if XNN_UNPREDICTABLE(bh < 4) {
132         i3 = i2;
133       }
134       const v128_t v3_3 = wasm_v128_load(i3);
135       const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
136       if XNN_UNPREDICTABLE(bh <= 4) {
137         i4 = i3;
138       }
139       const v128_t v3_4 = wasm_v128_load(i4);
140       const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
141       if XNN_UNPREDICTABLE(bh < 6) {
142         i5 = i4;
143       }
144       const v128_t v3_5 = wasm_v128_load(i5);
145       const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
146       if XNN_UNPREDICTABLE(bh <= 6) {
147         i6 = i5;
148       }
149       const v128_t v3_6 = wasm_v128_load(i6);
150       const v128_t v3_7 = wasm_v128_xor(v3_0, v3_0);
151 
152       const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11);
153       const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15);
154       const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11);
155       const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15);
156       const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11);
157       const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15);
158       const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11);
159       const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15);
160       const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11);
161       const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15);
162       const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11);
163       const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15);
164       const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11);
165       const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15);
166       const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11);
167       const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15);
168 
169       v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11);
170       v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15);
171       v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11);
172       v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15);
173       v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11);
174       v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15);
175       v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11);
176       v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15);
177 
178       if (bh & 4) {
179         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
180         switch (rem) {
181           case 7:
182             *((double*) oN) = wasm_f64x2_extract_lane(v0_7, 0);
183             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
184           case 6:
185             *((double*) oN) = wasm_f64x2_extract_lane(v0_6, 0);
186             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
187           case 5:
188             *((double*) oN) = wasm_f64x2_extract_lane(v0_5, 0);
189             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
190           case 4:
191             *((double*) oN) = wasm_f64x2_extract_lane(v0_4, 0);
192             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
193           case 3:
194             *((double*) oN) = wasm_f64x2_extract_lane(v0_3, 0);
195             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
196           case 2:
197             *((double*) oN) = wasm_f64x2_extract_lane(v0_2, 0);
198             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
199           case 1:
200             *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0);
201           case 0:
202             *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
203             o += 4;
204             break;
205           default:
206             XNN_UNREACHABLE;
207         }
208         v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
209         v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
210         v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
211         v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
212         v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
213         v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
214         v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
215         v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
216       }
217 
218       if (bh & 2) {
219         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
220         switch (rem) {
221           case 7:
222             *((float*) oN) = wasm_f32x4_extract_lane(v0_7, 0);
223             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
224           case 6:
225             *((float*) oN) = wasm_f32x4_extract_lane(v0_6, 0);
226             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
227           case 5:
228             *((float*) oN) = wasm_f32x4_extract_lane(v0_5, 0);
229             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
230           case 4:
231             *((float*) oN) = wasm_f32x4_extract_lane(v0_4, 0);
232             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
233           case 3:
234             *((float*) oN) = wasm_f32x4_extract_lane(v0_3, 0);
235             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
236           case 2:
237             *((float*) oN) = wasm_f32x4_extract_lane(v0_2, 0);
238             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
239           case 1:
240             *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0);
241           case 0:
242             *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
243             o += 2;
244             break;
245           default:
246             XNN_UNREACHABLE;
247         }
248         v0_0 = wasm_u64x2_shr(v0_0, 32);
249         v0_1 = wasm_u64x2_shr(v0_1, 32);
250         v0_2 = wasm_u64x2_shr(v0_2, 32);
251         v0_3 = wasm_u64x2_shr(v0_3, 32);
252         v0_4 = wasm_u64x2_shr(v0_4, 32);
253         v0_5 = wasm_u64x2_shr(v0_5, 32);
254         v0_6 = wasm_u64x2_shr(v0_6, 32);
255         v0_7 = wasm_u64x2_shr(v0_7, 32);
256       }
257       if (bh & 1) {
258         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
259         switch (rem) {
260           case 7:
261             *oN = wasm_i16x8_extract_lane(v0_7, 0);
262             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
263           case 6:
264             *oN = wasm_i16x8_extract_lane(v0_6, 0);
265             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
266           case 5:
267             *oN = wasm_i16x8_extract_lane(v0_5, 0);
268             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
269           case 4:
270             *oN = wasm_i16x8_extract_lane(v0_4, 0);
271             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
272           case 3:
273             *oN = wasm_i16x8_extract_lane(v0_3, 0);
274             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
275           case 2:
276             *oN = wasm_i16x8_extract_lane(v0_2, 0);
277             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
278           case 1:
279             *oN = wasm_i16x8_extract_lane(v0_1, 0);
280           case 0:
281             *o = wasm_i16x8_extract_lane(v0_0, 0);
282             break;
283           default:
284             XNN_UNREACHABLE;
285         }
286       }
287     }
288 
289     i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
290     o = (uint16_t*) ((uintptr_t) o + output_reset);
291     block_width = doz(block_width, tile_width);
292   } while (block_width != 0);
293 }
294