xref: /aosp_15_r20/external/XNNPACK/src/x8-lut/gen/lut-wasmsimd-x48.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/x8-lut/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/lut.h>
#include <xnnpack/common.h>


void xnn_x8_lut_ukernel__wasmsimd_x48(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
{
  assert(n != 0);
  assert(x != NULL);
  assert(y != NULL);

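  // The 256-entry lookup table t is loaded into sixteen 16-byte vectors,
  // one per 16-entry slice of the table.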
  const v128_t vtable0 = wasm_v128_load(t);
  const v128_t vtable1 = wasm_v128_load(t + 16);
  const v128_t vtable2 = wasm_v128_load(t + 32);
  const v128_t vtable3 = wasm_v128_load(t + 48);
  const v128_t vtable4 = wasm_v128_load(t + 64);
  const v128_t vtable5 = wasm_v128_load(t + 80);
  const v128_t vtable6 = wasm_v128_load(t + 96);
  const v128_t vtable7 = wasm_v128_load(t + 112);
  const v128_t vtable8 = wasm_v128_load(t + 128);
  const v128_t vtable9 = wasm_v128_load(t + 144);
  const v128_t vtable10 = wasm_v128_load(t + 160);
  const v128_t vtable11 = wasm_v128_load(t + 176);
  const v128_t vtable12 = wasm_v128_load(t + 192);
  const v128_t vtable13 = wasm_v128_load(t + 208);
  const v128_t vtable14 = wasm_v128_load(t + 224);
  const v128_t vtable15 = wasm_v128_load(t + 240);
  const v128_t voffset = wasm_i8x16_const_splat(16);
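  // Main loop: translate 48 input bytes per iteration. wasm_i8x16_swizzle
  // returns 0 for any index outside [0, 15], so each 16-entry table slice is
  // looked up in turn and the per-slice results are OR-ed together.
  // Subtracting 16 from the indices before each step moves the next slice's
  // indices into the valid range while every other lane keeps yielding 0.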
  for (; n >= 48 * sizeof(uint8_t); n -= 48 * sizeof(uint8_t)) {
    v128_t vx0 = wasm_v128_load(x);
    v128_t vx1 = wasm_v128_load(x + 16);
    v128_t vx2 = wasm_v128_load(x + 32);
    x += 48;

    v128_t vy0 = wasm_i8x16_swizzle(vtable0, vx0);
    v128_t vy1 = wasm_i8x16_swizzle(vtable0, vx1);
    v128_t vy2 = wasm_i8x16_swizzle(vtable0, vx2);

    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable1, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable1, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable1, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable2, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable2, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable2, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable3, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable3, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable3, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable4, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable4, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable4, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable5, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable5, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable5, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable6, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable6, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable6, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable7, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable7, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable7, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable8, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable8, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable8, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable9, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable9, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable9, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable10, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable10, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable10, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable11, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable11, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable11, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable12, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable12, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable12, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable13, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable13, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable13, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable14, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable14, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable14, vx2));
    vx0 = wasm_i8x16_sub(vx0, voffset);
    vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable15, vx0));
    vx1 = wasm_i8x16_sub(vx1, voffset);
    vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable15, vx1));
    vx2 = wasm_i8x16_sub(vx2, voffset);
    vy2 = wasm_v128_or(vy2, wasm_i8x16_swizzle(vtable15, vx2));

    wasm_v128_store(y, vy0);
    wasm_v128_store(y + 16, vy1);
    wasm_v128_store(y + 32, vy2);
    y += 48;
  }
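  // Handle any remaining full 16-byte blocks with the same slice-by-slice
  // lookup, one vector at a time.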
  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
    v128_t vx = wasm_v128_load(x);
    x += 16;

    v128_t vy = wasm_i8x16_swizzle(vtable0, vx);

    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx));

    wasm_v128_store(y, vy);
    y += 16;
  }
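  // Final 1..15 bytes: the load below reads a full 16-byte vector regardless
  // of n, translates it, and only the requested number of bytes is stored.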
  if XNN_UNLIKELY(n != 0) {
    v128_t vx = wasm_v128_load(x);

    v128_t vy = wasm_i8x16_swizzle(vtable0, vx);

    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx));
    vx = wasm_i8x16_sub(vx, voffset);
    vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx));

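    // Store 8, 4, 2 and/or 1 byte(s) according to the bits of n, shifting the
    // already-stored bytes out of the vector after each partial store.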
    if (n & (8 * sizeof(uint8_t))) {
      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      y += 8;
    }
    if (n & (4 * sizeof(uint8_t))) {
      *((float*) y) = wasm_f32x4_extract_lane(vy, 0);
      vy = wasm_u64x2_shr(vy, 32);
      y += 4;
    }
    uint32_t vy_lo = wasm_i32x4_extract_lane(vy, 0);
    if (n & (2 * sizeof(uint8_t))) {
      *((uint16_t*) y) = (uint16_t) vy_lo;
      vy_lo >>= 16;
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      *y = (uint8_t) vy_lo;
    }
  }
}
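
/*
 * Usage sketch (illustrative only, not part of the generated kernel; the
 * buffer names below are hypothetical). It assumes a WebAssembly SIMD build
 * (e.g. emcc -msimd128). The kernel maps each of the n input bytes through
 * the 256-entry table: y[i] = t[x[i]].
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   // Prototype matching the definition in this file.
 *   extern void xnn_x8_lut_ukernel__wasmsimd_x48(
 *       size_t n, const uint8_t* x, uint8_t* y, const uint8_t* t);
 *
 *   int main(void) {
 *     uint8_t table[256];
 *     for (int i = 0; i < 256; i++) {
 *       table[i] = (uint8_t) (255 - i);  // example LUT: invert each byte
 *     }
 *     uint8_t input[64];
 *     uint8_t output[64];
 *     for (int i = 0; i < 64; i++) {
 *       input[i] = (uint8_t) i;
 *     }
 *     xnn_x8_lut_ukernel__wasmsimd_x48(sizeof(input), input, output, table);
 *     printf("%u -> %u\n", (unsigned) input[0], (unsigned) output[0]);  // "0 -> 255"
 *     return 0;
 *   }
 *
 * Note: the remainder path loads a full 16-byte vector from x, so callers
 * with sizes that are not multiples of 16 typically keep the input buffer
 * readable past the last element (the example above uses a multiple of 16).
 */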