xref: /aosp_15_r20/external/executorch/kernels/optimized/vec/vec256/missing_vld1_neon.h (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 /* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7.  */
10 
11 __extension__ extern __inline uint8x8x2_t
12 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t * __a)13 vld1_u8_x2 (const uint8_t *__a)
14 {
15   uint8x8x2_t ret;
16   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
17   return ret;
18 }
19 
20 __extension__ extern __inline int8x8x2_t
21 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t * __a)22 vld1_s8_x2 (const int8_t *__a)
23 {
24   int8x8x2_t ret;
25   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
26   return ret;
27 }
28 
29 __extension__ extern __inline uint16x4x2_t
30 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t * __a)31 vld1_u16_x2 (const uint16_t *__a)
32 {
33   uint16x4x2_t ret;
34   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
35   return ret;
36 }
37 
38 __extension__ extern __inline int16x4x2_t
39 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t * __a)40 vld1_s16_x2 (const int16_t *__a)
41 {
42   int16x4x2_t ret;
43   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
44   return ret;
45 }
46 
47 __extension__ extern __inline uint32x2x2_t
48 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t * __a)49 vld1_u32_x2 (const uint32_t *__a)
50 {
51   uint32x2x2_t ret;
52   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
53   return ret;
54 }
55 
56 __extension__ extern __inline int32x2x2_t
57 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t * __a)58 vld1_s32_x2 (const int32_t *__a)
59 {
60   int32x2x2_t ret;
61   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
62   return ret;
63 }
64 
65 __extension__ extern __inline uint64x1x2_t
66 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t * __a)67 vld1_u64_x2 (const uint64_t *__a)
68 {
69   uint64x1x2_t ret;
70   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
71   return ret;
72 }
73 
74 __extension__ extern __inline int64x1x2_t
75 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t * __a)76 vld1_s64_x2 (const int64_t *__a)
77 {
78   int64x1x2_t ret;
79   __builtin_aarch64_simd_oi __o;
80   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
81   return ret;
82 }
83 
84 __extension__ extern __inline float16x4x2_t
85 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t * __a)86 vld1_f16_x2 (const float16_t *__a)
87 {
88   float16x4x2_t ret;
89   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
90   return ret;
91 }
92 
93 __extension__ extern __inline float32x2x2_t
94 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t * __a)95 vld1_f32_x2 (const float32_t *__a)
96 {
97   float32x2x2_t ret;
98   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
99   return ret;
100 }
101 
102 __extension__ extern __inline float64x1x2_t
103 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t * __a)104 vld1_f64_x2 (const float64_t *__a)
105 {
106   float64x1x2_t ret;
107   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
108   return ret;
109 }
110 
111 __extension__ extern __inline poly8x8x2_t
112 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t * __a)113 vld1_p8_x2 (const poly8_t *__a)
114 {
115   poly8x8x2_t ret;
116   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
117   return ret;
118 }
119 
120 __extension__ extern __inline poly16x4x2_t
121 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t * __a)122 vld1_p16_x2 (const poly16_t *__a)
123 {
124   poly16x4x2_t ret;
125   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
126   return ret;
127 }
128 
129 __extension__ extern __inline poly64x1x2_t
130 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t * __a)131 vld1_p64_x2 (const poly64_t *__a)
132 {
133   poly64x1x2_t ret;
134   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
135   return ret;
136 }
137 
138 __extension__ extern __inline uint8x16x2_t
139 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t * __a)140 vld1q_u8_x2 (const uint8_t *__a)
141 {
142   uint8x16x2_t ret;
143   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
144   return ret;
145 }
146 
147 __extension__ extern __inline int8x16x2_t
148 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t * __a)149 vld1q_s8_x2 (const int8_t *__a)
150 {
151   int8x16x2_t ret;
152   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
153   return ret;
154 }
155 
156 __extension__ extern __inline uint16x8x2_t
157 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t * __a)158 vld1q_u16_x2 (const uint16_t *__a)
159 {
160   uint16x8x2_t ret;
161   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
162   return ret;
163 }
164 
165 __extension__ extern __inline int16x8x2_t
166 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t * __a)167 vld1q_s16_x2 (const int16_t *__a)
168 {
169   int16x8x2_t ret;
170   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
171   return ret;
172 }
173 
174 __extension__ extern __inline uint32x4x2_t
175 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t * __a)176 vld1q_u32_x2 (const uint32_t *__a)
177 {
178   uint32x4x2_t ret;
179   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
180   return ret;
181 }
182 
183 __extension__ extern __inline int32x4x2_t
184 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t * __a)185 vld1q_s32_x2 (const int32_t *__a)
186 {
187   int32x4x2_t ret;
188   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
189   return ret;
190 }
191 
192 __extension__ extern __inline uint64x2x2_t
193 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t * __a)194 vld1q_u64_x2 (const uint64_t *__a)
195 {
196   uint64x2x2_t ret;
197   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
198   return ret;
199 }
200 
201 __extension__ extern __inline int64x2x2_t
202 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t * __a)203 vld1q_s64_x2 (const int64_t *__a)
204 {
205   int64x2x2_t ret;
206   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
207   return ret;
208 }
209 
210 __extension__ extern __inline float16x8x2_t
211 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t * __a)212 vld1q_f16_x2 (const float16_t *__a)
213 {
214   float16x8x2_t ret;
215   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
216   return ret;
217 }
218 
219 __extension__ extern __inline float32x4x2_t
220 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t * __a)221 vld1q_f32_x2 (const float32_t *__a)
222 {
223   float32x4x2_t ret;
224   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
225   return ret;
226 }
227 
228 __extension__ extern __inline float64x2x2_t
229 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t * __a)230 vld1q_f64_x2 (const float64_t *__a)
231 {
232   float64x2x2_t ret;
233   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
234   return ret;
235 }
236 
237 __extension__ extern __inline poly8x16x2_t
238 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t * __a)239 vld1q_p8_x2 (const poly8_t *__a)
240 {
241   poly8x16x2_t ret;
242   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
243   return ret;
244 }
245 
246 __extension__ extern __inline poly16x8x2_t
247 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t * __a)248 vld1q_p16_x2 (const poly16_t *__a)
249 {
250   poly16x8x2_t ret;
251   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
252   return ret;
253 }
254 
255 __extension__ extern __inline poly64x2x2_t
256 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t * __a)257 vld1q_p64_x2 (const poly64_t *__a)
258 {
259   poly64x2x2_t ret;
260   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
261   return ret;
262 }
263 
264 /* vst1x2 */
265 
266 __extension__ extern __inline void
267 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t * __a,int64x1x2_t val)268 vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
269 {
270   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
271 }
272 
273 __extension__ extern __inline void
274 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t * __a,uint64x1x2_t val)275 vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
276 {
277   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
278 }
279 
280 __extension__ extern __inline void
281 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t * __a,float64x1x2_t val)282 vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
283 {
284   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
285 }
286 
287 __extension__ extern __inline void
288 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t * __a,int8x8x2_t val)289 vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
290 {
291   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
292 }
293 
294 __extension__ extern __inline void
295 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t * __a,poly8x8x2_t val)296 vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
297 {
298   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
299 }
300 
301 __extension__ extern __inline void
302 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t * __a,int16x4x2_t val)303 vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
304 {
305   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
306 }
307 
308 __extension__ extern __inline void
309 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t * __a,poly16x4x2_t val)310 vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
311 {
312   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
313 }
314 
315 __extension__ extern __inline void
316 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t * __a,int32x2x2_t val)317 vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
318 {
319   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
320 }
321 
322 __extension__ extern __inline void
323 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t * __a,uint8x8x2_t val)324 vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
325 {
326   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
327 }
328 
329 __extension__ extern __inline void
330 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t * __a,uint16x4x2_t val)331 vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
332 {
333   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
334 }
335 
336 __extension__ extern __inline void
337 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t * __a,uint32x2x2_t val)338 vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
339 {
340   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
341 }
342 
343 __extension__ extern __inline void
344 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t * __a,float16x4x2_t val)345 vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
346 {
347   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
348 }
349 
350 __extension__ extern __inline void
351 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t * __a,float32x2x2_t val)352 vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
353 {
354   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
355 }
356 
357 __extension__ extern __inline void
358 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t * __a,poly64x1x2_t val)359 vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
360 {
361   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
362 }
363 
364 __extension__ extern __inline void
365 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t * __a,int8x16x2_t val)366 vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
367 {
368   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
369 }
370 
371 __extension__ extern __inline void
372 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t * __a,poly8x16x2_t val)373 vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
374 {
375   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
376 }
377 
378 __extension__ extern __inline void
379 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t * __a,int16x8x2_t val)380 vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
381 {
382   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
383 }
384 
385 __extension__ extern __inline void
386 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t * __a,poly16x8x2_t val)387 vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
388 {
389   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
390 }
391 
392 __extension__ extern __inline void
393 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t * __a,int32x4x2_t val)394 vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
395 {
396   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
397 }
398 
399 __extension__ extern __inline void
400 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t * __a,int64x2x2_t val)401 vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
402 {
403   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
404 }
405 
406 __extension__ extern __inline void
407 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t * __a,uint8x16x2_t val)408 vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
409 {
410   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
411 }
412 
413 __extension__ extern __inline void
414 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t * __a,uint16x8x2_t val)415 vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
416 {
417   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
418 }
419 
420 __extension__ extern __inline void
421 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t * __a,uint32x4x2_t val)422 vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
423 {
424   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
425 }
426 
427 __extension__ extern __inline void
428 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t * __a,uint64x2x2_t val)429 vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
430 {
431   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
432 }
433 
434 __extension__ extern __inline void
435 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t * __a,float16x8x2_t val)436 vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
437 {
438   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
439 }
440 
441 __extension__ extern __inline void
442 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t * __a,float32x4x2_t val)443 vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
444 {
445   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
446 }
447 
448 __extension__ extern __inline void
449 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t * __a,float64x2x2_t val)450 vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
451 {
452   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
453 }
454 
455 __extension__ extern __inline void
456 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t * __a,poly64x2x2_t val)457 vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
458 {
459   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
460 }
461