/* Source: /aosp_15_r20/external/pytorch/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45) */
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7.  */
2 
3 __extension__ extern __inline uint8x8x2_t
4 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t * __a)5 vld1_u8_x2 (const uint8_t *__a)
6 {
7   uint8x8x2_t ret;
8   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
9   return ret;
10 }
11 
12 __extension__ extern __inline int8x8x2_t
13 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t * __a)14 vld1_s8_x2 (const int8_t *__a)
15 {
16   int8x8x2_t ret;
17   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
18   return ret;
19 }
20 
21 __extension__ extern __inline uint16x4x2_t
22 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t * __a)23 vld1_u16_x2 (const uint16_t *__a)
24 {
25   uint16x4x2_t ret;
26   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
27   return ret;
28 }
29 
30 __extension__ extern __inline int16x4x2_t
31 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t * __a)32 vld1_s16_x2 (const int16_t *__a)
33 {
34   int16x4x2_t ret;
35   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
36   return ret;
37 }
38 
39 __extension__ extern __inline uint32x2x2_t
40 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t * __a)41 vld1_u32_x2 (const uint32_t *__a)
42 {
43   uint32x2x2_t ret;
44   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
45   return ret;
46 }
47 
48 __extension__ extern __inline int32x2x2_t
49 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t * __a)50 vld1_s32_x2 (const int32_t *__a)
51 {
52   int32x2x2_t ret;
53   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
54   return ret;
55 }
56 
57 __extension__ extern __inline uint64x1x2_t
58 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t * __a)59 vld1_u64_x2 (const uint64_t *__a)
60 {
61   uint64x1x2_t ret;
62   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
63   return ret;
64 }
65 
66 __extension__ extern __inline int64x1x2_t
67 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t * __a)68 vld1_s64_x2 (const int64_t *__a)
69 {
70   int64x1x2_t ret;
71   __builtin_aarch64_simd_oi __o;
72   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
73   return ret;
74 }
75 
76 __extension__ extern __inline float16x4x2_t
77 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t * __a)78 vld1_f16_x2 (const float16_t *__a)
79 {
80   float16x4x2_t ret;
81   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
82   return ret;
83 }
84 
85 __extension__ extern __inline float32x2x2_t
86 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t * __a)87 vld1_f32_x2 (const float32_t *__a)
88 {
89   float32x2x2_t ret;
90   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
91   return ret;
92 }
93 
94 __extension__ extern __inline float64x1x2_t
95 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t * __a)96 vld1_f64_x2 (const float64_t *__a)
97 {
98   float64x1x2_t ret;
99   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
100   return ret;
101 }
102 
103 __extension__ extern __inline poly8x8x2_t
104 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t * __a)105 vld1_p8_x2 (const poly8_t *__a)
106 {
107   poly8x8x2_t ret;
108   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
109   return ret;
110 }
111 
112 __extension__ extern __inline poly16x4x2_t
113 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t * __a)114 vld1_p16_x2 (const poly16_t *__a)
115 {
116   poly16x4x2_t ret;
117   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
118   return ret;
119 }
120 
121 __extension__ extern __inline poly64x1x2_t
122 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t * __a)123 vld1_p64_x2 (const poly64_t *__a)
124 {
125   poly64x1x2_t ret;
126   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
127   return ret;
128 }
129 
130 __extension__ extern __inline uint8x16x2_t
131 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t * __a)132 vld1q_u8_x2 (const uint8_t *__a)
133 {
134   uint8x16x2_t ret;
135   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
136   return ret;
137 }
138 
139 __extension__ extern __inline int8x16x2_t
140 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t * __a)141 vld1q_s8_x2 (const int8_t *__a)
142 {
143   int8x16x2_t ret;
144   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
145   return ret;
146 }
147 
148 __extension__ extern __inline uint16x8x2_t
149 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t * __a)150 vld1q_u16_x2 (const uint16_t *__a)
151 {
152   uint16x8x2_t ret;
153   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
154   return ret;
155 }
156 
157 __extension__ extern __inline int16x8x2_t
158 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t * __a)159 vld1q_s16_x2 (const int16_t *__a)
160 {
161   int16x8x2_t ret;
162   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
163   return ret;
164 }
165 
166 __extension__ extern __inline uint32x4x2_t
167 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t * __a)168 vld1q_u32_x2 (const uint32_t *__a)
169 {
170   uint32x4x2_t ret;
171   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
172   return ret;
173 }
174 
175 __extension__ extern __inline int32x4x2_t
176 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t * __a)177 vld1q_s32_x2 (const int32_t *__a)
178 {
179   int32x4x2_t ret;
180   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
181   return ret;
182 }
183 
184 __extension__ extern __inline uint64x2x2_t
185 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t * __a)186 vld1q_u64_x2 (const uint64_t *__a)
187 {
188   uint64x2x2_t ret;
189   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
190   return ret;
191 }
192 
193 __extension__ extern __inline int64x2x2_t
194 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t * __a)195 vld1q_s64_x2 (const int64_t *__a)
196 {
197   int64x2x2_t ret;
198   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
199   return ret;
200 }
201 
202 __extension__ extern __inline float16x8x2_t
203 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t * __a)204 vld1q_f16_x2 (const float16_t *__a)
205 {
206   float16x8x2_t ret;
207   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
208   return ret;
209 }
210 
211 __extension__ extern __inline float32x4x2_t
212 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t * __a)213 vld1q_f32_x2 (const float32_t *__a)
214 {
215   float32x4x2_t ret;
216   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
217   return ret;
218 }
219 
220 __extension__ extern __inline float64x2x2_t
221 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t * __a)222 vld1q_f64_x2 (const float64_t *__a)
223 {
224   float64x2x2_t ret;
225   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
226   return ret;
227 }
228 
229 __extension__ extern __inline poly8x16x2_t
230 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t * __a)231 vld1q_p8_x2 (const poly8_t *__a)
232 {
233   poly8x16x2_t ret;
234   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
235   return ret;
236 }
237 
238 __extension__ extern __inline poly16x8x2_t
239 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t * __a)240 vld1q_p16_x2 (const poly16_t *__a)
241 {
242   poly16x8x2_t ret;
243   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
244   return ret;
245 }
246 
247 __extension__ extern __inline poly64x2x2_t
248 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t * __a)249 vld1q_p64_x2 (const poly64_t *__a)
250 {
251   poly64x2x2_t ret;
252   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
253   return ret;
254 }
255 
/* vst1_*_x2 and vst1q_*_x2: two-register stores.  */
257 
258 __extension__ extern __inline void
259 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t * __a,int64x1x2_t val)260 vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
261 {
262   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
263 }
264 
265 __extension__ extern __inline void
266 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t * __a,uint64x1x2_t val)267 vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
268 {
269   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
270 }
271 
272 __extension__ extern __inline void
273 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t * __a,float64x1x2_t val)274 vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
275 {
276   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
277 }
278 
279 __extension__ extern __inline void
280 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t * __a,int8x8x2_t val)281 vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
282 {
283   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
284 }
285 
286 __extension__ extern __inline void
287 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t * __a,poly8x8x2_t val)288 vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
289 {
290   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
291 }
292 
293 __extension__ extern __inline void
294 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t * __a,int16x4x2_t val)295 vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
296 {
297   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
298 }
299 
300 __extension__ extern __inline void
301 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t * __a,poly16x4x2_t val)302 vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
303 {
304   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
305 }
306 
307 __extension__ extern __inline void
308 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t * __a,int32x2x2_t val)309 vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
310 {
311   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
312 }
313 
314 __extension__ extern __inline void
315 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t * __a,uint8x8x2_t val)316 vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
317 {
318   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
319 }
320 
321 __extension__ extern __inline void
322 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t * __a,uint16x4x2_t val)323 vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
324 {
325   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
326 }
327 
328 __extension__ extern __inline void
329 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t * __a,uint32x2x2_t val)330 vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
331 {
332   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
333 }
334 
335 __extension__ extern __inline void
336 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t * __a,float16x4x2_t val)337 vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
338 {
339   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
340 }
341 
342 __extension__ extern __inline void
343 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t * __a,float32x2x2_t val)344 vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
345 {
346   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
347 }
348 
349 __extension__ extern __inline void
350 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t * __a,poly64x1x2_t val)351 vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
352 {
353   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
354 }
355 
356 __extension__ extern __inline void
357 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t * __a,int8x16x2_t val)358 vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
359 {
360   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
361 }
362 
363 __extension__ extern __inline void
364 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t * __a,poly8x16x2_t val)365 vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
366 {
367   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
368 }
369 
370 __extension__ extern __inline void
371 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t * __a,int16x8x2_t val)372 vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
373 {
374   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
375 }
376 
377 __extension__ extern __inline void
378 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t * __a,poly16x8x2_t val)379 vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
380 {
381   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
382 }
383 
384 __extension__ extern __inline void
385 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t * __a,int32x4x2_t val)386 vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
387 {
388   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
389 }
390 
391 __extension__ extern __inline void
392 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t * __a,int64x2x2_t val)393 vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
394 {
395   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
396 }
397 
398 __extension__ extern __inline void
399 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t * __a,uint8x16x2_t val)400 vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
401 {
402   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
403 }
404 
405 __extension__ extern __inline void
406 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t * __a,uint16x8x2_t val)407 vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
408 {
409   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
410 }
411 
412 __extension__ extern __inline void
413 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t * __a,uint32x4x2_t val)414 vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
415 {
416   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
417 }
418 
419 __extension__ extern __inline void
420 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t * __a,uint64x2x2_t val)421 vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
422 {
423   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
424 }
425 
426 __extension__ extern __inline void
427 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t * __a,float16x8x2_t val)428 vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
429 {
430   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
431 }
432 
433 __extension__ extern __inline void
434 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t * __a,float32x4x2_t val)435 vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
436 {
437   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
438 }
439 
440 __extension__ extern __inline void
441 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t * __a,float64x2x2_t val)442 vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
443 {
444   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
445 }
446 
447 __extension__ extern __inline void
448 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t * __a,poly64x2x2_t val)449 vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
450 {
451   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
452 }
453