1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 /* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */
10
11 __extension__ extern __inline uint8x8x2_t
12 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t * __a)13 vld1_u8_x2 (const uint8_t *__a)
14 {
15 uint8x8x2_t ret;
16 asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
17 return ret;
18 }
19
20 __extension__ extern __inline int8x8x2_t
21 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t * __a)22 vld1_s8_x2 (const int8_t *__a)
23 {
24 int8x8x2_t ret;
25 asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
26 return ret;
27 }
28
29 __extension__ extern __inline uint16x4x2_t
30 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t * __a)31 vld1_u16_x2 (const uint16_t *__a)
32 {
33 uint16x4x2_t ret;
34 asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
35 return ret;
36 }
37
38 __extension__ extern __inline int16x4x2_t
39 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t * __a)40 vld1_s16_x2 (const int16_t *__a)
41 {
42 int16x4x2_t ret;
43 asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
44 return ret;
45 }
46
47 __extension__ extern __inline uint32x2x2_t
48 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t * __a)49 vld1_u32_x2 (const uint32_t *__a)
50 {
51 uint32x2x2_t ret;
52 asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
53 return ret;
54 }
55
56 __extension__ extern __inline int32x2x2_t
57 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t * __a)58 vld1_s32_x2 (const int32_t *__a)
59 {
60 int32x2x2_t ret;
61 asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
62 return ret;
63 }
64
65 __extension__ extern __inline uint64x1x2_t
66 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t * __a)67 vld1_u64_x2 (const uint64_t *__a)
68 {
69 uint64x1x2_t ret;
70 asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
71 return ret;
72 }
73
74 __extension__ extern __inline int64x1x2_t
75 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t * __a)76 vld1_s64_x2 (const int64_t *__a)
77 {
78 int64x1x2_t ret;
79 __builtin_aarch64_simd_oi __o;
80 asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
81 return ret;
82 }
83
84 __extension__ extern __inline float16x4x2_t
85 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t * __a)86 vld1_f16_x2 (const float16_t *__a)
87 {
88 float16x4x2_t ret;
89 asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
90 return ret;
91 }
92
93 __extension__ extern __inline float32x2x2_t
94 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t * __a)95 vld1_f32_x2 (const float32_t *__a)
96 {
97 float32x2x2_t ret;
98 asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
99 return ret;
100 }
101
102 __extension__ extern __inline float64x1x2_t
103 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t * __a)104 vld1_f64_x2 (const float64_t *__a)
105 {
106 float64x1x2_t ret;
107 asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
108 return ret;
109 }
110
111 __extension__ extern __inline poly8x8x2_t
112 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t * __a)113 vld1_p8_x2 (const poly8_t *__a)
114 {
115 poly8x8x2_t ret;
116 asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
117 return ret;
118 }
119
120 __extension__ extern __inline poly16x4x2_t
121 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t * __a)122 vld1_p16_x2 (const poly16_t *__a)
123 {
124 poly16x4x2_t ret;
125 asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
126 return ret;
127 }
128
129 __extension__ extern __inline poly64x1x2_t
130 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t * __a)131 vld1_p64_x2 (const poly64_t *__a)
132 {
133 poly64x1x2_t ret;
134 asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
135 return ret;
136 }
137
138 __extension__ extern __inline uint8x16x2_t
139 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t * __a)140 vld1q_u8_x2 (const uint8_t *__a)
141 {
142 uint8x16x2_t ret;
143 asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
144 return ret;
145 }
146
147 __extension__ extern __inline int8x16x2_t
148 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t * __a)149 vld1q_s8_x2 (const int8_t *__a)
150 {
151 int8x16x2_t ret;
152 asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
153 return ret;
154 }
155
156 __extension__ extern __inline uint16x8x2_t
157 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t * __a)158 vld1q_u16_x2 (const uint16_t *__a)
159 {
160 uint16x8x2_t ret;
161 asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
162 return ret;
163 }
164
165 __extension__ extern __inline int16x8x2_t
166 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t * __a)167 vld1q_s16_x2 (const int16_t *__a)
168 {
169 int16x8x2_t ret;
170 asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
171 return ret;
172 }
173
174 __extension__ extern __inline uint32x4x2_t
175 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t * __a)176 vld1q_u32_x2 (const uint32_t *__a)
177 {
178 uint32x4x2_t ret;
179 asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
180 return ret;
181 }
182
183 __extension__ extern __inline int32x4x2_t
184 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t * __a)185 vld1q_s32_x2 (const int32_t *__a)
186 {
187 int32x4x2_t ret;
188 asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
189 return ret;
190 }
191
192 __extension__ extern __inline uint64x2x2_t
193 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t * __a)194 vld1q_u64_x2 (const uint64_t *__a)
195 {
196 uint64x2x2_t ret;
197 asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
198 return ret;
199 }
200
201 __extension__ extern __inline int64x2x2_t
202 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t * __a)203 vld1q_s64_x2 (const int64_t *__a)
204 {
205 int64x2x2_t ret;
206 asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
207 return ret;
208 }
209
210 __extension__ extern __inline float16x8x2_t
211 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t * __a)212 vld1q_f16_x2 (const float16_t *__a)
213 {
214 float16x8x2_t ret;
215 asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
216 return ret;
217 }
218
219 __extension__ extern __inline float32x4x2_t
220 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t * __a)221 vld1q_f32_x2 (const float32_t *__a)
222 {
223 float32x4x2_t ret;
224 asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
225 return ret;
226 }
227
228 __extension__ extern __inline float64x2x2_t
229 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t * __a)230 vld1q_f64_x2 (const float64_t *__a)
231 {
232 float64x2x2_t ret;
233 asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
234 return ret;
235 }
236
237 __extension__ extern __inline poly8x16x2_t
238 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t * __a)239 vld1q_p8_x2 (const poly8_t *__a)
240 {
241 poly8x16x2_t ret;
242 asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
243 return ret;
244 }
245
246 __extension__ extern __inline poly16x8x2_t
247 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t * __a)248 vld1q_p16_x2 (const poly16_t *__a)
249 {
250 poly16x8x2_t ret;
251 asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
252 return ret;
253 }
254
255 __extension__ extern __inline poly64x2x2_t
256 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t * __a)257 vld1q_p64_x2 (const poly64_t *__a)
258 {
259 poly64x2x2_t ret;
260 asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
261 return ret;
262 }
263
/* vst1_*_x2 and vst1q_*_x2: two-vector contiguous stores. */
265
266 __extension__ extern __inline void
267 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t * __a,int64x1x2_t val)268 vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
269 {
270 asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
271 }
272
273 __extension__ extern __inline void
274 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t * __a,uint64x1x2_t val)275 vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
276 {
277 asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
278 }
279
280 __extension__ extern __inline void
281 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t * __a,float64x1x2_t val)282 vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
283 {
284 asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
285 }
286
287 __extension__ extern __inline void
288 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t * __a,int8x8x2_t val)289 vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
290 {
291 asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
292 }
293
294 __extension__ extern __inline void
295 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t * __a,poly8x8x2_t val)296 vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
297 {
298 asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
299 }
300
301 __extension__ extern __inline void
302 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t * __a,int16x4x2_t val)303 vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
304 {
305 asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
306 }
307
308 __extension__ extern __inline void
309 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t * __a,poly16x4x2_t val)310 vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
311 {
312 asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
313 }
314
315 __extension__ extern __inline void
316 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t * __a,int32x2x2_t val)317 vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
318 {
319 asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
320 }
321
322 __extension__ extern __inline void
323 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t * __a,uint8x8x2_t val)324 vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
325 {
326 asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
327 }
328
329 __extension__ extern __inline void
330 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t * __a,uint16x4x2_t val)331 vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
332 {
333 asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
334 }
335
336 __extension__ extern __inline void
337 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t * __a,uint32x2x2_t val)338 vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
339 {
340 asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
341 }
342
343 __extension__ extern __inline void
344 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t * __a,float16x4x2_t val)345 vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
346 {
347 asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
348 }
349
350 __extension__ extern __inline void
351 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t * __a,float32x2x2_t val)352 vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
353 {
354 asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
355 }
356
357 __extension__ extern __inline void
358 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t * __a,poly64x1x2_t val)359 vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
360 {
361 asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
362 }
363
364 __extension__ extern __inline void
365 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t * __a,int8x16x2_t val)366 vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
367 {
368 asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
369 }
370
371 __extension__ extern __inline void
372 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t * __a,poly8x16x2_t val)373 vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
374 {
375 asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
376 }
377
378 __extension__ extern __inline void
379 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t * __a,int16x8x2_t val)380 vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
381 {
382 asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
383 }
384
385 __extension__ extern __inline void
386 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t * __a,poly16x8x2_t val)387 vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
388 {
389 asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
390 }
391
392 __extension__ extern __inline void
393 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t * __a,int32x4x2_t val)394 vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
395 {
396 asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
397 }
398
399 __extension__ extern __inline void
400 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t * __a,int64x2x2_t val)401 vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
402 {
403 asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
404 }
405
406 __extension__ extern __inline void
407 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t * __a,uint8x16x2_t val)408 vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
409 {
410 asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
411 }
412
413 __extension__ extern __inline void
414 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t * __a,uint16x8x2_t val)415 vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
416 {
417 asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
418 }
419
420 __extension__ extern __inline void
421 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t * __a,uint32x4x2_t val)422 vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
423 {
424 asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
425 }
426
427 __extension__ extern __inline void
428 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t * __a,uint64x2x2_t val)429 vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
430 {
431 asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
432 }
433
434 __extension__ extern __inline void
435 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t * __a,float16x8x2_t val)436 vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
437 {
438 asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
439 }
440
441 __extension__ extern __inline void
442 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t * __a,float32x4x2_t val)443 vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
444 {
445 asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
446 }
447
448 __extension__ extern __inline void
449 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t * __a,float64x2x2_t val)450 vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
451 {
452 asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
453 }
454
455 __extension__ extern __inline void
456 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t * __a,poly64x2x2_t val)457 vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
458 {
459 asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
460 }
461