/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]

#include <executorch/kernels/optimized/vec/intrinsics.h>
#include <executorch/kernels/optimized/vec/vec_base.h>

#include <cstring> // std::memcpy, used by loadu() and store()

#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#include <sleef.h>
#endif

namespace executorch {
namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)

template <> class Vectorized<float> {
private:
  __m256 values;
public:
  using value_type = float;
  using size_type = int;
  static constexpr size_type size() {
    return 8;
  }
  Vectorized() {}
  Vectorized(__m256 v) : values(v) {}
  Vectorized(float val) {
    values = _mm256_set1_ps(val);
  }
  Vectorized(float val1, float val2, float val3, float val4,
         float val5, float val6, float val7, float val8) {
    values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8);
  }
  operator __m256() const {
    return values;
  }
  template <int64_t mask>
  static Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b) {
    return _mm256_blend_ps(a.values, b.values, mask);
  }
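  // Bit i of the compile-time mask selects lane i from b; e.g. (illustrative)
  // blend<0b00000011>(a, b) yields {b0, b1, a2, a3, a4, a5, a6, a7}.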
  static Vectorized<float> blendv(const Vectorized<float>& a, const Vectorized<float>& b,
                              const Vectorized<float>& mask) {
    return _mm256_blendv_ps(a.values, b.values, mask.values);
  }
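  // Unlike blend, blendv selects at runtime: lane i comes from b when the
  // most significant (sign) bit of mask lane i is set, which is exactly the
  // all-ones/all-zeros form produced by the comparison operators below.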
  template<typename step_t>
  static Vectorized<float> arange(float base = 0.f, step_t step = static_cast<step_t>(1)) {
    return Vectorized<float>(
      base,            base +     step, base + 2 * step, base + 3 * step,
      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
  }
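  // e.g. arange(10.f, 2) == {10, 12, 14, 16, 18, 20, 22, 24}.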
  static Vectorized<float> set(const Vectorized<float>& a, const Vectorized<float>& b,
                           int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
      case 1:
        return blend<1>(a, b);
      case 2:
        return blend<3>(a, b);
      case 3:
        return blend<7>(a, b);
      case 4:
        return blend<15>(a, b);
      case 5:
        return blend<31>(a, b);
      case 6:
        return blend<63>(a, b);
      case 7:
        return blend<127>(a, b);
    }
    return b;
  }
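  // set(a, b, count) takes the first `count` lanes from b and the rest from
  // a; e.g. set(a, b, 3) == {b0, b1, b2, a3, a4, a5, a6, a7}.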
  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
    if (count == size())
      return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
    __at_align__ float tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions while a loop would be compiled to one instruction.
    for (size_t i = 0; i < size(); ++i) {
      tmp_values[i] = 0.0;
    }
    std::memcpy(
        tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
    return _mm256_loadu_ps(tmp_values);
  }
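  // A partial load reads only `count` floats from ptr and zero-fills the
  // remaining lanes, so it is safe on buffer tails; e.g. (illustrative)
  // loadu(ptr, 5) touches ptr[0..4] only and leaves lanes 5-7 as 0.0f.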
  void store(void* ptr, int64_t count = size()) const {
    if (count == size()) {
      _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
    } else if (count > 0) {
      float tmp_values[size()];
      _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(float));
    }
  }
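  // Symmetrically, store(ptr, count) writes only the first `count` lanes,
  // staging through a stack buffer so no bytes past ptr[count - 1] are
  // touched.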
  const float& operator[](int idx) const = delete;
  float& operator[](int idx) = delete;
  int zero_mask() const {
    // Returns an integer bitmask with bit i set iff lane i compares equal to
    // zero.
    __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
    return _mm256_movemask_ps(cmp);
  }
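  // e.g. {0.f, 3.f, 0.f, 1.f, 1.f, 1.f, 1.f, 1.f}.zero_mask() == 0b00000101.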
  Vectorized<float> isnan() const {
    return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
  }
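  // _CMP_UNORD_Q is true iff an operand is NaN; since 0.0f never is, this
  // yields an all-ones mask in exactly the NaN lanes.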
  Vectorized<float> map(float (*const f)(float)) const {
    __at_align__ float tmp[size()];
    store(tmp);
    for (size_t i = 0; i < size(); ++i) {
      tmp[i] = f(tmp[i]);
    }
    return loadu(tmp);
  }
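  // map is the scalar fallback: it spills to a stack buffer, applies f
  // lane-by-lane, and reloads; useful for ops with no vector implementation.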
  Vectorized<float> abs() const {
    auto mask = _mm256_set1_ps(-0.f);
    return _mm256_andnot_ps(mask, values);
  }
  Vectorized<float> acos() const {
    return Vectorized<float>(Sleef_acosf8_u10(values));
  }
  Vectorized<float> asin() const {
    return Vectorized<float>(Sleef_asinf8_u10(values));
  }
  Vectorized<float> atan() const {
    return Vectorized<float>(Sleef_atanf8_u10(values));
  }
  Vectorized<float> atan2(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_atan2f8_u10(values, b));
  }
  Vectorized<float> copysign(const Vectorized<float> &sign) const {
    return Vectorized<float>(Sleef_copysignf8(values, sign));
  }
  Vectorized<float> erf() const {
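    // The coefficients below are the Abramowitz & Stegun 7.1.26 rational
    // approximation (max absolute error about 1.5e-7):
    //   erf(x) ~= sign(x) * (1 - (a1*t + a2*t^2 + ... + a5*t^5) * exp(-x^2)),
    //   with t = 1 / (1 + p*|x|).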
    // constants
    const auto neg_zero_vec = _mm256_set1_ps(-0.f);
    const auto one_vec = _mm256_set1_ps(1.0f);
    const auto p = _mm256_set1_ps(0.3275911f);
    const auto p1 = _mm256_set1_ps(0.254829592f);
    const auto p2 = _mm256_set1_ps(-0.284496736f);
    const auto p3 = _mm256_set1_ps(1.421413741f);
    const auto p4 = _mm256_set1_ps(-1.453152027f);
    const auto p5 = _mm256_set1_ps(1.061405429f);
    // sign(x)
    auto sign_mask = _mm256_and_ps(neg_zero_vec, values);
    auto abs_vec = _mm256_xor_ps(sign_mask, values);
    // t = 1 / (p * abs(x) + 1)
    auto tmp0 = _mm256_fmadd_ps(p, abs_vec, one_vec);
    auto t = _mm256_div_ps(one_vec, tmp0);
    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
    auto tmp1 = _mm256_fmadd_ps(p5, t, p4);
    auto tmp2 = _mm256_fmadd_ps(tmp1, t, p3);
    auto tmp3 = _mm256_fmadd_ps(tmp2, t, p2);
    auto r = _mm256_fmadd_ps(tmp3, t, p1);
    // - exp(- x * x)
    auto pow_2 = _mm256_mul_ps(values, values);
    auto neg_pow_2 = _mm256_xor_ps(neg_zero_vec, pow_2);
    // auto tmp4 = exp(neg_pow_2);
    auto tmp4 = Vectorized<float>(Sleef_expf8_u10(neg_pow_2));
    auto tmp5 = _mm256_xor_ps(neg_zero_vec, tmp4);
    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
    auto tmp6 = _mm256_mul_ps(tmp5, t);
    auto tmp7 = _mm256_fmadd_ps(tmp6, r, one_vec);
    return _mm256_xor_ps(sign_mask, tmp7);
  }
  Vectorized<float> erfc() const {
    return Vectorized<float>(Sleef_erfcf8_u15(values));
  }
  Vectorized<float> exp() const {
    return Vectorized<float>(Sleef_expf8_u10(values));
  }
  Vectorized<float> exp2() const {
    return Vectorized<float>(Sleef_exp2f8_u10(values));
  }
  Vectorized<float> expm1() const {
    return Vectorized<float>(Sleef_expm1f8_u10(values));
  }
  Vectorized<float> fmod(const Vectorized<float>& q) const {
    return Vectorized<float>(Sleef_fmodf8(values, q));
  }
  Vectorized<float> log() const {
    return Vectorized<float>(Sleef_logf8_u10(values));
  }
  Vectorized<float> log2() const {
    return Vectorized<float>(Sleef_log2f8_u10(values));
  }
  Vectorized<float> log10() const {
    return Vectorized<float>(Sleef_log10f8_u10(values));
  }
  Vectorized<float> log1p() const {
    return Vectorized<float>(Sleef_log1pf8_u10(values));
  }
  Vectorized<float> frac() const;
  Vectorized<float> sin() const {
    return Vectorized<float>(Sleef_sinf8_u35(values));
  }
  Vectorized<float> sinh() const {
    return Vectorized<float>(Sleef_sinhf8_u10(values));
  }
  Vectorized<float> cos() const {
    return Vectorized<float>(Sleef_cosf8_u35(values));
  }
  Vectorized<float> cosh() const {
    return Vectorized<float>(Sleef_coshf8_u10(values));
  }
  Vectorized<float> ceil() const {
    return _mm256_ceil_ps(values);
  }
  Vectorized<float> floor() const {
    return _mm256_floor_ps(values);
  }
  Vectorized<float> hypot(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_hypotf8_u05(values, b));
  }
  Vectorized<float> neg() const {
    return _mm256_xor_ps(_mm256_set1_ps(-0.f), values);
  }
  Vectorized<float> nextafter(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_nextafterf8(values, b));
  }
  Vectorized<float> round() const {
    return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  }
  Vectorized<float> tan() const {
    return Vectorized<float>(Sleef_tanf8_u10(values));
  }
  Vectorized<float> tanh() const {
    return Vectorized<float>(Sleef_tanhf8_u10(values));
  }
  Vectorized<float> trunc() const {
    return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
  }
  Vectorized<float> lgamma() const {
    return Vectorized<float>(Sleef_lgammaf8_u10(values));
  }
  Vectorized<float> sqrt() const {
    return _mm256_sqrt_ps(values);
  }
  Vectorized<float> reciprocal() const {
    return _mm256_div_ps(_mm256_set1_ps(1), values);
  }
  Vectorized<float> rsqrt() const {
    return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values));
  }
  Vectorized<float> pow(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_powf8_u10(values, b));
  }
  // Comparison using the _CMP_**_OQ predicate.
  //   `O`: get false if an operand is NaN
  //   `Q`: do not raise if an operand is NaN
  // operator!= is the exception: it uses the unordered _CMP_NEQ_UQ predicate
  // so that x != y is true when either operand is NaN, matching IEEE
  // semantics.
  Vectorized<float> operator==(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ);
  }

  Vectorized<float> operator!=(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ);
  }

  Vectorized<float> operator<(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ);
  }

  Vectorized<float> operator<=(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ);
  }

  Vectorized<float> operator>(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ);
  }

  Vectorized<float> operator>=(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ);
  }

  Vectorized<float> eq(const Vectorized<float>& other) const;
  Vectorized<float> ne(const Vectorized<float>& other) const;
  Vectorized<float> gt(const Vectorized<float>& other) const;
  Vectorized<float> ge(const Vectorized<float>& other) const;
  Vectorized<float> lt(const Vectorized<float>& other) const;
  Vectorized<float> le(const Vectorized<float>& other) const;
};
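
// A minimal usage sketch (illustrative only, not part of this header):
// scale a buffer in 8-lane chunks, using the count-aware loadu/store
// overloads for the tail. `scale_buf` is a hypothetical helper.
//
//   void scale_buf(const float* in, float* out, int64_t n, float s) {
//     constexpr int64_t kW = Vectorized<float>::size();  // 8 lanes
//     int64_t i = 0;
//     for (; i + kW <= n; i += kW) {
//       (Vectorized<float>::loadu(in + i) * Vectorized<float>(s)).store(out + i);
//     }
//     if (i < n) {  // tail: partial load/store never touch out-of-range bytes
//       (Vectorized<float>::loadu(in + i, n - i) * Vectorized<float>(s))
//           .store(out + i, n - i);
//     }
//   }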

template <>
Vectorized<float> inline operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_add_ps(a, b);
}

template <>
Vectorized<float> inline operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_sub_ps(a, b);
}

template <>
Vectorized<float> inline operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_mul_ps(a, b);
}

template <>
Vectorized<float> inline operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_div_ps(a, b);
}

// frac. Implemented here, after operator-, so we can use subtraction.
inline Vectorized<float> Vectorized<float>::frac() const {
  return *this - this->trunc();
}
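// frac keeps the sign of its input; e.g. frac(-1.5f) == -0.5f.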

// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
  Vectorized<float> max = _mm256_max_ps(a, b);
  Vectorized<float> isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
  // Exploit the fact that all-ones is a NaN.
  return _mm256_or_ps(max, isnan);
}
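// The fixup is needed because _mm256_max_ps alone returns its second operand
// whenever either input is NaN; OR-ing with the unordered-compare mask forces
// an all-ones (NaN) bit pattern into any lane where a or b is NaN.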

// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
  Vectorized<float> min = _mm256_min_ps(a, b);
  Vectorized<float> isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
  // Exploit the fact that all-ones is a NaN.
  return _mm256_or_ps(min, isnan);
}

template <>
Vectorized<float> inline clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
  return _mm256_min_ps(max, _mm256_max_ps(min, a));
}
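// Argument order matters here: _mm256_{min,max}_ps return their second
// operand when either input is NaN, so keeping `a` (or its result) in the
// second position lets a NaN lane in `a` propagate through both steps.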

template <>
Vectorized<float> inline clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
  return _mm256_min_ps(max, a);
}

template <>
Vectorized<float> inline clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
  return _mm256_max_ps(min, a);
}

template <>
Vectorized<float> inline operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_and_ps(a, b);
}

template <>
Vectorized<float> inline operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_or_ps(a, b);
}

template <>
Vectorized<float> inline operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_xor_ps(a, b);
}
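
// These bitwise operators act on the raw 32-bit lane patterns. Since the
// comparison operators return full-width masks (all-ones or all-zeros per
// lane), ANDing a mask with Vectorized<float>(1.0f) converts it into a
// 0.0/1.0 boolean vector, which is how eq/ne/gt/ge/lt/le below work.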

inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
  return (*this == other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
  return (*this != other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
  return (*this > other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
  return (*this >= other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
  return (*this < other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
  return (*this <= other) & Vectorized<float>(1.0f);
}

template <>
inline void convert(const float* src, float* dst, int64_t n) {
  int64_t i;
#pragma unroll
  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
    _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
  }
#pragma unroll
  for (; i < n; i++) {
    dst[i] = src[i];
  }
}


template <>
Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
  return _mm256_fmadd_ps(a, b, c);
}

template <>
Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
  return _mm256_fmsub_ps(a, b, c);
}
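// Both are single-rounding fused ops: fmadd(a, b, c) computes a * b + c and
// fmsub(a, b, c) computes a * b - c, each with one rounding step.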

// Used by Inductor CPP codegen
template<>
inline void transpose_mxn<float, 8, 8>(
    const float* src,
    int64_t ld_src,
    float* dst,
    int64_t ld_dst) {
  // load from src to registers
  // a: a0  a1  a2  a3  a4  a5  a6  a7
  // b: b0  b1  b2  b3  b4  b5  b6  b7
  // c: c0  c1  c2  c3  c4  c5  c6  c7
  // d: d0  d1  d2  d3  d4  d5  d6  d7
  // e: e0  e1  e2  e3  e4  e5  e6  e7
  // f: f0  f1  f2  f3  f4  f5  f6  f7
  // g: g0  g1  g2  g3  g4  g5  g6  g7
  // h: h0  h1  h2  h3  h4  h5  h6  h7
  __m256 a = _mm256_loadu_ps(&src[0 * ld_src]);
  __m256 b = _mm256_loadu_ps(&src[1 * ld_src]);
  __m256 c = _mm256_loadu_ps(&src[2 * ld_src]);
  __m256 d = _mm256_loadu_ps(&src[3 * ld_src]);
  __m256 e = _mm256_loadu_ps(&src[4 * ld_src]);
  __m256 f = _mm256_loadu_ps(&src[5 * ld_src]);
  __m256 g = _mm256_loadu_ps(&src[6 * ld_src]);
  __m256 h = _mm256_loadu_ps(&src[7 * ld_src]);

  __m256 ta, tb, tc, td, te, tf, tg, th;
  // unpacking and interleaving 32-bit elements
  // a0  b0  a1  b1  a4  b4  a5  b5
  // a2  b2  a3  b3  a6  b6  a7  b7
  // c0  d0  c1  d1 ...
  // c2  d2  c3  d3 ...
  // e0  f0  e1  f1 ...
  // e2  f2  e3  f3 ...
  // g0  h0  g1  h1 ...
  // g2  h2  g3  h3 ...
  ta = _mm256_unpacklo_ps(a, b);
  tb = _mm256_unpackhi_ps(a, b);
  tc = _mm256_unpacklo_ps(c, d);
  td = _mm256_unpackhi_ps(c, d);
  te = _mm256_unpacklo_ps(e, f);
  tf = _mm256_unpackhi_ps(e, f);
  tg = _mm256_unpacklo_ps(g, h);
  th = _mm256_unpackhi_ps(g, h);

  // unpacking and interleaving 64-bit elements
  //  a0  b0  c0  d0  a4  b4  c4  d4
  //  a1  b1  c1  d1 ...
  //  a2  b2  c2  d2 ...
  //  a3  b3  c3  d3 ...
  //  e0  f0  g0  h0  e4  f4  g4  h4
  //  e1  f1  g1  h1 ...
  //  e2  f2  g2  h2 ...
  //  e3  f3  g3  h3 ...
  a = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc)));
  b = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc)));
  c = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td)));
  d = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td)));
  e = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg)));
  f = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg)));
  g = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th)));
  h = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th)));

  //  shuffle 128-bits (composed of 4 32-bit elements)
  //  a0  b0  c0  d0  e0  f0  g0  h0
  //  a1  b1  c1  d1 ...
  //  a2  b2  c2  d2 ...
  //  a3  b3  c3  d3 ...
  //  a4  b4  c4  d4 ...
  //  a5  b5  c5  d5 ...
  //  a6  b6  c6  d6 ...
  //  a7  b7  c7  d7 ...
  ta = _mm256_permute2f128_ps(a, e, 0x20);
  tb = _mm256_permute2f128_ps(b, f, 0x20);
  tc = _mm256_permute2f128_ps(c, g, 0x20);
  td = _mm256_permute2f128_ps(d, h, 0x20);
  te = _mm256_permute2f128_ps(a, e, 0x31);
  tf = _mm256_permute2f128_ps(b, f, 0x31);
  tg = _mm256_permute2f128_ps(c, g, 0x31);
  th = _mm256_permute2f128_ps(d, h, 0x31);

  // store from registers to dst
  _mm256_storeu_ps(&dst[0 * ld_dst], ta);
  _mm256_storeu_ps(&dst[1 * ld_dst], tb);
  _mm256_storeu_ps(&dst[2 * ld_dst], tc);
  _mm256_storeu_ps(&dst[3 * ld_dst], td);
  _mm256_storeu_ps(&dst[4 * ld_dst], te);
  _mm256_storeu_ps(&dst[5 * ld_dst], tf);
  _mm256_storeu_ps(&dst[6 * ld_dst], tg);
  _mm256_storeu_ps(&dst[7 * ld_dst], th);
}
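
// Illustrative call (hypothetical row-major buffers): to transpose the 8x8
// tile at row r, column c of a matrix with leading dimension ld_src into the
// mirrored tile of dst:
//
//   transpose_mxn<float, 8, 8>(&src[r * ld_src + c], ld_src,
//                              &dst[c * ld_dst + r], ld_dst);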

#endif

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace executorch