/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]

#include <cstring> // std::memcpy

#include <executorch/kernels/optimized/vec/intrinsics.h>
#include <executorch/kernels/optimized/vec/vec_base.h>

#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#include <sleef.h>
#endif

namespace executorch {
namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)

template <> class Vectorized<float> {
private:
  __m256 values;
public:
  using value_type = float;
  using size_type = int;
  static constexpr size_type size() {
    return 8;
  }
  Vectorized() {}
  Vectorized(__m256 v) : values(v) {}
  Vectorized(float val) {
    values = _mm256_set1_ps(val);
  }
  Vectorized(float val1, float val2, float val3, float val4,
             float val5, float val6, float val7, float val8) {
    values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8);
  }
  operator __m256() const {
    return values;
  }
  template <int64_t mask>
  static Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b) {
    return _mm256_blend_ps(a.values, b.values, mask);
  }
  static Vectorized<float> blendv(const Vectorized<float>& a, const Vectorized<float>& b,
                                  const Vectorized<float>& mask) {
    return _mm256_blendv_ps(a.values, b.values, mask.values);
  }
  template<typename step_t>
  static Vectorized<float> arange(float base = 0.f, step_t step = static_cast<step_t>(1)) {
    return Vectorized<float>(
      base, base + step, base + 2 * step, base + 3 * step,
      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
  }
  static Vectorized<float> set(const Vectorized<float>& a, const Vectorized<float>& b,
                               int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
      case 1:
        return blend<1>(a, b);
      case 2:
        return blend<3>(a, b);
      case 3:
        return blend<7>(a, b);
      case 4:
        return blend<15>(a, b);
      case 5:
        return blend<31>(a, b);
      case 6:
        return blend<63>(a, b);
      case 7:
        return blend<127>(a, b);
    }
    return b;
  }
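  // For example, set(a, b, 3) keeps the first three lanes of b and the
  // remaining five lanes of a: {b0, b1, b2, a3, a4, a5, a6, a7}. The blend
  // mask used above is simply (1 << count) - 1, one mask bit per lane.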
  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
    if (count == size())
      return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
    __at_align__ float tmp_values[size()];
    // Ensure uninitialized memory does not change the output value.
    // See https://github.com/pytorch/pytorch/issues/32502 for more details.
    // We do not initialize arrays to zero using "={0}" because gcc would
    // compile it to two instructions while a loop would be compiled to one
    // instruction.
    for (size_t i = 0; i < size(); ++i) {
      tmp_values[i] = 0.0;
    }
    std::memcpy(
        tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
    return _mm256_loadu_ps(tmp_values);
  }
  void store(void* ptr, int64_t count = size()) const {
    if (count == size()) {
      _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
    } else if (count > 0) {
      float tmp_values[size()];
      _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(float));
    }
  }
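  // Partial loads and stores go through a stack temporary, so `count` may be
  // smaller than size(). A minimal tail-handling sketch (buffer names are
  // illustrative):
  //
  //   float in[5] = {1, 2, 3, 4, 5};
  //   float out[5];
  //   auto v = Vectorized<float>::loadu(in, 5);      // lanes 5..7 are zero
  //   (v * Vectorized<float>(2.0f)).store(out, 5);   // writes only 5 floats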
  const float& operator[](int idx) const = delete;
  float& operator[](int idx) = delete;
  int zero_mask() const {
    // returns an integer mask where all zero elements are translated to 1-bit
    // and others are translated to 0-bit
    __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
    return _mm256_movemask_ps(cmp);
  }
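  // For example, zero_mask() on {0.f, 3.f, 0.f, 0.f, 5.f, 6.f, 7.f, 8.f}
  // returns 0b00001101 (bit i is set when lane i equals 0.0f).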
  Vectorized<float> isnan() const {
    return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
  }
  Vectorized<float> map(float (*const f)(float)) const {
    __at_align__ float tmp[size()];
    store(tmp);
    for (size_t i = 0; i < size(); ++i) {
      tmp[i] = f(tmp[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> abs() const {
    auto mask = _mm256_set1_ps(-0.f);
    return _mm256_andnot_ps(mask, values);
  }
  Vectorized<float> acos() const {
    return Vectorized<float>(Sleef_acosf8_u10(values));
  }
  Vectorized<float> asin() const {
    return Vectorized<float>(Sleef_asinf8_u10(values));
  }
  Vectorized<float> atan() const {
    return Vectorized<float>(Sleef_atanf8_u10(values));
  }
  Vectorized<float> atan2(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_atan2f8_u10(values, b));
  }
  Vectorized<float> copysign(const Vectorized<float> &sign) const {
    return Vectorized<float>(Sleef_copysignf8(values, sign));
  }
  Vectorized<float> erf() const {
    // Polynomial constants for the Abramowitz & Stegun 7.1.26 approximation
    // of erf (maximum absolute error around 1.5e-7).
    const auto neg_zero_vec = _mm256_set1_ps(-0.f);
    const auto one_vec = _mm256_set1_ps(1.0f);
    const auto p = _mm256_set1_ps(0.3275911f);
    const auto p1 = _mm256_set1_ps(0.254829592f);
    const auto p2 = _mm256_set1_ps(-0.284496736f);
    const auto p3 = _mm256_set1_ps(1.421413741f);
    const auto p4 = _mm256_set1_ps(-1.453152027f);
    const auto p5 = _mm256_set1_ps(1.061405429f);
    // sign(x)
    auto sign_mask = _mm256_and_ps(neg_zero_vec, values);
    auto abs_vec = _mm256_xor_ps(sign_mask, values);
    // t = 1 / (p * abs(x) + 1)
    auto tmp0 = _mm256_fmadd_ps(p, abs_vec, one_vec);
    auto t = _mm256_div_ps(one_vec, tmp0);
    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
    auto tmp1 = _mm256_fmadd_ps(p5, t, p4);
    auto tmp2 = _mm256_fmadd_ps(tmp1, t, p3);
    auto tmp3 = _mm256_fmadd_ps(tmp2, t, p2);
    auto r = _mm256_fmadd_ps(tmp3, t, p1);
    // - exp(- x * x)
    auto pow_2 = _mm256_mul_ps(values, values);
    auto neg_pow_2 = _mm256_xor_ps(neg_zero_vec, pow_2);
    // auto tmp4 = exp(neg_pow_2);
    auto tmp4 = Vectorized<float>(Sleef_expf8_u10(neg_pow_2));
    auto tmp5 = _mm256_xor_ps(neg_zero_vec, tmp4);
    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
    auto tmp6 = _mm256_mul_ps(tmp5, t);
    auto tmp7 = _mm256_fmadd_ps(tmp6, r, one_vec);
    return _mm256_xor_ps(sign_mask, tmp7);
  }
  Vectorized<float> erfc() const {
    return Vectorized<float>(Sleef_erfcf8_u15(values));
  }
  Vectorized<float> exp() const {
    return Vectorized<float>(Sleef_expf8_u10(values));
  }
  Vectorized<float> exp2() const {
    return Vectorized<float>(Sleef_exp2f8_u10(values));
  }
  Vectorized<float> expm1() const {
    return Vectorized<float>(Sleef_expm1f8_u10(values));
  }
  Vectorized<float> fmod(const Vectorized<float>& q) const {
    return Vectorized<float>(Sleef_fmodf8(values, q));
  }
  Vectorized<float> log() const {
    return Vectorized<float>(Sleef_logf8_u10(values));
  }
  Vectorized<float> log2() const {
    return Vectorized<float>(Sleef_log2f8_u10(values));
  }
  Vectorized<float> log10() const {
    return Vectorized<float>(Sleef_log10f8_u10(values));
  }
  Vectorized<float> log1p() const {
    return Vectorized<float>(Sleef_log1pf8_u10(values));
  }
  Vectorized<float> frac() const;
  Vectorized<float> sin() const {
    return Vectorized<float>(Sleef_sinf8_u35(values));
  }
  Vectorized<float> sinh() const {
    return Vectorized<float>(Sleef_sinhf8_u10(values));
  }
  Vectorized<float> cos() const {
    return Vectorized<float>(Sleef_cosf8_u35(values));
  }
  Vectorized<float> cosh() const {
    return Vectorized<float>(Sleef_coshf8_u10(values));
  }
  Vectorized<float> ceil() const {
    return _mm256_ceil_ps(values);
  }
  Vectorized<float> floor() const {
    return _mm256_floor_ps(values);
  }
  Vectorized<float> hypot(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_hypotf8_u05(values, b));
  }
  Vectorized<float> neg() const {
    return _mm256_xor_ps(_mm256_set1_ps(-0.f), values);
  }
  Vectorized<float> nextafter(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_nextafterf8(values, b));
  }
  Vectorized<float> round() const {
    return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  }
  Vectorized<float> tan() const {
    return Vectorized<float>(Sleef_tanf8_u10(values));
  }
  Vectorized<float> tanh() const {
    return Vectorized<float>(Sleef_tanhf8_u10(values));
  }
  Vectorized<float> trunc() const {
    return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
  }
  Vectorized<float> lgamma() const {
    return Vectorized<float>(Sleef_lgammaf8_u10(values));
  }
  Vectorized<float> sqrt() const {
    return _mm256_sqrt_ps(values);
  }
  Vectorized<float> reciprocal() const {
    return _mm256_div_ps(_mm256_set1_ps(1), values);
  }
  Vectorized<float> rsqrt() const {
    return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values));
  }
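  // Note: reciprocal() and rsqrt() above use a true divide (and sqrt) rather
  // than the approximate _mm256_rcp_ps / _mm256_rsqrt_ps intrinsics, which
  // are only accurate to roughly 12 bits; the results therefore match the
  // scalar 1.0f / x and 1.0f / std::sqrt(x) much more closely.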
  Vectorized<float> pow(const Vectorized<float> &b) const {
    return Vectorized<float>(Sleef_powf8_u10(values, b));
  }
  // Comparison using the _CMP_**_OQ predicate.
  //   `O`: get false if an operand is NaN
  //   `Q`: do not raise if an operand is NaN
  // The one exception is operator!=, which uses _CMP_NEQ_UQ so that a
  // comparison involving a NaN operand yields true.
  Vectorized<float> operator==(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ);
  }

  Vectorized<float> operator!=(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ);
  }

  Vectorized<float> operator<(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ);
  }

  Vectorized<float> operator<=(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ);
  }

  Vectorized<float> operator>(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ);
  }

  Vectorized<float> operator>=(const Vectorized<float>& other) const {
    return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ);
  }

  Vectorized<float> eq(const Vectorized<float>& other) const;
  Vectorized<float> ne(const Vectorized<float>& other) const;
  Vectorized<float> gt(const Vectorized<float>& other) const;
  Vectorized<float> ge(const Vectorized<float>& other) const;
  Vectorized<float> lt(const Vectorized<float>& other) const;
  Vectorized<float> le(const Vectorized<float>& other) const;
};

template <>
Vectorized<float> inline operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_add_ps(a, b);
}

template <>
Vectorized<float> inline operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_sub_ps(a, b);
}

template <>
Vectorized<float> inline operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_mul_ps(a, b);
}

template <>
Vectorized<float> inline operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_div_ps(a, b);
}

// frac. Implement this here so we can use subtraction
inline Vectorized<float> Vectorized<float>::frac() const {
  return *this - this->trunc();
}

// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
  Vectorized<float> max = _mm256_max_ps(a, b);
  Vectorized<float> isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
  // Exploit the fact that all-ones is a NaN.
  return _mm256_or_ps(max, isnan);
}

// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
  Vectorized<float> min = _mm256_min_ps(a, b);
  Vectorized<float> isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
  // Exploit the fact that all-ones is a NaN.
  return _mm256_or_ps(min, isnan);
}
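
// For example, maximum({NAN, 1.f, ...}, {2.f, 3.f, ...}) yields {NAN, 3.f, ...}:
// the unordered compare produces an all-ones lane wherever either input is
// NaN, and OR-ing all-ones into the max/min result leaves a NaN bit pattern
// in that lane.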

template <>
Vectorized<float> inline clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
  return _mm256_min_ps(max, _mm256_max_ps(min, a));
}

template <>
Vectorized<float> inline clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
  return _mm256_min_ps(max, a);
}

template <>
Vectorized<float> inline clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
  return _mm256_max_ps(min, a);
}

template <>
Vectorized<float> inline operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_and_ps(a, b);
}

template <>
Vectorized<float> inline operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_or_ps(a, b);
}

template <>
Vectorized<float> inline operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
  return _mm256_xor_ps(a, b);
}

inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
  return (*this == other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
  return (*this != other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
  return (*this > other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
  return (*this >= other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
  return (*this < other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
  return (*this <= other) & Vectorized<float>(1.0f);
}
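
// The comparison operators (==, <, ...) return lane-wise bitmasks (all-ones
// for true, all-zeros for false), whereas eq/ne/gt/ge/lt/le mask the constant
// 1.0f and therefore produce numeric 0.0f / 1.0f results. For example,
// Vectorized<float>(1.f).eq(Vectorized<float>(1.f)) is {1.f, 1.f, ..., 1.f}.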

template <>
inline void convert(const float* src, float* dst, int64_t n) {
  int64_t i;
#pragma unroll
  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
    _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
  }
#pragma unroll
  for (; i < n; i++) {
    dst[i] = src[i];
  }
}


template <>
Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
  return _mm256_fmadd_ps(a, b, c);
}

template <>
Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
  return _mm256_fmsub_ps(a, b, c);
}
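
// A minimal usage sketch (the function and buffer names are illustrative):
// computing dst[i] = a[i] * b[i] + c[i] over a float buffer with a full-width
// main loop and a partial-count tail handled by loadu/store.
//
//   void fused_multiply_add(const float* a, const float* b, const float* c,
//                           float* dst, int64_t n) {
//     using Vec = executorch::vec::Vectorized<float>;
//     int64_t i = 0;
//     for (; i + Vec::size() <= n; i += Vec::size()) {
//       fmadd(Vec::loadu(a + i), Vec::loadu(b + i), Vec::loadu(c + i))
//           .store(dst + i);
//     }
//     if (i < n) {
//       const int64_t rest = n - i;
//       fmadd(Vec::loadu(a + i, rest), Vec::loadu(b + i, rest),
//             Vec::loadu(c + i, rest))
//           .store(dst + i, rest);
//     }
//   }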

// Used by Inductor CPP codegen
template<>
inline void transpose_mxn<float, 8, 8>(
    const float* src,
    int64_t ld_src,
    float* dst,
    int64_t ld_dst) {
  // load from src to registers
  // a: a0 a1 a2 a3 a4 a5 a6 a7
  // b: b0 b1 b2 b3 b4 b5 b6 b7
  // c: c0 c1 c2 c3 c4 c5 c6 c7
  // d: d0 d1 d2 d3 d4 d5 d6 d7
  // e: e0 e1 e2 e3 e4 e5 e6 e7
  // f: f0 f1 f2 f3 f4 f5 f6 f7
  // g: g0 g1 g2 g3 g4 g5 g6 g7
  // h: h0 h1 h2 h3 h4 h5 h6 h7
  __m256 a = _mm256_loadu_ps(&src[0 * ld_src]);
  __m256 b = _mm256_loadu_ps(&src[1 * ld_src]);
  __m256 c = _mm256_loadu_ps(&src[2 * ld_src]);
  __m256 d = _mm256_loadu_ps(&src[3 * ld_src]);
  __m256 e = _mm256_loadu_ps(&src[4 * ld_src]);
  __m256 f = _mm256_loadu_ps(&src[5 * ld_src]);
  __m256 g = _mm256_loadu_ps(&src[6 * ld_src]);
  __m256 h = _mm256_loadu_ps(&src[7 * ld_src]);

  __m256 ta, tb, tc, td, te, tf, tg, th;
  // unpacking and interleaving 32-bit elements
  // a0 b0 a1 b1 a4 b4 a5 b5
  // a2 b2 a3 b3 a6 b6 a7 b7
  // c0 d0 c1 d1 ...
  // c2 d2 c3 d3 ...
  // e0 f0 e1 f1 ...
  // e2 f2 e3 f3 ...
  // g0 h0 g1 h1 ...
  // g2 h2 g3 h3 ...
  ta = _mm256_unpacklo_ps(a, b);
  tb = _mm256_unpackhi_ps(a, b);
  tc = _mm256_unpacklo_ps(c, d);
  td = _mm256_unpackhi_ps(c, d);
  te = _mm256_unpacklo_ps(e, f);
  tf = _mm256_unpackhi_ps(e, f);
  tg = _mm256_unpacklo_ps(g, h);
  th = _mm256_unpackhi_ps(g, h);

  // unpacking and interleaving 64-bit elements
  // a0 b0 c0 d0 a4 b4 c4 d4
  // a1 b1 c1 d1 ...
  // a2 b2 c2 d2 ...
  // a3 b3 c3 d3 ...
  // e0 f0 g0 h0 e4 f4 g4 h4
  // e1 f1 g1 h1 ...
  // e2 f2 g2 h2 ...
  // e3 f3 g3 h3 ...
  a = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc)));
  b = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc)));
  c = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td)));
  d = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td)));
  e = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg)));
  f = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg)));
  g = _mm256_castpd_ps(
      _mm256_unpacklo_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th)));
  h = _mm256_castpd_ps(
      _mm256_unpackhi_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th)));

  // shuffle 128-bits (composed of 4 32-bit elements)
  // a0 b0 c0 d0 e0 f0 g0 h0
  // a1 b1 c1 d1 ...
  // a2 b2 c2 d2 ...
  // a3 b3 c3 d3 ...
  // a4 b4 c4 d4 ...
  // a5 b5 c5 d5 ...
  // a6 b6 c6 d6 ...
  // a7 b7 c7 d7 ...
  ta = _mm256_permute2f128_ps(a, e, 0x20);
  tb = _mm256_permute2f128_ps(b, f, 0x20);
  tc = _mm256_permute2f128_ps(c, g, 0x20);
  td = _mm256_permute2f128_ps(d, h, 0x20);
  te = _mm256_permute2f128_ps(a, e, 0x31);
  tf = _mm256_permute2f128_ps(b, f, 0x31);
  tg = _mm256_permute2f128_ps(c, g, 0x31);
  th = _mm256_permute2f128_ps(d, h, 0x31);

  // store from registers to dst
  _mm256_storeu_ps(&dst[0 * ld_dst], ta);
  _mm256_storeu_ps(&dst[1 * ld_dst], tb);
  _mm256_storeu_ps(&dst[2 * ld_dst], tc);
  _mm256_storeu_ps(&dst[3 * ld_dst], td);
  _mm256_storeu_ps(&dst[4 * ld_dst], te);
  _mm256_storeu_ps(&dst[5 * ld_dst], tf);
  _mm256_storeu_ps(&dst[6 * ld_dst], tg);
  _mm256_storeu_ps(&dst[7 * ld_dst], th);
}
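
// For example, to transpose one 8x8 tile of a row-major M x N matrix `in`
// into a row-major N x M matrix `out`, with the tile origin at (r, c)
// (M, N, r, c and the buffer names are illustrative):
//
//   transpose_mxn<float, 8, 8>(
//       /*src=*/in + r * N + c, /*ld_src=*/N,
//       /*dst=*/out + c * M + r, /*ld_dst=*/M);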

#endif

}}}
