xref: /aosp_15_r20/external/eigen/Eigen/src/Core/arch/MSA/PacketMath.h (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2018 Wave Computing, Inc.
5 // Written by:
6 //   Chris Larsen
7 //   Alexey Frunze ([email protected])
8 //
9 // This Source Code Form is subject to the terms of the Mozilla
10 // Public License v. 2.0. If a copy of the MPL was not distributed
11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12 
13 #ifndef EIGEN_PACKET_MATH_MSA_H
14 #define EIGEN_PACKET_MATH_MSA_H
15 
16 #include <iostream>
17 #include <string>
18 
19 namespace Eigen {
20 
21 namespace internal {
22 
// Backend configuration macros. Each is defined here only when nothing
// earlier in the translation unit has already defined it.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif  // EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD

// Defined with no value; only its presence matters.
#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD

#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif  // EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
34 
// Tracing hook placed at the top of the MSA primitives. Flip the `#if 0` to
// `#if 1` to print file:line:function the first time each primitive runs;
// otherwise the macro expands to nothing.
#if 0
#define EIGEN_MSA_DEBUG \
  static bool firstTime = true; \
  do { \
    if (firstTime) { \
      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
      firstTime = false; \
    } \
  } while (0)
#else
#define EIGEN_MSA_DEBUG
#endif
47 
// Packs four 2-bit lane selectors into the 8-bit immediate taken by the shf.*
// builtins: `a` lands in bits 1:0, `b` in 3:2, `c` in 5:4 and `d` in 7:6.
#define EIGEN_MSA_SHF_I8(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
49 
50 typedef v4f32 Packet4f;
51 typedef v4i32 Packet4i;
52 typedef v4u32 Packet4ui;
53 
54 #define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
55 #define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
56 #define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
57 
58 inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
59   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
60   return os;
61 }
62 
63 inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
64   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
65   return os;
66 }
67 
68 inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
69   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
70   return os;
71 }
72 
73 template <>
74 struct packet_traits<float> : default_packet_traits {
75   typedef Packet4f type;
76   typedef Packet4f half;  // Packet2f intrinsics not implemented yet
77   enum {
78     Vectorizable = 1,
79     AlignedOnScalar = 1,
80     size = 4,
81     HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
82     // FIXME check the Has*
83     HasDiv = 1,
84     HasSin = EIGEN_FAST_MATH,
85     HasCos = EIGEN_FAST_MATH,
86     HasTanh = EIGEN_FAST_MATH,
87     HasErf = EIGEN_FAST_MATH,
88     HasLog = 1,
89     HasExp = 1,
90     HasSqrt = 1,
91     HasRsqrt = 1,
92     HasRound = 1,
93     HasFloor = 1,
94     HasCeil = 1,
95     HasBlend = 1
96   };
97 };
98 
99 template <>
100 struct packet_traits<int32_t> : default_packet_traits {
101   typedef Packet4i type;
102   typedef Packet4i half;  // Packet2i intrinsics not implemented yet
103   enum {
104     Vectorizable = 1,
105     AlignedOnScalar = 1,
106     size = 4,
107     HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
108     // FIXME check the Has*
109     HasDiv = 1,
110     HasBlend = 1
111   };
112 };
113 
114 template <>
115 struct unpacket_traits<Packet4f> {
116   typedef float type;
117   enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
118   typedef Packet4f half;
119 };
120 
121 template <>
122 struct unpacket_traits<Packet4i> {
123   typedef int32_t type;
124   enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
125   typedef Packet4i half;
126 };
127 
128 template <>
129 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
130   EIGEN_MSA_DEBUG;
131 
132   Packet4f v = { from, from, from, from };
133   return v;
134 }
135 
136 template <>
137 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
138   EIGEN_MSA_DEBUG;
139 
140   return __builtin_msa_fill_w(from);
141 }
142 
143 template <>
144 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
145   EIGEN_MSA_DEBUG;
146 
147   float f = *from;
148   Packet4f v = { f, f, f, f };
149   return v;
150 }
151 
152 template <>
153 EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
154   EIGEN_MSA_DEBUG;
155 
156   return __builtin_msa_fill_w(*from);
157 }
158 
159 template <>
160 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
161   EIGEN_MSA_DEBUG;
162 
163   return __builtin_msa_fadd_w(a, b);
164 }
165 
166 template <>
167 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
168   EIGEN_MSA_DEBUG;
169 
170   return __builtin_msa_addv_w(a, b);
171 }
172 
173 template <>
174 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
175   EIGEN_MSA_DEBUG;
176 
177   static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
178   return padd(pset1<Packet4f>(a), countdown);
179 }
180 
181 template <>
182 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
183   EIGEN_MSA_DEBUG;
184 
185   static const Packet4i countdown = { 0, 1, 2, 3 };
186   return padd(pset1<Packet4i>(a), countdown);
187 }
188 
189 template <>
190 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
191   EIGEN_MSA_DEBUG;
192 
193   return __builtin_msa_fsub_w(a, b);
194 }
195 
196 template <>
197 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
198   EIGEN_MSA_DEBUG;
199 
200   return __builtin_msa_subv_w(a, b);
201 }
202 
203 template <>
204 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
205   EIGEN_MSA_DEBUG;
206 
207   return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
208 }
209 
210 template <>
211 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
212   EIGEN_MSA_DEBUG;
213 
214   return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
215 }
216 
217 template <>
218 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
219   EIGEN_MSA_DEBUG;
220 
221   return a;
222 }
223 
224 template <>
225 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
226   EIGEN_MSA_DEBUG;
227 
228   return a;
229 }
230 
231 template <>
232 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
233   EIGEN_MSA_DEBUG;
234 
235   return __builtin_msa_fmul_w(a, b);
236 }
237 
238 template <>
239 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
240   EIGEN_MSA_DEBUG;
241 
242   return __builtin_msa_mulv_w(a, b);
243 }
244 
245 template <>
246 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
247   EIGEN_MSA_DEBUG;
248 
249   return __builtin_msa_fdiv_w(a, b);
250 }
251 
252 template <>
253 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
254   EIGEN_MSA_DEBUG;
255 
256   return __builtin_msa_div_s_w(a, b);
257 }
258 
259 template <>
260 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
261   EIGEN_MSA_DEBUG;
262 
263   return __builtin_msa_fmadd_w(c, a, b);
264 }
265 
266 template <>
267 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
268   EIGEN_MSA_DEBUG;
269 
270   // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
271   Packet4i value = c;
272   __asm__("maddv.w %w[value], %w[a], %w[b]\n"
273           // Outputs
274           : [value] "+f"(value)
275           // Inputs
276           : [a] "f"(a), [b] "f"(b));
277   return value;
278 }
279 
280 template <>
281 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
282   EIGEN_MSA_DEBUG;
283 
284   return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
285 }
286 
287 template <>
288 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
289   EIGEN_MSA_DEBUG;
290 
291   return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
292 }
293 
294 template <>
295 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
296   EIGEN_MSA_DEBUG;
297 
298   return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
299 }
300 
301 template <>
302 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
303   EIGEN_MSA_DEBUG;
304 
305   return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
306 }
307 
308 template <>
309 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
310   EIGEN_MSA_DEBUG;
311 
312   return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
313 }
314 
315 template <>
316 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
317   EIGEN_MSA_DEBUG;
318 
319   return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
320 }
321 
322 template <>
323 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
324   EIGEN_MSA_DEBUG;
325 
326   return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
327 }
328 
329 template <>
330 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
331   EIGEN_MSA_DEBUG;
332 
333   return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
334 }
335 
336 template <>
337 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
338   EIGEN_MSA_DEBUG;
339 
340 #if EIGEN_FAST_MATH
341   // This prefers numbers to NaNs.
342   return __builtin_msa_fmin_w(a, b);
343 #else
344   // This prefers NaNs to numbers.
345   Packet4i aNaN = __builtin_msa_fcun_w(a, a);
346   Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
347   return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
348 #endif
349 }
350 
351 template <>
352 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
353   EIGEN_MSA_DEBUG;
354 
355   return __builtin_msa_min_s_w(a, b);
356 }
357 
358 template <>
359 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
360   EIGEN_MSA_DEBUG;
361 
362 #if EIGEN_FAST_MATH
363   // This prefers numbers to NaNs.
364   return __builtin_msa_fmax_w(a, b);
365 #else
366   // This prefers NaNs to numbers.
367   Packet4i aNaN = __builtin_msa_fcun_w(a, a);
368   Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
369   return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
370 #endif
371 }
372 
373 template <>
374 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
375   EIGEN_MSA_DEBUG;
376 
377   return __builtin_msa_max_s_w(a, b);
378 }
379 
380 template <>
381 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
382   EIGEN_MSA_DEBUG;
383 
384   EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
385 }
386 
387 template <>
388 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
389   EIGEN_MSA_DEBUG;
390 
391   EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
392 }
393 
394 template <>
395 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
396   EIGEN_MSA_DEBUG;
397 
398   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
399 }
400 
401 template <>
402 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
403   EIGEN_MSA_DEBUG;
404 
405   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
406 }
407 
408 template <>
409 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
410   EIGEN_MSA_DEBUG;
411 
412   float f0 = from[0], f1 = from[1];
413   Packet4f v0 = { f0, f0, f0, f0 };
414   Packet4f v1 = { f1, f1, f1, f1 };
415   return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
416 }
417 
418 template <>
419 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
420   EIGEN_MSA_DEBUG;
421 
422   int32_t i0 = from[0], i1 = from[1];
423   Packet4i v0 = { i0, i0, i0, i0 };
424   Packet4i v1 = { i1, i1, i1, i1 };
425   return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
426 }
427 
428 template <>
429 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
430   EIGEN_MSA_DEBUG;
431 
432   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
433 }
434 
435 template <>
436 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
437   EIGEN_MSA_DEBUG;
438 
439   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
440 }
441 
442 template <>
443 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
444   EIGEN_MSA_DEBUG;
445 
446   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
447 }
448 
449 template <>
450 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
451   EIGEN_MSA_DEBUG;
452 
453   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
454 }
455 
456 template <>
457 EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
458   EIGEN_MSA_DEBUG;
459 
460   float f = *from;
461   Packet4f v = { f, f, f, f };
462   v[1] = from[stride];
463   v[2] = from[2 * stride];
464   v[3] = from[3 * stride];
465   return v;
466 }
467 
468 template <>
469 EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
470   EIGEN_MSA_DEBUG;
471 
472   int32_t i = *from;
473   Packet4i v = { i, i, i, i };
474   v[1] = from[stride];
475   v[2] = from[2 * stride];
476   v[3] = from[3 * stride];
477   return v;
478 }
479 
480 template <>
481 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
482                                                         Index stride) {
483   EIGEN_MSA_DEBUG;
484 
485   *to = from[0];
486   to += stride;
487   *to = from[1];
488   to += stride;
489   *to = from[2];
490   to += stride;
491   *to = from[3];
492 }
493 
494 template <>
495 EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
496                                                           Index stride) {
497   EIGEN_MSA_DEBUG;
498 
499   *to = from[0];
500   to += stride;
501   *to = from[1];
502   to += stride;
503   *to = from[2];
504   to += stride;
505   *to = from[3];
506 }
507 
508 template <>
509 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
510   EIGEN_MSA_DEBUG;
511 
512   __builtin_prefetch(addr);
513 }
514 
515 template <>
516 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
517   EIGEN_MSA_DEBUG;
518 
519   __builtin_prefetch(addr);
520 }
521 
522 template <>
523 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
524   EIGEN_MSA_DEBUG;
525 
526   return a[0];
527 }
528 
529 template <>
530 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
531   EIGEN_MSA_DEBUG;
532 
533   return a[0];
534 }
535 
536 template <>
537 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
538   EIGEN_MSA_DEBUG;
539 
540   return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
541 }
542 
543 template <>
544 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
545   EIGEN_MSA_DEBUG;
546 
547   return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
548 }
549 
550 template <>
551 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
552   EIGEN_MSA_DEBUG;
553 
554   return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
555 }
556 
557 template <>
558 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
559   EIGEN_MSA_DEBUG;
560 
561   Packet4i zero = __builtin_msa_ldi_w(0);
562   return __builtin_msa_add_a_w(zero, a);
563 }
564 
565 template <>
566 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
567   EIGEN_MSA_DEBUG;
568 
569   Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
570   s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
571   return s[0];
572 }
573 
574 
575 template <>
576 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
577   EIGEN_MSA_DEBUG;
578 
579   Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
580   s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
581   return s[0];
582 }
583 
584 // Other reduction functions:
585 // mul
586 template <>
587 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
588   EIGEN_MSA_DEBUG;
589 
590   Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
591   p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
592   return p[0];
593 }
594 
595 template <>
596 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
597   EIGEN_MSA_DEBUG;
598 
599   Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
600   p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
601   return p[0];
602 }
603 
604 // min
605 template <>
606 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
607   EIGEN_MSA_DEBUG;
608 
609   // Swap 64-bit halves of a.
610   Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
611 #if !EIGEN_FAST_MATH
612   // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
613   // masks of all zeroes/ones in low 64 bits.
614   v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
615   // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
616   unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
617 #endif
618   // Continue with min computation.
619   Packet4f v = __builtin_msa_fmin_w(a, swapped);
620   v = __builtin_msa_fmin_w(
621       v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
622 #if !EIGEN_FAST_MATH
623   // Based on the mask select between v and 4 qNaNs.
624   v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
625   v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
626 #endif
627   return v[0];
628 }
629 
630 template <>
631 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
632   EIGEN_MSA_DEBUG;
633 
634   Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
635   m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
636   return m[0];
637 }
638 
639 // max
640 template <>
641 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
642   EIGEN_MSA_DEBUG;
643 
644   // Swap 64-bit halves of a.
645   Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
646 #if !EIGEN_FAST_MATH
647   // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
648   // masks of all zeroes/ones in low 64 bits.
649   v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
650   // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
651   unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
652 #endif
653   // Continue with max computation.
654   Packet4f v = __builtin_msa_fmax_w(a, swapped);
655   v = __builtin_msa_fmax_w(
656       v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
657 #if !EIGEN_FAST_MATH
658   // Based on the mask select between v and 4 qNaNs.
659   v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
660   v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
661 #endif
662   return v[0];
663 }
664 
665 template <>
666 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
667   EIGEN_MSA_DEBUG;
668 
669   Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
670   m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
671   return m[0];
672 }
673 
674 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
675   os << "[ " << value.packet[0] << "," << std::endl
676      << "  " << value.packet[1] << "," << std::endl
677      << "  " << value.packet[2] << "," << std::endl
678      << "  " << value.packet[3] << " ]";
679   return os;
680 }
681 
682 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
683   EIGEN_MSA_DEBUG;
684 
685   v4i32 tmp1, tmp2, tmp3, tmp4;
686 
687   tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
688   tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
689   tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
690   tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
691 
692   kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
693   kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
694   kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
695   kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
696 }
697 
698 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
699   os << "[ " << value.packet[0] << "," << std::endl
700      << "  " << value.packet[1] << "," << std::endl
701      << "  " << value.packet[2] << "," << std::endl
702      << "  " << value.packet[3] << " ]";
703   return os;
704 }
705 
706 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
707   EIGEN_MSA_DEBUG;
708 
709   v4i32 tmp1, tmp2, tmp3, tmp4;
710 
711   tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
712   tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
713   tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
714   tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
715 
716   kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
717   kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
718   kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
719   kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
720 }
721 
722 template <>
723 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
724   EIGEN_MSA_DEBUG;
725 
726   return __builtin_msa_fsqrt_w(a);
727 }
728 
729 template <>
730 EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
731   EIGEN_MSA_DEBUG;
732 
733 #if EIGEN_FAST_MATH
734   return __builtin_msa_frsqrt_w(a);
735 #else
736   Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
737   return pdiv(ones, psqrt(a));
738 #endif
739 }
740 
741 template <>
742 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
743   Packet4f v = a;
744   int32_t old_mode, new_mode;
745   asm volatile(
746       "cfcmsa  %[old_mode], $1\n"
747       "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
748       "ctcmsa  $1, %[new_mode]\n"
749       "frint.w %w[v], %w[v]\n"
750       "ctcmsa  $1, %[old_mode]\n"
751       :  // outputs
752       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
753       [v] "+f"(v)
754       :  // inputs
755       :  // clobbers
756   );
757   return v;
758 }
759 
760 template <>
761 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
762   Packet4f v = a;
763   int32_t old_mode, new_mode;
764   asm volatile(
765       "cfcmsa  %[old_mode], $1\n"
766       "ori     %[new_mode], %[old_mode], 3\n"
767       "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
768       "ctcmsa  $1, %[new_mode]\n"
769       "frint.w %w[v], %w[v]\n"
770       "ctcmsa  $1, %[old_mode]\n"
771       :  // outputs
772       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
773       [v] "+f"(v)
774       :  // inputs
775       :  // clobbers
776   );
777   return v;
778 }
779 
780 template <>
781 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
782   Packet4f v = a;
783   int32_t old_mode, new_mode;
784   asm volatile(
785       "cfcmsa  %[old_mode], $1\n"
786       "ori     %[new_mode], %[old_mode], 3\n"
787       "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
788       "ctcmsa  $1, %[new_mode]\n"
789       "frint.w %w[v], %w[v]\n"
790       "ctcmsa  $1, %[old_mode]\n"
791       :  // outputs
792       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
793       [v] "+f"(v)
794       :  // inputs
795       :  // clobbers
796   );
797   return v;
798 }
799 
800 template <>
801 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
802                                     const Packet4f& elsePacket) {
803   Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
804                        ifPacket.select[3] };
805   Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
806   return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
807 }
808 
809 template <>
810 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
811                                     const Packet4i& elsePacket) {
812   Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
813                        ifPacket.select[3] };
814   Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
815   return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
816 }
817 
818 //---------- double ----------
819 
820 typedef v2f64 Packet2d;
821 typedef v2i64 Packet2l;
822 typedef v2u64 Packet2ul;
823 
824 #define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
825 #define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
826 #define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
827 
828 inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
829   os << "[ " << value[0] << ", " << value[1] << " ]";
830   return os;
831 }
832 
833 inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
834   os << "[ " << value[0] << ", " << value[1] << " ]";
835   return os;
836 }
837 
838 inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
839   os << "[ " << value[0] << ", " << value[1] << " ]";
840   return os;
841 }
842 
843 template <>
844 struct packet_traits<double> : default_packet_traits {
845   typedef Packet2d type;
846   typedef Packet2d half;
847   enum {
848     Vectorizable = 1,
849     AlignedOnScalar = 1,
850     size = 2,
851     HasHalfPacket = 0,
852     // FIXME check the Has*
853     HasDiv = 1,
854     HasExp = 1,
855     HasSqrt = 1,
856     HasRsqrt = 1,
857     HasRound = 1,
858     HasFloor = 1,
859     HasCeil = 1,
860     HasBlend = 1
861   };
862 };
863 
864 template <>
865 struct unpacket_traits<Packet2d> {
866   typedef double type;
867   enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
868   typedef Packet2d half;
869 };
870 
871 template <>
872 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
873   EIGEN_MSA_DEBUG;
874 
875   Packet2d value = { from, from };
876   return value;
877 }
878 
879 template <>
880 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
881   EIGEN_MSA_DEBUG;
882 
883   return __builtin_msa_fadd_d(a, b);
884 }
885 
886 template <>
887 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
888   EIGEN_MSA_DEBUG;
889 
890   static const Packet2d countdown = { 0.0, 1.0 };
891   return padd(pset1<Packet2d>(a), countdown);
892 }
893 
894 template <>
895 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
896   EIGEN_MSA_DEBUG;
897 
898   return __builtin_msa_fsub_d(a, b);
899 }
900 
901 template <>
902 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
903   EIGEN_MSA_DEBUG;
904 
905   return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
906 }
907 
908 template <>
909 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
910   EIGEN_MSA_DEBUG;
911 
912   return a;
913 }
914 
915 template <>
916 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
917   EIGEN_MSA_DEBUG;
918 
919   return __builtin_msa_fmul_d(a, b);
920 }
921 
922 template <>
923 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
924   EIGEN_MSA_DEBUG;
925 
926   return __builtin_msa_fdiv_d(a, b);
927 }
928 
929 template <>
930 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
931   EIGEN_MSA_DEBUG;
932 
933   return __builtin_msa_fmadd_d(c, a, b);
934 }
935 
// Logical operations are not available on floating-point vectors, so the
// operands are reinterpreted as byte vectors (v16u8) for the MSA bitwise
// intrinsics and the result is cast back.
938 template <>
939 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
940   EIGEN_MSA_DEBUG;
941 
942   return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
943 }
944 
945 template <>
946 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
947   EIGEN_MSA_DEBUG;
948 
949   return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
950 }
951 
952 template <>
953 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
954   EIGEN_MSA_DEBUG;
955 
956   return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
957 }
958 
959 template <>
960 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
961   EIGEN_MSA_DEBUG;
962 
963   return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
964 }
965 
966 template <>
967 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
968   EIGEN_MSA_DEBUG;
969 
970   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
971 }
972 
973 template <>
974 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
975   EIGEN_MSA_DEBUG;
976 
977 #if EIGEN_FAST_MATH
978   // This prefers numbers to NaNs.
979   return __builtin_msa_fmin_d(a, b);
980 #else
981   // This prefers NaNs to numbers.
982   v2i64 aNaN = __builtin_msa_fcun_d(a, a);
983   v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
984   return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
985 #endif
986 }
987 
988 template <>
989 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
990   EIGEN_MSA_DEBUG;
991 
992 #if EIGEN_FAST_MATH
993   // This prefers numbers to NaNs.
994   return __builtin_msa_fmax_d(a, b);
995 #else
996   // This prefers NaNs to numbers.
997   v2i64 aNaN = __builtin_msa_fcun_d(a, a);
998   v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
999   return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
1000 #endif
1001 }
1002 
1003 template <>
1004 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1005   EIGEN_MSA_DEBUG;
1006 
1007   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
1008 }
1009 
1010 template <>
1011 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1012   EIGEN_MSA_DEBUG;
1013 
1014   Packet2d value = { *from, *from };
1015   return value;
1016 }
1017 
1018 template <>
1019 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1020   EIGEN_MSA_DEBUG;
1021 
1022   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1023 }
1024 
1025 template <>
1026 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1027   EIGEN_MSA_DEBUG;
1028 
1029   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1030 }
1031 
1032 template <>
1033 EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1034   EIGEN_MSA_DEBUG;
1035 
1036   Packet2d value;
1037   value[0] = *from;
1038   from += stride;
1039   value[1] = *from;
1040   return value;
1041 }
1042 
1043 template <>
1044 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
1045                                                          Index stride) {
1046   EIGEN_MSA_DEBUG;
1047 
1048   *to = from[0];
1049   to += stride;
1050   *to = from[1];
1051 }
1052 
1053 template <>
1054 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1055   EIGEN_MSA_DEBUG;
1056 
1057   __builtin_prefetch(addr);
1058 }
1059 
1060 template <>
1061 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1062   EIGEN_MSA_DEBUG;
1063 
1064   return a[0];
1065 }
1066 
1067 template <>
1068 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1069   EIGEN_MSA_DEBUG;
1070 
1071   return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1072 }
1073 
1074 template <>
1075 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1076   EIGEN_MSA_DEBUG;
1077 
1078   return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
1079 }
1080 
1081 template <>
1082 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1083   EIGEN_MSA_DEBUG;
1084 
1085   Packet2d s = padd(a, preverse(a));
1086   return s[0];
1087 }
1088 
1089 // Other reduction functions:
1090 // mul
1091 template <>
1092 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
1093   EIGEN_MSA_DEBUG;
1094 
1095   Packet2d p = pmul(a, preverse(a));
1096   return p[0];
1097 }
1098 
1099 // min
1100 template <>
1101 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
1102   EIGEN_MSA_DEBUG;
1103 
1104 #if EIGEN_FAST_MATH
1105   Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1106   Packet2d v = __builtin_msa_fmin_d(a, swapped);
1107   return v[0];
1108 #else
1109   double a0 = a[0], a1 = a[1];
1110   return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
1111 #endif
1112 }
1113 
1114 // max
1115 template <>
1116 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
1117   EIGEN_MSA_DEBUG;
1118 
1119 #if EIGEN_FAST_MATH
1120   Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1121   Packet2d v = __builtin_msa_fmax_d(a, swapped);
1122   return v[0];
1123 #else
1124   double a0 = a[0], a1 = a[1];
1125   return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
1126 #endif
1127 }
1128 
1129 template <>
1130 EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
1131   EIGEN_MSA_DEBUG;
1132 
1133   return __builtin_msa_fsqrt_d(a);
1134 }
1135 
1136 template <>
1137 EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
1138   EIGEN_MSA_DEBUG;
1139 
1140 #if EIGEN_FAST_MATH
1141   return __builtin_msa_frsqrt_d(a);
1142 #else
1143   Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1144   return pdiv(ones, psqrt(a));
1145 #endif
1146 }
1147 
1148 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
1149   os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
1150   return os;
1151 }
1152 
1153 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1154   EIGEN_MSA_DEBUG;
1155 
1156   Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1157   Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1158   kernel.packet[0] = trn1;
1159   kernel.packet[1] = trn2;
1160 }
1161 
1162 template <>
1163 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
1164   Packet2d v = a;
1165   int32_t old_mode, new_mode;
1166   asm volatile(
1167       "cfcmsa  %[old_mode], $1\n"
1168       "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
1169       "ctcmsa  $1, %[new_mode]\n"
1170       "frint.d %w[v], %w[v]\n"
1171       "ctcmsa  $1, %[old_mode]\n"
1172       :  // outputs
1173       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1174       [v] "+f"(v)
1175       :  // inputs
1176       :  // clobbers
1177   );
1178   return v;
1179 }
1180 
1181 template <>
1182 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
1183   Packet2d v = a;
1184   int32_t old_mode, new_mode;
1185   asm volatile(
1186       "cfcmsa  %[old_mode], $1\n"
1187       "ori     %[new_mode], %[old_mode], 3\n"
1188       "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
1189       "ctcmsa  $1, %[new_mode]\n"
1190       "frint.d %w[v], %w[v]\n"
1191       "ctcmsa  $1, %[old_mode]\n"
1192       :  // outputs
1193       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1194       [v] "+f"(v)
1195       :  // inputs
1196       :  // clobbers
1197   );
1198   return v;
1199 }
1200 
1201 template <>
1202 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
1203   Packet2d v = a;
1204   int32_t old_mode, new_mode;
1205   asm volatile(
1206       "cfcmsa  %[old_mode], $1\n"
1207       "ori     %[new_mode], %[old_mode], 3\n"
1208       "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
1209       "ctcmsa  $1, %[new_mode]\n"
1210       "frint.d %w[v], %w[v]\n"
1211       "ctcmsa  $1, %[old_mode]\n"
1212       :  // outputs
1213       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1214       [v] "+f"(v)
1215       :  // inputs
1216       :  // clobbers
1217   );
1218   return v;
1219 }
1220 
1221 template <>
1222 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
1223                                     const Packet2d& elsePacket) {
1224   Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
1225   Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
1226   return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
1227 }
1228 
1229 }  // end namespace internal
1230 
1231 }  // end namespace Eigen
1232 
1233 #endif  // EIGEN_PACKET_MATH_MSA_H
1234