1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2018 Wave Computing, Inc. 5 // Written by: 6 // Chris Larsen 7 // Alexey Frunze ([email protected]) 8 // 9 // This Source Code Form is subject to the terms of the Mozilla 10 // Public License v. 2.0. If a copy of the MPL was not distributed 11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 12 13 #ifndef EIGEN_PACKET_MATH_MSA_H 14 #define EIGEN_PACKET_MATH_MSA_H 15 16 #include <iostream> 17 #include <string> 18 19 namespace Eigen { 20 21 namespace internal { 22 23 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 24 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 25 #endif 26 27 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 28 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 29 #endif 30 31 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 33 #endif 34 35 #if 0 36 #define EIGEN_MSA_DEBUG \ 37 static bool firstTime = true; \ 38 do { \ 39 if (firstTime) { \ 40 std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ 41 firstTime = false; \ 42 } \ 43 } while (0) 44 #else 45 #define EIGEN_MSA_DEBUG 46 #endif 47 48 #define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) 49 50 typedef v4f32 Packet4f; 51 typedef v4i32 Packet4i; 52 typedef v4u32 Packet4ui; 53 54 #define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } 55 #define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } 56 #define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } 57 58 inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { 59 os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; 60 return os; 61 } 62 63 inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) { 64 os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; 65 return os; 66 } 67 68 inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) { 69 os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; 70 return os; 71 } 72 73 template <> 74 struct packet_traits<float> : default_packet_traits { 75 typedef Packet4f type; 76 typedef Packet4f half; // Packet2f intrinsics not implemented yet 77 enum { 78 Vectorizable = 1, 79 AlignedOnScalar = 1, 80 size = 4, 81 HasHalfPacket = 0, // Packet2f intrinsics not implemented yet 82 // FIXME check the Has* 83 HasDiv = 1, 84 HasSin = EIGEN_FAST_MATH, 85 HasCos = EIGEN_FAST_MATH, 86 HasTanh = EIGEN_FAST_MATH, 87 HasErf = EIGEN_FAST_MATH, 88 HasLog = 1, 89 HasExp = 1, 90 HasSqrt = 1, 91 HasRsqrt = 1, 92 HasRound = 1, 93 HasFloor = 1, 94 HasCeil = 1, 95 HasBlend = 1 96 }; 97 }; 98 99 template <> 100 struct packet_traits<int32_t> : default_packet_traits { 101 typedef Packet4i type; 102 typedef Packet4i half; // Packet2i intrinsics not implemented yet 103 enum { 104 Vectorizable = 1, 105 AlignedOnScalar = 1, 106 size = 4, 107 HasHalfPacket = 0, // Packet2i intrinsics not implemented yet 108 // FIXME check the Has* 109 HasDiv = 1, 110 HasBlend = 1 111 }; 112 }; 113 114 template <> 115 struct unpacket_traits<Packet4f> { 116 typedef float type; 117 enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; 118 typedef Packet4f half; 119 }; 120 121 template <> 122 struct unpacket_traits<Packet4i> { 123 typedef int32_t type; 124 enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; 125 typedef Packet4i half; 126 }; 127 128 template <> 129 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 130 EIGEN_MSA_DEBUG; 131 132 Packet4f v = { from, from, from, from }; 133 return v; 134 } 135 136 template <> 137 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { 138 EIGEN_MSA_DEBUG; 139 140 return __builtin_msa_fill_w(from); 141 } 142 143 template <> 144 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) { 145 EIGEN_MSA_DEBUG; 146 147 float f = *from; 148 Packet4f v = { f, f, f, f }; 149 return v; 150 } 151 152 template <> 153 EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) { 154 EIGEN_MSA_DEBUG; 155 156 return __builtin_msa_fill_w(*from); 157 } 158 159 template <> 160 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { 161 EIGEN_MSA_DEBUG; 162 163 return __builtin_msa_fadd_w(a, b); 164 } 165 166 template <> 167 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { 168 EIGEN_MSA_DEBUG; 169 170 return __builtin_msa_addv_w(a, b); 171 } 172 173 template <> 174 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { 175 EIGEN_MSA_DEBUG; 176 177 static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f }; 178 return padd(pset1<Packet4f>(a), countdown); 179 } 180 181 template <> 182 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) { 183 EIGEN_MSA_DEBUG; 184 185 static const Packet4i countdown = { 0, 1, 2, 3 }; 186 return padd(pset1<Packet4i>(a), countdown); 187 } 188 189 template <> 190 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { 191 EIGEN_MSA_DEBUG; 192 193 return __builtin_msa_fsub_w(a, b); 194 } 195 196 template <> 197 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { 198 EIGEN_MSA_DEBUG; 199 200 return __builtin_msa_subv_w(a, b); 201 } 202 203 template <> 204 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { 205 EIGEN_MSA_DEBUG; 206 207 return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31); 208 } 209 210 template <> 211 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { 212 EIGEN_MSA_DEBUG; 213 214 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1); 215 } 216 217 template <> 218 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { 219 EIGEN_MSA_DEBUG; 220 221 return a; 222 } 223 224 template <> 225 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { 226 EIGEN_MSA_DEBUG; 227 228 return a; 229 } 230 231 template <> 232 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { 233 EIGEN_MSA_DEBUG; 234 235 return __builtin_msa_fmul_w(a, b); 236 } 237 238 template <> 239 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { 240 EIGEN_MSA_DEBUG; 241 242 return __builtin_msa_mulv_w(a, b); 243 } 244 245 template <> 246 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { 247 EIGEN_MSA_DEBUG; 248 249 return __builtin_msa_fdiv_w(a, b); 250 } 251 252 template <> 253 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { 254 EIGEN_MSA_DEBUG; 255 256 return __builtin_msa_div_s_w(a, b); 257 } 258 259 template <> 260 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { 261 EIGEN_MSA_DEBUG; 262 263 return __builtin_msa_fmadd_w(c, a, b); 264 } 265 266 template <> 267 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { 268 EIGEN_MSA_DEBUG; 269 270 // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug. 271 Packet4i value = c; 272 __asm__("maddv.w %w[value], %w[a], %w[b]\n" 273 // Outputs 274 : [value] "+f"(value) 275 // Inputs 276 : [a] "f"(a), [b] "f"(b)); 277 return value; 278 } 279 280 template <> 281 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { 282 EIGEN_MSA_DEBUG; 283 284 return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b); 285 } 286 287 template <> 288 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { 289 EIGEN_MSA_DEBUG; 290 291 return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b); 292 } 293 294 template <> 295 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { 296 EIGEN_MSA_DEBUG; 297 298 return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b); 299 } 300 301 template <> 302 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { 303 EIGEN_MSA_DEBUG; 304 305 return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b); 306 } 307 308 template <> 309 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { 310 EIGEN_MSA_DEBUG; 311 312 return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b); 313 } 314 315 template <> 316 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { 317 EIGEN_MSA_DEBUG; 318 319 return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b); 320 } 321 322 template <> 323 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { 324 EIGEN_MSA_DEBUG; 325 326 return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255)); 327 } 328 329 template <> 330 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { 331 EIGEN_MSA_DEBUG; 332 333 return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255)); 334 } 335 336 template <> 337 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { 338 EIGEN_MSA_DEBUG; 339 340 #if EIGEN_FAST_MATH 341 // This prefers numbers to NaNs. 342 return __builtin_msa_fmin_w(a, b); 343 #else 344 // This prefers NaNs to numbers. 345 Packet4i aNaN = __builtin_msa_fcun_w(a, a); 346 Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN); 347 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); 348 #endif 349 } 350 351 template <> 352 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { 353 EIGEN_MSA_DEBUG; 354 355 return __builtin_msa_min_s_w(a, b); 356 } 357 358 template <> 359 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { 360 EIGEN_MSA_DEBUG; 361 362 #if EIGEN_FAST_MATH 363 // This prefers numbers to NaNs. 364 return __builtin_msa_fmax_w(a, b); 365 #else 366 // This prefers NaNs to numbers. 367 Packet4i aNaN = __builtin_msa_fcun_w(a, a); 368 Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN); 369 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); 370 #endif 371 } 372 373 template <> 374 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { 375 EIGEN_MSA_DEBUG; 376 377 return __builtin_msa_max_s_w(a, b); 378 } 379 380 template <> 381 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { 382 EIGEN_MSA_DEBUG; 383 384 EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0); 385 } 386 387 template <> 388 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { 389 EIGEN_MSA_DEBUG; 390 391 EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0); 392 } 393 394 template <> 395 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { 396 EIGEN_MSA_DEBUG; 397 398 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0); 399 } 400 401 template <> 402 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { 403 EIGEN_MSA_DEBUG; 404 405 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0); 406 } 407 408 template <> 409 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { 410 EIGEN_MSA_DEBUG; 411 412 float f0 = from[0], f1 = from[1]; 413 Packet4f v0 = { f0, f0, f0, f0 }; 414 Packet4f v1 = { f1, f1, f1, f1 }; 415 return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); 416 } 417 418 template <> 419 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) { 420 EIGEN_MSA_DEBUG; 421 422 int32_t i0 = from[0], i1 = from[1]; 423 Packet4i v0 = { i0, i0, i0, i0 }; 424 Packet4i v1 = { i1, i1, i1, i1 }; 425 return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); 426 } 427 428 template <> 429 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { 430 EIGEN_MSA_DEBUG; 431 432 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); 433 } 434 435 template <> 436 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { 437 EIGEN_MSA_DEBUG; 438 439 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0); 440 } 441 442 template <> 443 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { 444 EIGEN_MSA_DEBUG; 445 446 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); 447 } 448 449 template <> 450 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { 451 EIGEN_MSA_DEBUG; 452 453 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0); 454 } 455 456 template <> 457 EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) { 458 EIGEN_MSA_DEBUG; 459 460 float f = *from; 461 Packet4f v = { f, f, f, f }; 462 v[1] = from[stride]; 463 v[2] = from[2 * stride]; 464 v[3] = from[3 * stride]; 465 return v; 466 } 467 468 template <> 469 EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) { 470 EIGEN_MSA_DEBUG; 471 472 int32_t i = *from; 473 Packet4i v = { i, i, i, i }; 474 v[1] = from[stride]; 475 v[2] = from[2 * stride]; 476 v[3] = from[3 * stride]; 477 return v; 478 } 479 480 template <> 481 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, 482 Index stride) { 483 EIGEN_MSA_DEBUG; 484 485 *to = from[0]; 486 to += stride; 487 *to = from[1]; 488 to += stride; 489 *to = from[2]; 490 to += stride; 491 *to = from[3]; 492 } 493 494 template <> 495 EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, 496 Index stride) { 497 EIGEN_MSA_DEBUG; 498 499 *to = from[0]; 500 to += stride; 501 *to = from[1]; 502 to += stride; 503 *to = from[2]; 504 to += stride; 505 *to = from[3]; 506 } 507 508 template <> 509 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { 510 EIGEN_MSA_DEBUG; 511 512 __builtin_prefetch(addr); 513 } 514 515 template <> 516 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { 517 EIGEN_MSA_DEBUG; 518 519 __builtin_prefetch(addr); 520 } 521 522 template <> 523 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { 524 EIGEN_MSA_DEBUG; 525 526 return a[0]; 527 } 528 529 template <> 530 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { 531 EIGEN_MSA_DEBUG; 532 533 return a[0]; 534 } 535 536 template <> 537 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { 538 EIGEN_MSA_DEBUG; 539 540 return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); 541 } 542 543 template <> 544 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { 545 EIGEN_MSA_DEBUG; 546 547 return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); 548 } 549 550 template <> 551 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { 552 EIGEN_MSA_DEBUG; 553 554 return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31); 555 } 556 557 template <> 558 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { 559 EIGEN_MSA_DEBUG; 560 561 Packet4i zero = __builtin_msa_ldi_w(0); 562 return __builtin_msa_add_a_w(zero, a); 563 } 564 565 template <> 566 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) { 567 EIGEN_MSA_DEBUG; 568 569 Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); 570 s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 571 return s[0]; 572 } 573 574 575 template <> 576 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) { 577 EIGEN_MSA_DEBUG; 578 579 Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); 580 s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 581 return s[0]; 582 } 583 584 // Other reduction functions: 585 // mul 586 template <> 587 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) { 588 EIGEN_MSA_DEBUG; 589 590 Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); 591 p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 592 return p[0]; 593 } 594 595 template <> 596 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) { 597 EIGEN_MSA_DEBUG; 598 599 Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); 600 p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 601 return p[0]; 602 } 603 604 // min 605 template <> 606 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) { 607 EIGEN_MSA_DEBUG; 608 609 // Swap 64-bit halves of a. 610 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); 611 #if !EIGEN_FAST_MATH 612 // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit 613 // masks of all zeroes/ones in low 64 bits. 614 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); 615 // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. 616 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); 617 #endif 618 // Continue with min computation. 619 Packet4f v = __builtin_msa_fmin_w(a, swapped); 620 v = __builtin_msa_fmin_w( 621 v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 622 #if !EIGEN_FAST_MATH 623 // Based on the mask select between v and 4 qNaNs. 624 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); 625 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); 626 #endif 627 return v[0]; 628 } 629 630 template <> 631 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) { 632 EIGEN_MSA_DEBUG; 633 634 Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); 635 m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 636 return m[0]; 637 } 638 639 // max 640 template <> 641 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) { 642 EIGEN_MSA_DEBUG; 643 644 // Swap 64-bit halves of a. 645 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); 646 #if !EIGEN_FAST_MATH 647 // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit 648 // masks of all zeroes/ones in low 64 bits. 649 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); 650 // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. 651 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); 652 #endif 653 // Continue with max computation. 654 Packet4f v = __builtin_msa_fmax_w(a, swapped); 655 v = __builtin_msa_fmax_w( 656 v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 657 #if !EIGEN_FAST_MATH 658 // Based on the mask select between v and 4 qNaNs. 659 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); 660 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); 661 #endif 662 return v[0]; 663 } 664 665 template <> 666 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) { 667 EIGEN_MSA_DEBUG; 668 669 Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); 670 m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); 671 return m[0]; 672 } 673 674 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) { 675 os << "[ " << value.packet[0] << "," << std::endl 676 << " " << value.packet[1] << "," << std::endl 677 << " " << value.packet[2] << "," << std::endl 678 << " " << value.packet[3] << " ]"; 679 return os; 680 } 681 682 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { 683 EIGEN_MSA_DEBUG; 684 685 v4i32 tmp1, tmp2, tmp3, tmp4; 686 687 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); 688 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); 689 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); 690 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); 691 692 kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); 693 kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); 694 kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); 695 kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); 696 } 697 698 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) { 699 os << "[ " << value.packet[0] << "," << std::endl 700 << " " << value.packet[1] << "," << std::endl 701 << " " << value.packet[2] << "," << std::endl 702 << " " << value.packet[3] << " ]"; 703 return os; 704 } 705 706 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { 707 EIGEN_MSA_DEBUG; 708 709 v4i32 tmp1, tmp2, tmp3, tmp4; 710 711 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]); 712 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]); 713 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]); 714 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]); 715 716 kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); 717 kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); 718 kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); 719 kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); 720 } 721 722 template <> 723 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { 724 EIGEN_MSA_DEBUG; 725 726 return __builtin_msa_fsqrt_w(a); 727 } 728 729 template <> 730 EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { 731 EIGEN_MSA_DEBUG; 732 733 #if EIGEN_FAST_MATH 734 return __builtin_msa_frsqrt_w(a); 735 #else 736 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1)); 737 return pdiv(ones, psqrt(a)); 738 #endif 739 } 740 741 template <> 742 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { 743 Packet4f v = a; 744 int32_t old_mode, new_mode; 745 asm volatile( 746 "cfcmsa %[old_mode], $1\n" 747 "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. 748 "ctcmsa $1, %[new_mode]\n" 749 "frint.w %w[v], %w[v]\n" 750 "ctcmsa $1, %[old_mode]\n" 751 : // outputs 752 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), 753 [v] "+f"(v) 754 : // inputs 755 : // clobbers 756 ); 757 return v; 758 } 759 760 template <> 761 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { 762 Packet4f v = a; 763 int32_t old_mode, new_mode; 764 asm volatile( 765 "cfcmsa %[old_mode], $1\n" 766 "ori %[new_mode], %[old_mode], 3\n" 767 "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. 768 "ctcmsa $1, %[new_mode]\n" 769 "frint.w %w[v], %w[v]\n" 770 "ctcmsa $1, %[old_mode]\n" 771 : // outputs 772 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), 773 [v] "+f"(v) 774 : // inputs 775 : // clobbers 776 ); 777 return v; 778 } 779 780 template <> 781 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { 782 Packet4f v = a; 783 int32_t old_mode, new_mode; 784 asm volatile( 785 "cfcmsa %[old_mode], $1\n" 786 "ori %[new_mode], %[old_mode], 3\n" 787 "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. 788 "ctcmsa $1, %[new_mode]\n" 789 "frint.w %w[v], %w[v]\n" 790 "ctcmsa $1, %[old_mode]\n" 791 : // outputs 792 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), 793 [v] "+f"(v) 794 : // inputs 795 : // clobbers 796 ); 797 return v; 798 } 799 800 template <> 801 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, 802 const Packet4f& elsePacket) { 803 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], 804 ifPacket.select[3] }; 805 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); 806 return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); 807 } 808 809 template <> 810 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, 811 const Packet4i& elsePacket) { 812 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], 813 ifPacket.select[3] }; 814 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); 815 return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); 816 } 817 818 //---------- double ---------- 819 820 typedef v2f64 Packet2d; 821 typedef v2i64 Packet2l; 822 typedef v2u64 Packet2ul; 823 824 #define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } 825 #define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } 826 #define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } 827 828 inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { 829 os << "[ " << value[0] << ", " << value[1] << " ]"; 830 return os; 831 } 832 833 inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) { 834 os << "[ " << value[0] << ", " << value[1] << " ]"; 835 return os; 836 } 837 838 inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) { 839 os << "[ " << value[0] << ", " << value[1] << " ]"; 840 return os; 841 } 842 843 template <> 844 struct packet_traits<double> : default_packet_traits { 845 typedef Packet2d type; 846 typedef Packet2d half; 847 enum { 848 Vectorizable = 1, 849 AlignedOnScalar = 1, 850 size = 2, 851 HasHalfPacket = 0, 852 // FIXME check the Has* 853 HasDiv = 1, 854 HasExp = 1, 855 HasSqrt = 1, 856 HasRsqrt = 1, 857 HasRound = 1, 858 HasFloor = 1, 859 HasCeil = 1, 860 HasBlend = 1 861 }; 862 }; 863 864 template <> 865 struct unpacket_traits<Packet2d> { 866 typedef double type; 867 enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; 868 typedef Packet2d half; 869 }; 870 871 template <> 872 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { 873 EIGEN_MSA_DEBUG; 874 875 Packet2d value = { from, from }; 876 return value; 877 } 878 879 template <> 880 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { 881 EIGEN_MSA_DEBUG; 882 883 return __builtin_msa_fadd_d(a, b); 884 } 885 886 template <> 887 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { 888 EIGEN_MSA_DEBUG; 889 890 static const Packet2d countdown = { 0.0, 1.0 }; 891 return padd(pset1<Packet2d>(a), countdown); 892 } 893 894 template <> 895 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { 896 EIGEN_MSA_DEBUG; 897 898 return __builtin_msa_fsub_d(a, b); 899 } 900 901 template <> 902 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { 903 EIGEN_MSA_DEBUG; 904 905 return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63); 906 } 907 908 template <> 909 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { 910 EIGEN_MSA_DEBUG; 911 912 return a; 913 } 914 915 template <> 916 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { 917 EIGEN_MSA_DEBUG; 918 919 return __builtin_msa_fmul_d(a, b); 920 } 921 922 template <> 923 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { 924 EIGEN_MSA_DEBUG; 925 926 return __builtin_msa_fdiv_d(a, b); 927 } 928 929 template <> 930 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { 931 EIGEN_MSA_DEBUG; 932 933 return __builtin_msa_fmadd_d(c, a, b); 934 } 935 936 // Logical Operations are not supported for float, so we have to reinterpret casts using MSA 937 // intrinsics 938 template <> 939 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { 940 EIGEN_MSA_DEBUG; 941 942 return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b); 943 } 944 945 template <> 946 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { 947 EIGEN_MSA_DEBUG; 948 949 return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b); 950 } 951 952 template <> 953 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { 954 EIGEN_MSA_DEBUG; 955 956 return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b); 957 } 958 959 template <> 960 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { 961 EIGEN_MSA_DEBUG; 962 963 return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255)); 964 } 965 966 template <> 967 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { 968 EIGEN_MSA_DEBUG; 969 970 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0); 971 } 972 973 template <> 974 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { 975 EIGEN_MSA_DEBUG; 976 977 #if EIGEN_FAST_MATH 978 // This prefers numbers to NaNs. 979 return __builtin_msa_fmin_d(a, b); 980 #else 981 // This prefers NaNs to numbers. 982 v2i64 aNaN = __builtin_msa_fcun_d(a, a); 983 v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN); 984 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); 985 #endif 986 } 987 988 template <> 989 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { 990 EIGEN_MSA_DEBUG; 991 992 #if EIGEN_FAST_MATH 993 // This prefers numbers to NaNs. 994 return __builtin_msa_fmax_d(a, b); 995 #else 996 // This prefers NaNs to numbers. 997 v2i64 aNaN = __builtin_msa_fcun_d(a, a); 998 v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN); 999 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); 1000 #endif 1001 } 1002 1003 template <> 1004 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { 1005 EIGEN_MSA_DEBUG; 1006 1007 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0); 1008 } 1009 1010 template <> 1011 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { 1012 EIGEN_MSA_DEBUG; 1013 1014 Packet2d value = { *from, *from }; 1015 return value; 1016 } 1017 1018 template <> 1019 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { 1020 EIGEN_MSA_DEBUG; 1021 1022 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); 1023 } 1024 1025 template <> 1026 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { 1027 EIGEN_MSA_DEBUG; 1028 1029 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); 1030 } 1031 1032 template <> 1033 EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) { 1034 EIGEN_MSA_DEBUG; 1035 1036 Packet2d value; 1037 value[0] = *from; 1038 from += stride; 1039 value[1] = *from; 1040 return value; 1041 } 1042 1043 template <> 1044 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, 1045 Index stride) { 1046 EIGEN_MSA_DEBUG; 1047 1048 *to = from[0]; 1049 to += stride; 1050 *to = from[1]; 1051 } 1052 1053 template <> 1054 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { 1055 EIGEN_MSA_DEBUG; 1056 1057 __builtin_prefetch(addr); 1058 } 1059 1060 template <> 1061 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { 1062 EIGEN_MSA_DEBUG; 1063 1064 return a[0]; 1065 } 1066 1067 template <> 1068 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { 1069 EIGEN_MSA_DEBUG; 1070 1071 return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); 1072 } 1073 1074 template <> 1075 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { 1076 EIGEN_MSA_DEBUG; 1077 1078 return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63); 1079 } 1080 1081 template <> 1082 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { 1083 EIGEN_MSA_DEBUG; 1084 1085 Packet2d s = padd(a, preverse(a)); 1086 return s[0]; 1087 } 1088 1089 // Other reduction functions: 1090 // mul 1091 template <> 1092 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { 1093 EIGEN_MSA_DEBUG; 1094 1095 Packet2d p = pmul(a, preverse(a)); 1096 return p[0]; 1097 } 1098 1099 // min 1100 template <> 1101 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { 1102 EIGEN_MSA_DEBUG; 1103 1104 #if EIGEN_FAST_MATH 1105 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); 1106 Packet2d v = __builtin_msa_fmin_d(a, swapped); 1107 return v[0]; 1108 #else 1109 double a0 = a[0], a1 = a[1]; 1110 return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1; 1111 #endif 1112 } 1113 1114 // max 1115 template <> 1116 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { 1117 EIGEN_MSA_DEBUG; 1118 1119 #if EIGEN_FAST_MATH 1120 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); 1121 Packet2d v = __builtin_msa_fmax_d(a, swapped); 1122 return v[0]; 1123 #else 1124 double a0 = a[0], a1 = a[1]; 1125 return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1; 1126 #endif 1127 } 1128 1129 template <> 1130 EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { 1131 EIGEN_MSA_DEBUG; 1132 1133 return __builtin_msa_fsqrt_d(a); 1134 } 1135 1136 template <> 1137 EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { 1138 EIGEN_MSA_DEBUG; 1139 1140 #if EIGEN_FAST_MATH 1141 return __builtin_msa_frsqrt_d(a); 1142 #else 1143 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1)); 1144 return pdiv(ones, psqrt(a)); 1145 #endif 1146 } 1147 1148 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) { 1149 os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; 1150 return os; 1151 } 1152 1153 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) { 1154 EIGEN_MSA_DEBUG; 1155 1156 Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); 1157 Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); 1158 kernel.packet[0] = trn1; 1159 kernel.packet[1] = trn2; 1160 } 1161 1162 template <> 1163 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { 1164 Packet2d v = a; 1165 int32_t old_mode, new_mode; 1166 asm volatile( 1167 "cfcmsa %[old_mode], $1\n" 1168 "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. 1169 "ctcmsa $1, %[new_mode]\n" 1170 "frint.d %w[v], %w[v]\n" 1171 "ctcmsa $1, %[old_mode]\n" 1172 : // outputs 1173 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), 1174 [v] "+f"(v) 1175 : // inputs 1176 : // clobbers 1177 ); 1178 return v; 1179 } 1180 1181 template <> 1182 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { 1183 Packet2d v = a; 1184 int32_t old_mode, new_mode; 1185 asm volatile( 1186 "cfcmsa %[old_mode], $1\n" 1187 "ori %[new_mode], %[old_mode], 3\n" 1188 "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. 1189 "ctcmsa $1, %[new_mode]\n" 1190 "frint.d %w[v], %w[v]\n" 1191 "ctcmsa $1, %[old_mode]\n" 1192 : // outputs 1193 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), 1194 [v] "+f"(v) 1195 : // inputs 1196 : // clobbers 1197 ); 1198 return v; 1199 } 1200 1201 template <> 1202 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { 1203 Packet2d v = a; 1204 int32_t old_mode, new_mode; 1205 asm volatile( 1206 "cfcmsa %[old_mode], $1\n" 1207 "ori %[new_mode], %[old_mode], 3\n" 1208 "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. 1209 "ctcmsa $1, %[new_mode]\n" 1210 "frint.d %w[v], %w[v]\n" 1211 "ctcmsa $1, %[old_mode]\n" 1212 : // outputs 1213 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), 1214 [v] "+f"(v) 1215 : // inputs 1216 : // clobbers 1217 ); 1218 return v; 1219 } 1220 1221 template <> 1222 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, 1223 const Packet2d& elsePacket) { 1224 Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; 1225 Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0); 1226 return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); 1227 } 1228 1229 } // end namespace internal 1230 1231 } // end namespace Eigen 1232 1233 #endif // EIGEN_PACKET_MATH_MSA_H 1234