namespace Eigen {
namespace internal {

#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

// On 32-bit ARM, Clang tends to spill far too many registers inside the GEBP
// kernel. This gebp_traits specialization exists to get rid of those spills.
// See #2138.
template <>
struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
  // Multiply-accumulate: r += c * alpha, issued as a single `vmla.f32`.
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
    // The statement is deliberately `asm volatile`: it serves as a barrier
    // against reordering and also pins down how registers are used.
    asm volatile("vmla.f32 %q[r], %q[c], %q[alpha]"
                 : [r] "+w"(r)
                 : [c] "w"(c), [alpha] "w"(alpha)
                 :);
  }

  // c += a * b for a plain packet right-hand side. The scratch packet `tmp`
  // and the lane tag are not needed here; the work is forwarded to acc().
  template <typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c,
                                [[maybe_unused]] Packet4f& tmp, const LaneIdType&) const {
    acc(a, b, c);
  }

  // Quad-packet right-hand side: pick the packet selected by `lane` and
  // forward to the single-packet madd() overload above.
  template <typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b, Packet4f& c,
                                Packet4f& tmp, const LaneIdType& lane) const {
    madd(a, b.get(lane), c, tmp, lane);
  }
};

#endif  // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

#if EIGEN_ARCH_ARM64

// ARM64 single-precision traits: the right-hand side is kept either as one
// scalar (RhsPacket) or as a float32x4_t holding four coefficients
// (RhsPacketx4) that are consumed lane by lane.
template <>
struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
  using RhsPacket = float;
  using RhsPacketx4 = float32x4_t;

  // Reads a single rhs coefficient.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }

  // Loads four rhs coefficients starting at `b` into one vector register.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
    dest = vld1q_f32(b);
  }

  // Scalar form: identical to loadRhs(), i.e. re-reads the coefficient at `b`.
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const {
    loadRhs(b, dest);
  }

  // x4 form: intentionally a no-op.
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  // Same as the scalar loadRhs().
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
    loadRhs(b, dest);
  }

  // c += a * b with a scalar rhs; the temporary is unused. Only the
  // FixedInt<0> lane tag is accepted by this overload.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<0>&) const {
    c = vfmaq_n_f32(c, a, b);
  }

  // The per-lane overloads below are written out one by one instead of as a
  // single template over FixedInt<N>: with the Android NDK, template parameter
  // deduction failed ("candidate template ignored: could not match
  // 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>'").

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<0>&) const {
    madd_helper<0>(a, b, c);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<1>&) const {
    madd_helper<1>(a, b, c);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<2>&) const {
    madd_helper<2>(a, b, c);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<3>&) const {
    madd_helper<3>(a, b, c);
  }

 private:
  // c += a * b[LaneID], with LaneID fixed at compile time.
  template <int LaneID>
  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b,
                                       AccPacket& c) const {
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9, 0))
    // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101:
    // these GCC versions implement vfmaq_laneq_f32 through a costly dup, so
    // the by-element fmla is emitted by hand instead.
    switch (LaneID) {
      case 0:
        asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(c) : "w"(a), "w"(b) :);
        break;
      case 1:
        asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(c) : "w"(a), "w"(b) :);
        break;
      case 2:
        asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(c) : "w"(a), "w"(b) :);
        break;
      case 3:
        asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(c) : "w"(a), "w"(b) :);
        break;
      default:
        break;
    }
#else
    c = vfmaq_laneq_f32(c, a, b, LaneID);
#endif
  }
};

// ARM64 double-precision traits. A float64x2_t only holds two doubles, so the
// four-coefficient rhs block is stored as a pair of vectors.
template <>
struct gebp_traits<double, double, false, false, Architecture::NEON>
    : gebp_traits<double, double, false, false, Architecture::Generic> {
  using RhsPacket = double;

  // Four rhs coefficients: B_0 holds elements 0 and 1, B_1 holds 2 and 3
  // (see the x4 loadRhs() below).
  struct RhsPacketx4 {
    float64x2_t B_0, B_1;
  };

  // Reads a single rhs coefficient.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }

  // Loads b[0..1] into B_0 and b[2..3] into B_1.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
    dest.B_0 = vld1q_f64(b);
    dest.B_1 = vld1q_f64(b + 2);
  }

  // Scalar form: identical to loadRhs().
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const {
    loadRhs(b, dest);
  }

  // x4 form: intentionally a no-op.
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  // Same as the scalar loadRhs().
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
    loadRhs(b, dest);
  }

  // c += a * b with a scalar rhs; the temporary is unused. Only the
  // FixedInt<0> lane tag is accepted by this overload.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<0>&) const {
    c = vfmaq_n_f64(c, a, b);
  }

  // The per-lane overloads below are written out one by one instead of as a
  // single template over FixedInt<N>: with the Android NDK, template parameter
  // deduction failed ("candidate template ignored: could not match
  // 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>'").

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<0>&) const {
    madd_helper<0>(a, b, c);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<1>&) const {
    madd_helper<1>(a, b, c);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<2>&) const {
    madd_helper<2>(a, b, c);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c,
                                RhsPacket& /*tmp*/, const FixedInt<3>&) const {
    madd_helper<3>(a, b, c);
  }

 private:
  // c += a * (rhs coefficient number LaneID). Coefficients 0 and 1 are the two
  // lanes of b.B_0; coefficients 2 and 3 are the two lanes of b.B_1.
  template <int LaneID>
  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b,
                                       AccPacket& c) const {
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9, 0))
    // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101:
    // these GCC versions implement vfmaq_laneq_f64 through a costly dup, so
    // the by-element fmla is emitted by hand instead.
    switch (LaneID) {
      case 0:
        asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
        break;
      case 1:
        asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
        break;
      case 2:
        asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
        break;
      case 3:
        asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
        break;
      default:
        break;
    }
#else
    switch (LaneID) {
      case 0:
        c = vfmaq_laneq_f64(c, a, b.B_0, 0);
        break;
      case 1:
        c = vfmaq_laneq_f64(c, a, b.B_0, 1);
        break;
      case 2:
        c = vfmaq_laneq_f64(c, a, b.B_1, 0);
        break;
      case 3:
        c = vfmaq_laneq_f64(c, a, b.B_1, 1);
        break;
      default:
        break;
    }
#endif
  }
};

#endif  // EIGEN_ARCH_ARM64

}  // namespace internal
}  // namespace Eigen