xref: /aosp_15_r20/external/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
1 namespace Eigen {
2 namespace internal {
3 
4 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
5 
6 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
7 // Here we specialize gebp_traits to eliminate these register spills.
8 // See #2138.
9 template<>
10 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
11  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
12 {
13   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
14   {
15     // This volatile inline ASM both acts as a barrier to prevent reordering,
16     // as well as enforces strict register use.
17     asm volatile(
18       "vmla.f32 %q[r], %q[c], %q[alpha]"
19       : [r] "+w" (r)
20       : [c] "w" (c),
21         [alpha] "w" (alpha)
22       : );
23   }
24 
25   template <typename LaneIdType>
26   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
27                                 Packet4f& c, [[maybe_unused]] Packet4f& tmp,
28                                 const LaneIdType&) const {
29     acc(a, b, c);
30   }
31 
32   template <typename LaneIdType>
33   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
34                                 Packet4f& c, Packet4f& tmp,
35                                 const LaneIdType& lane) const {
36     madd(a, b.get(lane), c, tmp, lane);
37   }
38 };
39 
40 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
41 
42 #if EIGEN_ARCH_ARM64
43 
44 template<>
45 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
46  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
47 {
48   typedef float RhsPacket;
49   typedef float32x4_t RhsPacketx4;
50 
51   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
52   {
53     dest = *b;
54   }
55 
56   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
57   {
58     dest = vld1q_f32(b);
59   }
60 
61   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
62   {
63     dest = *b;
64   }
65 
66   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
67   {}
68 
69   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
70   {
71     loadRhs(b,dest);
72   }
73 
74   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
75   {
76     c = vfmaq_n_f32(c, a, b);
77   }
78 
79   // NOTE: Template parameter inference failed when compiled with Android NDK:
80   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
81 
82   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
83   { madd_helper<0>(a, b, c); }
84   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
85   { madd_helper<1>(a, b, c); }
86   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
87   { madd_helper<2>(a, b, c); }
88   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
89   { madd_helper<3>(a, b, c); }
90 
91  private:
92   template<int LaneID>
93   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
94   {
95     #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
96     // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
97     // vfmaq_laneq_f32 is implemented through a costly dup
98          if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
99     else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
100     else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
101     else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
102     #else
103     c = vfmaq_laneq_f32(c, a, b, LaneID);
104     #endif
105   }
106 };
107 
108 
109 template<>
110 struct gebp_traits <double,double,false,false,Architecture::NEON>
111  : gebp_traits<double,double,false,false,Architecture::Generic>
112 {
113   typedef double RhsPacket;
114 
115   struct RhsPacketx4 {
116     float64x2_t B_0, B_1;
117   };
118 
119   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
120   {
121     dest = *b;
122   }
123 
124   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
125   {
126     dest.B_0 = vld1q_f64(b);
127     dest.B_1 = vld1q_f64(b+2);
128   }
129 
130   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
131   {
132     loadRhs(b,dest);
133   }
134 
135   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
136   {}
137 
138   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
139   {
140     loadRhs(b,dest);
141   }
142 
143   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
144   {
145     c = vfmaq_n_f64(c, a, b);
146   }
147 
148   // NOTE: Template parameter inference failed when compiled with Android NDK:
149   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
150 
151   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
152   { madd_helper<0>(a, b, c); }
153   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
154   { madd_helper<1>(a, b, c); }
155   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
156   { madd_helper<2>(a, b, c); }
157   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
158   { madd_helper<3>(a, b, c); }
159 
160  private:
161   template <int LaneID>
162   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
163   {
164     #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
165     // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
166     // vfmaq_laneq_f64 is implemented through a costly dup
167          if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
168     else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
169     else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
170     else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
171     #else
172          if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
173     else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
174     else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
175     else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
176     #endif
177   }
178 };
179 
180 #endif // EIGEN_ARCH_ARM64
181 
182 }  // namespace internal
183 }  // namespace Eigen
184