xref: /aosp_15_r20/external/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
// Uncomment the next line to use prefetching in the gemm routines.
//#define EIGEN_POWER_USE_PREFETCH
//
// EIGEN_POWER_PREFETCH(p) expands to prefetch(p) when EIGEN_POWER_USE_PREFETCH
// is defined and to nothing otherwise, so call sites need no #ifdef of their own.
#ifdef EIGEN_POWER_USE_PREFETCH
#define EIGEN_POWER_PREFETCH(p)  prefetch(p)
#else
#define EIGEN_POWER_PREFETCH(p)
#endif
7 
8 namespace Eigen {
9 
10 namespace internal {
11 
12 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
13 EIGEN_STRONG_INLINE void gemm_extra_col(
14   const DataMapper& res,
15   const Scalar* lhs_base,
16   const Scalar* rhs_base,
17   Index depth,
18   Index strideA,
19   Index offsetA,
20   Index row,
21   Index col,
22   Index remaining_rows,
23   Index remaining_cols,
24   const Packet& pAlpha);
25 
26 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
27 EIGEN_STRONG_INLINE void gemm_extra_row(
28   const DataMapper& res,
29   const Scalar* lhs_base,
30   const Scalar* rhs_base,
31   Index depth,
32   Index strideA,
33   Index offsetA,
34   Index row,
35   Index col,
36   Index rows,
37   Index cols,
38   Index remaining_rows,
39   const Packet& pAlpha,
40   const Packet& pMask);
41 
42 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
43 EIGEN_STRONG_INLINE void gemm_unrolled_col(
44   const DataMapper& res,
45   const Scalar* lhs_base,
46   const Scalar* rhs_base,
47   Index depth,
48   Index strideA,
49   Index offsetA,
50   Index& row,
51   Index rows,
52   Index col,
53   Index remaining_cols,
54   const Packet& pAlpha);
55 
56 template<typename Packet>
57 EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
58 
59 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
60 EIGEN_STRONG_INLINE void gemm_complex_extra_col(
61   const DataMapper& res,
62   const Scalar* lhs_base,
63   const Scalar* rhs_base,
64   Index depth,
65   Index strideA,
66   Index offsetA,
67   Index strideB,
68   Index row,
69   Index col,
70   Index remaining_rows,
71   Index remaining_cols,
72   const Packet& pAlphaReal,
73   const Packet& pAlphaImag);
74 
75 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
76 EIGEN_STRONG_INLINE void gemm_complex_extra_row(
77   const DataMapper& res,
78   const Scalar* lhs_base,
79   const Scalar* rhs_base,
80   Index depth,
81   Index strideA,
82   Index offsetA,
83   Index strideB,
84   Index row,
85   Index col,
86   Index rows,
87   Index cols,
88   Index remaining_rows,
89   const Packet& pAlphaReal,
90   const Packet& pAlphaImag,
91   const Packet& pMask);
92 
93 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
94 EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
95   const DataMapper& res,
96   const Scalar* lhs_base,
97   const Scalar* rhs_base,
98   Index depth,
99   Index strideA,
100   Index offsetA,
101   Index strideB,
102   Index& row,
103   Index rows,
104   Index col,
105   Index remaining_cols,
106   const Packet& pAlphaReal,
107   const Packet& pAlphaImag);
108 
109 template<typename Scalar, typename Packet>
110 EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
111 
112 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
113 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
114 
115 template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
116 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
117 
118 template<typename Packet>
119 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
120 
121 template<typename Packet, int N>
122 EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
123 
124 const static Packet16uc p16uc_SETCOMPLEX32_FIRST = {  0,  1,  2,  3,
125                                                      16, 17, 18, 19,
126                                                       4,  5,  6,  7,
127                                                      20, 21, 22, 23};
128 
129 const static Packet16uc p16uc_SETCOMPLEX32_SECOND = {  8,  9, 10, 11,
130                                                       24, 25, 26, 27,
131                                                       12, 13, 14, 15,
132                                                       28, 29, 30, 31};
133 //[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
134 const static Packet16uc p16uc_SETCOMPLEX64_FIRST = {  0,  1,  2,  3,  4,  5,  6,  7,
135                                                      16, 17, 18, 19, 20, 21, 22, 23};
136 
137 //[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
138 const static Packet16uc p16uc_SETCOMPLEX64_SECOND = {  8,  9, 10, 11, 12, 13, 14, 15,
139                                                       24, 25, 26, 27, 28, 29, 30, 31};
140 
141 
142 // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
143 template<typename Packet, typename Packetc>
bcouple_common(PacketBlock<Packet,4> & taccReal,PacketBlock<Packet,4> & taccImag,PacketBlock<Packetc,4> & acc1,PacketBlock<Packetc,4> & acc2)144 EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
145 {
146   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
147   acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
148   acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST);
149   acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST);
150 
151   acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
152   acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND);
153   acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND);
154   acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND);
155 }
156 
157 template<typename Packet, typename Packetc>
bcouple(PacketBlock<Packet,4> & taccReal,PacketBlock<Packet,4> & taccImag,PacketBlock<Packetc,8> & tRes,PacketBlock<Packetc,4> & acc1,PacketBlock<Packetc,4> & acc2)158 EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
159 {
160   bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
161 
162   acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
163   acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
164   acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
165   acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
166 
167   acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
168   acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
169   acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
170   acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
171 }
172 
173 template<typename Packet, typename Packetc>
bcouple_common(PacketBlock<Packet,1> & taccReal,PacketBlock<Packet,1> & taccImag,PacketBlock<Packetc,1> & acc1,PacketBlock<Packetc,1> & acc2)174 EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
175 {
176   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
177 
178   acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
179 }
180 
181 template<typename Packet, typename Packetc>
bcouple(PacketBlock<Packet,1> & taccReal,PacketBlock<Packet,1> & taccImag,PacketBlock<Packetc,2> & tRes,PacketBlock<Packetc,1> & acc1,PacketBlock<Packetc,1> & acc2)182 EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
183 {
184   bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
185 
186   acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
187 
188   acc2.packet[0] = padd<Packetc>(tRes.packet[1], acc2.packet[0]);
189 }
190 
191 template<>
192 EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
193 {
194   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
195   acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
196   acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST);
197   acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST);
198 
199   acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
200   acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND);
201   acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND);
202   acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND);
203 }
204 
205 template<>
206 EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd, 1>& acc1, PacketBlock<Packet1cd, 1>& acc2)
207 {
208   acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
209 
210   acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
211 }
212 
213 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
214 template<typename Scalar, typename Packet>
ploadRhs(const Scalar * rhs)215 EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs)
216 {
217   return ploadu<Packet>(rhs);
218 }
219 
220 } // end namespace internal
221 } // end namespace Eigen
222