// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2016 Gael Guennebaud <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

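// Granularity of the SIMD packet types selected by gemv_traits below: full,
// half and quarter width packets. The narrower widths let the kernels keep
// vectorizing tails that a full-width packet would overrun.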
enum GEMVPacketSizeType {
  GEMVPacketFull = 0,
  GEMVPacketHalf,
  GEMVPacketQuarter
};

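// Compile-time selector: maps GEMVPacketFull/Half/Quarter to T1/T2/T3.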
template <int N, typename T1, typename T2, typename T3>
struct gemv_packet_cond { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };

template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
class gemv_traits
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
  typedef typename gemv_packet_cond<packet_size,                                  \
                                    typename packet_traits<name ## Scalar>::type, \
                                    typename packet_traits<name ## Scalar>::half, \
                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
#undef PACKET_DECL_COND_PREFIX

public:
  enum {
        Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
        unpacket_traits<_RhsPacket>::vectorizable &&
        int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
        LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
        RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
        ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
};
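
// Illustrative sketch of what gemv_traits resolves to (assuming an AVX512
// build, where packet_traits<float>::type is Packet16f, its half is Packet8f,
// and unpacket_traits<Packet8f>::half is Packet4f):
//   gemv_traits<float,float,GEMVPacketFull>::LhsPacket    -> Packet16f
//   gemv_traits<float,float,GEMVPacketHalf>::LhsPacket    -> Packet8f
//   gemv_traits<float,float,GEMVPacketQuarter>::LhsPacket -> Packet4f
// On targets without narrower packet types the three widths coincide, and
// the half/quarter code paths below become dead code.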


/* Optimized col-major matrix * vector product:
 * This algorithm processes the matrix in vertical panels, which are then
 * processed horizontally in chunks of 8*PacketSize x 1 vertical segments.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * The same reasoning applies to the transposed case.
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  RhsScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& alhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);

  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC generate proper code.
  LhsMapper lhs(alhs);

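  // cj performs the scalar multiply (with optional conjugation of either
  // side) used in the tail loops; pcj, pcj_half and pcj_quarter are its
  // packet counterparts at the three packet widths.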
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  const Index lhsStride = lhs.stride();
  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };
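  // HasHalf/HasQuarter hold only when the target really provides narrower
  // packet types; otherwise the half/quarter typedefs alias the full-width
  // ones and the corresponding tail blocks below are compiled out.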
  const Index n8 = rows-8*ResPacketSize+1;
  const Index n4 = rows-4*ResPacketSize+1;
  const Index n3 = rows-3*ResPacketSize+1;
  const Index n2 = rows-2*ResPacketSize+1;
  const Index n1 = rows-1*ResPacketSize+1;
  const Index n_half = rows-1*ResPacketSizeHalf+1;
  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
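  // Each bound above is exclusive: a panel of k full packets (or one half or
  // quarter packet) starting at row i fits within the matrix iff i < nk.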

  // TODO: improve the following heuristic:
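  // Rationale (a guess, not a tuned constant): sweep the columns in blocks so
  // the active lhs panel stays in cache; 32000 bytes roughly matches a 32KB
  // L1 data cache, so matrices with a large column stride get the narrower
  // block of 4 columns.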
  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
  ResPacket palpha = pset1<ResPacket>(alpha);
  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);

  for(Index j2=0; j2<cols; j2+=block_cols)
  {
    Index jend = numext::mini(j2+block_cols,cols);
    Index i=0;
    for(; i<n8; i+=ResPacketSize*8)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0)),
                c4 = pset1<ResPacket>(ResScalar(0)),
                c5 = pset1<ResPacket>(ResScalar(0)),
                c6 = pset1<ResPacket>(ResScalar(0)),
                c7 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
    }
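    // Leftover rows are handled by progressively narrower panels: 4, 3, 2
    // and 1 full packets, then one half and one quarter packet, and finally
    // plain scalars.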
    if(i<n4)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));

      i+=ResPacketSize*4;
    }
    if(i<n3)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));

      i+=ResPacketSize*3;
    }
    if(i<n2)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      i+=ResPacketSize*2;
    }
    if(i<n1)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      i+=ResPacketSize;
    }
    if(HasHalf && i<n_half)
    {
      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
      i+=ResPacketSizeHalf;
    }
    if(HasQuarter && i<n_quarter)
    {
      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
      i+=ResPacketSizeQuarter;
    }
    for(;i<rows;++i)
    {
      ResScalar c0(0);
      for(Index j=j2; j<jend; j+=1)
        c0 += cj.pmul(lhs(i,j), rhs(j,0));
      res[i] += alpha*c0;
    }
  }
}
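
// Usage sketch (illustrative; this kernel is selected by Eigen's product
// evaluators rather than called directly): an expression such as
//   y.noalias() += alpha * A * x;  // A: col-major MatrixXf, x and y: VectorXf
// is routed to the run() above, with LhsMapper and RhsMapper wrapping the
// storage of A and x.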

/* Optimized row-major matrix * vector product:
 * This algorithm processes up to 8 rows at once (with 4-, 2- and 1-row
 * fallbacks), which both reduces the number of loads/stores of the result
 * and reduces instruction dependencies. Moreover, we know that all bands
 * have the same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization for the mixed real/complex cases
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& alhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC generate proper code.
  LhsMapper lhs(alhs);

  eigen_internal_assert(rhs.stride()==1);
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
  //       processing 8 rows at once might be counterproductive w.r.t. the cache.
  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
  const Index n4 = rows-3;
  const Index n2 = rows-1;
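  // n8/n4/n2 are exclusive bounds for the 8-, 4- and 2-row panels: a panel
  // of k rows starting at i fits iff i < rows-k+1; n8==0 skips the 8-row
  // panels entirely for large strides, per the heuristic above.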

  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

  Index i=0;
  for(; i<n8; i+=8)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0)),
              c4 = pset1<ResPacket>(ResScalar(0)),
              c5 = pset1<ResPacket>(ResScalar(0)),
              c6 = pset1<ResPacket>(ResScalar(0)),
              c7 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
    }
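    // Horizontal reduction: collapse each packet accumulator into the scalar
    // partial dot product of its row; the scalar loop below then finishes
    // the remaining columns.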
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    ResScalar cc4 = predux(c4);
    ResScalar cc5 = predux(c5);
    ResScalar cc6 = predux(c6);
    ResScalar cc7 = predux(c7);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
      cc4 += cj.pmul(lhs(i+4,j), b0);
      cc5 += cj.pmul(lhs(i+5,j), b0);
      cc6 += cj.pmul(lhs(i+6,j), b0);
      cc7 += cj.pmul(lhs(i+7,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
    res[(i+4)*resIncr] += alpha*cc4;
    res[(i+5)*resIncr] += alpha*cc5;
    res[(i+6)*resIncr] += alpha*cc6;
    res[(i+7)*resIncr] += alpha*cc7;
  }
  for(; i<n4; i+=4)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
  }
  for(; i<n2; i+=2)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
  }
  for(; i<rows; ++i)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
    }
    ResScalar cc0 = predux(c0);
    if (HasHalf) {
      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
      {
        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
      }
      cc0 += predux(c0_h);
    }
    if (HasQuarter) {
      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
      {
        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
      }
      cc0 += predux(c0_q);
    }
    for(; j<cols; ++j)
    {
      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
    }
    res[i*resIncr] += alpha*cc0;
  }
}
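
// Note: unlike the col-major kernel above, which asserts resIncr==1, this
// kernel supports a strided result via resIncr, e.g. when the destination
// vector is a row of a matrix.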

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H