xref: /aosp_15_r20/external/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2020 Everton Constantino ([email protected])
// Copyright (C) 2021 Chip Kerchner ([email protected])
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H

#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
#define EIGEN_ALTIVEC_USE_CUSTOM_PACK    1
#endif

#include "MatrixProductCommon.h"

// Since LLVM doesn't support dynamic dispatching, force either MMA or VSX at compile time
#if EIGEN_COMP_LLVM
#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
#ifdef __MMA__
#define EIGEN_ALTIVEC_MMA_ONLY
#else
#define EIGEN_ALTIVEC_DISABLE_MMA
#endif
#endif
#endif

#ifdef __has_builtin
#if __has_builtin(__builtin_mma_assemble_acc)
  #define ALTIVEC_MMA_SUPPORT
#endif
#endif

#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
  #include "MatrixProductMMA.h"
#endif

/**************************************************************************************************
 * TODO                                                                                           *
 * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could   *
 *   be vectorized).                                                                              *
 * - Check the possibility of transposing as GETREAL and GETIMAG when needed.                     *
 **************************************************************************************************/
namespace Eigen {

namespace internal {

/**************************
 * Constants and typedefs *
 **************************/
template<typename Scalar>
struct quad_traits
{
  typedef typename packet_traits<Scalar>::type    vectortype;
  typedef PacketBlock<vectortype,4>                     type;
  typedef vectortype                                 rhstype;
  enum
  {
    vectorsize = packet_traits<Scalar>::size,
    size = 4,
    rows = 4
  };
};

template<>
struct quad_traits<double>
{
  typedef Packet2d                        vectortype;
  typedef PacketBlock<vectortype,4>             type;
  typedef PacketBlock<Packet2d,2>            rhstype;
  enum
  {
    vectorsize = packet_traits<double>::size,
    size = 2,
    rows = 4
  };
};
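
// For example (illustration only): quad_traits<float>::vectortype is Packet4f with
// vectorsize 4, while the double specialization above uses Packet2d with vectorsize 2;
// in both cases the gemm kernels below work on blocks of 4 rows (rows == 4).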

// MatrixProduct decomposes complex vectors into a real vector and an imaginary vector, which turned out
// to be faster than Eigen's usual approach of keeping real/imaginary pairs in a single vector. The constants
// below are responsible for extracting and converting between Eigen's representation and MatrixProduct's.

const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,
                                             8,  9, 10, 11,
                                            16, 17, 18, 19,
                                            24, 25, 26, 27};

const static Packet16uc p16uc_GETIMAG32 = {  4,  5,  6,  7,
                                            12, 13, 14, 15,
                                            20, 21, 22, 23,
                                            28, 29, 30, 31};
const static Packet16uc p16uc_GETREAL64 = {  0,  1,  2,  3,  4,  5,  6,  7,
                                            16, 17, 18, 19, 20, 21, 22, 23};

//[a,ai],[b,bi] = [ai,bi]
const static Packet16uc p16uc_GETIMAG64 = {  8,  9, 10, 11, 12, 13, 14, 15,
                                            24, 25, 26, 27, 28, 29, 30, 31};

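// Illustration of the permutes above (a sketch, not part of the kernel): given complex
// float32 vectors v0 = [a, ai, b, bi] and v1 = [c, ci, d, di],
//   vec_perm(v0, v1, p16uc_GETREAL32) == [a, b, c, d]
//   vec_perm(v0, v1, p16uc_GETIMAG32) == [ai, bi, ci, di]
// and for float64, with v0 = [a, ai] and v1 = [b, bi],
//   vec_perm(v0, v1, p16uc_GETREAL64) == [a, b]
//   vec_perm(v0, v1, p16uc_GETIMAG64) == [ai, bi]
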
/*********************************************
 * Single precision real and complex packing *
 *********************************************/

/**
 * Symm packing is related to packing of symmetric adjoint blocks: as expected, the packing leaves
 * the diagonal real, and whatever is below it is copied from the respective upper-diagonal element
 * and conjugated. There's no PanelMode available for symm packing.
 *
 * Packing in general is supposed to leave the lhs block and the rhs block easy for gemm to read
 * with its respective rank-update instructions. The float32/64 versions differ because at this moment
 * the size of the accumulator is fixed at 512 bits, so you can't have a 4x4 accumulator of 64-bit elements.
 *
 * As mentioned earlier, MatrixProduct breaks complex numbers into a real vector and an imaginary vector,
 * so packing has to take that into account; at the moment, we pack the real part and then the imaginary
 * part. This is the main reason why packing for complex is broken down into several different parts, and
 * also the reason why we end up having float32/64 and complex float32/64 versions.
 **/
template<typename Scalar, typename Index, int StorageOrder>
EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
{
  std::complex<Scalar> v;
  if(i < j)
  {
    v.real( dt(j,i).real());
    v.imag(-dt(j,i).imag());
  } else if(i > j)
  {
    v.real( dt(i,j).real());
    v.imag( dt(i,j).imag());
  } else {
    v.real( dt(i,j).real());
    v.imag((Scalar)0.0);
  }
  return v;
}
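
// For instance (an illustration, assuming the values live in the lower triangle):
// getAdjointVal(0, 1, dt) returns conj(dt(1, 0)), getAdjointVal(1, 0, dt) returns
// dt(1, 0) unchanged, and getAdjointVal(1, 1, dt) returns dt(1, 1) with its
// imaginary part forced to zero.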

template<typename Scalar, typename Index, int StorageOrder, int N>
EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
  const Index depth = k2 + rows;
  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
  const Index vectorDelta = vectorSize * rows;
  Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);

  Index rir = 0, rii, j = 0;
  for(; j + vectorSize <= cols; j+=vectorSize)
  {
    rii = rir + vectorDelta;

    for(Index i = k2; i < depth; i++)
    {
      for(Index k = 0; k < vectorSize; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);

        blockBf[rir + k] = v.real();
        blockBf[rii + k] = v.imag();
      }
      rir += vectorSize;
      rii += vectorSize;
    }

    rir += vectorDelta;
  }
  if (j < cols)
  {
    rii = rir + ((cols - j) * rows);

    for(Index i = k2; i < depth; i++)
    {
      Index k = j;
      for(; k < cols; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);

        blockBf[rir] = v.real();
        blockBf[rii] = v.imag();

        rir += 1;
        rii += 1;
      }
    }
  }
}

template<typename Scalar, typename Index, int StorageOrder>
EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
{
  const Index depth = cols;
  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
  const Index vectorSize = quad_traits<Scalar>::vectorsize;
  const Index vectorDelta = vectorSize * depth;
  Scalar* blockAf = reinterpret_cast<Scalar *>(blockA);

  Index rir = 0, rii, j = 0;
  for(; j + vectorSize <= rows; j+=vectorSize)
  {
    rii = rir + vectorDelta;

    for(Index i = 0; i < depth; i++)
    {
      for(Index k = 0; k < vectorSize; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);

        blockAf[rir + k] = v.real();
        blockAf[rii + k] = v.imag();
      }
      rir += vectorSize;
      rii += vectorSize;
    }

    rir += vectorDelta;
  }

  if (j < rows)
  {
    rii = rir + ((rows - j) * depth);

    for(Index i = 0; i < depth; i++)
    {
      Index k = j;
      for(; k < rows; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);

        blockAf[rir] = v.real();
        blockAf[rii] = v.imag();

        rir += 1;
        rii += 1;
      }
    }
  }
}

template<typename Scalar, typename Index, int StorageOrder, int N>
EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
  const Index depth = k2 + rows;
  const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
  const Index vectorSize = quad_traits<Scalar>::vectorsize;

  Index ri = 0, j = 0;
  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
  {
    Index i = k2;
    for(; i < depth; i++)
    {
      for(Index k = 0; k < N*vectorSize; k++)
      {
        if(i <= j+k)
          blockB[ri + k] = rhs(j+k, i);
        else
          blockB[ri + k] = rhs(i, j+k);
      }
      ri += N*vectorSize;
    }
  }

  if (j < cols)
  {
    for(Index i = k2; i < depth; i++)
    {
      Index k = j;
      for(; k < cols; k++)
      {
        if(k <= i)
          blockB[ri] = rhs(i, k);
        else
          blockB[ri] = rhs(k, i);
        ri += 1;
      }
    }
  }
}

template<typename Scalar, typename Index, int StorageOrder>
EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
  const Index depth = cols;
  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
  const Index vectorSize = quad_traits<Scalar>::vectorsize;

  Index ri = 0, j = 0;
  for(; j + vectorSize <= rows; j+=vectorSize)
  {
    Index i = 0;

    for(; i < depth; i++)
    {
      for(Index k = 0; k < vectorSize; k++)
      {
        if(i <= j+k)
          blockA[ri + k] = lhs(j+k, i);
        else
          blockA[ri + k] = lhs(i, j+k);
      }
      ri += vectorSize;
    }
  }

  if (j < rows)
  {
    for(Index i = 0; i < depth; i++)
    {
      Index k = j;
      for(; k < rows; k++)
      {
        if(i <= k)
          blockA[ri] = lhs(k, i);
        else
          blockA[ri] = lhs(i, k);
        ri += 1;
      }
    }
  }
}

template<typename Index, int nr, int StorageOrder>
struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
{
  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
  {
    symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
  }
};

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
{
  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
  {
    symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
  }
};

// *********** symm_pack std::complex<double> ***********

template<typename Index, int nr, int StorageOrder>
struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
{
  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
  {
    symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
  }
};

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
{
  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
  {
    symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
  }
};

// *********** symm_pack float32 ***********
template<typename Index, int nr, int StorageOrder>
struct symm_pack_rhs<float, Index, nr, StorageOrder>
{
  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
  {
    symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
  }
};

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
{
  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
  {
    symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
  }
};

// *********** symm_pack float64 ***********
template<typename Index, int nr, int StorageOrder>
struct symm_pack_rhs<double, Index, nr, StorageOrder>
{
  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
  {
    symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
  }
};

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
{
  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
  {
    symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
  }
};

/**
 * PanelMode
 * Packing might be called several times before being multiplied by gebp_kernel; this happens because
 * on special occasions it fills part of a block with other parts of the matrix. Two variables control
 * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever
 * the real offset and stride will be in the future, and that is what you should obey. The process
 * is to behave as you would with normal packing, but leave the start of each part at the correct
 * offset and likewise the end, respecting the real stride the block will have. Gebp is aware of both
 * blocks' stride and offset and behaves accordingly.
 **/
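
// Illustrative sketch (not part of the kernel): for a packed strip of vector width w in
// PanelMode, the packers below advance the write index by w*offset before the data and by
// w*(stride - offset - depth) after it, so each strip occupies w*stride scalars and the
// depth actual entries live at positions [w*offset, w*(offset + depth)).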

template<typename Scalar, typename Packet, typename Index>
EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
{
  const Index size = 16 / sizeof(Scalar);
  pstore<Scalar>(to + (0 * size), block.packet[0]);
  pstore<Scalar>(to + (1 * size), block.packet[1]);
  pstore<Scalar>(to + (2 * size), block.packet[2]);
  pstore<Scalar>(to + (3 * size), block.packet[3]);
}

template<typename Scalar, typename Packet, typename Index>
EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
{
  const Index size = 16 / sizeof(Scalar);
  pstore<Scalar>(to + (0 * size), block.packet[0]);
  pstore<Scalar>(to + (1 * size), block.packet[1]);
}

// General template for lhs & rhs complex packing.
template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
struct dhs_cpack {
  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
  {
    const Index vectorSize = quad_traits<Scalar>::vectorsize;
    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
    Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
    Index j = 0;

    for(; j + vectorSize <= rows; j+=vectorSize)
    {
      Index i = 0;

      rii = rir + vectorDelta;

      for(; i + vectorSize <= depth; i+=vectorSize)
      {
        PacketBlock<Packet,4> blockr, blocki;
        PacketBlock<PacketC,8> cblock;

        if (UseLhs) {
          bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, j, i);
        } else {
          bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, i, j);
        }

        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
        blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
        blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
        blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);

        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
        blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
        blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
        blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);

        if(Conjugate)
        {
          blocki.packet[0] = -blocki.packet[0];
          blocki.packet[1] = -blocki.packet[1];
          blocki.packet[2] = -blocki.packet[2];
          blocki.packet[3] = -blocki.packet[3];
        }

        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
        {
          ptranspose(blockr);
          ptranspose(blocki);
        }

        storeBlock<Scalar, Packet, Index>(blockAt + rir, blockr);
        storeBlock<Scalar, Packet, Index>(blockAt + rii, blocki);

        rir += 4*vectorSize;
        rii += 4*vectorSize;
      }
      for(; i < depth; i++)
      {
        PacketBlock<Packet,1> blockr, blocki;
        PacketBlock<PacketC,2> cblock;

        if(((StorageOrder == ColMajor) && UseLhs) || ((StorageOrder == RowMajor) && !UseLhs))
        {
          if (UseLhs) {
            cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
            cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
          } else {
            cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
            cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
          }
        } else {
          std::complex<Scalar> lhs0, lhs1;
          if (UseLhs) {
            lhs0 = lhs(j + 0, i);
            lhs1 = lhs(j + 1, i);
            cblock.packet[0] = pload2(&lhs0, &lhs1);
            lhs0 = lhs(j + 2, i);
            lhs1 = lhs(j + 3, i);
            cblock.packet[1] = pload2(&lhs0, &lhs1);
          } else {
            lhs0 = lhs(i, j + 0);
            lhs1 = lhs(i, j + 1);
            cblock.packet[0] = pload2(&lhs0, &lhs1);
            lhs0 = lhs(i, j + 2);
            lhs1 = lhs(i, j + 3);
            cblock.packet[1] = pload2(&lhs0, &lhs1);
          }
        }

        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);

        if(Conjugate)
        {
          blocki.packet[0] = -blocki.packet[0];
        }

        pstore<Scalar>(blockAt + rir, blockr.packet[0]);
        pstore<Scalar>(blockAt + rii, blocki.packet[0]);

        rir += vectorSize;
        rii += vectorSize;
      }

      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
    }

    if (j < rows)
    {
      if(PanelMode) rir += (offset*(rows - j - vectorSize));
      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));

      for(Index i = 0; i < depth; i++)
      {
        Index k = j;
        for(; k < rows; k++)
        {
          if (UseLhs) {
            blockAt[rir] = lhs(k, i).real();

            if(Conjugate)
              blockAt[rii] = -lhs(k, i).imag();
            else
              blockAt[rii] =  lhs(k, i).imag();
          } else {
            blockAt[rir] = lhs(i, k).real();

            if(Conjugate)
              blockAt[rii] = -lhs(i, k).imag();
            else
              blockAt[rii] =  lhs(i, k).imag();
          }

          rir += 1;
          rii += 1;
        }
      }
    }
  }
};

// General template for lhs & rhs packing.
template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
struct dhs_pack {
  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
  {
    const Index vectorSize = quad_traits<Scalar>::vectorsize;
    Index ri = 0, j = 0;

    for(; j + vectorSize <= rows; j+=vectorSize)
    {
      Index i = 0;

      if(PanelMode) ri += vectorSize*offset;

      for(; i + vectorSize <= depth; i+=vectorSize)
      {
        PacketBlock<Packet,4> block;

        if (UseLhs) {
          bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, j, i);
        } else {
          bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, i, j);
        }
        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
        {
          ptranspose(block);
        }

        storeBlock<Scalar, Packet, Index>(blockA + ri, block);

        ri += 4*vectorSize;
      }
      for(; i < depth; i++)
      {
        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
        {
          if (UseLhs) {
            blockA[ri+0] = lhs(j+0, i);
            blockA[ri+1] = lhs(j+1, i);
            blockA[ri+2] = lhs(j+2, i);
            blockA[ri+3] = lhs(j+3, i);
          } else {
            blockA[ri+0] = lhs(i, j+0);
            blockA[ri+1] = lhs(i, j+1);
            blockA[ri+2] = lhs(i, j+2);
            blockA[ri+3] = lhs(i, j+3);
          }
        } else {
          Packet lhsV;
          if (UseLhs) {
            lhsV = lhs.template loadPacket<Packet>(j, i);
          } else {
            lhsV = lhs.template loadPacket<Packet>(i, j);
          }
          pstore<Scalar>(blockA + ri, lhsV);
        }

        ri += vectorSize;
      }

      if(PanelMode) ri += vectorSize*(stride - offset - depth);
    }

    if (j < rows)
    {
      if(PanelMode) ri += offset*(rows - j);

      for(Index i = 0; i < depth; i++)
      {
        Index k = j;
        for(; k < rows; k++)
        {
          if (UseLhs) {
            blockA[ri] = lhs(k, i);
          } else {
            blockA[ri] = lhs(i, k);
          }
          ri += 1;
        }
      }
    }
  }
};

// Lhs packing, float64 specialization.
template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
{
  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
  {
    const Index vectorSize = quad_traits<double>::vectorsize;
    Index ri = 0, j = 0;

    for(; j + vectorSize <= rows; j+=vectorSize)
    {
      Index i = 0;

      if(PanelMode) ri += vectorSize*offset;

      for(; i + vectorSize <= depth; i+=vectorSize)
      {
        PacketBlock<Packet2d,2> block;
        if(StorageOrder == RowMajor)
        {
          block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
          block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);

          ptranspose(block);
        } else {
          block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
          block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
        }

        storeBlock<double, Packet2d, Index>(blockA + ri, block);

        ri += 2*vectorSize;
      }
      for(; i < depth; i++)
      {
        if(StorageOrder == RowMajor)
        {
          blockA[ri+0] = lhs(j+0, i);
          blockA[ri+1] = lhs(j+1, i);
        } else {
          Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
          pstore<double>(blockA + ri, lhsV);
        }

        ri += vectorSize;
      }

      if(PanelMode) ri += vectorSize*(stride - offset - depth);
    }

    if (j < rows)
    {
      if(PanelMode) ri += offset*(rows - j);

      for(Index i = 0; i < depth; i++)
      {
        Index k = j;
        for(; k < rows; k++)
        {
          blockA[ri] = lhs(k, i);
          ri += 1;
        }
      }
    }
  }
};

// Rhs packing, float64 specialization.
template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
{
  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
  {
    const Index vectorSize = quad_traits<double>::vectorsize;
    Index ri = 0, j = 0;

    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
    {
      Index i = 0;

      if(PanelMode) ri += offset*(2*vectorSize);

      for(; i + vectorSize <= depth; i+=vectorSize)
      {
        PacketBlock<Packet2d,4> block;
        if(StorageOrder == ColMajor)
        {
          PacketBlock<Packet2d,2> block1, block2;
          block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
          block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
          block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
          block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);

          ptranspose(block1);
          ptranspose(block2);

          pstore<double>(blockB + ri    , block1.packet[0]);
          pstore<double>(blockB + ri + 2, block2.packet[0]);
          pstore<double>(blockB + ri + 4, block1.packet[1]);
          pstore<double>(blockB + ri + 6, block2.packet[1]);
        } else {
          block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0); //[a1 a2]
          block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2); //[a3 a4]
          block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0); //[b1 b2]
          block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2); //[b3 b4]

          storeBlock<double, Packet2d, Index>(blockB + ri, block);
        }

        ri += 4*vectorSize;
      }
      for(; i < depth; i++)
      {
        if(StorageOrder == ColMajor)
        {
          blockB[ri+0] = rhs(i, j+0);
          blockB[ri+1] = rhs(i, j+1);

          ri += vectorSize;

          blockB[ri+0] = rhs(i, j+2);
          blockB[ri+1] = rhs(i, j+3);
        } else {
          Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
          pstore<double>(blockB + ri, rhsV);

          ri += vectorSize;

          rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
          pstore<double>(blockB + ri, rhsV);
        }
        ri += vectorSize;
      }

      if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
    }

    if (j < cols)
    {
      if(PanelMode) ri += offset*(cols - j);

      for(Index i = 0; i < depth; i++)
      {
        Index k = j;
        for(; k < cols; k++)
        {
          blockB[ri] = rhs(i, k);
          ri += 1;
        }
      }
    }
  }
};

// Lhs complex packing, float64 specialization.
template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
{
  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
  {
    const Index vectorSize = quad_traits<double>::vectorsize;
    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
    double* blockAt = reinterpret_cast<double *>(blockA);
    Index j = 0;

    for(; j + vectorSize <= rows; j+=vectorSize)
    {
      Index i = 0;

      rii = rir + vectorDelta;

      for(; i + vectorSize <= depth; i+=vectorSize)
      {
        PacketBlock<Packet,2> blockr, blocki;
        PacketBlock<PacketC,4> cblock;

        if(StorageOrder == ColMajor)
        {
          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0); //[a1 a1i]
          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1); //[b1 b1i]

          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]

          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
          blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]

          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
          blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
        } else {
          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]

          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]

          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
          blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]

          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
          blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
        }

        if(Conjugate)
        {
          blocki.packet[0] = -blocki.packet[0];
          blocki.packet[1] = -blocki.packet[1];
        }

        storeBlock<double, Packet, Index>(blockAt + rir, blockr);
        storeBlock<double, Packet, Index>(blockAt + rii, blocki);

        rir += 2*vectorSize;
        rii += 2*vectorSize;
      }
      for(; i < depth; i++)
      {
        PacketBlock<Packet,1> blockr, blocki;
        PacketBlock<PacketC,2> cblock;

        cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
        cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);

        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);

        if(Conjugate)
        {
          blocki.packet[0] = -blocki.packet[0];
        }

        pstore<double>(blockAt + rir, blockr.packet[0]);
        pstore<double>(blockAt + rii, blocki.packet[0]);

        rir += vectorSize;
        rii += vectorSize;
      }

      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
    }

    if (j < rows)
    {
      if(PanelMode) rir += (offset*(rows - j - vectorSize));
      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));

      for(Index i = 0; i < depth; i++)
      {
        Index k = j;
        for(; k < rows; k++)
        {
          blockAt[rir] = lhs(k, i).real();

          if(Conjugate)
            blockAt[rii] = -lhs(k, i).imag();
          else
            blockAt[rii] =  lhs(k, i).imag();

          rir += 1;
          rii += 1;
        }
      }
    }
  }
};

// Rhs complex packing, float64 specialization.
template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
{
  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
  {
    const Index vectorSize = quad_traits<double>::vectorsize;
    const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
    Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
    double* blockBt = reinterpret_cast<double *>(blockB);
    Index j = 0;

    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
    {
      Index i = 0;

      rii = rir + vectorDelta;

      for(; i < depth; i++)
      {
        PacketBlock<PacketC,4> cblock;
        PacketBlock<Packet,2> blockr, blocki;

        bload<DataMapper, PacketC, Index, 2, 0, ColMajor>(cblock, rhs, i, j);

        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
        blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);

        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
        blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);

        if(Conjugate)
        {
          blocki.packet[0] = -blocki.packet[0];
          blocki.packet[1] = -blocki.packet[1];
        }

        storeBlock<double, Packet, Index>(blockBt + rir, blockr);
        storeBlock<double, Packet, Index>(blockBt + rii, blocki);

        rir += 2*vectorSize;
        rii += 2*vectorSize;
      }

      rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
    }

    if (j < cols)
    {
      if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
      rii = rir + (((PanelMode) ? stride : depth) * (cols - j));

      for(Index i = 0; i < depth; i++)
      {
        Index k = j;
        for(; k < cols; k++)
        {
          blockBt[rir] = rhs(i, k).real();

          if(Conjugate)
            blockBt[rii] = -rhs(i, k).imag();
          else
            blockBt[rii] =  rhs(i, k).imag();

          rir += 1;
          rii += 1;
        }
      }
    }
  }
};

/**************
 * GEMM utils *
 **************/

// 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex gemm).
template<typename Packet, bool NegativeAccumulate>
EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
{
  if(NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
  }
}

template<typename Packet, bool NegativeAccumulate>
EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
{
  if(NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
  }
}

template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
{
  Packet lhsV = pload<Packet>(lhs);

  pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
}
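
// In scalar terms (an illustrative note, not extra functionality): pger performs the
// rank-1 update
//   acc->packet[j] = acc->packet[j] + lhsV * rhsV[j]   (NegativeAccumulate == false)
//   acc->packet[j] = acc->packet[j] - lhsV * rhsV[j]   (NegativeAccumulate == true)
// for each packet j, via vec_madd/vec_nmsub.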

template<typename Scalar, typename Packet, typename Index>
EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
{
#ifdef _ARCH_PWR9
  lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
#else
  Index i = 0;
  do {
    lhsV[i] = lhs[i];
  } while (++i < remaining_rows);
#endif
}

template<int N, typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
{
  Packet lhsV;
  loadPacketRemaining<Scalar, Packet, Index>(lhs, lhsV, remaining_rows);

  pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
}

// 512-bit rank-1 update of a complex acc. It takes decoupled accumulators as inputs and also takes care of the mixed types real * complex and complex * real.
template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
{
  pger_common<Packet, false>(accReal, lhsV, rhsV);
  if(LhsIsReal)
  {
    pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
    EIGEN_UNUSED_VARIABLE(lhsVi);
  } else {
    if (!RhsIsReal) {
      pger_common<Packet, ConjugateLhs == ConjugateRhs>(accReal, lhsVi, rhsVi);
      pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
    } else {
      EIGEN_UNUSED_VARIABLE(rhsVi);
    }
    pger_common<Packet, ConjugateLhs>(accImag, lhsVi, rhsV);
  }
}
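
// In scalar terms (an illustrative note): for the plain complex case (no conjugation,
// neither side real), pgerc_common accumulates the complex product
//   accReal += lhsV * rhsV - lhsVi * rhsVi
//   accImag += lhsV * rhsVi + lhsVi * rhsV
// ConjugateLhs/ConjugateRhs flip the signs of the imaginary-part contributions, and
// LhsIsReal/RhsIsReal skip the terms involving the missing imaginary vector.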

template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
{
  Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
  Packet lhsVi;
  if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);

  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
}

template<typename Scalar, typename Packet, typename Index, bool LhsIsReal>
EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
{
#ifdef _ARCH_PWR9
  lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
  if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar));
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
#else
  Index i = 0;
  do {
    lhsV[i] = lhs_ptr[i];
    if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
  } while (++i < remaining_rows);
  if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
#endif
}

template<int N, typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
{
  Packet lhsV, lhsVi;
  loadPacketRemaining<Scalar, Packet, Index, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);

  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
}

template<typename Scalar, typename Packet>
EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs)
{
  return ploadu<Packet>(lhs);
}

// Zero the accumulator on PacketBlock.
template<typename Scalar, typename Packet>
EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
{
  acc.packet[0] = pset1<Packet>((Scalar)0);
  acc.packet[1] = pset1<Packet>((Scalar)0);
  acc.packet[2] = pset1<Packet>((Scalar)0);
  acc.packet[3] = pset1<Packet>((Scalar)0);
}

template<typename Scalar, typename Packet>
EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
{
  acc.packet[0] = pset1<Packet>((Scalar)0);
}

// Scale the PacketBlock vectors by alpha.
template<typename Packet>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
  acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
  acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
  acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
}

template<typename Packet>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
}

template<typename Packet>
EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
  acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
  acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
  acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
}

template<typename Packet>
EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
}

// Complex version of PacketBlock scaling.
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
{
  bscalec_common<Packet>(cReal, aReal, bReal);

  bscalec_common<Packet>(cImag, aImag, bReal);

  pger_common<Packet, true>(&cReal, bImag, aImag.packet);

  pger_common<Packet, false>(&cImag, bImag, aReal.packet);
}

template<typename Packet>
EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
{
  acc.packet[0] = pand(acc.packet[0], pMask);
  acc.packet[1] = pand(acc.packet[1], pMask);
  acc.packet[2] = pand(acc.packet[2], pMask);
  acc.packet[3] = pand(acc.packet[3], pMask);
}

template<typename Packet>
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
{
  band<Packet>(aReal, pMask);
  band<Packet>(aImag, pMask);

  bscalec<Packet,4>(aReal, aImag, bReal, bImag, cReal, cImag);
}

// Load a PacketBlock; the N parameters make tuning gemm easier, so we can add more accumulators as needed.
template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
{
  if (StorageOrder == RowMajor) {
    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
    acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
    acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
    acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
  } else {
    acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
    acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
    acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
    acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
  }
}

// An overload of bload for a PacketBlock with 8 vectors.
template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
{
  if (StorageOrder == RowMajor) {
    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
    acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
    acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
    acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
    acc.packet[4] = res.template loadPacket<Packet>(row + 0, col + (N+1)*accCols);
    acc.packet[5] = res.template loadPacket<Packet>(row + 1, col + (N+1)*accCols);
    acc.packet[6] = res.template loadPacket<Packet>(row + 2, col + (N+1)*accCols);
    acc.packet[7] = res.template loadPacket<Packet>(row + 3, col + (N+1)*accCols);
  } else {
    acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
    acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
    acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
    acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
    acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
    acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
    acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
    acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
  }
}

template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
{
  acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
  acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
}

const static Packet4i mask41 = { -1,  0,  0,  0 };
const static Packet4i mask42 = { -1, -1,  0,  0 };
const static Packet4i mask43 = { -1, -1, -1,  0 };

const static Packet2l mask21 = { -1, 0 };

template<typename Packet>
EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
{
  if (remaining_rows == 0) {
    return pset1<Packet>(float(0.0));  // Not used
  } else {
    switch (remaining_rows) {
      case 1:  return Packet(mask41);
      case 2:  return Packet(mask42);
      default: return Packet(mask43);
    }
  }
}

template<>
EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
{
  if (remaining_rows == 0) {
    return pset1<Packet2d>(double(0.0));  // Not used
  } else {
    return Packet2d(mask21);
  }
}
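
// Example (illustration only): bmask(2) yields the bit pattern of mask42, i.e. all-ones
// bits in the first two lanes and zeros elsewhere, so band() keeps exactly the two
// remaining rows of an accumulator; the Packet2d variant always keeps a single row.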

template<typename Packet>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
{
  band<Packet>(accZ, pMask);

  bscale<Packet>(acc, accZ, pAlpha);
}

template<typename Packet>
EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
  pbroadcast4<Packet>(a, a0, a1, a2, a3);
}

template<>
EIGEN_ALWAYS_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
  a1 = pload<Packet2d>(a);
  a3 = pload<Packet2d>(a + 2);
  a0 = vec_splat(a1, 0);
  a1 = vec_splat(a1, 1);
  a2 = vec_splat(a3, 0);
  a3 = vec_splat(a3, 1);
}
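
// Example (illustration only): for a = {x0, x1, x2, x3}, the Packet2d overload above
// loads [x0, x1] and [x2, x3] and then splats each scalar across a packet, producing
// a0 = [x0, x0], a1 = [x1, x1], a2 = [x2, x2], a3 = [x3, x3].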

// PEEL loop factor: the depth (k) loops below are unrolled by this factor between prefetches.
#define PEEL 7

template<typename Scalar, typename Packet, typename Index>
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL(
  const Scalar* &lhs_ptr,
  const Scalar* &rhs_ptr,
  PacketBlock<Packet,1> &accZero,
  Index remaining_rows,
  Index remaining_cols)
{
  Packet rhsV[1];
  rhsV[0] = pset1<Packet>(rhs_ptr[0]);
  pger<1, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
  lhs_ptr += remaining_rows;
  rhs_ptr += remaining_cols;
}

template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
EIGEN_STRONG_INLINE void gemm_extra_col(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index row,
  Index col,
  Index remaining_rows,
  Index remaining_cols,
  const Packet& pAlpha)
{
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
  PacketBlock<Packet,1> accZero;

  bsetzero<Scalar, Packet>(accZero);

  Index remaining_depth = (depth & -accRows);
  Index k = 0;
  for(; k + PEEL <= remaining_depth; k+= PEEL)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr);
    EIGEN_POWER_PREFETCH(lhs_ptr);
    for (int l = 0; l < PEEL; l++) {
      MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
    }
  }
  for(; k < remaining_depth; k++)
  {
    MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
  }
  for(; k < depth; k++)
  {
    Packet rhsV[1];
    rhsV[0] = pset1<Packet>(rhs_ptr[0]);
    pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
    lhs_ptr += remaining_rows;
    rhs_ptr += remaining_cols;
  }

  accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
  for(Index i = 0; i < remaining_rows; i++) {
    res(row + i, col) += accZero.packet[0][i];
  }
}

template<typename Scalar, typename Packet, typename Index, const Index accRows>
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
  const Scalar* &lhs_ptr,
  const Scalar* &rhs_ptr,
  PacketBlock<Packet,4> &accZero,
  Index remaining_rows)
{
  Packet rhsV[4];
  pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
  pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
  lhs_ptr += remaining_rows;
  rhs_ptr += accRows;
}

template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
EIGEN_STRONG_INLINE void gemm_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index row,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlpha,
  const Packet& pMask)
{
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
  PacketBlock<Packet,4> accZero, acc;

  bsetzero<Scalar, Packet>(accZero);

  Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
  Index k = 0;
  for(; k + PEEL <= remaining_depth; k+= PEEL)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr);
    EIGEN_POWER_PREFETCH(lhs_ptr);
    for (int l = 0; l < PEEL; l++) {
      MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
    }
  }
  for(; k < remaining_depth; k++)
  {
    MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
  }

  if ((remaining_depth == depth) && (rows >= accCols))
  {
    for(Index j = 0; j < 4; j++) {
      acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
    }
    bscale<Packet>(acc, accZero, pAlpha, pMask);
    res.template storePacketBlock<Packet,4>(row, col, acc);
  } else {
    for(; k < depth; k++)
    {
      Packet rhsV[4];
      pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
      pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
      lhs_ptr += remaining_rows;
      rhs_ptr += accRows;
    }

    for(Index j = 0; j < 4; j++) {
      accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
    }
    for(Index j = 0; j < 4; j++) {
      for(Index i = 0; i < remaining_rows; i++) {
        res(row + i, col + j) += accZero.packet[j][i];
      }
    }
  }
}
1446 
1447 #define MICRO_UNROLL(func) \
1448   func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
1449 
1450 #define MICRO_UNROLL_WORK(func, func2, peel) \
1451     MICRO_UNROLL(func2); \
1452     func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
1453     func(4,peel) func(5,peel) func(6,peel) func(7,peel)
1454 
1455 #define MICRO_LOAD_ONE(iter) \
1456   if (unroll_factor > iter) { \
1457     lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
1458     lhs_ptr##iter += accCols; \
1459   } else { \
1460     EIGEN_UNUSED_VARIABLE(lhsV##iter); \
1461   }
1462 
1463 #define MICRO_WORK_ONE(iter, peel) \
1464   if (unroll_factor > iter) { \
1465     pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \
1466   }
1467 
1468 #define MICRO_TYPE_PEEL4(func, func2, peel) \
1469   if (PEEL > peel) { \
1470     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
1471     pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
1472     MICRO_UNROLL_WORK(func, func2, peel) \
1473   } else { \
1474     EIGEN_UNUSED_VARIABLE(rhsV##peel); \
1475   }
1476 
1477 #define MICRO_TYPE_PEEL1(func, func2, peel) \
1478   if (PEEL > peel) { \
1479     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
1480     rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \
1481     MICRO_UNROLL_WORK(func, func2, peel) \
1482   } else { \
1483     EIGEN_UNUSED_VARIABLE(rhsV##peel); \
1484   }
1485 
1486 #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
1487   Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
1488   func(func1,func2,0); func(func1,func2,1); \
1489   func(func1,func2,2); func(func1,func2,3); \
1490   func(func1,func2,4); func(func1,func2,5); \
1491   func(func1,func2,6); func(func1,func2,7); \
1492   func(func1,func2,8); func(func1,func2,9);
1493 
1494 #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
1495   Packet rhsV0[M]; \
1496   func(func1,func2,0);
1497 
1498 #define MICRO_ONE_PEEL4 \
1499   MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
1500   rhs_ptr += (accRows * PEEL);
1501 
1502 #define MICRO_ONE4 \
1503   MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
1504   rhs_ptr += accRows;
1505 
1506 #define MICRO_ONE_PEEL1 \
1507   MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
1508   rhs_ptr += (remaining_cols * PEEL);
1509 
1510 #define MICRO_ONE1 \
1511   MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
1512   rhs_ptr += remaining_cols;
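
// The four MICRO_ONE* macros above are the inner-loop bodies of the unrolled
// kernels: the *4 variants consume one accRows-wide column panel per depth
// step, while the *1 variants consume a single remainder column. Each PEEL
// variant materializes ten rhsV slots (rhsV0..rhsV9), but only the first PEEL
// of them survive the compile-time `PEEL > peel` checks in MICRO_TYPE_PEEL*.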
1513 
1514 #define MICRO_DST_PTR_ONE(iter) \
1515   if (unroll_factor > iter) { \
1516     bsetzero<Scalar, Packet>(accZero##iter); \
1517   } else { \
1518     EIGEN_UNUSED_VARIABLE(accZero##iter); \
1519   }
1520 
1521 #define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
1522 
1523 #define MICRO_SRC_PTR_ONE(iter) \
1524   if (unroll_factor > iter) { \
1525     lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
1526   } else { \
1527     EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
1528   }
1529 
1530 #define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
1531 
1532 #define MICRO_PREFETCH_ONE(iter) \
1533   if (unroll_factor > iter) { \
1534     EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
1535   }
1536 
1537 #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
1538 
1539 #define MICRO_STORE_ONE(iter) \
1540   if (unroll_factor > iter) { \
1541     acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
1542     acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
1543     acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
1544     acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
1545     bscale<Packet>(acc, accZero##iter, pAlpha); \
1546     res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \
1547   }
1548 
1549 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
1550 
1551 #define MICRO_COL_STORE_ONE(iter) \
1552   if (unroll_factor > iter) { \
1553     acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
1554     bscale<Packet>(acc, accZero##iter, pAlpha); \
1555     res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \
1556   }
1557 
1558 #define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
1559 
1560 template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
1561 EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
1562   const DataMapper& res,
1563   const Scalar* lhs_base,
1564   const Scalar* rhs_base,
1565   Index depth,
1566   Index strideA,
1567   Index offsetA,
1568   Index& row,
1569   Index col,
1570   const Packet& pAlpha)
1571 {
1572   const Scalar* rhs_ptr = rhs_base;
1573   const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
1574   PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
1575   PacketBlock<Packet,4> acc;
1576 
1577   MICRO_SRC_PTR
1578   MICRO_DST_PTR
1579 
1580   Index k = 0;
1581   for(; k + PEEL <= depth; k+= PEEL)
1582   {
1583     EIGEN_POWER_PREFETCH(rhs_ptr);
1584     MICRO_PREFETCH
1585     MICRO_ONE_PEEL4
1586   }
1587   for(; k < depth; k++)
1588   {
1589     MICRO_ONE4
1590   }
1591   MICRO_STORE
1592 
1593   row += unroll_factor*accCols;
1594 }
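
// One call to gemm_unrolled_iteration accumulates an (unroll_factor*accCols)
// x accRows block of the result over the full depth and then advances `row`.
// A sketch of the roles, assuming unroll_factor == 6 and accCols == 4 (float):
//   - six lhs pointers walk six adjacent packed panels of 4 rows each;
//   - accZero0..accZero5 hold the 4x4 packet accumulators;
//   - after the k loops, MICRO_STORE scales by pAlpha, writes back 24 rows,
//     and row advances by 6*4 == 24.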
1595 
1596 template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
1597 EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(
1598   const DataMapper& res,
1599   const Scalar* lhs_base,
1600   const Scalar* rhs_base,
1601   Index depth,
1602   Index strideA,
1603   Index offsetA,
1604   Index& row,
1605   Index col,
1606   Index remaining_cols,
1607   const Packet& pAlpha)
1608 {
1609   const Scalar* rhs_ptr = rhs_base;
1610   const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
1611   PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
1612   PacketBlock<Packet,1> acc;
1613 
1614   MICRO_SRC_PTR
1615   MICRO_DST_PTR
1616 
1617   Index k = 0;
1618   for(; k + PEEL <= depth; k+= PEEL)
1619   {
1620     EIGEN_POWER_PREFETCH(rhs_ptr);
1621     MICRO_PREFETCH
1622     MICRO_ONE_PEEL1
1623   }
1624   for(; k < depth; k++)
1625   {
1626     MICRO_ONE1
1627   }
1628   MICRO_COL_STORE
1629 
1630   row += unroll_factor*accCols;
1631 }
1632 
1633 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
1634 EIGEN_STRONG_INLINE void gemm_unrolled_col(
1635   const DataMapper& res,
1636   const Scalar* lhs_base,
1637   const Scalar* rhs_base,
1638   Index depth,
1639   Index strideA,
1640   Index offsetA,
1641   Index& row,
1642   Index rows,
1643   Index col,
1644   Index remaining_cols,
1645   const Packet& pAlpha)
1646 {
1647 #define MAX_UNROLL 6
1648   while(row + MAX_UNROLL*accCols <= rows) {
1649     gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1650   }
1651   switch( (rows-row)/accCols ) {
1652 #if MAX_UNROLL > 7
1653     case 7:
1654       gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1655       break;
1656 #endif
1657 #if MAX_UNROLL > 6
1658     case 6:
1659       gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1660       break;
1661 #endif
1662 #if MAX_UNROLL > 5
1663     case 5:
1664       gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1665       break;
1666 #endif
1667 #if MAX_UNROLL > 4
1668     case 4:
1669       gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1670       break;
1671 #endif
1672 #if MAX_UNROLL > 3
1673     case 3:
1674       gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1675       break;
1676 #endif
1677 #if MAX_UNROLL > 2
1678     case 2:
1679       gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1680       break;
1681 #endif
1682 #if MAX_UNROLL > 1
1683     case 1:
1684       gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1685       break;
1686 #endif
1687     default:
1688       break;
1689   }
1690 #undef MAX_UNROLL
1691 }
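
// Dispatch pattern used here and in gemm/gemm_complex below: the while loop
// retires full groups of MAX_UNROLL panels, then the switch finishes the
// remaining 0..MAX_UNROLL-1 panels with a fixed unroll factor. The guarded
// cases let MAX_UNROLL be retuned without rewriting the switch; with
// MAX_UNROLL == 6 the `case 7` arm is compiled out and `case 6` is never
// reached (the while loop has already consumed it), but both are kept so the
// constant can be changed safely.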
1692 
1693 /****************
1694  * GEMM kernels *
1695  ****************/
1696 template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
1697 EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
1698 {
1699       const Index remaining_rows = rows % accCols;
1700       const Index remaining_cols = cols % accRows;
1701 
1702       if( strideA == -1 ) strideA = depth;
1703       if( strideB == -1 ) strideB = depth;
1704 
1705       const Packet pAlpha = pset1<Packet>(alpha);
1706       const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
1707 
1708       Index col = 0;
1709       for(; col + accRows <= cols; col += accRows)
1710       {
1711         const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
1712         const Scalar* lhs_base = blockA;
1713         Index row = 0;
1714 
1715 #define MAX_UNROLL 6
1716         while(row + MAX_UNROLL*accCols <= rows) {
1717           gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1718         }
1719         switch( (rows-row)/accCols ) {
1720 #if MAX_UNROLL > 7
1721           case 7:
1722             gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1723             break;
1724 #endif
1725 #if MAX_UNROLL > 6
1726           case 6:
1727             gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1728             break;
1729 #endif
1730 #if MAX_UNROLL > 5
1731           case 5:
1732             gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1733             break;
1734 #endif
1735 #if MAX_UNROLL > 4
1736           case 4:
1737             gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1738             break;
1739 #endif
1740 #if MAX_UNROLL > 3
1741           case 3:
1742             gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1743             break;
1744 #endif
1745 #if MAX_UNROLL > 2
1746           case 2:
1747             gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1748             break;
1749 #endif
1750 #if MAX_UNROLL > 1
1751           case 1:
1752             gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1753             break;
1754 #endif
1755           default:
1756             break;
1757         }
1758 #undef MAX_UNROLL
1759 
1760         if(remaining_rows > 0)
1761         {
1762           gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
1763         }
1764       }
1765 
1766       if(remaining_cols > 0)
1767       {
1768         const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
1769         const Scalar* lhs_base = blockA;
1770 
1771         for(; col < cols; col++)
1772         {
1773           Index row = 0;
1774 
1775           gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
1776 
1777           if (remaining_rows > 0)
1778           {
1779             gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
1780           }
1781           rhs_base++;
1782         }
1783       }
1784 }
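
// A sketch of how this kernel is reached. The gebp_kernel<float, float, ...>
// specialization near the end of this file effectively does the following
// (names as declared there; not compiled here):
#if 0
  const Index accRows = quad_traits<float>::rows;   // 4 for float
  const Index accCols = quad_traits<float>::size;   // 4 for float
  gemm<float, Index, Packet4f, Packet4f, DataMapper, accRows, accCols>(
      res, blockA, blockB, rows, depth, cols, alpha,
      strideA, strideB, offsetA, offsetB);
#endif
// Columns are processed accRows at a time; the remaining_cols/remaining_rows
// tails fall through to gemm_unrolled_col, gemm_extra_col and gemm_extra_row.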
1785 
1786 #define accColsC (accCols / 2)
1787 #define advanceRows ((LhsIsReal) ? 1 : 2)
1788 #define advanceCols ((RhsIsReal) ? 1 : 2)
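
// accColsC (= accCols/2) is the number of complex values one complex packet
// holds, which is why the complex store paths write two packet blocks, at
// `row` and at `row + accColsC`. advanceRows/advanceCols are 2 when the
// corresponding operand is complex: the packed buffers keep the real plane
// and the imaginary plane back to back (the imag pointer is derived from the
// real one, offset by the panel width times the stride), so each logical
// panel occupies twice the depth in the packed data.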
1789 
1790 // PEEL_COMPLEX is the loop peeling (unroll) factor for the complex kernels.
1791 #define PEEL_COMPLEX 3
1792 
1793 template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1794 EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL(
1795   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
1796   const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
1797   PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
1798   Index remaining_rows,
1799   Index remaining_cols)
1800 {
1801   Packet rhsV[1], rhsVi[1];
1802   rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1803   if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1804   pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
1805   lhs_ptr_real += remaining_rows;
1806   if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1807   else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1808   rhs_ptr_real += remaining_cols;
1809   if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
1810   else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1811 }
1812 
1813 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1814 EIGEN_STRONG_INLINE void gemm_complex_extra_col(
1815   const DataMapper& res,
1816   const Scalar* lhs_base,
1817   const Scalar* rhs_base,
1818   Index depth,
1819   Index strideA,
1820   Index offsetA,
1821   Index strideB,
1822   Index row,
1823   Index col,
1824   Index remaining_rows,
1825   Index remaining_cols,
1826   const Packet& pAlphaReal,
1827   const Packet& pAlphaImag)
1828 {
1829   const Scalar* rhs_ptr_real = rhs_base;
1830   const Scalar* rhs_ptr_imag;
1831   if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB;
1832   else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1833   const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
1834   const Scalar* lhs_ptr_imag;
1835   if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1836   else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1837   PacketBlock<Packet,1> accReal, accImag;
1838   PacketBlock<Packet,1> taccReal, taccImag;
1839   PacketBlock<Packetc,1> acc0, acc1;
1840 
1841   bsetzero<Scalar, Packet>(accReal);
1842   bsetzero<Scalar, Packet>(accImag);
1843 
1844   Index remaining_depth = (depth & -accRows);
1845   Index k = 0;
1846   for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1847   {
1848     EIGEN_POWER_PREFETCH(rhs_ptr_real);
1849     if(!RhsIsReal) {
1850       EIGEN_POWER_PREFETCH(rhs_ptr_imag);
1851     }
1852     EIGEN_POWER_PREFETCH(lhs_ptr_real);
1853     if(!LhsIsReal) {
1854       EIGEN_POWER_PREFETCH(lhs_ptr_imag);
1855     }
1856     for (int l = 0; l < PEEL_COMPLEX; l++) {
1857       MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1858     }
1859   }
1860   for(; k < remaining_depth; k++)
1861   {
1862     MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1863   }
1864 
1865   for(; k < depth; k++)
1866   {
1867     Packet rhsV[1], rhsVi[1];
1868     rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1869     if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1870     pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1871     lhs_ptr_real += remaining_rows;
1872     if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1873     rhs_ptr_real += remaining_cols;
1874     if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
1875   }
1876 
1877   bscalec<Packet,1>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1878   bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
1879 
1880   if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
1881   {
1882     res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
1883   } else {
1884     acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
1885     res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
1886     if(remaining_rows > accColsC) {
1887       res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
1888     }
1889   }
1890 }
1891 
1892 template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1893 EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
1894   const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
1895   const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
1896   PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
1897   Index remaining_rows)
1898 {
1899   Packet rhsV[4], rhsVi[4];
1900   pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1901   if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1902   pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
1903   lhs_ptr_real += remaining_rows;
1904   if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1905   else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1906   rhs_ptr_real += accRows;
1907   if(!RhsIsReal) rhs_ptr_imag += accRows;
1908   else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1909 }
1910 
1911 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1912 EIGEN_STRONG_INLINE void gemm_complex_extra_row(
1913   const DataMapper& res,
1914   const Scalar* lhs_base,
1915   const Scalar* rhs_base,
1916   Index depth,
1917   Index strideA,
1918   Index offsetA,
1919   Index strideB,
1920   Index row,
1921   Index col,
1922   Index rows,
1923   Index cols,
1924   Index remaining_rows,
1925   const Packet& pAlphaReal,
1926   const Packet& pAlphaImag,
1927   const Packet& pMask)
1928 {
1929   const Scalar* rhs_ptr_real = rhs_base;
1930   const Scalar* rhs_ptr_imag;
1931   if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
1932   else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1933   const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
1934   const Scalar* lhs_ptr_imag;
1935   if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1936   else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1937   PacketBlock<Packet,4> accReal, accImag;
1938   PacketBlock<Packet,4> taccReal, taccImag;
1939   PacketBlock<Packetc,4> acc0, acc1;
1940   PacketBlock<Packetc,8> tRes;
1941 
1942   bsetzero<Scalar, Packet>(accReal);
1943   bsetzero<Scalar, Packet>(accImag);
1944 
1945   Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
1946   Index k = 0;
1947   for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1948   {
1949     EIGEN_POWER_PREFETCH(rhs_ptr_real);
1950     if(!RhsIsReal) {
1951       EIGEN_POWER_PREFETCH(rhs_ptr_imag);
1952     }
1953     EIGEN_POWER_PREFETCH(lhs_ptr_real);
1954     if(!LhsIsReal) {
1955       EIGEN_POWER_PREFETCH(lhs_ptr_imag);
1956     }
1957     for (int l = 0; l < PEEL_COMPLEX; l++) {
1958       MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1959     }
1960   }
1961   for(; k < remaining_depth; k++)
1962   {
1963     MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1964   }
1965 
1966   if ((remaining_depth == depth) && (rows >= accCols))
1967   {
1968     bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row, col);
1969     bscalec<Packet>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
1970     bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1);
1971     res.template storePacketBlock<Packetc,4>(row + 0, col, acc0);
1972     res.template storePacketBlock<Packetc,4>(row + accColsC, col, acc1);
1973   } else {
1974     for(; k < depth; k++)
1975     {
1976       Packet rhsV[4], rhsVi[4];
1977       pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1978       if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1979       pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1980       lhs_ptr_real += remaining_rows;
1981       if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1982       rhs_ptr_real += accRows;
1983       if(!RhsIsReal) rhs_ptr_imag += accRows;
1984     }
1985 
1986     bscalec<Packet,4>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1987     bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
1988 
1989     if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
1990     {
1991       for(Index j = 0; j < 4; j++) {
1992         res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
1993       }
1994     } else {
1995       for(Index j = 0; j < 4; j++) {
1996         PacketBlock<Packetc,1> acc2;
1997         acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
1998         res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
1999         if(remaining_rows > accColsC) {
2000           res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
2001         }
2002       }
2003     }
2004   }
2005 }
2006 
2007 #define MICRO_COMPLEX_UNROLL(func) \
2008   func(0) func(1) func(2) func(3) func(4)
2009 
2010 #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
2011     MICRO_COMPLEX_UNROLL(func2); \
2012     func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel)
2013 
2014 #define MICRO_COMPLEX_LOAD_ONE(iter) \
2015   if (unroll_factor > iter) { \
2016     lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
2017     lhs_ptr_real##iter += accCols; \
2018     if(!LhsIsReal) { \
2019       lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
2020       lhs_ptr_imag##iter += accCols; \
2021     } else { \
2022       EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
2023     } \
2024   } else { \
2025     EIGEN_UNUSED_VARIABLE(lhsV##iter); \
2026     EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
2027   }
2028 
2029 #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
2030   if (unroll_factor > iter) { \
2031     pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
2032   }
2033 
2034 #define MICRO_COMPLEX_WORK_ONE1(iter, peel) \
2035   if (unroll_factor > iter) { \
2036     pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
2037   }
2038 
2039 #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
2040   if (PEEL_COMPLEX > peel) { \
2041     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
2042     Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
2043     pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
2044     if(!RhsIsReal) { \
2045       pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
2046     } else { \
2047       EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2048     } \
2049     MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
2050   } else { \
2051     EIGEN_UNUSED_VARIABLE(rhsV##peel); \
2052     EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2053   }
2054 
2055 #define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \
2056   if (PEEL_COMPLEX > peel) { \
2057     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
2058     Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
2059     rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \
2060     if(!RhsIsReal) { \
2061       rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \
2062     } else { \
2063       EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2064     } \
2065     MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
2066   } else { \
2067     EIGEN_UNUSED_VARIABLE(rhsV##peel); \
2068     EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2069   }
2070 
2071 #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
2072   Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
2073   Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \
2074   func(func1,func2,0); func(func1,func2,1); \
2075   func(func1,func2,2); func(func1,func2,3); \
2076   func(func1,func2,4); func(func1,func2,5); \
2077   func(func1,func2,6); func(func1,func2,7); \
2078   func(func1,func2,8); func(func1,func2,9);
2079 
2080 #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
2081   Packet rhsV0[M], rhsVi0[M]; \
2082   func(func1,func2,0);
2083 
2084 #define MICRO_COMPLEX_ONE_PEEL4 \
2085   MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
2086   rhs_ptr_real += (accRows * PEEL_COMPLEX); \
2087   if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);
2088 
2089 #define MICRO_COMPLEX_ONE4 \
2090   MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
2091   rhs_ptr_real += accRows; \
2092   if(!RhsIsReal) rhs_ptr_imag += accRows;
2093 
2094 #define MICRO_COMPLEX_ONE_PEEL1 \
2095   MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
2096   rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \
2097   if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX);
2098 
2099 #define MICRO_COMPLEX_ONE1 \
2100   MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
2101   rhs_ptr_real += remaining_cols; \
2102   if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
2103 
2104 #define MICRO_COMPLEX_DST_PTR_ONE(iter) \
2105   if (unroll_factor > iter) { \
2106     bsetzero<Scalar, Packet>(accReal##iter); \
2107     bsetzero<Scalar, Packet>(accImag##iter); \
2108   } else { \
2109     EIGEN_UNUSED_VARIABLE(accReal##iter); \
2110     EIGEN_UNUSED_VARIABLE(accImag##iter); \
2111   }
2112 
2113 #define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
2114 
2115 #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
2116   if (unroll_factor > iter) { \
2117     lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
2118     if(!LhsIsReal) { \
2119       lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
2120     } else { \
2121       EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
2122     } \
2123   } else { \
2124     EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
2125     EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
2126   }
2127 
2128 #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
2129 
2130 #define MICRO_COMPLEX_PREFETCH_ONE(iter) \
2131   if (unroll_factor > iter) { \
2132     EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
2133     if(!LhsIsReal) { \
2134       EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
2135     } \
2136   }
2137 
2138 #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
2139 
2140 #define MICRO_COMPLEX_STORE_ONE(iter) \
2141   if (unroll_factor > iter) { \
2142     bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
2143     bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
2144     bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
2145     res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \
2146     res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \
2147   }
2148 
2149 #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
2150 
2151 #define MICRO_COMPLEX_COL_STORE_ONE(iter) \
2152   if (unroll_factor > iter) { \
2153     bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
2154     bscalec<Packet,1>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
2155     bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
2156     res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \
2157     res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \
2158   }
2159 
2160 #define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE)
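
// In the store macros above, bscalec applies the complex alpha to the split
// accumulators (the real and imaginary planes are scaled separately), and
// bcouple re-interleaves them with the loaded result tile into complex
// packets: acc0 covers rows [0, accColsC) and acc1 covers rows
// [accColsC, accCols), since a complex packet holds half as many values as a
// real one.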
2161 
2162 template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2163 EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
2164   const DataMapper& res,
2165   const Scalar* lhs_base,
2166   const Scalar* rhs_base,
2167   Index depth,
2168   Index strideA,
2169   Index offsetA,
2170   Index strideB,
2171   Index& row,
2172   Index col,
2173   const Packet& pAlphaReal,
2174   const Packet& pAlphaImag)
2175 {
2176   const Scalar* rhs_ptr_real = rhs_base;
2177   const Scalar* rhs_ptr_imag;
2178   if(!RhsIsReal) {
2179     rhs_ptr_imag = rhs_base + accRows*strideB;
2180   } else {
2181     EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
2182   }
2183   const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
2184   const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
2185   const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
2186   PacketBlock<Packet,4> accReal0, accImag0, accReal1, accImag1;
2187   PacketBlock<Packet,4> accReal2, accImag2, accReal3, accImag3;
2188   PacketBlock<Packet,4> accReal4, accImag4;
2189   PacketBlock<Packet,4> taccReal, taccImag;
2190   PacketBlock<Packetc,4> acc0, acc1;
2191   PacketBlock<Packetc,8> tRes;
2192 
2193   MICRO_COMPLEX_SRC_PTR
2194   MICRO_COMPLEX_DST_PTR
2195 
2196   Index k = 0;
2197   for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2198   {
2199     EIGEN_POWER_PREFETCH(rhs_ptr_real);
2200     if(!RhsIsReal) {
2201       EIGEN_POWER_PREFETCH(rhs_ptr_imag);
2202     }
2203     MICRO_COMPLEX_PREFETCH
2204     MICRO_COMPLEX_ONE_PEEL4
2205   }
2206   for(; k < depth; k++)
2207   {
2208     MICRO_COMPLEX_ONE4
2209   }
2210   MICRO_COMPLEX_STORE
2211 
2212   row += unroll_factor*accCols;
2213 }
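
// The complex micro kernel unrolls at most 5 row panels (MICRO_COMPLEX_UNROLL)
// versus 8 in the real kernel, likely because every panel needs both a real
// and an imaginary accumulator plus extra lhs loads, roughly doubling the
// register pressure.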
2214 
2215 template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2216 EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration(
2217   const DataMapper& res,
2218   const Scalar* lhs_base,
2219   const Scalar* rhs_base,
2220   Index depth,
2221   Index strideA,
2222   Index offsetA,
2223   Index strideB,
2224   Index& row,
2225   Index col,
2226   Index remaining_cols,
2227   const Packet& pAlphaReal,
2228   const Packet& pAlphaImag)
2229 {
2230   const Scalar* rhs_ptr_real = rhs_base;
2231   const Scalar* rhs_ptr_imag;
2232   if(!RhsIsReal) {
2233     rhs_ptr_imag = rhs_base + remaining_cols*strideB;
2234   } else {
2235     EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
2236   }
2237   const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
2238   const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
2239   const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
2240   PacketBlock<Packet,1> accReal0, accImag0, accReal1, accImag1;
2241   PacketBlock<Packet,1> accReal2, accImag2, accReal3, accImag3;
2242   PacketBlock<Packet,1> accReal4, accImag4;
2243   PacketBlock<Packet,1> taccReal, taccImag;
2244   PacketBlock<Packetc,1> acc0, acc1;
2245   PacketBlock<Packetc,2> tRes;
2246 
2247   MICRO_COMPLEX_SRC_PTR
2248   MICRO_COMPLEX_DST_PTR
2249 
2250   Index k = 0;
2251   for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2252   {
2253     EIGEN_POWER_PREFETCH(rhs_ptr_real);
2254     if(!RhsIsReal) {
2255       EIGEN_POWER_PREFETCH(rhs_ptr_imag);
2256     }
2257     MICRO_COMPLEX_PREFETCH
2258     MICRO_COMPLEX_ONE_PEEL1
2259   }
2260   for(; k < depth; k++)
2261   {
2262     MICRO_COMPLEX_ONE1
2263   }
2264   MICRO_COMPLEX_COL_STORE
2265 
2266   row += unroll_factor*accCols;
2267 }
2268 
2269 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2270 EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
2271   const DataMapper& res,
2272   const Scalar* lhs_base,
2273   const Scalar* rhs_base,
2274   Index depth,
2275   Index strideA,
2276   Index offsetA,
2277   Index strideB,
2278   Index& row,
2279   Index rows,
2280   Index col,
2281   Index remaining_cols,
2282   const Packet& pAlphaReal,
2283   const Packet& pAlphaImag)
2284 {
2285 #define MAX_COMPLEX_UNROLL 3
2286   while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2287     gemm_complex_unrolled_col_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2288   }
2289   switch( (rows-row)/accCols ) {
2290 #if MAX_COMPLEX_UNROLL > 4
2291     case 4:
2292       gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2293       break;
2294 #endif
2295 #if MAX_COMPLEX_UNROLL > 3
2296     case 3:
2297       gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2298       break;
2299 #endif
2300 #if MAX_COMPLEX_UNROLL > 2
2301     case 2:
2302       gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2303       break;
2304 #endif
2305 #if MAX_COMPLEX_UNROLL > 1
2306     case 1:
2307       gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2308       break;
2309 #endif
2310     default:
2311       break;
2312   }
2313 #undef MAX_COMPLEX_UNROLL
2314 }
2315 
2316 template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2317 EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
2318 {
2319       const Index remaining_rows = rows % accCols;
2320       const Index remaining_cols = cols % accRows;
2321 
2322       if( strideA == -1 ) strideA = depth;
2323       if( strideB == -1 ) strideB = depth;
2324 
2325       const Packet pAlphaReal = pset1<Packet>(alpha.real());
2326       const Packet pAlphaImag = pset1<Packet>(alpha.imag());
2327       const Packet pMask = bmask<Packet>((const int)(remaining_rows));
2328 
2329       const Scalar* blockA = (Scalar *) blockAc;
2330       const Scalar* blockB = (Scalar *) blockBc;
2331 
2332       Index col = 0;
2333       for(; col + accRows <= cols; col += accRows)
2334       {
2335         const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
2336         const Scalar* lhs_base = blockA;
2337         Index row = 0;
2338 
2339 #define MAX_COMPLEX_UNROLL 3
2340         while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2341           gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2342         }
2343         switch( (rows-row)/accCols ) {
2344 #if MAX_COMPLEX_UNROLL > 4
2345           case 4:
2346             gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2347             break;
2348 #endif
2349 #if MAX_COMPLEX_UNROLL > 3
2350           case 3:
2351             gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2352             break;
2353 #endif
2354 #if MAX_COMPLEX_UNROLL > 2
2355           case 2:
2356             gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2357             break;
2358 #endif
2359 #if MAX_COMPLEX_UNROLL > 1
2360           case 1:
2361             gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2362             break;
2363 #endif
2364           default:
2365             break;
2366         }
2367 #undef MAX_COMPLEX_UNROLL
2368 
2369         if(remaining_rows > 0)
2370         {
2371           gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
2372         }
2373       }
2374 
2375       if(remaining_cols > 0)
2376       {
2377         const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
2378         const Scalar* lhs_base = blockA;
2379 
2380         for(; col < cols; col++)
2381         {
2382           Index row = 0;
2383 
2384           gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
2385 
2386           if (remaining_rows > 0)
2387           {
2388             gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
2389           }
2390           rhs_base++;
2391         }
2392       }
2393 }
2394 
2395 #undef accColsC
2396 #undef advanceCols
2397 #undef advanceRows
2398 
2399 /************************************
2400  * ppc64le template specializations *
2401  ************************************/
2402 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2403 struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2404 {
2405   void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2406 };
2407 
2408 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2409 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2410   ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2411 {
2412     dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
2413     pack(blockA, lhs, depth, rows, stride, offset);
2414 }
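
// Every pack specialization in this section follows the same pattern: the
// operator() forwards to dhs_pack (real scalars) or dhs_cpack (complex
// scalars), and the trailing bool template argument selects lhs packing
// (true) versus rhs packing (false). A minimal usage sketch with hypothetical
// buffers (not compiled):
#if 0
  dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, /*PanelMode*/false, /*lhs*/true> pack;
  pack(blockA, lhs, depth, rows, /*stride*/depth, /*offset*/0);
#endif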
2415 
2416 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2417 struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2418 {
2419   void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2420 };
2421 
2422 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2423 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2424   ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2425 {
2426     dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
2427     pack(blockA, lhs, depth, rows, stride, offset);
2428 }
2429 
2430 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2431 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2432 struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2433 {
2434   void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2435 };
2436 
2437 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2438 void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2439   ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2440 {
2441   dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
2442   pack(blockB, rhs, depth, cols, stride, offset);
2443 }
2444 
2445 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2446 struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2447 {
2448   void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2449 };
2450 
2451 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2452 void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2453   ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2454 {
2455   dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
2456   pack(blockB, rhs, depth, cols, stride, offset);
2457 }
2458 #endif
2459 
2460 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2461 struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2462 {
2463   void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2464 };
2465 
2466 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2467 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2468   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2469 {
2470   dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
2471   pack(blockA, lhs, depth, rows, stride, offset);
2472 }
2473 
2474 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2475 struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2476 {
2477   void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2478 };
2479 
2480 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2481 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2482   ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2483 {
2484   dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
2485   pack(blockA, lhs, depth, rows, stride, offset);
2486 }
2487 
2488 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2489 struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2490 {
2491   void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2492 };
2493 
2494 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2495 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2496   ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2497 {
2498   dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
2499   pack(blockA, lhs, depth, rows, stride, offset);
2500 }
2501 
2502 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2503 struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2504 {
2505   void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2506 };
2507 
2508 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2509 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2510   ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2511 {
2512   dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
2513   pack(blockA, lhs, depth, rows, stride, offset);
2514 }
2515 
2516 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2517 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2518 struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2519 {
2520   void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2521 };
2522 
2523 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2524 void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2525   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2526 {
2527   dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
2528   pack(blockB, rhs, depth, cols, stride, offset);
2529 }
2530 
2531 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2532 struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2533 {
2534   void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2535 };
2536 
2537 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2538 void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2539   ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2540 {
2541   dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
2542   pack(blockB, rhs, depth, cols, stride, offset);
2543 }
2544 #endif
2545 
2546 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2547 struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2548 {
2549   void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2550 };
2551 
2552 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2553 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2554   ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2555 {
2556   dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
2557   pack(blockB, rhs, depth, cols, stride, offset);
2558 }
2559 
2560 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2561 struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2562 {
2563   void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2564 };
2565 
2566 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2567 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2568   ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2569 {
2570   dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
2571   pack(blockB, rhs, depth, cols, stride, offset);
2572 }
2573 
2574 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2575 struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2576 {
2577   void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2578 };
2579 
2580 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2581 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2582   ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2583 {
2584   dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
2585   pack(blockA, lhs, depth, rows, stride, offset);
2586 }
2587 
2588 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2589 struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2590 {
2591   void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2592 };
2593 
2594 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2595 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2596   ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2597 {
2598   dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
2599   pack(blockA, lhs, depth, rows, stride, offset);
2600 }
2601 
2602 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2603 struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2604 {
2605   void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2606 };
2607 
2608 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2609 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2610   ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2611 {
2612   dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
2613   pack(blockB, rhs, depth, cols, stride, offset);
2614 }
2615 
2616 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2617 struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2618 {
2619   void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2620 };
2621 
2622 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2623 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2624   ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2625 {
2626   dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
2627   pack(blockB, rhs, depth, cols, stride, offset);
2628 }
2629 
2630 // ********* gebp specializations *********
2631 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2632 struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2633 {
2634   typedef typename quad_traits<float>::vectortype   Packet;
2635   typedef typename quad_traits<float>::rhstype      RhsPacket;
2636 
2637   void operator()(const DataMapper& res, const float* blockA, const float* blockB,
2638                   Index rows, Index depth, Index cols, float alpha,
2639                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
2640 };
2641 
2642 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
2643 void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2644   ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
2645                Index rows, Index depth, Index cols, float alpha,
2646                Index strideA, Index strideB, Index offsetA, Index offsetB)
2647   {
2648     const Index accRows = quad_traits<float>::rows;
2649     const Index accCols = quad_traits<float>::size;
2650     void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);
2651 
2652     #ifdef EIGEN_ALTIVEC_MMA_ONLY
2653       // Generate with MMA only.
2654       gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2655     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2656       if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
2657         gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2658       }
2659       else {
2660         gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2661       }
2662     #else
2663       gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2664     #endif
2665       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2666   }
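
// Kernel selection summary (repeated in each gebp_kernel specialization
// below): with EIGEN_ALTIVEC_MMA_ONLY the MMA kernel is bound at compile
// time; otherwise, when MMA support was available at build time, the choice
// between the MMA and the VSX kernel is made at run time with
// __builtin_cpu_supports("arch_3_1") and __builtin_cpu_supports("mma").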
2667 
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f   Packet;
  typedef Packet2cf  Packetc;
  typedef Packet4f   RhsPacket;

  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }

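// float * std::complex<float>: real LHS, complex RHS
// (LhsIsReal = true, RhsIsReal = false).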
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f   Packet;
  typedef Packet2cf  Packetc;
  typedef Packet4f   RhsPacket;

  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }

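// std::complex<float> * float: complex LHS, real RHS
// (LhsIsReal = false, RhsIsReal = true).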
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f   Packet;
  typedef Packet2cf  Packetc;
  typedef Packet4f   RhsPacket;

  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }

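// double * double: real double kernel on Packet2d; the dispatch mirrors the
// float kernel above.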
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<double>::vectortype  Packet;
  typedef typename quad_traits<double>::rhstype     RhsPacket;

  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
                  Index rows, Index depth, Index cols, double alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
               Index rows, Index depth, Index cols, double alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
      } else {
        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
      }
    #else
      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }

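// std::complex<double> * std::complex<double>: both operands complex
// (LhsIsReal = RhsIsReal = false).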
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<double>::vectortype  Packet;
  typedef Packet1cd                                 Packetc;
  typedef typename quad_traits<double>::rhstype     RhsPacket;

  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }

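// std::complex<double> * double: complex LHS, real RHS
// (LhsIsReal = false, RhsIsReal = true).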
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<double>::vectortype  Packet;
  typedef Packet1cd                                 Packetc;
  typedef typename quad_traits<double>::rhstype     RhsPacket;

  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }

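// double * std::complex<double>: real LHS, complex RHS
// (LhsIsReal = true, RhsIsReal = false).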
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<double>::vectortype  Packet;
  typedef Packet1cd                                 Packetc;
  typedef typename quad_traits<double>::rhstype     RhsPacket;

  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

    #ifdef EIGEN_ALTIVEC_MMA_ONLY
      // Generate with MMA only.
      gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H