xref: /aosp_15_r20/external/eigen/bench/bench_gemm.cpp (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
1*bf2c3715SXin Li 
2*bf2c3715SXin Li // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2  ./a.out
3*bf2c3715SXin Li // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp  && OMP_NUM_THREADS=2  ./a.out
4*bf2c3715SXin Li 
5*bf2c3715SXin Li // Compilation options:
6*bf2c3715SXin Li //
7*bf2c3715SXin Li // -DSCALAR=std::complex<double>
8*bf2c3715SXin Li // -DSCALARA=double or -DSCALARB=double
9*bf2c3715SXin Li // -DHAVE_BLAS
10*bf2c3715SXin Li // -DDECOUPLED
11*bf2c3715SXin Li //
12*bf2c3715SXin Li 
13*bf2c3715SXin Li #include <iostream>
14*bf2c3715SXin Li #include <bench/BenchTimer.h>
15*bf2c3715SXin Li #include <Eigen/Core>
16*bf2c3715SXin Li 
17*bf2c3715SXin Li 
18*bf2c3715SXin Li using namespace std;
19*bf2c3715SXin Li using namespace Eigen;
20*bf2c3715SXin Li 
21*bf2c3715SXin Li #ifndef SCALAR
22*bf2c3715SXin Li // #define SCALAR std::complex<float>
23*bf2c3715SXin Li #define SCALAR float
24*bf2c3715SXin Li #endif
25*bf2c3715SXin Li 
26*bf2c3715SXin Li #ifndef SCALARA
27*bf2c3715SXin Li #define SCALARA SCALAR
28*bf2c3715SXin Li #endif
29*bf2c3715SXin Li 
30*bf2c3715SXin Li #ifndef SCALARB
31*bf2c3715SXin Li #define SCALARB SCALAR
32*bf2c3715SXin Li #endif
33*bf2c3715SXin Li 
34*bf2c3715SXin Li #ifdef ROWMAJ_A
35*bf2c3715SXin Li const int opt_A = RowMajor;
36*bf2c3715SXin Li #else
37*bf2c3715SXin Li const int opt_A = ColMajor;
38*bf2c3715SXin Li #endif
39*bf2c3715SXin Li 
40*bf2c3715SXin Li #ifdef ROWMAJ_B
41*bf2c3715SXin Li const int opt_B = RowMajor;
42*bf2c3715SXin Li #else
43*bf2c3715SXin Li const int opt_B = ColMajor;
44*bf2c3715SXin Li #endif
45*bf2c3715SXin Li 
46*bf2c3715SXin Li typedef SCALAR Scalar;
47*bf2c3715SXin Li typedef NumTraits<Scalar>::Real RealScalar;
48*bf2c3715SXin Li typedef Matrix<SCALARA,Dynamic,Dynamic,opt_A> A;
49*bf2c3715SXin Li typedef Matrix<SCALARB,Dynamic,Dynamic,opt_B> B;
50*bf2c3715SXin Li typedef Matrix<Scalar,Dynamic,Dynamic> C;
51*bf2c3715SXin Li typedef Matrix<RealScalar,Dynamic,Dynamic> M;
52*bf2c3715SXin Li 
53*bf2c3715SXin Li #ifdef HAVE_BLAS
54*bf2c3715SXin Li 
55*bf2c3715SXin Li extern "C" {
56*bf2c3715SXin Li   #include <Eigen/src/misc/blas.h>
57*bf2c3715SXin Li }
58*bf2c3715SXin Li 
59*bf2c3715SXin Li static float fone = 1;
60*bf2c3715SXin Li static float fzero = 0;
61*bf2c3715SXin Li static double done = 1;
62*bf2c3715SXin Li static double szero = 0;
63*bf2c3715SXin Li static std::complex<float> cfone = 1;
64*bf2c3715SXin Li static std::complex<float> cfzero = 0;
65*bf2c3715SXin Li static std::complex<double> cdone = 1;
66*bf2c3715SXin Li static std::complex<double> cdzero = 0;
67*bf2c3715SXin Li static char notrans = 'N';
68*bf2c3715SXin Li static char trans = 'T';
69*bf2c3715SXin Li static char nonunit = 'N';
70*bf2c3715SXin Li static char lower = 'L';
71*bf2c3715SXin Li static char right = 'R';
72*bf2c3715SXin Li static int intone = 1;
73*bf2c3715SXin Li 
74*bf2c3715SXin Li #ifdef ROWMAJ_A
75*bf2c3715SXin Li const char transA = trans;
76*bf2c3715SXin Li #else
77*bf2c3715SXin Li const char transA = notrans;
78*bf2c3715SXin Li #endif
79*bf2c3715SXin Li 
80*bf2c3715SXin Li #ifdef ROWMAJ_B
81*bf2c3715SXin Li const char transB = trans;
82*bf2c3715SXin Li #else
83*bf2c3715SXin Li const char transB = notrans;
84*bf2c3715SXin Li #endif
85*bf2c3715SXin Li 
86*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXf & c)87*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXf& c)
88*bf2c3715SXin Li {
89*bf2c3715SXin Li   int M = c.rows(); int N = c.cols(); int K = a.cols();
90*bf2c3715SXin Li   int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
91*bf2c3715SXin Li 
92*bf2c3715SXin Li   sgemm_(&transA,&transB,&M,&N,&K,&fone,
93*bf2c3715SXin Li          const_cast<float*>(a.data()),&lda,
94*bf2c3715SXin Li          const_cast<float*>(b.data()),&ldb,&fone,
95*bf2c3715SXin Li          c.data(),&ldc);
96*bf2c3715SXin Li }
97*bf2c3715SXin Li 
98*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXd & c)99*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXd& c)
100*bf2c3715SXin Li {
101*bf2c3715SXin Li   int M = c.rows(); int N = c.cols(); int K = a.cols();
102*bf2c3715SXin Li   int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
103*bf2c3715SXin Li 
104*bf2c3715SXin Li   dgemm_(&transA,&transB,&M,&N,&K,&done,
105*bf2c3715SXin Li          const_cast<double*>(a.data()),&lda,
106*bf2c3715SXin Li          const_cast<double*>(b.data()),&ldb,&done,
107*bf2c3715SXin Li          c.data(),&ldc);
108*bf2c3715SXin Li }
109*bf2c3715SXin Li 
110*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXcf & c)111*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXcf& c)
112*bf2c3715SXin Li {
113*bf2c3715SXin Li   int M = c.rows(); int N = c.cols(); int K = a.cols();
114*bf2c3715SXin Li   int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
115*bf2c3715SXin Li 
116*bf2c3715SXin Li   cgemm_(&transA,&transB,&M,&N,&K,(float*)&cfone,
117*bf2c3715SXin Li          const_cast<float*>((const float*)a.data()),&lda,
118*bf2c3715SXin Li          const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone,
119*bf2c3715SXin Li          (float*)c.data(),&ldc);
120*bf2c3715SXin Li }
121*bf2c3715SXin Li 
122*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXcd & c)123*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXcd& c)
124*bf2c3715SXin Li {
125*bf2c3715SXin Li   int M = c.rows(); int N = c.cols(); int K = a.cols();
126*bf2c3715SXin Li   int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
127*bf2c3715SXin Li 
128*bf2c3715SXin Li   zgemm_(&transA,&transB,&M,&N,&K,(double*)&cdone,
129*bf2c3715SXin Li          const_cast<double*>((const double*)a.data()),&lda,
130*bf2c3715SXin Li          const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone,
131*bf2c3715SXin Li          (double*)c.data(),&ldc);
132*bf2c3715SXin Li }
133*bf2c3715SXin Li 
134*bf2c3715SXin Li 
135*bf2c3715SXin Li 
136*bf2c3715SXin Li #endif
137*bf2c3715SXin Li 
matlab_cplx_cplx(const M & ar,const M & ai,const M & br,const M & bi,M & cr,M & ci)138*bf2c3715SXin Li void matlab_cplx_cplx(const M& ar, const M& ai, const M& br, const M& bi, M& cr, M& ci)
139*bf2c3715SXin Li {
140*bf2c3715SXin Li   cr.noalias() += ar * br;
141*bf2c3715SXin Li   cr.noalias() -= ai * bi;
142*bf2c3715SXin Li   ci.noalias() += ar * bi;
143*bf2c3715SXin Li   ci.noalias() += ai * br;
144*bf2c3715SXin Li   // [cr ci] += [ar ai] * br + [-ai ar] * bi
145*bf2c3715SXin Li }
146*bf2c3715SXin Li 
matlab_real_cplx(const M & a,const M & br,const M & bi,M & cr,M & ci)147*bf2c3715SXin Li void matlab_real_cplx(const M& a, const M& br, const M& bi, M& cr, M& ci)
148*bf2c3715SXin Li {
149*bf2c3715SXin Li   cr.noalias() += a * br;
150*bf2c3715SXin Li   ci.noalias() += a * bi;
151*bf2c3715SXin Li }
152*bf2c3715SXin Li 
matlab_cplx_real(const M & ar,const M & ai,const M & b,M & cr,M & ci)153*bf2c3715SXin Li void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci)
154*bf2c3715SXin Li {
155*bf2c3715SXin Li   cr.noalias() += ar * b;
156*bf2c3715SXin Li   ci.noalias() += ai * b;
157*bf2c3715SXin Li }
158*bf2c3715SXin Li 
159*bf2c3715SXin Li 
160*bf2c3715SXin Li 
161*bf2c3715SXin Li template<typename A, typename B, typename C>
gemm(const A & a,const B & b,C & c)162*bf2c3715SXin Li EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c)
163*bf2c3715SXin Li {
164*bf2c3715SXin Li   c.noalias() += a * b;
165*bf2c3715SXin Li }
166*bf2c3715SXin Li 
main(int argc,char ** argv)167*bf2c3715SXin Li int main(int argc, char ** argv)
168*bf2c3715SXin Li {
169*bf2c3715SXin Li   std::ptrdiff_t l1 = internal::queryL1CacheSize();
170*bf2c3715SXin Li   std::ptrdiff_t l2 = internal::queryTopLevelCacheSize();
171*bf2c3715SXin Li   std::cout << "L1 cache size     = " << (l1>0 ? l1/1024 : -1) << " KB\n";
172*bf2c3715SXin Li   std::cout << "L2/L3 cache size  = " << (l2>0 ? l2/1024 : -1) << " KB\n";
173*bf2c3715SXin Li   typedef internal::gebp_traits<Scalar,Scalar> Traits;
174*bf2c3715SXin Li   std::cout << "Register blocking = " << Traits::mr << " x " << Traits::nr << "\n";
175*bf2c3715SXin Li 
176*bf2c3715SXin Li   int rep = 1;    // number of repetitions per try
177*bf2c3715SXin Li   int tries = 2;  // number of tries, we keep the best
178*bf2c3715SXin Li 
179*bf2c3715SXin Li   int s = 2048;
180*bf2c3715SXin Li   int m = s;
181*bf2c3715SXin Li   int n = s;
182*bf2c3715SXin Li   int p = s;
183*bf2c3715SXin Li   int cache_size1=-1, cache_size2=l2, cache_size3 = 0;
184*bf2c3715SXin Li 
185*bf2c3715SXin Li   bool need_help = false;
186*bf2c3715SXin Li   for (int i=1; i<argc;)
187*bf2c3715SXin Li   {
188*bf2c3715SXin Li     if(argv[i][0]=='-')
189*bf2c3715SXin Li     {
190*bf2c3715SXin Li       if(argv[i][1]=='s')
191*bf2c3715SXin Li       {
192*bf2c3715SXin Li         ++i;
193*bf2c3715SXin Li         s = atoi(argv[i++]);
194*bf2c3715SXin Li         m = n = p = s;
195*bf2c3715SXin Li         if(argv[i][0]!='-')
196*bf2c3715SXin Li         {
197*bf2c3715SXin Li           n = atoi(argv[i++]);
198*bf2c3715SXin Li           p = atoi(argv[i++]);
199*bf2c3715SXin Li         }
200*bf2c3715SXin Li       }
201*bf2c3715SXin Li       else if(argv[i][1]=='c')
202*bf2c3715SXin Li       {
203*bf2c3715SXin Li         ++i;
204*bf2c3715SXin Li         cache_size1 = atoi(argv[i++]);
205*bf2c3715SXin Li         if(argv[i][0]!='-')
206*bf2c3715SXin Li         {
207*bf2c3715SXin Li           cache_size2 = atoi(argv[i++]);
208*bf2c3715SXin Li           if(argv[i][0]!='-')
209*bf2c3715SXin Li             cache_size3 = atoi(argv[i++]);
210*bf2c3715SXin Li         }
211*bf2c3715SXin Li       }
212*bf2c3715SXin Li       else if(argv[i][1]=='t')
213*bf2c3715SXin Li       {
214*bf2c3715SXin Li         tries = atoi(argv[++i]);
215*bf2c3715SXin Li         ++i;
216*bf2c3715SXin Li       }
217*bf2c3715SXin Li       else if(argv[i][1]=='p')
218*bf2c3715SXin Li       {
219*bf2c3715SXin Li         ++i;
220*bf2c3715SXin Li         rep = atoi(argv[i++]);
221*bf2c3715SXin Li       }
222*bf2c3715SXin Li     }
223*bf2c3715SXin Li     else
224*bf2c3715SXin Li     {
225*bf2c3715SXin Li       need_help = true;
226*bf2c3715SXin Li       break;
227*bf2c3715SXin Li     }
228*bf2c3715SXin Li   }
229*bf2c3715SXin Li 
230*bf2c3715SXin Li   if(need_help)
231*bf2c3715SXin Li   {
232*bf2c3715SXin Li     std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
233*bf2c3715SXin Li     std::cout << "   <matrix sizes> : size\n";
234*bf2c3715SXin Li     std::cout << "   <matrix sizes> : rows columns depth\n";
235*bf2c3715SXin Li     return 1;
236*bf2c3715SXin Li   }
237*bf2c3715SXin Li 
238*bf2c3715SXin Li #if EIGEN_VERSION_AT_LEAST(3,2,90)
239*bf2c3715SXin Li   if(cache_size1>0)
240*bf2c3715SXin Li     setCpuCacheSizes(cache_size1,cache_size2,cache_size3);
241*bf2c3715SXin Li #endif
242*bf2c3715SXin Li 
243*bf2c3715SXin Li   A a(m,p); a.setRandom();
244*bf2c3715SXin Li   B b(p,n); b.setRandom();
245*bf2c3715SXin Li   C c(m,n); c.setOnes();
246*bf2c3715SXin Li   C rc = c;
247*bf2c3715SXin Li 
248*bf2c3715SXin Li   std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n";
249*bf2c3715SXin Li   std::ptrdiff_t mc(m), nc(n), kc(p);
250*bf2c3715SXin Li   internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
251*bf2c3715SXin Li   std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << " x " << nc << "\n";
252*bf2c3715SXin Li 
253*bf2c3715SXin Li   C r = c;
254*bf2c3715SXin Li 
255*bf2c3715SXin Li   // check the parallel product is correct
256*bf2c3715SXin Li   #if defined EIGEN_HAS_OPENMP
257*bf2c3715SXin Li   Eigen::initParallel();
258*bf2c3715SXin Li   int procs = omp_get_max_threads();
259*bf2c3715SXin Li   if(procs>1)
260*bf2c3715SXin Li   {
261*bf2c3715SXin Li     #ifdef HAVE_BLAS
262*bf2c3715SXin Li     blas_gemm(a,b,r);
263*bf2c3715SXin Li     #else
264*bf2c3715SXin Li     omp_set_num_threads(1);
265*bf2c3715SXin Li     r.noalias() += a * b;
266*bf2c3715SXin Li     omp_set_num_threads(procs);
267*bf2c3715SXin Li     #endif
268*bf2c3715SXin Li     c.noalias() += a * b;
269*bf2c3715SXin Li     if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
270*bf2c3715SXin Li   }
271*bf2c3715SXin Li   #elif defined HAVE_BLAS
272*bf2c3715SXin Li     blas_gemm(a,b,r);
273*bf2c3715SXin Li     c.noalias() += a * b;
274*bf2c3715SXin Li     if(!r.isApprox(c)) {
275*bf2c3715SXin Li       std::cout << (r  - c).norm()/r.norm() << "\n";
276*bf2c3715SXin Li       std::cerr << "Warning, your product is crap!\n\n";
277*bf2c3715SXin Li     }
278*bf2c3715SXin Li   #else
279*bf2c3715SXin Li     if(1.*m*n*p<2000.*2000*2000)
280*bf2c3715SXin Li     {
281*bf2c3715SXin Li       gemm(a,b,c);
282*bf2c3715SXin Li       r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
283*bf2c3715SXin Li       if(!r.isApprox(c)) {
284*bf2c3715SXin Li         std::cout << (r  - c).norm()/r.norm() << "\n";
285*bf2c3715SXin Li         std::cerr << "Warning, your product is crap!\n\n";
286*bf2c3715SXin Li       }
287*bf2c3715SXin Li     }
288*bf2c3715SXin Li   #endif
289*bf2c3715SXin Li 
290*bf2c3715SXin Li   #ifdef HAVE_BLAS
291*bf2c3715SXin Li   BenchTimer tblas;
292*bf2c3715SXin Li   c = rc;
293*bf2c3715SXin Li   BENCH(tblas, tries, rep, blas_gemm(a,b,c));
294*bf2c3715SXin Li   std::cout << "blas  cpu         " << tblas.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tblas.total(CPU_TIMER)  << "s)\n";
295*bf2c3715SXin Li   std::cout << "blas  real        " << tblas.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n";
296*bf2c3715SXin Li   #endif
297*bf2c3715SXin Li 
298*bf2c3715SXin Li   // warm start
299*bf2c3715SXin Li   if(b.norm()+a.norm()==123.554) std::cout << "\n";
300*bf2c3715SXin Li 
301*bf2c3715SXin Li   BenchTimer tmt;
302*bf2c3715SXin Li   c = rc;
303*bf2c3715SXin Li   BENCH(tmt, tries, rep, gemm(a,b,c));
304*bf2c3715SXin Li   std::cout << "eigen cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
305*bf2c3715SXin Li   std::cout << "eigen real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
306*bf2c3715SXin Li 
307*bf2c3715SXin Li   #ifdef EIGEN_HAS_OPENMP
308*bf2c3715SXin Li   if(procs>1)
309*bf2c3715SXin Li   {
310*bf2c3715SXin Li     BenchTimer tmono;
311*bf2c3715SXin Li     omp_set_num_threads(1);
312*bf2c3715SXin Li     Eigen::setNbThreads(1);
313*bf2c3715SXin Li     c = rc;
314*bf2c3715SXin Li     BENCH(tmono, tries, rep, gemm(a,b,c));
315*bf2c3715SXin Li     std::cout << "eigen mono cpu    " << tmono.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmono.total(CPU_TIMER)  << "s)\n";
316*bf2c3715SXin Li     std::cout << "eigen mono real   " << tmono.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n";
317*bf2c3715SXin Li     std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER)  << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n";
318*bf2c3715SXin Li   }
319*bf2c3715SXin Li   #endif
320*bf2c3715SXin Li 
321*bf2c3715SXin Li   if(1.*m*n*p<30*30*30)
322*bf2c3715SXin Li   {
323*bf2c3715SXin Li     BenchTimer tmt;
324*bf2c3715SXin Li     c = rc;
325*bf2c3715SXin Li     BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
326*bf2c3715SXin Li     std::cout << "lazy cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
327*bf2c3715SXin Li     std::cout << "lazy real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
328*bf2c3715SXin Li   }
329*bf2c3715SXin Li 
330*bf2c3715SXin Li   #ifdef DECOUPLED
331*bf2c3715SXin Li   if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
332*bf2c3715SXin Li   {
333*bf2c3715SXin Li     M ar(m,p); ar.setRandom();
334*bf2c3715SXin Li     M ai(m,p); ai.setRandom();
335*bf2c3715SXin Li     M br(p,n); br.setRandom();
336*bf2c3715SXin Li     M bi(p,n); bi.setRandom();
337*bf2c3715SXin Li     M cr(m,n); cr.setRandom();
338*bf2c3715SXin Li     M ci(m,n); ci.setRandom();
339*bf2c3715SXin Li 
340*bf2c3715SXin Li     BenchTimer t;
341*bf2c3715SXin Li     BENCH(t, tries, rep, matlab_cplx_cplx(ar,ai,br,bi,cr,ci));
342*bf2c3715SXin Li     std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
343*bf2c3715SXin Li     std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
344*bf2c3715SXin Li   }
345*bf2c3715SXin Li   if((!NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
346*bf2c3715SXin Li   {
347*bf2c3715SXin Li     M a(m,p);  a.setRandom();
348*bf2c3715SXin Li     M br(p,n); br.setRandom();
349*bf2c3715SXin Li     M bi(p,n); bi.setRandom();
350*bf2c3715SXin Li     M cr(m,n); cr.setRandom();
351*bf2c3715SXin Li     M ci(m,n); ci.setRandom();
352*bf2c3715SXin Li 
353*bf2c3715SXin Li     BenchTimer t;
354*bf2c3715SXin Li     BENCH(t, tries, rep, matlab_real_cplx(a,br,bi,cr,ci));
355*bf2c3715SXin Li     std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
356*bf2c3715SXin Li     std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
357*bf2c3715SXin Li   }
358*bf2c3715SXin Li   if((NumTraits<A::Scalar>::IsComplex) && (!NumTraits<B::Scalar>::IsComplex))
359*bf2c3715SXin Li   {
360*bf2c3715SXin Li     M ar(m,p); ar.setRandom();
361*bf2c3715SXin Li     M ai(m,p); ai.setRandom();
362*bf2c3715SXin Li     M b(p,n);  b.setRandom();
363*bf2c3715SXin Li     M cr(m,n); cr.setRandom();
364*bf2c3715SXin Li     M ci(m,n); ci.setRandom();
365*bf2c3715SXin Li 
366*bf2c3715SXin Li     BenchTimer t;
367*bf2c3715SXin Li     BENCH(t, tries, rep, matlab_cplx_real(ar,ai,b,cr,ci));
368*bf2c3715SXin Li     std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
369*bf2c3715SXin Li     std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
370*bf2c3715SXin Li   }
371*bf2c3715SXin Li   #endif
372*bf2c3715SXin Li 
373*bf2c3715SXin Li   return 0;
374*bf2c3715SXin Li }
375*bf2c3715SXin Li 
376