1*bf2c3715SXin Li
2*bf2c3715SXin Li // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
3*bf2c3715SXin Li // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
4*bf2c3715SXin Li
5*bf2c3715SXin Li // Compilation options:
6*bf2c3715SXin Li //
7*bf2c3715SXin Li // -DSCALAR=std::complex<double>
8*bf2c3715SXin Li // -DSCALARA=double or -DSCALARB=double
9*bf2c3715SXin Li // -DHAVE_BLAS
10*bf2c3715SXin Li // -DDECOUPLED
11*bf2c3715SXin Li //
12*bf2c3715SXin Li
13*bf2c3715SXin Li #include <iostream>
14*bf2c3715SXin Li #include <bench/BenchTimer.h>
15*bf2c3715SXin Li #include <Eigen/Core>
16*bf2c3715SXin Li
17*bf2c3715SXin Li
18*bf2c3715SXin Li using namespace std;
19*bf2c3715SXin Li using namespace Eigen;
20*bf2c3715SXin Li
21*bf2c3715SXin Li #ifndef SCALAR
22*bf2c3715SXin Li // #define SCALAR std::complex<float>
23*bf2c3715SXin Li #define SCALAR float
24*bf2c3715SXin Li #endif
25*bf2c3715SXin Li
26*bf2c3715SXin Li #ifndef SCALARA
27*bf2c3715SXin Li #define SCALARA SCALAR
28*bf2c3715SXin Li #endif
29*bf2c3715SXin Li
30*bf2c3715SXin Li #ifndef SCALARB
31*bf2c3715SXin Li #define SCALARB SCALAR
32*bf2c3715SXin Li #endif
33*bf2c3715SXin Li
34*bf2c3715SXin Li #ifdef ROWMAJ_A
35*bf2c3715SXin Li const int opt_A = RowMajor;
36*bf2c3715SXin Li #else
37*bf2c3715SXin Li const int opt_A = ColMajor;
38*bf2c3715SXin Li #endif
39*bf2c3715SXin Li
40*bf2c3715SXin Li #ifdef ROWMAJ_B
41*bf2c3715SXin Li const int opt_B = RowMajor;
42*bf2c3715SXin Li #else
43*bf2c3715SXin Li const int opt_B = ColMajor;
44*bf2c3715SXin Li #endif
45*bf2c3715SXin Li
46*bf2c3715SXin Li typedef SCALAR Scalar;
47*bf2c3715SXin Li typedef NumTraits<Scalar>::Real RealScalar;
48*bf2c3715SXin Li typedef Matrix<SCALARA,Dynamic,Dynamic,opt_A> A;
49*bf2c3715SXin Li typedef Matrix<SCALARB,Dynamic,Dynamic,opt_B> B;
50*bf2c3715SXin Li typedef Matrix<Scalar,Dynamic,Dynamic> C;
51*bf2c3715SXin Li typedef Matrix<RealScalar,Dynamic,Dynamic> M;
52*bf2c3715SXin Li
53*bf2c3715SXin Li #ifdef HAVE_BLAS
54*bf2c3715SXin Li
55*bf2c3715SXin Li extern "C" {
56*bf2c3715SXin Li #include <Eigen/src/misc/blas.h>
57*bf2c3715SXin Li }
58*bf2c3715SXin Li
59*bf2c3715SXin Li static float fone = 1;
60*bf2c3715SXin Li static float fzero = 0;
61*bf2c3715SXin Li static double done = 1;
62*bf2c3715SXin Li static double szero = 0;
63*bf2c3715SXin Li static std::complex<float> cfone = 1;
64*bf2c3715SXin Li static std::complex<float> cfzero = 0;
65*bf2c3715SXin Li static std::complex<double> cdone = 1;
66*bf2c3715SXin Li static std::complex<double> cdzero = 0;
67*bf2c3715SXin Li static char notrans = 'N';
68*bf2c3715SXin Li static char trans = 'T';
69*bf2c3715SXin Li static char nonunit = 'N';
70*bf2c3715SXin Li static char lower = 'L';
71*bf2c3715SXin Li static char right = 'R';
72*bf2c3715SXin Li static int intone = 1;
73*bf2c3715SXin Li
74*bf2c3715SXin Li #ifdef ROWMAJ_A
75*bf2c3715SXin Li const char transA = trans;
76*bf2c3715SXin Li #else
77*bf2c3715SXin Li const char transA = notrans;
78*bf2c3715SXin Li #endif
79*bf2c3715SXin Li
80*bf2c3715SXin Li #ifdef ROWMAJ_B
81*bf2c3715SXin Li const char transB = trans;
82*bf2c3715SXin Li #else
83*bf2c3715SXin Li const char transB = notrans;
84*bf2c3715SXin Li #endif
85*bf2c3715SXin Li
86*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXf & c)87*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXf& c)
88*bf2c3715SXin Li {
89*bf2c3715SXin Li int M = c.rows(); int N = c.cols(); int K = a.cols();
90*bf2c3715SXin Li int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
91*bf2c3715SXin Li
92*bf2c3715SXin Li sgemm_(&transA,&transB,&M,&N,&K,&fone,
93*bf2c3715SXin Li const_cast<float*>(a.data()),&lda,
94*bf2c3715SXin Li const_cast<float*>(b.data()),&ldb,&fone,
95*bf2c3715SXin Li c.data(),&ldc);
96*bf2c3715SXin Li }
97*bf2c3715SXin Li
98*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXd & c)99*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXd& c)
100*bf2c3715SXin Li {
101*bf2c3715SXin Li int M = c.rows(); int N = c.cols(); int K = a.cols();
102*bf2c3715SXin Li int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
103*bf2c3715SXin Li
104*bf2c3715SXin Li dgemm_(&transA,&transB,&M,&N,&K,&done,
105*bf2c3715SXin Li const_cast<double*>(a.data()),&lda,
106*bf2c3715SXin Li const_cast<double*>(b.data()),&ldb,&done,
107*bf2c3715SXin Li c.data(),&ldc);
108*bf2c3715SXin Li }
109*bf2c3715SXin Li
110*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXcf & c)111*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXcf& c)
112*bf2c3715SXin Li {
113*bf2c3715SXin Li int M = c.rows(); int N = c.cols(); int K = a.cols();
114*bf2c3715SXin Li int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
115*bf2c3715SXin Li
116*bf2c3715SXin Li cgemm_(&transA,&transB,&M,&N,&K,(float*)&cfone,
117*bf2c3715SXin Li const_cast<float*>((const float*)a.data()),&lda,
118*bf2c3715SXin Li const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone,
119*bf2c3715SXin Li (float*)c.data(),&ldc);
120*bf2c3715SXin Li }
121*bf2c3715SXin Li
122*bf2c3715SXin Li template<typename A,typename B>
blas_gemm(const A & a,const B & b,MatrixXcd & c)123*bf2c3715SXin Li void blas_gemm(const A& a, const B& b, MatrixXcd& c)
124*bf2c3715SXin Li {
125*bf2c3715SXin Li int M = c.rows(); int N = c.cols(); int K = a.cols();
126*bf2c3715SXin Li int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
127*bf2c3715SXin Li
128*bf2c3715SXin Li zgemm_(&transA,&transB,&M,&N,&K,(double*)&cdone,
129*bf2c3715SXin Li const_cast<double*>((const double*)a.data()),&lda,
130*bf2c3715SXin Li const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone,
131*bf2c3715SXin Li (double*)c.data(),&ldc);
132*bf2c3715SXin Li }
133*bf2c3715SXin Li
134*bf2c3715SXin Li
135*bf2c3715SXin Li
136*bf2c3715SXin Li #endif
137*bf2c3715SXin Li
matlab_cplx_cplx(const M & ar,const M & ai,const M & br,const M & bi,M & cr,M & ci)138*bf2c3715SXin Li void matlab_cplx_cplx(const M& ar, const M& ai, const M& br, const M& bi, M& cr, M& ci)
139*bf2c3715SXin Li {
140*bf2c3715SXin Li cr.noalias() += ar * br;
141*bf2c3715SXin Li cr.noalias() -= ai * bi;
142*bf2c3715SXin Li ci.noalias() += ar * bi;
143*bf2c3715SXin Li ci.noalias() += ai * br;
144*bf2c3715SXin Li // [cr ci] += [ar ai] * br + [-ai ar] * bi
145*bf2c3715SXin Li }
146*bf2c3715SXin Li
matlab_real_cplx(const M & a,const M & br,const M & bi,M & cr,M & ci)147*bf2c3715SXin Li void matlab_real_cplx(const M& a, const M& br, const M& bi, M& cr, M& ci)
148*bf2c3715SXin Li {
149*bf2c3715SXin Li cr.noalias() += a * br;
150*bf2c3715SXin Li ci.noalias() += a * bi;
151*bf2c3715SXin Li }
152*bf2c3715SXin Li
matlab_cplx_real(const M & ar,const M & ai,const M & b,M & cr,M & ci)153*bf2c3715SXin Li void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci)
154*bf2c3715SXin Li {
155*bf2c3715SXin Li cr.noalias() += ar * b;
156*bf2c3715SXin Li ci.noalias() += ai * b;
157*bf2c3715SXin Li }
158*bf2c3715SXin Li
159*bf2c3715SXin Li
160*bf2c3715SXin Li
161*bf2c3715SXin Li template<typename A, typename B, typename C>
gemm(const A & a,const B & b,C & c)162*bf2c3715SXin Li EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c)
163*bf2c3715SXin Li {
164*bf2c3715SXin Li c.noalias() += a * b;
165*bf2c3715SXin Li }
166*bf2c3715SXin Li
main(int argc,char ** argv)167*bf2c3715SXin Li int main(int argc, char ** argv)
168*bf2c3715SXin Li {
169*bf2c3715SXin Li std::ptrdiff_t l1 = internal::queryL1CacheSize();
170*bf2c3715SXin Li std::ptrdiff_t l2 = internal::queryTopLevelCacheSize();
171*bf2c3715SXin Li std::cout << "L1 cache size = " << (l1>0 ? l1/1024 : -1) << " KB\n";
172*bf2c3715SXin Li std::cout << "L2/L3 cache size = " << (l2>0 ? l2/1024 : -1) << " KB\n";
173*bf2c3715SXin Li typedef internal::gebp_traits<Scalar,Scalar> Traits;
174*bf2c3715SXin Li std::cout << "Register blocking = " << Traits::mr << " x " << Traits::nr << "\n";
175*bf2c3715SXin Li
176*bf2c3715SXin Li int rep = 1; // number of repetitions per try
177*bf2c3715SXin Li int tries = 2; // number of tries, we keep the best
178*bf2c3715SXin Li
179*bf2c3715SXin Li int s = 2048;
180*bf2c3715SXin Li int m = s;
181*bf2c3715SXin Li int n = s;
182*bf2c3715SXin Li int p = s;
183*bf2c3715SXin Li int cache_size1=-1, cache_size2=l2, cache_size3 = 0;
184*bf2c3715SXin Li
185*bf2c3715SXin Li bool need_help = false;
186*bf2c3715SXin Li for (int i=1; i<argc;)
187*bf2c3715SXin Li {
188*bf2c3715SXin Li if(argv[i][0]=='-')
189*bf2c3715SXin Li {
190*bf2c3715SXin Li if(argv[i][1]=='s')
191*bf2c3715SXin Li {
192*bf2c3715SXin Li ++i;
193*bf2c3715SXin Li s = atoi(argv[i++]);
194*bf2c3715SXin Li m = n = p = s;
195*bf2c3715SXin Li if(argv[i][0]!='-')
196*bf2c3715SXin Li {
197*bf2c3715SXin Li n = atoi(argv[i++]);
198*bf2c3715SXin Li p = atoi(argv[i++]);
199*bf2c3715SXin Li }
200*bf2c3715SXin Li }
201*bf2c3715SXin Li else if(argv[i][1]=='c')
202*bf2c3715SXin Li {
203*bf2c3715SXin Li ++i;
204*bf2c3715SXin Li cache_size1 = atoi(argv[i++]);
205*bf2c3715SXin Li if(argv[i][0]!='-')
206*bf2c3715SXin Li {
207*bf2c3715SXin Li cache_size2 = atoi(argv[i++]);
208*bf2c3715SXin Li if(argv[i][0]!='-')
209*bf2c3715SXin Li cache_size3 = atoi(argv[i++]);
210*bf2c3715SXin Li }
211*bf2c3715SXin Li }
212*bf2c3715SXin Li else if(argv[i][1]=='t')
213*bf2c3715SXin Li {
214*bf2c3715SXin Li tries = atoi(argv[++i]);
215*bf2c3715SXin Li ++i;
216*bf2c3715SXin Li }
217*bf2c3715SXin Li else if(argv[i][1]=='p')
218*bf2c3715SXin Li {
219*bf2c3715SXin Li ++i;
220*bf2c3715SXin Li rep = atoi(argv[i++]);
221*bf2c3715SXin Li }
222*bf2c3715SXin Li }
223*bf2c3715SXin Li else
224*bf2c3715SXin Li {
225*bf2c3715SXin Li need_help = true;
226*bf2c3715SXin Li break;
227*bf2c3715SXin Li }
228*bf2c3715SXin Li }
229*bf2c3715SXin Li
230*bf2c3715SXin Li if(need_help)
231*bf2c3715SXin Li {
232*bf2c3715SXin Li std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
233*bf2c3715SXin Li std::cout << " <matrix sizes> : size\n";
234*bf2c3715SXin Li std::cout << " <matrix sizes> : rows columns depth\n";
235*bf2c3715SXin Li return 1;
236*bf2c3715SXin Li }
237*bf2c3715SXin Li
238*bf2c3715SXin Li #if EIGEN_VERSION_AT_LEAST(3,2,90)
239*bf2c3715SXin Li if(cache_size1>0)
240*bf2c3715SXin Li setCpuCacheSizes(cache_size1,cache_size2,cache_size3);
241*bf2c3715SXin Li #endif
242*bf2c3715SXin Li
243*bf2c3715SXin Li A a(m,p); a.setRandom();
244*bf2c3715SXin Li B b(p,n); b.setRandom();
245*bf2c3715SXin Li C c(m,n); c.setOnes();
246*bf2c3715SXin Li C rc = c;
247*bf2c3715SXin Li
248*bf2c3715SXin Li std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n";
249*bf2c3715SXin Li std::ptrdiff_t mc(m), nc(n), kc(p);
250*bf2c3715SXin Li internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
251*bf2c3715SXin Li std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << " x " << nc << "\n";
252*bf2c3715SXin Li
253*bf2c3715SXin Li C r = c;
254*bf2c3715SXin Li
255*bf2c3715SXin Li // check the parallel product is correct
256*bf2c3715SXin Li #if defined EIGEN_HAS_OPENMP
257*bf2c3715SXin Li Eigen::initParallel();
258*bf2c3715SXin Li int procs = omp_get_max_threads();
259*bf2c3715SXin Li if(procs>1)
260*bf2c3715SXin Li {
261*bf2c3715SXin Li #ifdef HAVE_BLAS
262*bf2c3715SXin Li blas_gemm(a,b,r);
263*bf2c3715SXin Li #else
264*bf2c3715SXin Li omp_set_num_threads(1);
265*bf2c3715SXin Li r.noalias() += a * b;
266*bf2c3715SXin Li omp_set_num_threads(procs);
267*bf2c3715SXin Li #endif
268*bf2c3715SXin Li c.noalias() += a * b;
269*bf2c3715SXin Li if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
270*bf2c3715SXin Li }
271*bf2c3715SXin Li #elif defined HAVE_BLAS
272*bf2c3715SXin Li blas_gemm(a,b,r);
273*bf2c3715SXin Li c.noalias() += a * b;
274*bf2c3715SXin Li if(!r.isApprox(c)) {
275*bf2c3715SXin Li std::cout << (r - c).norm()/r.norm() << "\n";
276*bf2c3715SXin Li std::cerr << "Warning, your product is crap!\n\n";
277*bf2c3715SXin Li }
278*bf2c3715SXin Li #else
279*bf2c3715SXin Li if(1.*m*n*p<2000.*2000*2000)
280*bf2c3715SXin Li {
281*bf2c3715SXin Li gemm(a,b,c);
282*bf2c3715SXin Li r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
283*bf2c3715SXin Li if(!r.isApprox(c)) {
284*bf2c3715SXin Li std::cout << (r - c).norm()/r.norm() << "\n";
285*bf2c3715SXin Li std::cerr << "Warning, your product is crap!\n\n";
286*bf2c3715SXin Li }
287*bf2c3715SXin Li }
288*bf2c3715SXin Li #endif
289*bf2c3715SXin Li
290*bf2c3715SXin Li #ifdef HAVE_BLAS
291*bf2c3715SXin Li BenchTimer tblas;
292*bf2c3715SXin Li c = rc;
293*bf2c3715SXin Li BENCH(tblas, tries, rep, blas_gemm(a,b,c));
294*bf2c3715SXin Li std::cout << "blas cpu " << tblas.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(CPU_TIMER) << "s)\n";
295*bf2c3715SXin Li std::cout << "blas real " << tblas.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n";
296*bf2c3715SXin Li #endif
297*bf2c3715SXin Li
298*bf2c3715SXin Li // warm start
299*bf2c3715SXin Li if(b.norm()+a.norm()==123.554) std::cout << "\n";
300*bf2c3715SXin Li
301*bf2c3715SXin Li BenchTimer tmt;
302*bf2c3715SXin Li c = rc;
303*bf2c3715SXin Li BENCH(tmt, tries, rep, gemm(a,b,c));
304*bf2c3715SXin Li std::cout << "eigen cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
305*bf2c3715SXin Li std::cout << "eigen real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
306*bf2c3715SXin Li
307*bf2c3715SXin Li #ifdef EIGEN_HAS_OPENMP
308*bf2c3715SXin Li if(procs>1)
309*bf2c3715SXin Li {
310*bf2c3715SXin Li BenchTimer tmono;
311*bf2c3715SXin Li omp_set_num_threads(1);
312*bf2c3715SXin Li Eigen::setNbThreads(1);
313*bf2c3715SXin Li c = rc;
314*bf2c3715SXin Li BENCH(tmono, tries, rep, gemm(a,b,c));
315*bf2c3715SXin Li std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
316*bf2c3715SXin Li std::cout << "eigen mono real " << tmono.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n";
317*bf2c3715SXin Li std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER) << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n";
318*bf2c3715SXin Li }
319*bf2c3715SXin Li #endif
320*bf2c3715SXin Li
321*bf2c3715SXin Li if(1.*m*n*p<30*30*30)
322*bf2c3715SXin Li {
323*bf2c3715SXin Li BenchTimer tmt;
324*bf2c3715SXin Li c = rc;
325*bf2c3715SXin Li BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
326*bf2c3715SXin Li std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
327*bf2c3715SXin Li std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
328*bf2c3715SXin Li }
329*bf2c3715SXin Li
330*bf2c3715SXin Li #ifdef DECOUPLED
331*bf2c3715SXin Li if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
332*bf2c3715SXin Li {
333*bf2c3715SXin Li M ar(m,p); ar.setRandom();
334*bf2c3715SXin Li M ai(m,p); ai.setRandom();
335*bf2c3715SXin Li M br(p,n); br.setRandom();
336*bf2c3715SXin Li M bi(p,n); bi.setRandom();
337*bf2c3715SXin Li M cr(m,n); cr.setRandom();
338*bf2c3715SXin Li M ci(m,n); ci.setRandom();
339*bf2c3715SXin Li
340*bf2c3715SXin Li BenchTimer t;
341*bf2c3715SXin Li BENCH(t, tries, rep, matlab_cplx_cplx(ar,ai,br,bi,cr,ci));
342*bf2c3715SXin Li std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
343*bf2c3715SXin Li std::cout << "\"matlab\" real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
344*bf2c3715SXin Li }
345*bf2c3715SXin Li if((!NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
346*bf2c3715SXin Li {
347*bf2c3715SXin Li M a(m,p); a.setRandom();
348*bf2c3715SXin Li M br(p,n); br.setRandom();
349*bf2c3715SXin Li M bi(p,n); bi.setRandom();
350*bf2c3715SXin Li M cr(m,n); cr.setRandom();
351*bf2c3715SXin Li M ci(m,n); ci.setRandom();
352*bf2c3715SXin Li
353*bf2c3715SXin Li BenchTimer t;
354*bf2c3715SXin Li BENCH(t, tries, rep, matlab_real_cplx(a,br,bi,cr,ci));
355*bf2c3715SXin Li std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
356*bf2c3715SXin Li std::cout << "\"matlab\" real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
357*bf2c3715SXin Li }
358*bf2c3715SXin Li if((NumTraits<A::Scalar>::IsComplex) && (!NumTraits<B::Scalar>::IsComplex))
359*bf2c3715SXin Li {
360*bf2c3715SXin Li M ar(m,p); ar.setRandom();
361*bf2c3715SXin Li M ai(m,p); ai.setRandom();
362*bf2c3715SXin Li M b(p,n); b.setRandom();
363*bf2c3715SXin Li M cr(m,n); cr.setRandom();
364*bf2c3715SXin Li M ci(m,n); ci.setRandom();
365*bf2c3715SXin Li
366*bf2c3715SXin Li BenchTimer t;
367*bf2c3715SXin Li BENCH(t, tries, rep, matlab_cplx_real(ar,ai,b,cr,ci));
368*bf2c3715SXin Li std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n";
369*bf2c3715SXin Li std::cout << "\"matlab\" real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
370*bf2c3715SXin Li }
371*bf2c3715SXin Li #endif
372*bf2c3715SXin Li
373*bf2c3715SXin Li return 0;
374*bf2c3715SXin Li }
375*bf2c3715SXin Li
376