xref: /aosp_15_r20/external/eigen/bench/benchVecAdd.cpp (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
1*bf2c3715SXin Li 
2*bf2c3715SXin Li #include <iostream>
3*bf2c3715SXin Li #include <Eigen/Core>
4*bf2c3715SXin Li #include <bench/BenchTimer.h>
5*bf2c3715SXin Li using namespace Eigen;
6*bf2c3715SXin Li 
7*bf2c3715SXin Li #ifndef SIZE
8*bf2c3715SXin Li #define SIZE 50
9*bf2c3715SXin Li #endif
10*bf2c3715SXin Li 
11*bf2c3715SXin Li #ifndef REPEAT
12*bf2c3715SXin Li #define REPEAT 10000
13*bf2c3715SXin Li #endif
14*bf2c3715SXin Li 
15*bf2c3715SXin Li typedef float Scalar;
16*bf2c3715SXin Li 
17*bf2c3715SXin Li __attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
18*bf2c3715SXin Li __attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
19*bf2c3715SXin Li __attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
20*bf2c3715SXin Li 
main(int argc,char * argv[])21*bf2c3715SXin Li int main(int argc, char* argv[])
22*bf2c3715SXin Li {
23*bf2c3715SXin Li     int size = SIZE * 8;
24*bf2c3715SXin Li     int size2 = size * size;
25*bf2c3715SXin Li     Scalar* a = internal::aligned_new<Scalar>(size2);
26*bf2c3715SXin Li     Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
27*bf2c3715SXin Li     Scalar* c = internal::aligned_new<Scalar>(size2);
28*bf2c3715SXin Li 
29*bf2c3715SXin Li     for (int i=0; i<size; ++i)
30*bf2c3715SXin Li     {
31*bf2c3715SXin Li         a[i] = b[i] = c[i] = 0;
32*bf2c3715SXin Li     }
33*bf2c3715SXin Li 
34*bf2c3715SXin Li     BenchTimer timer;
35*bf2c3715SXin Li 
36*bf2c3715SXin Li     timer.reset();
37*bf2c3715SXin Li     for (int k=0; k<10; ++k)
38*bf2c3715SXin Li     {
39*bf2c3715SXin Li         timer.start();
40*bf2c3715SXin Li         benchVec(a, b, c, size2);
41*bf2c3715SXin Li         timer.stop();
42*bf2c3715SXin Li     }
43*bf2c3715SXin Li     std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
44*bf2c3715SXin Li     return 0;
45*bf2c3715SXin Li     for (int innersize = size; innersize>2 ; --innersize)
46*bf2c3715SXin Li     {
47*bf2c3715SXin Li         if (size2%innersize==0)
48*bf2c3715SXin Li         {
49*bf2c3715SXin Li             int outersize = size2/innersize;
50*bf2c3715SXin Li             MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
51*bf2c3715SXin Li             MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
52*bf2c3715SXin Li             MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
53*bf2c3715SXin Li             timer.reset();
54*bf2c3715SXin Li             for (int k=0; k<3; ++k)
55*bf2c3715SXin Li             {
56*bf2c3715SXin Li                 timer.start();
57*bf2c3715SXin Li                 benchVec(ma, mb, mc);
58*bf2c3715SXin Li                 timer.stop();
59*bf2c3715SXin Li             }
60*bf2c3715SXin Li             std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
61*bf2c3715SXin Li         }
62*bf2c3715SXin Li     }
63*bf2c3715SXin Li 
64*bf2c3715SXin Li     VectorXf va = Map<VectorXf>(a, size2);
65*bf2c3715SXin Li     VectorXf vb = Map<VectorXf>(b, size2);
66*bf2c3715SXin Li     VectorXf vc = Map<VectorXf>(c, size2);
67*bf2c3715SXin Li     timer.reset();
68*bf2c3715SXin Li     for (int k=0; k<3; ++k)
69*bf2c3715SXin Li     {
70*bf2c3715SXin Li         timer.start();
71*bf2c3715SXin Li         benchVec(va, vb, vc);
72*bf2c3715SXin Li         timer.stop();
73*bf2c3715SXin Li     }
74*bf2c3715SXin Li     std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
75*bf2c3715SXin Li 
76*bf2c3715SXin Li     return 0;
77*bf2c3715SXin Li }
78*bf2c3715SXin Li 
benchVec(MatrixXf & a,MatrixXf & b,MatrixXf & c)79*bf2c3715SXin Li void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
80*bf2c3715SXin Li {
81*bf2c3715SXin Li     for (int k=0; k<REPEAT; ++k)
82*bf2c3715SXin Li         a = a + b;
83*bf2c3715SXin Li }
84*bf2c3715SXin Li 
benchVec(VectorXf & a,VectorXf & b,VectorXf & c)85*bf2c3715SXin Li void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
86*bf2c3715SXin Li {
87*bf2c3715SXin Li     for (int k=0; k<REPEAT; ++k)
88*bf2c3715SXin Li         a = a + b;
89*bf2c3715SXin Li }
90*bf2c3715SXin Li 
benchVec(Scalar * a,Scalar * b,Scalar * c,int size)91*bf2c3715SXin Li void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
92*bf2c3715SXin Li {
93*bf2c3715SXin Li     typedef internal::packet_traits<Scalar>::type PacketScalar;
94*bf2c3715SXin Li     const int PacketSize = internal::packet_traits<Scalar>::size;
95*bf2c3715SXin Li     PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
96*bf2c3715SXin Li     for (int k=0; k<REPEAT; ++k)
97*bf2c3715SXin Li         for (int i=0; i<size; i+=PacketSize*8)
98*bf2c3715SXin Li         {
99*bf2c3715SXin Li //             a0 = internal::pload(&a[i]);
100*bf2c3715SXin Li //             b0 = internal::pload(&b[i]);
101*bf2c3715SXin Li //             a1 = internal::pload(&a[i+1*PacketSize]);
102*bf2c3715SXin Li //             b1 = internal::pload(&b[i+1*PacketSize]);
103*bf2c3715SXin Li //             a2 = internal::pload(&a[i+2*PacketSize]);
104*bf2c3715SXin Li //             b2 = internal::pload(&b[i+2*PacketSize]);
105*bf2c3715SXin Li //             a3 = internal::pload(&a[i+3*PacketSize]);
106*bf2c3715SXin Li //             b3 = internal::pload(&b[i+3*PacketSize]);
107*bf2c3715SXin Li //             internal::pstore(&a[i], internal::padd(a0, b0));
108*bf2c3715SXin Li //             a0 = internal::pload(&a[i+4*PacketSize]);
109*bf2c3715SXin Li //             b0 = internal::pload(&b[i+4*PacketSize]);
110*bf2c3715SXin Li //
111*bf2c3715SXin Li //             internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1));
112*bf2c3715SXin Li //             a1 = internal::pload(&a[i+5*PacketSize]);
113*bf2c3715SXin Li //             b1 = internal::pload(&b[i+5*PacketSize]);
114*bf2c3715SXin Li //
115*bf2c3715SXin Li //             internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2));
116*bf2c3715SXin Li //             a2 = internal::pload(&a[i+6*PacketSize]);
117*bf2c3715SXin Li //             b2 = internal::pload(&b[i+6*PacketSize]);
118*bf2c3715SXin Li //
119*bf2c3715SXin Li //             internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3));
120*bf2c3715SXin Li //             a3 = internal::pload(&a[i+7*PacketSize]);
121*bf2c3715SXin Li //             b3 = internal::pload(&b[i+7*PacketSize]);
122*bf2c3715SXin Li //
123*bf2c3715SXin Li //             internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0));
124*bf2c3715SXin Li //             internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1));
125*bf2c3715SXin Li //             internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2));
126*bf2c3715SXin Li //             internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3));
127*bf2c3715SXin Li 
128*bf2c3715SXin Li             internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize])));
129*bf2c3715SXin Li             internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize])));
130*bf2c3715SXin Li             internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize])));
131*bf2c3715SXin Li             internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize])));
132*bf2c3715SXin Li             internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize])));
133*bf2c3715SXin Li             internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize])));
134*bf2c3715SXin Li         }
135*bf2c3715SXin Li }
136