xref: /aosp_15_r20/external/eigen/bench/tensors/tensor_benchmarks.h (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
1 #ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
2 #define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
3 
4 typedef int TensorIndex;
5 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
6 
7 #include "unsupported/Eigen/CXX11/Tensor"
8 #include "benchmark.h"
9 
10 #define BENCHMARK_RANGE(bench, lo, hi) \
11   BENCHMARK(bench)->Range(lo, hi)
12 
13 using Eigen::Tensor;
14 using Eigen::TensorMap;
15 
16 // TODO(bsteiner): also templatize on the input type since we have users
17 // for int8 as well as floats.
18 template <typename Device, typename T> class BenchmarkSuite {
19  public:
BenchmarkSuite(const Device & device,size_t m,size_t k,size_t n)20   BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
21       : m_(m), k_(k), n_(n), device_(device) {
22     initialize();
23   }
24 
BenchmarkSuite(const Device & device,size_t m)25   BenchmarkSuite(const Device& device, size_t m)
26       : m_(m), k_(m), n_(m), device_(device) {
27     initialize();
28   }
29 
BenchmarkSuite(const Device & device,size_t m,size_t k)30   BenchmarkSuite(const Device& device, size_t m, size_t k)
31       : m_(1), k_(k), n_(m), device_(device) {
32     initialize();
33   }
34 
~BenchmarkSuite()35   ~BenchmarkSuite() {
36     device_.deallocate(a_);
37     device_.deallocate(b_);
38     device_.deallocate(c_);
39   }
40 
memcpy(int num_iters)41   void memcpy(int num_iters) {
42     eigen_assert(m_ == k_ && k_ == n_);
43 #ifdef EIGEN_USE_SYCL // warmup for sycl
44     for (int iter = 0; iter < 10; ++iter) {
45       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
46     }
47 #endif
48     StartBenchmarkTiming();
49     for (int iter = 0; iter < num_iters; ++iter) {
50       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
51     }
52     // Record the number of values copied per second
53     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
54   }
55 
typeCasting(int num_iters)56   void typeCasting(int num_iters) {
57     eigen_assert(m_ == n_);
58     Eigen::array<TensorIndex, 2> sizes;
59     if (sizeof(T) >= sizeof(int)) {
60       sizes[0] = m_;
61       sizes[1] = k_;
62     } else {
63       sizes[0] = m_ * sizeof(T) / sizeof(int);
64       sizes[1] = k_ * sizeof(T) / sizeof(int);
65     }
66     const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
67     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
68 #ifdef EIGEN_USE_SYCL // warmup for sycl
69     for (int iter = 0; iter < 10; ++iter) {
70       B.device(device_) = A.template cast<T>();
71     }
72 #endif
73     StartBenchmarkTiming();
74     for (int iter = 0; iter < num_iters; ++iter) {
75       B.device(device_) = A.template cast<T>();
76     }
77     // Record the number of values copied per second
78     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
79   }
80 
random(int num_iters)81   void random(int num_iters) {
82     eigen_assert(m_ == k_ && k_ == n_);
83     Eigen::array<TensorIndex, 2> sizes;
84     sizes[0] = m_;
85     sizes[1] = m_;
86     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
87 #ifdef EIGEN_USE_SYCL // warmup for sycl
88     for (int iter = 0; iter < 10; ++iter) {
89       C.device(device_) = C.random();
90     }
91 #endif
92     StartBenchmarkTiming();
93     for (int iter = 0; iter < num_iters; ++iter) {
94       C.device(device_) = C.random();
95     }
96     // Record the number of random numbers generated per second
97     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
98   }
99 
slicing(int num_iters)100   void slicing(int num_iters) {
101     eigen_assert(m_ == k_ && k_ == n_);
102     Eigen::array<TensorIndex, 2> sizes;
103     sizes[0] = m_;
104     sizes[1] = m_;
105     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
106     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
107     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
108 
109     const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
110     const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
111     const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
112     const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
113     const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
114 #ifdef EIGEN_USE_SYCL // warmup for sycl
115     for (int iter = 0; iter < 10; ++iter) {
116       C.slice(first_quadrant, quarter_sizes).device(device_) =
117           A.slice(first_quadrant, quarter_sizes);
118       C.slice(second_quadrant, quarter_sizes).device(device_) =
119           B.slice(second_quadrant, quarter_sizes);
120       C.slice(third_quadrant, quarter_sizes).device(device_) =
121           A.slice(third_quadrant, quarter_sizes);
122       C.slice(fourth_quadrant, quarter_sizes).device(device_) =
123           B.slice(fourth_quadrant, quarter_sizes);
124     }
125 #endif
126     StartBenchmarkTiming();
127     for (int iter = 0; iter < num_iters; ++iter) {
128       C.slice(first_quadrant, quarter_sizes).device(device_) =
129           A.slice(first_quadrant, quarter_sizes);
130       C.slice(second_quadrant, quarter_sizes).device(device_) =
131           B.slice(second_quadrant, quarter_sizes);
132       C.slice(third_quadrant, quarter_sizes).device(device_) =
133           A.slice(third_quadrant, quarter_sizes);
134       C.slice(fourth_quadrant, quarter_sizes).device(device_) =
135           B.slice(fourth_quadrant, quarter_sizes);
136     }
137     // Record the number of values copied from the rhs slice to the lhs slice
138     // each second
139     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
140   }
141 
rowChip(int num_iters)142   void rowChip(int num_iters) {
143     Eigen::array<TensorIndex, 2> input_size;
144     input_size[0] = k_;
145     input_size[1] = n_;
146     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
147     Eigen::array<TensorIndex, 1> output_size;
148     output_size[0] = n_;
149     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
150 #ifdef EIGEN_USE_SYCL // warmup for sycl
151     for (int iter = 0; iter < 10; ++iter) {
152       C.device(device_) = B.chip(iter % k_, 0);
153     }
154 #endif
155     StartBenchmarkTiming();
156     for (int iter = 0; iter < num_iters; ++iter) {
157       C.device(device_) = B.chip(iter % k_, 0);
158     }
159     // Record the number of values copied from the rhs chip to the lhs.
160     finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
161   }
162 
colChip(int num_iters)163   void colChip(int num_iters) {
164     Eigen::array<TensorIndex, 2> input_size;
165     input_size[0] = k_;
166     input_size[1] = n_;
167     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
168     Eigen::array<TensorIndex, 1> output_size;
169     output_size[0] = n_;
170     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
171 #ifdef EIGEN_USE_SYCL // warmup for sycl
172     for (int iter = 0; iter < 10; ++iter) {
173       C.device(device_) = B.chip(iter % n_, 1);
174     }
175 #endif
176     StartBenchmarkTiming();
177     for (int iter = 0; iter < num_iters; ++iter) {
178       C.device(device_) = B.chip(iter % n_, 1);
179     }
180     // Record the number of values copied from the rhs chip to the lhs.
181     finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
182   }
183 
shuffling(int num_iters)184   void shuffling(int num_iters) {
185     eigen_assert(m_ == n_);
186     Eigen::array<TensorIndex, 2> size_a;
187     size_a[0] = m_;
188     size_a[1] = k_;
189     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
190     Eigen::array<TensorIndex, 2> size_b;
191     size_b[0] = k_;
192     size_b[1] = m_;
193     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
194 
195     Eigen::array<int, 2> shuffle;
196     shuffle[0] = 1;
197     shuffle[1] = 0;
198 #ifdef EIGEN_USE_SYCL // warmup for sycl
199     for (int iter = 0; iter < 10; ++iter) {
200       B.device(device_) = A.shuffle(shuffle);
201     }
202 #endif
203     StartBenchmarkTiming();
204     for (int iter = 0; iter < num_iters; ++iter) {
205       B.device(device_) = A.shuffle(shuffle);
206     }
207     // Record the number of values shuffled from A and copied to B each second
208     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
209   }
210 
padding(int num_iters)211  void padding(int num_iters) {
212     eigen_assert(m_ == k_);
213     Eigen::array<TensorIndex, 2> size_a;
214     size_a[0] = m_;
215     size_a[1] = k_-3;
216     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
217     Eigen::array<TensorIndex, 2> size_b;
218     size_b[0] = k_;
219     size_b[1] = m_;
220     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
221 
222 #if defined(EIGEN_HAS_INDEX_LIST)
223     Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
224                          Eigen::type2indexpair<2, 1> > paddings;
225 #else
226     Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
227     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
228     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
229 #endif
230 #ifdef EIGEN_USE_SYCL // warmup for sycl
231     for (int iter = 0; iter < 10; ++iter) {
232       B.device(device_) = A.pad(paddings);
233     }
234 #endif
235     StartBenchmarkTiming();
236     for (int iter = 0; iter < num_iters; ++iter) {
237       B.device(device_) = A.pad(paddings);
238     }
239     // Record the number of values copied from the padded tensor A each second
240     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
241   }
242 
striding(int num_iters)243  void striding(int num_iters) {
244     eigen_assert(m_ == k_);
245     Eigen::array<TensorIndex, 2> size_a;
246     size_a[0] = m_;
247     size_a[1] = k_;
248     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
249     Eigen::array<TensorIndex, 2> size_b;
250     size_b[0] = m_;
251     size_b[1] = k_/2;
252     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
253 
254 #ifndef EIGEN_HAS_INDEX_LIST
255     Eigen::array<TensorIndex, 2> strides;
256     strides[0] = 1;
257     strides[1] = 2;
258 #else
259     // Take advantage of cxx11 to give the compiler information it can use to
260     // optimize the code.
261     Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
262 #endif
263 
264 #ifdef EIGEN_USE_SYCL // warmup for sycl
265     for (int iter = 0; iter < 10; ++iter) {
266       B.device(device_) = A.stride(strides);
267     }
268 #endif
269     StartBenchmarkTiming();
270     for (int iter = 0; iter < num_iters; ++iter) {
271       B.device(device_) = A.stride(strides);
272     }
273     // Record the number of values copied from the padded tensor A each second
274     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
275   }
276 
277 
broadcasting(int num_iters)278   void broadcasting(int num_iters) {
279     Eigen::array<TensorIndex, 2> size_a;
280     size_a[0] = m_;
281     size_a[1] = 1;
282     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
283     Eigen::array<TensorIndex, 2> size_c;
284     size_c[0] = m_;
285     size_c[1] = n_;
286     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
287 
288 #ifndef EIGEN_HAS_INDEX_LIST
289     Eigen::array<int, 2> broadcast;
290     broadcast[0] = 1;
291     broadcast[1] = n_;
292 #else
293     // Take advantage of cxx11 to give the compiler information it can use to
294     // optimize the code.
295     Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
296     broadcast.set(1, n_);
297 #endif
298 
299 #ifdef EIGEN_USE_SYCL // warmup for sycl
300     for (int iter = 0; iter < 10; ++iter) {
301       C.device(device_) = A.broadcast(broadcast);
302     }
303 #endif
304     StartBenchmarkTiming();
305     for (int iter = 0; iter < num_iters; ++iter) {
306       C.device(device_) = A.broadcast(broadcast);
307     }
308     // Record the number of values broadcasted from A and copied to C each second
309     finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
310   }
311 
coeffWiseOp(int num_iters)312   void coeffWiseOp(int num_iters) {
313     eigen_assert(m_ == k_ && k_ == n_);
314     Eigen::array<TensorIndex, 2> sizes;
315     sizes[0] = m_;
316     sizes[1] = m_;
317     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
318     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
319     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
320 #ifdef EIGEN_USE_SYCL // warmup for sycl
321     for (int iter = 0; iter < 10; ++iter) {
322       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
323     }
324 #endif
325     StartBenchmarkTiming();
326     for (int iter = 0; iter < num_iters; ++iter) {
327       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
328     }
329     // Record the number of FLOP executed per second (2 multiplications and
330     // 1 addition per value)
331     finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
332   }
333 
algebraicFunc(int num_iters)334   void algebraicFunc(int num_iters) {
335     eigen_assert(m_ == k_ && k_ == n_);
336     Eigen::array<TensorIndex, 2> sizes;
337     sizes[0] = m_;
338     sizes[1] = m_;
339     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
340     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
341     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
342 
343 #ifdef EIGEN_USE_SYCL // warmup for sycl
344 for (int iter = 0; iter < 10; ++iter) {
345       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
346 }
347 #endif
348     StartBenchmarkTiming();
349     for (int iter = 0; iter < num_iters; ++iter) {
350       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
351     }
352     // Record the number of FLOP executed per second (assuming one operation
353     // per value)
354     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
355   }
356 
transcendentalFunc(int num_iters)357   void transcendentalFunc(int num_iters) {
358     eigen_assert(m_ == k_ && k_ == n_);
359     Eigen::array<TensorIndex, 2> sizes;
360     sizes[0] = m_;
361     sizes[1] = m_;
362     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
363     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
364     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
365 #ifdef EIGEN_USE_SYCL // warmup for sycl
366     for (int iter = 0; iter < 10; ++iter) {
367       C.device(device_) = A.exp() + B.log();
368     }
369 #endif
370     StartBenchmarkTiming();
371     for (int iter = 0; iter < num_iters; ++iter) {
372       C.device(device_) = A.exp() + B.log();
373     }
374     // Record the number of FLOP executed per second (assuming one operation
375     // per value)
376     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
377   }
378 
379  // Row reduction
rowReduction(int num_iters)380   void rowReduction(int num_iters) {
381     Eigen::array<TensorIndex, 2> input_size;
382     input_size[0] = k_;
383     input_size[1] = n_;
384     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
385     Eigen::array<TensorIndex, 1> output_size;
386     output_size[0] = n_;
387     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
388 
389 #ifndef EIGEN_HAS_INDEX_LIST
390     Eigen::array<TensorIndex, 1> sum_along_dim;
391     sum_along_dim[0] = 0;
392 #else
393     // Take advantage of cxx11 to give the compiler information it can use to
394     // optimize the code.
395     Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
396 #endif
397 #ifdef EIGEN_USE_SYCL // warmup for sycl
398   for (int iter = 0; iter < 10; ++iter) {
399     C.device(device_) = B.sum(sum_along_dim);
400   }
401 #endif
402     StartBenchmarkTiming();
403     for (int iter = 0; iter < num_iters; ++iter) {
404       C.device(device_) = B.sum(sum_along_dim);
405     }
406     // Record the number of FLOP executed per second (assuming one operation
407     // per value)
408     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
409   }
410 
411   // Column reduction
colReduction(int num_iters)412   void colReduction(int num_iters) {
413     Eigen::array<TensorIndex, 2> input_size;
414     input_size[0] = k_;
415     input_size[1] = n_;
416     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
417         b_, input_size);
418     Eigen::array<TensorIndex, 1> output_size;
419     output_size[0] = k_;
420     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
421         a_, output_size);
422 
423 #ifndef EIGEN_HAS_INDEX_LIST
424     Eigen::array<TensorIndex, 1> sum_along_dim;
425     sum_along_dim[0] = 1;
426 #else
427     // Take advantage of cxx11 to give the compiler information it can use to
428     // optimize the code.
429     Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
430 #endif
431 #ifdef EIGEN_USE_SYCL // warmup for sycl
432   for (int iter = 0; iter < 10; ++iter) {
433     A.device(device_) = B.sum(sum_along_dim);
434   }
435 #endif
436     StartBenchmarkTiming();
437     for (int iter = 0; iter < num_iters; ++iter) {
438       A.device(device_) = B.sum(sum_along_dim);
439     }
440     // Record the number of FLOP executed per second (assuming one operation
441     // per value)
442     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
443   }
444 
445   // Full reduction
fullReduction(int num_iters)446   void fullReduction(int num_iters) {
447     Eigen::array<TensorIndex, 2> input_size;
448     input_size[0] = k_;
449     input_size[1] = n_;
450     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
451         b_, input_size);
452     Eigen::array<TensorIndex, 0> output_size;
453     TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
454         c_, output_size);
455 #ifdef EIGEN_USE_SYCL // warmup for sycl
456     for (int iter = 0; iter < 10; ++iter) {
457       C.device(device_) = B.sum();
458     }
459 #endif
460     StartBenchmarkTiming();
461     for (int iter = 0; iter < num_iters; ++iter) {
462       C.device(device_) = B.sum();
463     }
464     // Record the number of FLOP executed per second (assuming one operation
465     // per value)
466     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
467   }
468 
469 
470 
471   // do a contraction which is equivalent to a matrix multiplication
contraction(int num_iters)472   void contraction(int num_iters) {
473       contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
474   }
475 
contractionRowMajor(int num_iters)476     void contractionRowMajor(int num_iters) {
477       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
478   }
479 
contractionRowMajorAT(int num_iters)480   void contractionRowMajorAT(int num_iters) {
481       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
482   }
483 
contractionRowMajorBT(int num_iters)484   void contractionRowMajorBT(int num_iters) {
485       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
486   }
487 
contractionRowMajorABT(int num_iters)488   void contractionRowMajorABT(int num_iters) {
489       contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
490   }
491 
convolution(int num_iters,int kernel_x,int kernel_y)492   void convolution(int num_iters, int kernel_x, int kernel_y) {
493     Eigen::array<TensorIndex, 2> input_sizes;
494     input_sizes[0] = m_;
495     input_sizes[1] = n_;
496     TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
497     Eigen::array<TensorIndex, 2> kernel_sizes;
498     kernel_sizes[0] = kernel_x;
499     kernel_sizes[1] = kernel_y;
500     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
501     Eigen::array<TensorIndex, 2> result_sizes;
502     result_sizes[0] = m_ - kernel_x + 1;
503     result_sizes[1] = n_ - kernel_y + 1;
504     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
505     Eigen::array<TensorIndex, 2> dims;
506     dims[0] = 0;
507     dims[1] = 1;
508 #ifdef EIGEN_USE_SYCL // warmup for sycl
509     for (int iter = 0; iter < 10; ++iter) {
510       C.device(device_) = A.convolve(B, dims);
511      }
512 #endif
513     StartBenchmarkTiming();
514     for (int iter = 0; iter < num_iters; ++iter) {
515       C.device(device_) = A.convolve(B, dims);
516     }
517     // Record the number of FLOPs executed per second (kernel_size
518     // multiplications and additions for each value in the resulting tensor)
519     finalizeBenchmark(static_cast<int64_t>(2) *
520         (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
521   }
522 
523  private:
524  // do a contraction which is equivalent to a matrix multiplication
525   template<int Layout>
contraction(int num_iters,bool trans_a,bool trans_b)526   void contraction(int num_iters, bool trans_a, bool trans_b) {
527     Eigen::array<TensorIndex, 2> sizeA;
528     sizeA[0] = (trans_a ? k_: m_);
529     sizeA[1] = (trans_a ? m_:  k_);
530     Eigen::array<TensorIndex, 2> sizeB;
531     sizeB[0] = (trans_b ? n_: k_);
532     sizeB[1] = (trans_b ? k_: n_);
533     Eigen::array<TensorIndex, 2> sizeC;
534     sizeC[0] = m_;
535     sizeC[1] = n_;
536 
537     const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
538     const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
539     TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
540 
541     typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
542     Eigen::array<DimPair, 1> dims;
543     TensorIndex a_contract_dim = (trans_a ? 0 : 1);
544     TensorIndex b_contract_dim = (trans_b ? 1 : 0);
545     dims[0] = DimPair(a_contract_dim, b_contract_dim);
546 #ifdef EIGEN_USE_SYCL // warmup for sycl
547     for (int iter = 0; iter < 10; ++iter) {
548       C.device(device_) = A.contract(B, dims);
549      }
550 #endif
551     StartBenchmarkTiming();
552     for (int iter = 0; iter < num_iters; ++iter) {
553       C.device(device_) = A.contract(B, dims);
554     }
555     // Record the number of FLOP executed per second (size_ multiplications and
556     // additions for each value in the resulting tensor)
557     finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
558   }
559 
initialize()560   void initialize() {
561     a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
562     b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
563     c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
564 
565     // Initialize the content of the memory pools to prevent asan from
566     // complaining.
567     device_.memset(a_, 12, m_ * k_ * sizeof(T));
568     device_.memset(b_, 23, k_ * n_ * sizeof(T));
569     device_.memset(c_, 31, m_ * n_ * sizeof(T));
570 
571   }
572 
finalizeBenchmark(int64_t num_items)573   inline void finalizeBenchmark(int64_t num_items) {
574 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
575     if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
576       device_.synchronize();
577     }
578 #elif defined(EIGEN_USE_SYCL)
579     if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
580       device_.synchronize();
581     }
582 
583 #endif
584     StopBenchmarkTiming();
585     SetBenchmarkFlopsProcessed(num_items);
586   }
587 
588 
589   TensorIndex m_;
590   TensorIndex k_;
591   TensorIndex n_;
592   T* a_;
593   T* b_;
594   T* c_;
595   Device device_;
596 };
597 #endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
598