#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "unsupported/Eigen/CXX11/Tensor"
#include "benchmark.h"

#define BENCHMARK_RANGE(bench, lo, hi) \
  BENCHMARK(bench)->Range(lo, hi)

using Eigen::Tensor;
using Eigen::TensorMap;

// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device, typename T> class BenchmarkSuite {
 public:
  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
      : m_(m), k_(k), n_(n), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m)
      : m_(m), k_(m), n_(m), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m, size_t k)
      : m_(1), k_(k), n_(m), device_(device) {
    initialize();
  }

  ~BenchmarkSuite() {
    device_.deallocate(a_);
    device_.deallocate(b_);
    device_.deallocate(c_);
  }

  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void typeCasting(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    if (sizeof(T) >= sizeof(int)) {
      sizes[0] = m_;
      sizes[1] = k_;
    } else {
      sizes[0] = m_ * sizeof(T) / sizeof(int);
      sizes[1] = k_ * sizeof(T) / sizeof(int);
    }
    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void random(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = C.random();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
    }
    // Record the number of random numbers generated per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }
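  // Assemble C from the four quadrants of A and B using slice assignments.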
  void slicing(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
    // Record the number of values copied from the rhs slice to the lhs slice
    // each second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void rowChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }
  void colChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void shuffling(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
    // Record the number of values shuffled from A and copied to B each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void padding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_-3;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#if defined(EIGEN_HAS_INDEX_LIST)
    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                         Eigen::type2indexpair<2, 1> > paddings;
#else
    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
    // Record the number of values copied from the padded tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }
  void striding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = m_;
    size_b[1] = k_/2;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 2> strides;
    strides[0] = 1;
    strides[1] = 2;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.stride(strides);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
    }
    // Record the number of values copied from the strided tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void broadcasting(int num_iters) {
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = 1;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_c;
    size_c[0] = m_;
    size_c[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<int, 2> broadcast;
    broadcast[0] = 1;
    broadcast[1] = n_;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    broadcast.set(1, n_);
#endif

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
    // Record the number of values broadcast from A and copied to C each second
    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
  }

  void coeffWiseOp(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) +
                          B * B.constant(static_cast<T>(2.7));
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) +
                          B * B.constant(static_cast<T>(2.7));
    }
    // Record the number of FLOPs executed per second (2 multiplications and
    // 1 addition per value)
    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
  }

  void algebraicFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }
  void transcendentalFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Row reduction
  void rowReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 0;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }
  // Column reduction
  void colReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = k_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
        a_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 1;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      A.device(device_) = B.sum(sum_along_dim);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      A.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Full reduction
  void fullReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.sum();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // do a contraction which is equivalent to a matrix multiplication
  void contraction(int num_iters) {
    contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
  }

  void contractionRowMajor(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
  }

  void contractionRowMajorAT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
  }

  void contractionRowMajorBT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
  }

  void contractionRowMajorABT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
  }

  void convolution(int num_iters, int kernel_x, int kernel_y) {
    Eigen::array<TensorIndex, 2> input_sizes;
    input_sizes[0] = m_;
    input_sizes[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    Eigen::array<TensorIndex, 2> kernel_sizes;
    kernel_sizes[0] = kernel_x;
    kernel_sizes[1] = kernel_y;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    Eigen::array<TensorIndex, 2> result_sizes;
    result_sizes[0] = m_ - kernel_x + 1;
    result_sizes[1] = n_ - kernel_y + 1;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
    // Record the number of FLOPs executed per second (kernel_size
    // multiplications and additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) *
        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
  }

 private:
  // do a contraction which is equivalent to a matrix multiplication
  template<int Layout>
  void contraction(int num_iters, bool trans_a, bool trans_b) {
    Eigen::array<TensorIndex, 2> sizeA;
    sizeA[0] = (trans_a ? k_ : m_);
    sizeA[1] = (trans_a ? m_ : k_);
    Eigen::array<TensorIndex, 2> sizeB;
    sizeB[0] = (trans_b ? n_ : k_);
    sizeB[1] = (trans_b ? k_ : n_);
    Eigen::array<TensorIndex, 2> sizeC;
    sizeC[0] = m_;
    sizeC[1] = n_;

    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
    TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);

    typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    TensorIndex a_contract_dim = (trans_a ? 0 : 1);
    TensorIndex b_contract_dim = (trans_b ? 1 : 0);
    dims[0] = DimPair(a_contract_dim, b_contract_dim);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
    // Record the number of FLOPs executed per second (k_ multiplications and
    // additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
  }

  void initialize() {
    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));

    // Initialize the content of the memory pools to prevent asan from
    // complaining.
    device_.memset(a_, 12, m_ * k_ * sizeof(T));
    device_.memset(b_, 23, k_ * n_ * sizeof(T));
    device_.memset(c_, 31, m_ * n_ * sizeof(T));
  }

  inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
      device_.synchronize();
    }
#elif defined(EIGEN_USE_SYCL)
    if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
      device_.synchronize();
    }
#endif
    StopBenchmarkTiming();
    SetBenchmarkFlopsProcessed(num_items);
  }

  TensorIndex m_;
  TensorIndex k_;
  TensorIndex n_;
  T* a_;
  T* b_;
  T* c_;
  Device device_;
};
#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
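
// A minimal usage sketch (an illustration, not part of the upstream header):
// drivers typically wrap a suite method in a free function matching the
// harness's (int iters, int size) signature and register it with the
// BENCHMARK_RANGE macro defined above. The device type below is an assumption;
// real drivers may use Eigen::ThreadPoolDevice or Eigen::GpuDevice instead.
//
//   static void BM_memcpy(int iters, int N) {
//     StopBenchmarkTiming();  // suspend timing while the suite allocates
//     Eigen::DefaultDevice device;
//     BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, N);
//     suite.memcpy(iters);    // calls StartBenchmarkTiming() internally
//   }
//   BENCHMARK_RANGE(BM_memcpy, 10, 5000);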