// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <[email protected]>
// Copyright (C) 2016 Benoit Steiner <[email protected]>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

namespace Eigen {

/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor convolution class.
 *
 *
 */

enum class convolution_type { CONV1D, CONV2D, CONV3D };
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input rows required for each plane in shared memory
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    /// fill the shared memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution // output start x
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
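// A quick sketch of the tiling arithmetic used above (illustrative numbers,
// not taken from this file): each work-group produces local_range[0] outputs
// per plane, and a valid 1D convolution of width kernelSize needs
//   num_input = local_range[0] + kernelSize - 1
// staged inputs. For example, a local range of 64 with a kernel of size 5
// stages 64 + 5 - 1 = 68 coefficients per plane; reads that fall outside the
// input range are padded with CoeffReturnType(0).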
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the tile of inputs required for each plane in shared memory
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};
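// A hedged reading of the 2D indexing above: the (x, y, plane) tile is
// flattened into the 1D local accessor, so element (i, j) of the plane
// handled by this work-item lives at
//   local_acc[i + num_input[0] * (j + plane_kernel_offset)]
// where plane_kernel_offset = get_local_id(2) * num_input[1] selects this
// plane's slice of local memory.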
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      /// fill the shared memory
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution

      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
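// Note on the two barriers in the CONV3D kernel above: the loop over planes
// reuses one local-memory tile for every plane p, so the trailing barrier
// keeps a fast work-item from refilling the tile for plane p + 1 while
// slower work-items are still accumulating results from plane p.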
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
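  // Minimal usage sketch of this evaluator through the public convolve() API
  // (the device setup below is an assumption for illustration, not taken from
  // this file). Each convolved dimension shrinks to input - kernel + 1,
  // matching the result_dim computation in the constructor above:
  //
  //   Eigen::QueueInterface queue_interface{cl::sycl::default_selector()};
  //   Eigen::SyclDevice sycl_device(&queue_interface);
  //   Eigen::Tensor<float, 2> input(40, 50);
  //   Eigen::Tensor<float, 1> kernel(3);
  //   Eigen::array<Eigen::Index, 1> dims{0};   // convolve along dimension 0
  //   Eigen::Tensor<float, 2> result(38, 50);  // 40 - 3 + 1 == 38
  //   result = input.convolve(kernel, dims);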
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
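  // For context on the else-branch above: when the kernel expression exposes
  // no directly addressable data (m_kernelImpl.data() returns NULL), it is
  // materialized once into a temporary device buffer through TensorEvalToOp,
  // so the convolution kernels can read the filter coefficients through a
  // plain accessor instead of re-evaluating the expression per coefficient.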
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }
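      // Layout note for the 2D/3D cases below: kernel_index reorders the
      // kernel dimensions so that index 0 is always the innermost (fastest
      // varying) dimension. For ColMajor that is dimension 0 as written; for
      // RowMajor the order is reversed (a 2D RowMajor kernel maps to
      // kernel_index = {1, 0}).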
      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }
      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H