// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <[email protected]>
// Copyright (C) 2016 Benoit Steiner <[email protected]>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

namespace Eigen {

/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor convolution class.
 *
 * This header provides the SYCL specializations of the tensor convolution
 * evaluator for 1D, 2D and 3D kernels.
 */
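
// A minimal usage sketch (not part of this header), written in the style of the
// SYCL convolution tests; the selector, sizes and variable names below are
// illustrative assumptions, not an API mandated by this file:
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 1> input(128), kernel(7), result(122);
//   input.setRandom();
//   kernel.setRandom();
//   float *d_in = static_cast<float *>(sycl_device.allocate(input.size() * sizeof(float)));
//   float *d_k = static_cast<float *>(sycl_device.allocate(kernel.size() * sizeof(float)));
//   float *d_out = static_cast<float *>(sycl_device.allocate(result.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(d_in, input.data(), input.size() * sizeof(float));
//   sycl_device.memcpyHostToDevice(d_k, kernel.data(), kernel.size() * sizeof(float));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 128), gpu_k(d_k, 7), gpu_out(d_out, 122);
//   Eigen::array<Eigen::Tensor<float, 1>::Index, 1> dims{{0}};
//   gpu_out.device(sycl_device) = gpu_in.convolve(gpu_k, dims);  // dispatches to the kernels below
//   sycl_device.memcpyDeviceToHost(result.data(), d_out, result.size() * sizeof(float));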

enum class convolution_type { CONV1D, CONV2D, CONV3D };
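// EigenConvolutionKernel is the SYCL functor that performs the convolution on
// the device; it is specialized below for each convolution_type.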
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

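  // Collapses the element-wise result of comparing a SYCL id against a range
  // (e.g. itemID.get_global_id() < input_range) into a single predicate.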
  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
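  // Each work-group stages a tile of (local_range[0] + kernelSize - 1) input
  // elements per plane into local memory, synchronizes, and then each in-range
  // work-item computes one output coefficient from that tile.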
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // number of input elements required per plane in shared (local) memory
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    /// fill the shared memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution; first_output_start is the first output index (along x) of this work-group
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

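  // 2D analogue of the 1D kernel above: each work-group stages a
  // (local_range + kernel_size - 1) tile per plane into local memory, then each
  // in-range work-item accumulates its kernel_size[0] x kernel_size[1] window.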
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // number of input elements required per plane, in each dimension, in shared (local) memory
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output start offset for this work-group
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
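  // Unlike the 1D/2D kernels, all three nd_range dimensions map to convolved
  // dimensions here, so the remaining numP planes are processed sequentially
  // in a loop, with a barrier between planes because the local-memory tile is
  // reused for each plane.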
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      /// fill the shared memory
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution

      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};

template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
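    // "valid" convolution: each convolved dimension shrinks by kernel_dim - 1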
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
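      // The kernel is an unevaluated expression: materialize it into a
      // temporary device buffer (freed in cleanup()) so the convolution
      // kernels can read the coefficients directly.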
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
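        // e.g. local_range = {64, 4} with kernel_size = 7 requires
        // (64 + 7 - 1) * 4 = 280 local-memory elements per work-group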
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }

      case 2: {
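        // The kernel dimensions seen by the device kernel must follow the
        // storage order, so remap them for RowMajor layouts.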
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }

      case 3: {
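        // Same layout-dependent remapping as the 2D case, over three dimensions.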
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};  // struct TensorEvaluator

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H