// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

namespace Eigen {
namespace internal {


#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple GPU threads to safely accumulate at the same
// output address. It reads the current value of the output variable and attempts to
// update it with the new value. If, in the meantime, another GPU thread has updated the
// content of the output address, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
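
// Illustrative sketch (not part of Eigen): the same compare-and-swap retry pattern,
// written out for a plain float max-reduction without the reducer abstraction.
// atomicCAS only operates on integer words, so the float bits are reinterpreted with
// __float_as_uint / __uint_as_float. The function name is hypothetical.
//
//   __device__ void atomicMaxFloat(float* output, float value) {
//     unsigned int old_bits = __float_as_uint(*output);
//     while (true) {
//       const float current = __uint_as_float(old_bits);
//       const float merged = fmaxf(current, value);   // plays the role of reducer.reduce()
//       if (merged == current) return;                // nothing left to publish
//       const unsigned int readback =
//           atomicCAS(reinterpret_cast<unsigned int*>(output), old_bits, __float_as_uint(merged));
//       if (readback == old_bits) return;             // our CAS won the race
//       old_bits = readback;                          // lost the race: merge again and retry
//     }
//   }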

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
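
// Informational aside (editorial, an assumption about intent): CUDA's atomicExch has no
// double overload, so the specialization above exchanges the raw 64-bit pattern instead.
// The bit-level round trip is exact:
//
//   double d = 3.14;
//   unsigned long long bits = __double_as_longlong(d);  // reinterpret the bits, no conversion
//   // __longlong_as_double(bits) == d holds exactly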

#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
// The reducer must be associative: the wide-vector reduction below is not atomic as a
// whole, it is performed as a sequence of atomic half2 updates.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif // EIGEN_HAS_GPU_FP16

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
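
// Note (an illustrative aside, not Eigen code): for float sums, the hardware atomicAdd
// above replaces the generic CAS retry loop entirely. A matching specialization for
// double could forward to atomicAdd(double*, double), but that intrinsic only exists on
// compute capability 6.0+ devices, so doubles keep using the generic CAS path. A sketch
// of what such a specialization could look like, guarded by architecture:
//
//   #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
//   template <>
//   __device__ inline void atomicReduce(double* output, double accum, SumReducer<double>&) {
//     atomicAdd(output, accum);  // native FP64 atomic add on sm_60 and newer
//   }
//   #endif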


template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}


template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        }
        while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // Use std::is_floating_point to determine the type of accum.
    // This is needed because when the type is double, hipcc will give a "call to __shfl_down is ambiguous"
    // error and list the float and int versions of __shfl_down as the candidate functions.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
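
// Illustrative sketch (an assumption, not part of Eigen): the warp-level tree reduction
// used above, written out for a plain float sum. After log2(warpSize) shuffle steps,
// lane 0 of each warp holds the sum of the whole warp, which is then published with a
// single atomic per warp.
//
//   __device__ float warpReduceSum(float val) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//       // Each lane adds the value held by the lane `offset` positions above it.
//       val += __shfl_down_sync(0xFFFFFFFF, val, offset);
//     }
//     return val;  // only lane 0 holds the complete warp-wide sum
//   }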


#ifdef EIGEN_HAS_GPU_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                                                                   packet_traits<Eigen::half>::type* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch =
          __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    *scratch = reducer.template initializePacket<packet_type>();
  }
}

template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;

  const Index num_packets =
      num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                                                          half* output, packet_traits<Eigen::half>::type* scratch) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  eigen_assert(NumPerThread % packet_width == 0);
  const Index first_index =
      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel

  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        *scratch = reducer.template initializePacket<PacketType>();
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(
              input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
              input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.m_impl.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        *scratch = reducer.template initializePacket<PacketType>();
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width,
                          NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    eigen_assert(index + packet_width < num_coeffs);
    PacketType val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
      union { int i; half2 h; } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}
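
// Illustrative sketch (editorial, not Eigen code): how a packed half2 pair is collapsed
// into a single scalar result, as done for *rv1 just above.
//
//   half2 pair = ...;                             // two partial results packed side by side
//   half result = __low2half(pair);               // start from the low half
//   reducer.reduce(__high2half(pair), &result);   // fold in the high half
//   // `result` now holds reduce(low, high)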

template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
  // This kernel is launched with a single block of a single thread.
  eigen_assert(threadIdx.x == 0);
  half2* pscratch = reinterpret_cast<half2*>(scratch);
  half tmp = __float2half(0.f);
  typedef packet_traits<Eigen::half>::type packet_type;
  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
    reducer.reduce(__low2half(*pscratch), &tmp);
    reducer.reduce(__high2half(*pscratch), &tmp);
    pscratch++;
  }
  *output = tmp;
}

#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
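
// Worked example of the launch configuration above (illustrative numbers): with
// block_size = 256 and num_per_thread = 128, each block covers 256 * 128 = 32768
// coefficients, so a tensor with 1 << 20 = 1048576 coefficients is reduced with
// divup(1048576, 32768) = 32 blocks. The semaphore is only needed in that multi-block
// case, to serialize the initialization of the single output value.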

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename packet_traits<Eigen::half>::type PacketType;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
    // half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't be
      // sure that there won't be race conditions between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                        1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_GPU_FP16


template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                                  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
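
// Usage sketch (illustrative, under the assumption of a GPU build with EIGEN_USE_GPU;
// the buffer names d_in/d_out are hypothetical device pointers): a full reduction written
// against the Tensor API ends up dispatching to FullReducer/FullReductionLauncher above.
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 128, 1024);  // d_in: 128*1024 floats
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);           // d_out: a single float
//   out.device(gpu_device) = in.sum();  // full reduction of all coefficients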


template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // Use std::is_floating_point to determine the type of reduced_val.
        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous"
        // error and list the float and int versions of __shfl_down as the candidate functions.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
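
// Worked example of the block-to-work mapping above (illustrative numbers): with
// blockDim.x = 256 and NumPerThread = 128 (the launcher's settings), each column block
// covers 256 * 128 = 32768 coefficients of one row. For num_coeffs_to_reduce = 100000
// this gives input_col_blocks = divup(100000, 32768) = 4, and with
// num_preserved_coeffs = 64 there are 4 * 64 = 256 logical input blocks; block i works
// on row i / 4 and column block i % 4, striding over the logical blocks by gridDim.x.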

#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                           half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs;
         i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin =
          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col =
            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce;
               col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(
                row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>(
                (row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
                                     1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel off the last, odd element.
              const half last1 =
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 =
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     (row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
          union { int i; half2 h; } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union { int i; half2 h; } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] =
              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
          hr2[i] =
              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#endif
      }
      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}
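
// Note on the kernel above (editorial summary of what the code does): rows are processed
// in pairs so that the two per-row results can be published with a single half2
// atomicReduce covering output[row] and output[row + 1]. This is also why the launcher
// below falls back to the generic code path when num_preserved_vals is odd: the last
// row would have no partner to pair with.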

#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
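
// Worked example of the launch-size clamping above (the hardware figures are assumed,
// purely for illustration): on a device reporting 80 multiprocessors and 2048 threads
// per multiprocessor, max_blocks = 80 * 2048 / 256 = 640. A reduction of a 100000 x 64
// problem has num_coeffs = 6400000, so dyn_blocks = divup(6400000, 32768) = 196 and
// num_blocks = min(640, 196) = 196.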

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = /*256*/128;
    const int num_per_thread = /*128*/64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_GPU_FP16


template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of floats, doubles and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                                  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // For small reductions it's faster to fall back to the generic code path.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
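
// Worked example of the index mapping above (illustrative numbers): with
// num_preserved_coeffs = 10 and NumPerThread = 16, work item i = 23 handles
// input_col = 23 % 10 = 3 and input_row = (23 / 10) * 16 = 32, i.e. it reduces rows
// 32..47 of column 3 (assuming at least 48 rows) through the strided accesses
// coeff(j * num_preserved_coeffs + 3).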


template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't handle exotic types such as complex well,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of floats and doubles.
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
  // FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
  // (in the cxx11_tensor_reduction_gpu test)
  //
  //   terminate called after throwing an instance of 'std::runtime_error'
  //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
  //
  // It is not clear why this happens (nor why it surfaces as a runtime error instead of a
  // compile-time error).
  //
  // This will be fixed by HIP PR#457.
  EIGEN_DEVICE_FUNC
#endif
  bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // For small reductions it's faster to fall back to the generic code path.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)


} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H