/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h"
#include "tensorflow/core/util/env_var.h"  // for ReadBoolFromEnvVar

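// The segment reduction functor and kernel definitions live in
// segment_reduction_ops_gpu.cu.h; this file only provides explicit template
// instantiations for them. The "_0" filename suffix suggests the
// instantiations are split across sibling segment_reduction_ops_gpu_*.cu.cc
// files, presumably to keep per-file compile times manageable.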
namespace tensorflow {

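// Returns true if the user has requested deterministic GPU implementations of
// the segment reduction ops via the TF_USE_DETERMINISTIC_SEGMENT_REDUCTIONS
// environment variable. The result is cached in a function-local static
// (initialized by an immediately invoked lambda, which C++11 guarantees to be
// thread-safe), so the environment is only consulted once per process.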
bool UseDeterministicSegmentReductions() {
  // See comment below regarding CI build error on Windows.
#if !defined(PLATFORM_WINDOWS)
  static bool cached_result = [] {
    bool result = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
        "TF_USE_DETERMINISTIC_SEGMENT_REDUCTIONS",
        /*default_val=*/false, &result));
    return result;
  }();
  return cached_result;
#else
  return false;
#endif
}

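// Returns true if TF_DISABLE_SEGMENT_REDUCTION_OP_DETERMINISM_EXCEPTIONS is
// set. When op determinism is required, segment reduction ops without a
// deterministic GPU implementation normally raise an error; this escape hatch
// suppresses those exceptions. Cached the same way as above.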
bool DisableSegmentReductionOpDeterminismExceptions() {
  static bool cached_disable = [] {
    bool disable = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
        "TF_DISABLE_SEGMENT_REDUCTION_OP_DETERMINISM_EXCEPTIONS",
        /*default_val=*/false, &disable));
    return disable;
  }();
  return cached_disable;
}
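
// Example (hypothetical usage, not part of this file's API): exporting
//   TF_USE_DETERMINISTIC_SEGMENT_REDUCTIONS=1
// in the environment before running a TensorFlow program makes
// UseDeterministicSegmentReductions() return true, steering segment
// reductions onto the deterministic code paths (on non-Windows builds).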

namespace functor {

#define DEFINE_SORTED_GPU_SPECS_INDEX(T, Index)               \
  template struct SegmentReductionFunctor<                    \
      T, Index, /*InitialValueF=*/functor::Zero<T>,           \
      /*EmptySegmentValueF=*/functor::Zero<T>, functor::Sum>; \
  template struct SegmentReductionFunctor<                    \
      T, Index, /*InitialValueF=*/functor::One<T>,            \
      /*EmptySegmentValueF=*/functor::One<T>, functor::Prod>; \
  template struct SegmentReductionFunctor<                    \
      T, Index, /*InitialValueF=*/functor::Highest<T>,        \
      /*EmptySegmentValueF=*/functor::Zero<T>, functor::Min>; \
  template struct SegmentReductionFunctor<                    \
      T, Index, /*InitialValueF=*/functor::Lowest<T>,         \
      /*EmptySegmentValueF=*/functor::Zero<T>, functor::Max>;

#define DEFINE_SORTED_GPU_SPECS(T) DEFINE_SORTED_GPU_SPECS_INDEX(T, int32);

TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS);
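// For reference, DEFINE_SORTED_GPU_SPECS(float) expands to four explicit
// instantiations such as:
//   template struct SegmentReductionFunctor<
//       float, int32, /*InitialValueF=*/functor::Zero<float>,
//       /*EmptySegmentValueF=*/functor::Zero<float>, functor::Sum>;
// This forces the template definitions in segment_reduction_ops_gpu.cu.h to
// be compiled into this translation unit for every GPU number type.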

#define DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, Index)                         \
  template struct UnsortedSegmentFunctor<GPUDevice, T, Index,                  \
                                         functor::Lowest<T>, functor::Max>;    \
  template struct UnsortedSegmentFunctor<GPUDevice, T, Index,                  \
                                         functor::Highest<T>, functor::Min>;   \
  template struct UnsortedSegmentFunctor<GPUDevice, T, Index, functor::One<T>, \
                                         functor::Prod>;

// Sum is currently the only op that supports all input types.
#define DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, Index)         \
  template struct UnsortedSegmentFunctor<GPUDevice, T, Index, \
                                         functor::Zero<T>, functor::Sum>;

#define DEFINE_REAL_GPU_SPECS(T) DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int32);

#define DEFINE_SUM_GPU_SPECS(T) DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int32);

TF_CALL_GPU_NUMBER_TYPES(DEFINE_REAL_GPU_SPECS);
TF_CALL_GPU_NUMBER_TYPES(DEFINE_SUM_GPU_SPECS);
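// Analogously, DEFINE_SUM_GPU_SPECS(float) expands to:
//   template struct UnsortedSegmentFunctor<GPUDevice, float, int32,
//                                          functor::Zero<float>, functor::Sum>;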

#undef DEFINE_SORTED_GPU_SPECS_INDEX
#undef DEFINE_SORTED_GPU_SPECS
#undef DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX
#undef DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX
#undef DEFINE_REAL_GPU_SPECS
#undef DEFINE_SUM_GPU_SPECS

// TODO(benbarsdell): These kernels are disabled on Windows as a workaround for
// a CI build error: "formal parameter with requested alignment of 128 won't be
// aligned". The root cause is suspected to be an aligned type (AlignedVector)
// being passed to a function by value, possibly inside the CUB library
// somewhere, but I have not yet been able to reproduce it in isolation outside
// of the GitHub CI.
#if !defined(PLATFORM_WINDOWS)

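// Each sparse functor below is instantiated with int32 indices and with both
// int32 and int64_t segment-id types (the final template argument).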
#define DEFINE_SPARSE_SEGMENT_REDUCTION_FUNCTOR(T)                \
  template struct SparseSegmentReductionFunctor<T, int32, int32>; \
  template struct SparseSegmentReductionFunctor<T, int32, int64_t>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_SPARSE_SEGMENT_REDUCTION_FUNCTOR);
#undef DEFINE_SPARSE_SEGMENT_REDUCTION_FUNCTOR

#define DEFINE_SPARSE_SEGMENT_GRAD_FUNCTOR(T)                           \
  template struct SparseSegmentGradFunctor<GPUDevice, T, int32, int32>; \
  template struct SparseSegmentGradFunctor<GPUDevice, T, int32, int64_t>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_SPARSE_SEGMENT_GRAD_FUNCTOR);
#undef DEFINE_SPARSE_SEGMENT_GRAD_FUNCTOR

#endif  // !defined(PLATFORM_WINDOWS)

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM