/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#include <algorithm>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
#include "tensorflow/core/util/gpu_solvers.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace {
template <typename Scalar>
__device__ void ComputePermutationFromTranspositions(
    int64 num_rows, const int* __restrict__ pivots,
    Scalar* __restrict__ permutation_indices) {
  // Fill in the output array with the identity permutation.
  for (int i = 0; i < num_rows; ++i) {
    permutation_indices[i] = Scalar(i);
  }

  // Compute the permutation from a sequence of transpositions encoded
  // in the pivot array by applying the transpositions in order on the
  // identity permutation.
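  // For example, with num_rows = 3 and 1-based pivots {2, 3, 3}, the identity
  // [0, 1, 2] becomes [1, 0, 2] after the first swap and [1, 2, 0] after the
  // second; the third swap is a no-op.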
  for (int i = 0; i < num_rows; ++i) {
    // Note: Internally, cuBLAS uses the Fortran convention (1-based indexing),
    // so row i was swapped with row (pivots[i] - 1) in 0-based indexing.
    Scalar t = permutation_indices[i];
    permutation_indices[i] = permutation_indices[pivots[i] - 1];
    permutation_indices[pivots[i] - 1] = t;
  }
}
}  // namespace

// Kernel to compute the inverse of a permutation from a sequence of
// transpositions.
template <typename Scalar>
__global__ void ComputePermutationFromTranspositionsKernel(
    GpuLaunchConfig config, const int64 num_rows,
    const int* __restrict__ all_pivots,
    Scalar* __restrict__ all_permutation_indices) {
  // We only parallelize over batches here. Performance is not critical,
  // since this cheap O(num_rows) kernel always follows an O(num_rows^3)
  // LU factorization.
  GPU_1D_KERNEL_LOOP(index, config.virtual_thread_count) {
    ComputePermutationFromTranspositions(
        num_rows, all_pivots + index * num_rows,
        all_permutation_indices + index * num_rows);
  }
}

template <class Scalar, class Tidx>
class LuOpGpu : public AsyncOpKernel {
 public:
  explicit LuOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) {}

  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
    const Tensor& input = context->input(0);

    // Analyze shape and validate inputs.
    const int input_rank = input.dims();

    OP_REQUIRES_ASYNC(
        context, input_rank >= 2,
        errors::InvalidArgument("Input must have rank >= 2, got ", input_rank),
        done);

    const int64 num_rows = input.dim_size(input_rank - 2);
    const int64 num_cols = input.dim_size(input_rank - 1);

    OP_REQUIRES_ASYNC(
        context, num_rows == num_cols,
        errors::InvalidArgument("Input matrices must be square, got ", num_rows,
                                " != ", num_cols),
        done);

    TensorShape batch_shape;
    for (int dim = 0; dim < input_rank - 2; ++dim) {
      batch_shape.AddDim(input.dim_size(dim));
    }
    TensorShape permutation_indices_shape = batch_shape;
    permutation_indices_shape.AddDim(num_rows);

    const GPUDevice& device = context->eigen_device<GPUDevice>();
    auto solver = absl::make_unique<GpuSolver>(context);

    // We output the packed triangular factors in a dense form.
    // The lower triangular factor L corresponds to the strictly lower
    // triangular part of packed_triangular_factors with an implicit unit
    // diagonal. The upper triangular factor U is the upper triangular part of
    // packed_triangular_factors. The triangular factors satisfy the equation
    //   P * input_matrix = L * U
    // where P is the permutation matrix corresponding to the indices in
    // permutation_indices.
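    // For example, a 2x2 packed factor
    //   [[u00, u01],
    //    [l10, u11]]
    // encodes L = [[1, 0], [l10, 1]] and U = [[u00, u01], [0, u11]].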
    //
    // Reuse the input buffer or make a copy for the factorization step,
    // depending on whether this op owns it exclusively.
    Tensor* packed_triangular_factors;
    OP_REQUIRES_OK_ASYNC(context,
                         context->forward_input_or_allocate_output(
                             {0}, 0, input.shape(), &packed_triangular_factors),
                         done);
    if (!packed_triangular_factors->SharesBufferWith(input)) {
      device.memcpy(packed_triangular_factors->flat<Scalar>().data(),
                    input.flat<Scalar>().data(),
                    input.NumElements() * sizeof(Scalar));
    }

    // Allocate output permutation.
    Tensor* permutation_indices = nullptr;
    OP_REQUIRES_OK_ASYNC(context,
                         context->allocate_output(1, permutation_indices_shape,
                                                  &permutation_indices),
                         done);

    if (input.NumElements() == 0) {
      done();
      return;
    }

    // Allocate a temporary Tensor to store the transposed packed triangular
    // factors.
    Tensor packed_triangular_factors_transpose;
    OP_REQUIRES_OK_ASYNC(
        context,
        context->allocate_temp(DataTypeToEnum<Scalar>::value, input.shape(),
                               &packed_triangular_factors_transpose),
        done);
    auto packed_triangular_factors_transpose_reshaped =
        packed_triangular_factors_transpose
            .template flat_inner_dims<Scalar, 3>();
    const int64 batch_size =
        packed_triangular_factors_transpose_reshaped.dimension(0);

    // Allocate pivots on the device.
    Tensor pivots;
    OP_REQUIRES_OK_ASYNC(context,
                         solver->allocate_scoped_tensor(
                             DataTypeToEnum<int32>::value,
                             TensorShape{batch_size, num_rows}, &pivots),
                         done);
    auto pivots_mat = pivots.template matrix<int32>();

    // Transpose the input. This is necessary because cuBLAS assumes
    // column-major storage while TensorFlow uses row-major.
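    // Transposing the row-major data yields a buffer whose column-major
    // interpretation equals the original matrix; the factors are transposed
    // back to row-major after the factorization.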
    OP_REQUIRES_OK_ASYNC(
        context,
        DoMatrixTranspose(device, *packed_triangular_factors,
                          &packed_triangular_factors_transpose),
        done);

    std::vector<DeviceLapackInfo> dev_info;
    if (num_rows == num_cols && num_rows / batch_size <= 128) {
      // For small matrices or large batch sizes, we use the batched
      // interface from cuBLAS.
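      // For example, with a batch of 8 matrices this path is taken for
      // matrices up to roughly 1024 x 1024 (num_rows / batch_size <= 128).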
      auto packed_triangular_factors_ptrs = solver->GetScratchSpace<uint8>(
          sizeof(Scalar*) * batch_size, "packed_triangular_factors_ptrs",
          /* on_host */ true);

      Scalar** packed_triangular_factors_ptrs_base = reinterpret_cast<Scalar**>(
          packed_triangular_factors_ptrs.mutable_data());

      for (int batch = 0; batch < batch_size; ++batch) {
        packed_triangular_factors_ptrs_base[batch] =
            &packed_triangular_factors_transpose_reshaped(batch, 0, 0);
      }
      dev_info.push_back(
          solver->GetDeviceLapackInfo(batch_size, "getrfBatched"));
      OP_REQUIRES_OK_ASYNC(
          context,
          solver->GetrfBatched(num_rows, packed_triangular_factors_ptrs_base,
                               num_rows, pivots_mat.data(), &dev_info.back(),
                               batch_size),
          done);
    } else {
      // For small batch sizes we use the non-batched interface from cuSolver,
      // which is much faster for large matrices.
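      // This branch is reached when num_rows / batch_size > 128, e.g. a single
      // large matrix, and calls Getrf once per batch entry.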
      dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
      for (int batch = 0; batch < batch_size; ++batch) {
        OP_REQUIRES_OK_ASYNC(
            context,
            solver->Getrf(
                num_rows, num_cols,
                &packed_triangular_factors_transpose_reshaped(batch, 0, 0),
                num_rows, &pivots_mat(batch, 0), &dev_info.back()(batch)),
            done);
      }
    }

    // Transpose the result since we had transposed the input.
    OP_REQUIRES_OK_ASYNC(
        context,
        DoMatrixTranspose(device, packed_triangular_factors_transpose,
                          packed_triangular_factors),
        done);

    // Pivots encode the permutation of the rows as a sequence of row swaps:
    // for each index i, row i was swapped with row pivots[i] (1-based).
    int* pivots_ptr = pivots.flat<int>().data();
    Tidx* permutation_indices_ptr =
        permutation_indices->template flat<Tidx>().data();
    GpuLaunchConfig cfgPivots = GetGpuLaunchConfig(batch_size, device);
    TF_CHECK_OK(GpuLaunchKernel(
        ComputePermutationFromTranspositionsKernel<Tidx>, cfgPivots.block_count,
        cfgPivots.thread_per_block, 0, device.stream(), cfgPivots, num_rows,
        pivots_ptr, permutation_indices_ptr));

    // Callback for checking info after kernels finish. Also capture the
    // temporary Tensors/ScratchSpace so they don't get deallocated before the
    // kernels run.
    // TODO(rmlarsen): Use move capture once C++14 becomes available.
    auto info_checker = [context, done, dev_info](
                            const Status& status,
                            const std::vector<HostLapackInfo>& host_infos) {
      if (!status.ok() && errors::IsInvalidArgument(status) &&
          !host_infos.empty()) {
        for (int i = 0; i < host_infos[0].size(); ++i) {
          // Match the CPU error message for singular matrices. Otherwise
          // just print the original error message from the status below.
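          // A positive getrf info value indicates an exactly zero pivot, i.e.
          // a singular input matrix.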
          OP_REQUIRES_ASYNC(context, host_infos[0].data()[i] <= 0,
                            errors::InvalidArgument("Input is not invertible."),
                            done);
        }
      }
      OP_REQUIRES_OK_ASYNC(context, status, done);
      done();
    };

    GpuSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info,
                                                   std::move(info_checker));
  }
};

#define REGISTER_LU_GPU(type, idx_type)                                     \
  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
                              .Device(DEVICE_GPU)                           \
                              .TypeConstraint<type>("T")                    \
                              .TypeConstraint<idx_type>("output_idx_type"), \
                          LuOpGpu<type, idx_type>);

REGISTER_LU_GPU(float, int32);
REGISTER_LU_GPU(double, int32);
REGISTER_LU_GPU(complex64, int32);
REGISTER_LU_GPU(complex128, int32);

REGISTER_LU_GPU(float, int64);
REGISTER_LU_GPU(double, int64);
REGISTER_LU_GPU(complex64, int64);
REGISTER_LU_GPU(complex128, int64);
}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM