/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/kernels/dense_update_functor.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/kernels/gather_nd_op.h"
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/util/cuda_sparse.h"
#include "tensorflow/core/util/gpu_solvers.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/stream_executor/rocm/rocm_activation.h"
using ::perftools::gputools::rocm::ScopedActivateExecutorContext;
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Op to convert dense matrices to CSR SparseMatrices on the CPU.
// Takes a Tensor of rank 2 or (if batched) 3 and a corresponding list of
// indices as input.
//
// The (batched) CSR SparseMatrix is constructed using only the values at the
// given indices. This implementation assumes that the indices are sorted with
// respect to batch indices and are in row-major order.
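//
// For illustration (rank 2, single batch): given the dense matrix
//
//   [[1, 0, 0, 2],
//    [0, 0, 0, 0],
//    [0, 3, 0, 0]]
//
// with indices [[0, 0], [0, 3], [2, 1]] and gathered values [1, 2, 3], the
// resulting CSR components are csr_row_ptr = [0, 2, 2, 3],
// csr_col_ind = [0, 3, 1], and values = [1, 2, 3].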
template <typename Device, typename T>
class DenseToCSRSparseMatrixCPUOp : public OpKernel {
 public:
  explicit DenseToCSRSparseMatrixCPUOp(OpKernelConstruction* c) : OpKernel(c) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& params = ctx->input(0);
    const Tensor& indices = ctx->input(1);

    // TODO(anudhyan): Factor out common input validation for CPU and GPU ops
    // into a single function.
    const TensorShape& dense_tensor_shape = params.shape();
    const int rank = params.dims();
    OP_REQUIRES(ctx, rank == 2 || rank == 3,
                errors::InvalidArgument(
                    "params must have rank == 2 or 3; ",
                    "but saw shape: ", dense_tensor_shape.DebugString()));
    OP_REQUIRES(
        ctx, indices.dims() == 2,
        errors::InvalidArgument("indices must be a matrix, but saw shape: ",
                                indices.shape().DebugString()));
    OP_REQUIRES(
        ctx, indices.dim_size(1) == rank,
        errors::InvalidArgument(
            "indices.shape[1] must be equal to the rank of params, but saw: ",
            indices.dim_size(1), " vs. ", rank));

    Tensor dense_shape(cpu_allocator(), DT_INT64, TensorShape({rank}));
    auto dense_shape_mutable = dense_shape.vec<int64_t>();
    for (int i = 0; i < rank; ++i) {
      dense_shape_mutable(i) = dense_tensor_shape.dim_size(i);
    }

    const int64_t batch_size = (rank == 2) ? 1 : dense_tensor_shape.dim_size(0);
    const int64_t num_rows = dense_tensor_shape.dim_size((rank == 2) ? 0 : 1);
    const int64_t total_nnz = indices.NumElements() / rank;

    Tensor values;
    OP_REQUIRES_OK(ctx, functor::DoGatherNd<Device, T, int64_t>(
                            ctx, params, indices, &values));

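    // Allocate the CSR components. csr_row_ptr stores one (num_rows + 1)
    // block of row offsets per batch, laid out back to back, while batch_ptr
    // records where each batch's nonzeros begin within csr_col_ind/values.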
    Tensor batch_ptr(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1}));
    Tensor csr_col_ind(cpu_allocator(), DT_INT32, TensorShape({total_nnz}));
    Tensor csr_row_ptr(cpu_allocator(), DT_INT32,
                       TensorShape({(num_rows + 1) * batch_size}));

    // Fill the row pointers with zeros.
    functor::SetZeroFunctor<Device, int32> set_zero;
    set_zero(ctx->eigen_device<Device>(), csr_row_ptr.flat<int32>());

    // Convert from COO to CSR format.
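    // The functor fills batch_ptr with the cumulative nnz per batch and, for
    // each batch, turns the row coordinates in `indices` into the
    // (num_rows + 1)-length csr_row_ptr offsets while copying the column
    // coordinates into csr_col_ind.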
    functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
    OP_REQUIRES_OK(ctx,
                   coo_to_csr(batch_size, num_rows, indices.matrix<int64_t>(),
                              batch_ptr.vec<int32>(), csr_row_ptr.vec<int32>(),
                              csr_col_ind.vec<int32>()));

    CSRSparseMatrix output_csr_matrix;
    OP_REQUIRES_OK(ctx, CSRSparseMatrix::CreateCSRSparseMatrix(
                            values.dtype(), dense_shape, batch_ptr, csr_row_ptr,
                            csr_col_ind, values, &output_csr_matrix));
    Tensor* output_csr_matrix_tensor;
    AllocatorAttributes cpu_alloc;
    cpu_alloc.set_on_host(true);
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output(0, TensorShape({}), &output_csr_matrix_tensor,
                                  cpu_alloc));
    output_csr_matrix_tensor->scalar<Variant>()() =
        std::move(output_csr_matrix);
  }
};

#define REGISTER_CPU(T)                                  \
  REGISTER_KERNEL_BUILDER(Name("DenseToCSRSparseMatrix") \
                              .Device(DEVICE_CPU)        \
                              .TypeConstraint<T>("T"),   \
                          DenseToCSRSparseMatrixCPUOp<CPUDevice, T>);

REGISTER_CPU(float)
REGISTER_CPU(double)
REGISTER_CPU(complex64)
REGISTER_CPU(complex128)

#undef REGISTER_CPU

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

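// Op to convert dense matrices to CSR SparseMatrices on the GPU.
//
// The conversion is asynchronous: for batched (rank 3) input, a device kernel
// first counts the nonzeros per batch and the counts are copied to the host;
// the remaining work (gathering values, building the per-batch row pointers
// and column indices, and assembling the CSRSparseMatrix variant) runs in a
// callback once that copy has completed.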
template <typename Device, typename T>
class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
 public:
  explicit DenseToCSRSparseMatrixGPUOp(OpKernelConstruction* c)
      : AsyncOpKernel(c) {}

  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
    auto stream = c->op_device_context()->stream();
    const Device& d = c->eigen_device<Device>();

    const Tensor& params_t = c->input(0);
    const Tensor& indices_t = c->input(1);
    const TensorShape& dense_tensor_shape = params_t.shape();
    const int rank = params_t.dims();
    OP_REQUIRES_ASYNC(c, rank == 2 || rank == 3,
                      errors::InvalidArgument(
                          "params must have rank == 2 or 3; ",
                          "but saw shape: ", dense_tensor_shape.DebugString()),
                      done);
    OP_REQUIRES_ASYNC(
        c, indices_t.dims() == 2,
        errors::InvalidArgument("indices must be a matrix, but saw shape: ",
                                indices_t.shape().DebugString()),
        done);
    OP_REQUIRES_ASYNC(
        c, indices_t.dim_size(1) == rank,
        errors::InvalidArgument(
            "indices.shape[1] must be equal to the rank of params, but saw: ",
            indices_t.dim_size(1), " vs. ", rank),
        done);
    const int64_t batch_size = (rank == 2) ? 1 : dense_tensor_shape.dim_size(0);
    const int64_t rows = dense_tensor_shape.dim_size((rank == 2) ? 0 : 1);
    const int64_t cols = dense_tensor_shape.dim_size((rank == 2) ? 1 : 2);

    ScratchSpace<int32> nnz_per_batch_host(c, batch_size, /*on_host*/ true);

    Tensor nnz_per_batch_device_t;
    if (rank == 2) {
      // Simple case: a single batch, so its nnz is just the number of indices.
      nnz_per_batch_host.mutable_data()[0] = indices_t.dim_size(0);
    } else {
      OP_REQUIRES_OK_ASYNC(c,
                           c->allocate_temp(DT_INT32, TensorShape({batch_size}),
                                            &nnz_per_batch_device_t),
                           done);
      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32>();

      functor::CalculateNNZPerBatchMatrixFromIndices<Device>
          calculate_nnz_from_indices;
      auto indices = indices_t.matrix<int64_t>();
      OP_REQUIRES_OK_ASYNC(
          c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device),
          done);

      perftools::gputools::DeviceMemoryBase nnz_per_batch_device_ptr(
          static_cast<void*>(nnz_per_batch_device.data()));

      OP_REQUIRES_ASYNC(
          c,
          stream
              ->ThenMemcpy(nnz_per_batch_host.mutable_data() /*host_dst*/,
                           nnz_per_batch_device_ptr /*gpu_src*/,
                           batch_size * sizeof(int32) /*size*/)
              .ok(),
          errors::Internal("DenseToCSRSparseMatrixGPUOp: failed to copy "
                           "nnz_per_batch from device"),
          done);
    }

    // TODO(ebrevdo): write a custom pair of kernels: one that
    // calculates the batched csr_row_ptr vector, another that fills in
    // the col_ind and values vectors.
    TensorReference nnz_per_batch_device_ref(nnz_per_batch_device_t);
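    // For rank 3 input, convert_to_csr is deferred until the device-to-host
    // copy above has completed (see the end of ComputeAsync). The lambda
    // captures nnz_per_batch_host by value so the host scratch buffer stays
    // alive until it runs, and the TensorReference above keeps the device
    // counts alive until the lambda unrefs it.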
    auto convert_to_csr = [this, c, rank, batch_size, nnz_per_batch_host,
                           nnz_per_batch_device_ref, stream, &d, &params_t,
                           &indices_t, dense_tensor_shape, rows, cols, done]() {
      // The data has been copied out of the nnz_per_batch_device
      // tensor by the time we get here; we can unreference it.
      nnz_per_batch_device_ref.Unref();

      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32>();

      // Ensure that within the callback, the proper GPU settings are
      // configured.
      ScopedActivateExecutorContext scoped_activation{stream->parent()};

      // Extract out the values.
      Tensor temp_values_t;
      OP_REQUIRES_OK_ASYNC(c,
                           (functor::DoGatherNd<Device, T, int64>(
                               c, params_t, indices_t, &temp_values_t)),
                           done);
      const Tensor& values_t = const_cast<const Tensor&>(temp_values_t);

      OP_REQUIRES_ASYNC(
          c, TensorShapeUtils::IsVector(values_t.shape()),
          errors::Internal("Expected values_t to be a vector, but saw shape: ",
                           values_t.shape().DebugString()),
          done);

      Tensor dense_shape_t(cpu_allocator(), DT_INT64, TensorShape({rank}));
      auto dense_shape_mutable = dense_shape_t.vec<int64_t>();
      for (int i = 0; i < rank; ++i) {
        dense_shape_mutable(i) = dense_tensor_shape.dim_size(i);
      }
      auto dense_shape =
          const_cast<const Tensor&>(dense_shape_t).vec<int64_t>();

      Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                         TensorShape({batch_size + 1}));
      auto batch_ptr = batch_ptr_t.vec<int32>();
      auto indices = indices_t.matrix<int64_t>();

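      // batch_ptr is the running sum of nnz_per_batch with a leading zero:
      // batch_ptr(i) is the offset of batch i's first nonzero and
      // batch_ptr(batch_size) is the total nnz. For example,
      // nnz_per_batch = [2, 0, 3] yields batch_ptr = [0, 2, 2, 5].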
      batch_ptr(0) = 0;
      for (int i = 0; i < batch_size; ++i) {
        batch_ptr(i + 1) = batch_ptr(i) + nnz_per_batch(i);
      }
      int total_nnz = batch_ptr(batch_size);
      OP_REQUIRES_ASYNC(
          c, total_nnz == values_t.NumElements(),
          errors::Internal("nnz returned by "
                           "CalculateNNZPerBatchMatrixFromInd"
                           "ices != len(values): ",
                           total_nnz, " vs. ", values_t.NumElements()),
          done);

      Tensor coo_col_ind_t;
      Tensor csr_row_ptr_t;
      Tensor csr_values_t = values_t;

      Tensor coo_row_ind_t;
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_row_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_col_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({batch_size * (rows + 1)}),
                           &csr_row_ptr_t),
          done);

      auto coo_row_ind = coo_row_ind_t.vec<int32>();
      auto coo_col_ind = coo_col_ind_t.vec<int32>();
      auto csr_row_ptr = csr_row_ptr_t.vec<int32>();

      // Convert SparseTensor rep to coo row ind, coo col ind.
      if (total_nnz > 0) {
        functor::SparseTensorToCOOSparseMatrix<Device> st_to_coo;
        st_to_coo(d, dense_shape, indices, coo_row_ind, coo_col_ind);
      }

      // Set all csr row pointers to zero, so that when iterating over
      // batches converting coo to csr, we do not have to perform an
      // unaligned SetZero for any nnz == 0 minibatches.  coo2csr has
      // a bug if you have empty coo rows.
      // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle
      // zero-element input coo rows.
      functor::SetZeroFunctor<Device, int32> set_zero;
      set_zero(d, csr_row_ptr_t.flat<int32>());

      functor::COOSparseMatrixToCSRSparseMatrix<Device> coo_to_csr;
      for (int i = 0; i < batch_size; ++i) {
        int nnz_i = batch_ptr(i + 1) - batch_ptr(i);
        if (nnz_i == 0) {
          // This is an empty minibatch; no call to coo2csr: it's
          // handled by the SetZero above.
        } else {
          // Convert coo to csr.
          auto coo_row_ind_i =
              TTypes<int32>::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i);
          auto csr_row_ptr_i = TTypes<int32>::UnalignedVec(
              &csr_row_ptr((rows + 1) * i), rows + 1);
          OP_REQUIRES_OK_ASYNC(
              c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i), done);
        }
      }

      CSRSparseMatrix matrix;
      OP_REQUIRES_OK_ASYNC(
          c,
          CSRSparseMatrix::CreateCSRSparseMatrix(
              values_t.dtype(), dense_shape_t, batch_ptr_t, csr_row_ptr_t,
              coo_col_ind_t, csr_values_t, &matrix),
          done);
      Tensor* matrix_t;
      AllocatorAttributes cpu_alloc;
      cpu_alloc.set_on_host(true);
      OP_REQUIRES_OK_ASYNC(
          c, c->allocate_output(0, TensorShape({}), &matrix_t, cpu_alloc),
          done);
      matrix_t->scalar<Variant>()() = std::move(matrix);

      done();
    };

    if (rank == 2) {
      convert_to_csr();
    } else {
      // The nnz counts were computed and copied on the stream above; run
      // convert_to_csr once that work has completed on the stream.
      c->device()->tensorflow_accelerator_device_info()->event_mgr->ThenExecute(
          stream, convert_to_csr);
    }
  }
};

#define REGISTER_GPU(DEV, T)                             \
  REGISTER_KERNEL_BUILDER(Name("DenseToCSRSparseMatrix") \
                              .Device(DEVICE_##DEV)      \
                              .TypeConstraint<T>("T"),   \
                          DenseToCSRSparseMatrixGPUOp<DEV##Device, T>);

REGISTER_GPU(GPU, float)
REGISTER_GPU(GPU, double)
REGISTER_GPU(GPU, complex64)
REGISTER_GPU(GPU, complex128)

namespace functor {

template <>
Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
    OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
    TTypes<int32>::Vec nnz_per_batch);
extern template struct CalculateNNZPerBatchMatrixFromIndices<GPUDevice>;

template <>
struct SparseTensorToCOOSparseMatrix<GPUDevice> {
  void operator()(const GPUDevice& d,
                  TTypes<int64_t>::ConstVec host_dense_shape,
                  TTypes<int64_t>::ConstMatrix indices,
                  TTypes<int>::Vec coo_row_ind, TTypes<int>::Vec coo_col_ind);
};
extern template struct SparseTensorToCOOSparseMatrix<GPUDevice>;

template <>
struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
  Status operator()(OpKernelContext* c, const int rows, const int cols,
                    TTypes<int>::UnalignedVec coo_row_ind,
                    TTypes<int>::UnalignedVec csr_row_ptr) {
    GpuSparse cuda_sparse(c);
    TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
    return cuda_sparse.Coo2csr(coo_row_ind.data(),
                               /*nnz*/ coo_row_ind.size(),
                               /*m == rows of A*/ rows, csr_row_ptr.data());
  }
};
extern template struct COOSparseMatrixToCSRSparseMatrix<GPUDevice>;

}  // namespace functor

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_GPU

}  // namespace tensorflow