/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/kernels/dense_update_functor.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/kernels/gather_nd_op.h"
#include "tensorflow/core/kernels/sparse/kernels.h"
#include "tensorflow/core/kernels/sparse/sparse_matrix.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/util/cuda_sparse.h"
#include "tensorflow/core/util/gpu_solvers.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/stream_executor/rocm/rocm_activation.h"
using ::perftools::gputools::rocm::ScopedActivateExecutorContext;
#endif

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Op to convert dense matrices to CSR SparseMatrices on the CPU.
// Takes a Tensor of rank 2 or (if batched) 3 and a corresponding list of
// indices as input.
//
// The (batched) CSR SparseMatrix is constructed using only the values at the
// given indices. This implementation assumes that the indices are sorted with
// respect to batch indices and are in row-major order.
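//
// As an illustrative (hypothetical) example, a 3x4 dense matrix with nonzero
// entries at indices {{0, 1}, {0, 3}, {2, 0}} is represented as
//   csr_row_ptr = [0, 2, 2, 3]  (row i owns entries [row_ptr[i], row_ptr[i+1]))
//   csr_col_ind = [1, 3, 0]
//   values      = the dense entries gathered at those indices.
// For rank-3 (batched) input, one row-pointer segment of length num_rows + 1
// is stored per batch, and batch_ptr records the offset of each batch's
// nonzeros within the concatenated col_ind/values arrays.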
template <typename Device, typename T>
class DenseToCSRSparseMatrixCPUOp : public OpKernel {
 public:
  explicit DenseToCSRSparseMatrixCPUOp(OpKernelConstruction* c) : OpKernel(c) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& params = ctx->input(0);
    const Tensor& indices = ctx->input(1);

    // TODO(anudhyan): Factor out common input validation for CPU and GPU ops
    // into a single function.
    const TensorShape& dense_tensor_shape = params.shape();
    const int rank = params.dims();
    OP_REQUIRES(ctx, rank == 2 || rank == 3,
                errors::InvalidArgument(
                    "params must have rank == 2 or 3; ",
                    "but saw shape: ", dense_tensor_shape.DebugString()));
    OP_REQUIRES(
        ctx, indices.dims() == 2,
        errors::InvalidArgument("indices must be a matrix, but saw shape: ",
                                indices.shape().DebugString()));
    OP_REQUIRES(
        ctx, indices.dim_size(1) == rank,
        errors::InvalidArgument(
            "indices.shape[1] must be equal to the rank of params, but saw: ",
            indices.dim_size(1), " vs. ", rank));

    Tensor dense_shape(cpu_allocator(), DT_INT64, TensorShape({rank}));
    auto dense_shape_mutable = dense_shape.vec<int64_t>();
    for (int i = 0; i < rank; ++i) {
      dense_shape_mutable(i) = dense_tensor_shape.dim_size(i);
    }

    const int64_t batch_size = (rank == 2) ? 1 : dense_tensor_shape.dim_size(0);
    const int64_t num_rows = dense_tensor_shape.dim_size((rank == 2) ? 0 : 1);
    const int64_t total_nnz = indices.NumElements() / rank;

    Tensor values;
    OP_REQUIRES_OK(ctx, functor::DoGatherNd<Device, T, int64_t>(
                            ctx, params, indices, &values));

    Tensor batch_ptr(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1}));
    Tensor csr_col_ind(cpu_allocator(), DT_INT32, TensorShape({total_nnz}));
    Tensor csr_row_ptr(cpu_allocator(), DT_INT32,
                       TensorShape({(num_rows + 1) * batch_size}));

    // Fill the row pointers with zeros.
    functor::SetZeroFunctor<Device, int32> set_zero;
    set_zero(ctx->eigen_device<Device>(), csr_row_ptr.flat<int32>());

    // Convert from COO to CSR format.
    functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
    OP_REQUIRES_OK(ctx,
                   coo_to_csr(batch_size, num_rows, indices.matrix<int64_t>(),
                              batch_ptr.vec<int32>(), csr_row_ptr.vec<int32>(),
                              csr_col_ind.vec<int32>()));

    CSRSparseMatrix output_csr_matrix;
    OP_REQUIRES_OK(ctx, CSRSparseMatrix::CreateCSRSparseMatrix(
                            values.dtype(), dense_shape, batch_ptr, csr_row_ptr,
                            csr_col_ind, values, &output_csr_matrix));
    Tensor* output_csr_matrix_tensor;
    AllocatorAttributes cpu_alloc;
    cpu_alloc.set_on_host(true);
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output(0, TensorShape({}),
                                  &output_csr_matrix_tensor, cpu_alloc));
    output_csr_matrix_tensor->scalar<Variant>()() =
        std::move(output_csr_matrix);
  }
};

#define REGISTER_CPU(T)                                  \
  REGISTER_KERNEL_BUILDER(Name("DenseToCSRSparseMatrix") \
                              .Device(DEVICE_CPU)        \
                              .TypeConstraint<T>("T"),   \
                          DenseToCSRSparseMatrixCPUOp<CPUDevice, T>);

REGISTER_CPU(float)
REGISTER_CPU(double)
REGISTER_CPU(complex64)
REGISTER_CPU(complex128)

#undef REGISTER_CPU

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

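// Op to convert dense matrices to CSR SparseMatrices on the GPU.
//
// The conversion runs in two stages. First, for rank-3 (batched) input, a
// device kernel counts the number of nonzeros in each batch and the counts
// are copied back to the host. Second, once that copy has completed, a
// callback enqueued on the stream's event manager gathers the values, builds
// batch_ptr as a running sum of the per-batch counts, expands the COO row
// indices, and converts each batch's rows to CSR row pointers via coo2csr.
// Rank-2 input skips the device-side count and runs the conversion
// immediately.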
template <typename Device, typename T>
class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
 public:
  explicit DenseToCSRSparseMatrixGPUOp(OpKernelConstruction* c)
      : AsyncOpKernel(c) {}

  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
    auto stream = c->op_device_context()->stream();
    const Device& d = c->eigen_device<Device>();

    const Tensor& params_t = c->input(0);
    const Tensor& indices_t = c->input(1);
    const TensorShape& dense_tensor_shape = params_t.shape();
    const int rank = params_t.dims();
    OP_REQUIRES_ASYNC(c, rank == 2 || rank == 3,
                      errors::InvalidArgument(
                          "params must have rank == 2 or 3; ",
                          "but saw shape: ", dense_tensor_shape.DebugString()),
                      done);
    OP_REQUIRES_ASYNC(
        c, indices_t.dims() == 2,
        errors::InvalidArgument("indices must be a matrix, but saw shape: ",
                                indices_t.shape().DebugString()),
        done);
    OP_REQUIRES_ASYNC(
        c, indices_t.dim_size(1) == rank,
        errors::InvalidArgument(
            "indices.shape[1] must be equal to the rank of params, but saw: ",
            indices_t.dim_size(1), " vs. ", rank),
        done);
    const int64_t batch_size = (rank == 2) ? 1 : dense_tensor_shape.dim_size(0);
    const int64_t rows = dense_tensor_shape.dim_size((rank == 2) ? 0 : 1);
    const int64_t cols = dense_tensor_shape.dim_size((rank == 2) ? 1 : 2);

    ScratchSpace<int32> nnz_per_batch_host(c, batch_size, /*on_host*/ true);

    Tensor nnz_per_batch_device_t;
    if (rank == 2) {
      // Simple case.
      nnz_per_batch_host.mutable_data()[0] = indices_t.dim_size(0);
    } else {
      OP_REQUIRES_OK_ASYNC(c,
                           c->allocate_temp(DT_INT32, TensorShape({batch_size}),
                                            &nnz_per_batch_device_t),
                           done);
      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32>();

      functor::CalculateNNZPerBatchMatrixFromIndices<Device>
          calculate_nnz_from_indices;
      auto indices = indices_t.matrix<int64_t>();
      OP_REQUIRES_OK_ASYNC(
          c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device),
          done);

      perftools::gputools::DeviceMemoryBase nnz_per_batch_device_ptr(
          static_cast<void*>(nnz_per_batch_device.data()));

      OP_REQUIRES_ASYNC(
          c,
          stream
              ->ThenMemcpy(nnz_per_batch_host.mutable_data() /*host_dst*/,
                           nnz_per_batch_device_ptr /*gpu_src*/,
                           batch_size * sizeof(int32) /*size*/)
              .ok(),
          errors::Internal("DenseToSparseMatrixGPUOp: failed to copy "
                           "nnz_per_batch from device"),
          done);
    }

    // TODO(ebrevdo): write a custom pair of kernels: one that
    // calculates the batched csr_row_ptr vector, another that fills in
    // the col_ind and values vectors.
    TensorReference nnz_per_batch_device_ref(nnz_per_batch_device_t);
    auto convert_to_csr = [this, c, rank, batch_size, nnz_per_batch_host,
                           nnz_per_batch_device_ref, stream, &d, &params_t,
                           &indices_t, dense_tensor_shape, rows, cols, done]() {
      // The data has been copied out of the nnz_per_batch_device
      // tensor by the time we get here; we can unreference it.
      nnz_per_batch_device_ref.Unref();

      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32>();

      // Ensure that within the callback, the proper GPU settings are
      // configured.
      ScopedActivateExecutorContext scoped_activation{stream->parent()};

      // Extract out the values.
      Tensor temp_values_t;
      OP_REQUIRES_OK_ASYNC(c,
                           (functor::DoGatherNd<Device, T, int64>(
                               c, params_t, indices_t, &temp_values_t)),
                           done);
      const Tensor& values_t = const_cast<const Tensor&>(temp_values_t);

      OP_REQUIRES_ASYNC(
          c, TensorShapeUtils::IsVector(values_t.shape()),
          errors::Internal("Expected values_t to be a vector, but saw shape: ",
                           values_t.shape().DebugString()),
          done);

      Tensor dense_shape_t(cpu_allocator(), DT_INT64, TensorShape({rank}));
      auto dense_shape_mutable = dense_shape_t.vec<int64_t>();
      for (int i = 0; i < rank; ++i) {
        dense_shape_mutable(i) = dense_tensor_shape.dim_size(i);
      }
      auto dense_shape =
          const_cast<const Tensor&>(dense_shape_t).vec<int64_t>();

      Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                         TensorShape({batch_size + 1}));
      auto batch_ptr = batch_ptr_t.vec<int32>();
      auto indices = indices_t.matrix<int64_t>();

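      // batch_ptr is the running sum of the per-batch nonzero counts, so
      // batch i's entries occupy [batch_ptr(i), batch_ptr(i + 1)) of the
      // col_ind/values arrays. For example (hypothetical counts),
      // nnz_per_batch = [3, 0, 2] yields batch_ptr = [0, 3, 3, 5].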
      batch_ptr(0) = 0;
      for (int i = 0; i < batch_size; ++i) {
        batch_ptr(i + 1) = batch_ptr(i) + nnz_per_batch(i);
      }
      int total_nnz = batch_ptr(batch_size);
      OP_REQUIRES_ASYNC(
          c, total_nnz == values_t.NumElements(),
          errors::Internal("nnz returned by "
                           "CalculateNNZPerBatchMatrixFromInd"
                           "ices != len(values): ",
                           total_nnz, " vs. ", values_t.NumElements()),
          done);

      Tensor coo_col_ind_t;
      Tensor csr_row_ptr_t;
      Tensor csr_values_t = values_t;

      Tensor coo_row_ind_t;
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_row_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_col_ind_t),
          done);
      OP_REQUIRES_OK_ASYNC(
          c,
          c->allocate_temp(DT_INT32, TensorShape({batch_size * (rows + 1)}),
                           &csr_row_ptr_t),
          done);

      auto coo_row_ind = coo_row_ind_t.vec<int32>();
      auto coo_col_ind = coo_col_ind_t.vec<int32>();
      auto csr_row_ptr = csr_row_ptr_t.vec<int32>();

      // Convert SparseTensor rep to coo row ind, coo col ind.
      if (total_nnz > 0) {
        functor::SparseTensorToCOOSparseMatrix<Device> st_to_coo;
        st_to_coo(d, dense_shape, indices, coo_row_ind, coo_col_ind);
      }

      // Set all csr row pointers to zero, so that when iterating over
      // batches converting coo to csr, we do not have to perform an
      // unaligned SetZero for any nnz == 0 minibatches. coo2csr has
      // a bug if you have empty coo rows.
      // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle
      // zero-element input coo rows.
      functor::SetZeroFunctor<Device, int32> set_zero;
      set_zero(d, csr_row_ptr_t.flat<int32>());

      functor::COOSparseMatrixToCSRSparseMatrix<Device> coo_to_csr;
      for (int i = 0; i < batch_size; ++i) {
        int nnz_i = batch_ptr(i + 1) - batch_ptr(i);
        if (nnz_i == 0) {
          // This is an empty minibatch; no call to coo2csr: it's
          // handled by the SetZero above.
        } else {
          // Convert coo to csr.
          auto coo_row_ind_i =
              TTypes<int32>::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i);
          auto csr_row_ptr_i = TTypes<int32>::UnalignedVec(
              &csr_row_ptr((rows + 1) * i), rows + 1);
          OP_REQUIRES_OK_ASYNC(
              c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i), done);
        }
      }

      CSRSparseMatrix matrix;
      OP_REQUIRES_OK_ASYNC(
          c,
          CSRSparseMatrix::CreateCSRSparseMatrix(
              values_t.dtype(), dense_shape_t, batch_ptr_t, csr_row_ptr_t,
              coo_col_ind_t, csr_values_t, &matrix),
          done);
      Tensor* matrix_t;
      AllocatorAttributes cpu_alloc;
      cpu_alloc.set_on_host(true);
      OP_REQUIRES_OK_ASYNC(
          c, c->allocate_output(0, TensorShape({}), &matrix_t, cpu_alloc),
          done);
      matrix_t->scalar<Variant>()() = std::move(matrix);

      done();
    };

    if (rank == 2) {
      convert_to_csr();
    } else {
      // Launch the GPU kernel to count nnz entries, then call convert_to_csr.
      c->device()->tensorflow_accelerator_device_info()->event_mgr->ThenExecute(
          stream, convert_to_csr);
    }
  }
};

#define REGISTER_GPU(DEV, T)                             \
  REGISTER_KERNEL_BUILDER(Name("DenseToCSRSparseMatrix") \
                              .Device(DEVICE_##DEV)      \
                              .TypeConstraint<T>("T"),   \
                          DenseToCSRSparseMatrixGPUOp<DEV##Device, T>);

REGISTER_GPU(GPU, float)
REGISTER_GPU(GPU, double)
REGISTER_GPU(GPU, complex64)
REGISTER_GPU(GPU, complex128)

namespace functor {

template <>
Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
    OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
    TTypes<int32>::Vec nnz_per_batch);
extern template struct CalculateNNZPerBatchMatrixFromIndices<GPUDevice>;

template <>
struct SparseTensorToCOOSparseMatrix<GPUDevice> {
  void operator()(const GPUDevice& d,
                  TTypes<int64_t>::ConstVec host_dense_shape,
                  TTypes<int64_t>::ConstMatrix indices,
                  TTypes<int>::Vec coo_row_ind, TTypes<int>::Vec coo_col_ind);
};
extern template struct SparseTensorToCOOSparseMatrix<GPUDevice>;

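// Converts one batch's sorted COO row indices into a CSR row-pointer array of
// length rows + 1 using the cuSPARSE/hipSPARSE coo2csr routine (wrapped by
// GpuSparse::Coo2csr). For example (hypothetical input), with rows == 3, the
// row indices [0, 0, 2] become the row pointers [0, 2, 2, 3].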
template <>
struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
  Status operator()(OpKernelContext* c, const int rows, const int cols,
                    TTypes<int>::UnalignedVec coo_row_ind,
                    TTypes<int>::UnalignedVec csr_row_ptr) {
    GpuSparse cuda_sparse(c);
    TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
    return cuda_sparse.Coo2csr(coo_row_ind.data(),
                               /*nnz*/ coo_row_ind.size(),
                               /*m == rows of A*/ rows, csr_row_ptr.data());
  }
};
extern template struct COOSparseMatrixToCSRSparseMatrix<GPUDevice>;

}  // namespace functor

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_GPU

}  // namespace tensorflow