// aten/src/ATen/native/cuda/ScanKernels.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>

#include <ATen/native/cuda/ScanKernels.h>
#include <ATen/native/ReduceOps.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cummax_helper_native.h>
#include <ATen/ops/_cummin_helper_native.h>
#include <ATen/ops/_logcumsumexp_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#endif

namespace at::native {

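// Returns the given output tensor itself when it is already contiguous;
// otherwise returns a freshly allocated contiguous tensor of the same shape
// and options, so the kernels below always write to contiguous memory.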
static c10::MaybeOwned<Tensor> contiguous_out_arg(const Tensor& tensor) {
  if (tensor.is_contiguous()) {
    return c10::MaybeOwned<Tensor>::borrowed(tensor);
  }
  return c10::MaybeOwned<Tensor>::owned(at::empty(tensor.sizes(), tensor.options()));
}

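// Computes the cumulative maximum of `self` along `dim`, writing the running
// maxima into `values` and their positions into `indices`. The kernel writes
// into contiguous buffers; results are copied back if the provided outputs
// were not contiguous.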
void cummax_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  TensorArg output_arg{ values, "output", 1 };
  TensorArg indices_arg{ indices, "indices", 2 };
  TensorArg input_arg{ self, "input", 3 };
  checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});

  auto values_ = contiguous_out_arg(values);
  auto indices_ = contiguous_out_arg(indices);
  launch_cummax_cuda_kernel(self, *values_, *indices_, dim);
  if (!values.is_same(*values_)) {
    values.copy_(*values_);
  }
  if (!indices.is_same(*indices_)) {
    indices.copy_(*indices_);
  }
}

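// Same as cummax_helper_cuda, but computes the cumulative minimum along `dim`.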
void cummin_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  TensorArg output_arg{ values, "output", 1 };
  TensorArg indices_arg{ indices, "indices", 2 };
  TensorArg input_arg{ self, "input", 3 };
  checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});

  auto values_ = contiguous_out_arg(values);
  auto indices_ = contiguous_out_arg(indices);
  launch_cummin_cuda_kernel(self, *values_, *indices_, dim);
  if (!values.is_same(*values_)) {
    values.copy_(*values_);
  }
  if (!indices.is_same(*indices_)) {
    indices.copy_(*indices_);
  }
}

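// Out variant of logcumsumexp: computes log(cumsum(exp(self))) along `dim`.
// The scalar (0-dim) and empty-tensor edge cases are handled here directly
// before launching the CUDA kernel.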
Tensor& _logcumsumexp_out_cuda(const Tensor& self, int64_t dim, Tensor& result) {
  const auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  result.resize_(self.sizes());
  if (self.dim() == 0) {
    result.fill_(self);
    return result;
  }
  if (self.numel() == 0) {
    result.zero_();
    return result;
  }

  TensorArg output_arg{ result, "output", 1 };
  TensorArg input_arg{ self, "input", 2 };
  checkAllSameGPU(__func__, {output_arg, input_arg});

  auto result_ = contiguous_out_arg(result);
  launch_logcumsumexp_cuda_kernel(*result_, self, wrap_dim);
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
  return result;
}

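// Functional variant: allocates a contiguous result and forwards to the out
// variant above.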
Tensor _logcumsumexp_cuda(const Tensor& self, int64_t dim) {
  Tensor result = at::empty_like(self, MemoryFormat::Contiguous);
  return _logcumsumexp_out_cuda(self, dim, result);
}

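// Dispatch-stub implementation of cumsum on CUDA: runs the scan into a
// contiguous buffer and copies back if the destination was not contiguous.
// Floating-point and complex inputs are flagged as nondeterministic (see the
// note and issue referenced below).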
void cumsum_cuda_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
  if (self.is_floating_point() || self.is_complex()) {
    // See Note [Writing Nondeterministic Operations]
    // Issue reporting nondeterministic behavior: https://github.com/pytorch/pytorch/issues/75240
    globalContext().alertNotDeterministic("cumsum_cuda_kernel");
  }
  auto result_ = contiguous_out_arg(result);
  launch_cumsum_cuda_kernel(*result_, self, dim);
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
}

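// Dispatch-stub implementation of cumprod on CUDA; same contiguous-buffer
// handling as cumsum above.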
void cumprod_cuda_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
  auto result_ = contiguous_out_arg(result);
  launch_cumprod_cuda_kernel(*result_, self, dim);
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
}

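// Register the CUDA implementations with the device-generic dispatch stubs,
// e.g. at::cumsum(t, /*dim=*/0) on a CUDA tensor routes to cumsum_cuda_kernel
// through cumsum_stub.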
REGISTER_CUDA_DISPATCH(cumsum_stub, &cumsum_cuda_kernel);
REGISTER_CUDA_DISPATCH(cumprod_stub, &cumprod_cuda_kernel);

} // namespace at::native