#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>

#include <ATen/native/cuda/ScanKernels.h>
#include <ATen/native/ReduceOps.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cummax_helper_native.h>
#include <ATen/ops/_cummin_helper_native.h>
#include <ATen/ops/_logcumsumexp_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#endif

namespace at::native {

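// Returns the given output tensor (borrowed) when it is already contiguous;
// otherwise allocates an owned contiguous tensor with the same sizes and
// options for the kernel to write into. Callers copy the temporary back into
// the original output when the two differ.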
static c10::MaybeOwned<Tensor> contiguous_out_arg(const Tensor& tensor) {
  if (tensor.is_contiguous()) {
    return c10::MaybeOwned<Tensor>::borrowed(tensor);
  }
  return c10::MaybeOwned<Tensor>::owned(at::empty(tensor.sizes(), tensor.options()));
}

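// Backs the _cummax_helper op on CUDA: computes the cumulative maximum of
// `self` along `dim`, writing the running maxima into `values` and the
// position of each maximum into `indices`. Non-contiguous outputs are staged
// through contiguous temporaries and copied back afterwards.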
void cummax_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  TensorArg output_arg{ values, "output", 1 };
  TensorArg indices_arg{ indices, "indices", 2 };
  TensorArg input_arg{ self, "input", 3 };
  checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});

  auto values_ = contiguous_out_arg(values);
  auto indices_ = contiguous_out_arg(indices);
  launch_cummax_cuda_kernel(self, *values_, *indices_, dim);
  if (!values.is_same(*values_)) {
    values.copy_(*values_);
  }
  if (!indices.is_same(*indices_)) {
    indices.copy_(*indices_);
  }
}

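// Backs the _cummin_helper op on CUDA; mirrors cummax_helper_cuda above but
// tracks the running minimum along `dim` instead of the maximum.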
void cummin_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  TensorArg output_arg{ values, "output", 1 };
  TensorArg indices_arg{ indices, "indices", 2 };
  TensorArg input_arg{ self, "input", 3 };
  checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});

  auto values_ = contiguous_out_arg(values);
  auto indices_ = contiguous_out_arg(indices);
  launch_cummin_cuda_kernel(self, *values_, *indices_, dim);
  if (!values.is_same(*values_)) {
    values.copy_(*values_);
  }
  if (!indices.is_same(*indices_)) {
    indices.copy_(*indices_);
  }
}

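// out= variant of logcumsumexp on CUDA: accumulates log(sum(exp(self))) along
// `dim`. Scalar (0-dim) inputs reduce to a plain copy and empty inputs to a
// zero-fill, both returning early without launching the kernel.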
Tensor& _logcumsumexp_out_cuda(const Tensor& self, int64_t dim, Tensor& result) {
  const auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  result.resize_(self.sizes());
  if (self.dim() == 0) {
    result.fill_(self);
    return result;
  }
  if (self.numel() == 0) {
    result.zero_();
    return result;
  }

  TensorArg output_arg{ result, "output", 1 };
  TensorArg input_arg{ self, "input", 2 };
  checkAllSameGPU(__func__, {output_arg, input_arg});

  auto result_ = contiguous_out_arg(result);
  launch_logcumsumexp_cuda_kernel(*result_, self, wrap_dim);
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
  return result;
}

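// Functional variant: allocates a contiguous result and forwards to the out=
// overload above.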
Tensor _logcumsumexp_cuda(const Tensor& self, int64_t dim) {
  Tensor result = at::empty_like(self, MemoryFormat::Contiguous);
  return _logcumsumexp_out_cuda(self, dim, result);
}

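// CUDA entry point for the cumsum dispatch stub. Floating-point and complex
// inputs flag the operation as nondeterministic via alertNotDeterministic
// (see the note referenced below) before the scan is launched.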
void cumsum_cuda_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
  if (self.is_floating_point() || self.is_complex()) {
    // See Note [Writing Nondeterministic Operations]
    // Issue reporting nondeterministic behavior: https://github.com/pytorch/pytorch/issues/75240
    globalContext().alertNotDeterministic("cumsum_cuda_kernel");
  }
  auto result_ = contiguous_out_arg(result);
  launch_cumsum_cuda_kernel(*result_, self, dim);
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
}

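// CUDA entry point for the cumprod dispatch stub: stages a contiguous output
// if needed, launches the scan, and copies the result back.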
void cumprod_cuda_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
  auto result_ = contiguous_out_arg(result);
  launch_cumprod_cuda_kernel(*result_, self, dim);
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
}

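// Hook the CUDA scan kernels into the generic cumsum/cumprod dispatch stubs.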
REGISTER_CUDA_DISPATCH(cumsum_stub, &cumsum_cuda_kernel);
REGISTER_CUDA_DISPATCH(cumprod_stub, &cumprod_cuda_kernel);

} // namespace at::native