#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef AT_PER_OPERATOR_HEADERS #include #include #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #include namespace at::meta { static ScalarType infer_dtype_from_optional( const Tensor& self, const std::optional& opt_dtype, const Tensor& result) { // 'opt_dtype' has the priority for both cases. if (result.defined()) { // Otherwise, get the result type, if defined. return opt_dtype.value_or(result.scalar_type()); } else { // Last case is to get the self type. // If the self type is an integer, we promote it to kLong. return at::native::get_dtype_from_self(self, opt_dtype, true); } } static IntArrayRef optional_to_arrayref(const std::optional& opt) { return opt.has_value() ? opt.value() : IntArrayRef{}; } static ScalarType get_result_or_bytebool_dtype(const Tensor& self, const Tensor& result) { // Refer [all, any : uint8 compatibility] if (result.defined()) { return result.scalar_type(); } else { return (self.scalar_type() == kByte) ? kByte : kBool; } } static void check_result_is_bytebool(const char* name, const Tensor& self, const Tensor& result) { if (result.defined()) { // Refer [all, any : uint8 compatibility] TORCH_CHECK( result.scalar_type() == ScalarType::Bool || result.scalar_type() == ScalarType::Byte, name, " only supports bool tensor for result, got: ", result.scalar_type()); } } // Note [all, any : uint8 compatibility]: // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // For NumPy compatibility, `all` and `any` return // Tensor of dtype `bool`. However for compatibility reason, // for `uint8`, they return Tensor of same dtype `uint8`. 
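// For example, `any()` on a uint8 tensor produces a uint8 result (0 or 1),
// while on bool or floating point inputs it produces a bool result.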
// Reference: https://github.com/pytorch/pytorch/pull/47878#issuecomment-747108561
static void allany_meta(
    impl::MetaBase& meta,
    const char* name,
    const Tensor& self,
    OptionalIntArrayRef dims,
    bool keepdim) {
  const auto& result = meta.maybe_get_output();
  check_result_is_bytebool(name, self, result);
  auto out_dtype = get_result_or_bytebool_dtype(self, result);
  resize_reduction(meta, self, dims, keepdim, out_dtype, /*allow_empty_dims=*/true);
}

TORCH_META_FUNC2(all, dim)(const Tensor& self, int64_t dim, bool keepdim) {
  allany_meta(*this, "all", self, dim, keepdim);
}

TORCH_META_FUNC2(all, dims)(const Tensor& self, OptionalIntArrayRef dim, bool keepdim) {
  allany_meta(*this, "all", self, dim, keepdim);
}

TORCH_META_FUNC(all)(const Tensor& self) {
  allany_meta(*this, "all", self, {}, false);
}

TORCH_META_FUNC2(any, dim)(const Tensor& self, int64_t dim, bool keepdim) {
  allany_meta(*this, "any", self, dim, keepdim);
}

TORCH_META_FUNC2(any, dims)(const Tensor& self, OptionalIntArrayRef dim, bool keepdim) {
  allany_meta(*this, "any", self, dim, keepdim);
}

TORCH_META_FUNC(any)(const Tensor& self) {
  allany_meta(*this, "any", self, {}, false);
}

static void check_argmax_argmin(
    const char* name,
    const Tensor& self,
    const std::optional<int64_t>& dim) {
  if (dim.has_value()) {
    auto dim_ = maybe_wrap_dim(dim.value(), self.dim());
    native::zero_numel_check_dims(self, dim_, name);
  } else {
    TORCH_CHECK_INDEX(
        self.numel() != 0,
        name, ": Expected reduction dim to be specified for input.numel() == 0.");
  }
}

TORCH_META_FUNC(argmax)
(const Tensor& self, std::optional<int64_t> dim, bool keepdim) {
  check_argmax_argmin("argmax()", self, dim);
  resize_reduction(*this, self, optional_to_arrayref(dim), keepdim, kLong);
}

TORCH_META_FUNC(argmin)
(const Tensor& self, std::optional<int64_t> dim, bool keepdim) {
  check_argmax_argmin("argmin()", self, dim);
  resize_reduction(*this, self, optional_to_arrayref(dim), keepdim, kLong);
}

static void meta_func_cum_ops(
    impl::MetaBase& meta,
    const char* name,
    const Tensor& self,
    int64_t dim,
    std::optional<ScalarType> dtype) {
  // Checking whether 'dim' is valid.
  maybe_wrap_dim(dim, self.dim());

  const auto& result = meta.maybe_get_output();
  ScalarType out_dtype{};

  if (result.defined()) {
    out_dtype = dtype.value_or(result.scalar_type());
  } else {
    auto is_integral = at::isIntegralType(self.scalar_type(), /*includeBool=*/true);
    out_dtype = dtype.value_or(is_integral ?
ScalarType::Long : self.scalar_type()); } meta.set_output_raw_strided(0, self.sizes(), {}, self.options().dtype(out_dtype)); namedinference::propagate_names(result, self); } TORCH_META_FUNC(cumsum) (const Tensor& self, int64_t dim, std::optional dtype) { meta_func_cum_ops(*this, "cumsum", self, dim, dtype); } TORCH_META_FUNC(cumprod) (const Tensor& self, int64_t dim, std::optional dtype) { meta_func_cum_ops(*this, "cumprod", self, dim, dtype); } TORCH_META_FUNC2(sum, dim_IntList) (const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, std::optional opt_dtype) { auto out_dtype = infer_dtype_from_optional(self, opt_dtype, maybe_get_output()); resize_reduction(*this, self, opt_dim, keepdim, out_dtype); } TORCH_META_FUNC2(prod, dim_int) (const Tensor& self, int64_t dim, bool keepdim, std::optional dtype) { auto out_dtype = infer_dtype_from_optional(self, dtype, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); } TORCH_META_FUNC2(mean, dim) (const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, std::optional opt_dtype) { auto in_dtype = at::native::get_dtype_from_self(self, opt_dtype, true); if (!at::isFloatingType(in_dtype) && !at::isComplexType(in_dtype)) { std::string what = "Input"; std::string dtype = toString(self.scalar_type()); if (opt_dtype.has_value()) { what = "Optional"; dtype = toString(opt_dtype.value()); } TORCH_CHECK( false, "mean(): could not infer output dtype. ", what, " dtype must be either a floating point or complex dtype. ", "Got: ", dtype); } auto out_dtype = infer_dtype_from_optional(self, opt_dtype, maybe_get_output()); resize_reduction(*this, self, opt_dim, keepdim, out_dtype); } static ScalarType get_result_or_self_value_dtype( const Tensor& self, const Tensor& result, const std::optional& dtype) { if (result.defined()) { return result.scalar_type(); } else { return dtype.value_or(toRealValueType(self.scalar_type())); } } TORCH_META_FUNC2(norm, ScalarOpt_dim) (const Tensor& self, const OptionalScalarRef p, IntArrayRef dim, bool keepdim) { TORCH_CHECK( at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), "norm(): input dtype should be either floating point or complex. " "Got ", self.scalar_type(), " instead."); auto out_dtype = get_result_or_self_value_dtype(self, maybe_get_output(), std::nullopt); resize_reduction(*this, self, dim, keepdim, out_dtype); } TORCH_META_FUNC2(norm, ScalarOpt_dim_dtype) (const Tensor& self, const OptionalScalarRef p, IntArrayRef dim, bool keepdim, ScalarType dtype) { TORCH_CHECK( at::isFloatingType(dtype) || at::isComplexType(dtype), "norm(): the desired output dtype should be either floating point or complex. 
" "Got ", dtype, " instead."); auto out_dtype = get_result_or_self_value_dtype(self, maybe_get_output(), dtype); resize_reduction(*this, self, dim, keepdim, out_dtype); } TORCH_META_FUNC(aminmax) (const Tensor& self, std::optional dim_opt, bool keepdim) { DimVector shape; if (dim_opt.has_value()) { auto dim = maybe_wrap_dim(dim_opt.value(), self.ndimension()); native::zero_numel_check_dims(self, dim, "aminmax"); shape = get_reduction_shape(self, dim, keepdim); } else { TORCH_CHECK( self.numel() > 0, "aminmax(): cannot compute aminmax over an empty dimension as the " "operation has no identity."); if (keepdim) { shape = DimVector(self.ndimension(), 1); } } const auto options = self.options(); this->set_output_raw_strided(0, shape, {}, options); this->set_output_raw_strided(1, shape, {}, options); } TORCH_META_FUNC(amax) (const Tensor& self, IntArrayRef dim, bool keepdim) { auto maybe_result = maybe_get_output(); if (maybe_result.defined()) { TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); } if (self.numel() == 0) { at::native::zero_numel_check_dims(self, dim, "amax()"); } const ScalarType& out_dtype = maybe_result.defined() ? maybe_result.scalar_type() : self.scalar_type(); resize_reduction(*this, self, dim, keepdim, out_dtype); } TORCH_META_FUNC(amin) (const Tensor& self, IntArrayRef dim, bool keepdim) { auto maybe_result = maybe_get_output(); if (maybe_result.defined()) { TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); } if (self.numel() == 0) { at::native::zero_numel_check_dims(self, dim, "amin()"); } const ScalarType& out_dtype = maybe_result.defined() ? 
maybe_result.scalar_type() : self.scalar_type(); resize_reduction(*this, self, dim, keepdim, out_dtype); } } // namespace at::meta namespace at::native { DEFINE_DISPATCH(aminmax_stub); DEFINE_DISPATCH(aminmax_allreduce_stub); TORCH_IMPL_FUNC(aminmax_out) (const Tensor& self, std::optional dim_opt, bool keepdim, const Tensor& min, const Tensor& max) { auto mutable_min = const_cast(min); auto mutable_max = const_cast(max); if (dim_opt.has_value()) { aminmax_stub( self.device().type(), self, maybe_wrap_dim(dim_opt.value(), self.ndimension()), keepdim, mutable_min, mutable_max); } else { aminmax_allreduce_stub(self.device().type(), self.contiguous(), mutable_min, mutable_max); } } DEFINE_DISPATCH(sum_stub); DEFINE_DISPATCH(nansum_stub); DEFINE_DISPATCH(std_var_stub); DEFINE_DISPATCH(prod_stub); DEFINE_DISPATCH(norm_stub); DEFINE_DISPATCH(mean_stub); DEFINE_DISPATCH(and_stub); DEFINE_DISPATCH(or_stub); DEFINE_DISPATCH(min_values_stub); DEFINE_DISPATCH(max_values_stub); DEFINE_DISPATCH(argmax_stub); DEFINE_DISPATCH(argmin_stub); DEFINE_DISPATCH(cumsum_stub); DEFINE_DISPATCH(cumprod_stub); DEFINE_DISPATCH(logcumsumexp_stub); Tensor _logcumsumexp_cpu(const Tensor& self, int64_t dim) { Tensor result = at::empty_like(self, MemoryFormat::Contiguous); return _logcumsumexp_out_cpu(self, dim, result); } Tensor& _logcumsumexp_out_cpu(const Tensor& self, int64_t dim, Tensor& result) { logcumsumexp_stub(self.device().type(), result, self, dim); return result; } Tensor logcumsumexp(const Tensor& self, int64_t dim) { auto result = [&]() { NoNamesGuard guard; return at::_logcumsumexp(self, dim); }(); namedinference::propagate_names(result, self); return result; } Tensor& logcumsumexp_out(const Tensor& self, int64_t dim, Tensor& result) { check_scalar_type_device_layout_equal(result, self); { NoNamesGuard guard; at::_logcumsumexp_out(result, self.toType(result.scalar_type()), dim); } namedinference::propagate_names(result, self); return result; } template void impl_func_cum_ops( const Tensor& self, int64_t dim, const Tensor& result, Stub& stub) { NoNamesGuard guard; if (self.dim() == 0) { result.fill_(self); } else if (self.numel() == 0) { result.zero_(); } else { dim = maybe_wrap_dim(dim, self.dim()); stub(self.device().type(), result, self.to(result.scalar_type()), dim); } } TORCH_IMPL_FUNC(cumsum_out) (const Tensor& self, int64_t dim, std::optional dtype, const Tensor& result) { impl_func_cum_ops(self, dim, result, cumsum_stub); } TORCH_IMPL_FUNC(cumprod_out) (const Tensor& self, int64_t dim, std::optional dtype, const Tensor& result) { impl_func_cum_ops(self, dim, result, cumprod_stub); } static Tensor reversed_cumsum(const Tensor& w, int64_t dim) { return w.flip(dim).cumsum(dim).flip(dim); } Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, const Tensor& output) { /* We show here how to derive an O(n) gradient formula for arbitrary inputs. It follows via a basic application of the chain rule together with a number of observations for different cases. We assume that x is an n-dimensional vector and y = cumprod(x). In the actual implementation we will need to play a bit with masks to be able to implement the formulas deduced here for tensors. We will first deduce the formula for the case when x[i] != 0 for 1 <= i <= n. For F : R^n -> R the cost function (we will look at the complex case later), we have dF / dx_k = sum_j (dF / dy_j) * (dy_j / dx_k) (1) The term dF / dy_j is just grad_output[j] (assuming again everything is one-dimensional). 
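As a concrete check, take n = 3 and k = 2: y_1 = x_1 does not depend on x_2, so

dF / dx_2 = grad_output[2] * (dy_2 / dx_2) + grad_output[3] * (dy_3 / dx_2)
          = grad_output[2] * x_1          + grad_output[3] * x_1 * x_3,

which already shows that only the terms with j >= k contribute.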
The term (dy_j / dx_k) is easily seen to be if j >= k dy_j / dx_k = prod_{1 <= i <= j, i != k} x_i else: dy_j / dx_k = 0 Note that the indicator (j>=k) can be taken out by replacing the sum in (1) with a sum from k <= j <= n. Thus, dF / dx_k = sum_{k <= j <= n} grad_output[j] * (dy_j / dx_k) with dy_j / dx_k = prod_{1 <= i <= j, i != k} x_i (2) Note that this last term is just the cumulative product with k omitted. Thus, if x_k (the input) is nonzero, we can just express this as dy_j / dx_k = (prod_{1 <= i <= j} x_i) / x_k = y_j / x_k So therefore, dF / dx_k = sum_{k <= j <= n} grad_output[j] * y_j / x_k This formula just makes sense when input[i] != 0 for every i. Assume now that there exists at least a zero in the input. Denote by z1 the first element 1 <= z1 <= n with input[z1] = 0 and z2 the second element z1 < z2 <= n with input[z2] = 0, (or z2 = n if there is just one zero in input) We have three cases. k > z1: Looking at (2), we see that dy_j / dx_k = 0, for j >= k, as these terms all include a x_{z1} which is zero. As such, dF / dx_k = 0 in this case k < z1: Reasoning as in the previous case, we see that for these elements we have that dF / dx_k = sum_{k <= j < z1} grad_output[j] * (dy_j / dx_k) as the terms of the sum for j in z1 <= j <= n are all zero k = z1: Similar to the case k < z1, we have that dF / dx_z1 = sum_{z1 <= j < z2} grad_output[j] * (dy_j / dx_z1) This case has a subtlety though. To compute (dy_j / dx_z1), we cannot use the formula dy_j / dx_z1 = y_j / x_z1 as, y_j = x_z1 = 0 for j >= z1. We need to compute it with the formula for its derivative, that is: dy_j / dx_z1 = prod(x[:z1]) * (grad_output[z1] + sum(grad_output[z1+1:z2] * cumprod(x[z1+1:z2]))) When the inputs are complex, this is map is holomorphic. As such, to compute its backwards is just the conjugate of the usual backwards. This simplifies to conjugating the input. We may also reuse the output as, since the map is holomorphic, cumprod(input.conj()) = cumprod(input).conj() */ if (input.sym_numel() <= 1) { return grad; } dim = at::maybe_wrap_dim(dim, input.dim()); const int64_t dim_size = input.sym_sizes()[dim].guard_int(__FILE__, __LINE__); if (dim_size == 1) { return grad; } // To enable complex support. // From this line on `input_conj` and output_conj` // are interchangeable with `input` and `output`. auto input_conj = input.conj(); auto output_conj = output.conj(); // For Composite Compliance, we always choose the slower but composite compliant path. bool are_inputs_tensors_sublcass = areAnyTensorSubclassLike({input, grad, output}); const auto w = output_conj * grad; const auto is_zero = input == 0; if (!are_inputs_tensors_sublcass) { if (is_zero.any().item() == 0) { return reversed_cumsum(w, dim).div(input_conj); } } // If we are not computing a second order gradient, we can use an // O(n) implementation. The derivative of this implementation is _not_ // the second derivative of cumprod. As such, we fallback to a less efficient // O(n^2) implementation when at::GradMode::is_enabled(). if (!at::GradMode::is_enabled() && !are_inputs_tensors_sublcass) { // n.b. This could probably be implemented much faster with a kernel // From here on we need to use some mask gymnastics to // account for the tensorial dimensions // We do a cumsum of the zeros along the dimension. 
// For a vector is_zero = [False, True, False, True, False] // we would have cumsum = [0, 1, 1, 2, 2] // As such we have (in python code for simplicity) // The mask for the range [0, z1): // cumsum == 0 // The indices of the first zero z1 and zeros when // there is no first zero: // indices = (cumsum == 1).max(dim, keepdim=True).indices // The mask for the first zero: // zeros_like(indices).scatter_(dim, indices, 1.) & cumsum == 1 // Note that the logic_and with cumsum == 1 accounts // for the case when there is no first zero Tensor grad_input = at::zeros_symint(input.sym_sizes(), grad.options()); const auto cumsum = is_zero.cumsum(dim); // case k < z1 // select everything before the first zero [0, z1) auto mask = cumsum == 0; // equiv to grad_input[mask] = deriv[grad] grad_input.masked_scatter_(mask, reversed_cumsum(w.masked_fill(~mask, 0.), dim).div_(input_conj).masked_select(mask)); // select everything from the first zero to the second zero [z1, z2) mask = cumsum == 1; // case k = z1 // We start by select the first zero [z1] // We locate the indices of the first zero using the max function // We then go from the indices to a mask index_fill_ // When there is no zero in the slice, max will return the index 0. // To account for this, we need to do an intersection with mask, // which is true in the range [z1, z2) const auto first_zero_index = std::get<1>(mask.max(dim, /*keepdim*/ true)); const auto first_zero_mask = at::zeros_like(mask) .scatter_(dim, first_zero_index, /*src*/ 1) .logical_and_(mask); // select everything between the first zero and the second zero (z1, z2) mask &= ~first_zero_mask; // here we compute // dy_j / dx_z1 = sum(cumprod(input[z1+1:z2] * grad[z1+1:z2])) * prod(output[z1-1]) // relu_() necessary as gather does not support negative indices // finally, we do grad_input[z1] = dy_j / dx_z1 grad_input.masked_scatter_(first_zero_mask, input_conj.masked_fill(~mask, 1.).cumprod(dim) .mul_(grad.masked_fill(cumsum != 1, 0.)) .sum(dim, /*keepdim*/true) .mul_(at::gather(output_conj, dim, (first_zero_index - 1).relu_()) .masked_fill_(first_zero_index == 0, 1.)) .masked_select(first_zero_mask)); return grad_input; } else { // GradMode::enabled() /* If the input is nonzero, we need to calculate the dy_j / dx_k by using the formula (2), called in the code omitted_products. The way the code calculates it is simply by noting that prod_{1 <= i <= j, i != k} x_i = (prod_{1 <= i <= k} x_i) * (prod_{k + 1 <= i <= j} x_i) the first term is calculated as prods_until_k, which since doesn't depend in j is easy to vectorize. The second term (indexed by j) is the cumulative product of x_{k+1}, x_{k+2}, ..., x_n, and it's named in the code prods_from_k_pkus_1, and it's calculated as a cumprod. In order to vectorize this properly, we need to add to omitted_products the dimensions where k > j, and therefore dy_j / dx_k = 0, which is done right after the assert. */ Tensor grad_input; // For Composite Compliance, we will use // at::stack on the grad slices, hence the vector. 
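    // In the Tensor-subclass case `grad_input` stays undefined and the
    // per-slice gradients collected below are stacked at the end; otherwise
    // each slice is written directly into the preallocated `grad_input`.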
    std::vector<Tensor> grad_inputs;
    if (are_inputs_tensors_sublcass) {
      grad_inputs.reserve(dim_size);
    } else {
      grad_input = at::zeros(input.sizes(), grad.options());
    }
    auto ones_size = input.sym_sizes().vec();
    ones_size[dim] = 1;
    const Tensor ones = at::ones({1}, grad.options()).expand_symint(ones_size);
    Tensor prods_from_k_plus_1;
    Tensor omitted_products;
    for (const auto k : c10::irange(dim_size)) {
      if (k == 0) {
        prods_from_k_plus_1 = at::cumprod(input_conj.slice(dim, k + 1), dim);
        omitted_products = at::cat({ones, std::move(prods_from_k_plus_1)}, dim);
      } else if (k == dim_size - 1) {
        const Tensor prods_until_k = at::prod(input_conj.slice(dim, 0, k), dim, true);
        omitted_products = prods_until_k;
      } else {
        const Tensor prods_until_k = at::prod(input_conj.slice(dim, 0, k), dim, true);
        prods_from_k_plus_1 = at::cumprod(input_conj.slice(dim, k+1), dim);
        omitted_products = prods_until_k.expand_as(prods_from_k_plus_1) * prods_from_k_plus_1;
        omitted_products = at::cat({prods_until_k, omitted_products}, dim);
      }

      // At this point omitted_products is the same size
      // as input, except on the dimension dim where it's
      // dim_size - k
      TORCH_CHECK(omitted_products.sym_size(dim) == dim_size - k);

      auto grad_slice = at::sum(grad.slice(dim, k) * omitted_products, dim);
      if (are_inputs_tensors_sublcass) {
        grad_inputs.push_back(grad_slice);
      } else {
        grad_input.select(dim, k).copy_(grad_slice);
      }
    }

    return are_inputs_tensors_sublcass ? at::stack(grad_inputs, dim) : std::move(grad_input);
  }
}

// Implement std::is_nan for MSVC.
namespace {
#ifdef _MSC_VER
template<typename T>
inline typename std::enable_if<std::is_integral<T>::value, bool>::type isnan_(T x) {
  return false;
}
template<typename T>
inline typename std::enable_if<!std::is_integral<T>::value, bool>::type isnan_(T x) {
  return std::isnan(x);
}
#else
template<typename T>
inline bool isnan_(T x) {
  return std::isnan(x);
}
#endif
}

template<typename T1, typename T2, typename Operation>
void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data,
          int self_dim_size, int self_stride, int values_stride, int indices_stride) {
      Operation op;
      T1 out = c10::load(self_data);
      int idx = 0;
      for (const auto i : c10::irange(self_dim_size)) {
        T1 curr_elem = c10::load(&self_data[i*self_stride]);
        if(isnan_(curr_elem) || (!isnan_(out) && op(curr_elem, out))) {
            out = curr_elem;
            idx = i;
        }
        values_data[i*values_stride] = out;
        indices_data[i*indices_stride] = idx;
      }
}

void cummax_helper_cpu(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf,
    self.scalar_type(), "cummax_cpu",
    [&] {
      at::native::tensor_dim_apply3<scalar_t, int64_t>(self, values, indices, dim, cummax_cummin_helper<scalar_t, int64_t, std::greater_equal<scalar_t>>);
    });
}

std::tuple<Tensor&, Tensor&> cummax_out(const Tensor& self, int64_t dim, Tensor& values, Tensor& indices) {
  check_scalar_type_device_layout_equal(values, self);
  check_scalar_type_device_layout_equal(indices, at::empty({0}, self.options().dtype(at::kLong)));
  {
    NoNamesGuard guard;
    at::native::resize_output(values, self.sizes());
    at::native::resize_output(indices, self.sizes());
    if(self.dim() == 0) {
      values.fill_(self);
      indices.fill_(0);
    } else if(self.numel() != 0) {
      dim = maybe_wrap_dim(dim, self.dim());
      at::_cummax_helper(self, values, indices, dim);
    }
  }
  namedinference::propagate_names(values, self);
  namedinference::propagate_names(indices, self);
  return std::forward_as_tuple(values, indices);
}

std::tuple<Tensor, Tensor> cummax(const Tensor& self, int64_t dim) {
  auto values = at::empty(self.sizes(), self.options());
  auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
  at::cummax_out(values, indices, self, dim);
  return std::make_tuple(values, indices);
}

void
cummin_helper_cpu(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, self.scalar_type(), "cummin_cpu", [&] { at::native::tensor_dim_apply3(self, values, indices, dim, cummax_cummin_helper>); }); } std::tuple cummin_out(const Tensor& self, int64_t dim, Tensor& values, Tensor& indices) { check_scalar_type_device_layout_equal(values, self); check_scalar_type_device_layout_equal(indices, at::empty({0}, self.options().dtype(at::kLong))); { NoNamesGuard guard; at::native::resize_output(values, self.sizes()); at::native::resize_output(indices, self.sizes()); if(self.dim() == 0) { values.fill_(self); indices.fill_(0); } else if(self.numel() != 0) { dim = maybe_wrap_dim(dim, self.dim()); at::_cummin_helper(self, values, indices, dim); } } namedinference::propagate_names(values, self); namedinference::propagate_names(indices, self); return std::forward_as_tuple(values, indices); } std::tuple cummin(const Tensor& self, int64_t dim) { auto values = at::empty(self.sizes(), self.options()); auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong)); at::cummin_out(values, indices, self, dim); return std::make_tuple(values, indices); } Tensor cummaxmin_backward(const Tensor& grad, const Tensor& input, const Tensor& indices, int64_t dim) { if (input.sym_numel() == 0) { return input; } auto result = at::zeros_symint(input.sym_sizes(), input.options()); // for composite compliance, use out-of-place variant of // `scatter_add` if `indices` or `grad` is a Tensor Subclass. if (areAnyTensorSubclassLike({indices, grad})) { return result.scatter_add(dim, indices, grad); } return result.scatter_add_(dim, indices, grad); } static Tensor prepend_append_on_dim(const Tensor& self, const std::optional& prepend, const std::optional& append, int64_t dim) { // Helper for diff that handles prepending and appending when at least one is present TORCH_INTERNAL_ASSERT(prepend.has_value() || append.has_value(), "either prepend or append must be have value"); if (!prepend.has_value() && append.has_value()) { return at::cat({self, append.value()}, dim); } else if (prepend.has_value() && !append.has_value()) { return at::cat({prepend.value(), self}, dim); } else { return at::cat({prepend.value(), self, append.value()}, dim); } } static inline void diff_check_compatible_shape(const Tensor& self, const std::optional&other, int64_t dim) { // Helper for diff that checks whether the shape of the tensor to prepend or append // is compatible with that of input if (other.has_value()) { int64_t wrapped_dim = maybe_wrap_dim(dim, self.dim(), false); TORCH_CHECK( other.value().dim() == self.dim(), "diff expects prepend or append to be the same dimension as input"); for (const auto i : c10::irange(other.value().dim())) { if (i == wrapped_dim) { continue; } TORCH_SYM_CHECK( other.value().sym_size(i).sym_eq(self.sym_size(i)), "diff expects the shape of tensor to prepend or append to match that of" " input except along the differencing dimension;" " input.size(", i, ") = ", self.sym_size(i), ", but got" " tensor.size(", i, ") = ", other.value().sym_size(i)); } } } static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const std::optional&prepend, const std::optional& append) { // Helper for diff that checks whether its parameters are valid TORCH_CHECK( self.dim() >= 1, "diff expects input to be at least one-dimensional"); TORCH_CHECK( n >= 0, "order must be non-negative but got ", n); diff_check_compatible_shape(self, prepend, dim); 
diff_check_compatible_shape(self, append, dim); } static inline Tensor diff_helper(const Tensor& self, int64_t n, int64_t dim) { if (n == 0) { auto result = at::zeros_like(self); result.copy_(self); return result; } auto out_len = self.sym_size(dim) - 1; auto result = self; bool is_kBool = (self.dtype() == at::kBool); n = n > self.sym_size(dim) ? self.sym_size(dim).guard_int(__FILE__, __LINE__) : n; for (C10_UNUSED const auto i : c10::irange(n)) { if (is_kBool) { result = at::logical_xor( at::narrow_symint(result, dim, 1, out_len), at::narrow_symint(result, dim, 0, out_len) ); } else { result = at::narrow_symint(result, dim, 1, out_len) - at::narrow_symint(result, dim, 0, out_len); } out_len = out_len - 1; } return result; } Tensor diff(const Tensor& self, int64_t n, int64_t dim, const std::optional& prepend, const std::optional& append) { diff_check(self, n, dim, prepend, append); if ((!prepend.has_value() && !append.has_value()) || n == 0) { return diff_helper(self, n, dim); } else { auto a = prepend_append_on_dim(self, prepend, append, dim); return diff_helper(a, n, dim); } } static inline Tensor& diff_out_helper(const Tensor& self, int64_t n, int64_t dim, Tensor& result) { if (n == 0) { if (resize_output_check_symint(result, self.sym_sizes())) { result.resize__symint(self.sym_sizes()); } check_scalar_type_device_layout_equal(result, self); return result.copy_(self); } n = n > self.sym_size(dim) ? self.sym_size(dim).guard_int(__FILE__, __LINE__) : n; const auto out_len = self.sym_size(dim) - n; auto prev_result = self; if (n > 1) { prev_result = diff_helper(self, n - 1, dim); } if (self.dtype() == at::kBool) { at::logical_xor_out( result, at::narrow_symint(prev_result, dim, 1, out_len), at::narrow_symint(prev_result, dim, 0, out_len) ); } else { at::sub_out( result, at::narrow_symint(prev_result, dim, 1, out_len), at::narrow_symint(prev_result, dim, 0, out_len) ); } return result; } Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const std::optional& prepend, const std::optional& append, Tensor& result) { diff_check(self, n, dim, prepend, append); if ((!prepend.has_value() && !append.has_value()) || n == 0) { return diff_out_helper(self, n, dim, result); } else { auto a = prepend_append_on_dim(self, prepend, append, dim); return diff_out_helper(a, n, dim, result); } } static void pre_check_gradient(const Tensor& self, std::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { // Helper for gradient function to make sure input data satisfies prerequisites TORCH_CHECK(self.scalar_type() != ScalarType::Byte, "torch.gradient does not support uint8 input."); if (spacing_size.has_value() && !dim.has_value()) { // NOTE: If spacing was given as a scalar, the callers of this function // create a spacing vector of the expected size, and this check passes TORCH_CHECK(spacing_size.value() == self.dim(), "torch.gradient expected spacing to be unspecified, a scalar, or a list ", "of length equal to 'self.dim() = ", self.dim(), "', since dim argument ", "was not given, but got a list of length ", spacing_size.value()); } if (spacing_size.has_value() && dim.has_value()) { TORCH_CHECK(spacing_size.value() == static_cast(dim.value().size()), "torch.gradient expected spacing to be unspecified, a scalar or it's spacing and dim arguments to have the same length, but got a spacing argument of length ", spacing_size.value(), " and a dim argument of length ", dim.value().size(), "." 
); } TORCH_CHECK(edge_order == 1 || edge_order == 2, "torch.gradient only supports edge_order=1 and edge_order=2."); if (dim.has_value()) { // The following function get called to check whether dim argument satisfies prerequisites. // The output of the function is not used for the computation of gradient. dim_list_to_bitset(dim.value(), self.dim()); for (const auto i : c10::irange(dim.value().size())) { TORCH_CHECK(self.size(dim.value()[i]) >= edge_order + 1, "torch.gradient expected each dimension size to be at least edge_order+1"); } } else { for (const auto i : c10::irange(self.dim())) { TORCH_CHECK(self.size(i) >= edge_order + 1, "torch.gradient expected each dimension size to be at least edge_order+1"); } } } static std::vector gradient_helper(const Tensor& self, TensorList coordinates, IntArrayRef dim, int64_t edge_order) { for (const auto i : c10::irange(coordinates.size())) { TORCH_CHECK(self.device() == coordinates[i].device(), "torch.gradient expected each tensor to be on the same device, but got devices ", self.device(), " and ", coordinates[i].device(), "!"); } std::vector result; for (const auto i : c10::irange(dim.size())) { TORCH_CHECK( coordinates[i].dim() == 1, "torch.gradient expected each element of spacing to have one dimension, but got an element with ", coordinates[i].dim(), " dimensions!"); int64_t direction = maybe_wrap_dim(dim[i], self.dim()); Tensor prepend, append; std::vector shape(self.dim(),1); shape[ direction ] = -1; auto ax_dx = coordinates[i].diff(1,0); auto dx1 = at::slice(ax_dx, 0, 0, -1); auto dx2 = at::slice(ax_dx, 0, 1); auto a = ( -dx2 / (dx1*(dx1+dx2)) ).reshape(shape); auto b = ( (dx2-dx1) / (dx1*dx2) ).reshape(shape); auto c = ( dx1 / (dx2*(dx1+dx2)) ).reshape(shape); auto center = a * at::slice(self, direction, 0, -2) + b * at::slice(self, direction , 1, -1) + c * at::slice(self, direction, 2); if (edge_order == 1) { prepend = (at::slice(self, direction, 1, 2 ) - at::slice(self, direction, 0, 1 )) / ax_dx[0] ; append = (at::slice(self, direction, -1 ) - at::slice(self, direction, -2, -1 )) / ax_dx[-1] ; } else if (edge_order == 2) { a =-(2.0 * ax_dx[0] + ax_dx[1]) / (ax_dx[0] * (ax_dx[0] + ax_dx[1])) ; b = ( ax_dx[0] + ax_dx[1]) / (ax_dx[0] * ax_dx[1]) ; c = ( -ax_dx[0] ) / (ax_dx[1] * (ax_dx[0] + ax_dx[1])); prepend = a * at::slice(self, direction, 0, 1) + b * at::slice(self, direction, 1, 2) + c * at::slice(self, direction, 2, 3); a = ( ax_dx[-1] ) / (ax_dx[-2] * (ax_dx[-1] + ax_dx[-2])); b =-( ax_dx[-1] + ax_dx[-2]) / (ax_dx[-1] * ax_dx[-2]); c = (2 * ax_dx[-1] + ax_dx[-2]) / (ax_dx[-1] * (ax_dx[-1] + ax_dx[-2])); append = a * at::slice(self, direction, -3, -2) + b * at::slice(self, direction, -2, -1) + c * at::slice(self, direction, -1); } result.emplace_back(prepend_append_on_dim(center, prepend, append, direction)); } return result; } static std::vector gradient_helper_float(const Tensor& self, ArrayRef spacing, IntArrayRef dim, int64_t edge_order) { std::vector result; for (const auto i : c10::irange(dim.size())) { int64_t direction = maybe_wrap_dim(dim[i], self.dim()); const auto& ax_dx = spacing[i]; Tensor prepend, append; auto center = (at::slice(self,direction, 2 ) - at::slice(self, direction, 0, -2 ) ) / ax_dx; if (edge_order==1) { prepend = (at::slice(self,direction, 1, 2) - at::slice(self, direction, 0, 1 ) ) / ax_dx; append = (at::slice(self,direction, -1 ) - at::slice(self, direction, -2, -1) ) / ax_dx ; } else if (edge_order==2) { prepend = (-1.5 * at::slice(self, direction, 0, 1) + 2 * at::slice(self, direction, 1, 2) - 0.5 * 
at::slice(self, direction, 2, 3))/ ax_dx; append = (0.5 * at::slice(self, direction, -3, -2) - 2 * at::slice(self, direction, -2, -1) + 1.5 * at::slice(self, direction, -1)) / ax_dx; } result.emplace_back(prepend_append_on_dim(center/2, prepend, append, direction)); } return result; } static std::vector gradient_dim_preprocess(const Tensor& self, std::optional dim) { // if gradient dim is provided as an integer, then we need to compute gradient only on this direction. // Moreover, if it's not provided at all, then we are interested in gradient for all directions. // Finally, if dim is provided as vector of ints, then it is not expected to be called by this function. if (dim.has_value()) { return std::vector{dim.value()}; } std::vector axis(self.dim()); std::iota(axis.begin(), axis.end(), 0); return axis; } std::vector gradient(const Tensor& self, TensorList coordinates, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, std::optional(coordinates.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper(self, coordinates, dim, edge_order); } std::vector gradient(const Tensor& self, TensorList coordinates, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, std::optional(coordinates.size()), dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : std::nullopt, edge_order); return gradient_helper(self, coordinates, processed_dim, edge_order); } std::vector gradient(const Tensor& self, c10::ArrayRef spacing, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } std::vector gradient(const Tensor& self, ArrayRef spacing, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, std::optional(spacing.size()), dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : std::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); } std::vector gradient(const Tensor& self, const Scalar& unit_size, IntArrayRef dim, int64_t edge_order) { // When spacing is given as scalar, while dim is given as IntArrayRef, scalar value need to // be taken as unit size at every given dimension element of - dim. std::vector spacing(dim.size(), unit_size); pre_check_gradient(self, std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } std::vector gradient(const Tensor& self, const std::optional& unit_size, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); // When unit_size not provided, it is always assumed to be equal to 1. // When dim has integer value it implies we are looking for gradient in the specific direction, however when // it is not provided, it means we are interested to find gradient in all directions. std::vector spacing(dim.has_value() ? 1 : self.dim(), unit_size.has_value() ? unit_size.value() : 1.0) ; pre_check_gradient(self, unit_size.has_value() ? std::optional(spacing.size()) : std::nullopt, dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : std::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); } std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_order) { std::vector spacing(dim.size(), 1.0) ; pre_check_gradient(self, std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } // ALL REDUCE ################################################################# inline bool should_use_acc_buffer(at::TensorIterator& iter) { const auto ndim = iter.ndim(); if (!iter.device().is_cpu() || iter.noutputs() != 1) { return false; } if (!at::isReducedFloatingType(iter.common_dtype())) { return false; } if (ndim < 2) { return false; } auto out_strides = iter.strides(0); for (const auto dim : c10::irange(0, 2)) { if (out_strides[dim] != 0) { return false; } } return true; } TORCH_IMPL_FUNC(sum_out) (const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, std::optional opt_dtype, const Tensor& result) { auto iter = meta::make_reduction_from_out_ty(self, result, opt_dim, keepdim, result.scalar_type()); if (iter.numel() == 0) { result.zero_(); } else { // Here is a limitation of TensorIterator reductions for permuted input with lower precision on CPU. // Consider the case: TensorIterator coalesces such input and output to >= 2 dims tensors, // and the output stride is [0, 0, x, x, ...] with x >= 0 (two reduced dimensions and non-reduced dims). // Since the reduction loop only operates on two dimensions at a time, // the intermediate sums is forced to do accumulation in the second reduced dim with lower precision. // See https://github.com/pytorch/pytorch/issues/83149 if (should_use_acc_buffer(iter)) { auto tmp_output = at::empty(result.sizes(), result.options().dtype(kFloat)); at::sum_outf(self.to(ScalarType::Float), opt_dim, keepdim, /*dtype=*/std::nullopt, tmp_output); result.copy_(tmp_output); } else{ sum_stub(iter.device_type(), iter); } } } Tensor sum(const Tensor &self, std::optional dtype) { return at::sum(self, IntArrayRef{}, false, dtype); } Tensor sum(const Tensor& self, DimnameList dim, bool keepdim, std::optional dtype) { return at::sum(self, dimnames_to_positions(self, dim), keepdim, dtype); } Tensor& sum_out(const Tensor& self, DimnameList dim, bool keepdim, std::optional opt_dtype, Tensor& result) { return at::sum_out(result, self, dimnames_to_positions(self, dim), keepdim, opt_dtype); } Tensor& nansum_out(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype, Tensor& result) { if (self.device().is_cpu()) { TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum does not support complex inputs"); } // For integral types, use existing sum as // integral types don't have `Nan`. 
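  // (nansum treats NaN entries as zero, e.g. the nansum of [1., NaN, 2.] is 3.;
  // for an integral input, which cannot hold NaN, this reduces to a plain sum.)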
  if (c10::isIntegralType(self.scalar_type(), true)){
    return at::sum_out(result, self, dim, keepdim, opt_dtype);
  }

  ScalarType dtype = get_dtype_from_result(result, opt_dtype);
  auto iter = make_reduction("nansum", result, self, dim, keepdim, dtype);
  if (iter.numel() == 0) {
    result = result.zero_();
  } else {
    nansum_stub(iter.device_type(), iter);
  }
  return result;
}

Tensor nansum(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional<ScalarType> opt_dtype) {
  ScalarType dtype = get_dtype_from_self(self, opt_dtype, true);
  Tensor result = create_reduction_result(self, dim, keepdim, dtype);
  return at::native::nansum_out(self, dim, keepdim, dtype, result);
}

namespace {
template <typename scalar_t, typename accscalar_t = at::acc_type<scalar_t, false>>
void inline set_result(Tensor& result, accscalar_t sum) {
  if constexpr (std::is_integral_v<scalar_t>) {
    // all integer types get promoted to kLong
    *result.data_ptr<int64_t>() = sum;
  } else {
    *result.data_ptr<scalar_t>() = sum;
  }
}
}

// NOTE: this could be implemented via diag and sum, but this has perf problems,
// see https://github.com/pytorch/pytorch/pull/47305,
Tensor trace_cpu(const Tensor& self) {
  Tensor result;
  // Returns the ScalarType of the self tensor if the tensor is non integral type
  // In the case self is an integer type tensor, at::kLong is returned since promote_integers
  // is set to true
  ScalarType dtype = get_dtype_from_self(self, std::nullopt, true);
  result = at::empty({}, self.options().dtype(dtype));
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "trace", [&] {
    using accscalar_t = at::acc_type<scalar_t, false>;
    accscalar_t sum = 0;
    const auto* t_data = self.const_data_ptr<scalar_t>();

    int64_t t_stride_0, t_stride_1, t_diag_size;

    TORCH_CHECK(self.dim() == 2, "trace: expected a matrix, but got tensor with dim ", self.dim());

    t_stride_0 = self.stride(0);
    t_stride_1 = self.stride(1);

    t_diag_size = std::min(self.size(0), self.size(1));
    for (const auto i : c10::irange(t_diag_size)) {
      sum += t_data[i * (t_stride_0 + t_stride_1)];
    }
    set_result<scalar_t>(result, sum);
  });

  return result;
}

static void impl_func_prod(
    const Tensor& self,
    IntArrayRef dims,
    bool keepdim,
    std::optional<ScalarType> dtype,
    const Tensor& result) {
  auto iter = meta::make_reduction_from_out_ty(self, result, dims, keepdim, result.scalar_type());
  if (iter.numel() == 0) {
    result.fill_(1);
  } else {
    prod_stub(iter.device_type(), iter);
  }
}

TORCH_IMPL_FUNC(prod_out)
(const Tensor& self,
 int64_t dim,
 bool keepdim,
 std::optional<ScalarType> dtype,
 const Tensor& result) {
  impl_func_prod(self, dim, keepdim, dtype, result);
}

Tensor prod(const Tensor &self, std::optional<ScalarType> opt_dtype) {
  auto dtype = get_dtype_from_self(self, opt_dtype, true);
  auto shape = meta::get_reduction_shape(self, {}, false);
  Tensor result = at::empty(shape, self.options().dtype(dtype));
  impl_func_prod(self, {}, false, dtype, result);
  return result;
}

Tensor prod(const Tensor& self, Dimname dim, bool keepdim, std::optional<ScalarType> dtype) {
  return at::prod(self, dimname_to_position(self, dim), keepdim, dtype);
}

Tensor& prod_out(const Tensor& self, Dimname dim, bool keepdim, std::optional<ScalarType> opt_dtype, Tensor& result) {
  return at::prod_out(result, self, dimname_to_position(self, dim), keepdim, opt_dtype);
}

TORCH_IMPL_FUNC(mean_out)
(const Tensor& self,
 OptionalIntArrayRef opt_dim,
 bool keepdim,
 std::optional<ScalarType> opt_dtype,
 const Tensor& result) {
  ScalarType dtype = result.scalar_type();
  // TODO: the TensorIterator reduction implementation of mean
  // (mean_kernel_impl()) is unvectorized and leads to very poor performance
  // for production workloads. Once that's fixed, the following code can be used
  // in lieu of the sum + divide implementation below.
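  // On CPU, mean is computed as sum(self) / N, where N is the product of the
  // sizes of the reduced dimensions. For reduced-precision inputs (fp16/bf16)
  // the sum is accumulated in fp32 and the result is cast back afterwards, as
  // explained in the comments inside the branch below.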
if (self.device().is_cpu()) { int64_t dim_prod = 1; if (!opt_dim.has_value() || opt_dim.value().empty() || self.ndimension() == 0) { dim_prod = self.numel(); } else { auto dim = opt_dim.value(); for (auto d : dim) { dim_prod *= self.size(d); } } auto& result_mut = const_cast(result); // For accuracy reasons, BF16/FP16 mean should be computed via the // following approach: // cast_fp32 -> sum -> div -> cast_bf16_or_fp16 // // Such an approach is necessary because if we were to choose the same // approach for BF16/FP16 as FP32 here, then it would have resulted in // the following code-flow - // cast_fp32 -> sum -> cast_bf16 -> cast_fp32 -> div -> cast_bf16, // which, in turn, does not produce as accurate results. bool is_half_type = (dtype == kHalf || dtype == kBFloat16); auto sum_out_dtype = is_half_type ? ScalarType::Float : dtype; result_mut = is_half_type ? result_mut.to(sum_out_dtype) : result_mut; // If dtype is FP16 or BF16, self (input tensor) will initially be cast to // FP32 in sum_out. This results in having to read that FP32 tensor again, // but maybe in the future, we could revise the implementation to not // materialize that intermediate FP32 tensor. That approach would probably // require some modifications in binary_kernel_reduce_vec(), // TensorIteratorBase::for_each(), and // TensorIteratorBase::serial_for_each(), apart from sum kernel for CPU. at::sum_out(result_mut, self, opt_dim, keepdim, sum_out_dtype).div_(dim_prod); // After sum & div, cast result_mut back to BF16 or FP16, if required. result_mut = is_half_type ? result_mut.to(dtype) : result_mut; } else { // device is not CPU auto iter = at::meta::make_reduction_from_out_ty( self, result, opt_dim, keepdim, dtype); if (iter.numel() == 0) { result.fill_(std::numeric_limits::quiet_NaN()); } else { mean_stub(iter.device_type(), iter); } } } Tensor mean(const Tensor &self, std::optional dtype) { return at::mean(self, IntArrayRef{}, false, dtype); } Tensor mean(const Tensor& self, DimnameList dim, bool keepdim, std::optional dtype) { return at::mean(self, dimnames_to_positions(self, dim), keepdim, dtype); } Tensor& mean_out(const Tensor& self, DimnameList dim, bool keepdim, std::optional opt_dtype, Tensor& result) { return at::mean_out(result, self, dimnames_to_positions(self, dim), keepdim, opt_dtype); } Tensor& mean_dtype_out(const Tensor &self, std::optional dtype, Tensor& result) { TORCH_CHECK( canCast(self.scalar_type(), result.scalar_type()), "mean.dtype_out(): input types can't be cast to the desired output type ", result.scalar_type()); // at::mean_out should make sure dtype and result.scalar_type() are the same return at::mean_out(result, self, IntArrayRef{}, false, dtype); } // TODO(@heitorschueroff) implement custom kernels for nanmean Tensor& nanmean_out( const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype, Tensor& result) { TORCH_CHECK( self.is_floating_point() || self.is_complex(), "nanmean(): expected input to have floating point or complex dtype but got ", self.scalar_type()); const auto factor = at::native::isnan(self).logical_not_().sum(dim, keepdim); at::native::nansum_out(self, dim, keepdim, opt_dtype, result).div_(factor); return result; } Tensor nanmean( const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype) { TORCH_CHECK( self.is_floating_point() || self.is_complex(), "nanmean(): expected input to have floating point or complex dtype but got ", self.scalar_type()); const auto factor = 
at::native::isnan(self.detach()).logical_not_().sum(dim, keepdim); return at::nansum(self, dim, keepdim, opt_dtype).div(factor); } static Tensor& logsumexp_out_impl(Tensor& result, const Tensor& self, IntArrayRef dims, bool keepdim) { // can't take max of empty tensor if (self.numel() != 0) { // For complex numbers, use the real part to calculate the max. Based on // https://scicomp.stackexchange.com/questions/34273/log-sum-exp-trick-for-signed-complex-numbers auto maxes = at::amax(at::real(self), dims, true); auto maxes_squeezed = (keepdim ? maxes : at::squeeze(maxes, dims)); maxes_squeezed.masked_fill_(maxes_squeezed.abs() == INFINITY, 0); at::sum_out(result, (self - maxes).exp_(), dims, keepdim); result.log_().add_(maxes_squeezed); } else { at::sum_out(result, at::exp(self), dims, keepdim); result.log_(); } return result; } Tensor& logsumexp_out(const Tensor& self, IntArrayRef dims, bool keepdim, Tensor& result) { // Complex type implies floating point type TORCH_CHECK(at::isFloatingType(result.scalar_type()) || at::isComplexType(result.scalar_type()), "logsumexp(): Expected floating point type for result tensor, but got: ", result.scalar_type()); { NoNamesGuard guard; if (at::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { // for integral inputs, promote input to default floating type. auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); logsumexp_out_impl(result, self.to(default_dtype), dims, keepdim); } else { logsumexp_out_impl(result, self, dims, keepdim); } } namedinference::propagate_names_for_reduction(result, self, dims, keepdim); return result; } Tensor logsumexp(const Tensor& self, IntArrayRef dims, bool keepdim) { TensorOptions result_options; if (at::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { // even for integral inputs, result is floating dtype auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); result_options = self.options().dtype(default_dtype); } else { result_options = self.options(); } auto result = at::empty({0}, result_options); return at::logsumexp_outf(self, dims, keepdim, result); } Tensor logsumexp(const Tensor& self, DimnameList dims, bool keepdim) { return at::logsumexp(self, dimnames_to_positions(self, dims), keepdim); } Tensor& logsumexp_out(const Tensor& self, DimnameList dims, bool keepdim, Tensor& result) { return at::logsumexp_out(result, self, dimnames_to_positions(self, dims), keepdim); } // special_logsumexp, alias for logsumexp Tensor special_logsumexp(const Tensor& self, IntArrayRef dims, bool keepdim) { return self.logsumexp(dims, keepdim); } Tensor& special_logsumexp_out(const Tensor& self, IntArrayRef dims, bool keepdim, Tensor& result) { return at::logsumexp_out(result, self, dims, keepdim); } static void impl_func_norm( const Tensor& self, const OptionalScalarRef& opt_p, IntArrayRef dim, bool keepdim, std::optional opt_dtype, const Tensor& result) { // Left this implementation without deprecating it as it is called in a number of places // in the codebase. We should swap those by linalg_vector_norm auto p = opt_p.has_value() ? 
opt_p.get() : Scalar(2.0).to(); at::linalg_vector_norm_out(const_cast(result), self, p, dim, keepdim, opt_dtype); } TORCH_IMPL_FUNC(norm_out) (const Tensor& self, const OptionalScalarRef p, IntArrayRef dim, bool keepdim, const Tensor& result) { impl_func_norm(self, p, dim, keepdim, std::nullopt, result); } TORCH_IMPL_FUNC(norm_dtype_out) (const Tensor& self, const OptionalScalarRef p, IntArrayRef dim, bool keepdim, ScalarType dtype, const Tensor& result) { impl_func_norm(self, p, dim, keepdim, dtype, result); } Tensor sparse_norm( const Tensor& self, const std::optional& p, IntArrayRef dim, bool keepdim) { return at::native_norm(self, p, dim, keepdim, std::nullopt); } Tensor sparse_dtype_norm( const Tensor& self, const std::optional& p, IntArrayRef dim, bool keepdim, ScalarType dtype) { return at::native_norm(self, p, dim, keepdim, dtype); } Tensor norm(const Tensor& self, const std::optional& p, ScalarType dtype) { return at::norm(self, p, IntArrayRef{}, false, dtype); } Tensor norm(const Tensor& self, const Scalar& p) { return at::norm(self, p, IntArrayRef{}, false); } inline TensorIterator get_allany_iter( const Tensor& self, const Tensor& result, OptionalIntArrayRef dims, bool keepdim) { if (self.is_cuda()) { // As CUDA supports dynamic type casting, we use this overload of // `make_reduction`, which doesn't cast input to the result type i.e. kBool., // otherwise we use the overload below which casts the input to kBool (which is // an extra operation). return meta::make_reduction(self, result, dims, keepdim, self.scalar_type()); } return meta::make_reduction_from_out_ty( self, result, dims, keepdim, result.scalar_type()); } template inline void allany_impl( const Tensor& self, const Tensor& result, OptionalIntArrayRef dims, bool keepdim, Stub& stub) { if (self.numel() == 0) { result.fill_(identity); } else if (self.numel() == 1) { result.copy_(self.view_as(result).to(at::kBool)); } else { auto iter = get_allany_iter(self, result, dims, keepdim); stub(iter.device_type(), iter); } } TORCH_IMPL_FUNC(all_out) (const Tensor& self, int64_t dim, bool keepdim, const Tensor& result) { allany_impl<1>(self, result, dim, keepdim, and_stub); } TORCH_IMPL_FUNC(all_dims_out) (const Tensor& self, OptionalIntArrayRef dim, bool keepdim, const Tensor& result) { allany_impl<1>(self, result, dim, keepdim, and_stub); } TORCH_IMPL_FUNC(all_all_out)(const Tensor& self, const Tensor& result) { allany_impl<1>(self, result, {}, false, and_stub); } TORCH_IMPL_FUNC(any_out) (const Tensor& self, int64_t dim, bool keepdim, const Tensor& result) { allany_impl<0>(self, result, dim, keepdim, or_stub); } TORCH_IMPL_FUNC(any_dims_out) (const Tensor& self, OptionalIntArrayRef dim, bool keepdim, const Tensor& result) { allany_impl<0>(self, result, dim, keepdim, or_stub); } TORCH_IMPL_FUNC(any_all_out)(const Tensor& self, const Tensor& result) { allany_impl<0>(self, result, {}, false, or_stub); } template Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { // Default implementation in terms of all-reduce or single dim reduce if (!dim) { Tensor out; if constexpr (is_all) { out = self.all(); } else { out = self.any(); } if (keepdim) { DimVector out_shape(self.dim(), 1); return out.expand(out_shape); } return out; } if (dim->empty()) { if (self.scalar_type() == kByte) { // Convert to a 1 or 0 mask auto out = at::empty_like(self); return at::ne_outf(self, 0, out); } else { return at::_to_copy(self, kBool); } } Tensor out = self; for (auto d : *dim) { if constexpr (is_all) { out = out.all(d, 
/*keepdim=*/true); } else { out = out.any(d, /*keepdim=*/true); } } return keepdim ? out : out.squeeze(*dim); } Tensor all_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { return allany_dims_default(self, dim, keepdim); } Tensor any_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { return allany_dims_default(self, dim, keepdim); } Tensor& all_dims_out_default( const Tensor &self, OptionalIntArrayRef dim, bool keepdim, Tensor &result) { TORCH_CHECK(self.device() == result.device(), "all.dims: output must be on the same device as input"); auto tmp = all_dims_default(self, dim, keepdim); at::native::resize_output(result, tmp.sizes()); return result.copy_(tmp); } Tensor& any_dims_out_default( const Tensor &self, OptionalIntArrayRef dim, bool keepdim, Tensor &result) { TORCH_CHECK(self.device() == result.device(), "any.dims: output must be on the same device as input"); auto tmp = any_dims_default(self, dim, keepdim); at::native::resize_output(result, tmp.sizes()); return result.copy_(tmp); } TORCH_IMPL_FUNC(amin_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { auto iter = meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { min_values_stub(iter.device_type(), iter); } } TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { auto iter = meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { max_values_stub(iter.device_type(), iter); } } template void argmax_argmin_impl( const Tensor& self, std::optional dim, bool keepdim, const Tensor& result, Stub& stub) { c10::MaybeOwned in; DimVector dims; int64_t _dim = 0; if (dim.has_value()) { _dim = maybe_wrap_dim(dim.value(), self.dim()); auto sizes = self.sizes(); if (sizes[_dim] == 1) { result.fill_(0); return; } dims = IntArrayRef(_dim); in = c10::MaybeOwned::borrowed(self); } else { in = c10::MaybeOwned::owned(self.reshape({-1})); keepdim = false; } auto iter = meta::make_reduction(*in, result, dims, keepdim, self.scalar_type()); if (iter.numel() != 0) { stub(iter.device_type(), iter); } } TORCH_IMPL_FUNC(argmax_out) (const Tensor& self, std::optional dim, bool keepdim, const Tensor& result) { argmax_argmin_impl(self, dim, keepdim, result, argmax_stub); } TORCH_IMPL_FUNC(argmin_out) (const Tensor& self, std::optional dim, bool keepdim, const Tensor& result) { argmax_argmin_impl(self, dim, keepdim, result, argmin_stub); } static double std_var_all_cpu(const Tensor& self, double correction, bool take_sqrt) { const auto dtype = self.scalar_type(); TORCH_CHECK(dtype == kDouble || dtype == kFloat, "std_var_all: Unsupported dtype ", dtype); auto mean = self.mean().item(); auto iter = TensorIteratorConfig() .add_const_input(self) .build(); auto reduction = [&](int64_t begin, int64_t end, double thread_sum) { AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "std_var_all_cpu", [&] { iter.serial_for_each([&] (char** data, const int64_t* strides, int64_t size0, int64_t size1) { const double local_mean = mean; const int64_t inner_stride = strides[0]; const int64_t outer_stride = strides[1]; double local_sum = 0.0; for (const auto i : c10::irange(size1)) { const char* row_ptr = data[0] + outer_stride * i; for (const auto j : c10::irange(size0)) { const auto ptr = reinterpret_cast(row_ptr + inner_stride * j); auto dx = (static_cast(*ptr) - local_mean); local_sum += dx * dx; } } thread_sum += local_sum; }, {begin, end}); }); return thread_sum; }; // ((x - 
static double std_var_all_cpu(const Tensor& self, double correction, bool take_sqrt) {
  const auto dtype = self.scalar_type();
  TORCH_CHECK(dtype == kDouble || dtype == kFloat,
              "std_var_all: Unsupported dtype ", dtype);

  auto mean = self.mean().item<double>();
  auto iter = TensorIteratorConfig()
      .add_const_input(self)
      .build();

  auto reduction = [&](int64_t begin, int64_t end, double thread_sum) {
    AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "std_var_all_cpu", [&] {
      iter.serial_for_each([&] (char** data, const int64_t* strides, int64_t size0, int64_t size1) {
        const double local_mean = mean;
        const int64_t inner_stride = strides[0];
        const int64_t outer_stride = strides[1];

        double local_sum = 0.0;
        for (const auto i : c10::irange(size1)) {
          const char* row_ptr = data[0] + outer_stride * i;
          for (const auto j : c10::irange(size0)) {
            const auto ptr = reinterpret_cast<const scalar_t*>(row_ptr + inner_stride * j);
            auto dx = (static_cast<double>(*ptr) - local_mean);
            local_sum += dx * dx;
          }
        }
        thread_sum += local_sum;
      }, {begin, end});
    });
    return thread_sum;
  };

  // ((x - mean)**2).sum()
  const double sum_dx2 = at::parallel_reduce(
      0, iter.numel(), at::internal::GRAIN_SIZE, 0.0, reduction, std::plus<>{});

  const auto var = [&] () __ubsan_ignore_float_divide_by_zero__ {
    return sum_dx2 / std::max(0.0, self.numel() - correction);
  }();
  const auto result = take_sqrt ? std::sqrt(var) : var;

  if (dtype == kFloat) {
    // Convert to infinity if out of range for a float.
    // Doing it now prevents checked_convert failing later
    return static_cast<float>(result);
  }
  return result;
}

namespace {
  inline void warn_invalid_degrees_of_freedom(const char* fname, const TensorIterator& iter, double correction) {
    int64_t reducing_over_num_elements =
        iter.num_output_elements() == 0 ? 0 : iter.numel() / iter.num_output_elements();
    if (reducing_over_num_elements - correction <= 0) {
      TORCH_WARN(fname, "(): degrees of freedom is <= 0. Correction should be strictly less than the "
                 "reduction factor (input numel divided by output numel).");
    }
  }
} // namespace
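// Illustrative note: the variance reductions below compute
//   var = sum((x - mean)^2) / max(0, N - correction)
// so correction == 0 gives the biased (population) estimator and
// correction == 1 applies Bessel's correction (the default). For example, for
// x = {1, 2, 3} (mean 2, sum of squared deviations 2):
//   correction = 0  ->  var = 2 / 3
//   correction = 1  ->  var = 2 / 2 = 1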
static Tensor& std_var_out(
    const char* fname,
    Tensor& result,
    const Tensor& self,
    at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction_opt,
    bool keepdim,
    bool take_sqrt) {
  TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda() || self.device().is_xpu(),
              "std and var supports tensors on a CPU, CUDA, or XPU device only, but got: ",
              self.device().type());
  TORCH_CHECK(self.layout() == Layout::Strided,
              "std and var only supports strided layout, got: ", self.layout());
  TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
              "std and var only support floating point and complex dtypes");

  if (at::isComplexType(self.scalar_type())) {
    // For complex, calculate variance of real and imaginary components
    // separately then add to get overall variance.
    ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result, {}));
    Tensor real_in = at::real(self);
    Tensor real_out = at::empty({0}, self.options().dtype(dtype));
    std_var_out(fname, real_out, real_in, dim, correction_opt, keepdim, /*take_sqrt=*/false);
    Tensor imag_in = at::imag(self);
    Tensor imag_out = at::empty({0}, self.options().dtype(dtype));
    std_var_out(fname, imag_out, imag_in, dim, correction_opt, keepdim, /*take_sqrt=*/false);
    at::add_out(result, real_out, imag_out);
    if (take_sqrt) {
      at::sqrt_out(result, result);
    }
    return result;
  }

  // Computation for floating point
  const auto correction = correction_opt.value_or(1).toDouble();
  ScalarType dtype = get_dtype_from_result(result, {});
  auto iter = make_reduction(fname, result, self, dim, keepdim, dtype);
  TORCH_CHECK(at::canCast(self.scalar_type(), result.scalar_type()),
              "result type ", self.scalar_type(), " can't be cast to the "
              "desired output type ", result.scalar_type());
  warn_invalid_degrees_of_freedom(fname, iter, correction);

  if (iter.numel() == 0) {
    // Trivial reduction
    result.fill_(std::numeric_limits<double>::quiet_NaN());
    return result;
  } else if (
      result.numel() == 1 && iter.device_type() == kCPU &&
      iter.common_dtype() != kBFloat16 && iter.common_dtype() != kHalf) {
    // NOTE: CPU performance significantly regressed when attempting to port to
    // ATen, so all-reduce has a custom implementation.
    // See https://github.com/pytorch/pytorch/pull/43858.
    result.fill_(std_var_all_cpu(self, correction, take_sqrt));
  } else {
    std_var_stub(iter.device_type(), iter, correction, take_sqrt);
  }
  return result;
}

static std::tuple<Tensor&, Tensor&> std_var_mean_out(
    const char* fname,
    Tensor& result1,
    Tensor& result2,
    const Tensor& self,
    at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction_opt,
    bool keepdim,
    bool take_sqrt) {
  AT_ASSERT(result1.defined() && result2.defined());
  TORCH_CHECK(self.device().is_cpu() || self.is_cuda() || self.is_xpu(),
              fname, " supports tensors on a CPU, CUDA, or XPU device only, got: ",
              self.device().type());
  TORCH_CHECK(self.layout() == Layout::Strided,
              fname, " only supports strided layout, got: ", self.layout());
  TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
              fname, " only supports floating point and complex dtypes");
  TORCH_CHECK(result1.scalar_type() == c10::toRealValueType(result2.scalar_type()),
              fname, " expected result1 to be real and match the precision of result2. Got ",
              result1.scalar_type(), " and ", result2.scalar_type(), ".");

  if (at::isComplexType(self.scalar_type())) {
    // For complex, calculate for real and imaginary components separately then combine as:
    // variance = var_real + var_imag
    // mean = mean_real + j * mean_imag
    ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result1, {}));
    Tensor real_in = at::real(self);
    Tensor real_out_var = at::empty({0}, self.options().dtype(dtype));
    Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype));
    std_var_mean_out(
        fname, real_out_var, real_out_mean, real_in, dim, correction_opt, keepdim,
        /*take_sqrt=*/false);
    Tensor imag_in = at::imag(self);
    Tensor imag_out_var = at::empty({0}, self.options().dtype(dtype));
    Tensor imag_out_mean = at::empty({0}, self.options().dtype(dtype));
    std_var_mean_out(
        fname, imag_out_var, imag_out_mean, imag_in, dim, correction_opt, keepdim,
        /*take_sqrt=*/false);
    at::add_out(result1, real_out_var, imag_out_var);
    if (take_sqrt) {
      at::sqrt_out(result1, result1);
    }
    at::complex_out(result2, real_out_mean, imag_out_mean);
    return std::tuple<Tensor&, Tensor&>(result1, result2);
  }

  // Computation for floating point
  const auto correction = correction_opt.value_or(1).toDouble();
  ScalarType dtype = get_dtype_from_result(result1, {});
  auto iter = make_reduction(fname, result1, result2, self, dim, keepdim, dtype);
  warn_invalid_degrees_of_freedom(fname, iter, correction);

  if (iter.numel() == 0) {
    // Trivial reduction
    result1.fill_(std::numeric_limits<double>::quiet_NaN());
    result2.fill_(std::numeric_limits<double>::quiet_NaN());
  } else {
    std_var_stub(iter.device_type(), iter, correction, take_sqrt);
  }
  return std::tuple<Tensor&, Tensor&>(result1, result2);
}

std::tuple<Tensor, Tensor> var_mean(
    const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
  return at::var_mean(
      self, /*dim=*/at::OptionalIntArrayRef(dim),
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0),
      keepdim);
}

std::tuple<Tensor, Tensor> std_mean(
    const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
  return at::std_mean(
      self, /*dim=*/at::OptionalIntArrayRef(dim),
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0),
      keepdim);
}

std::tuple<Tensor, Tensor> std_mean(const Tensor& self, bool unbiased) {
  return at::std_mean(
      self, /*dim=*/std::nullopt,
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0));
}

std::tuple<Tensor, Tensor> var_mean(const Tensor& self, bool unbiased) {
  return at::var_mean(
      self, /*dim=*/std::nullopt,
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0));
}
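// Illustrative note on the complex branches above: for complex input the
// variance is computed as var(real(x)) + var(imag(x)), and for var_mean/std_mean
// the mean is recombined as mean(real(x)) + j * mean(imag(x)). A minimal
// sketch, assuming the usual ATen C++ API:
//
//   at::Tensor z = at::randn({4}, at::kComplexFloat);
//   at::Tensor v = at::var(z);  // real-valued; equals at::var(at::real(z)) + at::var(at::imag(z))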
std::tuple<Tensor&, Tensor&> var_mean_out(
    Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim,
    int64_t correction, bool keepdim) {
  return std_var_mean_out(
      "var_mean", result1, result2, self, dim, correction, keepdim, false);
}

static TensorOptions options_to_value_type(TensorOptions opts) {
  auto scalar_type = typeMetaToScalarType(opts.dtype());
  return opts.dtype(c10::toRealValueType(scalar_type));
}

std::tuple<Tensor, Tensor> var_mean(
    const Tensor& self, at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction, bool keepdim) {
  Tensor result1 = at::empty({0}, options_to_value_type(self.options()));
  Tensor result2 = at::empty({0}, self.options());
  return std_var_mean_out(
      "var_mean", result1, result2, self, dim, correction, keepdim, false);
}

std::tuple<Tensor, Tensor> std_mean(
    const Tensor& self, at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction, bool keepdim) {
  Tensor result1 = at::empty({0}, options_to_value_type(self.options()));
  Tensor result2 = at::empty({0}, self.options());
  return std_var_mean_out(
      "std_mean", result1, result2, self, dim, correction, keepdim, true);
}

Tensor var(const Tensor& self, bool unbiased) {
  return at::var(
      self, /*dim=*/std::nullopt,
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0));
}

Tensor var(const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
  return at::var(
      self, /*dim=*/at::OptionalIntArrayRef(dim),
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0),
      keepdim);
}

Tensor& var_out(const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim, Tensor& result) {
  return at::var_out(
      result, self, /*dim=*/at::OptionalIntArrayRef(dim),
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0),
      keepdim);
}

Tensor std(const Tensor& self, bool unbiased) {
  return at::std(
      self, /*dim=*/std::nullopt,
      /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0));
}

Tensor std(const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
  return at::std(self, dim,
                 /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0), keepdim);
}

Tensor& std_out(const Tensor& self, at::OptionalIntArrayRef opt_dim, bool unbiased, bool keepdim, Tensor& result) {
  return at::std_out(result, self, opt_dim,
                     /*correction=*/std::make_optional<Scalar>(unbiased ? 1 : 0), keepdim);
}
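// Illustrative note: the deprecated `unbiased` overloads above are thin shims
// that translate the flag into a correction value (unbiased=true -> correction=1,
// unbiased=false -> correction=0) and dispatch back to the correction-based
// overloads defined just below; e.g. at::std(x, /*unbiased=*/true) is roughly
// equivalent to at::std(x, /*dim=*/std::nullopt, /*correction=*/c10::Scalar(1),
// /*keepdim=*/false).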
Tensor std(const Tensor& self, at::OptionalIntArrayRef dim,
           const std::optional<Scalar>& correction, bool keepdim) {
  Tensor result = at::empty({0}, options_to_value_type(self.options()));
  return std_var_out("std", result, self, dim, correction, keepdim, true);
}

Tensor& std_out(
    const Tensor& self, at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction, bool keepdim, Tensor& result) {
  return std_var_out("std", result, self, dim, correction, keepdim, true);
}

Tensor& var_out(
    const Tensor& self, at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction, bool keepdim, Tensor& result) {
  return std_var_out("var", result, self, dim, correction, keepdim, false);
}

Tensor var(
    const Tensor& self, at::OptionalIntArrayRef dim,
    const std::optional<Scalar>& correction, bool keepdim) {
  Tensor result = at::empty({0}, options_to_value_type(self.options()));
  return std_var_out("var", result, self, dim, correction, keepdim, false);
}

Tensor std(const Tensor& self, DimnameList dim, bool unbiased, bool keepdim) {
  return at::std(self, dimnames_to_positions(self, dim), unbiased, keepdim);
}

Tensor& std_out(const Tensor& self, DimnameList dim, bool unbiased, bool keepdim, Tensor& result) {
  return at::std_out(result, self, dimnames_to_positions(self, dim), unbiased, keepdim);
}

Tensor var(const Tensor& self, DimnameList dim, bool unbiased, bool keepdim) {
  return at::var(self, dimnames_to_positions(self, dim), unbiased, keepdim);
}

Tensor& var_out(const Tensor& self, DimnameList dim, bool unbiased, bool keepdim, Tensor& result) {
  return at::var_out(
      result, self, dimnames_to_positions(self, dim), unbiased, keepdim);
}

std::tuple<Tensor, Tensor> var_mean(const Tensor& self, DimnameList dim, bool unbiased, bool keepdim) {
  return at::var_mean(self, dimnames_to_positions(self, dim), unbiased, keepdim);
}

std::tuple<Tensor, Tensor> std_mean(const Tensor& self, DimnameList dim, bool unbiased, bool keepdim) {
  return at::std_mean(self, dimnames_to_positions(self, dim), unbiased, keepdim);
}

Tensor std(const Tensor& self, DimnameList dim, const std::optional<Scalar>& correction, bool keepdim) {
  return at::std(self, dimnames_to_positions(self, dim), correction, keepdim);
}

Tensor& std_out(const Tensor& self, DimnameList dim, const std::optional<Scalar>& correction,
                bool keepdim, Tensor& result) {
  return at::std_out(result, self, dimnames_to_positions(self, dim), correction, keepdim);
}

Tensor var(const Tensor& self, DimnameList dim, const std::optional<Scalar>& correction, bool keepdim) {
  return at::var(self, dimnames_to_positions(self, dim), correction, keepdim);
}

Tensor& var_out(const Tensor& self, DimnameList dim, const std::optional<Scalar>& correction,
                bool keepdim, Tensor& result) {
  return at::var_out(
      result, self, dimnames_to_positions(self, dim), correction, keepdim);
}

std::tuple<Tensor, Tensor> var_mean(const Tensor& self, DimnameList dim,
                                    const std::optional<Scalar>& correction, bool keepdim) {
  return at::var_mean(self, dimnames_to_positions(self, dim), correction, keepdim);
}

std::tuple<Tensor, Tensor> std_mean(const Tensor& self, DimnameList dim,
                                    const std::optional<Scalar>& correction, bool keepdim) {
  return at::std_mean(self, dimnames_to_positions(self, dim), correction, keepdim);
}

Tensor& norm_out(const Tensor& self, const std::optional<Scalar>& p, DimnameList dim,
                 bool keepdim, ScalarType dtype, Tensor& result) {
  return at::norm_out(result, self, p, dimnames_to_positions(self, dim), keepdim, dtype);
}

Tensor& norm_out(const Tensor& self, const std::optional<Scalar>& p, DimnameList dim,
                 bool keepdim, Tensor& result) {
  return at::norm_out(result, self, p, dimnames_to_positions(self, dim), keepdim);
}
Tensor norm(const Tensor& self, const std::optional<Scalar>& p, DimnameList dim,
            bool keepdim, ScalarType dtype) {
  return at::norm(self, p, dimnames_to_positions(self, dim), keepdim, dtype);
}

Tensor norm(const Tensor& self, const std::optional<Scalar>& p, DimnameList dim, bool keepdim) {
  return at::norm(self, p, dimnames_to_positions(self, dim), keepdim);
}

Tensor any(const Tensor& self, Dimname dim, bool keepdim) {
  reportNYIDimnameOverload("any");
}

Tensor& any_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& result) {
  reportNYIDimnameOverload("any");
}

Tensor all(const Tensor& self, Dimname dim, bool keepdim) {
  reportNYIDimnameOverload("all");
}

Tensor& all_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& result) {
  reportNYIDimnameOverload("all");
}

Tensor _is_all_true(const Tensor& self) {
  TORCH_INTERNAL_ASSERT(self.scalar_type() == at::kBool);
  return self.all();
}

Tensor _is_any_true(const Tensor& self) {
  TORCH_INTERNAL_ASSERT(self.scalar_type() == at::kBool);
  return self.any();
}

Tensor logcumsumexp(const Tensor& self, Dimname dim) {
  return at::logcumsumexp(self, dimname_to_position(self, dim));
}

Tensor& logcumsumexp_out(const Tensor& self, Dimname dim, Tensor& result) {
  return at::logcumsumexp_out(result, self, dimname_to_position(self, dim));
}

Tensor cumsum(const Tensor& self, Dimname dim, std::optional<ScalarType> dtype) {
  return at::cumsum(self, dimname_to_position(self, dim), dtype);
}

Tensor& cumsum_(Tensor& self, Dimname dim, std::optional<ScalarType> dtype) {
  return at::cumsum_out(self, self, dimname_to_position(self, dim), dtype);
}

Tensor& cumsum_out(const Tensor& self, Dimname dim, std::optional<ScalarType> dtype, Tensor& result) {
  return at::cumsum_out(result, self, dimname_to_position(self, dim), dtype);
}

Tensor cumprod(const Tensor& self, Dimname dim, std::optional<ScalarType> dtype) {
  return at::cumprod(self, dimname_to_position(self, dim), dtype);
}

Tensor& cumprod_(Tensor& self, Dimname dim, std::optional<ScalarType> dtype) {
  return at::cumprod_out(self, self, dimname_to_position(self, dim), dtype);
}

Tensor& cumprod_out(const Tensor& self, Dimname dim, std::optional<ScalarType> dtype, Tensor& result) {
  return at::cumprod_out(result, self, dimname_to_position(self, dim), dtype);
}

std::tuple<Tensor, Tensor> cummax(const Tensor& self, Dimname dim) {
  return at::cummax(self, dimname_to_position(self, dim));
}

std::tuple<Tensor&, Tensor&> cummax_out(const Tensor& self, Dimname dim, Tensor& values, Tensor& indices) {
  return at::cummax_out(values, indices, self, dimname_to_position(self, dim));
}

std::tuple<Tensor, Tensor> cummin(const Tensor& self, Dimname dim) {
  return at::cummin(self, dimname_to_position(self, dim));
}

std::tuple<Tensor&, Tensor&> cummin_out(const Tensor& self, Dimname dim, Tensor& values, Tensor& indices) {
  return at::cummin_out(values, indices, self, dimname_to_position(self, dim));
}

Tensor dist(const Tensor& self, const Tensor& other, const Scalar& p) {
  return at::norm(self - other, p);
}
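// Illustrative note: as defined above, dist(a, b, p) is simply the p-norm of
// the elementwise difference, i.e. at::dist(a, b, p) == at::norm(a - b, p).
// For example, with a = {0, 0}, b = {3, 4} and p = 2 the result is 5.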
bool cpu_equal(const Tensor& self, const Tensor& other) {
  if (!at::namedinference::are_names_equal(
          self.unsafeGetTensorImpl(), other.unsafeGetTensorImpl())) {
    return false;
  }
  at::NoNamesGuard guard;
  TORCH_CHECK(self.device() == other.device(),
              "Cannot compare two tensors on different devices. Got: ",
              self.device(), " and ", other.device());
  if (!self.is_same_size(other)) {
    return false;
  }
  // Since the flags like neg/conj should be already handled outside the
  // TensorIterator, it should be safe to have the following fast path by
  // ensuring the storage and strides are exactly the same.
  if (self.is_alias_of(other)
      && self.storage_offset() == other.storage_offset()
      && self.dtype() == other.dtype()
      && self.is_contiguous() == other.is_contiguous()
      && self.strides().equals(other.strides())
      // Extra checks to ensure the safety in case cpu_equal is directly called in C++.
      && self.layout() == other.layout()
      && self.is_neg() == other.is_neg()
      && self.is_conj() == other.is_conj()) {
    if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
      return true;
    }
    std::atomic<bool> result{true};
    auto iter = TensorIteratorConfig().add_const_input(self).build();
    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "equal_notnan_cpu", [&] {
      iter.for_each([&](char** data, const int64_t* strides, int64_t dim_size) {
        if (!result) {
          return;
        }
        char* self_data = data[0];
        for (C10_UNUSED const auto i : c10::irange(dim_size)) {
          if (isnan_(c10::load<scalar_t>(self_data))) {
            result = false;
            return;
          }
          self_data += strides[0];
        }
      });
    });
    return result.load();
  }

  std::atomic<bool> result{true};
  auto iter = TensorIteratorConfig()
      .add_const_input(self)
      .add_const_input(other)
      .allow_cpu_scalars(true)
      .promote_inputs_to_common_dtype(true)
      .build();

  AT_DISPATCH_V2(iter.input_dtype(), "equal_cpu", AT_WRAP([&] {
    iter.for_each([&](char** data, const int64_t* strides, int64_t dim_size) {
      if (!result) {
        return;
      }
      char* self_data = data[0];
      char* other_data = data[1];
      for (C10_UNUSED const auto i : c10::irange(dim_size)) {
        if (c10::load<scalar_t>(self_data) != c10::load<scalar_t>(other_data)) {
          result = false;
          return;
        }
        self_data += strides[0];
        other_data += strides[1];
      }
    });
  }), kBool, kBFloat16, kHalf, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));

  return result.load();
}

// max(dim), min(dim), topk(dim), mode(dim), are examples of reduction
// functions that select values. value_selecting_reduction_backward is the
// backward function for those operators; it propagates the grad to the
// specific value locations referred to at `indices`.
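// Illustrative sketch of the scatter performed below (values are hypothetical,
// assuming the usual ATen C++ API): for x = [[1, 5], [3, 2]] and
// (values, indices) = at::max(x, /*dim=*/1), the backward places each incoming
// gradient at its row's argmax position and leaves zeros elsewhere:
//
//   grad_out = [g0, g1]      // one gradient per reduced row
//   grad_in  = [[0, g0],     // column 1 held the row-0 max (5)
//               [g1, 0]]     // column 0 held the row-1 max (3)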
Tensor value_selecting_reduction_backward_symint(
    const Tensor& grad, int64_t dim, const Tensor& indices, c10::SymIntArrayRef sizes, bool keepdim) {
  auto inplace_scatter_if_not_tensor_subclass =
      [&](const Tensor& grad_out, const Tensor& indices_) {
        auto grad_in = at::zeros_symint(sizes, grad_out.options());
        if (areAnyTensorSubclassLike({grad, indices})) {
          return grad_in.scatter(dim, indices_, grad_out);
        }
        return grad_in.scatter_(dim, indices_, grad_out);
      };

  if (!keepdim && !sizes.empty()) {
    auto grad_ = grad.unsqueeze(dim);
    auto indices_ = indices.unsqueeze(dim);
    return inplace_scatter_if_not_tensor_subclass(grad_, indices_);
  }
  return inplace_scatter_if_not_tensor_subclass(grad, indices);
}

Tensor sum_csr(const Tensor& self, std::optional<ScalarType> dtype) {
  return self.values().sum(dtype);
}

Tensor sum_coo(const Tensor& self, std::optional<ScalarType> dtype) {
  return self._values().sum(dtype);
}

Tensor sum_sparse_coo(
    const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional<ScalarType> dtype) {
  Tensor result;
  if (dim.has_value()) {
    if (dtype.has_value()) {
      result = at::_sparse_sum(self, *dim, *dtype);
    } else {
      if (c10::isIntegralType(self.scalar_type(), true)) {
        result = at::_sparse_sum(self, *dim, at::kLong);
      } else {
        result = at::_sparse_sum(self, *dim);
      }
    }
  } else {
    result = sum_coo(self, dtype);
  }
  if (keepdim) {
    auto dim_mask = make_dim_mask(dim, self.dim());
    for (int dim = 0; dim < self.dim(); dim++) {
      if (dim_mask[dim]) {
        result = result.unsqueeze(dim);
      }
    }
  }
  return result;
}

Tensor sum_sparse_compressed(
    const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional<ScalarType> dtype) {
  // TODO: The signatures of sum.dim_IntList and _sparse_csr_sum.dim_dtype differ slightly
  // in the second parameter `dim`, which requires converting `dim` before calling into
  // `_sparse_csr_sum`. Aligning the signatures would be a better choice.
  TORCH_CHECK(dim.has_value(), "dim has no value, cannot be used in sum.dim_IntList");
  auto layout = self.layout();
  TORCH_CHECK(
      layout == kSparseCsr,
      "Currently the only compressed sparse format supported for sum.dim_IntList is CSR, but got layout ",
      layout);
  return at::_sparse_csr_sum(self, *dim, keepdim, dtype);
}

} // namespace at::native