// Returns unique elements of input tensor.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS

#include <ATen/core/Tensor.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/Parallel.h>
#include <ATen/native/TensorIterator.h>
#include <c10/util/irange.h>
#include <c10/util/Load.h>

#include <algorithm>
#include <numeric>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_unique2_native.h>
#include <ATen/ops/_unique_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/equal.h>
#include <ATen/ops/narrow.h>
#include <ATen/ops/stack.h>
#include <ATen/ops/unbind.h>
#include <ATen/ops/unique_consecutive_native.h>
#include <ATen/ops/unique_dim_consecutive_native.h>
#include <ATen/ops/unique_dim_native.h>
#include <ATen/ops/zeros.h>
#endif

namespace at::native {

namespace {

// For bool dtype, this unique implementation is adapted from
// UniqueCub.cu, which uses a reduction to count the number of
// true values.
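//
// A minimal worked example (illustrative only): for input
//   {true, false, true, true}
// this kernel produces
//   output          = {false, true}
//   counts          = {1, 3}
//   inverse_indices = {1, 0, 1, 1}
// i.e. false (when present) always precedes true in the output.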
std::tuple<Tensor, Tensor, Tensor> unique_cpu_bool_template(
    const Tensor& self,
    const bool return_inverse,
    const bool return_counts) {
  const Tensor& input = self.contiguous();
  const bool* input_data = input.const_data_ptr<bool>();

  int64_t numel = input.numel();
  Tensor output = at::empty({0}, self.options());
  Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
  Tensor counts = at::empty({0}, self.options().dtype(kLong));

  if (numel == 0) {
    if (return_inverse) {
      inverse_indices.resize_(input.sizes());
    }
    return std::make_tuple(output, inverse_indices, counts);
  }

  int num_threads = at::get_num_threads();
  std::vector<int64_t> num_true_thread(num_threads, 0);

  const int64_t grain_size = at::internal::GRAIN_SIZE;
  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
    int tid = at::get_thread_num();
    for (const auto i : c10::irange(begin, end)) {
      const bool value = c10::load(&input_data[i]);
      if (value) {
        num_true_thread[tid]++;
      }
    }
  });

  // use an int64_t init value so the accumulation does not narrow to int
  int64_t num_true = std::accumulate(num_true_thread.begin(), num_true_thread.end(), static_cast<int64_t>(0));
  int64_t num_false = numel - num_true;
  int num_out = ((num_true > 0) + (num_false > 0));

  constexpr int false_idx = 0;
  const int true_idx = num_false > 0;

  output.resize_({num_out});
  if (return_counts) {
    counts.resize_({num_out});
  }
  bool* output_data = output.data_ptr<bool>();
  int64_t* counts_data = return_counts ? counts.data_ptr<int64_t>() : nullptr;

  // write output and counts
  if (num_false > 0) {
    output_data[false_idx] = false;
    if (return_counts) {
      counts_data[false_idx] = num_false;
    }
  }
  if (num_true > 0) {
    output_data[true_idx] = true;
    if (return_counts) {
      counts_data[true_idx] = num_true;
    }
  }

  if (return_inverse) {
    inverse_indices.resize_(input.sizes());
    int64_t* inverse_indices_data = inverse_indices.data_ptr<int64_t>();
    at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
      for (const auto i : c10::irange(begin, end)) {
        const bool value = c10::load(&input_data[i]);
        inverse_indices_data[i] = value ? true_idx : false_idx;
      }
    });
  }
  return std::make_tuple(output, inverse_indices, counts);
}

// checks whether the element at index i is `unique`;
// in a sorted sequence, the 1st element is always unique.
//
// NaN is sorted to the rear of a sorted sequence;
// consider a sorted sequence of
//   {1.0, 1.0, 2.0, 2.0, NaN, NaN, NaN}
//
// a. `equal_nan` == true will give:
//   {T,   F,   T,   F,   T,   F,   F  }
//
// b. `equal_nan` == false will give:
//   {T,   F,   T,   F,   T,   T,   T  }
//
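// Reading the masks above, a unique pass would keep {1.0, 2.0, NaN}
// when `equal_nan` == true and {1.0, 2.0, NaN, NaN, NaN} when it is
// false (an illustrative reading of the table, not extra behavior).
// The entry points below only instantiate the `equal_nan` == false case.
//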
template <typename scalar_t, bool equal_nan>
struct IsUnique {};

template <typename scalar_t>
struct IsUnique<scalar_t, false> {
  inline bool operator() (scalar_t* data_ptr, int64_t i) {
    if (i == 0) { return true; }
    return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]);
  }
};

template <typename scalar_t>
struct IsUnique<scalar_t, true> {
  inline bool operator() (scalar_t* data_ptr, int64_t i) {
    if (i == 0) { return true; }
    return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]))
        && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1]));
  }
};

// NB: Unique implementation using sort
//
// The whole algorithm is taken from NumPy's numpy/lib/arraysetops.py:
// first sort the input sequence, then reduce it to consecutive uniques.
//
// One improvement over the NumPy version: `inverse_indices` and
// `counts` are computed in a fused parallel loop, which makes that
// part almost free.
//
// This kernel also implements an `equal_nan` flag with the same
// semantics as NumPy's unique. Currently it is always disabled.
//
// TODO: add `bool` specialization, use similar approach as UniqueCub
//
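// A minimal worked example (illustrative only): for input {2, 1, 2, 3},
//   input_sorted = {1, 2, 2, 3}, indices = {1, 0, 2, 3}
// and this kernel produces
//   output          = {1, 2, 3}
//   counts          = {1, 2, 1}
//   inverse_indices = {1, 0, 1, 2}
//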
template <typename scalar_t, typename CompareOp>
std::tuple<Tensor, Tensor, Tensor> unique_cpu_sorted_template(
    const Tensor& self,
    const bool return_inverse,
    const bool return_counts,
    CompareOp is_unique) {
  const Tensor& input = self.contiguous();

  int64_t numel = input.numel();
  Tensor output = at::empty({0}, self.options());
  Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
  Tensor counts = at::empty({0}, self.options().dtype(kLong));

  if (numel == 0) {
    if (return_inverse) {
      inverse_indices.resize_(input.sizes());
    }
    return std::make_tuple(output, inverse_indices, counts);
  }

  // index of the first unique element in each consecutive section;
  // this is used to compute counts in parallel
  Tensor unique_index = at::empty({0}, self.options().dtype(kLong));

  // the original behavior of unique on a scalar tensor is to return
  // an output of size [1]; `flatten` here does the job
  auto input_flattened = input.flatten();

  auto [input_sorted, indices] = input_flattened.sort();

  scalar_t* input_sorted_data = input_sorted.data_ptr<scalar_t>();
  int64_t* indices_data = indices.data_ptr<int64_t>();

  int num_threads = at::get_num_threads();
  std::vector<int64_t> unique_count_thread(num_threads, 0);
  std::vector<int64_t> offset_thread(num_threads, 0);

  const int64_t grain_size = at::internal::GRAIN_SIZE;

  // count the uniques seen by each thread
  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
    int tid = at::get_thread_num();
    for (const auto i : c10::irange(begin, end)) {
      if (is_unique(input_sorted_data, i)) {
        unique_count_thread[tid]++;
      }
    }
  });

  // calculate each thread's offset into the output;
  // after the loop, `unique_count` holds the total number of uniques
  int64_t unique_count = 0;
  for (const auto t : c10::irange(num_threads)) {
    offset_thread[t] = unique_count;
    unique_count += unique_count_thread[t];
  }

  output.resize_({unique_count});
  scalar_t* output_data = output.data_ptr<scalar_t>();

  int64_t* inverse_indices_data = nullptr;
  if (return_inverse) {
    inverse_indices.resize_(input.sizes());
    inverse_indices_data = inverse_indices.data_ptr<int64_t>();
  }

  int64_t* counts_data = nullptr;
  int64_t* unique_index_data = nullptr;
  if (return_counts) {
    counts.resize_({unique_count});
    counts_data = counts.data_ptr<int64_t>();

    unique_index.resize_({unique_count + 1});
    unique_index_data = unique_index.data_ptr<int64_t>();
    unique_index_data[unique_count] = numel;
  }

  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
    int tid = at::get_thread_num();
    int64_t offset = offset_thread[tid];

    for (const auto i : c10::irange(begin, end)) {
      if (is_unique(input_sorted_data, i)) {
        output_data[offset] = c10::load(&input_sorted_data[i]);
        if (return_counts) {
          unique_index_data[offset] = i;
        }
        offset++;
      }

      if (return_inverse) {
        int64_t inverse_index = offset - 1;
        int64_t perm = indices_data[i];
        inverse_indices_data[perm] = inverse_index;
      }
    }
  });

  if (return_counts) {
    // diff adjacent unique indices to get counts
    at::parallel_for(0, unique_count, grain_size, [&](int64_t begin, int64_t end) {
      for (const auto i : c10::irange(begin, end)) {
        counts_data[i] = unique_index_data[i + 1] - unique_index_data[i];
      }
    });
  }
  return std::make_tuple(output, inverse_indices, counts);
}

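// Unique for "consecutive" mode: a single sequential pass that, like
// std::unique, collapses runs of equal adjacent values while recording
// counts and inverse indices.
//
// A minimal worked example (illustrative only): for input {1, 1, 2, 1, 1},
//   output          = {1, 2, 1}
//   counts          = {2, 1, 2}
//   inverse_indices = {0, 0, 1, 2, 2}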
template <typename scalar_t>
std::tuple<Tensor, Tensor, Tensor> unique_consecutive_cpu_template(
    const Tensor& self,
    const bool return_inverse,
    const bool return_counts) {
  const Tensor& input = self.contiguous();
  const scalar_t* input_data = input.const_data_ptr<scalar_t>();
  int64_t numel = input.numel();
  Tensor output = at::empty({numel}, input.options());
  Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
  Tensor counts = at::empty({0}, self.options().dtype(kLong));

  if (return_inverse) {
    inverse_indices.resize_(input.sizes());
  }

  if (numel > 0) {
    scalar_t *output_data = output.data_ptr<scalar_t>();
    int64_t *inverse_data = inverse_indices.data_ptr<int64_t>();
    int64_t *counts_data = nullptr;
    scalar_t last_value = c10::load(input_data);
    *output_data = last_value;

    if (return_counts) {
      counts.resize_({numel});
      counts_data = counts.data_ptr<int64_t>();
    }
    scalar_t *p = output_data;
    int64_t *q = counts_data;
    int64_t last = 0;
    if (return_inverse) {
      inverse_data[0] = 0;
    }
    for (const auto i : c10::irange(1, numel)) {
      const auto value = c10::load(&input_data[i]);
      if (value != last_value) {
        *(++p) = value;
        last_value = value;
        if (return_counts) {
          *(q++) = i - last;
          last = i;
        }
      }
      if (return_inverse) {
        inverse_data[i] = p - output_data;
      }
    }
    int64_t output_size = p - output_data + 1;
    if (return_counts) {
      *q = numel - last;
      counts.resize_({output_size});
    }
    output.resize_({output_size});
  }

  return std::make_tuple(output, inverse_indices, counts);
}

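// std::unique-style pass over a range of tensor slices: keeps the first
// slice of each run of equal consecutive slices, records the run lengths
// in `counts`, and scatters each run's index into `inverse_indices_vec`
// at the original positions stored in `indices`. Returns the new logical
// end of the range, like std::unique.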
template<class ForwardIt>
ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last,
  std::vector<int64_t>& indices, Tensor inverse_indices_vec, Tensor counts) {
    if (first == last) {
      return last;
    }

    TORCH_INTERNAL_ASSERT(inverse_indices_vec.is_contiguous(),
        "_unique_dim_cpu_impl only support contiguous inverse_indices_vec");
    TORCH_INTERNAL_ASSERT(counts.is_contiguous(),
        "_unique_dim_cpu_impl only support contiguous counts");

    int64_t *indices_data = indices.data();
    int64_t *inverse_data = inverse_indices_vec.data_ptr<int64_t>();
    int64_t *counts_data = counts.data_ptr<int64_t>();

    ForwardIt result = first;
    ForwardIt previous = first;
    int64_t *current_counts = counts_data;
    inverse_data[*(indices_data++)] = 0;
    for (ForwardIt current = std::next(first); current != last; current++) {
      if (!at::equal(*current, *result)) {
        *(++result) = std::move(*current);
        *(current_counts++) = std::distance(previous, current);
        previous = current;
      }
      inverse_data[*(indices_data++)] = std::distance(first, result);
    }
    *current_counts = std::distance(previous, last);
    return ++result;
  }

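// Unique over a given dimension: views the tensor as a list of slices
// along `dim`, lexicographically sorts the slices (unless `consecutive`),
// then reduces equal adjacent slices with _unique_dim_cpu_impl above.
//
// A minimal worked example (illustrative only): for dim = 0 and
//   self = {{1, 2}, {1, 2}, {3, 4}}
// the non-consecutive path produces
//   output          = {{1, 2}, {3, 4}}
//   inverse_indices = {0, 0, 1}
//   counts          = {2, 1}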
template <typename scalar_t>
std::tuple<Tensor, Tensor, Tensor> _unique_dim_cpu_template(
    const Tensor& self,
    const int64_t dim,
    const bool consecutive,
    const bool return_inverse,
    const bool return_counts) {

    auto sizes = self.sizes().vec();
    // check how many zero-sized dimensions exist
    auto num_zero_dims = std::count(sizes.begin(), sizes.end(), 0);

    // the selected dimension is zero-sized; unique can only be applied
    // if this is the tensor's sole zero-sized dimension
    if (self.size(dim) == 0){
      TORCH_CHECK(
          num_zero_dims == 1,
          "Number of zero sized dimensions is more than one, so unique cannot be applied ")
      Tensor output = at::empty(sizes, self.options());
      Tensor inverse_indices =
          at::empty({0}, self.options().dtype(kLong));
      Tensor counts = at::empty({0}, self.options().dtype(kLong));

      return std::make_tuple(output, inverse_indices, counts);
    }

    TORCH_CHECK(num_zero_dims == 0,
    "There are 0 sized dimensions, and they aren't selected, so unique cannot be applied");

  // reshape tensor as [dim, -1]
  Tensor input_flat = self.moveaxis(dim, 0);
  auto orig_sizes = input_flat.sizes().vec();
  input_flat = input_flat.contiguous().view({input_flat.size(0), -1});

  std::vector<int64_t> indices(input_flat.size(0));
  std::iota(indices.begin(), indices.end(), 0);
  int64_t numel = input_flat.size(1);
  const scalar_t* input_flat_ptr = ((const scalar_t*)input_flat.const_data_ptr());

  // lexicographically sort the slice indices using the slice data
  if (!consecutive) {
    std::sort(indices.begin(), indices.end(),
      [&](int64_t a, int64_t b) -> bool {
        for (const auto i : c10::irange(numel)) {
          scalar_t lhs = c10::load(&input_flat_ptr[i + a * numel]);
          scalar_t rhs = c10::load(&input_flat_ptr[i + b * numel]);
          if (lhs < rhs) {
            return true;
          } else if (lhs > rhs) {
            return false;
          }
        }
        return false;
      });
  }

  Tensor input_sorted;
  if (!consecutive) {
    input_sorted = at::empty(input_flat.sizes(), input_flat.options());
    for (const auto i : c10::irange(indices.size())) {
      input_sorted[i] = input_flat[indices[i]];
    }
  } else {
    input_sorted = input_flat;
  }

  Tensor inverse_indices = at::empty(indices.size(), self.options().dtype(kLong));
  Tensor counts = at::zeros(indices.size(), self.options().dtype(kLong));
  std::vector<Tensor> input_unbind = at::unbind(input_sorted, 0);
  auto last = _unique_dim_cpu_impl(
    input_unbind.begin(), input_unbind.end(), indices, inverse_indices, counts);
  input_unbind.erase(last, input_unbind.end());
  counts = at::narrow(counts, 0, 0, input_unbind.size());

  // reshape back
  auto output = at::stack(input_unbind, 0);
  auto new_sizes = std::vector<int64_t>(std::move(orig_sizes));
  new_sizes[0] = -1;
  output = output.view(new_sizes);
  output = output.moveaxis(0, dim);

  return std::make_tuple(output, inverse_indices, counts);
}

} // namespace

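// CPU entry points for the unique family of native ops. A hedged usage
// sketch from the C++ side (assuming the generated at::_unique2 wrapper
// and at::tensor are available to the caller; illustrative only):
//
//   at::Tensor t = at::tensor({2, 1, 2, 3});
//   auto [values, inverse, counts] = at::_unique2(
//       t, /*sorted=*/true, /*return_inverse=*/true, /*return_counts=*/true);
//   // values = {1, 2, 3}, inverse = {1, 0, 1, 2}, counts = {1, 2, 1}
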
std::tuple<Tensor, Tensor>
_unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) {
  if (self.scalar_type() == kBool) {
    auto [output, inverse, _] = unique_cpu_bool_template(
        self, return_inverse, /* return_counts */false);
    return std::make_tuple(output, inverse);
  }
  return AT_DISPATCH_V2(self.scalar_type(), "unique", [&] AT_WRAP({
    // The current CPU implementation of unique always sorts,
    // because sorting is faster than using a hash table here
    auto [output, inverse, _] = unique_cpu_sorted_template<scalar_t>(
        self, return_inverse, /* return_counts */false, IsUnique<scalar_t, /* equal_nan */false>());
    return std::make_tuple(output, inverse);
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
_unique2_cpu(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
  if (self.scalar_type() == kBool) {
    return unique_cpu_bool_template(self, return_inverse, return_counts);
  }
  return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] {
    // The current CPU implementation of unique always sorts,
    // because sorting is faster than using a hash table here
    return unique_cpu_sorted_template<scalar_t>(
        self, return_inverse, return_counts, IsUnique<scalar_t, /* equal_nan */ false>());
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) {
  return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] {
    // The current implementation using `dim` always sorts, since tensors are not hashable
    return _unique_dim_cpu_template<scalar_t>(self, dim, false, return_inverse, return_counts);
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) {
  return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] {
    return _unique_dim_cpu_template<scalar_t>(self, dim, true, return_inverse, return_counts);
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, std::optional<int64_t> dim) {
  if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) {
    return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] {
      return unique_consecutive_cpu_template<scalar_t>(self, return_inverse, return_counts);
    }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
  }
  return unique_dim_consecutive_cpu(self, dim.value(), return_inverse, return_counts);
}

}  // namespace at::native