// Returns unique elements of input tensor.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS

#include <ATen/core/Tensor.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/Parallel.h>
#include <ATen/native/TensorIterator.h>
#include <c10/util/irange.h>
#include <c10/util/Load.h>

#include <algorithm>
#include <numeric>
#include <vector>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_unique2_native.h>
#include <ATen/ops/_unique_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/equal.h>
#include <ATen/ops/narrow.h>
#include <ATen/ops/stack.h>
#include <ATen/ops/unbind.h>
#include <ATen/ops/unique_consecutive_native.h>
#include <ATen/ops/unique_dim_consecutive_native.h>
#include <ATen/ops/unique_dim_native.h>
#include <ATen/ops/zeros.h>
#endif

namespace at::native {

namespace {

// This bool specialization of unique mirrors UniqueCub.cu, which uses a
// reduction to count the number of true values.
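//
// For example (illustrative only): an input of {true, false, true, true}
// yields num_true = 3 and num_false = 1, so output = {false, true},
// counts = {1, 3} and inverse_indices = {1, 0, 1, 1}.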
std::tuple<Tensor, Tensor, Tensor> unique_cpu_bool_template(
    const Tensor& self,
    const bool return_inverse,
    const bool return_counts) {
  const Tensor& input = self.contiguous();
  const bool* input_data = input.const_data_ptr<bool>();

  int64_t numel = input.numel();
  Tensor output = at::empty({0}, self.options());
  Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
  Tensor counts = at::empty({0}, self.options().dtype(kLong));

  if (numel == 0) {
    if (return_inverse) {
      inverse_indices.resize_(input.sizes());
    }
    return std::make_tuple(output, inverse_indices, counts);
  }

  int num_threads = at::get_num_threads();
  std::vector<int64_t> num_true_thread(num_threads, 0);

  const int64_t grain_size = at::internal::GRAIN_SIZE;
  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
    int tid = at::get_thread_num();
    for (const auto i : c10::irange(begin, end)) {
      const bool value = c10::load(&input_data[i]);
      if (value) {
        num_true_thread[tid]++;
      }
    }
  });

  int64_t num_true = std::accumulate(
      num_true_thread.begin(), num_true_thread.end(), static_cast<int64_t>(0));
  int64_t num_false = numel - num_true;
  int num_out = ((num_true > 0) + (num_false > 0));

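  // Sorted order places false before true: false (when present) occupies
  // slot 0 of the output, and true goes in the slot after it.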
  constexpr int false_idx = 0;
  const int true_idx = num_false > 0;

  output.resize_({num_out});
  if (return_counts) {
    counts.resize_({num_out});
  }
  bool* output_data = output.data_ptr<bool>();
  int64_t* counts_data = return_counts ? counts.data_ptr<int64_t>() : nullptr;

  // write output and counts
  if (num_false > 0) {
    output_data[false_idx] = false;
    if (return_counts) {
      counts_data[false_idx] = num_false;
    }
  }
  if (num_true > 0) {
    output_data[true_idx] = true;
    if (return_counts) {
      counts_data[true_idx] = num_true;
    }
  }

  if (return_inverse) {
    inverse_indices.resize_(input.sizes());
    int64_t* inverse_indices_data = inverse_indices.data_ptr<int64_t>();
    at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
      for (const auto i : c10::irange(begin, end)) {
        const bool value = c10::load(&input_data[i]);
        inverse_indices_data[i] = value ? true_idx : false_idx;
      }
    });
  }
  return std::make_tuple(output, inverse_indices, counts);
}

// Check whether the element at index i is `unique`:
// in a sorted sequence, the 1st element is always unique.
//
// NaN is sorted to the rear of a sorted sequence;
// consider the sorted sequence
//   {1.0, 1.0, 2.0, 2.0, NaN, NaN, NaN}
//
// a. `equal_nan` == true gives:
//   {T,   F,   T,   F,   T,   F,   F  }
//
// b. `equal_nan` == false gives:
//   {T,   F,   T,   F,   T,   T,   T  }
//
template <typename scalar_t, bool equal_nan>
struct IsUnique {};

template <typename scalar_t>
struct IsUnique<scalar_t, false> {
  inline bool operator() (scalar_t* data_ptr, int64_t i) {
    if (i == 0) { return true; }
    return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]);
  }
};

template <typename scalar_t>
struct IsUnique<scalar_t, true> {
  inline bool operator() (scalar_t* data_ptr, int64_t i) {
    if (i == 0) { return true; }
    return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]))
        && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1]));
  }
};

// NB: Unique implementation using sort
//
// The overall algorithm follows NumPy's numpy/lib/arraysetops.py,
// which first sorts the input sequence and then reduces it to
// consecutive uniques.
//
// One improvement over the NumPy version: `inverse_indices` and
// `counts` are computed in a fused, parallel loop, which makes that
// part essentially free.
//
// This kernel also implements an `equal_nan` flag with the same
// semantics as NumPy's unique. Currently it is always disabled.
//
// TODO: add a `bool` specialization, using a similar approach to UniqueCub
//
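// Illustrative example: for input {2, 1, 2, 3} the sorted sequence is
// {1, 2, 2, 3} with sort indices {1, 0, 2, 3}; the IsUnique flags are
// {T, T, F, T}, so output = {1, 2, 3}, counts = {1, 2, 1}, and
// inverse_indices = {1, 0, 1, 2}, scattered back through the sort
// permutation.
//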
template <typename scalar_t, typename CompareOp>
std::tuple<Tensor, Tensor, Tensor> unique_cpu_sorted_template(
    const Tensor& self,
    const bool return_inverse,
    const bool return_counts,
    CompareOp is_unique) {
  const Tensor& input = self.contiguous();

  int64_t numel = input.numel();
  Tensor output = at::empty({0}, self.options());
  Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
  Tensor counts = at::empty({0}, self.options().dtype(kLong));

  if (numel == 0) {
    if (return_inverse) {
      inverse_indices.resize_(input.sizes());
    }
    return std::make_tuple(output, inverse_indices, counts);
  }

  // index of the first element in each consecutive section of uniques;
  // used to compute counts in parallel
  Tensor unique_index = at::empty({0}, self.options().dtype(kLong));

  // the original behavior of unique on a scalar (0-dim) tensor is to
  // return an output of size [1]; `flatten` handles that here
  auto input_flattened = input.flatten();

  auto [input_sorted, indices] = input_flattened.sort();

  scalar_t* input_sorted_data = input_sorted.data_ptr<scalar_t>();
  int64_t* indices_data = indices.data_ptr<int64_t>();

  int num_threads = at::get_num_threads();
  std::vector<int64_t> unique_count_thread(num_threads, 0);
  std::vector<int64_t> offset_thread(num_threads, 0);

  const int64_t grain_size = at::internal::GRAIN_SIZE;

  // count the uniques seen by each thread
  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
    int tid = at::get_thread_num();
    for (const auto i : c10::irange(begin, end)) {
      if (is_unique(input_sorted_data, i)) {
        unique_count_thread[tid]++;
      }
    }
  });

  // compute each thread's write offset into the output;
  // after the loop `unique_count` holds the total number of uniques
  int64_t unique_count = 0;
  for (const auto t : c10::irange(num_threads)) {
    offset_thread[t] = unique_count;
    unique_count += unique_count_thread[t];
  }

  output.resize_({unique_count});
  scalar_t* output_data = output.data_ptr<scalar_t>();

  int64_t* inverse_indices_data = nullptr;
  if (return_inverse) {
    inverse_indices.resize_(input.sizes());
    inverse_indices_data = inverse_indices.data_ptr<int64_t>();
  }

  int64_t* counts_data = nullptr;
  int64_t* unique_index_data = nullptr;
  if (return_counts) {
    counts.resize_({unique_count});
    counts_data = counts.data_ptr<int64_t>();

    unique_index.resize_({unique_count + 1});
    unique_index_data = unique_index.data_ptr<int64_t>();
    unique_index_data[unique_count] = numel;
  }

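  // Second pass: each thread writes its unique values starting at its
  // precomputed offset, records the start index of each consecutive run
  // (for counts), and scatters inverse indices back through the sort
  // permutation. This relies on both passes partitioning [0, numel)
  // identically (same bounds and grain size), so a chunk sees the same
  // thread id, and thus the same offset, as in the counting pass.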
  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
    int tid = at::get_thread_num();
    int64_t offset = offset_thread[tid];

    for (const auto i : c10::irange(begin, end)) {
      if (is_unique(input_sorted_data, i)) {
        output_data[offset] = c10::load(&input_sorted_data[i]);
        if (return_counts) {
          unique_index_data[offset] = i;
        }
        offset++;
      }

      if (return_inverse) {
        int64_t inverse_index = offset - 1;
        int64_t perm = indices_data[i];
        inverse_indices_data[perm] = inverse_index;
      }
    }
  });

  if (return_counts) {
    // diff adjacent section start indices to get the counts
    at::parallel_for(0, unique_count, grain_size, [&](int64_t begin, int64_t end) {
      for (const auto i : c10::irange(begin, end)) {
        counts_data[i] = unique_index_data[i + 1] - unique_index_data[i];
      }
    });
  }
  return std::make_tuple(output, inverse_indices, counts);
}

template <typename scalar_t>
std::tuple<Tensor, Tensor, Tensor> unique_consecutive_cpu_template(
    const Tensor& self,
    const bool return_inverse,
    const bool return_counts) {
  const Tensor& input = self.contiguous();
  const scalar_t* input_data = input.const_data_ptr<scalar_t>();
  int64_t numel = input.numel();
  Tensor output = at::empty({numel}, input.options());
  Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong));
  Tensor counts = at::empty({0}, self.options().dtype(kLong));

  if (return_inverse) {
    inverse_indices.resize_(input.sizes());
  }

  if (numel > 0) {
    scalar_t *output_data = output.data_ptr<scalar_t>();
    int64_t *inverse_data = inverse_indices.data_ptr<int64_t>();
    int64_t *counts_data = nullptr;
    scalar_t last_value = c10::load(input_data);
    *output_data = last_value;

    if (return_counts) {
      counts.resize_({numel});
      counts_data = counts.data_ptr<int64_t>();
    }
    scalar_t *p = output_data;
    int64_t *q = counts_data;
    int64_t last = 0;
    if (return_inverse) {
      inverse_data[0] = 0;
    }
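    // `p` walks the output, writing the first element of each run;
    // `q` walks the counts, writing the length of each finished run;
    // `last` tracks the start index of the current run.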
    for (const auto i : c10::irange(1, numel)) {
      const auto value = c10::load(&input_data[i]);
      if (value != last_value) {
        *(++p) = value;
        last_value = value;
        if (return_counts) {
          *(q++) = i - last;
          last = i;
        }
      }
      if (return_inverse) {
        inverse_data[i] = p - output_data;
      }
    }
    int64_t output_size = p - output_data + 1;
    if (return_counts) {
      *q = numel - last;
      counts.resize_({output_size});
    }
    output.resize_({output_size});
  }

  return std::make_tuple(output, inverse_indices, counts);
}

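// std::unique-style pass over a range of sorted tensor slices: duplicates are
// compacted in place, `inverse_indices_vec` is filled through the original
// slice order stored in `indices`, `counts` receives the size of each group,
// and the return value points one past the last unique slice.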
template<class ForwardIt>
ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last,
    std::vector<int64_t>& indices, Tensor inverse_indices_vec, Tensor counts) {
  if (first == last) {
    return last;
  }

  TORCH_INTERNAL_ASSERT(inverse_indices_vec.is_contiguous(),
      "_unique_dim_cpu_impl only supports contiguous inverse_indices_vec");
  TORCH_INTERNAL_ASSERT(counts.is_contiguous(),
      "_unique_dim_cpu_impl only supports contiguous counts");

  int64_t *indices_data = indices.data();
  int64_t *inverse_data = inverse_indices_vec.data_ptr<int64_t>();
  int64_t *counts_data = counts.data_ptr<int64_t>();

  ForwardIt result = first;
  ForwardIt previous = first;
  int64_t *current_counts = counts_data;
  inverse_data[*(indices_data++)] = 0;
  for (ForwardIt current = std::next(first); current != last; current++) {
    if (!at::equal(*current, *result)) {
      *(++result) = std::move(*current);
      *(current_counts++) = std::distance(previous, current);
      previous = current;
    }
    inverse_data[*(indices_data++)] = std::distance(first, result);
  }
  *current_counts = std::distance(previous, last);
  return ++result;
}

template <typename scalar_t>
std::tuple<Tensor, Tensor, Tensor> _unique_dim_cpu_template(
    const Tensor& self,
    const int64_t dim,
    const bool consecutive,
    const bool return_inverse,
    const bool return_counts) {

  auto sizes = self.sizes().vec();
  // check how many zero-sized dimensions exist
  auto num_zero_dims = std::count(sizes.begin(), sizes.end(), 0);

  // the tensor is not well formed, as it has zero-sized dimensions
  if (self.size(dim) == 0) {
    TORCH_CHECK(
        num_zero_dims == 1,
        "Number of zero sized dimensions is more than one, so unique cannot be applied ");
    Tensor output = at::empty(sizes, self.options());
    Tensor inverse_indices =
        at::empty({0}, self.options().dtype(kLong));
    Tensor counts = at::empty({0}, self.options().dtype(kLong));

    return std::make_tuple(output, inverse_indices, counts);
  }

  TORCH_CHECK(num_zero_dims == 0,
      "There are 0 sized dimensions, and they aren't selected, so unique cannot be applied");

  // reshape tensor as [dim, -1]
  Tensor input_flat = self.moveaxis(dim, 0);
  auto orig_sizes = input_flat.sizes().vec();
  input_flat = input_flat.contiguous().view({input_flat.size(0), -1});

  std::vector<int64_t> indices(input_flat.size(0));
  std::iota(indices.begin(), indices.end(), 0);
  int64_t numel = input_flat.size(1);
  const scalar_t* input_flat_ptr = static_cast<const scalar_t*>(input_flat.const_data_ptr());

  // lexicographically sort the row indices by the rows' data
  if (!consecutive) {
    std::sort(indices.begin(), indices.end(),
      [&](int64_t a, int64_t b) -> bool {
        for (const auto i : c10::irange(numel)) {
          scalar_t lhs = c10::load(&input_flat_ptr[i + a * numel]);
          scalar_t rhs = c10::load(&input_flat_ptr[i + b * numel]);
          if (lhs < rhs) {
            return true;
          } else if (lhs > rhs) {
            return false;
          }
        }
        return false;
      });
  }

  Tensor input_sorted;
  if (!consecutive) {
    input_sorted = at::empty(input_flat.sizes(), input_flat.options());
    for (const auto i : c10::irange(indices.size())) {
      input_sorted[i] = input_flat[indices[i]];
    }
  } else {
    input_sorted = input_flat;
  }

  Tensor inverse_indices = at::empty(indices.size(), self.options().dtype(kLong));
  Tensor counts = at::zeros(indices.size(), self.options().dtype(kLong));
  std::vector<Tensor> input_unbind = at::unbind(input_sorted, 0);
  auto last = _unique_dim_cpu_impl(
    input_unbind.begin(), input_unbind.end(), indices, inverse_indices, counts);
  input_unbind.erase(last, input_unbind.end());
  counts = at::narrow(counts, 0, 0, input_unbind.size());

  // reshape back
  auto output = at::stack(input_unbind, 0);
  auto new_sizes = std::vector<int64_t>(std::move(orig_sizes));
  new_sizes[0] = -1;
  output = output.view(new_sizes);
  output = output.moveaxis(0, dim);

  return std::make_tuple(output, inverse_indices, counts);
}

} // namespace

std::tuple<Tensor, Tensor>
_unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) {
  if (self.scalar_type() == kBool) {
    auto [output, inverse, _] = unique_cpu_bool_template(
        self, return_inverse, /* return_counts */false);
    return std::make_tuple(output, inverse);
  }
  return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] {
    // The current CPU implementation of unique always sorts,
    // because sorting is faster here than a hash table
    auto [output, inverse, _] = unique_cpu_sorted_template<scalar_t>(
        self, return_inverse, /* return_counts */false, IsUnique<scalar_t, /* equal_nan */false>());
    return std::make_tuple(output, inverse);
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
_unique2_cpu(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
  if (self.scalar_type() == kBool) {
    return unique_cpu_bool_template(self, return_inverse, return_counts);
  }
  return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] {
    // The current CPU implementation of unique always sorts,
    // because sorting is faster here than a hash table
    return unique_cpu_sorted_template<scalar_t>(
        self, return_inverse, return_counts, IsUnique<scalar_t, /* equal_nan */ false>());
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) {
  return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] {
    // The current implementation with `dim` always sorts, since tensor slices are not hashable
    return _unique_dim_cpu_template<scalar_t>(self, dim, false, return_inverse, return_counts);
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) {
  return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] {
    return _unique_dim_cpu_template<scalar_t>(self, dim, true, return_inverse, return_counts);
  }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}

std::tuple<Tensor, Tensor, Tensor>
unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, std::optional<int64_t> dim) {
  if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) {
    return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] {
      return unique_consecutive_cpu_template<scalar_t>(self, return_inverse, return_counts);
    }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
  }
  return unique_dim_consecutive_cpu(self, dim.value(), return_inverse, return_counts);
}

} // namespace at::native