#pragma once

#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <algorithm> // for std::copy, used below
#include <cstddef>
#include <cstdint>
#include <type_traits>

namespace at {

// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
// is used to enable the __restrict__ keyword/modifier for the data
// passed to cuda.
template <typename T>
struct DefaultPtrTraits {
  typedef T* PtrType;
};

#if defined(__CUDACC__) || defined(__HIPCC__)
template <typename T>
struct RestrictPtrTraits {
  typedef T* __restrict__ PtrType;
};
#endif
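
// Illustrative note (not part of this header): the traits only change the
// pointer type the accessor stores, e.g.
//
//   DefaultPtrTraits<float>::PtrType  p0;  // float*
//   RestrictPtrTraits<float>::PtrType p1;  // float* __restrict__ (CUDA/HIP builds only)
//
// __restrict__ promises the compiler that the accessor's data does not alias
// other pointers used in the kernel, which can enable better code generation.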

// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
// to functions and types available there (e.g. IntArrayRef isn't).

// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessorBase {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessorBase(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : data_(data_), sizes_(sizes_), strides_(strides_) {}
  C10_HOST IntArrayRef sizes() const {
    return IntArrayRef(sizes_,N);
  }
  C10_HOST IntArrayRef strides() const {
    return IntArrayRef(strides_,N);
  }
  C10_HOST_DEVICE index_t stride(index_t i) const {
    return strides_[i];
  }
  C10_HOST_DEVICE index_t size(index_t i) const {
    return sizes_[i];
  }
  C10_HOST_DEVICE PtrType data() {
    return data_;
  }
  C10_HOST_DEVICE const PtrType data() const {
    return data_;
  }
protected:
  PtrType data_;
  const index_t* sizes_;
  const index_t* strides_;
};

// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
// `Tensor.accessor<T, N>()`.
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
// indexing on the device uses `TensorAccessor`s.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}

  C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
  }

  C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
  }
};

template<typename T, template <typename U> class PtrTraits, typename index_t>
class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
  C10_HOST_DEVICE T & operator[](index_t i) {
    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
    return this->data_[this->strides_[0]*i];
  }
  C10_HOST_DEVICE const T & operator[](index_t i) const {
    return this->data_[this->strides_[0]*i];
  }
};
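
// Illustrative sketch of typical CPU-side usage (assumes ATen is available and
// uses a hypothetical 2-D float tensor `t`; not part of this header):
//
//   at::Tensor t = at::zeros({5, 7});
//   auto a = t.accessor<float, 2>();               // TensorAccessor<float, 2>
//   for (int64_t i = 0; i < a.size(0); i++) {
//     for (int64_t j = 0; j < a.size(1); j++) {
//       a[i][j] = static_cast<float>(i * 10 + j);  // a[i] is a TensorAccessor<float, 1>
//     }
//   }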

// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used for CUDA `Tensor`s on the host
// and are passed (by value) to device kernels.
// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
// in order to transfer them to the device when calling kernels.
// On the device, indexing a multidimensional tensor yields `TensorAccessor`s.
// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
// Instantiation from data, sizes, strides is only needed on the host, and std::copy isn't available
// on the device, so those functions are host only.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  C10_HOST GenericPackedTensorAccessorBase(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : data_(data_) {
    std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
    std::copy(strides_, strides_ + N, std::begin(this->strides_));
  }

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  C10_HOST GenericPackedTensorAccessorBase(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : data_(data_) {
    for (const auto i : c10::irange(N)) {
      this->sizes_[i] = sizes_[i];
      this->strides_[i] = strides_[i];
    }
  }

  C10_HOST_DEVICE index_t stride(index_t i) const {
    return strides_[i];
  }
  C10_HOST_DEVICE index_t size(index_t i) const {
    return sizes_[i];
  }
  C10_HOST_DEVICE PtrType data() {
    return data_;
  }
  C10_HOST_DEVICE const PtrType data() const {
    return data_;
  }
protected:
  PtrType data_;
  // NOLINTNEXTLINE(*c-arrays*)
  index_t sizes_[N];
  // NOLINTNEXTLINE(*c-arrays*)
  index_t strides_[N];
  C10_HOST void bounds_check_(index_t i) const {
    TORCH_CHECK_INDEX(
        0 <= i && i < index_t{N},
        "Index ",
        i,
        " is not within bounds of a tensor of dimension ",
        N);
  }
};
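
// Illustrative sketch (hypothetical values; not part of this header): the int64_t
// overload above is what lets a 32-bit-indexed accessor be built from a tensor's
// native int64_t sizes/strides, e.g.
//
//   float* data = nullptr;                          // hypothetical data pointer
//   int64_t sizes[2] = {4, 8};
//   int64_t strides[2] = {8, 1};
//   at::GenericPackedTensorAccessorBase<float, 2, at::DefaultPtrTraits, int32_t>
//       acc(data, sizes, strides);                  // sizes/strides narrowed to int32_t
//
// In practice this conversion happens inside `Tensor::packed_accessor32()`, which
// returns the derived `GenericPackedTensorAccessor` defined below.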

template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
    index_t* new_sizes = this->sizes_ + 1;
    index_t* new_strides = this->strides_ + 1;
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
  }

  C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
    const index_t* new_sizes = this->sizes_ + 1;
    const index_t* new_strides = this->strides_ + 1;
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
  }

  /// Returns a PackedTensorAccessor of the same dimension after transposing the
  /// two dimensions given. Does not actually move elements; transposition is
  /// made by permuting the size/stride arrays. If the dimensions are not valid,
  /// asserts.
  C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
      index_t dim1,
      index_t dim2) const {
    this->bounds_check_(dim1);
    this->bounds_check_(dim2);
    GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
        this->data_, this->sizes_, this->strides_);
    std::swap(result.strides_[dim1], result.strides_[dim2]);
    std::swap(result.sizes_[dim1], result.sizes_[dim2]);
    return result;
  }
};
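
// Illustrative sketch of typical CUDA usage (assumes 2-D float CUDA tensors
// `src_t` and `dst_t` and a hypothetical kernel `copy_kernel`; not part of this
// header):
//
//   __global__ void copy_kernel(
//       const at::PackedTensorAccessor32<float, 2, at::RestrictPtrTraits> src,
//       at::PackedTensorAccessor32<float, 2, at::RestrictPtrTraits> dst) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < src.size(0)) {
//       for (int j = 0; j < src.size(1); j++) {
//         dst[i][j] = src[i][j];  // operator[] on the device yields TensorAccessors
//       }
//     }
//   }
//
//   // Host side: the packed accessor copies sizes/strides and is passed by value.
//   int threads = 128;
//   int blocks = static_cast<int>((src_t.size(0) + threads - 1) / threads);
//   copy_kernel<<<blocks, threads>>>(
//       src_t.packed_accessor32<float, 2, at::RestrictPtrTraits>(),
//       dst_t.packed_accessor32<float, 2, at::RestrictPtrTraits>());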

template<typename T, template <typename U> class PtrTraits, typename index_t>
class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_DEVICE T & operator[](index_t i) {
    return this->data_[this->strides_[0] * i];
  }
  C10_DEVICE const T& operator[](index_t i) const {
    return this->data_[this->strides_[0]*i];
  }

  // Same as in the general N-dimensional case, but note that in the
  // 1-dimensional case the returned PackedTensorAccessor will always be an
  // identical copy of the original
  C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
      index_t dim1,
      index_t dim2) const {
    this->bounds_check_(dim1);
    this->bounds_check_(dim2);
    return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
        this->data_, this->sizes_, this->strides_);
  }
};


// Can't put this directly into the macro function args because of commas
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>

// Old name for `GenericPackedTensorAccessor`
template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X)

#undef AT_X

template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
using PackedTensorAccessor32 = GenericPackedTensorAccessor<T, N, PtrTraits, int32_t>;

template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
using PackedTensorAccessor64 = GenericPackedTensorAccessor<T, N, PtrTraits, int64_t>;
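
// Illustrative note (not part of this header): the 32-bit variant is obtained via
// `Tensor::packed_accessor32()` and is generally preferred in kernels because
// 32-bit index arithmetic is cheaper on the GPU; use `Tensor::packed_accessor64()`
// when a tensor may hold more elements than fit in an int32_t. Sketch, with a
// hypothetical tensor `t`:
//
//   auto a32 = t.packed_accessor32<float, 2>();  // PackedTensorAccessor32<float, 2>
//   auto a64 = t.packed_accessor64<float, 2>();  // PackedTensorAccessor64<float, 2>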
} // namespace at