#pragma once

#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <utility>

namespace at {

// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
// is used to enable the __restrict__ keyword/modifier for the data
// passed to CUDA.
template <typename T>
struct DefaultPtrTraits {
  typedef T* PtrType;
};

#if defined(__CUDACC__) || defined(__HIPCC__)
template <typename T>
struct RestrictPtrTraits {
  typedef T* __restrict__ PtrType;
};
#endif

// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
// For CUDA tensors they are used in device code (only). This means that we
// restrict ourselves to functions and types available there (e.g. IntArrayRef isn't).

// The PtrTraits argument is only relevant to CUDA to support `__restrict__` pointers.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessorBase {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessorBase(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : data_(data_), sizes_(sizes_), strides_(strides_) {}
  C10_HOST IntArrayRef sizes() const {
    return IntArrayRef(sizes_, N);
  }
  C10_HOST IntArrayRef strides() const {
    return IntArrayRef(strides_, N);
  }
  C10_HOST_DEVICE index_t stride(index_t i) const {
    return strides_[i];
  }
  C10_HOST_DEVICE index_t size(index_t i) const {
    return sizes_[i];
  }
  C10_HOST_DEVICE PtrType data() {
    return data_;
  }
  C10_HOST_DEVICE const PtrType data() const {
    return data_;
  }

protected:
  PtrType data_;
  const index_t* sizes_;
  const index_t* strides_;
};
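
// Illustrative note (a sketch, not part of the API): an accessor describes the
// usual strided layout, so for a 2-dimensional accessor `a` the element at
// logical position (i, j) lives at
//
//   a.data()[i * a.stride(0) + j * a.stride(1)]
//
// The chained operator[] of the derived classes below computes exactly this
// offset, one dimension at a time.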

// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
// `Tensor.accessor<T, N>()`.
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
// indexing on the device uses `TensorAccessor`s.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessor : public TensorAccessorBase<T, N, PtrTraits, index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : TensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
        this->data_ + this->strides_[0] * i, this->sizes_ + 1, this->strides_ + 1);
  }

  C10_HOST_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
        this->data_ + this->strides_[0] * i, this->sizes_ + 1, this->strides_ + 1);
  }
};

template<typename T, template <typename U> class PtrTraits, typename index_t>
class TensorAccessor<T, 1, PtrTraits, index_t> : public TensorAccessorBase<T, 1, PtrTraits, index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : TensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
  C10_HOST_DEVICE T& operator[](index_t i) {
    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
    return this->data_[this->strides_[0] * i];
  }
  C10_HOST_DEVICE const T& operator[](index_t i) const {
    return this->data_[this->strides_[0] * i];
  }
};
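
// Example (a sketch of typical host-side use; `at::rand` and the
// `Tensor::accessor<T, N>()` factory mentioned above are assumed to be
// available via ATen, they are not declared in this header):
//
//   at::Tensor t = at::rand({4, 8});
//   auto a = t.accessor<float, 2>();   // TensorAccessor<float, 2>
//   float v = a[1][2];                 // a[1] is a TensorAccessor<float, 1>
//   a[1][2] = 0.f;                     // writes through to t's storage
//
// The accessor only borrows the tensor's data/sizes/strides pointers, so the
// tensor must outlive the accessor.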

// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on
// the host for CUDA `Tensor`s.
// In contrast to `TensorAccessor`s, they copy the sizes and strides on
// instantiation (on the host) in order to transfer them to the device when
// calling kernels.
// On the device, indexing of multidimensional tensors yields `TensorAccessor`s.
// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to
// be marked as __restrict__.
// Instantiation from data, sizes, strides is only needed on the host and
// std::copy isn't available on the device, so those functions are host only.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  C10_HOST GenericPackedTensorAccessorBase(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : data_(data_) {
    std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
    std::copy(strides_, strides_ + N, std::begin(this->strides_));
  }

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  C10_HOST GenericPackedTensorAccessorBase(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : data_(data_) {
    for (const auto i : c10::irange(N)) {
      this->sizes_[i] = sizes_[i];
      this->strides_[i] = strides_[i];
    }
  }

  C10_HOST_DEVICE index_t stride(index_t i) const {
    return strides_[i];
  }
  C10_HOST_DEVICE index_t size(index_t i) const {
    return sizes_[i];
  }
  C10_HOST_DEVICE PtrType data() {
    return data_;
  }
  C10_HOST_DEVICE const PtrType data() const {
    return data_;
  }

protected:
  PtrType data_;
  // NOLINTNEXTLINE(*c-arrays*)
  index_t sizes_[N];
  // NOLINTNEXTLINE(*c-arrays*)
  index_t strides_[N];

  C10_HOST void bounds_check_(index_t i) const {
    TORCH_CHECK_INDEX(
        0 <= i && i < index_t{N},
        "Index ",
        i,
        " is not within bounds of a tensor of dimension ",
        N);
  }
};

template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
    index_t* new_sizes = this->sizes_ + 1;
    index_t* new_strides = this->strides_ + 1;
    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
        this->data_ + this->strides_[0] * i, new_sizes, new_strides);
  }

  C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
    const index_t* new_sizes = this->sizes_ + 1;
    const index_t* new_strides = this->strides_ + 1;
    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
        this->data_ + this->strides_[0] * i, new_sizes, new_strides);
  }

  /// Returns a PackedTensorAccessor of the same dimension after transposing the
  /// two dimensions given. Does not actually move elements; transposition is
  /// made by permuting the size/stride arrays. If the dimensions are not valid,
  /// asserts.
  C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
      index_t dim1,
      index_t dim2) const {
    this->bounds_check_(dim1);
    this->bounds_check_(dim2);
    GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
        this->data_, this->sizes_, this->strides_);
    std::swap(result.strides_[dim1], result.strides_[dim2]);
    std::swap(result.sizes_[dim1], result.sizes_[dim2]);
    return result;
  }
};
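
// Example of the intended device-side flow (a sketch only; the kernel name and
// the `Tensor::packed_accessor32` factory are assumptions of this example, not
// declared in this header). The accessor is passed to the kernel by value, so
// its copied sizes_/strides_ arrays travel with the kernel arguments:
//
//   __global__ void scale_kernel(
//       at::GenericPackedTensorAccessor<float, 2, at::RestrictPtrTraits, int32_t> a,
//       float c) {
//     const int i = blockIdx.x;
//     const int j = threadIdx.x;
//     if (i < a.size(0) && j < a.size(1)) {
//       a[i][j] *= c;  // a[i] is a device-side TensorAccessor<float, 1>
//     }
//   }
//
//   // host: scale_kernel<<<t.size(0), t.size(1)>>>(
//   //     t.packed_accessor32<float, 2, at::RestrictPtrTraits>(), 2.0f);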

template<typename T, template <typename U> class PtrTraits, typename index_t>
class GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> : public GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_DEVICE T& operator[](index_t i) {
    return this->data_[this->strides_[0] * i];
  }
  C10_DEVICE const T& operator[](index_t i) const {
    return this->data_[this->strides_[0] * i];
  }

  // Same as in the general N-dimensional case, but note that in the
  // 1-dimensional case the returned PackedTensorAccessor will always be an
  // identical copy of the original.
  C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
      index_t dim1,
      index_t dim2) const {
    this->bounds_check_(dim1);
    this->bounds_check_(dim2);
    return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
        this->data_, this->sizes_, this->strides_);
  }
};
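
// transpose() sketch (illustrative only): no element data moves, only the
// copied size/stride metadata is permuted, so the transposed accessor views
// the same storage with the two dimensions exchanged.
//
//   // auto at = a.transpose(0, 1);   // a is an N >= 2 dimensional accessor
//   // at.size(0) == a.size(1)        // sizes are swapped
//   // on the device, at[j][i] refers to the same element as a[i][j]
//
// For the 1-dimensional specialization above, transpose(0, 0) passes the
// bounds check and simply returns a copy.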

// Can't put this directly into the macro function args because of commas
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>

// Old name for `GenericPackedTensorAccessor`
template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X)

#undef AT_X

template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
using PackedTensorAccessor32 = GenericPackedTensorAccessor<T, N, PtrTraits, int32_t>;

template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
using PackedTensorAccessor64 = GenericPackedTensorAccessor<T, N, PtrTraits, int64_t>;

} // namespace at