/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
 * \brief Compatibility with C++
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 80

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief The device type in DLDevice.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory allocated by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used to quickly test extension devices.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. A call to the oneAPI runtime is required to determine the device
   * type, the USM allocation type, and the sycl context it is bound to.
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
  /*! \brief Microsoft AI Accelerator */
  kDLMAIA = 17,
} DLDeviceType;

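/*
 * Illustrative sketch, not part of the DLPack API: mapping a DLDeviceType to a
 * printable name, e.g. for diagnostics. The helper name and the subset of
 * enumerators covered are this sketch's own choices.
 */
static inline const char* DLDeviceTypeName(DLDeviceType device_type) {
  switch (device_type) {
    case kDLCPU: return "cpu";
    case kDLCUDA: return "cuda";
    case kDLCUDAHost: return "cuda_host";
    case kDLROCM: return "rocm";
    case kDLOneAPI: return "oneapi";
    default: return "unknown";  /* remaining device types elided for brevity */
  }
}
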
/*!
 * \brief A Device for Tensor and operator.
 */
typedef struct {
  /*! \brief The device type used in the device. */
  DLDeviceType device_type;
  /*!
   * \brief The device index.
   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
   */
  int32_t device_id;
} DLDevice;

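/*
 * Illustrative sketch, not part of the DLPack API: a consumer importing a
 * tensor typically checks the producer's device against one it supports.
 * The helper name here is this sketch's own.
 */
static inline int DLDeviceEqual(DLDevice a, DLDevice b) {
  return a.device_type == b.device_type && a.device_id == b.device_id;
}
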
/*!
 * \brief The type code options of DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
} DLDataTypeCode;

/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow
 * the native endianness. An explicit error message should be raised when
 * attempting to export an array with non-native endianness.
 *
 *  Examples
 *   - float: type_code = 2, bits = 32, lanes = 1
 *   - float4 (vectorized 4x float): type_code = 2, bits = 32, lanes = 4
 *   - int8: type_code = 0, bits = 8, lanes = 1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
 *     convention, the underlying storage size of bool is 8 bits)
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of the DLDataTypeCode enum values.
   */
  uint8_t code;
  /*!
   * \brief Number of bits; common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;

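/*
 * Illustrative sketch, not part of the DLPack API: building the example
 * encodings listed above. The constructor name is this sketch's own.
 */
static inline DLDataType DLDataTypeMake(uint8_t code, uint8_t bits, uint16_t lanes) {
  DLDataType dtype;
  dtype.code = code;
  dtype.bits = bits;
  dtype.lanes = lanes;
  return dtype;
}
/*
 * Usage, mirroring the examples in the comment above:
 *   DLDataType f32   = DLDataTypeMake(kDLFloat, 32, 1);    // float
 *   DLDataType f32x4 = DLDataTypeMake(kDLFloat, 32, 4);    // float4
 *   DLDataType c64   = DLDataTypeMake(kDLComplex, 64, 1);  // std::complex<float>
 */
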
/*!
 * \brief Plain C Tensor object, does not manage memory.
 */
typedef struct {
  /*!
   * \brief The data pointer points to the allocated data. This will be a CUDA
   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
   * types. This pointer is always aligned to 256 bytes as in CUDA. The
   * `byte_offset` field should be used to point to the beginning of the data.
   *
   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
   * TVM, perhaps others) do not adhere to this 256-byte alignment requirement
   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
   * (after which this note will be updated); at the moment it is recommended
   * to not rely on the data pointer being correctly aligned.
   *
   * For a given DLTensor, the size of the memory required to store the
   * contents of data is calculated as follows:
   *
   * \code{.c}
   * static inline size_t GetDataSize(const DLTensor* t) {
   *   size_t size = 1;
   *   for (int32_t i = 0; i < t->ndim; ++i) {
   *     size *= t->shape[i];
   *   }
   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
   *   return size;
   * }
   * \endcode
   */
  void* data;
  /*! \brief The device of the tensor */
  DLDevice device;
  /*! \brief Number of dimensions */
  int32_t ndim;
  /*! \brief The data type of the pointer */
  DLDataType dtype;
  /*! \brief The shape of the tensor */
  const int64_t* shape;
  /*!
   * \brief Strides of the tensor (in number of elements, not bytes);
   * can be NULL, indicating the tensor is compact and row-major.
   */
  const int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
} DLTensor;

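/*
 * Illustrative sketch, not part of the DLPack API: computing the address of
 * one element, honoring `strides` (element counts; NULL means compact
 * row-major) and `byte_offset`. Assumes lanes == 1 and bits a multiple of 8;
 * the helper name is this sketch's own.
 */
static inline void* DLTensorElementPtr(const DLTensor* t, const int64_t* index) {
  int64_t offset = 0;  /* offset from the start of data, in elements */
  if (t->strides != NULL) {
    for (int32_t i = 0; i < t->ndim; ++i) {
      offset += index[i] * t->strides[i];
    }
  } else {
    for (int32_t i = 0; i < t->ndim; ++i) {
      offset = offset * t->shape[i] + index[i];  /* compact row-major layout */
    }
  }
  return (char*)t->data + t->byte_offset + (uint64_t)offset * (t->dtype.bits / 8);
}
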
/*!
 * \brief C Tensor object that manages the memory of a DLTensor. This data
 *  structure is intended to facilitate the borrowing of a DLTensor by another
 *  framework. It is not meant to transfer the tensor. When the borrowing
 *  framework no longer needs the tensor, it should call the deleter to notify
 *  the host that the resource is no longer needed.
 */
typedef struct DLManagedTensor {
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
  /*! \brief The context of the original host framework of the DLManagedTensor,
   *   in which the DLManagedTensor is used. It can also be NULL.
   */
  void * manager_ctx;
  /*! \brief Destructor signature void (*)(struct DLManagedTensor *) - this
   *   should be called to destruct the manager_ctx which holds the
   *   DLManagedTensor. It can be NULL if there is no way for the caller to
   *   provide a reasonable destructor. The destructor also deletes the
   *   argument `self`.
   */
  void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
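
/*
 * Illustrative sketch, not part of the DLPack API: the consumer-side release
 * contract. A borrower calls the deleter (when non-NULL) exactly once, after
 * which neither the struct nor dl_tensor.data may be touched. The helper name
 * is this sketch's own.
 */
static inline void DLManagedTensorRelease(DLManagedTensor* self) {
  if (self != NULL && self->deleter != NULL) {
    self->deleter(self);  /* the deleter also frees `self` itself */
  }
}
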
#ifdef __cplusplus
}  // DLPACK_EXTERN_C
#endif
#endif  // DLPACK_DLPACK_H_