/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
 * \brief Compatibility with C++
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 80

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief The device type in DLDevice.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used for quickly test extension device
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. Call to oneAPI runtime is required to determine the device
   * type, the USM allocation type and the sycl context it is bound to.
   *
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
  /*! \brief Microsoft AI Accelerator */
  kDLMAIA = 17,
} DLDeviceType;

/*!
 * \brief A Device for Tensor and operator.
 */
typedef struct {
  /*! \brief The device type used in the device. */
  DLDeviceType device_type;
  /*!
   * \brief The device index.
   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
   */
  int32_t device_id;
} DLDevice;

/*!
 * \brief The type code options DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
} DLDataTypeCode;

/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow the
 * native endian-ness. An explicit error message should be raised when attempting to
 * export an array with non-native endianness
 *
 *  Examples
 *   - float: type_code = 2, bits = 32, lanes = 1
 *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
 *   - int8: type_code = 0, bits = 8, lanes = 1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   * */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;

/*!
 * \brief Plain C Tensor object, does not manage memory.
 */
typedef struct {
  /*!
   * \brief The data pointer points to the allocated data. This will be CUDA
   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
   * types. This pointer is always aligned to 256 bytes as in CUDA. The
   * `byte_offset` field should be used to point to the beginning of the data.
   *
   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
   * (after which this note will be updated); at the moment it is recommended
   * to not rely on the data pointer being correctly aligned.
   *
   * For given DLTensor, the size of memory required to store the contents of
   * data is calculated as follows:
   *
   * \code{.c}
   * static inline size_t GetDataSize(const DLTensor* t) {
   *   size_t size = 1;
   *   for (int32_t i = 0; i < t->ndim; ++i) {
   *     size *= t->shape[i];
   *   }
   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
   *   return size;
   * }
   * \endcode
   */
  void* data;
  /*! \brief The device of the tensor */
  DLDevice device;
  /*! \brief Number of dimensions */
  int32_t ndim;
  /*! \brief The data type of the pointer*/
  DLDataType dtype;
  /*! \brief The shape of the tensor */
  const int64_t* shape;
  /*!
   * \brief strides of the tensor (in number of elements, not bytes)
   *  can be NULL, indicating tensor is compact and row-majored.
   */
  const int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
} DLTensor;

/*!
 * \brief C Tensor object, manage memory of DLTensor. This data structure is
 *  intended to facilitate the borrowing of DLTensor by another framework. It is
 *  not meant to transfer the tensor. When the borrowing framework doesn't need
 *  the tensor, it should call the deleter to notify the host that the resource
 *  is no longer needed.
 */
typedef struct DLManagedTensor {
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
  /*! \brief the context of the original host framework of DLManagedTensor in
   *   which DLManagedTensor is used in the framework. It can also be NULL.
   */
  void * manager_ctx;
  /*! \brief Destructor signature void (*)(void*) - this should be called
   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
   *   if there is no way for the caller to provide a reasonable destructor.
   *   The destructor deletes the argument self as well.
   */
  void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
#ifdef __cplusplus
}  // DLPACK_EXTERN_C
#endif
#endif  // DLPACK_DLPACK_H_