#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/csrc/utils/pybind.h>
#if !defined(USE_ROCM)
#include <cuda_profiler_api.h>
#else
#include <hip/hip_runtime_api.h>
#endif

#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

namespace torch::cuda::shared {

#ifdef USE_ROCM
namespace {
hipError_t hipReturnSuccess() {
  return hipSuccess;
}
} // namespace
#endif

void initCudartBindings(PyObject* module) {
  auto m = py::handle(module).cast<py::module>();

  auto cudart = m.def_submodule("_cudart", "libcudart.so bindings");

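  // On the Python side this submodule is reachable via torch.cuda.cudart();
  // the illustrative usage comments below assume that access path.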
  // By splitting the names of these objects into two literals, we prevent the
  // HIP rewrite rules from changing these names when building with HIP.
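  // (Illustrative: a single "cudaOutputMode" literal would otherwise be
  // hipified to "hipOutputMode", changing the Python-visible name.)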

#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12000
  // cudaOutputMode_t is only used by cudaProfilerInitialize, which was
  // removed in CUDA 12.
  py::enum_<cudaOutputMode_t>(
      cudart,
      "cuda"
      "OutputMode")
      .value("KeyValuePair", cudaKeyValuePair)
      .value("CSV", cudaCSV);
#endif

  py::enum_<cudaError_t>(
      cudart,
      "cuda"
      "Error")
      .value("success", cudaSuccess);

  cudart.def(
      "cuda"
      "GetErrorString",
      cudaGetErrorString);
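  // Illustrative Python usage (a sketch):
  //   torch.cuda.cudart().cudaProfilerStart()
  //   ...  # region of interest for an external profiler
  //   torch.cuda.cudart().cudaProfilerStop()
  // Under ROCm these are stubbed to return success (see hipReturnSuccess).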
  cudart.def(
      "cuda"
      "ProfilerStart",
#ifdef USE_ROCM
      hipReturnSuccess
#else
      cudaProfilerStart
#endif
  );
  cudart.def(
      "cuda"
      "ProfilerStop",
#ifdef USE_ROCM
      hipReturnSuccess
#else
      cudaProfilerStop
#endif
  );
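  // cudaHostRegister pins (page-locks) an existing host allocation; the GIL
  // is released in these wrappers since the underlying calls may block.
  // Illustrative Python usage (a sketch, not exact API guidance):
  //   t = torch.empty(1024)
  //   torch.cuda.check_error(torch.cuda.cudart().cudaHostRegister(
  //       t.data_ptr(), t.numel() * t.element_size(), 0))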
  cudart.def(
      "cuda"
      "HostRegister",
      [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t {
        py::gil_scoped_release no_gil;
        return C10_CUDA_ERROR_HANDLED(
            cudaHostRegister((void*)ptr, size, flags));
      });
  cudart.def(
      "cuda"
      "HostUnregister",
      [](uintptr_t ptr) -> cudaError_t {
        py::gil_scoped_release no_gil;
        return C10_CUDA_ERROR_HANDLED(cudaHostUnregister((void*)ptr));
      });
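  // cudaStreamCreate expects the *address* of a cudaStream_t to fill in.
  // Illustrative Python usage via ctypes (a sketch):
  //   s = ctypes.c_void_p()
  //   torch.cuda.check_error(
  //       torch.cuda.cudart().cudaStreamCreate(ctypes.addressof(s)))
  //   torch.cuda.check_error(torch.cuda.cudart().cudaStreamDestroy(s.value))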
  cudart.def(
      "cuda"
      "StreamCreate",
      [](uintptr_t ptr) -> cudaError_t {
        py::gil_scoped_release no_gil;
        return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr));
      });
  cudart.def(
      "cuda"
      "StreamDestroy",
      [](uintptr_t ptr) -> cudaError_t {
        py::gil_scoped_release no_gil;
        return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr));
      });
#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12000
  // cudaProfilerInitialize was removed in CUDA 12:
  // https://forums.developer.nvidia.com/t/cudaprofilerinitialize-is-deprecated-alternative/200776/3
  cudart.def(
      "cuda"
      "ProfilerInitialize",
      cudaProfilerInitialize,
      py::call_guard<py::gil_scoped_release>());
#endif
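  // Illustrative Python usage (a sketch):
  //   free_bytes, total_bytes = torch.cuda.cudart().cudaMemGetInfo(0)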
  cudart.def(
      "cuda"
      "MemGetInfo",
      [](c10::DeviceIndex device) -> std::pair<size_t, size_t> {
        c10::cuda::CUDAGuard guard(device);
        size_t device_free = 0;
        size_t device_total = 0;
        py::gil_scoped_release no_gil;
        C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
        return {device_free, device_total};
      });
}

} // namespace torch::cuda::shared