1 #include <ATen/cuda/CUDAContext.h>
2 #include <c10/cuda/CUDACachingAllocator.h>
3 #include <c10/util/CallOnce.h>
4
5 #include <ATen/cuda/CUDAConfig.h>
6 #include <deque>
7 #include <vector>
8
9 namespace at::cuda {
10
11 namespace {
12
13 DeviceIndex num_gpus = -1;
14 c10::once_flag init_flag;
15 std::deque<c10::once_flag> device_flags;
16 std::vector<cudaDeviceProp> device_properties;
17
initCUDAContextVectors()18 void initCUDAContextVectors() {
19 num_gpus = c10::cuda::device_count();
20 device_flags.resize(num_gpus);
21 device_properties.resize(num_gpus);
22 }
23
initDeviceProperty(DeviceIndex device_index)24 void initDeviceProperty(DeviceIndex device_index) {
25 cudaDeviceProp device_prop{};
26 AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
27 device_properties[device_index] = device_prop;
28 }
29
30 } // anonymous namespace
31
32 // We need this function to force the linking against torch_cuda(_cpp) on Windows.
33 // If you need to modify this function, please specify a new function and apply
34 // the changes according to https://github.com/pytorch/pytorch/pull/34288.
35 // Related issue: https://github.com/pytorch/pytorch/issues/31611.
36 /* Device info */
warp_size()37 int warp_size() {
38 return getCurrentDeviceProperties()->warpSize;
39 }
40
getCurrentDeviceProperties()41 cudaDeviceProp* getCurrentDeviceProperties() {
42 auto device = c10::cuda::current_device();
43 return getDeviceProperties(device);
44 }
45
getDeviceProperties(c10::DeviceIndex device)46 cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device) {
47 c10::call_once(init_flag, initCUDAContextVectors);
48 if (device == -1) device = c10::cuda::current_device();
49 AT_ASSERT(device >= 0 && device < num_gpus, "device=", static_cast<int>(device), ", num_gpus=", num_gpus);
50 c10::call_once(device_flags[device], initDeviceProperty, device);
51 return &device_properties[device];
52 }
53
canDeviceAccessPeer(c10::DeviceIndex device,c10::DeviceIndex peer_device)54 bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device) {
55 c10::call_once(init_flag, initCUDAContextVectors);
56 if (device == -1) device = c10::cuda::current_device();
57 AT_ASSERT(device >= 0 && device < num_gpus, "device=", static_cast<int>(device), ", num_gpus=", num_gpus);
58 AT_ASSERT(peer_device >= 0 && peer_device < num_gpus, "peer_device=", static_cast<int>(peer_device), ", num_gpus=", num_gpus);
59 int can_access = 0;
60 AT_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, device, peer_device));
61 return can_access != 0;
62 }
63
getCUDADeviceAllocator()64 Allocator* getCUDADeviceAllocator() {
65 return c10::cuda::CUDACachingAllocator::get();
66 }
67
68 } // namespace at::cuda
69