#pragma once

#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>

#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>

namespace c10::cuda::CUDACachingAllocator {

// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
 public:
  static size_t max_split_size() {
    return instance().m_max_split_size;
  }
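  // Note: configured via e.g.
  // PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.8, a fraction of
  // used memory at which cached blocks start being reclaimed; 0.0 leaves
  // active garbage collection disabled.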
  static double garbage_collection_threshold() {
    return instance().m_garbage_collection_threshold;
  }

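  // Whether the allocator may use expandable segments (growing mapped
  // regions in place); enabled via
  // PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. As the guard below
  // shows, the setting is only honored when the build has CUDA driver API
  // support.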
  static bool expandable_segments() {
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
    if (instance().m_expandable_segments) {
      TORCH_WARN_ONCE("expandable_segments not supported on this platform");
    }
    return false;
#else
    return instance().m_expandable_segments;
#endif
  }

  static bool release_lock_on_cudamalloc() {
    return instance().m_release_lock_on_cudamalloc;
  }

  /** Pinned memory allocator settings */
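  // These correspond to the PYTORCH_CUDA_ALLOC_CONF options parsed below,
  // e.g.
  // PYTORCH_CUDA_ALLOC_CONF=pinned_use_cuda_host_register:True,pinned_num_register_threads:8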
  static bool pinned_use_cuda_host_register() {
    return instance().m_pinned_use_cuda_host_register;
  }

  static size_t pinned_num_register_threads() {
    return instance().m_pinned_num_register_threads;
  }

  static size_t pinned_max_register_threads() {
    // Based on benchmark results, we see better allocation performance with
    // 8 threads. However, future systems may need more threads, so we cap
    // the count at 128.
    return 128;
  }

  // This is used to round up the allocation size to the nearest power-of-2
  // division. See roundup_power2_next_division for more details.
  // For example, to get 4 divisions between each power of 2, set the env
  // variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
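  // A sketch of the resulting behavior, assuming 4 divisions: a request that
  // falls between 1MB and 2MB is rounded up to the nearest of 1.25MB, 1.5MB,
  // 1.75MB, or 2MB rather than all the way to the next power of 2.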
  static size_t roundup_power2_divisions(size_t size);

  static std::vector<size_t> roundup_power2_divisions() {
    return instance().m_roundup_power2_divisions;
  }

  static std::string last_allocator_settings() {
    std::lock_guard<std::mutex> lock(
        instance().m_last_allocator_settings_mutex);
    return instance().m_last_allocator_settings;
  }

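  // Singleton accessor: the config object is allocated once (and never
  // freed) with settings parsed from the PYTORCH_CUDA_ALLOC_CONF environment
  // variable. A minimal usage sketch, assuming a POSIX shell:
  //   export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,roundup_power2_divisions:4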
  static CUDAAllocatorConfig& instance() {
    static CUDAAllocatorConfig* s_instance = ([]() {
      auto inst = new CUDAAllocatorConfig();
      const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
      inst->parseArgs(env);
      return inst;
    })();
    return *s_instance;
  }

  void parseArgs(const char* env);

 private:
  CUDAAllocatorConfig();

  static void lexArgs(const char* env, std::vector<std::string>& config);
  static void consumeToken(
      const std::vector<std::string>& config,
      size_t i,
      const char c);
  size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
  size_t parseGarbageCollectionThreshold(
      const std::vector<std::string>& config,
      size_t i);
  size_t parseRoundUpPower2Divisions(
      const std::vector<std::string>& config,
      size_t i);
  size_t parseAllocatorConfig(
      const std::vector<std::string>& config,
      size_t i,
      bool& used_cudaMallocAsync);
  size_t parsePinnedUseCudaHostRegister(
      const std::vector<std::string>& config,
      size_t i);
  size_t parsePinnedNumRegisterThreads(
      const std::vector<std::string>& config,
      size_t i);

  std::atomic<size_t> m_max_split_size;
  std::vector<size_t> m_roundup_power2_divisions;
  std::atomic<double> m_garbage_collection_threshold;
  std::atomic<size_t> m_pinned_num_register_threads;
  std::atomic<bool> m_expandable_segments;
  std::atomic<bool> m_release_lock_on_cudamalloc;
  std::atomic<bool> m_pinned_use_cuda_host_register;
  std::string m_last_allocator_settings;
  std::mutex m_last_allocator_settings_mutex;
};

// General caching allocator utilities
C10_CUDA_API void setAllocatorSettings(const std::string& env);
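// A minimal usage sketch, assuming the string follows the same option format
// accepted by PYTORCH_CUDA_ALLOC_CONF:
//   c10::cuda::CUDACachingAllocator::setAllocatorSettings(
//       "garbage_collection_threshold:0.6,max_split_size_mb:256");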

} // namespace c10::cuda::CUDACachingAllocator