#pragma once

#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>

#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>

namespace c10::cuda::CUDACachingAllocator {

// Environment config parser.
// Parses the PYTORCH_CUDA_ALLOC_CONF environment variable once (see
// instance()) and exposes each parsed knob through a static accessor.
// Settings can also be re-applied at runtime via parseArgs().
class C10_CUDA_API CUDAAllocatorConfig {
 public:
  // Size threshold (bytes) controlling block splitting; presumably blocks
  // larger than this are not split by the caching allocator — semantics
  // live in the .cpp / allocator proper.
  static size_t max_split_size() {
    return instance().m_max_split_size;
  }

  // Threshold driving garbage collection of cached blocks; assumed to be a
  // fraction in [0, 1] — TODO(review): confirm against parseArgs in the .cpp.
  static double garbage_collection_threshold() {
    return instance().m_garbage_collection_threshold;
  }

  // Whether expandable segments were requested. On builds without driver
  // API support the feature is unavailable: warn once if the user asked for
  // it, and always report false.
  static bool expandable_segments() {
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
    if (instance().m_expandable_segments) {
      TORCH_WARN_ONCE("expandable_segments not supported on this platform")
    }
    return false;
#else
    return instance().m_expandable_segments;
#endif
  }

  // If true, the allocator releases its lock around cudaMalloc calls
  // (exact locking behavior is defined by the allocator implementation).
  static bool release_lock_on_cudamalloc() {
    return instance().m_release_lock_on_cudamalloc;
  }

  /** Pinned memory allocator settings */

  // Whether pinned (page-locked) host allocations should go through
  // cudaHostRegister instead of the default pinned-allocation path.
  static bool pinned_use_cuda_host_register() {
    return instance().m_pinned_use_cuda_host_register;
  }

  // Number of threads used to register pinned memory; configured via env,
  // capped by pinned_max_register_threads().
  static size_t pinned_num_register_threads() {
    return instance().m_pinned_num_register_threads;
  }

  // Upper bound for pinned_num_register_threads().
  static size_t pinned_max_register_threads() {
    // Based on the benchmark results, we see better allocation performance
    // with 8 threads. However on future systems, we may need more threads
    // and limiting this to 128 threads.
    return 128;
  }

  // This is used to round-up allocation size to nearest power of 2 divisions.
  // More description below in function roundup_power2_next_division
  // As an example, if we want 4 divisions between 2's power, this can be done
  // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
  static size_t roundup_power2_divisions(size_t size);

  // Returns the whole divisions table by value (a copy, so callers never
  // observe a concurrent update in place).
  static std::vector<size_t> roundup_power2_divisions() {
    return instance().m_roundup_power2_divisions;
  }

  // Copy of the most recently applied settings string. Guarded by a mutex
  // because std::string cannot be read atomically while parseArgs rewrites it.
  static std::string last_allocator_settings() {
    std::lock_guard<std::mutex> lock(
        instance().m_last_allocator_settings_mutex);
    return instance().m_last_allocator_settings;
  }

  // Lazily-initialized singleton. The instance is heap-allocated and never
  // freed, which avoids destruction-order problems at program exit; the
  // environment variable is read exactly once, on first use.
  static CUDAAllocatorConfig& instance() {
    static CUDAAllocatorConfig* s_instance = ([]() {
      auto inst = new CUDAAllocatorConfig();
      const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
      inst->parseArgs(env);
      return inst;
    })();
    return *s_instance;
  }

  // Parses a settings string (nullptr means "use defaults") and updates the
  // member fields accordingly. Implementation in the .cpp.
  void parseArgs(const char* env);

 private:
  CUDAAllocatorConfig();

  // Tokenizer: splits `env` into tokens appended to `config`.
  static void lexArgs(const char* env, std::vector<std::string>& config);
  // Asserts that token `i` is the single character `c` (e.g. ':' or ',').
  static void consumeToken(
      const std::vector<std::string>& config,
      size_t i,
      const char c);
  // Each parse* helper consumes tokens starting at index `i` and returns an
  // updated token index — presumably the last index consumed; confirm
  // against the .cpp before relying on the exact convention.
  size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
  size_t parseGarbageCollectionThreshold(
      const std::vector<std::string>& config,
      size_t i);
  size_t parseRoundUpPower2Divisions(
      const std::vector<std::string>& config,
      size_t i);
  size_t parseAllocatorConfig(
      const std::vector<std::string>& config,
      size_t i,
      bool& used_cudaMallocAsync);
  size_t parsePinnedUseCudaHostRegister(
      const std::vector<std::string>& config,
      size_t i);
  size_t parsePinnedNumRegisterThreads(
      const std::vector<std::string>& config,
      size_t i);

  // Scalar settings are atomics so the accessors above can read them without
  // taking a lock; the vector and string are the exceptions (the string is
  // mutex-guarded, the vector is returned by copy).
  std::atomic<size_t> m_max_split_size;
  std::vector<size_t> m_roundup_power2_divisions;
  std::atomic<double> m_garbage_collection_threshold;
  std::atomic<size_t> m_pinned_num_register_threads;
  std::atomic<bool> m_expandable_segments;
  std::atomic<bool> m_release_lock_on_cudamalloc;
  std::atomic<bool> m_pinned_use_cuda_host_register;
  std::string m_last_allocator_settings;
  std::mutex m_last_allocator_settings_mutex;
};

// General caching allocator utilities

// Applies a settings string programmatically (same grammar as the
// PYTORCH_CUDA_ALLOC_CONF environment variable).
C10_CUDA_API void setAllocatorSettings(const std::string& env);

} // namespace c10::cuda::CUDACachingAllocator