c10/cuda/CUDAAllocatorConfig.h

*da0073e9SAndroid Build Coastguard Worker#pragma once
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker#include <c10/cuda/CUDAMacros.h>
*da0073e9SAndroid Build Coastguard Worker#include <c10/util/Exception.h>
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker#include <atomic>
*da0073e9SAndroid Build Coastguard Worker#include <cstddef>
*da0073e9SAndroid Build Coastguard Worker#include <cstdlib>
*da0073e9SAndroid Build Coastguard Worker#include <mutex>
*da0073e9SAndroid Build Coastguard Worker#include <string>
*da0073e9SAndroid Build Coastguard Worker#include <vector>
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Workernamespace c10::cuda::CUDACachingAllocator {
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker// Environment config parser
*da0073e9SAndroid Build Coastguard Workerclass C10_CUDA_API CUDAAllocatorConfig {
*da0073e9SAndroid Build Coastguard Worker public:
*da0073e9SAndroid Build Coastguard Worker  static size_t max_split_size() {
*da0073e9SAndroid Build Coastguard Worker    return instance().m_max_split_size;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker  static double garbage_collection_threshold() {
*da0073e9SAndroid Build Coastguard Worker    return instance().m_garbage_collection_threshold;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static bool expandable_segments() {
*da0073e9SAndroid Build Coastguard Worker#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
*da0073e9SAndroid Build Coastguard Worker    if (instance().m_expandable_segments) {
*da0073e9SAndroid Build Coastguard Worker      TORCH_WARN_ONCE("expandable_segments not supported on this platform")
*da0073e9SAndroid Build Coastguard Worker    }
*da0073e9SAndroid Build Coastguard Worker    return false;
*da0073e9SAndroid Build Coastguard Worker#else
*da0073e9SAndroid Build Coastguard Worker    return instance().m_expandable_segments;
*da0073e9SAndroid Build Coastguard Worker#endif
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static bool release_lock_on_cudamalloc() {
*da0073e9SAndroid Build Coastguard Worker    return instance().m_release_lock_on_cudamalloc;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  /** Pinned memory allocator settings */
*da0073e9SAndroid Build Coastguard Worker  static bool pinned_use_cuda_host_register() {
*da0073e9SAndroid Build Coastguard Worker    return instance().m_pinned_use_cuda_host_register;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static size_t pinned_num_register_threads() {
*da0073e9SAndroid Build Coastguard Worker    return instance().m_pinned_num_register_threads;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static size_t pinned_max_register_threads() {
*da0073e9SAndroid Build Coastguard Worker    // Based on the benchmark results, we see better allocation performance
*da0073e9SAndroid Build Coastguard Worker    // with 8 threads. However on future systems, we may need more threads
*da0073e9SAndroid Build Coastguard Worker    // and limiting this to 128 threads.
*da0073e9SAndroid Build Coastguard Worker    return 128;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  // This is used to round-up allocation size to nearest power of 2 divisions.
*da0073e9SAndroid Build Coastguard Worker  // More description below in function roundup_power2_next_division
*da0073e9SAndroid Build Coastguard Worker  // As ane example, if we want 4 divisions between 2's power, this can be done
*da0073e9SAndroid Build Coastguard Worker  // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
*da0073e9SAndroid Build Coastguard Worker  static size_t roundup_power2_divisions(size_t size);
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static std::vector<size_t> roundup_power2_divisions() {
*da0073e9SAndroid Build Coastguard Worker    return instance().m_roundup_power2_divisions;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static std::string last_allocator_settings() {
*da0073e9SAndroid Build Coastguard Worker    std::lock_guard<std::mutex> lock(
*da0073e9SAndroid Build Coastguard Worker        instance().m_last_allocator_settings_mutex);
*da0073e9SAndroid Build Coastguard Worker    return instance().m_last_allocator_settings;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static CUDAAllocatorConfig& instance() {
*da0073e9SAndroid Build Coastguard Worker    static CUDAAllocatorConfig* s_instance = ([]() {
*da0073e9SAndroid Build Coastguard Worker      auto inst = new CUDAAllocatorConfig();
*da0073e9SAndroid Build Coastguard Worker      const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
*da0073e9SAndroid Build Coastguard Worker      inst->parseArgs(env);
*da0073e9SAndroid Build Coastguard Worker      return inst;
*da0073e9SAndroid Build Coastguard Worker    })();
*da0073e9SAndroid Build Coastguard Worker    return *s_instance;
*da0073e9SAndroid Build Coastguard Worker  }
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  void parseArgs(const char* env);
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker private:
*da0073e9SAndroid Build Coastguard Worker  CUDAAllocatorConfig();
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  static void lexArgs(const char* env, std::vector<std::string>& config);
*da0073e9SAndroid Build Coastguard Worker  static void consumeToken(
*da0073e9SAndroid Build Coastguard Worker      const std::vector<std::string>& config,
*da0073e9SAndroid Build Coastguard Worker      size_t i,
*da0073e9SAndroid Build Coastguard Worker      const char c);
*da0073e9SAndroid Build Coastguard Worker  size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
*da0073e9SAndroid Build Coastguard Worker  size_t parseGarbageCollectionThreshold(
*da0073e9SAndroid Build Coastguard Worker      const std::vector<std::string>& config,
*da0073e9SAndroid Build Coastguard Worker      size_t i);
*da0073e9SAndroid Build Coastguard Worker  size_t parseRoundUpPower2Divisions(
*da0073e9SAndroid Build Coastguard Worker      const std::vector<std::string>& config,
*da0073e9SAndroid Build Coastguard Worker      size_t i);
*da0073e9SAndroid Build Coastguard Worker  size_t parseAllocatorConfig(
*da0073e9SAndroid Build Coastguard Worker      const std::vector<std::string>& config,
*da0073e9SAndroid Build Coastguard Worker      size_t i,
*da0073e9SAndroid Build Coastguard Worker      bool& used_cudaMallocAsync);
*da0073e9SAndroid Build Coastguard Worker  size_t parsePinnedUseCudaHostRegister(
*da0073e9SAndroid Build Coastguard Worker      const std::vector<std::string>& config,
*da0073e9SAndroid Build Coastguard Worker      size_t i);
*da0073e9SAndroid Build Coastguard Worker  size_t parsePinnedNumRegisterThreads(
*da0073e9SAndroid Build Coastguard Worker      const std::vector<std::string>& config,
*da0073e9SAndroid Build Coastguard Worker      size_t i);
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker  std::atomic<size_t> m_max_split_size;
*da0073e9SAndroid Build Coastguard Worker  std::vector<size_t> m_roundup_power2_divisions;
*da0073e9SAndroid Build Coastguard Worker  std::atomic<double> m_garbage_collection_threshold;
*da0073e9SAndroid Build Coastguard Worker  std::atomic<size_t> m_pinned_num_register_threads;
*da0073e9SAndroid Build Coastguard Worker  std::atomic<bool> m_expandable_segments;
*da0073e9SAndroid Build Coastguard Worker  std::atomic<bool> m_release_lock_on_cudamalloc;
*da0073e9SAndroid Build Coastguard Worker  std::atomic<bool> m_pinned_use_cuda_host_register;
*da0073e9SAndroid Build Coastguard Worker  std::string m_last_allocator_settings;
*da0073e9SAndroid Build Coastguard Worker  std::mutex m_last_allocator_settings_mutex;
*da0073e9SAndroid Build Coastguard Worker};
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker// General caching allocator utilities
*da0073e9SAndroid Build Coastguard WorkerC10_CUDA_API void setAllocatorSettings(const std::string& env);
*da0073e9SAndroid Build Coastguard Worker
*da0073e9SAndroid Build Coastguard Worker} // namespace c10::cuda::CUDACachingAllocator