xref: /aosp_15_r20/external/pytorch/c10/cuda/CUDACachingAllocator.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#pragma once

#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

namespace c10 {

// The caching allocator executes every registered callback when it is unable
// to find a free block within the already allocated area.
class C10_CUDA_API FreeMemoryCallback {
 public:
  virtual ~FreeMemoryCallback() = default;
  virtual bool Execute() = 0;
};

C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
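
// Example (illustrative sketch; the callback type and registration name are
// hypothetical, not part of this header): a callback registered this way is
// invoked when the caching allocator cannot satisfy a request from the
// already allocated area, giving other subsystems a chance to release memory
// before the allocator gives up.
//
//   struct MyFreeCallback final : public c10::FreeMemoryCallback {
//     bool Execute() override {
//       // Free whatever GPU memory this subsystem can spare; return true if
//       // anything was released so the allocator retries the allocation.
//       return false;
//     }
//   };
//   REGISTER_FREE_MEMORY_CALLBACK(my_free_callback, MyFreeCallback);
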
} // namespace c10

// TODO: Turn this into an honest to goodness class. I briefly attempted to do
// this, but it was a bit irritating to figure out how to also correctly
// apply pimpl pattern so I didn't have to leak any internal implementation
// details in the header (CUDACachingAllocator could be made a pimpl, but
// you also need to appropriately define a class which is a subclass
// of Allocator. Not impossible, but required a bit more surgery than
// I wanted to do at the time.)
//
// Why is this using a namespace rather than old-style THCCachingAllocator_
// prefix?  Mostly because it made the HIPify rules easier to write; _ is
// not counted as a word boundary, so you would otherwise have to list each
// of these functions.

namespace c10::cuda::CUDACachingAllocator {

// Preserved only for BC reasons
// NOLINTNEXTLINE(misc-unused-using-decls)
using c10::CachingDeviceAllocator::DeviceStats;

extern const size_t kLargeBuffer;

typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();

// Struct containing info of an allocation block (i.e. a fractional part of a
// cudaMalloc).
struct BlockInfo {
  size_t size = 0;
  size_t requested_size = 0;
  int32_t gc_counter = 0;
  bool allocated = false;
  bool active = false;
  std::shared_ptr<GatheredContext>
      context_when_allocated; // per-watcher context
};

// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
struct SegmentInfo {
  c10::DeviceIndex device = 0;
  size_t address = 0;
  size_t total_size = 0;
  size_t requested_size = 0; // unrounded, actually requested size
  size_t allocated_size = 0;
  size_t active_size = 0;
  cudaStream_t stream = nullptr;
  bool is_large = false;
  bool is_expandable = false;
  MempoolId_t owner_private_pool_id = {0, 0};
  std::vector<BlockInfo> blocks;
  std::shared_ptr<GatheredContext> context_when_allocated;
};

struct AllocatorState {
  virtual ~AllocatorState() = default;
};

union trace_time_ {
  time_t t_;
  approx_time_t approx_t_;
};

struct TraceEntry {
  enum Action {
    ALLOC, // API call made to the caching allocator for new memory
    FREE_REQUESTED, // API call made to the caching allocator to free memory
    FREE_COMPLETED, // The allocator might have to delay a free because
                    // it is still in use on another stream via record_stream.
                    // This event is generated when a free actually completes.
    SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
    SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
                  // defragment or empty_caches)
    SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments)
    SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments)
    SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
              // events
    OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
        // bytes reported by cuda)
  };
  TraceEntry(
      Action action,
      c10::DeviceIndex device,
      size_t addr,
      size_t size,
      cudaStream_t stream,
      approx_time_t time,
      std::shared_ptr<GatheredContext> context = nullptr)
      : action_(action),
        device_(device),
        addr_(addr),
        context_(std::move(context)),
        stream_(stream),
        size_(size) {
    time_.approx_t_ = time;
  }
  Action action_;
  c10::DeviceIndex device_;
  size_t addr_; // for OOM, this is the amount of free bytes reported by cuda
  std::shared_ptr<GatheredContext> context_;
  cudaStream_t stream_{};
  size_t size_;
  trace_time_ time_{};
};

// Calls made by record_function will save annotations
struct AnnotationEntry {
  AnnotationEntry(c10::DeviceIndex device, approx_time_t time)
      : device_(device) {
    time_.approx_t_ = time;
  }

  void recordUserMetadata(const std::string& name, std::string value) {
    metadata_[name] = std::move(value);
  }

  c10::DeviceIndex device_;
  trace_time_ time_{};
  std::unordered_map<std::string, std::string> metadata_;
};

struct AllocatorConfigInfo {
  double garbage_collection_threshold;
  size_t max_split_size;
  size_t pinned_num_register_threads;
  bool expandable_segments;
  bool release_lock_on_malloc;
  bool pinned_use_host_register;
  std::string last_allocator_settings;
  std::vector<size_t> roundup_power2_divisions;
};

struct SnapshotInfo {
  std::vector<SegmentInfo> segments;
  std::vector<std::vector<TraceEntry>> device_traces;
  std::vector<AnnotationEntry> external_annotations;
  AllocatorConfigInfo config_metadata;
};

// Returns the pointers freed in the pool and the pointers allocated.
// Note: a pointer may appear in both freed and allocated.
struct CheckpointDelta {
  std::vector<void*> ptrs_freed;
  std::vector<at::DataPtr> dataptrs_allocd;
};

enum struct RecordContext {
  NEVER = 0,
  STATE = 1, // only keep stacks for active allocations
  ALLOC = 2, // additionally keep stacks for allocations in the trace history
  ALL = 3, // additionally record stacks for when something is freed
};

using OutOfMemoryObserver = std::function<void(
    int64_t device,
    size_t allocated,
    size_t device_total,
    size_t device_free)>;

using AllocatorTraceTracker = std::function<void(const TraceEntry&)>;

struct ShareableHandle {
  ptrdiff_t offset;
  std::string handle;
};

class CUDAAllocator : public Allocator {
 public:
  virtual void* raw_alloc(size_t nbytes) = 0;
  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
  virtual void raw_delete(void* ptr) = 0;
  virtual void init(int device_count) = 0;
  virtual bool initialized() = 0;
  virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
  virtual void emptyCache() = 0;
  virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
  virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
      c10::DeviceIndex device) = 0;
  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
  virtual SnapshotInfo snapshot() = 0;
  virtual void beginAllocateToPool(
      c10::DeviceIndex device,
      MempoolId_t mempool_id,
      std::function<bool(cudaStream_t)> filter) = 0;
  virtual void endAllocateToPool(
      c10::DeviceIndex device,
      MempoolId_t mempool_id) = 0;
  virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0;
  // Returns true if the allocated blocks are equal to the expected live
  // allocations.
  virtual bool checkPoolLiveAllocations(
      c10::DeviceIndex device,
      MempoolId_t mempool_id,
      const std::unordered_set<void*>& expected_live_allocations) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support checkPoolLiveAllocations. "
        "If you need it, please file an issue describing your use case.");
  }
  virtual ShareableHandle shareIpcHandle(void* ptr) = 0;
  virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
  virtual bool isHistoryEnabled() {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support recordHistory. "
        "If you need it, please file an issue describing your use case.");
  }
  virtual void recordHistory(
      bool enabled,
      CreateContextFn context_recorder,
      size_t alloc_trace_max_entries,
      RecordContext when) = 0;
  virtual void recordAnnotation(
      const std::vector<std::pair<std::string, std::string>>& md) {}
  virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;

  // Attached AllocatorTraceTracker callbacks will be called while the
  // per-device allocator lock is held. Any additional locks taken from within
  // the callback must be proven to always have a lock order that never
  // triggers a deadlock. In particular, Python's GIL may be held when
  // calling the allocator, so it is unsafe to try to acquire the GIL in this
  // callback.
  virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0;

  virtual void enablePeerAccess(
      c10::DeviceIndex dev,
      c10::DeviceIndex dev_to_access) = 0;

  // Memory not allocated from cudaMalloc cannot be copied
  // across devices using cudaMemcpyAsync if peer-to-peer access is disabled;
  // it requires cudaMemcpyPeerAsync instead.
  //  with P2P Enabled, all combinations work
  //  with P2P Disabled:
  //                       cudaMalloc cudaMallocAsync/cuMemMap
  // cudaMemcpyPeerAsync   works      works
  // cudaMemcpyAsync       works      error

  // This function chooses the Peer version of memcpy when required, based on
  // where dst/src were allocated.
  virtual cudaError_t memcpyAsync(
      void* dst,
      int dstDevice,
      const void* src,
      int srcDevice,
      size_t count,
      cudaStream_t stream,
      bool p2p_enabled) = 0;
  virtual std::shared_ptr<AllocatorState> getCheckpointState(
      c10::DeviceIndex device,
      MempoolId_t id) = 0;
  virtual CheckpointDelta setCheckpointPoolState(
      c10::DeviceIndex device,
      std::shared_ptr<AllocatorState> pps) = 0;
  virtual std::string name() = 0;
};

// Allocator object, statically initialized.
// See BackendInitializer in CUDACachingAllocator.cpp.
// Atomic loads on x86 are just normal loads (atomic stores are different),
// so reading this value is no different than loading a pointer.
C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;

inline CUDAAllocator* get() {
  return allocator.load();
}

// Called directly by clients.
inline void* raw_alloc(size_t nbytes) {
  return get()->raw_alloc(nbytes);
}

inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
  return get()->raw_alloc_with_stream(nbytes, stream);
}

inline void raw_delete(void* ptr) {
  return get()->raw_delete(ptr);
}
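
// Example (illustrative sketch): raw allocations return a bare pointer rather
// than a DataPtr, and must be returned to this allocator with raw_delete
// (not cudaFree).
//
//   void* buf = c10::cuda::CUDACachingAllocator::raw_alloc(1024);
//   // ... use buf on the current stream ...
//   c10::cuda::CUDACachingAllocator::raw_delete(buf);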

inline void init(int device_count) {
  return get()->init(device_count);
}

inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
  return get()->setMemoryFraction(fraction, device);
}

inline void emptyCache() {
  return get()->emptyCache();
}

inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
  return get()->cacheInfo(device, largestBlock);
}

inline void* getBaseAllocation(void* ptr, size_t* size) {
  return get()->getBaseAllocation(ptr, size);
}

inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
  return get()->recordStream(dataPtr, stream);
}

inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
    c10::DeviceIndex device) {
  return get()->getDeviceStats(device);
}
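
// Example (illustrative sketch; assumes the Stat/StatType/DeviceStats layout
// declared in c10/core/CachingDeviceAllocator.h): reading aggregate byte
// counters for a device.
//
//   using c10::CachingDeviceAllocator::StatType;
//   const auto stats =
//       c10::cuda::CUDACachingAllocator::getDeviceStats(/*device=*/0);
//   const int64_t allocated =
//       stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current;
//   const int64_t reserved =
//       stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current;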

inline void resetAccumulatedStats(c10::DeviceIndex device) {
  return get()->resetAccumulatedStats(device);
}

inline void resetPeakStats(c10::DeviceIndex device) {
  return get()->resetPeakStats(device);
}

inline SnapshotInfo snapshot() {
  return get()->snapshot();
}
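
// Example (illustrative sketch): walking the segments reported by snapshot().
// Every field used below is declared in SegmentInfo/BlockInfo above.
//
//   const auto info = c10::cuda::CUDACachingAllocator::snapshot();
//   for (const auto& seg : info.segments) {
//     // seg.total_size is the size of the underlying cudaMalloc; seg.blocks
//     // describes how that segment is currently carved up.
//     size_t live_bytes = 0;
//     for (const auto& block : seg.blocks) {
//       if (block.allocated) {
//         live_bytes += block.size;
//       }
//     }
//   }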

inline std::shared_ptr<AllocatorState> getCheckpointState(
    c10::DeviceIndex device,
    MempoolId_t id) {
  return get()->getCheckpointState(device, id);
}

inline CheckpointDelta setCheckpointPoolState(
    c10::DeviceIndex device,
    std::shared_ptr<AllocatorState> pps) {
  return get()->setCheckpointPoolState(device, std::move(pps));
}

// CUDAGraph interactions
inline void beginAllocateToPool(
    c10::DeviceIndex device,
    MempoolId_t mempool_id,
    std::function<bool(cudaStream_t)> filter) {
  get()->beginAllocateToPool(device, mempool_id, std::move(filter));
}

inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
  get()->endAllocateToPool(device, mempool_id);
}

inline void recordHistory(
    bool enabled,
    CreateContextFn context_recorder,
    size_t alloc_trace_max_entries,
    RecordContext when) {
  return get()->recordHistory(
      enabled, context_recorder, alloc_trace_max_entries, when);
}
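
// Example (illustrative sketch): enable allocation history so that later
// snapshot() calls carry per-block context and a bounded trace of events.
// Passing a null context_recorder is expected to record events without
// gathering stack context; the entry count below is an arbitrary choice.
//
//   c10::cuda::CUDACachingAllocator::recordHistory(
//       /*enabled=*/true,
//       /*context_recorder=*/nullptr,
//       /*alloc_trace_max_entries=*/10000,
//       c10::cuda::CUDACachingAllocator::RecordContext::ALLOC);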

inline void recordAnnotation(
    const std::vector<std::pair<std::string, std::string>>& md) {
  return get()->recordAnnotation(md);
}

inline bool isHistoryEnabled() {
  return get()->isHistoryEnabled();
}

inline bool checkPoolLiveAllocations(
    c10::DeviceIndex device,
    MempoolId_t mempool_id,
    const std::unordered_set<void*>& expected_live_allocations) {
  return get()->checkPoolLiveAllocations(
      device, mempool_id, expected_live_allocations);
}

inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
  return get()->attachOutOfMemoryObserver(std::move(observer));
}
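
// Example (illustrative sketch): an observer that warns with allocator state
// right before an OutOfMemoryError is thrown.
//
//   c10::cuda::CUDACachingAllocator::attachOutOfMemoryObserver(
//       [](int64_t device,
//          size_t allocated,
//          size_t device_total,
//          size_t device_free) {
//         TORCH_WARN(
//             "OOM on device ", device, ": ", allocated, " bytes allocated, ",
//             device_free, " of ", device_total, " bytes free");
//       });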

inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
  return get()->attachAllocatorTraceTracker(std::move(tracker));
}
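
// Example (illustrative sketch): per the note on CUDAAllocator above, the
// tracker runs while the per-device allocator lock is held, so it should do
// only lock-free work (and never touch the GIL).
//
//   c10::cuda::CUDACachingAllocator::attachAllocatorTraceTracker(
//       [](const c10::cuda::CUDACachingAllocator::TraceEntry& e) {
//         if (e.action_ == c10::cuda::CUDACachingAllocator::TraceEntry::OOM) {
//           // e.addr_ holds the free bytes reported by CUDA; stash it
//           // somewhere that does not require taking a lock.
//         }
//       });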

inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
  return get()->releasePool(device, mempool_id);
}
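
// Example (illustrative sketch): route allocations made on streams accepted
// by the filter into a private pool, stop routing, and later release the
// pool's cached memory once its allocations have been freed. MemPool is
// declared further down in this header.
//
//   c10::cuda::MemPool pool; // user-created pool
//   const auto pool_id = pool.id();
//   c10::cuda::CUDACachingAllocator::beginAllocateToPool(
//       /*device=*/0, pool_id, [](cudaStream_t) { return true; });
//   // ... allocations on device 0 now land in the private pool ...
//   c10::cuda::CUDACachingAllocator::endAllocateToPool(/*device=*/0, pool_id);
//   c10::cuda::CUDACachingAllocator::releasePool(/*device=*/0, pool_id);
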
// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
  return get()->getIpcDevPtr(std::move(handle));
}

inline ShareableHandle shareIpcHandle(void* ptr) {
  return get()->shareIpcHandle(ptr);
}
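
// Example (illustrative sketch; `ptr`, `received_handle`, and
// `received_offset` are hypothetical names): the exporting process obtains a
// handle and offset for an allocation, and the importing process rebuilds a
// device pointer from the same handle string.
//
//   // exporter
//   auto sh = c10::cuda::CUDACachingAllocator::shareIpcHandle(ptr);
//   // ... send sh.handle and sh.offset to the peer process ...
//
//   // importer
//   std::shared_ptr<void> base =
//       c10::cuda::CUDACachingAllocator::getIpcDevPtr(received_handle);
//   void* imported = static_cast<char*>(base.get()) + received_offset;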

inline std::string name() {
  return get()->name();
}

inline cudaError_t memcpyAsync(
    void* dst,
    int dstDevice,
    const void* src,
    int srcDevice,
    size_t count,
    cudaStream_t stream,
    bool p2p_enabled) {
  return get()->memcpyAsync(
      dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
}
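
// Example (illustrative sketch; `dst`, `src`, `nbytes`, and `stream` are
// hypothetical): copy across devices and let the allocator pick the plain or
// Peer variant of cudaMemcpyAsync based on how dst/src were allocated.
//
//   C10_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
//       dst, /*dstDevice=*/1, src, /*srcDevice=*/0, nbytes, stream,
//       /*p2p_enabled=*/false));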

inline void enablePeerAccess(
    c10::DeviceIndex dev,
    c10::DeviceIndex dev_to_access) {
  return get()->enablePeerAccess(dev, dev_to_access);
}

} // namespace c10::cuda::CUDACachingAllocator

namespace c10::cuda {

// MemPool represents a pool of memory in a caching allocator. Currently,
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
//
// An allocator pointer can be passed to the MemPool to define how the
// allocations should be done in the pool. For example: using a different
// system allocator such as ncclMemAlloc.
struct C10_CUDA_API MemPool {
  MemPool(
      CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
      bool is_user_created = true);

  MempoolId_t id();
  CUDACachingAllocator::CUDAAllocator* allocator();

 private:
  static std::atomic<CaptureId_t> uid_;
  static std::atomic<CaptureId_t> uuid_;
  CUDACachingAllocator::CUDAAllocator* allocator_;
  bool is_user_created_;
  MempoolId_t id_;
};

// MemPoolContext holds the currently active pool and stashes the previous
// pool. On deletion it makes the previous pool active.
struct C10_CUDA_API MemPoolContext {
  MemPoolContext(MemPool* mempool);

  ~MemPoolContext();

  // getActiveMemPool() can be used to get the currently active pool.
  // For instance: in CUDACachingAllocator, we can route allocations
  // to a user provided allocator, by doing:
  //
  //  auto active_pool = MemPoolContext::getActiveMemPool();
  //  if (active_pool && active_pool->allocator()) {
  //    ptr = active_pool->allocator()->raw_alloc(size);
  //  }
  //
  static MemPool* getActiveMemPool();

 private:
  MemPool* prev_mempool_;
};
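
// Example (illustrative sketch; `nccl_allocator` is a hypothetical
// CUDAAllocator* supplied by the caller): make a user-created pool active for
// a scope so that allocations consulting getActiveMemPool() can be served by
// the pool's allocator.
//
//   c10::cuda::MemPool pool(/*allocator=*/nccl_allocator);
//   {
//     c10::cuda::MemPoolContext ctx(&pool);
//     // allocations that consult MemPoolContext::getActiveMemPool() may now
//     // be routed to pool.allocator()
//   } // the previously active pool is restored here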

} // namespace c10::cuda