xref: /aosp_15_r20/external/pytorch/c10/cuda/CUDACachingAllocator.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #include <c10/cuda/CUDACachingAllocator.h>
2 
3 #include <c10/core/impl/GPUTrace.h>
4 #include <c10/cuda/CUDAAllocatorConfig.h>
5 #include <c10/cuda/CUDAException.h>
6 #include <c10/cuda/CUDAFunctions.h>
7 #include <c10/cuda/CUDAGuard.h>
8 #include <c10/util/CallOnce.h>
9 #include <c10/util/Gauge.h>
10 #include <c10/util/ScopeExit.h>
11 #include <c10/util/UniqueVoidPtr.h>
12 #include <c10/util/flat_hash_map.h>
13 #include <c10/util/hash.h>
14 #include <c10/util/llvmMathExtras.h>
15 #include <c10/util/static_tracepoint.h>
16 
17 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
18 #include <c10/cuda/driver_api.h>
19 #include <sys/syscall.h>
20 #include <sys/types.h>
21 #include <unistd.h>
22 #endif
23 
24 #include <c10/util/Exception.h>
25 #include <cuda_runtime_api.h>
26 #include <algorithm>
27 #include <cstddef>
28 #include <cstdint>
29 #include <deque>
30 #include <memory>
31 #include <mutex>
32 #include <regex>
33 #include <set>
34 #include <utility>
35 #include <vector>
36 
37 TORCH_SDT_DEFINE_SEMAPHORE(malloc)
38 TORCH_SDT_DEFINE_SEMAPHORE(free)
39 
40 namespace c10 {
41 
42 C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
43 
44 namespace cuda::CUDACachingAllocator {
45 
46 using namespace c10::CachingDeviceAllocator;
47 
48 // Included here as this is externally used in CUDAAllocatorConfig
49 const size_t kLargeBuffer =
50     20971520; // "large" allocations may be packed in 20 MiB blocks
51 
52 namespace Native {
53 
54 //
55 // Yet another caching allocator for CUDA device allocations.
56 //
57 // - Allocations are associated with a stream. Once freed, blocks can be
58 //   re-allocated on the same stream, but not on any other stream.
59 // - The allocator attempts to find the smallest cached block that will fit the
60 //   requested size. If the block is larger than the requested size, it may be
61 //   split. If no block is found, the allocator will delegate to cudaMalloc.
62 // - If the cudaMalloc fails, the allocator will attempt to free one cached
63 //   block of sufficient size that is not split and retry the allocation.
64 //   If this also fails, the allocator will attempt to free all cached blocks
65 //   that are not split and retry the allocation.
66 // - Large (>1MB) and small allocations are stored in separate pools.
67 //   Small requests are packed into 2MB buffers. Large requests will use the
68 //   smallest available free block or allocate a new block using cudaMalloc.
69 // - To reduce fragmentation, requests between 1MB and 10MB will allocate and
70 //   split a 20MB block, if no free block of sufficient size is available.
71 // - To further reduce fragmentation, blocks >= max_split_size are not allowed
72 //   to be split. These oversize cached blocks will still satisfy requests
73 //   within 1MB of the oversize cached block size.
74 //
75 // With this allocator, allocations and frees should logically be considered
76 // "usages" of the memory segment associated with streams, just like kernel
77 // launches. The programmer must insert the proper synchronization if memory
78 // segments are used from multiple streams.
79 //
80 // The library provides a recordStream() function to help insert the correct
81 // synchronization when allocations are used on multiple streams. This will
82 // ensure that the block is not reused before each recorded stream completes
83 // work.
84 //
85 
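// The snippet below is a hedged sketch of the cross-stream pattern described
// above, using only helpers declared in headers this file already includes
// (CUDACachingAllocator::get()/recordStream(), getStreamFromPool(),
// CUDAStreamGuard). The function name and call site are hypothetical; the
// point is that recordStream() must be called before the allocation is freed
// so the cached block is not handed out again until `side` finishes its work.
[[maybe_unused]] static void crossStreamUseSketch(size_t nbytes) {
  // Allocate on the current (allocation) stream via the caching allocator.
  c10::DataPtr buf = c10::cuda::CUDACachingAllocator::get()->allocate(nbytes);
  c10::cuda::CUDAStream side = c10::cuda::getStreamFromPool();
  {
    c10::cuda::CUDAStreamGuard guard(side);
    // ... enqueue kernels that read/write buf.get() on `side` ...
    // Record the extra stream so the allocator inserts the synchronization
    // described above when the block is eventually freed.
    c10::cuda::CUDACachingAllocator::recordStream(buf, side);
  }
  // When `buf` is destroyed, its block is only reused after the work queued
  // on `side` (up to the recorded event) has completed.
}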
86 /**
87  * Note [Interaction with CUDA graph capture]
88  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
89  * Graph capture performs a dry run of a region of execution, freezing all CUDA
90  * work (and virtual addresses used during that work) into a "graph." The graph
91  * may be "replayed" like a single giant kernel, with greatly reduced CPU
92  * overhead as well as modestly improved GPU performance.
93  *
94  * Because capture bakes in memory addresses, the memory used during capture
95  * must be available for the graph to use during replay. DeviceCachingAllocator
96  * assigns and frees memory eagerly and dynamically, so if we're not careful
97  * about managing graphs' memory, at replay time those memory addresses could be
98  * used by other tensors.
99  *
100  * To guarantee a graph's baked in addresses are safe to reuse in replay,
101  * DeviceAllocator satisfies allocations from a graph-private memory pool during
102  * capture, and doesn't begin cudaFreeing those addresses until the graph is
103  * destroyed.
104  *
105  * Within the private pool, allocations are freed and reassigned as usual during
106  * capture. Memory regions will be used in a consistent order during replay. So
107  * a private pool doesn't use memory more wastefully than the default pools
108  * during capture, but it does reserve its high-water mark of used memory away
109  * from the default pools as long as the capture(s) it served survive
110  * (regardless whether those captures are idle or replaying).
111  *
112  * CUDAGraph's requests for private pools are mediated by
113  * DeviceAllocator::notifyCaptureBegin,
114  *                  notifyCaptureAboutToEnd,
115  *                  notifyCaptureEnded,
116  *                  notifyCaptureDestroy.
117  */
118 
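// A hedged, comment-only sketch of the capture/replay life cycle that drives
// the notifyCapture* callbacks listed above. It assumes the ATen-level
// at::cuda::CUDAGraph wrapper (not usable from c10, hence shown as a comment)
// and that capture runs on a non-default stream; treat the exact method names
// as assumptions.
//
//   at::cuda::CUDAGraph graph;
//   auto stream = at::cuda::getStreamFromPool();
//   {
//     at::cuda::CUDAStreamGuard guard(stream);
//     graph.capture_begin();  // allocator starts serving from a private pool
//     // ... run the region to capture; its allocations are baked into the graph
//     graph.capture_end();    // capture frozen; the pool outlives the capture
//   }
//   graph.replay();           // re-runs the work at the captured addresses
//   graph.reset();            // destroys the graph; its pool becomes freeable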
119 constexpr size_t kMinBlockSize =
120     512; // all sizes are rounded to at least 512 bytes
121 constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
122 constexpr size_t kSmallBuffer =
123     2097152; // "small" allocations are packed in 2 MiB blocks
124 constexpr size_t kMinLargeAlloc =
125     10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
126 constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
127 
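// Hedged sketch of the rounding policy the constants above (and kLargeBuffer)
// encode. The real round_size()/get_allocation_size() helpers later in this
// file also consult allocator config (e.g. power-of-two roundup divisions),
// which this simplified version ignores; the names here are illustrative only.
inline size_t roundRequestSketch(size_t size) {
  // Every request is rounded up to a multiple of kMinBlockSize (512 bytes).
  if (size < kMinBlockSize) {
    return kMinBlockSize;
  }
  return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
}
inline size_t backingAllocationSizeSketch(size_t size) {
  if (size <= kSmallSize) {
    // "small" requests are packed into 2 MiB buffers
    return kSmallBuffer;
  } else if (size < kMinLargeAlloc) {
    // 1-10 MiB requests allocate (and split) a 20 MiB buffer
    return kLargeBuffer;
  } else {
    // larger requests are rounded up to a multiple of 2 MiB
    return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
  }
}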
128 char SHAREABLE_HANDLE_VERSION = 1;
129 enum ShareableHandleType : char {
130   SHAREABLE_CUDA_MALLOC = 'c',
131   SHAREABLE_CUDA_EXPANDABLE_SEGMENT = 'e'
132 };
133 
134 namespace {
135 
136 using stream_set = ska::flat_hash_set<cuda::CUDAStream>;
137 
138 void decrease_stat_array(
139     StatArray& stat_array,
140     size_t amount,
141     const StatTypes& stat_types) {
142   for_each_selected_stat_type(
143       stat_types, [&stat_array, amount](size_t stat_type) {
144         stat_array[stat_type].decrease(amount);
145       });
146 }
147 
148 struct Block;
149 struct PrivatePool;
150 typedef bool (*Comparison)(const Block*, const Block*);
151 static bool BlockComparatorSize(const Block* a, const Block* b);
152 static bool BlockComparatorAddress(const Block* a, const Block* b);
153 
154 struct BlockPool {
155   BlockPool(bool small, PrivatePool* private_pool = nullptr)
156       : blocks(BlockComparatorSize),
157         unmapped(BlockComparatorAddress),
158         is_small(small),
159         owner_PrivatePool(private_pool) {}
160 
161   // Do not insert a Block into blocks directly; use insert_into_blocks()
162   // instead.
163   std::set<Block*, Comparison> blocks;
164   std::set<Block*, Comparison> unmapped;
165   // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
166   const bool is_small;
167   PrivatePool* owner_PrivatePool;
168   int64_t get_free_blocks_call_count{0};
169 
170   // Add a Block into the blocks set, updating the gc counter.
171   std::pair<std::set<Block*, Comparison>::iterator, bool> insert_into_blocks(
172       Block* block);
173 };
174 
175 struct ExpandableSegment;
176 
177 struct Block {
178   c10::DeviceIndex device; // gpu
179   cudaStream_t stream; // allocation stream
180   stream_set stream_uses; // streams on which the block was used
181   size_t size; // block size in bytes
182   size_t requested_size; // memory originally requested
183   BlockPool* pool{nullptr}; // owning memory pool
184   void* ptr{nullptr}; // memory address
185   bool allocated{false}; // in-use flag
186   bool mapped{true}; // is the virtual address range this Block references
187                      // backed by physical pages? Always true when
188                      // expandable_segment_ is null. When false,
189                      // this Block will be aligned to the segment size
190                      // of its expandable_segment_.
191   Block* prev{nullptr}; // prev block if split from a larger allocation
192   Block* next{nullptr}; // next block if split from a larger allocation
193   int event_count{0}; // number of outstanding CUDA events
194   int64_t gc_count_base{0}; // get_free_blocks_call_count when Block is inserted
195   std::shared_ptr<GatheredContext> context_when_allocated;
196   // only set for the first block in the segment (when prev == null)
197   // this records the frame information when cudaMalloc was called
198   // whereas context_when_allocated records the last time we handed this
199   // memory out from our cache.
200   std::shared_ptr<GatheredContext> context_when_segment_allocated;
201 
202   ExpandableSegment* expandable_segment_{nullptr};
203 
204   Block(
205       c10::DeviceIndex device,
206       cudaStream_t stream,
207       size_t size,
208       BlockPool* pool,
209       void* ptr)
210       : device(device),
211         stream(stream),
212         stream_uses(),
213         size(size),
214         requested_size(0),
215         pool(pool),
216         ptr(ptr) {}
217 
218   // constructor for search key
219   Block(c10::DeviceIndex device, cudaStream_t stream, size_t size)
220       : device(device),
221         stream(stream),
222         stream_uses(),
223         size(size),
224         requested_size(0) {}
225 
226   size_t gc_count() {
227     TORCH_INTERNAL_ASSERT(pool);
228     return static_cast<int>(pool->get_free_blocks_call_count - gc_count_base);
229   }
230 
231   bool is_split() const {
232     return (prev != nullptr) || (next != nullptr);
233   }
234   void splice(Block* before, Block* after) {
235     if (before) {
236       TORCH_INTERNAL_ASSERT(before->next == after);
237       before->next = this;
238     }
239     prev = before;
240     if (after) {
241       TORCH_INTERNAL_ASSERT(after->prev == before);
242       after->prev = this;
243     }
244     next = after;
245   }
246 };
247 
248 std::pair<std::set<Block*, Comparison>::iterator, bool> BlockPool::
249     insert_into_blocks(Block* block) {
250   block->gc_count_base = get_free_blocks_call_count;
251   return blocks.insert(block);
252 }
253 
254 struct SegmentRange {
255   char* ptr;
256   size_t size;
257   SegmentRange(void* p, size_t s) : ptr(static_cast<char*>(p)), size(s) {}
258 };
259 
260 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
261 
262 /*
263 Note [Expandable Segments]
264 
265 Rationale
266 
267 For large (>2MB) allocations, the allocator calls cudaMalloc to get allocations
268 that are the same size as what the user requests. In the future, parts of these
269 allocations can be reused for other requests if they are free. This works well
270 when the program makes many requests of exactly the same size or of sizes that
271 are even multiples of that size. Many deep learning models follow this behavior.
272 However, one common exception is when the batch size changes slightly from one
273 iteration to the next, e.g. in batched inference. When the program runs
274 initially with batch size N, it will make allocations appropriate for that size.
275 If in the future, it runs at size N - 1, the existing allocations will still be
276 big enough. However, if it runs at size N + 1, then it will have to make new
277 allocations that are slightly larger. Not all the tensors are the same size.
278 Some might be (N + 1)*A and others (N + 1)*A*B where A and B are some non-batch
279 dimensions in the model. Because the allocator reuses existing allocations when
280 they are big enough, some number of (N + 1)*A allocations will actually fit in
281 the already existing N*B*A segments, though not perfectly. As the model runs it
282 will partially fill up all of these segments leaving unusable free slices of
283 memory at the end of these segments. The allocator at some point will need to
284 cudaMalloc a new (N + 1)*A*B segment. If there is not enough memory, there is
285 now no way to recover the slices of memory that are free at the end of existing
286 segments. With models 50+ layers deep, this pattern might repeat 50+ times
287 creating many slivers.
288 
289 Approach
290 
291 Expandable segments allow the allocator to create a segment initially and then
292 expand its size later when more memory is needed. Instead of making one segment
293 per allocation, it tries to make one segment (per stream) that grows as
294 necessary. Now when the N + 1 case runs, the allocations will tile nicely into
295 the one large segment until it fills up. Then more memory is requested and
296 appended to the end of the segment. This process does not create as many slivers
297 of unusable memory, so it is more likely to succeed at finding this memory.
298 
299 Implementation
300 
301 The expandable_segments:True option is used to enable/disable this behavior. We
302 use cuda's low-level memory APIs, which are similar to mmap, to extend the
303 memory segments. These APIs separate the allocation of physical memory
304 (cuMemCreate) from the allocation of virtual address space (cuMemAddressReserve)
305 and the association between them (cuMemMap/cuMemSetAccess).
306 
307 When we allocate a new segment, we allocate enough address space to map
308 basically the entire physical memory of the GPU (there is 256TiB of address
309 space), but we only map enough physical memory to handle the current amount of
310 memory needed by the program. As more is requested, we add more physical memory
311 to the segment. This can work at the granularity of GPU pages which are 2MiB
312 currently.
313 
314 If we end up out of memory, we can unmap all the memory in our segment
315 corresponding to empty physical pages, and return it to CUDA for use at another
316 address in the segment or in a segment for a different stream.
317 
318 A current limitation of CUDA's API is that physical memory
319 (CUmemGenericAllocationHandle) cannot be split up after it is mapped even if the
320 handle holds multiple GPU pages. The cost to map/unmap memory is proportional to
321 the number of physical memory chunks that were allocated (mapping 10 separately
322 allocated 2MiB pages takes 10x time compared to mapping one 20MiB physical
323 allocation of 10 pages).  Changing memory mappings also appears to involve at
324 least some synchronous actions with the GPU and so should be considered an
325 expensive operation. To limit overhead, we use 2MiB pages for our small pool and
326 20MiB pages for our large pool. Initially, allocation using expandable_segments
327 will be slower than cudaMalloc, though still in the milliseconds range for
328 mapping the entire memory.
329 
330 When mapping new memory to expand the segment, we look for the lowest address at
331 which we can fit a new allocation by adding new pages. Normally this will be at
332 the end of the block. But if we have previously unmapped blocks earlier in the
333 segment during an OOM, we will first try to fill in those gaps to keep the
334 segment as a single block. By allocating at the lowest address we encourage
335 the split up parts of the block to merge into a single block again, reducing
336 fragmentation potential.
337 
338 Allocation of blocks in the segment uses the same best-fit heuristics as the
339 rest of the allocator.
340 
341 Expandable blocks can be enabled/disabled throughout the run of a program. When
342 disabled, the allocator will not put new allocations in an expandable block.
343 
344 Limitations
345 
346 * Slightly slower initial memory allocation speed.
347 * IPC of cuda tensors (e.g. for multiprocess dataloaders) is not supported.
348 However, it is possible to temporarily disable (expandable_segments:False) the
349 behavior when allocating tensors that need to be used cross-process.
350 * CUDA runtime APIs related to sharing memory across processes
351 (cudaDeviceEnablePeerAccess) do not work for memory allocated with cuMemMap.
352 Instead, these mappings have to be done manually. The allocator now has an
353 `enablePeerAccess` method to do this.
354 */
355 
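// Hedged sketch of the reserve -> create -> map -> set-access flow the note
// describes, written against the raw CUDA driver API for brevity. The real
// implementation below goes through DriverAPI::get() (so libcuda is loaded
// dynamically) and checks every call; error handling is elided here and the
// function name is illustrative.
[[maybe_unused]] static void expandableSegmentFlowSketch(
    int device,
    size_t segment_size,
    size_t num_pages) {
  CUdeviceptr base = 0;
  // Reserve virtual address space for the whole segment up front; nothing is
  // backed by physical memory yet.
  cuMemAddressReserve(&base, segment_size * num_pages, 0, 0, 0);

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  CUmemAccessDesc access = {};
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

  for (size_t i = 0; i < num_pages; ++i) {
    // Back one more segment_size-sized page with physical memory and splice
    // it into the reserved range; only the mapped prefix is usable.
    CUmemGenericAllocationHandle handle = 0;
    cuMemCreate(&handle, segment_size, &prop, 0);
    cuMemMap(base + i * segment_size, segment_size, 0, handle, 0);
    cuMemSetAccess(base + i * segment_size, segment_size, &access, 1);
  }
}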
356 struct ExpandableSegment {
357   ExpandableSegment(
358       c10::DeviceIndex device,
359       std::optional<cudaStream_t> stream,
360       size_t address_space_size,
361       size_t segment_size,
362       std::vector<c10::DeviceIndex> peers)
363       : device_(device),
364         stream_(stream),
365         // 2MB for small pool, 20MB for large pool
366         segment_size_(segment_size),
367         max_handles_(numSegments(address_space_size)),
368         peers_(std::move(peers)) {
369     cudaDeviceProp prop{};
370     C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_));
371     // we allocate enough address space for 1 1/8 the total memory on the GPU.
372     // This allows for some cases where we have to unmap pages earlier in the
373     // segment to put them at the end.
374     max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
375     C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressReserve_(
376         &ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
377   }
378   // begin must be aligned to segment_size_.
379   // returns the actual range mapped, which may be
380   // greater than requested if size is not aligned to segment_size_.
381   // a returned size of 0 indicates OOM
382   SegmentRange map(SegmentRange range) {
383     auto begin = segmentLeft(range.ptr);
384     auto end = segmentRight(range.ptr + range.size);
385     TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr);
386     if (begin == end) {
387       return rangeFromHandles(begin, end);
388     }
389     while (end > handles_.size()) {
390       handles_.emplace_back(std::nullopt);
391     }
392     for (auto i : c10::irange(begin, end)) {
393       TORCH_INTERNAL_ASSERT(!handles_.at(i));
394       CUmemGenericAllocationHandle handle = 0;
395       CUmemAllocationProp prop = {};
396       prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
397 #ifndef FBCODE_CAFFE2
398       prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
399 #endif
400       prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
401       // NOLINTNEXTLINE(bugprone-signed-char-misuse)
402       prop.location.id = static_cast<int>(device_);
403       auto status =
404           DriverAPI::get()->cuMemCreate_(&handle, segment_size_, &prop, 0);
405       if (status == CUDA_ERROR_OUT_OF_MEMORY) {
406         for (auto j : c10::irange(begin, i)) {
407           auto h = handles_.at(j).value();
408           handles_.at(j) = std::nullopt;
409           C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
410         }
411         trimHandles();
412         return rangeFromHandles(begin, begin);
413       }
414       C10_CUDA_DRIVER_CHECK(status);
415       handles_.at(i) = Handle{handle, std::nullopt};
416     }
417     mapAndSetAccess(begin, end);
418     return rangeFromHandles(begin, end);
419   }
420 
421   // unmaps all the completely empty segment_size_ segments between
422   // [begin, begin + size), returns the offset where the range begins,
423   // and the actual size unmapped (multiple of segment_size_)
424   SegmentRange unmap(SegmentRange range) {
425     auto begin = segmentRight(range.ptr);
426     auto end = segmentLeft(range.ptr + range.size);
427     if (begin >= end) {
428       return SegmentRange{range.ptr, 0};
429     }
430     unmapHandles(begin, end);
431     return rangeFromHandles(begin, end);
432   }
433 
434   // Set up IPC sharing for range.
435   // Returns the (larger) range that was actually shared.
436   // Serializes data to a std::ostream that can be passed to the
437   // other process, and then restored as an expandable segment
438   // via ExpandableSegment::fromShared(istream);
439   SegmentRange share(SegmentRange range, std::ostream& buf) {
440     auto begin = segmentLeft(range.ptr);
441     auto end = segmentRight(range.ptr + range.size);
442     ShareHeader header{getpid(), segment_size_, end - begin};
443     buf.write((const char*)&header, sizeof(ShareHeader));
444     for (auto i : c10::irange(begin, end)) {
445       auto& handle = handles_.at(i).value();
446       if (!handle.fd) {
447         int fd = 0;
448         C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
449             &fd, handle.handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
450         handle.fd = fd;
451       }
452       int fd = *handle.fd;
453       buf.write((const char*)&fd, sizeof(int));
454     }
455     return rangeFromHandles(begin, end);
456   }
457 
458   static std::unique_ptr<ExpandableSegment> fromShared(
459       c10::DeviceIndex device,
460       std::vector<c10::DeviceIndex> peers,
461       std::istream& buf) {
462     ShareHeader header{};
463     buf.read((char*)&header, sizeof(ShareHeader));
464     auto segment = std::make_unique<ExpandableSegment>(
465         device,
466         std::nullopt,
467         header.num_handles * header.segment_size,
468         header.segment_size,
469         std::move(peers));
470 // older build setups (e.g. multiwheels) do not define these syscall numbers
471 // (added in 2020), but the kernel on the system might still support them.
472 #ifndef SYS_pidfd_open
473 #define SYS_pidfd_open 434
474 #endif
475 #ifndef SYS_pidfd_getfd
476 #define SYS_pidfd_getfd 438
477 #endif
478     auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
479     TORCH_CHECK(
480         pidfd != -1 || errno != ENOSYS,
481         "The kernel on this machine does not support the pidfd_open syscall needed to use IPC for CUDA tensors when expandable_segments:True is set. "
482         "Consider using expandable_segments:False via torch.cuda.memory._set_allocator_settings('expandable_segments:False') for this allocation.");
483     TORCH_CHECK(pidfd != -1, "pidfd_open:", std::strerror(errno));
484     for (auto i : c10::irange(header.num_handles)) {
485       (void)i;
486       int fd = 0;
487       buf.read((char*)&fd, sizeof(int));
488       auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
489       if (myfd == -1) {
490         auto err = errno;
491         close((int)pidfd);
492         for (auto& h : segment->handles_) {
493           C10_CUDA_DRIVER_CHECK(
494               DriverAPI::get()->cuMemRelease_(h.value().handle));
495           h = std::nullopt;
496         }
497         TORCH_CHECK(
498             err != ENOSYS,
499             "The kernel on this machine does not support the pidfd_getfd syscall needed to use IPC for CUDA tensors when expandable_segments:True is set. "
500             "Consider using expandable_segments:False via torch.cuda.memory._set_allocator_settings('expandable_segments:False') for this allocation.");
501         TORCH_CHECK(false, "pidfd_getfd: ", std::strerror(err));
502       }
503       CUmemGenericAllocationHandle handle = 0;
504       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
505           &handle,
506           // NOLINTNEXTLINE(performance-no-int-to-ptr)
507           (void*)(uintptr_t)myfd,
508           CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
509       close((int)myfd);
510       segment->handles_.emplace_back(Handle{handle, std::nullopt});
511     }
512     close((int)pidfd);
513     segment->mapAndSetAccess(0, header.num_handles);
514     return segment;
515   }
516 
517   char* ptr() const {
518     // NOLINTNEXTLINE(performance-no-int-to-ptr)
519     return reinterpret_cast<char*>(ptr_);
520   }
521 
522   size_t size() const {
523     return max_handles_ * segment_size_;
524   }
525 
526   void addPeer(c10::DeviceIndex device) {
527     peers_.push_back(device);
528     forEachAllocatedRange(
529         [&](size_t begin, size_t end) { setAccess(device, begin, end); });
530   }
531 
532   ~ExpandableSegment() {
533     forEachAllocatedRange(
534         [&](size_t begin, size_t end) { unmapHandles(begin, end); });
535     C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressFree_(
536         ptr_, segment_size_ * max_handles_));
537   }
538 
539  private:
540   void setAccess(c10::DeviceIndex device, size_t begin, size_t end) {
541     CUmemAccessDesc desc;
542     desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
543     // NOLINTNEXTLINE(bugprone-signed-char-misuse)
544     desc.location.id = static_cast<int>(device);
545     desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
546     C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemSetAccess_(
547         ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1));
548   }
549 
550   void mapAndSetAccess(size_t begin, size_t end) {
551     for (auto i : c10::irange(begin, end)) {
552       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemMap_(
553           ptr_ + i * segment_size_,
554           segment_size_,
555           0,
556           handles_.at(i).value().handle,
557           0ULL));
558     }
559     setAccess(device_, begin, end);
560     for (auto p : peers_) {
561       setAccess(p, begin, end);
562     }
563   }
564 
565   void unmapHandles(size_t begin, size_t end) {
566     // note: unlike cudaFree, MemUnmap and MemRelease do
567     // not appear to synchronize in all cases, so we have to wait for the
568     // stream to finish before this memory is truly free.
569 
570     // cannot call c10::cuda::stream_synchronize because
571     // it might grab the GIL which can lead to a deadlock
572     // Locking order must be GIL -> Allocator Lock
573     if (stream_) {
574       C10_CUDA_CHECK(cudaStreamSynchronize(*stream_));
575     } else {
576       cuda::CUDAGuard device_guard(device_);
577       C10_CUDA_CHECK(cudaDeviceSynchronize());
578     }
579     for (auto i : c10::irange(begin, end)) {
580       Handle h = handles_.at(i).value();
581       handles_.at(i) = std::nullopt;
582       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemUnmap_(
583           ptr_ + segment_size_ * i, segment_size_));
584       if (h.fd) {
585         close(*h.fd);
586       }
587       C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
588     }
589     trimHandles();
590   }
591   void trimHandles() {
592     while (!handles_.empty() && !handles_.back()) {
593       handles_.pop_back();
594     }
595   }
596   void forEachAllocatedRange(const std::function<void(size_t, size_t)>& fn) {
597     size_t start = 0;
598     for (auto i : c10::irange(handles_.size())) {
599       if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) {
600         start = i;
601       }
602       if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) {
603         fn(start, i + 1);
604       }
605     }
606   }
607   size_t numSegments(size_t size) {
608     return (size + segment_size_ - 1) / segment_size_;
609   }
610   size_t segmentLeft(char* p) {
611     auto size = p - ptr();
612     return size / segment_size_;
613   }
614   size_t segmentRight(char* p) {
615     auto size = p - ptr();
616     return numSegments(size);
617   }
618   SegmentRange rangeFromHandles(size_t begin, size_t end) {
619     return SegmentRange(
620         ptr() + segment_size_ * begin, segment_size_ * (end - begin));
621   }
622   c10::DeviceIndex device_;
623   std::optional<cudaStream_t> stream_;
624   CUdeviceptr ptr_{};
625   size_t segment_size_;
626   size_t max_handles_;
627   struct Handle {
628     CUmemGenericAllocationHandle handle;
629     std::optional<int> fd;
630   };
631   struct ShareHeader {
632     pid_t pid;
633     size_t segment_size;
634     size_t num_handles;
635   };
636   std::vector<std::optional<Handle>> handles_;
637   // devices on which this memory should be mapped in addition
638   // to the device where the physical memory lives (device_).
639   std::vector<c10::DeviceIndex> peers_;
640 };
641 #else
642 struct ExpandableSegment {
643   ExpandableSegment(
644       c10::DeviceIndex device,
645       std::optional<cudaStream_t> stream,
646       size_t address_space_size,
647       size_t segment_size,
648       std::vector<c10::DeviceIndex> peers) {
649     TORCH_INTERNAL_ASSERT(false, "expandable segment not supported");
650   }
651   SegmentRange map(SegmentRange range) {
652     return SegmentRange(nullptr, 0);
653   }
654   SegmentRange unmap(SegmentRange range) {
655     return SegmentRange(nullptr, 0);
656   }
657   SegmentRange share(SegmentRange range, std::ostream& ss) {
658     return SegmentRange(nullptr, 0);
659   }
660   static std::unique_ptr<ExpandableSegment> fromShared(
661       c10::DeviceIndex device,
662       std::vector<c10::DeviceIndex> peers,
663       std::istream& buf) {
664     return {};
665   }
666   char* ptr() const {
667     return nullptr;
668   }
669   size_t size() const {
670     return 0;
671   }
672   void addPeer(c10::DeviceIndex device) {}
673 };
674 #endif
675 
676 // BlockState, BlockPoolState, and PrivatePoolState contain the information
677 // needed to reconstruct a private pool to a previous state. See note
678 // [Checkpointing PrivatePoolState]
679 struct BlockState {
680   c10::DeviceIndex device = 0;
681   cudaStream_t stream = nullptr;
682   stream_set stream_uses = {};
683   size_t size = 0;
684   void* ptr = nullptr;
685   bool allocated = false;
686   int64_t gc_count_base = 0;
687   // maintain invariant that event_count == 0 ;
688   // history will be left alone in checkpoint
689 
690   BlockState(Block* block);
691 };
692 
693 struct SegmentState {
694   std::vector<BlockState> blocks;
695   bool is_small = false;
696 
697   SegmentState(Block* head);
698 };
699 
700 struct PrivatePoolState : AllocatorState {
701   // omitting use_count, and cudaMalloc_count as they remain the same
702   MempoolId_t owner_id = {0, 0};
703 
704   std::vector<SegmentState> segments;
705 
706   PrivatePoolState(
707       MempoolId_t pool_id,
708       const std::vector<Block*>& private_pool_head_blocks);
709 };
710 
711 struct RestoreResult {
712   std::vector<void*> allocations_freed;
713   std::vector<Block*> allocations_created;
714 };
715 
716 static bool BlockComparatorSize(const Block* a, const Block* b) {
717   if (a->stream != b->stream) {
718     return (uintptr_t)a->stream < (uintptr_t)b->stream;
719   }
720   if (a->size != b->size) {
721     return a->size < b->size;
722   }
723   return (uintptr_t)a->ptr < (uintptr_t)b->ptr;
724 }
725 static bool BlockComparatorAddress(const Block* a, const Block* b) {
726   if (a->stream != b->stream) {
727     return (uintptr_t)a->stream < (uintptr_t)b->stream;
728   }
729   return (uintptr_t)a->ptr < (uintptr_t)b->ptr;
730 }
731 
732 struct AllocParams {
733   AllocParams(
734       c10::DeviceIndex device,
735       size_t size,
736       cudaStream_t stream,
737       BlockPool* pool,
738       size_t alloc_size,
739       DeviceStats& stats)
740       : search_key(device, stream, size), pool(pool), alloc_size(alloc_size) {}
741 
742   c10::DeviceIndex device() const {
743     return search_key.device;
744   }
745   cudaStream_t stream() const {
746     return search_key.stream;
747   }
748   size_t size() const {
749     return search_key.size;
750   }
751 
752   Block search_key;
753   BlockPool* pool;
754   size_t alloc_size;
755   Block* block{nullptr};
756   StatTypes stat_types = {false};
757   cudaError_t err{cudaSuccess};
758 };
759 
760 // Note: cudaEventCreate when concurrently invoked from multiple threads can be
761 // very expensive (at least on certain device/driver combinations). Thus, we a)
762 // serialize event creation at a per-device level, and b) pool the events to
763 // avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
764 // significant improvements in multithreaded workloads with high allocation
765 // rates.
766 class EventPool {
767  public:
768   using Event = std::unique_ptr<cudaEvent_t, std::function<void(cudaEvent_t*)>>;
769   // TODO: Explicit device count
770   EventPool() : pools_(at::cuda::device_count()) {}
771 
772   Event get(c10::DeviceIndex device) {
773     TORCH_INTERNAL_ASSERT(0 <= device);
774     TORCH_INTERNAL_ASSERT(device < static_cast<int>(pools_.size()));
775     auto& pool = pools_[device];
776     auto destructor = [&pool](cudaEvent_t* event) {
777       std::lock_guard<std::mutex> g(pool.mutex_);
778       pool.event_pool_.push_back(std::unique_ptr<cudaEvent_t>(event));
779     };
780 
781     // Try to acquire an event from the per-device pool.
782     {
783       std::lock_guard<std::mutex> g(pool.mutex_);
784       if (!pool.event_pool_.empty()) {
785         auto* event = pool.event_pool_.back().release();
786         pool.event_pool_.pop_back();
787         return Event(event, destructor);
788       }
789     }
790     // otherwise, allocate a new event that will be returned to the pool on
791     // destruction.
792     auto new_ptr = std::make_unique<cudaEvent_t>();
793     C10_CUDA_CHECK(
794         cudaEventCreateWithFlags(new_ptr.get(), cudaEventDisableTiming));
795 
796     return Event(new_ptr.release(), destructor);
797   }
798 
799   void empty_cache() {
800     for (auto& pool : pools_) {
801       std::lock_guard<std::mutex> g(pool.mutex_);
802       pool.event_pool_.clear();
803     }
804   }
805 
806  private:
807   struct PerDevicePool {
808     alignas(64) std::mutex mutex_;
809     std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
810   };
811   std::vector<PerDevicePool> pools_;
812 };
813 
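// Hedged usage sketch for EventPool (the call site is hypothetical): the
// allocator records pooled events on a block's extra streams when the block
// is freed and queries them later; returning the cudaEvent_t to its
// per-device pool happens automatically in the Event deleter.
[[maybe_unused]] static void eventPoolUsageSketch(
    EventPool& pool,
    c10::DeviceIndex device,
    cudaStream_t stream) {
  EventPool::Event event = pool.get(device); // reuses a pooled event if any
  C10_CUDA_CHECK(cudaEventRecord(*event, stream));
  // ... later, cudaEventQuery(*event) tells us whether `stream` has passed
  // this point; when `event` goes out of scope it is pushed back to the pool.
}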
814 // CUDA graphs helper
815 struct PrivatePool {
816   PrivatePool()
817       : large_blocks(/*small=*/false, this),
818         small_blocks(/*small=*/true, this) {}
819   PrivatePool(const PrivatePool&) = delete;
820   PrivatePool(PrivatePool&&) = delete;
821   PrivatePool& operator=(const PrivatePool&) = delete;
822   // Number of live graphs using this pool
823   int use_count{1};
824   // Number of unfreed cudaMallocs made for this pool. When use_count and
825   // cudaMalloc_count drop to zero, we can delete this PrivatePool from
826   // graph_pools.
827   int cudaMalloc_count{0};
828   // Instead of maintaining private BlockPools here, I could stuff all blocks
829   // (private or no) into the top-level large_blocks and small_blocks, and
830   // distinguish private blocks by adding a "pool id" check above the stream
831   // check in BlockComparator. BlockComparator is performance-critical though,
832   // so I'd rather not add more logic to it.
833   BlockPool large_blocks;
834   BlockPool small_blocks;
835 };
836 
837 BlockState::BlockState(Block* block)
838     : device(block->device),
839       stream(block->stream),
840       stream_uses(block->stream_uses),
841       size(block->size),
842       ptr(block->ptr),
843       allocated(block->allocated),
844       gc_count_base(block->gc_count_base) {
845   TORCH_CHECK(
846       block->event_count == 0,
847       "Events should have synchronized when checkpointing block");
848 };
849 
850 SegmentState::SegmentState(Block* head) {
851   TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr);
852   is_small = head->pool->is_small;
853 
854   for (Block* curr = head; curr != nullptr; curr = curr->next) {
855     blocks.emplace_back(curr);
856   }
857 }
858 
859 PrivatePoolState::PrivatePoolState(
860     MempoolId_t pool_id,
861     const std::vector<Block*>& private_pool_head_blocks)
862     : owner_id(std::move(pool_id)) {
863   for (Block* head : private_pool_head_blocks) {
864     segments.emplace_back(head);
865   }
866 }
867 
868 struct MempoolIdHash {
869   std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
870     return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
871   }
872 };
873 
874 cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
875   if (at::cuda::currentStreamCaptureStatusMayInitCtx() ==
876       at::cuda::CaptureStatus::None) {
877     return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
878   } else {
879     // It's ok to capture cudaMallocs, as long as we never cudaFree those
880     // addresses before replay.
881     // Capturing cudaMalloc behaves nicely: it gives the graph new VA,
882     // but is ignored (won't leakily allocate new memory) in replays.
883     at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed};
884     return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
885   }
886 }
887 
888 template <class T>
889 class RingBuffer {
890  public:
891   RingBuffer() {
892     // alloc_trace is a pointer because we need to intentionally
893     // leak it: on deallocation it can hold references to Python
894     // state which will already be destroyed when we are in exit handlers
895     // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
896     alloc_trace = new std::vector<T>();
897   }
898 
899   void setMaxEntries(size_t size) {
900     std::lock_guard<std::mutex> lk(alloc_trace_lock);
901     alloc_trace_max_entries_ = std::max(size_t(1), size);
902   }
903 
904   void insertEntries(const T& entry) {
905     std::lock_guard<std::mutex> lk(alloc_trace_lock);
906     if (alloc_trace->size() < alloc_trace_max_entries_) {
907       alloc_trace->emplace_back(entry);
908     } else {
909       (*alloc_trace)[alloc_trace_next++] = entry;
910       if (alloc_trace_next == alloc_trace_max_entries_) {
911         alloc_trace_next = 0;
912       }
913     }
914   }
915 
916   void getEntries(std::vector<T>& result) {
917     std::lock_guard<std::mutex> lk(alloc_trace_lock);
918     result.reserve(alloc_trace->size());
919     result.insert(
920         result.end(),
921         alloc_trace->begin() +
922             static_cast<typename std::vector<T>::difference_type>(
923                 alloc_trace_next),
924         alloc_trace->end());
925     result.insert(
926         result.end(),
927         alloc_trace->begin(),
928         alloc_trace->begin() +
929             static_cast<typename std::vector<T>::difference_type>(
930                 alloc_trace_next));
931   }
932 
933   void clear() {
934     std::lock_guard<std::mutex> lk(alloc_trace_lock);
935     alloc_trace_next = 0;
936     alloc_trace->clear();
937   }
938 
939  private:
940   size_t alloc_trace_max_entries_ = 1;
941 
942   // Both alloc_trace and alloc_trace_next need to be used
943   // under alloc_trace_lock.
944   std::mutex alloc_trace_lock;
945   size_t alloc_trace_next = 0;
946   std::vector<T>*
947       alloc_trace; // pointer because we need to intentionally leak this: on
948                    // deallocation it can hold references to Python state which
949                    // will already be destroyed when we are in exit handlers
950 };
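// Hedged usage sketch for RingBuffer (hypothetical call site): once the
// buffer is full, insertEntries() overwrites the oldest entry in place and
// getEntries() returns the retained entries oldest-first.
[[maybe_unused]] static void ringBufferUsageSketch() {
  RingBuffer<int> ring;
  ring.setMaxEntries(2);
  ring.insertEntries(1);
  ring.insertEntries(2);
  ring.insertEntries(3); // wraps around, overwriting 1
  std::vector<int> out;
  ring.getEntries(out); // out == {2, 3}
}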
951 
952 } // anonymous namespace
953 } // namespace Native
954 
955 static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
956 #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
957   void* nvml_handle = DriverAPI::get_nvml_handle();
958   if (!nvml_handle) {
959     return "";
960   }
961   static c10::once_flag nvml_init;
962   c10::call_once(nvml_init, [] {
963     TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_());
964   });
965 
966   cudaDeviceProp prop{};
967   C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
968 
969   // NOLINTNEXTLINE(*-c-arrays)
970   char pci_id[80];
971   snprintf(
972       pci_id,
973       sizeof(pci_id),
974       NVML_DEVICE_PCI_BUS_ID_FMT,
975       prop.pciDomainID,
976       prop.pciBusID,
977       prop.pciDeviceID);
978 
979   nvmlDevice_t nvml_device = nullptr;
980   TORCH_INTERNAL_ASSERT(
981       NVML_SUCCESS ==
982       DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_(
983           pci_id, &nvml_device));
984 
985   std::vector<nvmlProcessInfo_v1_t> procs(8);
986   unsigned int size = procs.size();
987   nvmlReturn_t r{};
988   while ((r = DriverAPI::get()->nvmlDeviceGetComputeRunningProcesses_(
989               nvml_device, &size, procs.data())) ==
990          NVML_ERROR_INSUFFICIENT_SIZE) {
991     procs.resize(size);
992   }
993   unsigned int self_pid = getpid();
994   std::stringstream ss;
995   TORCH_INTERNAL_ASSERT(NVML_SUCCESS == r);
996   ss << "";
997   for (auto i : c10::irange(size)) {
998     auto& proc = procs[i];
999     if (self_pid == proc.pid) {
1000       ss << "Including non-PyTorch memory, this process";
1001     } else {
1002       ss << "Process " << proc.pid;
1003     }
1004     ss << " has " << format_size(proc.usedGpuMemory) << " memory in use. ";
1005   }
1006   return ss.str();
1007 #else
1008   return "";
1009 #endif
1010 }
1011 
1012 namespace Native {
1013 
1014 class DeviceCachingAllocator {
1015  private:
1016   // lock around all operations
1017   mutable std::recursive_mutex mutex;
1018 
1019   // device statistics
1020   DeviceStats stats;
1021 
1022   // unallocated cached blocks larger than 1 MB
1023   BlockPool large_blocks;
1024 
1025   // unallocated cached blocks 1 MB or smaller
1026   BlockPool small_blocks;
1027 
1028   // allocated or in use by a stream. Holds all active allocations,
1029   // whether they came from graph_pools or one of the BlockPools above.
1030   ska::flat_hash_set<Block*> active_blocks;
1031 
1032   // captures_underway tracks if we are diverting some
1033   // allocations to a specific pool.
1034   // Most of the time it's empty, in which case malloc can avoid calling
1035   // cudaStreamGetCaptureInfo in the hot path.
1036   std::vector<std::pair<MempoolId_t, std::function<bool(cudaStream_t)>>>
1037       captures_underway;
1038 
1039   // See free() for this thing's purpose
1040   std::vector<Block*> needs_events_deferred_until_no_capture;
1041   // outstanding cuda events
1042   ska::flat_hash_map<
1043       cuda::CUDAStream,
1044       std::deque<std::pair<EventPool::Event, Block*>>>
1045       cuda_events;
1046 
1047   // record used memory.
1048   size_t total_allocated_memory = 0;
1049 
1050   size_t allowed_memory_maximum = 0;
1051 
1052   // all live expandable segments
1053   std::vector<ExpandableSegment*> expandable_segments_;
1054   std::vector<c10::DeviceIndex> devices_with_peer_access_;
1055 
1056   bool set_fraction = false;
1057 
1058   bool record_history = false;
1059 
1060   std::atomic<CreateContextFn> context_recorder_;
1061   RecordContext record_context_ = RecordContext::NEVER;
1062 
1063   // Ring buffer for memory snapshot TraceEntry's
1064   RingBuffer<TraceEntry> alloc_buffer;
1065 
1066   // Members specific to CUDA graphs
1067 
1068   // Private pools for CUDA graphs
1069   ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
1070       graph_pools;
1071   // Pools no longer referenced by any graph. Their BlockPools are eligible for
1072   // free_blocks. Can't be a vector or deque because we might erase entries in
1073   // any order. Could be a std::list, but we don't care much; access and
1074   // insert/erase are rare.
1075   ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
1076       graph_pools_freeable;
1077 
1078   // XXX - maybe we should generalize and have multiple events
1079   std::vector<OutOfMemoryObserver> oom_observers_;
1080 
1081   std::vector<AllocatorTraceTracker> trace_trackers_;
1082 
1083   // mapping from block to a stream_set, containing streams on which the block
1084   // was used while cudagraph capturing
1085   std::unordered_map<Block*, stream_set> block_to_cudagraph_stream_uses;
1086 
1087  public:
1088   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
1089   DeviceCachingAllocator()
1090       : large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
1091     stats.max_split_size =
1092         static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
1093     context_recorder_.store(nullptr);
1094   }
1095 
1096   void recordHistory(
1097       bool enabled,
1098       CreateContextFn context_recorder,
1099       size_t alloc_buffer_max_entries,
1100       RecordContext when) {
1101     std::unique_lock<std::recursive_mutex> lock(mutex);
1102     TORCH_CHECK(when == RecordContext::NEVER || context_recorder);
1103     record_history = enabled;
1104     context_recorder_.store(record_history ? context_recorder : nullptr);
1105     alloc_buffer.setMaxEntries(alloc_buffer_max_entries);
1106     record_context_ = enabled ? when : RecordContext::NEVER;
1107     if (!enabled) {
1108       alloc_buffer.clear();
1109     }
1110   }
1111 
1112   bool isHistoryEnabled() {
1113     return record_history;
1114   }
1115 
1116   bool checkPoolLiveAllocations(
1117       MempoolId_t mempool_id,
1118       const std::unordered_set<void*>& expected_live_allocations) {
1119     std::unique_lock<std::recursive_mutex> lock(mutex);
1120 
1121     PrivatePool* pool = nullptr;
1122     auto pool_it = graph_pools.find(mempool_id);
1123     TORCH_CHECK(pool_it != graph_pools.end(), "Could not find pool of id");
1124     pool = pool_it->second.get();
1125 
1126     TORCH_INTERNAL_ASSERT(pool != nullptr);
1127 
1128     size_t allocated_pool_blocks = 0;
1129 
1130     for (Block* b : active_blocks) {
1131       TORCH_INTERNAL_ASSERT(b != nullptr);
1132       TORCH_INTERNAL_ASSERT(b->pool != nullptr);
1133       if (b->allocated && b->pool->owner_PrivatePool == pool) {
1134         if (!expected_live_allocations.count(b->ptr)) {
1135           return false;
1136         }
1137 
1138         allocated_pool_blocks += 1;
1139       }
1140     }
1141 
1142     return allocated_pool_blocks == expected_live_allocations.size();
1143   }
1144 
1145   void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
1146     oom_observers_.emplace_back(std::move(observer));
1147   }
1148 
1149   void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
1150     std::unique_lock<std::recursive_mutex> lock(mutex);
1151     trace_trackers_.emplace_back(std::move(tracker));
1152   }
1153 
1154   // Must be called outside of `mutex` or deadlocks are possible with Python
1155   std::shared_ptr<GatheredContext> maybeGatherContext(RecordContext level) {
1156     if (record_context_ < level) {
1157       return nullptr;
1158     }
1159     return context_recorder_.load()();
1160   }
1161 
1162   // All public methods (except the above) acquire the allocator mutex.
1163   // Thus, do not call a public method from another public method.
1164 
1165   Block* malloc(
1166       c10::DeviceIndex device,
1167       size_t orig_size,
1168       cudaStream_t stream) {
1169     // done outside the lock because we don't know what locks the recorder needs
1170     // to have...
1171     auto context = maybeGatherContext(RecordContext::STATE);
1172 
1173     std::unique_lock<std::recursive_mutex> lock(mutex);
1174 
1175     if (C10_LIKELY(captures_underway.empty())) {
1176       // Processes end-of-life events for outstanding allocations used on
1177       // multiple streams (checks if their GPU-side uses are complete and
1178       // recycles their memory if so)
1179       //
1180       // Q. Why skip process_events if a capture might be underway?
1181       // A. process_events involves cudaEventQueries, illegal during CUDA graph
1182       //    capture.
1183       //    Dumb simple solution: defer reclaiming these allocations until after
1184       //    capture. Cross-stream memory use is uncommon, so the deferral's
1185       //    effect on memory use during capture should be small.
1186       process_events(context);
1187     }
1188     size_t size = round_size(orig_size);
1189     auto& pool = get_pool(size, stream);
1190     const size_t alloc_size = get_allocation_size(size);
1191     AllocParams params(device, size, stream, &pool, alloc_size, stats);
1192     params.stat_types = get_stat_types_for_pool(pool);
1193 
1194     // First, try to get a block from the existing pool.
1195     bool block_found =
1196         // Search pool
1197         get_free_block(params)
1198         // Trigger callbacks and retry search
1199         || (trigger_free_memory_callbacks(params) && get_free_block(params));
1200 
1201     // Can't reuse an existing block; try to get a new one.
1202     if (!block_found) {
1203       // Do garbage collection if the flag is set.
1204       if (C10_UNLIKELY(
1205               set_fraction &&
1206               CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
1207         garbage_collect_cached_blocks(context);
1208       }
1209       // Attempt allocate
1210       // WARNING: alloc_block may release the allocator lock when calling
1211       // cudaMalloc. So far this function has not modified allocator state, but
1212       // keep in mind that any observed allocator state may change across calls
1213       // to alloc_block since it may release the lock.
1214       block_found = alloc_block(params, false, context, lock)
1215           // Free enough available cached blocks to satisfy alloc and retry
1216           // alloc.
1217           || (release_available_cached_blocks(params, context) &&
1218               alloc_block(params, false, context, lock))
1219           // Free all non-split cached blocks and retry alloc.
1220           || (C10_LIKELY(captures_underway.empty()) &&
1221               release_cached_blocks(context) &&
1222               alloc_block(params, true, context, lock));
1223     }
1224 
1225     if (!block_found) {
1226       // For any error code other than cudaErrorMemoryAllocation,
1227       // alloc_block should have thrown an exception already.
1228       TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation);
1229 
1230       size_t device_free = 0;
1231       size_t device_total = 0;
1232       C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
1233       std::string allowed_info;
1234 
1235       if (set_fraction) {
1236         allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
1237       }
1238 
1239       std::string proc_info = reportProcessMemoryInfo(device);
1240 
1241       record_trace(
1242           TraceEntry::OOM,
1243           device_free,
1244           params.size(),
1245           params.stream(),
1246           params.device(),
1247           std::move(context));
1248       stats.num_ooms += 1;
1249 
1250       c10::reportOutOfMemoryToProfiler(
1251           static_cast<int64_t>(size),
1252           stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1253               .current,
1254           stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1255               .current,
1256           c10::Device(c10::DeviceType::CUDA, device));
1257 
1258       auto allocated_bytes =
1259           stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
1260               .current;
1261       auto reserved_bytes =
1262           stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
1263               .current;
1264       auto observers_local = oom_observers_;
1265 
1266       size_t allocated_in_private_pools = 0;
1267       auto get_size_block = [](const BlockPool& pool) {
1268         size_t res = 0;
1269         for (const auto& block : pool.blocks) {
1270           res += block->size;
1271         }
1272         return res;
1273       };
1274       for (const auto& p : graph_pools) {
1275         allocated_in_private_pools += get_size_block(p.second->large_blocks);
1276         allocated_in_private_pools += get_size_block(p.second->small_blocks);
1277       }
1278 
1279       std::string private_pool_msg;
1280 
1281       if (allocated_in_private_pools > 0) {
1282         private_pool_msg = "with " + format_size(allocated_in_private_pools) +
1283             " allocated in private pools (e.g., CUDA Graphs), ";
1284       }
1285 
1286       // Make sure we do not have the device lock before calling our
1287       // observers, which might need to hold the GIL.
1288       // It is safe to release at this point because we will no longer
1289       // be reading any allocator state.
1290 
1291       lock.unlock();
1292 
1293       for (const auto& obs : observers_local) {
1294         obs(device,
1295             alloc_size,
1296             set_fraction ? allowed_memory_maximum : device_total,
1297             device_free);
1298       }
1299 
1300       // "total capacity": total global memory on GPU
1301       // "allowed": memory is allowed to use, which set by fraction.
1302       // "already allocated": memory allocated by the program using the
1303       //                      caching allocator
1304       // "free": free memory as reported by the CUDA API
1305       // "cached": memory held by the allocator but not used by the program
1306       //
1307       // The "allocated" amount does not include memory allocated outside
1308       // of the caching allocator, such as memory allocated by other programs
1309       // or memory held by the driver.
1310       //
1311       // The sum of "allocated" + "free" + "cached" may be less than the
1312       // total capacity due to memory held by the driver and usage by other
1313       // programs.
1314       //
1315       // Note that at this point free_cached_blocks has already returned all
1316       // possible "cached" memory to the driver. The only remaining "cached"
1317       // memory is split from a larger block that is partially in-use.
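      // [Editor's illustrative example -- hypothetical numbers.]
      // Suppose device_total = 16 GiB, device_free = 1 GiB,
      // allocated_bytes = 10 GiB, allocated_in_private_pools = 2 GiB and
      // reserved_bytes = 14 GiB. The message below would then report
      // 12 GiB "allocated by PyTorch" (10 GiB + 2 GiB in private pools) and
      // 14 - 10 - 2 = 2 GiB "reserved by PyTorch but unallocated", i.e.
      // cached memory split off larger, partially used segments.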
1318       TORCH_CHECK_WITH(
1319           OutOfMemoryError,
1320           false,
1321           "CUDA out of memory. Tried to allocate ",
1322           format_size(alloc_size),
1323           ". GPU ",
1324           static_cast<int>(device),
1325           " has a total capacity of ",
1326           format_size(device_total),
1327           " of which ",
1328           format_size(device_free),
1329           " is free. ",
1330           proc_info,
1331           allowed_info,
1332           "Of the allocated memory ",
1333           format_size(allocated_bytes + allocated_in_private_pools),
1334           " is allocated by PyTorch, ",
1335           private_pool_msg,
1336           "and ",
1337           format_size(
1338               reserved_bytes - allocated_bytes - allocated_in_private_pools),
1339           " is reserved by PyTorch but unallocated.",
1340           " If reserved but unallocated memory is large try setting",
1341           " PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid"
1342           " fragmentation.  See documentation for Memory Management "
1343           " (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)");
1344     }
1345 
1346     bool split_remainder = should_split(params.block, params.size());
1347     return alloc_found_block(
1348         params, orig_size, std::move(context), split_remainder);
1349   }
1350 
1351   Block* alloc_found_block(
1352       const AllocParams& params,
1353       size_t orig_size,
1354       std::shared_ptr<GatheredContext> context,
1355       bool split_remainder) {
1356     auto size = params.size();
1357     auto device = params.device();
1358     auto pool = params.pool;
1359     auto stream = params.stream();
1360 
1361     TORCH_INTERNAL_ASSERT(
1362         params.err == cudaSuccess && params.block != nullptr &&
1363         params.block->ptr != nullptr);
1364     Block* block = params.block;
1365     Block* remaining = nullptr;
1366 
1367     const bool already_split = block->is_split();
1368     if (split_remainder) {
1369       remaining = block;
1370 
1371       block = new Block(device, stream, size, pool, block->ptr);
1372       block->expandable_segment_ = remaining->expandable_segment_;
1373       block->prev = remaining->prev;
1374       if (block->prev) {
1375         block->prev->next = block;
1376       }
1377       block->next = remaining;
1378 
1379       remaining->prev = block;
1380       remaining->ptr = static_cast<char*>(remaining->ptr) + size;
1381       remaining->size -= size;
1382       // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
1383       bool inserted = pool->insert_into_blocks(remaining).second;
1384       TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
1385 
1386       if (already_split && !block->expandable_segment_) {
1387         // An already-split inactive block is being shrunk by size bytes.
1388         decrease_stat_array(
1389             stats.inactive_split_bytes, block->size, params.stat_types);
1390       } else if (!block->expandable_segment_) {
1391         // A new split inactive block is being created from a previously unsplit
1392         // block, size remaining->size bytes.
1393         for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1394           stats.inactive_split_bytes[stat_type].increase(remaining->size);
1395           stats.inactive_split[stat_type].increase(1);
1396         });
1397       }
1398 
1399     } else if (already_split && !block->expandable_segment_) {
1400       // An already-split block is becoming active
1401       for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1402         stats.inactive_split_bytes[stat_type].decrease(block->size);
1403         stats.inactive_split[stat_type].decrease(1);
1404       });
1405     }
1406 
1407     block->allocated = true;
1408     block->requested_size = orig_size;
1409 
1410     block->context_when_allocated = std::move(context);
1411     record_trace(
1412         TraceEntry::ALLOC,
1413         int64_t(block->ptr),
1414         orig_size,
1415         block->stream,
1416         block->device,
1417         block->context_when_allocated);
1418 
1419     // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
1420     bool inserted = active_blocks.insert(block).second;
1421     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
1422 
1423     for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1424       stats.allocation[stat_type].increase(1);
1425       stats.allocated_bytes[stat_type].increase(block->size);
1426       stats.active[stat_type].increase(1);
1427       stats.active_bytes[stat_type].increase(block->size);
1428       stats.requested_bytes[stat_type].increase(block->requested_size);
1429     });
1430     if (block->size >= CUDAAllocatorConfig::max_split_size())
1431       stats.oversize_allocations.increase(1);
1432 
1433     auto allocated_bytes_gauge =
1434         STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
1435     allocated_bytes_gauge.record(
1436         stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1437             .current);
1438 
1439     c10::reportMemoryUsageToProfiler(
1440         block->ptr,
1441         static_cast<int64_t>(block->size),
1442         stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1443         stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1444         c10::Device(c10::DeviceType::CUDA, device));
1445 
1446     return block;
1447   }
1448 
1449   void free(Block* block) {
1450     std::shared_ptr<GatheredContext> context =
1451         maybeGatherContext(RecordContext::ALL);
1452     std::lock_guard<std::recursive_mutex> lock(mutex);
1453 
1454     block->allocated = false;
1455 
1456     // The following logic might modify the underlying Block, causing its size
1457     // to change. We store the original values ahead of time for reporting.
1458     auto orig_block_ptr = block->ptr;
1459     auto orig_block_size = block->size;
1460 
1461     StatTypes stat_types = get_stat_types_for_pool(*block->pool);
1462     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
1463       stats.allocation[stat_type].decrease(1);
1464       stats.allocated_bytes[stat_type].decrease(block->size);
1465     });
1466     auto allocated_bytes_gauge =
1467         STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
1468     allocated_bytes_gauge.record(
1469         stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1470             .current);
1471 
1472     record_trace(
1473         TraceEntry::FREE_REQUESTED,
1474         int64_t(block->ptr),
1475         block->requested_size,
1476         block->stream,
1477         block->device,
1478         context ? context : block->context_when_allocated);
1479 
1480     if (block->size >= CUDAAllocatorConfig::max_split_size())
1481       stats.oversize_allocations.decrease(1);
1482 
1483     if (!block->stream_uses.empty()) {
1484       if (C10_UNLIKELY(!captures_underway.empty())) {
1485         // It's forbidden to cudaEventQuery an event recorded during CUDA graph
1486         // capture. We conservatively defer recording end-of-life events until
1487         // the next call to process_events() (which won't happen until no
1488         // captures are underway)
1489         needs_events_deferred_until_no_capture.push_back(block);
1490       } else {
1491         insert_events(block);
1492       }
1493     } else {
1494       free_block(block, context);
1495     }
1496 
1497     c10::reportMemoryUsageToProfiler(
1498         orig_block_ptr,
1499         -static_cast<int64_t>(orig_block_size),
1500         stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1501         stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1502         c10::Device(c10::DeviceType::CUDA, block->device));
1503   }
1504 
1505   void* getBaseAllocation(Block* block, size_t* outSize) {
1506     std::lock_guard<std::recursive_mutex> lock(mutex);
1507     TORCH_CHECK(
1508         !block->expandable_segment_,
1509         "Tensors allocated with expandable_segments:True cannot be shared between processes. Consider using expandable_segments:False in data loading workers via torch.cuda.memory._set_allocator_settings('expandable_segments:False')");
1510     while (block->prev) {
1511       block = block->prev;
1512     }
1513     void* basePtr = block->ptr;
1514     if (outSize) {
1515       size_t size = 0;
1516       while (block) {
1517         size += block->size;
1518         block = block->next;
1519       }
1520       *outSize = size;
1521     }
1522     return basePtr;
1523   }
1524 
1525   ShareableHandle shareIpcHandle(Block* block) {
1526     std::lock_guard<std::recursive_mutex> lock(mutex);
1527     std::ostringstream ss;
1528     ss.put(SHAREABLE_HANDLE_VERSION);
1529     ptrdiff_t offset = 0;
1530     if (!block->expandable_segment_) {
1531       ss.put(SHAREABLE_CUDA_MALLOC);
1532       Block* base_block = block;
1533       while (base_block->prev) {
1534         base_block = base_block->prev;
1535       }
1536       offset = (char*)block->ptr - (char*)base_block->ptr;
1537       cudaIpcMemHandle_t handle;
1538       C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr));
1539       ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE);
1540     } else {
1541       ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
1542       auto full_range = block->expandable_segment_->share(
1543           SegmentRange(block->ptr, block->size), ss);
1544       offset = (char*)block->ptr - (char*)full_range.ptr;
1545     }
1546     return ShareableHandle{offset, ss.str()};
1547   }
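  // [Editor's illustrative sketch -- hypothetical consumer, not in this file.]
  // A receiver of the ShareableHandle produced above could parse the
  // SHAREABLE_CUDA_MALLOC variant roughly like this:
  //
  //   std::istringstream in(sh.handle);          // `sh` is the received handle
  //   char version = 0, type = 0;
  //   in.get(version);                           // SHAREABLE_HANDLE_VERSION
  //   in.get(type);                              // SHAREABLE_CUDA_MALLOC here
  //   cudaIpcMemHandle_t ipc_handle;
  //   in.read(reinterpret_cast<char*>(&ipc_handle), CUDA_IPC_HANDLE_SIZE);
  //   void* base = nullptr;
  //   C10_CUDA_CHECK(cudaIpcOpenMemHandle(
  //       &base, ipc_handle, cudaIpcMemLazyEnablePeerAccess));
  //   void* ptr = static_cast<char*>(base) + sh.offset;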
1548 
1549   void recordStream(Block* block, cuda::CUDAStream stream) {
1550     std::lock_guard<std::recursive_mutex> lock(mutex);
1551     if (stream.stream() == block->stream) {
1552       // ignore uses on the allocation stream, since those don't require any
1553       // special synchronization
1554       return;
1555     }
1556     block->stream_uses.insert(stream);
1557     if (C10_UNLIKELY(!captures_underway.empty())) {
1558       block_to_cudagraph_stream_uses[block].insert(stream);
1559     }
1560   }
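  // [Editor's illustrative sketch -- usage pattern; helper and variable names
  //  are assumptions, not taken from this file.]
  //
  //   c10::cuda::CUDAStream side = c10::cuda::getStreamFromPool();
  //   {
  //     c10::cuda::CUDAStreamGuard guard(side);
  //     run_kernel_on(tensor, side);   // hypothetical kernel launch using `side`
  //   }
  //   // Tell the allocator the block is in use on `side` so it is not handed
  //   // out again until the work recorded so far completes:
  //   c10::cuda::CUDACachingAllocator::recordStream(
  //       tensor.storage().data_ptr(), side);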
1561 
1562   /** set memory fraction to limit maximum allocated memory **/
1563   void setMemoryFraction(double fraction) {
1564     size_t device_free = 0;
1565     size_t device_total = 0;
1566     C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
1567     allowed_memory_maximum =
1568         static_cast<size_t>(fraction * static_cast<double>(device_total));
1569     set_fraction = true;
1570   }
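  // [Editor's illustrative example -- hypothetical numbers.]
  // With fraction = 0.5 on a device where cudaMemGetInfo reports
  // device_total = 16 GiB, allowed_memory_maximum becomes 8 GiB. Once
  // total_allocated_memory + size would exceed that limit, alloc_block()
  // fails the request with cudaErrorMemoryAllocation instead of calling
  // cudaMalloc.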
1571 
1572   /** returns cached blocks to the system allocator **/
1573   void emptyCache() {
1574     auto context = maybeGatherContext(RecordContext::ALL);
1575     std::lock_guard<std::recursive_mutex> lock(mutex);
1576     release_cached_blocks(context);
1577   }
1578 
1579   /** Retrieves size of largest unused block held by the memory cache **/
1580   void cacheInfo(size_t* largest) {
1581     std::lock_guard<std::recursive_mutex> lock(mutex);
1582     if (*largest ==
1583         0) { // make an initial guess if a zero *largest is passed in
1584       size_t tmp_bytes = 0;
1585       C10_CUDA_CHECK(cudaMemGetInfo(
1586           largest, // Use free memory as an optimistic initial guess of *largest
1587           &tmp_bytes));
1588     }
1589     cache_info_aux(large_blocks, largest);
1590     cache_info_aux(small_blocks, largest);
1591     for (const auto& gp : graph_pools) {
1592       cache_info_aux(gp.second->large_blocks, largest);
1593       cache_info_aux(gp.second->small_blocks, largest);
1594     }
1595   }
1596 
1597   /** Returns a copy of the memory allocator stats **/
1598   DeviceStats getStats() {
1599     std::lock_guard<std::recursive_mutex> lock(mutex);
1600     return stats;
1601   }
1602 
1603   /** Resets the historical accumulation stats for the device **/
1604   void resetAccumulatedStats() {
1605     std::lock_guard<std::recursive_mutex> lock(mutex);
1606 
1607     for (const auto statType :
1608          c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
1609       stats.allocation[statType].reset_accumulated();
1610       stats.segment[statType].reset_accumulated();
1611       stats.active[statType].reset_accumulated();
1612       stats.inactive_split[statType].reset_accumulated();
1613       stats.allocated_bytes[statType].reset_accumulated();
1614       stats.reserved_bytes[statType].reset_accumulated();
1615       stats.active_bytes[statType].reset_accumulated();
1616       stats.inactive_split_bytes[statType].reset_accumulated();
1617       stats.requested_bytes[statType].reset_accumulated();
1618     }
1619 
1620     stats.num_alloc_retries = 0;
1621     stats.num_ooms = 0;
1622     stats.num_sync_all_streams = 0;
1623     stats.num_device_alloc = 0;
1624     stats.num_device_free = 0;
1625     stats.oversize_allocations.reset_accumulated();
1626     stats.oversize_segments.reset_accumulated();
1627   }
1628 
1629   /** Resets the historical peak stats for the device **/
1630   void resetPeakStats() {
1631     std::lock_guard<std::recursive_mutex> lock(mutex);
1632 
1633     for (const auto statType :
1634          c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
1635       stats.allocation[statType].reset_peak();
1636       stats.segment[statType].reset_peak();
1637       stats.active[statType].reset_peak();
1638       stats.inactive_split[statType].reset_peak();
1639       stats.allocated_bytes[statType].reset_peak();
1640       stats.reserved_bytes[statType].reset_peak();
1641       stats.active_bytes[statType].reset_peak();
1642       stats.inactive_split_bytes[statType].reset_peak();
1643       stats.requested_bytes[statType].reset_peak();
1644     }
1645     stats.oversize_allocations.reset_peak();
1646     stats.oversize_segments.reset_peak();
1647   }
1648 
1649   /* Checkpoint the state of a private pool necessary to return it to its
1650    * current state */
1651   std::unique_ptr<PrivatePoolState> getCheckpointState(MempoolId_t id) {
1652     auto context = maybeGatherContext(RecordContext::ALL);
1653     std::lock_guard<std::recursive_mutex> lock(mutex);
1654     insert_events_deferred_until_no_capture(context);
1655 
1656     auto pool = graph_pools.find(id);
1657     if (pool != graph_pools.end()) {
1658       auto private_pool_head_blocks =
1659           get_private_pool_head_blocks(pool->second.get());
1660       return std::make_unique<PrivatePoolState>(id, private_pool_head_blocks);
1661     } else if (graph_pools_freeable.count(id)) {
1662       TORCH_CHECK(false, "Not expected to checkpoint freeable graph");
1663     } else {
1664       TORCH_CHECK(false, "Could not find pool of id");
1665     }
1666   }
1667 
1668   void freeBlocksAllocatedToPool(PrivatePool* private_pool, RestoreResult& rr) {
1669     auto pool_blocks = get_private_pool_head_blocks(private_pool);
1670 
1671     std::vector<Block*> head_blocks;
1672     for (Block* block : pool_blocks) {
1673       if (block->prev == nullptr) {
1674         head_blocks.push_back(block);
1675       }
1676     }
1677 
1678     for (Block* block : head_blocks) {
1679       Block* curr = block;
1680 
1681       while (curr) {
1682         // When we free a block, its pointer should never change; only its
1683         // adjacent blocks can, so free first, then follow the next pointer.
1684         if (curr->allocated) {
1685           TORCH_CHECK(
1686               curr->event_count == 0,
1687               "Events should have synchronized when setting checkpointed block");
1688           rr.allocations_freed.push_back(curr->ptr);
1689           free(curr);
1690           TORCH_CHECK(!curr->allocated)
1691         }
1692         curr = curr->next;
1693       }
1694     }
1695 
1696     for (Block* b : get_private_pool_head_blocks(private_pool)) {
1697       Block* curr = b;
1698       while (curr) {
1699         TORCH_CHECK(!curr->allocated);
1700         curr = curr->next;
1701       }
1702     }
1703   }
1704 
1705   // checkpoint the state of an allocation that may have been
1706   // split into multiple blocks
1707   void setSegmentStateToCheckpoint(
1708       Block* block,
1709       SegmentState& segment,
1710       const std::shared_ptr<GatheredContext>& context,
1711       RestoreResult& rr) {
1712     Block* curr_block = block;
1713     Block* last_block = block;
1714 
1715     TORCH_INTERNAL_ASSERT(block->pool);
1716     BlockPool& pool = *block->pool;
1717     const auto segment_len = segment.blocks.size();
1718 
1719     // allocate all blocks in the segment
1720     for (size_t i = 0; i < segment_len; ++i) {
1721       // The last block in every expandable segment is the remaining amount of
1722       // available unmapped virtual address space. We shouldn't change it; we
1723       // instead check that it is correctly formed, then skip over allocating it.
1724       if (i == segment_len - 1 && curr_block->expandable_segment_) {
1725         TORCH_CHECK(curr_block->next == nullptr);
1726         TORCH_CHECK(!curr_block->mapped);
1727         TORCH_CHECK(curr_block->allocated == false);
1728         continue;
1729       }
1730 
1731       auto& block_state = segment.blocks.at(i);
1732       AllocParams params(
1733           block_state.device,
1734           block_state.size,
1735           block_state.stream,
1736           &pool,
1737           block_state.size,
1738           stats);
1739       pool.blocks.erase(curr_block);
1740       params.block = curr_block;
1741       params.stat_types = get_stat_types_for_pool(pool);
1742 
1743       // Splitting a block depends on `max_split_size`, which may have changed
1744       // between when the checkpoint was taken and now, so we make sure to
1745       // recreate the behavior from the checkpoint. Keep splitting as long as
1746       // there is space left over, because the block already has the size it had
1747       // in the checkpointed segment, so any leftover space belongs to the next
1748       // block.
1749       bool split = curr_block->size > block_state.size;
1750 
1751       // curr_block will become next pointer if it is split, so reassign with
1752       // the returned value
1753       curr_block = alloc_found_block(params, block_state.size, context, split);
1754 
1755       TORCH_CHECK(curr_block->ptr == block_state.ptr);
1756       TORCH_CHECK(curr_block->size == block_state.size);
1757 
1758       last_block = curr_block;
1759       curr_block = curr_block->next;
1760 
1761       TORCH_CHECK((curr_block != nullptr) == ((i + 1) < (segment_len)));
1762     }
1763 
1764     while (last_block->prev) {
1765       last_block = last_block->prev;
1766     }
1767 
1768     // free blocks that are not allocated in the checkpoint
1769     curr_block = last_block;
1770 
1771     for (size_t i = 0; i < segment_len; ++i, curr_block = curr_block->next) {
1772       if (i == segment_len - 1 && curr_block->expandable_segment_) {
1773         TORCH_CHECK(curr_block->next == nullptr);
1774         TORCH_CHECK(!curr_block->mapped);
1775         TORCH_CHECK(curr_block->allocated == false);
1776         continue;
1777       }
1778 
1779       auto& block_state = segment.blocks.at(i);
1780       TORCH_INTERNAL_ASSERT(curr_block != nullptr);
1781 
1782       if (block_state.allocated) {
1783         rr.allocations_created.push_back(curr_block);
1784         continue;
1785       }
1786 
1787       free(curr_block);
1788 
1789       TORCH_CHECK(curr_block->ptr == block_state.ptr);
1790       TORCH_CHECK(curr_block->allocated == block_state.allocated);
1791       TORCH_CHECK(curr_block->size == block_state.size);
1792     }
1793   }
1794 
1795   /**
1796    * Note [Checkpointing PrivatePoolState]
1797    *
1798    * Refer above to Note [Interaction with CUDA graph capture]. Allocations made
1799    * during graph capture are made from a separate private pool. During graph
1800    * capture allocations behave as usual. During graph replay the allocator
1801    * state does not change even as new tensors are created. The private pool
1802    * will not free its blocks to the main caching allocator until cuda graph use
1803    * is finished to prevent an allocation from eager clobbering the memory from
1804    * a live but unaccounted for tensor that was created during replay.
1805    *
1806    * `make_graphed_callables`, a series of separate callables chained in
1807    * successive cuda graphs, can share a memory pool because after a cuda graph
1808    * recording the allocations in the shared private pool exactly reflect the
1809    * tensors that are allocated.
1810    *
1811    * We would like to extend callable chaining to support a graphed callable
1812    * tree. In this scenario, we have a tree of callable chains which will be
1813    * captured with cuda graphs. In the diagram below, we have a tree with four
1814    * callables, A, B, C, and D. Suppose we have captured, and subsequently
1815    * replayed, A, B, and C. Then on a new invocation, we replay A and B, but
1816    * would now like to record D. At this point the private pool will not reflect
1817    * any of the live tensors created during graph replay. Allocations made
1818    * during a new recording with the pool could overwrite those live tensors.
1819    *
1820    * In order to record a new graph capture after replaying prior callables in
1821    * the tree, we need the allocator to reflect the state of the live tensors.
1822    * We checkpoint the state of the private pool after each recording, and then
1823    * reapply it when we are starting a new recording chain. Additionally, we
1824    * must free the allocations for any tensors that died between the end of our
1825    * previous graph replaying and our new recording. All of the allocated
1826    * segments that existed in the checkpointed state must still exist in the
1827    * pool. There may also exist new allocated blocks.
1828    * (TODO : link note [live tensors between iterations] when it exists). For
1829    * every block that is currently allocated but not allocated in the snapshot,
1830    * we will return a pointer to its block.
1831    *
1832    *
1833    *
1834    *  ---------------> A ---------------> B ---------------> C
1835    *                                      |
1836    *                                      |
1837    *                                      |
1838    *                                      |
1839    *                                      ╰ ---------------> D
1840    */
1841   RestoreResult setCheckpointPoolState(PrivatePoolState& pps) {
1842     // To reset the caching allocator state we will
1843     // - Free all the blocks currently allocated to the pool (see [live tensors
1844     // between iterations])
1845     // - Allocate all the blocks in a checkpointed segment, whether they are
1846     // live or not
1847     // - Free the blocks in a checkpointed segment which are not live
1848     // This could be optimized, but it nicely reuses existing APIs, and this
1849     // is not on the hot path.
1850 
1851     // As noted elsewhere: `done outside the lock because we don't know what
1852     // locks the recorder needs to have...`
1853 
1854     std::shared_ptr<GatheredContext> context =
1855         maybeGatherContext(RecordContext::STATE);
1856 
1857     std::lock_guard<std::recursive_mutex> lock(mutex);
1858 
1859     RestoreResult rr;
1860 
1861     TORCH_CHECK(
1862         !graph_pools_freeable.count(pps.owner_id),
1863         "Not expected to checkpoint freeable graph");
1864 
1865     auto pool = graph_pools.find(pps.owner_id);
1866     TORCH_CHECK(pool != graph_pools.end(), "Could not find private pool id");
1867 
1868     PrivatePool* private_pool = pool->second.get();
1869 
1870     freeBlocksAllocatedToPool(private_pool, rr);
1871 
1872     std::unordered_map<void*, Block*> ptrs_to_blocks;
1873     // at this point, all of the blocks should be free, so they will all be in
1874     // the block set
1875     for (Block* block : private_pool->small_blocks.blocks) {
1876       ptrs_to_blocks[block->ptr] = block;
1877     }
1878     for (Block* block : private_pool->large_blocks.blocks) {
1879       ptrs_to_blocks[block->ptr] = block;
1880     }
1881 
1882     for (auto& segment : pps.segments) {
1883       auto ptr = segment.blocks.at(0).ptr;
1884       TORCH_CHECK(ptrs_to_blocks.count(ptr), " could not find ", ptr)
1885       auto block = ptrs_to_blocks[ptr];
1886 
1887       setSegmentStateToCheckpoint(block, segment, context, rr);
1888     }
1889     return rr;
1890   }
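  // [Editor's illustrative sketch -- intended call sequence; variable names
  //  are assumptions, not taken from this file.]
  //
  //   // After recording callable C into the shared private pool:
  //   std::unique_ptr<PrivatePoolState> cp = getCheckpointState(pool_id);
  //   ...
  //   // Later, after replaying A and B and before recording D, bring the
  //   // pool back to the state it had when C finished recording:
  //   RestoreResult rr = setCheckpointPoolState(*cp);
  //   // rr.allocations_freed / rr.allocations_created report the pointers
  //   // whose liveness changed so the caller can fix up live storages.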
1891 
1892   /** Dump a complete snapshot of the memory held by the allocator. Potentially
1893    * VERY expensive. **/
1894   std::vector<SegmentInfo> snapshot() {
1895     std::lock_guard<std::recursive_mutex> lock(mutex);
1896 
1897     std::unordered_map<PrivatePool*, MempoolId_t> pool_to_id;
1898     pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size());
1899     for (const auto& pair : graph_pools) {
1900       pool_to_id[pair.second.get()] = pair.first;
1901     }
1902     for (const auto& pair : graph_pools_freeable) {
1903       pool_to_id[pair.second] = pair.first;
1904     }
1905 
1906     size_t total_active = 0;
1907     std::vector<SegmentInfo> result;
1908     const auto all_blocks = get_all_blocks();
1909 
1910     for (const Block* const head_block : all_blocks) {
1911       // For expandable segments, we report one segment for each contiguous
1912       // mapped range of memory
1913       if (head_block->prev && head_block->prev->mapped) {
1914         continue;
1915       }
1916       result.emplace_back();
1917       SegmentInfo& segment_info = result.back();
1918       segment_info.device = head_block->device;
1919       segment_info.address = reinterpret_cast<size_t>(head_block->ptr);
1920       segment_info.stream = head_block->stream;
1921       segment_info.is_large = (!head_block->pool->is_small);
1922       segment_info.is_expandable = head_block->expandable_segment_;
1923       segment_info.context_when_allocated =
1924           head_block->context_when_segment_allocated;
1925       auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool);
1926       if (mempool_id != pool_to_id.end()) {
1927         segment_info.owner_private_pool_id = mempool_id->second;
1928       }
1929 
1930       const Block* block = head_block;
1931       while (block != nullptr && block->mapped) {
1932         segment_info.blocks.emplace_back();
1933         BlockInfo& block_info = segment_info.blocks.back();
1934 
1935         block_info.size = block->size;
1936         block_info.requested_size = block->requested_size;
1937         block_info.allocated = block->allocated;
1938         block_info.active = block->allocated || (block->event_count > 0) ||
1939             !block->stream_uses.empty();
1940 
1941         segment_info.total_size += block_info.size;
1942         if (block_info.allocated) {
1943           segment_info.allocated_size += block_info.size;
1944         }
1945         if (block_info.active) {
1946           segment_info.active_size += block_info.size;
1947           segment_info.requested_size += block_info.requested_size;
1948         }
1949         block_info.context_when_allocated = block->context_when_allocated;
1950         block = block->next;
1951       }
1952       total_active += segment_info.active_size;
1953     }
1954 
1955     std::sort(
1956         result.begin(),
1957         result.end(),
1958         [](const SegmentInfo& a, const SegmentInfo& b) {
1959           return a.address < b.address;
1960         });
1961 
1962     record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr);
1963     return result;
1964   }
1965 
1966   std::vector<TraceEntry> trace(
1967       const std::function<time_t(approx_time_t)>& tsc_to_us) {
1968     std::lock_guard<std::recursive_mutex> lock(mutex);
1969     std::vector<TraceEntry> result;
1970     alloc_buffer.getEntries(result);
1971 
1972     // Convert all the timestamps from tsc to epoch time in microseconds.
1973     for (auto& te : result) {
1974       te.time_.t_ = tsc_to_us(te.time_.approx_t_);
1975     }
1976     return result;
1977   }
1978 
1979   // This function takes the size and the number of divisions and rounds the
1980   // size up to the nearest power-of-2 division boundary.
1981   // For example, if we need to round up 1200 and the number of divisions is 4,
1982   // the size 1200 lies between 1024 and 2048; dividing that range into 4 gives
1983   // the boundaries 1024, 1280, 1536, and 1792. So the function returns 1280 as
1984   // the nearest power-of-2 division ceiling.
1985   static size_t roundup_power2_next_division(size_t size, size_t divisions) {
1986     if (llvm::isPowerOf2_64(size)) {
1987       return size;
1988     }
1989 
1990     TORCH_CHECK(divisions >= 2, "Only 2 or more divisions are supported");
1991 
1992     // Divide the space between these powers of 2 into equal divisions.
1993     // If the computed division size is zero, return the power-of-2 ceiling.
1994     size_t power2_floor = llvm::PowerOf2Floor(size);
1995     size_t power2_divison =
1996         power2_floor >> (63 - llvm::countLeadingZeros(divisions));
1997     if (C10_UNLIKELY(power2_divison == 0)) {
1998       return (power2_floor << 1);
1999     }
2000     size_t round_size_floor = size & (~(power2_divison - 1));
2001     return (round_size_floor == size) ? size
2002                                       : round_size_floor + power2_divison;
2003   }
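  // [Editor's worked example for roundup_power2_next_division above.]
  //   roundup_power2_next_division(1200, 4):
  //     power2_floor     = 1024
  //     power2_divison   = 1024 >> (63 - countLeadingZeros(4)) = 1024 >> 2 = 256
  //     round_size_floor = 1200 & ~(256 - 1) = 1024
  //     result           = 1024 + 256 = 1280
  //   which matches the "1280" example in the comment above.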
2004 
2005   static size_t round_size(size_t size) {
2006     if (size < kMinBlockSize) {
2007       return kMinBlockSize;
2008     } else {
2009       auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
2010       if (divisions > 1 && size > (kMinBlockSize * divisions)) {
2011         return roundup_power2_next_division(size, divisions);
2012       } else {
2013         return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
2014       }
2015     }
2016   }
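  // [Editor's illustrative example -- assumes kMinBlockSize is 512 bytes.]
  //   With no roundup_power2_divisions configured:
  //     round_size(1)    -> 512          (clamped to the minimum block size)
  //     round_size(1025) -> 1536         (rounded up to a 512-byte multiple)
  //   With roundup_power2_divisions(5000) == 4 (and 5000 > 512 * 4):
  //     round_size(5000) -> 5120         (via roundup_power2_next_division:
  //                                       4096 plus one 1024-byte division)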
2017 
2018   // See Note [Interaction with CUDA graph capture]
2019 
2020   // Called by CUDAGraph::capture_begin
2021   void beginAllocateToPool(
2022       MempoolId_t mempool_id,
2023       std::function<bool(cudaStream_t)> filter) {
2024     std::lock_guard<std::recursive_mutex> lock(mutex);
2025     auto it = graph_pools.find(mempool_id);
2026     if (it == graph_pools.end()) {
2027       // mempool_id does not reference an existing pool. Make a new pool for
2028       // this capture.
2029       graph_pools.emplace(mempool_id, std::make_unique<PrivatePool>());
2030     } else {
2031       // mempool_id references an existing pool, which the current capture will
2032       // share. Check this pool is live (at least one other capture already
2033       // references it).
2034       TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
2035       it->second->use_count++;
2036     }
2037     for (auto it2 = captures_underway.begin(); it2 != captures_underway.end();
2038          ++it2) {
2039       TORCH_CHECK(
2040           it2->first != mempool_id,
2041           "beginAllocateToPool: already recording to mempool_id");
2042     }
2043     captures_underway.emplace_back(mempool_id, std::move(filter));
2044   }
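  // [Editor's illustrative sketch -- hypothetical capture setup.]
  // The filter decides which streams' allocations are routed into this
  // private pool while the capture is underway, e.g.:
  //
  //   cudaStream_t capture_stream = /* stream being captured */;
  //   beginAllocateToPool(
  //       mempool_id,
  //       [capture_stream](cudaStream_t s) { return s == capture_stream; });
  //   // ... capture work; allocations on capture_stream go to the pool ...
  //   endAllocateToPool(mempool_id);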
2045 
2046   // Called by CUDAGraph::capture_end
2047   void endAllocateToPool(MempoolId_t mempool_id) {
2048     std::lock_guard<std::recursive_mutex> lock(mutex);
2049     for (auto it = captures_underway.begin(); it != captures_underway.end();
2050          ++it) {
2051       if (it->first == mempool_id) {
2052         captures_underway.erase(it);
2053         return;
2054       }
2055     }
2056     TORCH_CHECK(
2057         false, "endAllocatePool: not currently recording to mempool_id");
2058   }
2059 
2060   // Called by CUDAGraph::reset
2061   void releasePool(MempoolId_t mempool_id) {
2062     std::lock_guard<std::recursive_mutex> lock(mutex);
2063     // The instantiated cudaGraphExec_t has been destroyed. We can't blindly
2064     // delete and cudaFree the mempool its capture used, because
2065     //  1. other graph(s) might share the same pool
2066     //  2. the user might still hold references to output tensors allocated
2067     //  during capture.
2068     // To handle 1 and 2, we track the number of graphs using this particular
2069     // mempool. When the count reaches 0, we tell free_cached_blocks it may now
2070     // cudaFree blocks from this graph's pool when it discovers they're unused
2071     // (unsplit).
2072     auto it = graph_pools.find(mempool_id);
2073     TORCH_INTERNAL_ASSERT(it != graph_pools.end());
2074     auto uc = --(it->second->use_count);
2075     TORCH_INTERNAL_ASSERT(uc >= 0);
2076     if (uc == 0) {
2077       // Allows free_cached_blocks to begin cudaFreeing this pool's memory,
2078       // and makes sure this pool wasn't somehow made freeable already.
2079       // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
2080       bool inserted =
2081           graph_pools_freeable.insert({mempool_id, it->second.get()}).second;
2082       TORCH_INTERNAL_ASSERT(inserted);
2083     }
2084   }
2085 
2086   void addPeerAccess(c10::DeviceIndex dev_to_access) {
2087     std::lock_guard<std::recursive_mutex> lock(mutex);
2088     if (std::find(
2089             devices_with_peer_access_.begin(),
2090             devices_with_peer_access_.end(),
2091             dev_to_access) != devices_with_peer_access_.end()) {
2092       return;
2093     }
2094     devices_with_peer_access_.push_back(dev_to_access);
2095     for (auto& es : expandable_segments_) {
2096       es->addPeer(dev_to_access);
2097     }
2098   }
2099   std::vector<c10::DeviceIndex> peers() const {
2100     std::lock_guard<std::recursive_mutex> lock(mutex);
2101     return devices_with_peer_access_;
2102   }
2103 
2104   bool hasAllocatedExpandableSegments() const {
2105     return !expandable_segments_.empty();
2106   }
2107 
2108  private:
2109   // None of the private methods acquire the allocator mutex.
2110 
2111   std::vector<const Block*> get_all_blocks() const {
2112     std::vector<const Block*> blocks;
2113     blocks.insert(
2114         blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end());
2115     blocks.insert(
2116         blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end());
2117     for (const auto& gp : graph_pools) {
2118       blocks.insert(
2119           blocks.end(),
2120           gp.second->small_blocks.blocks.begin(),
2121           gp.second->small_blocks.blocks.end());
2122       blocks.insert(
2123           blocks.end(),
2124           gp.second->large_blocks.blocks.begin(),
2125           gp.second->large_blocks.blocks.end());
2126     }
2127     blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end());
2128     return blocks;
2129   }
2130 
2131   std::vector<Block*> get_private_pool_head_blocks(PrivatePool* pool) const {
2132     std::vector<Block*> blocks;
2133     for (Block* b : active_blocks) {
2134       if ((b->pool == &pool->small_blocks || b->pool == &pool->large_blocks) &&
2135           b->prev == nullptr) {
2136         blocks.push_back(b);
2137       }
2138     }
2139 
2140     for (Block* b : pool->small_blocks.blocks) {
2141       if (b->prev == nullptr) {
2142         blocks.push_back(b);
2143       }
2144     }
2145     for (Block* b : pool->large_blocks.blocks) {
2146       if (b->prev == nullptr) {
2147         blocks.push_back(b);
2148       }
2149     }
2150 
2151     return blocks;
2152   }
2153 
2154   // returns the smallest possible address in any segment
2155   // where there is enough free address space to fit size
2156   // may be composed of free and unmapped segments
2157   Block* find_expandable_block(
2158       c10::DeviceIndex device,
2159       cudaStream_t stream,
2160       BlockPool* pool,
2161       size_t size) {
2162     Block key(device, stream, 0);
2163 
2164     auto allocatable = [](Block* b) {
2165       return b && !b->allocated && b->event_count == 0 &&
2166           b->stream_uses.empty();
2167     };
2168     auto has_available_address_space = [&](Block* b) {
2169       size_t bytes = 0;
2170       while (bytes < size && allocatable(b)) {
2171         bytes += b->size;
2172         b = b->next;
2173       }
2174       return bytes >= size;
2175     };
2176     for (auto it = pool->unmapped.lower_bound(&key);
2177          it != pool->unmapped.end() && (*it)->stream == stream;
2178          ++it) {
2179       Block* c = *it;
2180       // we found the lowest address of an unmapped segment
2181       // but there might be a free segment we can also use
2182       // right before it
2183       if (allocatable(c->prev)) {
2184         c = c->prev;
2185       }
2186       if (has_available_address_space(c)) {
2187         return c;
2188       }
2189     }
2190     auto segment_size = pool->is_small ? kSmallBuffer : kLargeBuffer;
2191     cudaDeviceProp prop{};
2192     C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
2193     // We allocate enough address space for 1 1/8 times the total memory on the GPU.
2194     // This allows for some cases where we have to unmap pages earlier in the
2195     // segment to put them at the end.
2196     size_t address_space_size = prop.totalGlobalMem + prop.totalGlobalMem / 8;
2197 
2198     expandable_segments_.emplace_back(new ExpandableSegment(
2199         device,
2200         stream,
2201         address_space_size,
2202         segment_size,
2203         devices_with_peer_access_));
2204 
2205     ExpandableSegment* es = expandable_segments_.back();
2206     Block* candidate = new Block(device, stream, es->size(), pool, es->ptr());
2207     candidate->mapped = false;
2208     candidate->expandable_segment_ = es;
2209     pool->unmapped.insert(candidate);
2210     return candidate;
2211   }
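  // [Editor's illustrative example -- hypothetical device size.]
  // On a GPU with prop.totalGlobalMem = 16 GiB, the reservation above is
  // 16 GiB + 16 GiB / 8 = 18 GiB of virtual address space; physical memory
  // is only attached later, as ranges of the segment are mapped in
  // map_block().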
2212 
2213   bool map_block(
2214       Block* to_map,
2215       size_t size,
2216       const std::shared_ptr<GatheredContext>& ctx) {
2217     TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size);
2218     TORCH_INTERNAL_ASSERT(
2219         !to_map->context_when_allocated); // unmapped blocks should not keep
2220                                           // history
2221     auto mapped_range =
2222         to_map->expandable_segment_->map(SegmentRange{to_map->ptr, size});
2223     // failed to map the memory
2224     if (mapped_range.size == 0) {
2225       return false;
2226     }
2227     TORCH_INTERNAL_ASSERT(
2228         mapped_range.ptr == to_map->ptr && mapped_range.size >= size);
2229 
2230     BlockPool& pool = *to_map->pool;
2231     pool.unmapped.erase(to_map);
2232     to_map->mapped = true;
2233 
2234     if (mapped_range.size < to_map->size) {
2235       // to_map -> remaining -> to_map->next(?)
2236       Block* remaining = new Block(
2237           to_map->device,
2238           to_map->stream,
2239           to_map->size - mapped_range.size,
2240           &pool,
2241           static_cast<char*>(to_map->ptr) + mapped_range.size);
2242       remaining->mapped = false;
2243       remaining->expandable_segment_ = to_map->expandable_segment_;
2244       remaining->splice(to_map, to_map->next);
2245       pool.unmapped.insert(remaining);
2246       to_map->size = mapped_range.size;
2247     }
2248 
2249     try_merge_blocks(to_map, to_map->prev, pool);
2250     try_merge_blocks(to_map, to_map->next, pool);
2251 
2252     pool.insert_into_blocks(to_map);
2253 
2254     // update statistics
2255     total_allocated_memory += mapped_range.size;
2256     StatTypes stat_types = get_stat_types_for_pool(*to_map->pool);
2257     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2258       stats.reserved_bytes[stat_type].increase(mapped_range.size);
2259     });
2260     auto reserved_bytes_gauge =
2261         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2262     reserved_bytes_gauge.record(
2263         stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2264             .current);
2265 
2266     stats.num_device_alloc++;
2267     record_trace(
2268         TraceEntry::SEGMENT_MAP,
2269         int64_t(mapped_range.ptr),
2270         mapped_range.size,
2271         to_map->stream,
2272         to_map->device,
2273         ctx);
2274     if (!to_map->prev && !to_map->context_when_segment_allocated) {
2275       to_map->context_when_segment_allocated = ctx;
2276     }
2277 
2278     return true;
2279   }
2280 
2281   Block* try_allocate_expandable_block(
2282       c10::DeviceIndex device,
2283       cudaStream_t stream,
2284       BlockPool* pool,
2285       size_t size,
2286       const std::shared_ptr<GatheredContext>& ctx) {
2287     Block* candidate = find_expandable_block(device, stream, pool, size);
2288     // Candidate is now a list of free/unmapped blocks with at least `size` of room:
2289     // unmapped -> null
2290     // unmapped -> free -> *
2291     // free -> unmapped -> *
2292 
2293     if (!candidate->mapped &&
2294         !map_block(candidate, std::min(candidate->size, size), ctx)) {
2295       return nullptr;
2296     }
2297     TORCH_INTERNAL_ASSERT(candidate->mapped);
2298 
2299     while (candidate->size < size) {
2300       // invariant: free -> unmapped -> *
2301       // map_block will map some of unmapped and merge with free
2302       auto remaining = size - candidate->size;
2303       auto new_candidate = candidate->next;
2304       if (!map_block(
2305               new_candidate, std::min(remaining, candidate->next->size), ctx)) {
2306         return nullptr;
2307       }
2308       candidate = new_candidate;
2309     }
2310     pool->blocks.erase(candidate);
2311     return candidate;
2312   }
2313 
2314   /** moves a block into a pool of cached free blocks */
2315   void free_block(
2316       Block* block,
2317       const std::shared_ptr<GatheredContext>& context) {
2318     TORCH_INTERNAL_ASSERT(
2319         !block->allocated && block->event_count == 0 &&
2320         block->stream_uses.empty());
2321 
2322     record_trace(
2323         TraceEntry::FREE_COMPLETED,
2324         int64_t(block->ptr),
2325         block->requested_size,
2326         block->stream,
2327         block->device,
2328         context ? context : block->context_when_allocated);
2329 
2330     block->context_when_allocated = nullptr;
2331     size_t original_block_size = block->size;
2332     size_t requested_size = block->requested_size;
2333 
2334     auto& pool = *block->pool;
2335     int64_t net_change_inactive_split_blocks = 0;
2336     int64_t net_change_inactive_split_size = 0;
2337 
2338     const std::array<Block*, 2> merge_candidates = {block->prev, block->next};
2339     for (Block* merge_candidate : merge_candidates) {
2340       const auto subsumed_size = try_merge_blocks(block, merge_candidate, pool);
2341       if (subsumed_size > 0) {
2342         net_change_inactive_split_blocks -= 1;
2343         net_change_inactive_split_size -= static_cast<int64_t>(subsumed_size);
2344       }
2345     }
2346 
2347     active_blocks.erase(block);
2348     // Makes sure the Block* isn't already present in the pool we're freeing it
2349     // back into.
2350     // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
2351     bool inserted = pool.insert_into_blocks(block).second;
2352     TORCH_INTERNAL_ASSERT(inserted);
2353 
2354     if (block->is_split()) {
2355       net_change_inactive_split_blocks += 1;
2356       net_change_inactive_split_size += static_cast<int64_t>(block->size);
2357     }
2358 
2359     StatTypes stat_types = get_stat_types_for_pool(pool);
2360 
2361     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2362       // inactive_split tries to capture the idea that blocks
2363       // cannot be freed when requested, but fully free pages
2364       // of expandable blocks can always be freed.
2365       // The logic to track this as statistic is pretty involved,
2366       // so we simply just exclude expandable segments from
2367       // inactive_split
2368       if (!block->expandable_segment_) {
2369         if (net_change_inactive_split_blocks > 0) {
2370           stats.inactive_split[stat_type].increase(
2371               static_cast<size_t>(net_change_inactive_split_blocks));
2372         } else if (net_change_inactive_split_blocks < 0) {
2373           stats.inactive_split[stat_type].decrease(
2374               static_cast<size_t>(-net_change_inactive_split_blocks));
2375         }
2376         if (net_change_inactive_split_size > 0) {
2377           stats.inactive_split_bytes[stat_type].increase(
2378               static_cast<size_t>(net_change_inactive_split_size));
2379         } else if (net_change_inactive_split_size < 0) {
2380           stats.inactive_split_bytes[stat_type].decrease(
2381               static_cast<size_t>(-net_change_inactive_split_size));
2382         }
2383       }
2384       stats.active[stat_type].decrease(1);
2385       stats.active_bytes[stat_type].decrease(original_block_size);
2386       stats.requested_bytes[stat_type].decrease(requested_size);
2387     });
2388   }
2389 
2390   /** combine previously split blocks. returns the size of the subsumed block,
2391    * or 0 on failure. */
2392   size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
2393     if (!src || src->allocated || src->event_count > 0 ||
2394         !src->stream_uses.empty() || dst->mapped != src->mapped) {
2395       return 0;
2396     }
2397 
2398     AT_ASSERT(dst->is_split() && src->is_split());
2399 
2400     if (dst->prev == src) { // [src dst]
2401       dst->ptr = src->ptr;
2402       dst->prev = src->prev;
2403       if (dst->prev) {
2404         dst->prev->next = dst;
2405       }
2406       dst->context_when_segment_allocated =
2407           std::move(src->context_when_segment_allocated);
2408     } else { // [dest src]
2409       dst->next = src->next;
2410       if (dst->next) {
2411         dst->next->prev = dst;
2412       }
2413     }
2414     const size_t subsumed_size = src->size;
2415     dst->size += subsumed_size;
2416     // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
2417     auto erased =
2418         src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src);
2419     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(erased == 1);
2420     delete src;
2421 
2422     return subsumed_size;
2423   }
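  // [Editor's illustrative example for try_merge_blocks above.]
  // Given contiguous blocks [A][B] from one segment where B is being freed
  // and A is already free (and mapped the same way), calling
  // try_merge_blocks(B, A, pool) takes the [src dst] branch: B->ptr becomes
  // A->ptr, B->size grows by A->size, A is unlinked from the list, erased
  // from the pool and deleted, and A->size is returned as subsumed_size.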
2424 
2425   BlockPool& get_pool(size_t size, cudaStream_t stream) {
2426     // captures_underway is a conservative guess that the current stream may be
2427     // capturing. It's only non-empty if some thread has begun and not yet ended
2428     // a capture, so it's usually empty, and we can short-circuit the
2429     // cudaStreamCaptureStatus check (which does a TLS lookup).
2430     if (C10_UNLIKELY(!captures_underway.empty())) {
2431       for (auto& entry : captures_underway) {
2432         if (entry.second(stream)) {
2433           auto it1 = graph_pools.find(entry.first);
2434           TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
2435           if (size <= kSmallSize) {
2436             return it1->second->small_blocks;
2437           } else {
2438             return it1->second->large_blocks;
2439           }
2440         }
2441       }
2442     }
2443     if (size <= kSmallSize) {
2444       return small_blocks;
2445     } else {
2446       return large_blocks;
2447     }
2448   }
2449 
2450   StatTypes get_stat_types_for_pool(const BlockPool& pool) {
2451     StatTypes stat_types = {false};
2452     stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
2453     stat_types[static_cast<size_t>(
2454         pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true;
2455     return stat_types;
2456   }
2457 
2458   bool should_split(const Block* block, size_t size) {
2459     size_t remaining = block->size - size;
2460     if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
2461       return remaining >= kMinBlockSize;
2462     } else {
2463       return (size < CUDAAllocatorConfig::max_split_size()) &&
2464           (remaining > kSmallSize);
2465     }
2466   }
2467 
2468   static size_t get_allocation_size(size_t size) {
2469     if (size <= kSmallSize) {
2470       return kSmallBuffer;
2471     } else if (size < kMinLargeAlloc) {
2472       return kLargeBuffer;
2473     } else {
2474       return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
2475     }
2476   }
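  // [Editor's illustrative example -- assumes kSmallSize = 1 MiB,
  //  kSmallBuffer = 2 MiB, kMinLargeAlloc = 10 MiB and kRoundLarge = 2 MiB.]
  //   get_allocation_size(512 * 1024)       -> 2 MiB   (one small buffer)
  //   get_allocation_size(4 * 1024 * 1024)  -> 20 MiB  (kLargeBuffer)
  //   get_allocation_size(21 * 1024 * 1024) -> 22 MiB  (rounded up to a
  //                                                     kRoundLarge multiple)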
2477 
2478   bool get_free_block(AllocParams& p) {
2479     BlockPool& pool = *p.pool;
2480 
2481     if (C10_UNLIKELY(
2482             set_fraction &&
2483             CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
2484       // Track block reuse interval only when garbage collection is enabled.
2485       ++pool.get_free_blocks_call_count;
2486     }
2487     auto it = pool.blocks.lower_bound(&p.search_key);
2488     if (it == pool.blocks.end() || (*it)->stream != p.stream())
2489       return false;
2490 
2491     if ((*it)->expandable_segment_) {
2492       if (CUDAAllocatorConfig::expandable_segments()) {
2493         // If we are allocating from the expandable part of a block, then for
2494         // the purposes of "best fit" we consider its size to be the size it
2495         // can expand to, not the size it currently is. This means that we
2496         // sometimes have to search for blocks with a bigger 'size' before
2497         // choosing this segment.
2498         auto expandable_size = [](Block* b) {
2499           return b->size + (b->next && !b->next->mapped ? b->next->size : 0);
2500         };
2501         auto next = it;
2502         next++;
2503         while ((*it)->expandable_segment_ && next != pool.blocks.end() &&
2504                (*next)->stream == p.stream() &&
2505                expandable_size(*next) < expandable_size(*it)) {
2506           it = next++;
2507         }
2508       } else {
2509         // Rarely, expandable segments have been turned off after we have
2510         // already allocated some blocks as expandable. For instance,
2511         // since we cannot share expandable memory via IPC, someone might
2512         // temporarily disable it. In this case we need to honor the request
2513         // by only finding non-expandable blocks.
2514         do {
2515           it++;
2516         } while (it != pool.blocks.end() && (*it)->expandable_segment_ &&
2517                  (*it)->stream == p.stream());
2518         if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
2519           return false;
2520         }
2521       }
2522     }
2523 
2524     // Do not return an oversized block for a large request
2525     if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
2526         ((*it)->size >= CUDAAllocatorConfig::max_split_size()))
2527       return false;
2528     // Allow oversized block size to be rounded up but within a limit
2529     if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
2530         ((*it)->size >= p.size() + kLargeBuffer))
2531       return false;
2532     p.block = *it;
2533     pool.blocks.erase(it);
2534     return true;
2535   }
2536 
2537   bool trigger_free_memory_callbacks(AllocParams& p) {
2538     bool freed_memory = false;
2539     for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) {
2540       freed_memory |=
2541           FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute();
2542     }
2543     return freed_memory;
2544   }
2545 
2546   void garbage_collect_cached_blocks(
2547       const std::shared_ptr<GatheredContext>& context) {
2548     // Free unused cached blocks to reclaim GPU memory.
2549     // Unlike release_cached_blocks(), this does not enforce synchronization and
2550     // therefore should have less overhead.
2551 
2552     size_t gc_threshold = static_cast<size_t>(
2553         CUDAAllocatorConfig::garbage_collection_threshold() *
2554         static_cast<double>(allowed_memory_maximum));
2555     // No need to trigger GC yet
2556     if (total_allocated_memory <= gc_threshold) {
2557       return;
2558     }
2559     const auto target_size = total_allocated_memory - gc_threshold;
2560     size_t gc_reclaimed = 0;
2561 
2562     // Calculate the total age of the free-able blocks. We'll use it later to
2563     // get "avg age" threshold.
2564     size_t total_age = 0;
2565     int freeable_block_count = 0;
2566     for (auto& b : large_blocks.blocks) {
2567       if (!b->is_split()) {
2568         total_age += b->gc_count();
2569         ++freeable_block_count;
2570       }
2571     }
2572     // No free-able blocks?
2573     if (freeable_block_count == 0) {
2574       return;
2575     }
2576 
2577     // Repeat GC until we reach reclaim > target size.
2578     bool block_freed = true;
2579     while (gc_reclaimed < target_size && block_freed == true &&
2580            freeable_block_count > 0) {
2581       // Free blocks exceeding this age threshold first.
2582       double age_threshold =
2583           static_cast<double>(total_age) / freeable_block_count;
2584       // Stop iteration if we can no longer free a block.
2585       block_freed = false;
2586 
2587       // Free blocks of > avg age. Don't stop upon reaching the target_size,
2588       // we don't want this GC to be triggered frequently.
2589       auto it = large_blocks.blocks.begin();
2590       while (it != large_blocks.blocks.end()) {
2591         Block* block = *it;
2592         ++it;
2593         if (!block->is_split() && !block->expandable_segment_ &&
2594             static_cast<double>(block->gc_count()) >= age_threshold) {
2595           block_freed = true;
2596           gc_reclaimed += block->size;
2597           total_age -= block->gc_count(); // Decrement the age
2598           freeable_block_count--; // One less block that can be freed
2599           release_block(block, context);
2600         }
2601       }
2602     }
2603   }
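  // [Editor's illustrative example -- hypothetical numbers.]
  // Say gc_threshold = 8 GiB and total_allocated_memory = 10 GiB, so
  // target_size = 2 GiB. With three unsplit large blocks of gc_count
  // {10, 4, 1} (total_age = 15, freeable_block_count = 3), the first pass
  // uses age_threshold = 5 and releases only the age-10 block; if that is
  // still under 2 GiB, the next pass recomputes age_threshold = (4 + 1) / 2
  // = 2.5 and releases the age-4 block, and so on until gc_reclaimed
  // reaches target_size or no block can be freed.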
2604 
2605   // This function assumes that the global lock has been taken while calling
2606   // into it. We make a synchronous cudaMalloc call in this function, which
2607   // can be expensive while holding the lock. Hence, we pass in the lock to
2608   // the function to temporarily release it before the cudaMalloc call and
2609   // acquire it back again after the call so that other threads don't get blocked.
2610   bool alloc_block(
2611       AllocParams& p,
2612       bool isRetry,
2613       const std::shared_ptr<GatheredContext>& ctx,
2614       std::unique_lock<std::recursive_mutex>& lock) {
2615     // Defensively checks for preexisting CUDA error state.
2616     C10_CUDA_CHECK(cudaGetLastError());
2617 
2618     size_t size = p.alloc_size;
2619     void* ptr = nullptr;
2620 
2621     if (isRetry) {
2622       stats.num_alloc_retries += 1;
2623     }
2624 #ifdef FBCODE_CAFFE2
2625     bool in_fbcode = true;
2626 #else
2627     bool in_fbcode = false;
2628 #endif
2629 
2630     if (set_fraction &&
2631         total_allocated_memory + size > allowed_memory_maximum) {
2632       p.err = cudaErrorMemoryAllocation;
2633       return false;
2634       // Temporarily disable checkpointing & cudagraphs internally
2635     } else if (
2636         CUDAAllocatorConfig::expandable_segments() &&
2637         !(in_fbcode && p.pool->owner_PrivatePool)) {
2638       p.block = try_allocate_expandable_block(
2639           p.device(), p.stream(), p.pool, p.size(), ctx);
2640       if (p.block) {
2641         p.err = cudaSuccess;
2642         if (p.pool->owner_PrivatePool) {
2643           // The block is for a CUDA graph's PrivatePool.
2644           p.pool->owner_PrivatePool->cudaMalloc_count++;
2645         }
2646       } else {
2647         p.err = cudaErrorMemoryAllocation;
2648       }
2649       return bool(p.block);
2650     } else {
2651       if (CUDAAllocatorConfig::release_lock_on_cudamalloc()) {
2652         // At scope exit, acquire the lock again. This provides safety against
2653         // any potential exceptions in the cudaMallocMaybeCapturing function.
2654         auto sg = c10::make_scope_exit([&]() { lock.lock(); });
2655         lock.unlock();
2656       }
2657       auto active_pool = MemPoolContext::getActiveMemPool();
2658       if (active_pool && active_pool->allocator() &&
2659           p.pool->owner_PrivatePool) {
2660         ptr = active_pool->allocator()->raw_alloc(size);
2661         p.err = ptr ? cudaSuccess : cudaErrorMemoryAllocation;
2662       } else {
2663         p.err = cudaMallocMaybeCapturing(&ptr, size);
2664       }
2665       if (CUDAAllocatorConfig::release_lock_on_cudamalloc()) {
2666         TORCH_CHECK(
2667             lock.owns_lock(), "Failed to acquire lock after cudaMalloc");
2668       }
2669 
2670       if (p.err != cudaSuccess) {
2671         if (p.err == cudaErrorMemoryAllocation) {
2672           // If this is the first attempt (!isRetry), we can forgive and clear
2673           // CUDA's internal error state.
2674           //
2675           // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH
2676           // will take over to throw a helpful exception. The user can choose
2677           // to catch the exception, free some stuff in their script, and
2678           // attempt the allocation again. In this case, we can also forgive and
2679           // clear CUDA's internal error state.
2680           (void)cudaGetLastError();
2681         } else {
2682           // If the error's unrelated to memory allocation, we should throw
2683           // immediately.
2684           C10_CUDA_CHECK(p.err);
2685         }
2686         return false;
2687       }
2688     }
2689 
2690     if (p.pool->owner_PrivatePool) {
2691       // The block is for a CUDA graph's PrivatePool.
2692       p.pool->owner_PrivatePool->cudaMalloc_count++;
2693     }
2694 
2695     total_allocated_memory += size;
2696     p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
2697     for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
2698       stats.segment[stat_type].increase(1);
2699       stats.reserved_bytes[stat_type].increase(size);
2700     });
2701     if (size >= CUDAAllocatorConfig::max_split_size())
2702       stats.oversize_segments.increase(1);
2703     auto reserved_bytes_gauge =
2704         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2705     reserved_bytes_gauge.record(
2706         stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2707             .current);
2708 
2709     // p.block came from new, not cudaMalloc. It should not be nullptr here.
2710     TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
2711     stats.num_device_alloc++;
2712     record_trace(
2713         TraceEntry::SEGMENT_ALLOC,
2714         int64_t(p.block->ptr),
2715         p.block->size,
2716         p.stream(),
2717         p.device(),
2718         ctx);
2719     p.block->context_when_segment_allocated = ctx;
2720     return true;
2721   }
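  // Note: the expandable-segment branch and the unlock-around-cudaMalloc path
  // above are driven by allocator configuration; an illustrative setting
  // (exact option spelling is defined by CUDAAllocatorConfig) would be:
  //
  //   PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,release_lock_on_cudamalloc:True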
2722 
2723   /** Free one or more oversize blocks to the system allocator, but only
2724    *  enough to satisfy the target size. **/
2725 
2726   bool release_available_cached_blocks(
2727       const AllocParams& p,
2728       const std::shared_ptr<GatheredContext>& context) {
2729     if (CUDAAllocatorConfig::max_split_size() ==
2730         std::numeric_limits<size_t>::max())
2731       return false;
2732     BlockPool& pool = *p.pool;
2733 
2734     // because of std::unique_ptr, block cannot be trivially copied
2735     // Use constructor for search key.
2736     Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
2737     key.size = (key.size < CUDAAllocatorConfig::max_split_size())
2738         ? CUDAAllocatorConfig::max_split_size()
2739         : key.size;
2740     auto it = pool.blocks.lower_bound(&key);
2741     if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
2742         (*it)->expandable_segment_) {
2743       // No single block is large enough; free multiple oversize blocks,
2744       // starting with the largest
2745       if (it == pool.blocks.begin())
2746         return false;
2747       size_t totalReleased = 0;
2748       --it; // Back up one item.  Now on the largest block for the correct
2749             // stream
2750       while ((totalReleased < key.size) &&
2751              ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
2752              ((*it)->stream == p.stream())) {
2753         auto cur = it;
2754         bool is_first = cur == pool.blocks.begin();
2755         if (!is_first) {
2756           --it;
2757         }
2758         if (!(*cur)->expandable_segment_) {
2759           release_block(*cur, context);
2760           totalReleased += (*cur)->size;
2761         }
2762         if (is_first) {
2763           break;
2764         }
2765       }
2766       if (totalReleased < key.size)
2767         return false;
2768     } else {
2769       release_block(*it, context);
2770     }
2771     return true;
2772   }
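  // Worked example (illustrative): with max_split_size_mb:200, the search key
  // is max(requested size, 200 MiB), so only "oversize" (non-splittable)
  // cached blocks are candidates for release. If the lower_bound lookup finds
  // no single matching block on the requesting stream, the loop walks back
  // from the largest same-stream oversize block, releasing blocks until at
  // least key.size bytes have been freed; otherwise it returns false.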
2773 
2774   bool release_cached_blocks(const std::shared_ptr<GatheredContext>& context) {
2775     // First ensure that all blocks that can't currently be allocated due to
2776     // outstanding events are returned to the pool.
2777     synchronize_and_free_events(context);
2778 
2779     // Free all non-split cached blocks to system allocator
2780     release_blocks(large_blocks, context);
2781     release_blocks(small_blocks, context);
2782 
2783     for (auto it = graph_pools_freeable.begin();
2784          it != graph_pools_freeable.end();) {
2785       // See notifyCaptureDestroy for the strategy here.
2786       TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
2787       release_blocks(it->second->small_blocks, context);
2788       release_blocks(it->second->large_blocks, context);
2789       if (it->second->cudaMalloc_count == 0) {
2790         auto erase_count = graph_pools.erase(it->first);
2791         TORCH_INTERNAL_ASSERT(erase_count == 1);
2792         it = graph_pools_freeable.erase(it);
2793       } else {
2794         ++it;
2795       }
2796     }
2797 
2798     return true;
2799   }
2800 
2801   void release_expandable_segment(Block* block) {
2802     TORCH_INTERNAL_ASSERT(
2803         block->size == block->expandable_segment_->size(),
2804         "block disagrees with segment");
2805     TORCH_INTERNAL_ASSERT(!block->mapped);
2806     auto it = std::find(
2807         expandable_segments_.begin(),
2808         expandable_segments_.end(),
2809         block->expandable_segment_);
2810     TORCH_INTERNAL_ASSERT(it != expandable_segments_.end());
2811     expandable_segments_.erase(it);
2812     block->pool->unmapped.erase(block);
2813     delete block->expandable_segment_;
2814     delete block;
2815   }
2816 
2817   void release_block(
2818       Block* block,
2819       const std::shared_ptr<GatheredContext>& context) {
2820     TORCH_INTERNAL_ASSERT(!block->expandable_segment_);
2821     stats.num_device_free++;
2822     record_trace(
2823         TraceEntry::SEGMENT_FREE,
2824         int64_t(block->ptr),
2825         block->size,
2826         block->stream,
2827         block->device,
2828         context ? context : block->context_when_segment_allocated);
2829 
2830     C10_CUDA_CHECK(cudaFree((void*)block->ptr));
2831     total_allocated_memory -= block->size;
2832 
2833     auto* pool = block->pool;
2834     if (pool->owner_PrivatePool) {
2835       // The cudaFreed block belonged to a CUDA graph's PrivatePool.
2836       TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->cudaMalloc_count > 0);
2837       pool->owner_PrivatePool->cudaMalloc_count--;
2838     }
2839 
2840     StatTypes stat_types = get_stat_types_for_pool(*pool);
2841     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2842       stats.segment[stat_type].decrease(1);
2843       stats.reserved_bytes[stat_type].decrease(block->size);
2844     });
2845     auto reserved_bytes_gauge =
2846         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2847     reserved_bytes_gauge.record(
2848         stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2849             .current);
2850 
2851     if (block->size >= CUDAAllocatorConfig::max_split_size())
2852       stats.oversize_segments.decrease(1);
2853     pool->blocks.erase(block);
2854     delete block;
2855   }
2856 
2857   void unmap_block(
2858       Block* block,
2859       const std::shared_ptr<GatheredContext>& context) {
2860     auto unmapped = block->expandable_segment_->unmap(
2861         SegmentRange{block->ptr, block->size});
2862     if (unmapped.size == 0) {
2863       return;
2864     }
2865     block->pool->blocks.erase(block);
2866 
2867     ptrdiff_t before_size =
2868         static_cast<char*>(unmapped.ptr) - static_cast<char*>(block->ptr);
2869     if (before_size > 0) {
2870       // prev? -> before_free -> block
2871       Block* before_free = new Block(
2872           block->device, block->stream, before_size, block->pool, block->ptr);
2873       before_free->expandable_segment_ = block->expandable_segment_;
2874       before_free->splice(block->prev, block);
2875       block->pool->insert_into_blocks(before_free);
2876     }
2877 
2878     auto after_size = block->size - (before_size + unmapped.size);
2879     if (after_size > 0) {
2880       // block -> after_free -> next?
2881       Block* after_free = new Block(
2882           block->device,
2883           block->stream,
2884           after_size,
2885           block->pool,
2886           static_cast<char*>(unmapped.ptr) + unmapped.size);
2887       after_free->expandable_segment_ = block->expandable_segment_;
2888       after_free->splice(block, block->next);
2889       block->pool->insert_into_blocks(after_free);
2890     }
2891 
2892     block->ptr = unmapped.ptr;
2893     block->size = unmapped.size;
2894     block->mapped = false;
2895 
2896     try_merge_blocks(block, block->prev, *block->pool);
2897     try_merge_blocks(block, block->next, *block->pool);
2898     block->pool->unmapped.insert(block);
2899 
2900     // update statistics
2901     total_allocated_memory -= unmapped.size;
2902     StatTypes stat_types = get_stat_types_for_pool(*block->pool);
2903     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2904       stats.reserved_bytes[stat_type].decrease(unmapped.size);
2905     });
2906     auto reserved_bytes_gauge =
2907         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2908     reserved_bytes_gauge.record(
2909         stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2910             .current);
2911 
2912     if (block->pool->owner_PrivatePool) {
2913       // The cudaFreed block belonged to a CUDA graph's PrivatePool.
2914       TORCH_INTERNAL_ASSERT(
2915           block->pool->owner_PrivatePool->cudaMalloc_count > 0);
2916       block->pool->owner_PrivatePool->cudaMalloc_count--;
2917     }
2918 
2919     stats.num_device_free++;
2920     record_trace(
2921         TraceEntry::SEGMENT_UNMAP,
2922         int64_t(unmapped.ptr),
2923         unmapped.size,
2924         block->stream,
2925         block->device,
2926         context ? context : block->context_when_segment_allocated);
2927   }
2928   void release_blocks(
2929       BlockPool& pool,
2930       const std::shared_ptr<GatheredContext>& context) {
2931     std::vector<Block*> to_unmap;
2932     // Frees all non-split blocks
2933     auto it = pool.blocks.begin();
2934     while (it != pool.blocks.end()) {
2935       Block* block = *it;
2936       ++it;
2937       if (block->expandable_segment_) {
2938         // unmapping will mutate the free pool
2939         // so just gather what needs to be freed
2940         // to avoid invalidating the iterator
2941         to_unmap.push_back(block);
2942       } else if (!block->prev && !block->next) {
2943         release_block(block, context);
2944       }
2945     }
2946     for (Block* block : to_unmap) {
2947       unmap_block(block, context);
2948       if (!block->prev && !block->next) {
2949         release_expandable_segment(block);
2950       }
2951     }
2952   }
2953 
2954   EventPool::Event create_event_internal(c10::DeviceIndex idx) {
2955     // Leak the event pool to avoid shutdown issues.
2956     static auto* event_pool = new EventPool();
2957     return event_pool->get(idx);
2958   }
2959 
2960   void synchronize_and_free_events(
2961       const std::shared_ptr<GatheredContext>& context) {
2962     // Synchronize on outstanding events and then free associated blocks.
2963     stats.num_sync_all_streams++;
2964 
2965     // This function syncs, so capture should not be underway. Might as well
2966     // make sure capture-deferred end of life events get processed too.
2967     TORCH_INTERNAL_ASSERT(captures_underway.empty());
2968     insert_events_deferred_until_no_capture(context);
2969 
2970     for (auto& st : cuda_events) {
2971       for (auto& e : st.second) {
2972         EventPool::Event event = std::move(e.first);
2973         Block* block = e.second;
2974 
2975         C10_CUDA_CHECK(cudaEventSynchronize(*event));
2976 
2977         block->event_count--;
2978         if (block->event_count == 0) {
2979           free_block(block, context);
2980         }
2981       }
2982     }
2983 
2984     cuda_events.clear();
2985   }
2986 
2987   void remove_cudagraph_stream_uses(Block* block) {
2988     // remove stream uses added during cudagraph capture
2989     // (i.e., block->stream_uses - block->cudagraph_stream_uses)
2990     if (C10_UNLIKELY(
2991             block_to_cudagraph_stream_uses.find(block) !=
2992             block_to_cudagraph_stream_uses.end())) {
2993       stream_set streams(std::move(block->stream_uses));
2994       AT_ASSERT(block->stream_uses.empty());
2995       for (auto& stream : streams) {
2996         if (block_to_cudagraph_stream_uses[block].find(stream) ==
2997             block_to_cudagraph_stream_uses[block].end()) {
2998           block->stream_uses.insert(stream);
2999         }
3000       }
3001       block_to_cudagraph_stream_uses.erase(block);
3002     }
3003   }
3004 
3005   void insert_events(Block* block) {
3006     c10::DeviceIndex prev_device = 0;
3007     C10_CUDA_CHECK(c10::cuda::GetDevice(&prev_device));
3008 
3009     stream_set streams(std::move(block->stream_uses));
3010     AT_ASSERT(block->stream_uses.empty());
3011     for (auto& stream : streams) {
3012       C10_CUDA_CHECK(c10::cuda::SetDevice(stream.device_index()));
3013 
3014       EventPool::Event event = create_event_internal(stream.device_index());
3015       C10_CUDA_CHECK(cudaEventRecord(*event, stream.stream()));
3016 
3017       block->event_count++;
3018       cuda_events[stream].emplace_back(std::move(event), block);
3019     }
3020 
3021     C10_CUDA_CHECK(c10::cuda::MaybeSetDevice(prev_device));
3022   }
3023 
3024   void insert_events_deferred_until_no_capture(
3025       const std::shared_ptr<GatheredContext>& context) {
3026     if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) {
3027       for (auto* block : needs_events_deferred_until_no_capture) {
3028         TORCH_INTERNAL_ASSERT(!block->stream_uses.empty());
3029         // only streams recorded before cudagraph will be used to insert events
3030         // since we know all streams recorded during cudagraph must have
3031         // completed (refer to Section 3.2.8.7.3.1 Cross-stream Dependencies and
3032         // Events in CUDA Programming Guide).
3033         remove_cudagraph_stream_uses(block);
3034         insert_events(block);
3035         if (block->event_count == 0) {
3036           free_block(block, context);
3037         }
3038       }
3039       needs_events_deferred_until_no_capture.clear();
3040     }
3041   }
3042 
3043   void process_events(const std::shared_ptr<GatheredContext>& context) {
3044     insert_events_deferred_until_no_capture(context);
3045 
3046     // Process outstanding cudaEvents. Events that are completed are
3047     // removed from the queue, and the 'event_count' for the
3048     // corresponding allocation is decremented. We maintain a separate
3049     // list of events per stream to avoid head-of-line delays if one
3050     // or more streams have long-running operations.
3051 
3052     // Iterate over different streams.
3053     for (auto it = cuda_events.begin(); it != cuda_events.end();) {
3054       // Iterate over this stream's (event, block) pairs.
3055       while (!it->second.empty()) {
3056         auto& e = it->second.front();
3057         EventPool::Event event = std::move(e.first);
3058         Block* block = e.second;
3059 
3060         cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(*event));
3061         if (err == cudaErrorNotReady) {
3062           // ignore and clear the error if not ready
3063           (void)cudaGetLastError();
3064           // Return the ownership of the Event (unique ptr)
3065           e.first = std::move(event);
3066           break;
3067         } else if (err != cudaSuccess) {
3068           C10_CUDA_CHECK(err);
3069         }
3070 
3071         block->event_count--;
3072         if (block->event_count == 0) {
3073           free_block(block, context);
3074         }
3075         it->second.pop_front();
3076       }
3077 
3078       if (it->second.empty()) {
3079         it = cuda_events.erase(it);
3080       } else {
3081         it++;
3082       }
3083     }
3084   }
3085 
3086   // Iterates over sizes of all memory blocks for given device in given pool
3087   void cache_info_aux(const BlockPool& pool, size_t* largest) {
3088     for (const auto& block : pool.blocks) {
3089       const auto blocksize = block->size;
3090       if (blocksize > *largest) {
3091         *largest = blocksize;
3092       }
3093     }
3094   }
3095 
3096   void record_trace(
3097       TraceEntry::Action action,
3098       size_t addr,
3099       size_t size,
3100       cudaStream_t stream,
3101       c10::DeviceIndex device,
3102       std::shared_ptr<GatheredContext> context) {
3103     if (!record_history && trace_trackers_.empty())
3104       return;
3105 
3106     auto te = TraceEntry(
3107         action,
3108         device,
3109         addr,
3110         size,
3111         stream,
3112         getApproximateTime(),
3113         record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr);
3114 
3115     // Callbacks should not include any PyTorch calls
3116     for (const auto& cb : trace_trackers_) {
3117       cb(te);
3118     }
3119 
3120     if (record_history) {
3121       alloc_buffer.insertEntries(te);
3122     }
3123   }
3124 };
3125 
3126 // Returns whether to force all allocations to bypass the caching allocator and
3127 // go straight to cudaMalloc.  This setting is useful when debugging GPU memory
3128 // errors, since the caching allocator foils cuda-memcheck.
3129 bool forceUncachedAllocator() {
3130   static bool force_uncached =
3131       getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr;
3132   return force_uncached;
3133 }
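// Usage sketch (command line is illustrative only): bypassing the cache makes
// every allocation a real cudaMalloc/cudaFree, which memory-checking tools
// such as compute-sanitizer can then observe directly, e.g.
//
//   PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer python my_script.py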
3134 
3135 static void uncached_delete(void* ptr) {
3136   if (TORCH_SDT_IS_ENABLED(free)) {
3137     TORCH_SDT_WITH_SEMAPHORE(free, ptr);
3138   }
3139 
3140   const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3141   if (C10_UNLIKELY(interp)) {
3142     (*interp)->trace_gpu_memory_deallocation(
3143         c10::kCUDA, reinterpret_cast<uintptr_t>(ptr));
3144   }
3145   C10_CUDA_CHECK(cudaFree(ptr));
3146 }
3147 
3148 void local_raw_delete(void* ptr);
3149 
3150 class NativeCachingAllocator : public CUDAAllocator {
3151  private:
3152   // Shard allocation region to have independent mutexes to reduce contention.
3153   static constexpr size_t kNumMutexShard = 67;
3154 
3155   // TODO: use std::hardware_destructive_interference_size once available
3156   struct alignas(64) AlignedMutex {
3157     std::mutex m;
3158   };
3159 
3160   std::array<AlignedMutex, kNumMutexShard> mutex;
3161 
3162   // allocated blocks by device pointer
3163   std::array<ska::flat_hash_map<void*, Block*>, kNumMutexShard>
3164       allocated_blocks;
3165 
3166   static size_t get_mutex_shard_id(void* ptr) {
3167     return twang_mix64((size_t)ptr) % kNumMutexShard;
3168   }
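  // The shard index is derived purely from the pointer bits, so lookups for a
  // given pointer always hit the same mutex/map pair; kNumMutexShard = 67 is
  // presumably prime so that twang_mix64(ptr) % kNumMutexShard spreads
  // pointers fairly evenly across shards.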
3169 
3170   void add_allocated_block(Block* block) {
3171     // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
3172     const auto mutex_shard_id = get_mutex_shard_id(block->ptr);
3173     std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
3174     allocated_blocks[mutex_shard_id][block->ptr] = block;
3175   }
3176 
3177   // Variables used by the memory snapshot
3178   c10::ApproximateClockToUnixTimeConverter clock_converter;
3179   bool record_history = false;
3180   RingBuffer<AnnotationEntry> annotation_buffer;
3181 
3182  public:
3183   std::vector<std::unique_ptr<DeviceCachingAllocator>> device_allocator;
3184 
3185   Block* get_allocated_block(void* ptr, bool remove = false) {
3186     const auto mutex_shard_id = get_mutex_shard_id(ptr);
3187     std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
3188     auto it = allocated_blocks[mutex_shard_id].find(ptr);
3189     if (it == allocated_blocks[mutex_shard_id].end()) {
3190       return nullptr;
3191     }
3192     Block* block = it->second;
3193     if (remove) {
3194       allocated_blocks[mutex_shard_id].erase(it);
3195     }
3196     return block;
3197   }
3198 
3199   void init(int device_count) override {
3200     const auto size = static_cast<int64_t>(device_allocator.size());
3201     if (size < device_count) {
3202       device_allocator.resize(device_count);
3203       for (const auto i : c10::irange(size, device_count)) {
3204         device_allocator[i] = std::make_unique<DeviceCachingAllocator>();
3205       }
3206     }
3207   }
3208 
3209   bool initialized() override {
3210     return !device_allocator.empty();
3211   }
3212 
3213   /** allocates a block which is safe to use from the provided stream */
3214   void malloc(
3215       void** devPtr,
3216       c10::DeviceIndex device,
3217       size_t size,
3218       cudaStream_t stream) {
3219     TORCH_INTERNAL_ASSERT(
3220         0 <= device && static_cast<size_t>(device) < device_allocator.size(),
3221         "Allocator not initialized for device ",
3222         device,
3223         ": did you call init?");
3224     Block* block = device_allocator[device]->malloc(device, size, stream);
3225     add_allocated_block(block);
3226     *devPtr = (void*)block->ptr;
3227     const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3228     if (C10_UNLIKELY(interp)) {
3229       (*interp)->trace_gpu_memory_allocation(
3230           c10::kCUDA, reinterpret_cast<uintptr_t>(*devPtr));
3231     }
3232   }
3233 
3234   void free(void* ptr) {
3235     if (!ptr) {
3236       return;
3237     }
3238     Block* block = get_allocated_block(ptr, true /* remove */);
3239     if (!block) {
3240       TORCH_CHECK(false, "invalid device pointer: ", ptr);
3241     }
3242     const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3243     if (C10_UNLIKELY(interp)) {
3244       (*interp)->trace_gpu_memory_deallocation(
3245           c10::kCUDA, reinterpret_cast<uintptr_t>(block->ptr));
3246     }
3247     device_allocator[block->device]->free(block);
3248   }
3249 
3250   void setMemoryFraction(double fraction, c10::DeviceIndex device) override {
3251     TORCH_INTERNAL_ASSERT(
3252         0 <= device && static_cast<size_t>(device) < device_allocator.size(),
3253         "Allocator not initialized for device ",
3254         device,
3255         ": did you call init?");
3256     TORCH_INTERNAL_ASSERT(
3257         0 <= fraction && fraction <= 1,
3258         "invalid fraction:",
3259         fraction,
3260         ". Please set within (0, 1).");
3261     C10_CUDA_CHECK(c10::cuda::SetDevice(device));
3262     device_allocator[device]->setMemoryFraction(fraction);
3263   }
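  // Caller-side sketch: the Python binding
  // torch.cuda.set_per_process_memory_fraction(0.5, device=0) is expected to
  // end up here, capping the device's cached allocations at roughly half of
  // its total memory (values are illustrative).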
3264 
3265   void recordHistory(
3266       bool enabled,
3267       CreateContextFn context_recorder,
3268       size_t alloc_buffer_max_entries,
3269       RecordContext when) override {
3270     record_history = enabled;
3271     annotation_buffer.setMaxEntries(alloc_buffer_max_entries);
3272     annotation_buffer.clear();
3273     for (auto& allocator : device_allocator) {
3274       allocator->recordHistory(
3275           enabled, context_recorder, alloc_buffer_max_entries, when);
3276     }
3277   }
3278 
3279   void recordAnnotation(
3280       const std::vector<std::pair<std::string, std::string>>& md) override {
3281     if (!record_history) {
3282       return;
3283     }
3284     c10::DeviceIndex device = 0;
3285     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3286     auto ae = AnnotationEntry(
3287         /*device=*/device,
3288         /*time=*/getApproximateTime());
3289     for (const auto& md_pair : md) {
3290       ae.recordUserMetadata(md_pair.first, md_pair.second);
3291     }
3292     annotation_buffer.insertEntries(ae);
3293   }
3294 
3295   bool isHistoryEnabled() override {
3296     c10::DeviceIndex device = 0;
3297     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3298     return device_allocator[device]->isHistoryEnabled();
3299   }
3300 
3301   bool checkPoolLiveAllocations(
3302       c10::DeviceIndex device,
3303       MempoolId_t mempool_id,
3304       const std::unordered_set<void*>& expected_live_allocations) override {
3305     return device_allocator[device]->checkPoolLiveAllocations(
3306         mempool_id, expected_live_allocations);
3307   }
3308 
3309   void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override {
3310     for (auto& allocator : device_allocator) {
3311       allocator->attachOutOfMemoryObserver(observer);
3312     }
3313   }
3314 
3315   void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) override {
3316     for (auto& allocator : device_allocator) {
3317       allocator->attachAllocatorTraceTracker(tracker);
3318     }
3319   }
3320 
3321   void emptyCache() override {
3322     for (auto& da : device_allocator)
3323       da->emptyCache();
3324   }
3325 
3326   void* getBaseAllocation(void* ptr, size_t* outSize) override {
3327     Block* block = get_allocated_block(ptr);
3328     if (!block) {
3329       TORCH_CHECK(false, "invalid device pointer: ", ptr);
3330     }
3331     return device_allocator[block->device]->getBaseAllocation(block, outSize);
3332   }
3333 
3334   ShareableHandle shareIpcHandle(void* ptr) override {
3335     Block* block = get_allocated_block(ptr);
3336     if (!block) {
3337       TORCH_CHECK(false, "invalid device pointer: ", ptr);
3338     }
3339     return device_allocator[block->device]->shareIpcHandle(block);
3340   }
3341 
3342   void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override {
3343     // An empty tensor's storage().data() might be a null ptr. As there are no
3344     // blocks associated with those tensors, it is fine to do nothing here.
3345     if (!ptr.get()) {
3346       return;
3347     }
3348 
3349     // If a tensor is not allocated by this instance, simply skip.
3350     // This usually happens when CUDA tensors are shared across processes;
3351     // we have implemented a reference-counting-based sharing mechanism to
3352     // guarantee tensors won't be accidentally freed by one process while
3353     // they are still being used in another.
3354     if (ptr.get_deleter() != &local_raw_delete)
3355       return;
3356 
3357     Block* block = get_allocated_block(ptr.get());
3358     // block must not be null when reaching here
3359     TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found");
3360     device_allocator[block->device]->recordStream(block, stream);
3361   }
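  // Caller-side sketch (names are illustrative): code that consumes an
  // allocation on a side stream records that stream so the block is not
  // reused before the stream's work completes:
  //
  //   c10::cuda::CUDAStream side = c10::cuda::getStreamFromPool();
  //   // ... enqueue work on `side` that reads from the tensor's DataPtr ...
  //   c10::cuda::CUDACachingAllocator::recordStream(data_ptr, side);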
3362 
3363   SnapshotInfo snapshot() override {
3364     // Set-up converter to convert timestamps from tsc to microseconds.
3365     auto tsc_to_ns = clock_converter.makeConverter();
3366     auto tsc_to_us = [=](approx_time_t t_approx) {
3367       return tsc_to_ns(t_approx) / 1000;
3368     };
3369 
3370     SnapshotInfo result;
3371 
3372     // Get AnnotationEntry list and convert the timestamps.
3373     annotation_buffer.getEntries(result.external_annotations);
3374     for (auto& ae : result.external_annotations) {
3375       ae.time_.t_ = tsc_to_us(ae.time_.approx_t_);
3376     }
3377 
3378     // Get the device_traces' TraceEntry lists.
3379     for (auto& da : device_allocator) {
3380       result.device_traces.emplace_back(da->trace(tsc_to_us));
3381       auto snap = da->snapshot();
3382       result.segments.insert(result.segments.end(), snap.begin(), snap.end());
3383     }
3384 
3385     auto& md = result.config_metadata;
3386     md.garbage_collection_threshold =
3387         CUDAAllocatorConfig::garbage_collection_threshold();
3388     md.max_split_size = CUDAAllocatorConfig::max_split_size();
3389     md.pinned_num_register_threads =
3390         CUDAAllocatorConfig::pinned_num_register_threads();
3391     md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
3392     md.release_lock_on_malloc =
3393         CUDAAllocatorConfig::release_lock_on_cudamalloc();
3394     md.pinned_use_host_register =
3395         CUDAAllocatorConfig::pinned_use_cuda_host_register();
3396     md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
3397     md.roundup_power2_divisions =
3398         CUDAAllocatorConfig::roundup_power2_divisions();
3399 
3400     return result;
3401   }
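  // Consumer sketch: this SnapshotInfo is what the Python-side snapshot
  // tooling (e.g. torch.cuda.memory._snapshot() and
  // torch.cuda.memory._dump_snapshot()) serializes for offline analysis; the
  // exact bindings live outside this file.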
3402 
3403   std::shared_ptr<AllocatorState> getCheckpointState(
3404       c10::DeviceIndex device,
3405       MempoolId_t id) override {
3406     return device_allocator[device]->getCheckpointState(id);
3407   }
3408 
3409   /**
3410    * @brief Checkpoint the private pool state identified in `as` to its prior
3411    * state
3412    *
3413    * @param device - device of the pool to manipulate
3414    * @param as - allocator state
3415    * @param stale_live_storages - storages of tensors which are currently
3416    * allocated but which will not be allocated after the checkpoint is set.
3417    * For these storages we will remove their deleter function.
3418    * @return CheckpointDelta - Freed Pointers and DataPtrs that contain deleter
3419    * functions for all allocated blocks in the new checkpoint state.
3420    */
3421   CheckpointDelta setCheckpointPoolState(
3422       c10::DeviceIndex device,
3423       std::shared_ptr<AllocatorState> as) override {
3424     std::shared_ptr<PrivatePoolState> pps =
3425         std::dynamic_pointer_cast<PrivatePoolState>(as);
3426 
3427     TORCH_CHECK(pps, "Expected PrivatePoolState");
3428 
3429     auto rr = device_allocator[device]->setCheckpointPoolState(*pps);
3430 
3431     CheckpointDelta cpd;
3432     for (void* ptr : rr.allocations_freed) {
3433       get_allocated_block(ptr, /*remove*/ true);
3434       cpd.ptrs_freed.push_back(ptr);
3435     }
3436     for (Block* block : rr.allocations_created) {
3437       add_allocated_block(block);
3438       cpd.dataptrs_allocd.emplace_back(
3439           block->ptr,
3440           block->ptr,
3441           &local_raw_delete,
3442           Device(DeviceType::CUDA, device));
3443     }
3444 
3445     return cpd;
3446   }
3447 
3448   DataPtr allocate(size_t size) override {
3449     constexpr size_t one_exa_bytes = 1152921504606846976ULL;
3450     TORCH_CHECK_WITH(
3451         OutOfMemoryError,
3452         size < one_exa_bytes,
3453         "CUDA out of memory. Tried to allocate more than 1EB memory.");
3454     c10::DeviceIndex device = 0;
3455     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3456     void* devPtr = nullptr;
3457     void (*deleteFunc)(void*) = &local_raw_delete;
3458     CUDAStream stream = cuda::getCurrentCUDAStream(device);
3459 
3460     if (forceUncachedAllocator()) {
3461       deleteFunc = &uncached_delete;
3462 
3463       // Deliberately don't use cudaMallocMaybeCapturing here, to force an error
3464       // if someone tries to use forceUncachedAllocator while capturing.
3465       C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
3466       const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3467       if (C10_UNLIKELY(interp)) {
3468         (*interp)->trace_gpu_memory_allocation(
3469             c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
3470       }
3471     } else {
3472       if (size != 0) {
3473         this->malloc(&devPtr, device, size, stream);
3474       }
3475     }
3476 
3477     if (size && TORCH_SDT_IS_ENABLED(malloc)) {
3478       TORCH_SDT_WITH_SEMAPHORE(malloc, devPtr, device, size, stream.id());
3479     }
3480 
3481     return {devPtr, devPtr, deleteFunc, Device(DeviceType::CUDA, device)};
3482   }
3483   DeleterFnPtr raw_deleter() const override {
3484     if (forceUncachedAllocator()) {
3485       return &uncached_delete;
3486     } else {
3487       return &local_raw_delete;
3488     }
3489   }
3490   void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override {
3491     device_allocator[device]->cacheInfo(largestBlock);
3492   }
3493   void assertValidDevice(c10::DeviceIndex device) {
3494     const auto device_num = device_allocator.size();
3495     TORCH_CHECK(
3496         0 <= device && device < static_cast<int64_t>(device_num),
3497         "Invalid device argument ",
3498         device,
3499         ": did you call init?");
3500   }
3501 
3502   DeviceStats getDeviceStats(c10::DeviceIndex device) override {
3503     assertValidDevice(device);
3504     return device_allocator[device]->getStats();
3505   }
3506 
3507   void resetAccumulatedStats(c10::DeviceIndex device) override {
3508     assertValidDevice(device);
3509     device_allocator[device]->resetAccumulatedStats();
3510   }
3511 
3512   void resetPeakStats(c10::DeviceIndex device) override {
3513     assertValidDevice(device);
3514     device_allocator[device]->resetPeakStats();
3515   }
3516   // CUDAGraph interactions
3517   void beginAllocateToPool(
3518       c10::DeviceIndex device,
3519       MempoolId_t mempool_id,
3520       std::function<bool(cudaStream_t)> filter) override {
3521     assertValidDevice(device);
3522     device_allocator[device]->beginAllocateToPool(
3523         std::move(mempool_id), std::move(filter));
3524   }
3525 
3526   void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id)
3527       override {
3528     assertValidDevice(device);
3529     device_allocator[device]->endAllocateToPool(mempool_id);
3530   }
3531 
3532   void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override {
3533     assertValidDevice(device);
3534     device_allocator[device]->releasePool(std::move(mempool_id));
3535   }
3536 
3537   void* raw_alloc(size_t nbytes) override {
3538     if (nbytes == 0) {
3539       return nullptr;
3540     }
3541     c10::DeviceIndex device = 0;
3542     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3543     void* r = nullptr;
3544     malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
3545     return r;
3546   }
3547 
3548   void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override {
3549     if (nbytes == 0) {
3550       return nullptr;
3551     }
3552     c10::DeviceIndex device = 0;
3553     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3554     void* r = nullptr;
3555     malloc(&r, device, nbytes, stream);
3556     return r;
3557   }
3558 
3559   void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access)
3560       override {
3561     c10::cuda::CUDAGuard device_guard(dev);
3562     cudaError_t err = cudaDeviceEnablePeerAccess(dev_to_access, 0);
3563     if (err == cudaErrorPeerAccessAlreadyEnabled) {
3564       // ignore and clear the error if access was already enabled
3565       (void)cudaGetLastError();
3566     } else {
3567       C10_CUDA_CHECK(err);
3568     }
3569     device_allocator[dev_to_access]->addPeerAccess(dev);
3570     std::lock_guard<std::mutex> lock(IpcMutex);
3571     for (auto& entry : ipcMemHandle_to_devptr) {
3572       if (entry.second.device_ == dev_to_access &&
3573           entry.second.expandable_segment_) {
3574         entry.second.expandable_segment_->addPeer(dev);
3575       }
3576     }
3577   }
3578 
3579   cudaError_t memcpyAsync(
3580       void* dst,
3581       int dstDevice,
3582       const void* src,
3583       int srcDevice,
3584       size_t count,
3585       cudaStream_t stream,
3586       bool p2p_enabled) override {
3587     if (p2p_enabled || // memcpy ok because memory is mapped in both devices
3588         srcDevice == dstDevice || // memcpy ok on a single device
3589         // memcpy ok because both dst and src must have come from cudaMalloc
3590         (!device_allocator[dstDevice]->hasAllocatedExpandableSegments() &&
3591          !device_allocator[srcDevice]->hasAllocatedExpandableSegments())) {
3592       return cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, stream);
3593     }
3594     // when p2p is not enabled, only cudaMemcpyPeerAsync correctly handles
3595     // memory not allocated via cudaMalloc
3596     return cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream);
3597   }
3598 
3599   void raw_delete(void* ptr) override {
3600     this->free(ptr);
3601   }
3602 
3603   // In CUDA IPC, the sender sends a tensor to the receiver via shareIpcHandle;
3604   // getIpcDevPtr is called by the receiving process to map the CUDA memory from
3605   // the sending process into its own address space.
3606 
3607   // When allocated with cudaMalloc, we use the cudaIpcMemHandle_t APIs.
3608   // These APIs only allow sharing a big memory block associated with a
3609   // cudaIpcMemHandle_t and it can be opened only **once** per context per
3610   // process. There can be multiple types of storage in the same IPC mem block,
3611   // so we must cache the device ptr to construct typed storage as it comes.
3612 
3613   // When using cuMemCreate, via expandable segments, we use
3614   // cuMemExportToShareableHandle to create a file descriptor that can be sent
3615   // to the other process to share the object. Then we recreate the part of
3616   // the expandable segment necessary to load the allocation.
3617 
3618   // ipcMemHandle_to_devptr caches the mapping from shareable handle to
3619   // this process' memory mapping information for that share to ensure we do not
3620   // create it twice. When the shared_ptr is no longer in use we clean up the
3621   // cache.
3622 
3623   std::mutex IpcMutex;
3624   struct MemHandleCacheEntry {
3625     MemHandleCacheEntry(
3626         c10::DeviceIndex device,
3627         std::string& handle,
3628         const DeviceCachingAllocator& allocator)
3629         : device_(device),
3630           expandable_segment_(nullptr),
3631           cuda_ipc_ptr_(nullptr) {
3632       int type = SHAREABLE_CUDA_MALLOC;
3633       std::istringstream ss(handle);
3634       if (handle.size() != CUDA_IPC_HANDLE_SIZE) {
3635         auto version = ss.get();
3636         TORCH_CHECK(
3637             version <= SHAREABLE_HANDLE_VERSION,
3638             "received sharable handle from a future version of torch that this version does not know how to handle")
3639         type = ss.get();
3640       } // otherwise this is coming from an old pytorch where it has to be a raw
3641         // SHAREABLE_CUDA_MALLOC
3642       if (type == SHAREABLE_CUDA_MALLOC) {
3643         cudaIpcMemHandle_t cuda_handle;
3644         ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE);
3645         C10_CUDA_CHECK(cudaIpcOpenMemHandle(
3646             &cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess));
3647       } else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) {
3648         expandable_segment_ =
3649             ExpandableSegment::fromShared(device, allocator.peers(), ss)
3650                 .release();
3651       } else {
3652         TORCH_INTERNAL_ASSERT(
3653             false, "unexpected or illformed shareable handle type");
3654       }
3655     }
3656     // this struct expects that clear is explicitly called to
3657     // free resources, because we only want this code running when
3658     // the shared pointer to this entry is destructed, not during
3659     // deinitialization when CUDA may already have been shut down.
3660     // This replicates the previous behavior of this map when it
3661     // stored raw cuda_ipc_ptr_ handles.
3662     void clear() {
3663       if (cuda_ipc_ptr_) {
3664         cuda::CUDAGuard device_guard(device_);
3665         C10_CUDA_CHECK(cudaIpcCloseMemHandle(cuda_ipc_ptr_));
3666         cuda_ipc_ptr_ = nullptr;
3667       }
3668       if (expandable_segment_) {
3669         delete expandable_segment_;
3670         expandable_segment_ = nullptr;
3671       }
3672     }
3673     void* ptr() {
3674       if (cuda_ipc_ptr_) {
3675         return cuda_ipc_ptr_;
3676       } else {
3677         return expandable_segment_->ptr();
3678       }
3679     }
3680     c10::DeviceIndex device_;
3681     ExpandableSegment* expandable_segment_;
3682     void* cuda_ipc_ptr_; // nullptr if expandable_segment_ is not null
3683     std::weak_ptr<void> wp_;
3684   };
3685 
3686   ska::flat_hash_map<std::string, MemHandleCacheEntry> ipcMemHandle_to_devptr;
3687   std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
3688     std::lock_guard<std::mutex> lock(IpcMutex);
3689 
3690     auto iter = ipcMemHandle_to_devptr.find(handle);
3691     if (iter != ipcMemHandle_to_devptr.end()) {
3692       auto devptr = iter->second.wp_.lock();
3693       // the weak_ptr should always be valid because we delete the entry from
3694       // the cache when the shared_ptr is destructed, so we should never see
3695       // an expired weak_ptr here.
3696       TORCH_INTERNAL_ASSERT(devptr, "entry in cache has missing shared_ptr");
3697       return devptr;
3698     }
3699     c10::DeviceIndex curr_device = 0;
3700     C10_CUDA_CHECK(c10::cuda::GetDevice(&curr_device));
3701     auto inserted = ipcMemHandle_to_devptr.insert(
3702         iter,
3703         {handle,
3704          MemHandleCacheEntry(
3705              curr_device, handle, *device_allocator[curr_device])});
3706     auto sp = std::shared_ptr<void>(
3707         inserted->second.ptr(), [handle, this](void* ptr) {
3708           std::lock_guard<std::mutex> deleter_lock(IpcMutex);
3709           auto it = ipcMemHandle_to_devptr.find(handle);
3710           TORCH_INTERNAL_ASSERT(it != ipcMemHandle_to_devptr.end());
3711           it->second.clear();
3712           ipcMemHandle_to_devptr.erase(it);
3713         });
3714     inserted->second.wp_ = sp;
3715     return sp;
3716   }
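  // End-to-end sketch of the IPC flow described above (transport between the
  // two processes is up to the caller; `alloc` stands for this allocator):
  //
  //   // producer process
  //   ShareableHandle sh = alloc->shareIpcHandle(dev_ptr);
  //   // ... send sh.handle (a string) to the consumer over any IPC channel ...
  //
  //   // consumer process
  //   std::shared_ptr<void> mapped = alloc->getIpcDevPtr(received_handle);
  //   // the mapping stays cached until the last shared_ptr copy is released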
3717 
3718   std::string name() override {
3719     return "native";
3720   }
3721   void copy_data(void* dest, const void* src, std::size_t count) const final {
3722     C10_CUDA_CHECK(
3723         cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice));
3724   }
3725 };
3726 
3727 NativeCachingAllocator allocator;
3728 
3729 void local_raw_delete(void* ptr) {
3730   if (TORCH_SDT_IS_ENABLED(free)) {
3731     TORCH_SDT_WITH_SEMAPHORE(free, ptr);
3732   }
3733 
3734   allocator.free(ptr);
3735 }
3736 
3737 } // namespace Native
3738 
3739 namespace CudaMallocAsync {
3740 // If this is put in its own header file, it gets incorrectly renamed in HIPify.
3741 CUDAAllocator* allocator();
3742 
3743 } // namespace CudaMallocAsync
3744 
3745 struct BackendStaticInitializer {
3746   // Parses env for backend at load time, duplicating some logic from
3747   // CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
3748   // runtime). Defers verbose exceptions and error checks, including CUDA
3749   // version checks, to CUDAAllocatorConfig's runtime double-check. If this
3750   // works, maybe we should move all of CUDAAllocatorConfig here?
3751   CUDAAllocator* parseEnvForBackend() {
3752     const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF");
3753     if (val != nullptr) {
3754       const std::string config(val);
3755 
3756       std::regex exp("[\\s,]+");
3757       std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
3758       std::sregex_token_iterator end;
3759       std::vector<std::string> options(it, end);
3760 
3761       for (auto option : options) {
3762         std::regex exp2("[:]+");
3763         std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
3764         std::sregex_token_iterator end2;
3765         std::vector<std::string> kv(it2, end2);
3766         if (kv.size() >= 2) {
3767           if (kv[0] == "backend") {
3768             if (kv[1] == "cudaMallocAsync")
3769               return CudaMallocAsync::allocator();
3770             if (kv[1] == "native")
3771               return &Native::allocator;
3772           }
3773         }
3774       }
3775     }
3776     return &Native::allocator;
3777   }
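  // Example (keys taken from the parser above; the command line is
  // illustrative): selecting the async backend at process start-up:
  //
  //   PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync python train.py
  //
  // Any other backend value, or no setting at all, falls back to the native
  // allocator.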
3778 
3779   BackendStaticInitializer() {
3780     auto r = parseEnvForBackend();
3781     allocator.store(r);
3782   }
3783 };
3784 
3785 std::atomic<CUDAAllocator*> allocator;
3786 BackendStaticInitializer backend_static_initializer;
3787 } // namespace cuda::CUDACachingAllocator
3788 } // namespace c10
3789 
3790 namespace c10::cuda {
3791 
3792 // uid_ is incremented when a user creates a MemPool,
3793 // for example: using graph_pool_handle() or c10::cuda::MemPool().
3794 //
3795 // uuid_ is incremented when CUDAGraph creates a MemPool
3796 // as a result of a user not providing a pool.
3797 //
3798 // MempoolId_t of {0, 0} is used to denote when no MemPool has been
3799 // passed to a function, either by user or CUDAGraphs. For example,
3800 // default value of MempoolId_t for capture_begin function is {0, 0}.
3801 // That's why uid_ and uuid_ start at 1.
3802 std::atomic<CaptureId_t> MemPool::uid_{1};
3803 std::atomic<CaptureId_t> MemPool::uuid_{1};
3804 
3805 MemPool::MemPool(
3806     CUDACachingAllocator::CUDAAllocator* allocator,
3807     bool is_user_created)
3808     : allocator_(allocator), is_user_created_(is_user_created) {
3809   if (is_user_created_) {
3810     id_ = {0, uid_++};
3811   } else {
3812     id_ = {uuid_++, 0};
3813   }
3814 }
3815 
3816 MempoolId_t MemPool::id() {
3817   return id_;
3818 }
3819 
3820 CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
3821   return allocator_;
3822 }
3823 
3824 // Note that active_mempool_ is a global variable here
3825 // and not inside the MemPoolContext class, because on Windows we
3826 // can't use __declspec(dllexport) and __declspec(thread)
3827 // together: https://stackoverflow.com/a/50967977
3828 static thread_local MemPool* active_mempool_ = nullptr;
3829 
3830 MemPoolContext::MemPoolContext(MemPool* mempool)
3831     : prev_mempool_(active_mempool_) {
3832   active_mempool_ = mempool;
3833 }
3834 
3835 MemPoolContext::~MemPoolContext() {
3836   active_mempool_ = prev_mempool_;
3837 }
3838 
3839 MemPool* MemPoolContext::getActiveMemPool() {
3840   return active_mempool_;
3841 }
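// Usage sketch: MemPoolContext is an RAII guard, so routing allocations to a
// user-created pool looks roughly like this (see the MemPoolContext check in
// Native::alloc_block above; whether an allocation is actually redirected
// also depends on the pool's allocator and owning private pool):
//
//   c10::cuda::MemPool pool(/*allocator=*/nullptr, /*is_user_created=*/true);
//   {
//     c10::cuda::MemPoolContext ctx(&pool);
//     // MemPoolContext::getActiveMemPool() returns &pool in this scope
//   } // previous active pool restored here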
3842 
3843 } // namespace c10::cuda
3844