1 #include <c10/cuda/CUDACachingAllocator.h>
2
3 #include <c10/core/impl/GPUTrace.h>
4 #include <c10/cuda/CUDAAllocatorConfig.h>
5 #include <c10/cuda/CUDAException.h>
6 #include <c10/cuda/CUDAFunctions.h>
7 #include <c10/cuda/CUDAGuard.h>
8 #include <c10/util/CallOnce.h>
9 #include <c10/util/Gauge.h>
10 #include <c10/util/ScopeExit.h>
11 #include <c10/util/UniqueVoidPtr.h>
12 #include <c10/util/flat_hash_map.h>
13 #include <c10/util/hash.h>
14 #include <c10/util/llvmMathExtras.h>
15 #include <c10/util/static_tracepoint.h>
16
17 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
18 #include <c10/cuda/driver_api.h>
19 #include <sys/syscall.h>
20 #include <sys/types.h>
21 #include <unistd.h>
22 #endif
23
24 #include <c10/util/Exception.h>
25 #include <cuda_runtime_api.h>
26 #include <algorithm>
27 #include <cstddef>
28 #include <cstdint>
29 #include <deque>
30 #include <memory>
31 #include <mutex>
32 #include <regex>
33 #include <set>
34 #include <utility>
35 #include <vector>
36
37 TORCH_SDT_DEFINE_SEMAPHORE(malloc)
38 TORCH_SDT_DEFINE_SEMAPHORE(free)
39
40 namespace c10 {
41
42 C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
43
44 namespace cuda::CUDACachingAllocator {
45
46 using namespace c10::CachingDeviceAllocator;
47
48 // Included here as this is externally used in CUDAAllocatorConfig
49 const size_t kLargeBuffer =
50 20971520; // "large" allocations may be packed in 20 MiB blocks
51
52 namespace Native {
53
54 //
55 // Yet another caching allocator for CUDA device allocations.
56 //
57 // - Allocations are associated with a stream. Once freed, blocks can be
58 // re-allocated on the same stream, but not on any other stream.
59 // - The allocator attempts to find the smallest cached block that will fit the
60 // requested size. If the block is larger than the requested size, it may be
61 // split. If no block is found, the allocator will delegate to cudaMalloc.
62 // - If the cudaMalloc fails, the allocator will attempt to free one cached
63 // block of sufficient size that is not split and retry the allocation.
64 // If this also fails, the allocator will attempt to free all cached blocks
65 // that are not split and retry the allocation.
66 // - Large (>1MB) and small allocations are stored in separate pools.
67 // Small requests are packed into 2MB buffers. Large requests will use the
68 // smallest available free block or allocate a new block using cudaMalloc.
69 // - To reduce fragmentation, requests between 1MB and 10MB will allocate and
70 // split a 20MB block, if no free block of sufficient size is available.
71 // - To further reduce fragmentation, blocks >= max_split_size are not allowed
72 // to be split. These oversize cached blocks will still satisfy requests
73 // within 1MB of the oversize cached block size.
74 //
75 // With this allocator, allocations and frees should logically be considered
76 // "usages" of the memory segment associated with streams, just like kernel
77 // launches. The programmer must insert the proper synchronization if memory
78 // segments are used from multiple streams.
79 //
80 // The library provides a recordStream() function to help insert the correct
81 // synchronization when allocations are used on multiple streams. This will
82 // ensure that the block is not reused before each recorded stream completes
83 // work.
84 //
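// A minimal usage sketch of the multi-stream contract described above
// (illustrative only; it assumes the public c10::cuda::CUDACachingAllocator
// interface from the header, and hypothetical `nbytes`/`side_stream` values):
//
//   // the allocation is associated with the current stream
//   c10::DataPtr ptr = c10::cuda::CUDACachingAllocator::get()->allocate(nbytes);
//   // another stream will consume the buffer: record it so the allocator
//   // waits for that stream's pending work before reusing the block
//   c10::cuda::CUDACachingAllocator::recordStream(ptr, side_stream);
//   // ... enqueue kernels that read/write ptr.get() on side_stream ...
//   ptr.clear(); // "free": the block is recycled only after side_stream
//                // finishes the work recorded above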
85
86 /**
87 * Note [Interaction with CUDA graph capture]
88 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
89 * Graph capture performs a dry run of a region of execution, freezing all CUDA
90 * work (and virtual addresses used during that work) into a "graph." The graph
91 * may be "replayed" like a single giant kernel, with greatly reduced CPU
92 * overhead as well as modestly improved GPU performance.
93 *
94 * Because capture bakes in memory addresses, the memory used during capture
95 * must be available for the graph to use during replay. DeviceCachingAllocator
96 * assigns and frees memory eagerly and dynamically, so if we're not careful
97 * about managing graphs' memory, at replay time those memory addresses could be
98 * used by other tensors.
99 *
100 * To guarantee a graph's baked in addresses are safe to reuse in replay,
101 * DeviceAllocator satisfies allocations from a graph-private memory pool during
102 * capture, and doesn't begin cudaFreeing those addresses until the graph is
103 * destroyed.
104 *
105 * Within the private pool, allocations are freed and reassigned as usual during
106 * capture. Memory regions will be used in a consistent order during replay. So
107 * a private pool doesn't use memory more wastefully than the default pools
108 * during capture, but it does reserve its high-water mark of used memory away
109 * from the default pools as long as the capture(s) it served survive
 * (regardless of whether those captures are idle or replaying).
111 *
112 * CUDAGraph's requests for private pools are mediated by
113 * DeviceAllocator::notifyCaptureBegin,
114 * notifyCaptureAboutToEnd,
115 * notifyCaptureEnded,
116 * notifyCaptureDestroy.
117 */
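// A rough sketch of that lifecycle, using the names from the note above
// (the actual signatures and call sites live in CUDAGraph and the allocator
// interface, not in this note):
//
//   notifyCaptureBegin      -> allocations made on the capturing stream are
//                              routed to the graph's private pool
//   notifyCaptureAboutToEnd /
//   notifyCaptureEnded      -> routing stops; the private pool's blocks stay
//                              cached so replays can reuse the same addresses
//   notifyCaptureDestroy    -> once no live graph references the pool, its
//                              blocks become eligible to be cudaFree'd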
118
119 constexpr size_t kMinBlockSize =
120 512; // all sizes are rounded to at least 512 bytes
121 constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
122 constexpr size_t kSmallBuffer =
123 2097152; // "small" allocations are packed in 2 MiB blocks
124 constexpr size_t kMinLargeAlloc =
125 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
126 constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
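// Worked examples of how these constants combine with the policy described
// at the top of this namespace (illustrative; the real logic lives in
// round_size() and get_allocation_size() further down in this file):
//   - a 1200 B request rounds up to 1536 B (3 * kMinBlockSize); it is
//     <= kSmallSize, so it is served from the small pool, whose segments are
//     kSmallBuffer (2 MiB) each.
//   - a 5 MiB request is > kSmallSize but < kMinLargeAlloc, so a fresh
//     cudaMalloc for it reserves a kLargeBuffer (20 MiB) segment and splits it.
//   - a 25 MiB request is >= kMinLargeAlloc, so it is rounded up to a multiple
//     of kRoundLarge (2 MiB) and a 26 MiB segment is allocated for it.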
127
128 char SHAREABLE_HANDLE_VERSION = 1;
129 enum ShareableHandleType : char {
130 SHAREABLE_CUDA_MALLOC = 'c',
131 SHAREABLE_CUDA_EXPANDABLE_SEGMENT = 'e'
132 };
133
134 namespace {
135
136 using stream_set = ska::flat_hash_set<cuda::CUDAStream>;
137
void decrease_stat_array(
139 StatArray& stat_array,
140 size_t amount,
141 const StatTypes& stat_types) {
142 for_each_selected_stat_type(
143 stat_types, [&stat_array, amount](size_t stat_type) {
144 stat_array[stat_type].decrease(amount);
145 });
146 }
147
148 struct Block;
149 struct PrivatePool;
150 typedef bool (*Comparison)(const Block*, const Block*);
151 static bool BlockComparatorSize(const Block* a, const Block* b);
152 static bool BlockComparatorAddress(const Block* a, const Block* b);
153
154 struct BlockPool {
BlockPool(bool small, PrivatePool* private_pool = nullptr)
156 : blocks(BlockComparatorSize),
157 unmapped(BlockComparatorAddress),
158 is_small(small),
159 owner_PrivatePool(private_pool) {}
160
161 // Do not insert a Block to blocks directly; use insert_into_blocks(),
162 // instead.
163 std::set<Block*, Comparison> blocks;
164 std::set<Block*, Comparison> unmapped;
165 // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
166 const bool is_small;
167 PrivatePool* owner_PrivatePool;
168 int64_t get_free_blocks_call_count{0};
169
170 // Add a Block into blocks set with updating gc counter.
171 std::pair<std::set<Block*, Comparison>::iterator, bool> insert_into_blocks(
172 Block* block);
173 };
174
175 struct ExpandableSegment;
176
177 struct Block {
178 c10::DeviceIndex device; // gpu
179 cudaStream_t stream; // allocation stream
180 stream_set stream_uses; // streams on which the block was used
181 size_t size; // block size in bytes
182 size_t requested_size; // memory originally requested
183 BlockPool* pool{nullptr}; // owning memory pool
184 void* ptr{nullptr}; // memory address
185 bool allocated{false}; // in-use flag
186 bool mapped{true}; // is the virtual address range this Block references
187 // backed by physical pages. Always true when
// expandable_segment_ is null. When false,
// this Block will be aligned to the segment size
190 // of its expandable_segment_.
191 Block* prev{nullptr}; // prev block if split from a larger allocation
192 Block* next{nullptr}; // next block if split from a larger allocation
193 int event_count{0}; // number of outstanding CUDA events
194 int64_t gc_count_base{0}; // get_free_blocks_call_count when Block is inserted
195 std::shared_ptr<GatheredContext> context_when_allocated;
196 // only set for the first block in the segment (when prev == null)
197 // this records the frame information when cudaMalloc was called
198 // whereas context_when_allocated records the last time we handed this
199 // memory out from our cache.
200 std::shared_ptr<GatheredContext> context_when_segment_allocated;
201
202 ExpandableSegment* expandable_segment_{nullptr};
203
Block(
205 c10::DeviceIndex device,
206 cudaStream_t stream,
207 size_t size,
208 BlockPool* pool,
209 void* ptr)
210 : device(device),
211 stream(stream),
212 stream_uses(),
213 size(size),
214 requested_size(0),
215 pool(pool),
216 ptr(ptr) {}
217
218 // constructor for search key
Block(c10::DeviceIndex device, cudaStream_t stream, size_t size)
220 : device(device),
221 stream(stream),
222 stream_uses(),
223 size(size),
224 requested_size(0) {}
225
size_t gc_count() {
227 TORCH_INTERNAL_ASSERT(pool);
return static_cast<size_t>(pool->get_free_blocks_call_count - gc_count_base);
229 }
230
bool is_split() const {
232 return (prev != nullptr) || (next != nullptr);
233 }
void splice(Block* before, Block* after) {
235 if (before) {
236 TORCH_INTERNAL_ASSERT(before->next == after);
237 before->next = this;
238 }
239 prev = before;
240 if (after) {
241 TORCH_INTERNAL_ASSERT(after->prev == before);
242 after->prev = this;
243 }
244 next = after;
245 }
246 };
247
248 std::pair<std::set<Block*, Comparison>::iterator, bool> BlockPool::
insert_into_blocks(Block* block) {
250 block->gc_count_base = get_free_blocks_call_count;
251 return blocks.insert(block);
252 }
253
254 struct SegmentRange {
255 char* ptr;
256 size_t size;
SegmentRange(void* p, size_t s) : ptr(static_cast<char*>(p)), size(s) {}
258 };
259
260 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
261
262 /*
263 Note [Expandable Segments]
264
265 Rationale
266
267 For large (>2MB) allocations, the allocator calls cudaMalloc to get allocations
268 that are the same size as what the user requests. In the future, parts of these
269 allocations can be reused for other requests if they are free. This works well
270 when the program makes many requests of exactly the same size or of sizes that
are even multiples of that size. Many deep learning models follow this behavior.
272 However, one common exception is when the batch size changes slightly from one
273 iteration to the next, e.g. in batched inference. When the program runs
274 initially with batch size N, it will make allocations appropriate for that size.
275 If in the future, it runs at size N - 1, the existing allocations will still be
276 big enough. However, if it runs at size N + 1, then it will have to make new
277 allocations that are slightly larger. Not all the tensors are the same size.
278 Some might be (N + 1)*A and others (N + 1)*A*B where A and B are some non-batch
279 dimensions in the model. Because the allocator reuses existing allocations when
280 they are big enough, some number of (N + 1)*A allocations will actually fit in
281 the already existing N*B*A segments, though not perfectly. As the model runs it
282 will partially fill up all of these segments leaving unusable free slices of
283 memory at the end of these segments. The allocator at some point will need to
284 cudaMalloc a new (N + 1)*A*B segment. If there is not enough memory, there is
285 now no way to recover the slices of memory that are free at the end of existing
286 segments. With models 50+ layers deep, this pattern might repeat 50+ times
287 creating many slivers.
288
289 Approach
290
Expandable segments allow the allocator to create a segment initially and then
292 expand its size later when more memory is needed. Instead of making one segment
293 per allocation, it tries to make one segment (per stream) that grows as
294 necessary. Now when the N + 1 case runs, the allocations will tile nicely into
295 the one large segment until it fills up. Then more memory is requested and
296 appended to the end of the segment. This process does not create as many slivers
297 of unusable memory, so it is more likely to succeed at finding this memory.
298
299 Implementation
300
301 The expandable_segments:True option is used to enable/disable this behavior. We
302 use cuda's low-level memory APIs, which are similar to mmap, to extend the
303 memory segments. These APIs separate the allocation of physical memory
304 (cuMemCreate) from the allocation of virtual address space (cuMemAddressReserve)
and the association between them (cuMemMap/cuMemSetAccess).
306
307 When we allocate a new segment, we allocate enough address space to map
308 basically the entire physical memory of the GPU (there is 256TiB of address
309 space), but we only map enough physical memory to handle the current amount of
310 memory needed by the program. As more is requested, we add more physical memory
311 to the segment. This can work at the granularity of GPU pages which are 2MiB
312 currently.
313
314 If we end up out of memory, we can unmap all the memory in our segment
315 corresponding to empty physical pages, and return it to CUDA for use at another
316 address in the segment or in a segment for a different stream.
317
318 A current limitation of CUDA's API is that physical memory
319 (CUmemGenericAllocationHandle) cannot be split up after it is mapped even if the
320 handle holds multiple GPU pages. The cost to map/unmap memory is proportional to
321 the number of physical memory chunks that were allocated (mapping 10 separately
322 allocated 2MiB pages takes 10x time compared to mapping one 20MiB physical
323 allocation of 10 pages). Changing memory mappings also appears to involve at
324 least some synchronous actions with the GPU and so should be considered an
325 expensive operation. To limit overhead, we use 2MiB pages for our small pool and
20MiB pages for our large pool. Initially, allocation using expandable_segments
327 will be slower than cudaMalloc, though still in the milliseconds range for
328 mapping the entire memory.
329
330 When mapping new memory to expand the segment, we look for the lowest address at
331 which we can fit a new allocation by adding new pages. Normally this will be at
the end of the block. But if we have previously unmapped blocks earlier in the
segment during an OOM, we first try to fill in those gaps to keep the
segment as a single block. By allocating at the lowest address we encourage
the split-up parts of the block to merge into a single block again, reducing
336 fragmentation potential.
337
338 Allocation of blocks in the segment uses the same best-fit heuristics of the
339 rest of the allocator.
340
341 Expandable blocks can be enabled/disabled throughout the run of a program. When
342 disabled, the allocator will not put new allocations in an expandable block.
343
344 Limitations
345
346 * Slightly slower initial memory allocation speed.
347 * IPC of cuda tensors (e.g. for multiprocess dataloaders) is not supported.
348 However, it is possible to temporarily disable (expandable_segments:False) the
behavior for tensors that need to be used cross-process.
* CUDA runtime APIs related to sharing memory across devices
351 (cudaDeviceEnablePeerAccess) do not work for memory allocated with cuMemMap.
Instead these mappings have to be done manually. The allocator now has an
353 `enablePeerAccess` method to do this.
354 */
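// The driver-API choreography used by ExpandableSegment below, in miniature
// (a sketch only; error handling, peer access for multiple devices, and
// handle bookkeeping are omitted):
//
//   CUdeviceptr base = 0;                       // virtual address space only
//   cuMemAddressReserve(&base, va_size, 0, 0, 0);
//   CUmemGenericAllocationHandle h = 0;         // physical backing for one chunk
//   CUmemAllocationProp prop = {};              // pinned, device-local memory
//   cuMemCreate(&h, segment_size, &prop, 0);
//   cuMemMap(base + offset, segment_size, 0, h, 0);
//   CUmemAccessDesc desc = {};                  // read/write for this device
//   cuMemSetAccess(base + offset, segment_size, &desc, 1);
//   // ... later, to shrink or tear down the segment:
//   cuMemUnmap(base + offset, segment_size);
//   cuMemRelease(h);
//   cuMemAddressFree(base, va_size);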
355
356 struct ExpandableSegment {
ExpandableSegment(
358 c10::DeviceIndex device,
359 std::optional<cudaStream_t> stream,
360 size_t address_space_size,
361 size_t segment_size,
362 std::vector<c10::DeviceIndex> peers)
363 : device_(device),
364 stream_(stream),
365 // 2MB for small pool, 20MB for large pool
366 segment_size_(segment_size),
367 max_handles_(numSegments(address_space_size)),
368 peers_(std::move(peers)) {
369 cudaDeviceProp prop{};
370 C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_));
// we allocate enough address space for 1 1/8 times the total memory on the GPU.
372 // This allows for some cases where we have to unmap pages earlier in the
373 // segment to put them at the end.
374 max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
375 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressReserve_(
376 &ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
377 }
378 // begin must be aligned to segment_size_.
379 // returns the actual range mapped, which may be
380 // greater than requested if size is not aligned to segment_size_.
// a returned size of 0 indicates OOM
SegmentRange map(SegmentRange range) {
383 auto begin = segmentLeft(range.ptr);
384 auto end = segmentRight(range.ptr + range.size);
385 TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr);
386 if (begin == end) {
387 return rangeFromHandles(begin, end);
388 }
389 while (end > handles_.size()) {
390 handles_.emplace_back(std::nullopt);
391 }
392 for (auto i : c10::irange(begin, end)) {
393 TORCH_INTERNAL_ASSERT(!handles_.at(i));
394 CUmemGenericAllocationHandle handle = 0;
395 CUmemAllocationProp prop = {};
396 prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
397 #ifndef FBCODE_CAFFE2
398 prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
399 #endif
400 prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
401 // NOLINTNEXTLINE(bugprone-signed-char-misuse)
402 prop.location.id = static_cast<int>(device_);
403 auto status =
404 DriverAPI::get()->cuMemCreate_(&handle, segment_size_, &prop, 0);
405 if (status == CUDA_ERROR_OUT_OF_MEMORY) {
406 for (auto j : c10::irange(begin, i)) {
407 auto h = handles_.at(j).value();
408 handles_.at(j) = std::nullopt;
409 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
410 }
411 trimHandles();
412 return rangeFromHandles(begin, begin);
413 }
414 C10_CUDA_DRIVER_CHECK(status);
415 handles_.at(i) = Handle{handle, std::nullopt};
416 }
417 mapAndSetAccess(begin, end);
418 return rangeFromHandles(begin, end);
419 }
420
421 // unmaps all the completely empty segment_size_ segments between
// [begin, begin + size), returns the offset where the unmapped range begins,
// and the actual size unmapped (a multiple of segment_size_)
SegmentRange unmap(SegmentRange range) {
425 auto begin = segmentRight(range.ptr);
426 auto end = segmentLeft(range.ptr + range.size);
427 if (begin >= end) {
428 return SegmentRange{range.ptr, 0};
429 }
430 unmapHandles(begin, end);
431 return rangeFromHandles(begin, end);
432 }
433
434 // Setup IPC sharing for range.
435 // Returns the (larger) range that was actually shared.
436 // Serializes data to std::ostream that can be passed to the
// other process, and then restored as an expandable segment
438 // via ExpandableSegment::fromShared(istream);
SegmentRange share(SegmentRange range, std::ostream& buf) {
440 auto begin = segmentLeft(range.ptr);
441 auto end = segmentRight(range.ptr + range.size);
442 ShareHeader header{getpid(), segment_size_, end - begin};
443 buf.write((const char*)&header, sizeof(ShareHeader));
444 for (auto i : c10::irange(begin, end)) {
445 auto& handle = handles_.at(i).value();
446 if (!handle.fd) {
447 int fd = 0;
448 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
449 &fd, handle.handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
450 handle.fd = fd;
451 }
452 int fd = *handle.fd;
453 buf.write((const char*)&fd, sizeof(int));
454 }
455 return rangeFromHandles(begin, end);
456 }
457
static std::unique_ptr<ExpandableSegment> fromShared(
459 c10::DeviceIndex device,
460 std::vector<c10::DeviceIndex> peers,
461 std::istream& buf) {
462 ShareHeader header{};
463 buf.read((char*)&header, sizeof(ShareHeader));
464 auto segment = std::make_unique<ExpandableSegment>(
465 device,
466 std::nullopt,
467 header.num_handles * header.segment_size,
468 header.segment_size,
469 std::move(peers));
// older build setups (e.g. multiwheels) do not define these syscall numbers
// (added to Linux in 2020), but the kernel on the system might still support them.
472 #ifndef SYS_pidfd_open
473 #define SYS_pidfd_open 434
474 #endif
475 #ifndef SYS_pidfd_getfd
476 #define SYS_pidfd_getfd 438
477 #endif
478 auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
479 TORCH_CHECK(
480 pidfd != -1 || errno != ENOSYS,
481 "The kernel on this machine does not support the pidfd_open syscall needed to use IPC for CUDA tensors when expandable_segments:True is set. "
482 "Consider using expandable_segments:False via torch.cuda.memory._set_allocator_settings('expandable_segments:False') for this allocation.");
483 TORCH_CHECK(pidfd != -1, "pidfd_open:", std::strerror(errno));
484 for (auto i : c10::irange(header.num_handles)) {
485 (void)i;
486 int fd = 0;
487 buf.read((char*)&fd, sizeof(int));
488 auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
489 if (myfd == -1) {
490 auto err = errno;
491 close((int)pidfd);
492 for (auto& h : segment->handles_) {
493 C10_CUDA_DRIVER_CHECK(
494 DriverAPI::get()->cuMemRelease_(h.value().handle));
495 h = std::nullopt;
496 }
497 TORCH_CHECK(
498 err != ENOSYS,
499 "The kernel on this machine does not support the pidfd_getfd syscall needed to use IPC for CUDA tensors when expandable_segments:True is set. "
500 "Consider using expandable_segments:False via torch.cuda.memory._set_allocator_settings('expandable_segments:False') for this allocation.");
501 TORCH_CHECK(false, "pidfd_getfd: ", std::strerror(err));
502 }
503 CUmemGenericAllocationHandle handle = 0;
504 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
505 &handle,
506 // NOLINTNEXTLINE(performance-no-int-to-ptr)
507 (void*)(uintptr_t)myfd,
508 CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
509 close((int)myfd);
510 segment->handles_.emplace_back(Handle{handle, std::nullopt});
511 }
512 close((int)pidfd);
513 segment->mapAndSetAccess(0, header.num_handles);
514 return segment;
515 }
516
char* ptr() const {
518 // NOLINTNEXTLINE(performance-no-int-to-ptr)
519 return reinterpret_cast<char*>(ptr_);
520 }
521
size_t size() const {
523 return max_handles_ * segment_size_;
524 }
525
void addPeer(c10::DeviceIndex device) {
527 peers_.push_back(device);
528 forEachAllocatedRange(
529 [&](size_t begin, size_t end) { setAccess(device, begin, end); });
530 }
531
~ExpandableSegment() {
533 forEachAllocatedRange(
534 [&](size_t begin, size_t end) { unmapHandles(begin, end); });
535 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressFree_(
536 ptr_, segment_size_ * max_handles_));
537 }
538
539 private:
void setAccess(c10::DeviceIndex device, size_t begin, size_t end) {
541 CUmemAccessDesc desc;
542 desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
543 // NOLINTNEXTLINE(bugprone-signed-char-misuse)
544 desc.location.id = static_cast<int>(device);
545 desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
546 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemSetAccess_(
547 ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1));
548 }
549
void mapAndSetAccess(size_t begin, size_t end) {
551 for (auto i : c10::irange(begin, end)) {
552 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemMap_(
553 ptr_ + i * segment_size_,
554 segment_size_,
555 0,
556 handles_.at(i).value().handle,
557 0ULL));
558 }
559 setAccess(device_, begin, end);
560 for (auto p : peers_) {
561 setAccess(p, begin, end);
562 }
563 }
564
void unmapHandles(size_t begin, size_t end) {
// note: unlike cudaFree, cuMemUnmap and cuMemRelease do
567 // not appear to synchronize in all cases, so we have to wait for the
568 // stream to finish before this memory is truly free.
569
570 // cannot call c10::cuda::stream_synchronize because
571 // it might grab the GIL which can lead to a deadlock
572 // Locking order must be GIL -> Allocator Lock
573 if (stream_) {
574 C10_CUDA_CHECK(cudaStreamSynchronize(*stream_));
575 } else {
576 cuda::CUDAGuard device_guard(device_);
577 C10_CUDA_CHECK(cudaDeviceSynchronize());
578 }
579 for (auto i : c10::irange(begin, end)) {
580 Handle h = handles_.at(i).value();
581 handles_.at(i) = std::nullopt;
582 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemUnmap_(
583 ptr_ + segment_size_ * i, segment_size_));
584 if (h.fd) {
585 close(*h.fd);
586 }
587 C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
588 }
589 trimHandles();
590 }
void trimHandles() {
592 while (!handles_.empty() && !handles_.back()) {
593 handles_.pop_back();
594 }
595 }
void forEachAllocatedRange(const std::function<void(size_t, size_t)>& fn) {
597 size_t start = 0;
598 for (auto i : c10::irange(handles_.size())) {
599 if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) {
600 start = i;
601 }
602 if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) {
603 fn(start, i + 1);
604 }
605 }
606 }
size_t numSegments(size_t size) {
608 return (size + segment_size_ - 1) / segment_size_;
609 }
size_t segmentLeft(char* p) {
611 auto size = p - ptr();
612 return size / segment_size_;
613 }
size_t segmentRight(char* p) {
615 auto size = p - ptr();
616 return numSegments(size);
617 }
SegmentRange rangeFromHandles(size_t begin, size_t end) {
619 return SegmentRange(
620 ptr() + segment_size_ * begin, segment_size_ * (end - begin));
621 }
622 c10::DeviceIndex device_;
623 std::optional<cudaStream_t> stream_;
624 CUdeviceptr ptr_{};
625 size_t segment_size_;
626 size_t max_handles_;
627 struct Handle {
628 CUmemGenericAllocationHandle handle;
629 std::optional<int> fd;
630 };
631 struct ShareHeader {
632 pid_t pid;
633 size_t segment_size;
634 size_t num_handles;
635 };
636 std::vector<std::optional<Handle>> handles_;
637 // devices on which this memory should be mapped in addition
638 // to the device where the physical memory lives (device_).
639 std::vector<c10::DeviceIndex> peers_;
640 };
641 #else
642 struct ExpandableSegment {
ExpandableSegment(
644 c10::DeviceIndex device,
645 std::optional<cudaStream_t> stream,
646 size_t address_space_size,
647 size_t segment_size,
648 std::vector<c10::DeviceIndex> peers) {
649 TORCH_INTERNAL_ASSERT(false, "expandable segment not supported");
650 }
SegmentRange map(SegmentRange range) {
652 return SegmentRange(nullptr, 0);
653 }
SegmentRange unmap(SegmentRange range) {
655 return SegmentRange(nullptr, 0);
656 }
SegmentRange share(SegmentRange range, std::ostream& ss) {
658 return SegmentRange(nullptr, 0);
659 }
static std::unique_ptr<ExpandableSegment> fromShared(
661 c10::DeviceIndex device,
662 std::vector<c10::DeviceIndex> peers,
663 std::istream& buf) {
664 return {};
665 }
char* ptr() const {
667 return nullptr;
668 }
size_t size() const {
670 return 0;
671 }
void addPeer(c10::DeviceIndex device) {}
673 };
674 #endif
675
676 // BlockState, BlockPoolState, and PrivatePoolState contain the information
677 // needed to reconstruct a private pool to a previous state. See note
678 // [Checkpointing PrivatePoolState]
679 struct BlockState {
680 c10::DeviceIndex device = 0;
681 cudaStream_t stream = nullptr;
682 stream_set stream_uses = {};
683 size_t size = 0;
684 void* ptr = nullptr;
685 bool allocated = false;
686 int64_t gc_count_base = 0;
// maintain the invariant that event_count == 0;
688 // history will be left alone in checkpoint
689
690 BlockState(Block* block);
691 };
692
693 struct SegmentState {
694 std::vector<BlockState> blocks;
695 bool is_small = false;
696
697 SegmentState(Block* head);
698 };
699
700 struct PrivatePoolState : AllocatorState {
// omitting use_count and cudaMalloc_count as they remain the same
702 MempoolId_t owner_id = {0, 0};
703
704 std::vector<SegmentState> segments;
705
706 PrivatePoolState(
707 MempoolId_t pool_id,
708 const std::vector<Block*>& private_pool_head_blocks);
709 };
710
711 struct RestoreResult {
712 std::vector<void*> allocations_freed;
713 std::vector<Block*> allocations_created;
714 };
715
static bool BlockComparatorSize(const Block* a, const Block* b) {
717 if (a->stream != b->stream) {
718 return (uintptr_t)a->stream < (uintptr_t)b->stream;
719 }
720 if (a->size != b->size) {
721 return a->size < b->size;
722 }
723 return (uintptr_t)a->ptr < (uintptr_t)b->ptr;
724 }
static bool BlockComparatorAddress(const Block* a, const Block* b) {
726 if (a->stream != b->stream) {
727 return (uintptr_t)a->stream < (uintptr_t)b->stream;
728 }
729 return (uintptr_t)a->ptr < (uintptr_t)b->ptr;
730 }
731
732 struct AllocParams {
AllocParams(
734 c10::DeviceIndex device,
735 size_t size,
736 cudaStream_t stream,
737 BlockPool* pool,
738 size_t alloc_size,
739 DeviceStats& stats)
740 : search_key(device, stream, size), pool(pool), alloc_size(alloc_size) {}
741
c10::DeviceIndex device() const {
743 return search_key.device;
744 }
cudaStream_t stream() const {
746 return search_key.stream;
747 }
size_t size() const {
749 return search_key.size;
750 }
751
752 Block search_key;
753 BlockPool* pool;
754 size_t alloc_size;
755 Block* block{nullptr};
756 StatTypes stat_types = {false};
757 cudaError_t err{cudaSuccess};
758 };
759
760 // Note: cudaEventCreate when concurrently invoked from multiple threads can be
761 // very expensive (at least on certain device/driver combinations). Thus, we a)
762 // serialize event creation at a per-device level, and b) pool the events to
763 // avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
764 // significant improvements in multithreaded workloads with high allocation
765 // rates.
766 class EventPool {
767 public:
768 using Event = std::unique_ptr<cudaEvent_t, std::function<void(cudaEvent_t*)>>;
769 // TODO: Explicit device count
EventPool() : pools_(at::cuda::device_count()) {}
771
Event get(c10::DeviceIndex device) {
773 TORCH_INTERNAL_ASSERT(0 <= device);
774 TORCH_INTERNAL_ASSERT(device < static_cast<int>(pools_.size()));
775 auto& pool = pools_[device];
776 auto destructor = [&pool](cudaEvent_t* event) {
777 std::lock_guard<std::mutex> g(pool.mutex_);
778 pool.event_pool_.push_back(std::unique_ptr<cudaEvent_t>(event));
779 };
780
781 // Try to acquire an event from the per-device pool.
782 {
783 std::lock_guard<std::mutex> g(pool.mutex_);
784 if (!pool.event_pool_.empty()) {
785 auto* event = pool.event_pool_.back().release();
786 pool.event_pool_.pop_back();
787 return Event(event, destructor);
788 }
789 }
790 // otherwise, allocate a new event that will be returned to the pool on
791 // destruction.
792 auto new_ptr = std::make_unique<cudaEvent_t>();
793 C10_CUDA_CHECK(
794 cudaEventCreateWithFlags(new_ptr.get(), cudaEventDisableTiming));
795
796 return Event(new_ptr.release(), destructor);
797 }
798
void empty_cache() {
800 for (auto& pool : pools_) {
801 std::lock_guard<std::mutex> g(pool.mutex_);
802 pool.event_pool_.clear();
803 }
804 }
805
806 private:
807 struct PerDevicePool {
808 alignas(64) std::mutex mutex_;
809 std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
810 };
811 std::vector<PerDevicePool> pools_;
812 };
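// Illustrative EventPool usage (a sketch; the allocator's real call sites are
// elsewhere in this file, e.g. when recording end-of-life events for blocks
// used on multiple streams):
//
//   EventPool event_pool;
//   {
//     EventPool::Event ev = event_pool.get(/*device=*/0); // reuses a pooled
//                                                         // event if available
//     C10_CUDA_CHECK(cudaEventRecord(*ev, stream));
//     // ... later, poll with cudaEventQuery(*ev) ...
//   } // the Event's custom deleter returns the cudaEvent_t to the per-device
//     // pool instead of destroying it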
813
814 // CUDA graphs helper
815 struct PrivatePool {
PrivatePool()
817 : large_blocks(/*small=*/false, this),
818 small_blocks(/*small=*/true, this) {}
819 PrivatePool(const PrivatePool&) = delete;
820 PrivatePool(PrivatePool&&) = delete;
821 PrivatePool& operator=(const PrivatePool&) = delete;
822 // Number of live graphs using this pool
823 int use_count{1};
824 // Number of unfreed cudaMallocs made for this pool. When use_count and
825 // cudaMalloc_count drop to zero, we can delete this PrivatePool from
826 // graph_pools.
827 int cudaMalloc_count{0};
828 // Instead of maintaining private BlockPools here, I could stuff all blocks
829 // (private or no) into the top-level large_blocks and small_blocks, and
830 // distinguish private blocks by adding a "pool id" check above the stream
// check in BlockComparator. BlockComparator is performance-critical though,
832 // I'd rather not add more logic to it.
833 BlockPool large_blocks;
834 BlockPool small_blocks;
835 };
836
BlockState::BlockState(Block* block)
838 : device(block->device),
839 stream(block->stream),
840 stream_uses(block->stream_uses),
841 size(block->size),
842 ptr(block->ptr),
843 allocated(block->allocated),
844 gc_count_base(block->gc_count_base) {
845 TORCH_CHECK(
846 block->event_count == 0,
847 "Events should have synchronized when checkpointing block");
848 };
849
SegmentState::SegmentState(Block* head) {
851 TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr);
852 is_small = head->pool->is_small;
853
854 for (Block* curr = head; curr != nullptr; curr = curr->next) {
855 blocks.emplace_back(curr);
856 }
857 }
858
PrivatePoolState::PrivatePoolState(
860 MempoolId_t pool_id,
861 const std::vector<Block*>& private_pool_head_blocks)
862 : owner_id(std::move(pool_id)) {
863 for (Block* head : private_pool_head_blocks) {
864 segments.emplace_back(head);
865 }
866 }
867
868 struct MempoolIdHash {
std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
870 return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
871 }
872 };
873
cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
875 if (at::cuda::currentStreamCaptureStatusMayInitCtx() ==
876 at::cuda::CaptureStatus::None) {
877 return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
878 } else {
879 // It's ok to capture cudaMallocs, as long as we never cudaFree those
880 // addresses before replay.
881 // Capturing cudaMalloc behaves nicely: it gives the graph new VA,
882 // but is ignored (won't leakily allocate new memory) in replays.
883 at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed};
884 return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
885 }
886 }
887
888 template <class T>
889 class RingBuffer {
890 public:
RingBuffer() {
892 // alloc_trace is a pointer because we need to intentionally
// leak this on deallocation: it can hold references to Python
894 // state which will already be destroyed when we are in exit handlers
895 // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
896 alloc_trace = new std::vector<T>();
897 }
898
void setMaxEntries(size_t size) {
900 std::lock_guard<std::mutex> lk(alloc_trace_lock);
901 alloc_trace_max_entries_ = std::max(size_t(1), size);
902 }
903
void insertEntries(const T& entry) {
905 std::lock_guard<std::mutex> lk(alloc_trace_lock);
906 if (alloc_trace->size() < alloc_trace_max_entries_) {
907 alloc_trace->emplace_back(entry);
908 } else {
909 (*alloc_trace)[alloc_trace_next++] = entry;
910 if (alloc_trace_next == alloc_trace_max_entries_) {
911 alloc_trace_next = 0;
912 }
913 }
914 }
915
void getEntries(std::vector<T>& result) {
917 std::lock_guard<std::mutex> lk(alloc_trace_lock);
918 result.reserve(alloc_trace->size());
919 result.insert(
920 result.end(),
921 alloc_trace->begin() +
922 static_cast<typename std::vector<T>::difference_type>(
923 alloc_trace_next),
924 alloc_trace->end());
925 result.insert(
926 result.end(),
927 alloc_trace->begin(),
928 alloc_trace->begin() +
929 static_cast<typename std::vector<T>::difference_type>(
930 alloc_trace_next));
931 }
932
void clear() {
934 std::lock_guard<std::mutex> lk(alloc_trace_lock);
935 alloc_trace_next = 0;
936 alloc_trace->clear();
937 }
938
939 private:
940 size_t alloc_trace_max_entries_ = 1;
941
// Both alloc_trace and alloc_trace_next need to be used
943 // under alloc_trace_lock.
944 std::mutex alloc_trace_lock;
945 size_t alloc_trace_next = 0;
946 std::vector<T>*
947 alloc_trace; // pointer because we need to intentionally leak this on
// deallocation: it can hold references to Python state which
949 // will already be destroyed when we are in exit handlers
950 };
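// RingBuffer behavior in miniature (illustrative): once the maximum number of
// entries is reached, insertEntries() overwrites the oldest slot in place, and
// getEntries() restores chronological order by concatenating [next, end) with
// [begin, next):
//
//   RingBuffer<int> rb;
//   rb.setMaxEntries(3);
//   for (int i = 1; i <= 5; ++i) {
//     rb.insertEntries(i); // internal storage ends up as {4, 5, 3}
//   }
//   std::vector<int> out;
//   rb.getEntries(out); // out == {3, 4, 5}, oldest to newest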
951
952 } // anonymous namespace
953 } // namespace Native
954
static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
956 #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
957 void* nvml_handle = DriverAPI::get_nvml_handle();
958 if (!nvml_handle) {
959 return "";
960 }
961 static c10::once_flag nvml_init;
962 c10::call_once(nvml_init, [] {
963 TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_());
964 });
965
966 cudaDeviceProp prop{};
967 C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
968
969 // NOLINTNEXTLINE(*-c-arrays)
970 char pci_id[80];
971 snprintf(
972 pci_id,
973 sizeof(pci_id),
974 NVML_DEVICE_PCI_BUS_ID_FMT,
975 prop.pciDomainID,
976 prop.pciBusID,
977 prop.pciDeviceID);
978
979 nvmlDevice_t nvml_device = nullptr;
980 TORCH_INTERNAL_ASSERT(
981 NVML_SUCCESS ==
982 DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_(
983 pci_id, &nvml_device));
984
985 std::vector<nvmlProcessInfo_v1_t> procs(8);
986 unsigned int size = procs.size();
987 nvmlReturn_t r{};
988 while ((r = DriverAPI::get()->nvmlDeviceGetComputeRunningProcesses_(
989 nvml_device, &size, procs.data())) ==
990 NVML_ERROR_INSUFFICIENT_SIZE) {
991 procs.resize(size);
992 }
993 unsigned int self_pid = getpid();
994 std::stringstream ss;
995 TORCH_INTERNAL_ASSERT(NVML_SUCCESS == r);
996 ss << "";
997 for (auto i : c10::irange(size)) {
998 auto& proc = procs[i];
999 if (self_pid == proc.pid) {
1000 ss << "Including non-PyTorch memory, this process";
1001 } else {
1002 ss << "Process " << proc.pid;
1003 }
1004 ss << " has " << format_size(proc.usedGpuMemory) << " memory in use. ";
1005 }
1006 return ss.str();
1007 #else
1008 return "";
1009 #endif
1010 }
1011
1012 namespace Native {
1013
1014 class DeviceCachingAllocator {
1015 private:
1016 // lock around all operations
1017 mutable std::recursive_mutex mutex;
1018
1019 // device statistics
1020 DeviceStats stats;
1021
1022 // unallocated cached blocks larger than 1 MB
1023 BlockPool large_blocks;
1024
1025 // unallocated cached blocks 1 MB or smaller
1026 BlockPool small_blocks;
1027
1028 // allocated or in use by a stream. Holds all active allocations,
1029 // whether they came from graph_pools or one of the BlockPools above.
1030 ska::flat_hash_set<Block*> active_blocks;
1031
1032 // captures_underway tracks if we are diverting some
1033 // allocations to a specific pool.
1034 // Most of the time it's empty, in which case malloc can avoid calling
1035 // cudaStreamGetCaptureInfo in the hot path.
1036 std::vector<std::pair<MempoolId_t, std::function<bool(cudaStream_t)>>>
1037 captures_underway;
1038
1039 // See free() for this thing's purpose
1040 std::vector<Block*> needs_events_deferred_until_no_capture;
1041 // outstanding cuda events
1042 ska::flat_hash_map<
1043 cuda::CUDAStream,
1044 std::deque<std::pair<EventPool::Event, Block*>>>
1045 cuda_events;
1046
1047 // record used memory.
1048 size_t total_allocated_memory = 0;
1049
1050 size_t allowed_memory_maximum = 0;
1051
1052 // all live expandable segments
1053 std::vector<ExpandableSegment*> expandable_segments_;
1054 std::vector<c10::DeviceIndex> devices_with_peer_access_;
1055
1056 bool set_fraction = false;
1057
1058 bool record_history = false;
1059
1060 std::atomic<CreateContextFn> context_recorder_;
1061 RecordContext record_context_ = RecordContext::NEVER;
1062
1063 // Ring buffer for memory snapshot TraceEntry's
1064 RingBuffer<TraceEntry> alloc_buffer;
1065
1066 // Members specific to CUDA graphs
1067
1068 // Private pools for CUDA graphs
1069 ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
1070 graph_pools;
1071 // Pools no longer referenced by any graph. Their BlockPools are eligible for
1072 // free_blocks. Can't be a vector or deque because we might erase entries in
1073 // any order. Could be an std::list, but we don't care much, access and
1074 // insert/erase are rare.
1075 ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
1076 graph_pools_freeable;
1077
1078 // XXX - maybe we should generalize and have multiple events
1079 std::vector<OutOfMemoryObserver> oom_observers_;
1080
1081 std::vector<AllocatorTraceTracker> trace_trackers_;
1082
1083 // mapping from block to a stream_set, containing streams on which the block
1084 // was used while cudagraph capturing
1085 std::unordered_map<Block*, stream_set> block_to_cudagraph_stream_uses;
1086
1087 public:
1088 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
DeviceCachingAllocator()
1090 : large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
1091 stats.max_split_size =
1092 static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
1093 context_recorder_.store(nullptr);
1094 }
1095
void recordHistory(
1097 bool enabled,
1098 CreateContextFn context_recorder,
1099 size_t alloc_buffer_max_entries,
1100 RecordContext when) {
1101 std::unique_lock<std::recursive_mutex> lock(mutex);
1102 TORCH_CHECK(when == RecordContext::NEVER || context_recorder);
1103 record_history = enabled;
1104 context_recorder_.store(record_history ? context_recorder : nullptr);
1105 alloc_buffer.setMaxEntries(alloc_buffer_max_entries);
1106 record_context_ = enabled ? when : RecordContext::NEVER;
1107 if (!enabled) {
1108 alloc_buffer.clear();
1109 }
1110 }
1111
bool isHistoryEnabled() {
1113 return record_history;
1114 }
1115
bool checkPoolLiveAllocations(
1117 MempoolId_t mempool_id,
1118 const std::unordered_set<void*>& expected_live_allocations) {
1119 std::unique_lock<std::recursive_mutex> lock(mutex);
1120
1121 PrivatePool* pool = nullptr;
1122 auto pool_it = graph_pools.find(mempool_id);
1123 TORCH_CHECK(pool_it != graph_pools.end(), "Could not find pool of id");
1124 pool = pool_it->second.get();
1125
1126 TORCH_INTERNAL_ASSERT(pool != nullptr);
1127
1128 size_t allocated_pool_blocks = 0;
1129
1130 for (Block* b : active_blocks) {
1131 TORCH_INTERNAL_ASSERT(b != nullptr);
1132 TORCH_INTERNAL_ASSERT(b->pool != nullptr);
1133 if (b->allocated && b->pool->owner_PrivatePool == pool) {
1134 if (!expected_live_allocations.count(b->ptr)) {
1135 return false;
1136 }
1137
1138 allocated_pool_blocks += 1;
1139 }
1140 }
1141
1142 return allocated_pool_blocks == expected_live_allocations.size();
1143 }
1144
void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
1146 oom_observers_.emplace_back(std::move(observer));
1147 }
1148
void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
1150 std::unique_lock<std::recursive_mutex> lock(mutex);
1151 trace_trackers_.emplace_back(std::move(tracker));
1152 }
1153
1154 // Must be called outside of `mutex` or deadlocks are possible with Python
std::shared_ptr<GatheredContext> maybeGatherContext(RecordContext level) {
1156 if (record_context_ < level) {
1157 return nullptr;
1158 }
1159 return context_recorder_.load()();
1160 }
1161
1162 // All public methods (except the above) acquire the allocator mutex.
1163 // Thus, do not call a public method from another public method.
1164
Block* malloc(
1166 c10::DeviceIndex device,
1167 size_t orig_size,
1168 cudaStream_t stream) {
1169 // done outside the lock because we don't know what locks the recorder needs
1170 // to have...
1171 auto context = maybeGatherContext(RecordContext::STATE);
1172
1173 std::unique_lock<std::recursive_mutex> lock(mutex);
1174
1175 if (C10_LIKELY(captures_underway.empty())) {
1176 // Processes end-of-life events for outstanding allocations used on
1177 // multiple streams (checks if their GPU-side uses are complete and
1178 // recycles their memory if so)
1179 //
1180 // Q. Why skip process_events if a capture might be underway?
1181 // A. process_events involves cudaEventQueries, illegal during CUDA graph
1182 // capture.
1183 // Dumb simple solution: defer reclaiming these allocations until after
1184 // capture. Cross-stream memory use is uncommon, so the deferral's
1185 // effect on memory use during capture should be small.
1186 process_events(context);
1187 }
1188 size_t size = round_size(orig_size);
1189 auto& pool = get_pool(size, stream);
1190 const size_t alloc_size = get_allocation_size(size);
1191 AllocParams params(device, size, stream, &pool, alloc_size, stats);
1192 params.stat_types = get_stat_types_for_pool(pool);
1193
1194 // First, try to get a block from the existing pool.
1195 bool block_found =
1196 // Search pool
1197 get_free_block(params)
1198 // Trigger callbacks and retry search
1199 || (trigger_free_memory_callbacks(params) && get_free_block(params));
1200
1201 // Can't reuse an existing block; try to get a new one.
1202 if (!block_found) {
1203 // Do garbage collection if the flag is set.
1204 if (C10_UNLIKELY(
1205 set_fraction &&
1206 CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
1207 garbage_collect_cached_blocks(context);
1208 }
1209 // Attempt allocate
1210 // WARNING: alloc_block may release the allocator lock when calling
1211 // cudaMalloc. So far this function has not modified allocator state, but
1212 // keep in mind that any observed allocator state may change across calls
1213 // to alloc_block since it may release the lock.
1214 block_found = alloc_block(params, false, context, lock)
1215 // Free enough available cached blocks to satisfy alloc and retry
1216 // alloc.
1217 || (release_available_cached_blocks(params, context) &&
1218 alloc_block(params, false, context, lock))
1219 // Free all non-split cached blocks and retry alloc.
1220 || (C10_LIKELY(captures_underway.empty()) &&
1221 release_cached_blocks(context) &&
1222 alloc_block(params, true, context, lock));
1223 }
1224
1225 if (!block_found) {
1226 // For any error code other than cudaErrorMemoryAllocation,
1227 // alloc_block should have thrown an exception already.
1228 TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation);
1229
1230 size_t device_free = 0;
1231 size_t device_total = 0;
1232 C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
1233 std::string allowed_info;
1234
1235 if (set_fraction) {
1236 allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
1237 }
1238
1239 std::string proc_info = reportProcessMemoryInfo(device);
1240
1241 record_trace(
1242 TraceEntry::OOM,
1243 device_free,
1244 params.size(),
1245 params.stream(),
1246 params.device(),
1247 std::move(context));
1248 stats.num_ooms += 1;
1249
1250 c10::reportOutOfMemoryToProfiler(
1251 static_cast<int64_t>(size),
1252 stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1253 .current,
1254 stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1255 .current,
1256 c10::Device(c10::DeviceType::CUDA, device));
1257
1258 auto allocated_bytes =
1259 stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
1260 .current;
1261 auto reserved_bytes =
1262 stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
1263 .current;
1264 auto observers_local = oom_observers_;
1265
1266 size_t allocated_in_private_pools = 0;
1267 auto get_size_block = [](const BlockPool& pool) {
1268 size_t res = 0;
1269 for (const auto& block : pool.blocks) {
1270 res += block->size;
1271 }
1272 return res;
1273 };
1274 for (const auto& p : graph_pools) {
1275 allocated_in_private_pools += get_size_block(p.second->large_blocks);
1276 allocated_in_private_pools += get_size_block(p.second->small_blocks);
1277 }
1278
1279 std::string private_pool_msg;
1280
1281 if (allocated_in_private_pools > 0) {
1282 private_pool_msg = "with " + format_size(allocated_in_private_pools) +
1283 " allocated in private pools (e.g., CUDA Graphs), ";
1284 }
1285
// Make sure we do not hold the device lock before calling our
// observers, which might need to hold the GIL.
// It is safe to release at this point because we will no longer
// be reading any allocator state.
1290
1291 lock.unlock();
1292
1293 for (const auto& obs : observers_local) {
1294 obs(device,
1295 alloc_size,
1296 set_fraction ? allowed_memory_maximum : device_total,
1297 device_free);
1298 }
1299
1300 // "total capacity": total global memory on GPU
// "allowed": memory we are allowed to use, as set by the memory fraction.
1302 // "already allocated": memory allocated by the program using the
1303 // caching allocator
1304 // "free": free memory as reported by the CUDA API
1305 // "cached": memory held by the allocator but not used by the program
1306 //
1307 // The "allocated" amount does not include memory allocated outside
1308 // of the caching allocator, such as memory allocated by other programs
1309 // or memory held by the driver.
1310 //
1311 // The sum of "allocated" + "free" + "cached" may be less than the
1312 // total capacity due to memory held by the driver and usage by other
1313 // programs.
1314 //
1315 // Note that at this point free_cached_blocks has already returned all
1316 // possible "cached" memory to the driver. The only remaining "cached"
1317 // memory is split from a larger block that is partially in-use.
1318 TORCH_CHECK_WITH(
1319 OutOfMemoryError,
1320 false,
1321 "CUDA out of memory. Tried to allocate ",
1322 format_size(alloc_size),
1323 ". GPU ",
1324 static_cast<int>(device),
1325 " has a total capacity of ",
1326 format_size(device_total),
1327 " of which ",
1328 format_size(device_free),
1329 " is free. ",
1330 proc_info,
1331 allowed_info,
1332 "Of the allocated memory ",
1333 format_size(allocated_bytes + allocated_in_private_pools),
1334 " is allocated by PyTorch, ",
1335 private_pool_msg,
1336 "and ",
1337 format_size(
1338 reserved_bytes - allocated_bytes - allocated_in_private_pools),
1339 " is reserved by PyTorch but unallocated.",
1340 " If reserved but unallocated memory is large try setting",
1341 " PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid"
1342 " fragmentation. See documentation for Memory Management "
1343 " (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)");
1344 }
1345
1346 bool split_remainder = should_split(params.block, params.size());
1347 return alloc_found_block(
1348 params, orig_size, std::move(context), split_remainder);
1349 }
1350
Block* alloc_found_block(
1352 const AllocParams& params,
1353 size_t orig_size,
1354 std::shared_ptr<GatheredContext> context,
1355 bool split_remainder) {
1356 auto size = params.size();
1357 auto device = params.device();
1358 auto pool = params.pool;
1359 auto stream = params.stream();
1360
1361 TORCH_INTERNAL_ASSERT(
1362 params.err == cudaSuccess && params.block != nullptr &&
1363 params.block->ptr != nullptr);
1364 Block* block = params.block;
1365 Block* remaining = nullptr;
1366
1367 const bool already_split = block->is_split();
1368 if (split_remainder) {
1369 remaining = block;
1370
1371 block = new Block(device, stream, size, pool, block->ptr);
1372 block->expandable_segment_ = remaining->expandable_segment_;
1373 block->prev = remaining->prev;
1374 if (block->prev) {
1375 block->prev->next = block;
1376 }
1377 block->next = remaining;
1378
1379 remaining->prev = block;
1380 remaining->ptr = static_cast<char*>(remaining->ptr) + size;
1381 remaining->size -= size;
1382 // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
1383 bool inserted = pool->insert_into_blocks(remaining).second;
1384 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
1385
1386 if (already_split && !block->expandable_segment_) {
1387 // An already-split inactive block is being shrunk by size bytes.
1388 decrease_stat_array(
1389 stats.inactive_split_bytes, block->size, params.stat_types);
1390 } else if (!block->expandable_segment_) {
1391 // A new split inactive block is being created from a previously unsplit
1392 // block, size remaining->size bytes.
1393 for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1394 stats.inactive_split_bytes[stat_type].increase(remaining->size);
1395 stats.inactive_split[stat_type].increase(1);
1396 });
1397 }
1398
1399 } else if (already_split && !block->expandable_segment_) {
1400 // An already-split block is becoming active
1401 for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1402 stats.inactive_split_bytes[stat_type].decrease(block->size);
1403 stats.inactive_split[stat_type].decrease(1);
1404 });
1405 }
1406
1407 block->allocated = true;
1408 block->requested_size = orig_size;
1409
1410 block->context_when_allocated = std::move(context);
1411 record_trace(
1412 TraceEntry::ALLOC,
1413 int64_t(block->ptr),
1414 orig_size,
1415 block->stream,
1416 block->device,
1417 block->context_when_allocated);
1418
1419 // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
1420 bool inserted = active_blocks.insert(block).second;
1421 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
1422
1423 for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1424 stats.allocation[stat_type].increase(1);
1425 stats.allocated_bytes[stat_type].increase(block->size);
1426 stats.active[stat_type].increase(1);
1427 stats.active_bytes[stat_type].increase(block->size);
1428 stats.requested_bytes[stat_type].increase(block->requested_size);
1429 });
1430 if (block->size >= CUDAAllocatorConfig::max_split_size())
1431 stats.oversize_allocations.increase(1);
1432
1433 auto allocated_bytes_gauge =
1434 STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
1435 allocated_bytes_gauge.record(
1436 stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1437 .current);
1438
1439 c10::reportMemoryUsageToProfiler(
1440 block->ptr,
1441 static_cast<int64_t>(block->size),
1442 stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1443 stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1444 c10::Device(c10::DeviceType::CUDA, device));
1445
1446 return block;
1447 }
1448
1449 void free(Block* block) {
1450 std::shared_ptr<GatheredContext> context =
1451 maybeGatherContext(RecordContext::ALL);
1452 std::lock_guard<std::recursive_mutex> lock(mutex);
1453
1454 block->allocated = false;
1455
1456 // The following logic might modify the underlying Block and change its size,
1457 // so we store the original pointer and size ahead of time for reporting.
1458 auto orig_block_ptr = block->ptr;
1459 auto orig_block_size = block->size;
1460
1461 StatTypes stat_types = get_stat_types_for_pool(*block->pool);
1462 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
1463 stats.allocation[stat_type].decrease(1);
1464 stats.allocated_bytes[stat_type].decrease(block->size);
1465 });
1466 auto allocated_bytes_gauge =
1467 STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
1468 allocated_bytes_gauge.record(
1469 stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
1470 .current);
1471
1472 record_trace(
1473 TraceEntry::FREE_REQUESTED,
1474 int64_t(block->ptr),
1475 block->requested_size,
1476 block->stream,
1477 block->device,
1478 context ? context : block->context_when_allocated);
1479
1480 if (block->size >= CUDAAllocatorConfig::max_split_size())
1481 stats.oversize_allocations.decrease(1);
1482
1483 if (!block->stream_uses.empty()) {
1484 if (C10_UNLIKELY(!captures_underway.empty())) {
1485 // It's forbidden to cudaEventQuery an event recorded during CUDA graph
1486 // capture. We conservatively defer recording end-of-life events until
1487 // the next call to process_events() (which won't happen until no
1488 // captures are underway)
1489 needs_events_deferred_until_no_capture.push_back(block);
1490 } else {
1491 insert_events(block);
1492 }
1493 } else {
1494 free_block(block, context);
1495 }
1496
1497 c10::reportMemoryUsageToProfiler(
1498 orig_block_ptr,
1499 -static_cast<int64_t>(orig_block_size),
1500 stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1501 stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1502 c10::Device(c10::DeviceType::CUDA, block->device));
1503 }
1504
1505 void* getBaseAllocation(Block* block, size_t* outSize) {
1506 std::lock_guard<std::recursive_mutex> lock(mutex);
1507 TORCH_CHECK(
1508 !block->expandable_segment_,
1509 "Tensors allocated with expandable_segments:True cannot be shared between processes. Consider using expandable_segments:False in data loading workers via torch.cuda.memory._set_allocator_settings('expandable_segments:False')");
1510 while (block->prev) {
1511 block = block->prev;
1512 }
1513 void* basePtr = block->ptr;
1514 if (outSize) {
1515 size_t size = 0;
1516 while (block) {
1517 size += block->size;
1518 block = block->next;
1519 }
1520 *outSize = size;
1521 }
1522 return basePtr;
1523 }
1524
1525 ShareableHandle shareIpcHandle(Block* block) {
1526 std::lock_guard<std::recursive_mutex> lock(mutex);
1527 std::ostringstream ss;
1528 ss.put(SHAREABLE_HANDLE_VERSION);
1529 ptrdiff_t offset = 0;
1530 if (!block->expandable_segment_) {
1531 ss.put(SHAREABLE_CUDA_MALLOC);
1532 Block* base_block = block;
1533 while (base_block->prev) {
1534 base_block = base_block->prev;
1535 }
1536 offset = (char*)block->ptr - (char*)base_block->ptr;
1537 cudaIpcMemHandle_t handle;
1538 C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr));
1539 ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE);
1540 } else {
1541 ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
1542 auto full_range = block->expandable_segment_->share(
1543 SegmentRange(block->ptr, block->size), ss);
1544 offset = (char*)block->ptr - (char*)full_range.ptr;
1545 }
1546 return ShareableHandle{offset, ss.str()};
1547 }
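// Descriptive note on the stream written above (not a separate wire-format
// spec): one version byte (SHAREABLE_HANDLE_VERSION), one type byte
// (SHAREABLE_CUDA_MALLOC or SHAREABLE_CUDA_EXPANDABLE_SEGMENT), followed by
// either the raw cudaIpcMemHandle_t (CUDA_IPC_HANDLE_SIZE bytes) or the
// expandable segment's share() payload. The returned offset is the byte
// distance of block->ptr from the start of the shared base allocation.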
1548
1549 void recordStream(Block* block, cuda::CUDAStream stream) {
1550 std::lock_guard<std::recursive_mutex> lock(mutex);
1551 if (stream.stream() == block->stream) {
1552 // ignore uses on the allocation stream, since those don't require any
1553 // special synchronization
1554 return;
1555 }
1556 block->stream_uses.insert(stream);
1557 if (C10_UNLIKELY(!captures_underway.empty())) {
1558 block_to_cudagraph_stream_uses[block].insert(stream);
1559 }
1560 }
1561
1562 /** set memory fraction to limit maximum allocated memory **/
1563 void setMemoryFraction(double fraction) {
1564 size_t device_free = 0;
1565 size_t device_total = 0;
1566 C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
1567 allowed_memory_maximum =
1568 static_cast<size_t>(fraction * static_cast<double>(device_total));
1569 set_fraction = true;
1570 }
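// A minimal usage sketch (the numbers are illustrative): on a device where
// cudaMemGetInfo reports a 16 GiB device_total, setMemoryFraction(0.5) caps
// allowed_memory_maximum at ~8 GiB, after which alloc_block() below refuses
// further growth with cudaErrorMemoryAllocation instead of calling cudaMalloc.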
1571
1572 /** returns cached blocks to the system allocator **/
1573 void emptyCache() {
1574 auto context = maybeGatherContext(RecordContext::ALL);
1575 std::lock_guard<std::recursive_mutex> lock(mutex);
1576 release_cached_blocks(context);
1577 }
1578
1579 /** Retrieves size of largest unused block held by the memory cache **/
1580 void cacheInfo(size_t* largest) {
1581 std::lock_guard<std::recursive_mutex> lock(mutex);
1582 if (*largest ==
1583 0) { // make an initial guess if a zero *largest is passed in
1584 size_t tmp_bytes = 0;
1585 C10_CUDA_CHECK(cudaMemGetInfo(
1586 largest, // Use free memory as an optimistic initial guess of *largest
1587 &tmp_bytes));
1588 }
1589 cache_info_aux(large_blocks, largest);
1590 cache_info_aux(small_blocks, largest);
1591 for (const auto& gp : graph_pools) {
1592 cache_info_aux(gp.second->large_blocks, largest);
1593 cache_info_aux(gp.second->small_blocks, largest);
1594 }
1595 }
1596
1597 /** Returns a copy of the memory allocator stats **/
1598 DeviceStats getStats() {
1599 std::lock_guard<std::recursive_mutex> lock(mutex);
1600 return stats;
1601 }
1602
1603 /** Resets the historical accumulation stats for the device **/
1604 void resetAccumulatedStats() {
1605 std::lock_guard<std::recursive_mutex> lock(mutex);
1606
1607 for (const auto statType :
1608 c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
1609 stats.allocation[statType].reset_accumulated();
1610 stats.segment[statType].reset_accumulated();
1611 stats.active[statType].reset_accumulated();
1612 stats.inactive_split[statType].reset_accumulated();
1613 stats.allocated_bytes[statType].reset_accumulated();
1614 stats.reserved_bytes[statType].reset_accumulated();
1615 stats.active_bytes[statType].reset_accumulated();
1616 stats.inactive_split_bytes[statType].reset_accumulated();
1617 stats.requested_bytes[statType].reset_accumulated();
1618 }
1619
1620 stats.num_alloc_retries = 0;
1621 stats.num_ooms = 0;
1622 stats.num_sync_all_streams = 0;
1623 stats.num_device_alloc = 0;
1624 stats.num_device_free = 0;
1625 stats.oversize_allocations.reset_accumulated();
1626 stats.oversize_segments.reset_accumulated();
1627 }
1628
1629 /** Resets the historical peak stats for the device **/
1630 void resetPeakStats() {
1631 std::lock_guard<std::recursive_mutex> lock(mutex);
1632
1633 for (const auto statType :
1634 c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
1635 stats.allocation[statType].reset_peak();
1636 stats.segment[statType].reset_peak();
1637 stats.active[statType].reset_peak();
1638 stats.inactive_split[statType].reset_peak();
1639 stats.allocated_bytes[statType].reset_peak();
1640 stats.reserved_bytes[statType].reset_peak();
1641 stats.active_bytes[statType].reset_peak();
1642 stats.inactive_split_bytes[statType].reset_peak();
1643 stats.requested_bytes[statType].reset_peak();
1644 }
1645 stats.oversize_allocations.reset_peak();
1646 stats.oversize_segments.reset_peak();
1647 }
1648
1649 /* Checkpoint the state of a private pool that is needed to later return the
1650 * pool to its current state */
1651 std::unique_ptr<PrivatePoolState> getCheckpointState(MempoolId_t id) {
1652 auto context = maybeGatherContext(RecordContext::ALL);
1653 std::lock_guard<std::recursive_mutex> lock(mutex);
1654 insert_events_deferred_until_no_capture(context);
1655
1656 auto pool = graph_pools.find(id);
1657 if (pool != graph_pools.end()) {
1658 auto private_pool_head_blocks =
1659 get_private_pool_head_blocks(pool->second.get());
1660 return std::make_unique<PrivatePoolState>(id, private_pool_head_blocks);
1661 } else if (graph_pools_freeable.count(id)) {
1662 TORCH_CHECK(false, "Not expected to checkpoint freeable graph");
1663 } else {
1664 TORCH_CHECK(false, "Could not find pool of id");
1665 }
1666 }
1667
1668 void freeBlocksAllocatedToPool(PrivatePool* private_pool, RestoreResult& rr) {
1669 auto pool_blocks = get_private_pool_head_blocks(private_pool);
1670
1671 std::vector<Block*> head_blocks;
1672 for (Block* block : pool_blocks) {
1673 if (block->prev == nullptr) {
1674 head_blocks.push_back(block);
1675 }
1676 }
1677
1678 for (Block* block : head_blocks) {
1679 Block* curr = block;
1680
1681 while (curr) {
1682 // When we free a block, its pointer should never change
1683 // only its adjacent blocks, so free, then look at pointer
1684 if (curr->allocated) {
1685 TORCH_CHECK(
1686 curr->event_count == 0,
1687 "Events should have synchronized when setting checkpointed block");
1688 rr.allocations_freed.push_back(curr->ptr);
1689 free(curr);
1690 TORCH_CHECK(!curr->allocated)
1691 }
1692 curr = curr->next;
1693 }
1694 }
1695
1696 for (Block* b : get_private_pool_head_blocks(private_pool)) {
1697 Block* curr = b;
1698 while (curr) {
1699 TORCH_CHECK(!curr->allocated);
1700 curr = curr->next;
1701 }
1702 }
1703 }
1704
1705 // checkpoint the state of an allocation that may have been
1706 // split into multiple blocks
1707 void setSegmentStateToCheckpoint(
1708 Block* block,
1709 SegmentState& segment,
1710 const std::shared_ptr<GatheredContext>& context,
1711 RestoreResult& rr) {
1712 Block* curr_block = block;
1713 Block* last_block = block;
1714
1715 TORCH_INTERNAL_ASSERT(block->pool);
1716 BlockPool& pool = *block->pool;
1717 const auto segment_len = segment.blocks.size();
1718
1719 // allocate all blocks in the segment
1720 for (size_t i = 0; i < segment_len; ++i) {
1721 // The last block in every expandable segment is the remaining amount of
1722 // available unmapped virtual address space. We shouldn't change it but
1723 // instead check it is correctly formed then skip over allocating it.
1724 if (i == segment_len - 1 && curr_block->expandable_segment_) {
1725 TORCH_CHECK(curr_block->next == nullptr);
1726 TORCH_CHECK(!curr_block->mapped);
1727 TORCH_CHECK(curr_block->allocated == false);
1728 continue;
1729 }
1730
1731 auto& block_state = segment.blocks.at(i);
1732 AllocParams params(
1733 block_state.device,
1734 block_state.size,
1735 block_state.stream,
1736 &pool,
1737 block_state.size,
1738 stats);
1739 pool.blocks.erase(curr_block);
1740 params.block = curr_block;
1741 params.stat_types = get_stat_types_for_pool(pool);
1742
1743 // Whether a block gets split normally depends on `max_split_size`, which may
1744 // have changed since the checkpoint was taken, so we recreate the splitting
1745 // behavior recorded in the checkpoint instead: keep splitting as long as
1746 // there is space left over, because curr_block is already the size it has in
1747 // the checkpointed segment, so any leftover space belongs to the next
1748 // block.
1749 bool split = curr_block->size > block_state.size;
1750
1751 // curr_block will become next pointer if it is split, so reassign with
1752 // the returned value
1753 curr_block = alloc_found_block(params, block_state.size, context, split);
1754
1755 TORCH_CHECK(curr_block->ptr == block_state.ptr);
1756 TORCH_CHECK(curr_block->size == block_state.size);
1757
1758 last_block = curr_block;
1759 curr_block = curr_block->next;
1760
1761 TORCH_CHECK((curr_block != nullptr) == ((i + 1) < (segment_len)));
1762 }
1763
1764 while (last_block->prev) {
1765 last_block = last_block->prev;
1766 }
1767
1768 // free blocks that are not allocated in the checkpoint
1769 curr_block = last_block;
1770
1771 for (size_t i = 0; i < segment_len; ++i, curr_block = curr_block->next) {
1772 if (i == segment_len - 1 && curr_block->expandable_segment_) {
1773 TORCH_CHECK(curr_block->next == nullptr);
1774 TORCH_CHECK(!curr_block->mapped);
1775 TORCH_CHECK(curr_block->allocated == false);
1776 continue;
1777 }
1778
1779 auto& block_state = segment.blocks.at(i);
1780 TORCH_INTERNAL_ASSERT(curr_block != nullptr);
1781
1782 if (block_state.allocated) {
1783 rr.allocations_created.push_back(curr_block);
1784 continue;
1785 }
1786
1787 free(curr_block);
1788
1789 TORCH_CHECK(curr_block->ptr == block_state.ptr);
1790 TORCH_CHECK(curr_block->allocated == block_state.allocated);
1791 TORCH_CHECK(curr_block->size == block_state.size);
1792 }
1793 }
1794
1795 /**
1796 * Note [Checkpointing PrivatePoolState]
1797 *
1798 * Refer above to Note [Interaction with CUDA graph capture]. Allocations made
1799 * during graph capture are made from a separate private pool. During graph
1800 * capture allocations behave as usual. During graph replay the allocator
1801 * state does not change even as new tensors are created. The private pool
1802 * will not free its blocks to the main caching allocator until cuda graph use
1803 * is finished to prevent an allocation from eager clobbering the memory from
1804 * a live but unaccounted for tensor that was created during replay.
1805 *
1806 * `make_graphed_callables`, a series of separate callables chained in
1807 * successive cuda graphs, can share a memory pool because after a cuda graph
1808 * recording the allocations in the shared private pool exactly reflect the
1809 * tensors that are allocated.
1810 *
1811 * We would like to extend callable chaining to support a graphed callable
1812 * tree. In this scenario, we have a tree of callable chains which will be
1813 * captured with cuda graphs. In the diagram below, we have a tree with four
1814 * callables, A, B, C, and D. Suppose we have captured, and subsequently
1815 * replayed, A, B, and C. Then on a new invocation, we replay A and B, but
1816 * would now like to record D. At this point the private pool will not reflect
1817 * any of the live tensors created during graph replay. Allocations made
1818 * during a new recording with the pool could overwrite those live tensors.
1819 *
1820 * In order to record a new graph capture after replaying prior callables in
1821 * the tree, we need the allocator to reflect the state of the live tensors.
1822 * We checkpoint the state of the private pool after each recording, and then
1823 * reapply it when we are starting a new recording chain. Additionally, we
1824 * must free the allocations for any tensors that died between the end of our
1825 * previous graph replaying and our new recording. All of the allocated
1826 * segments that existed in the checkpointed state must still exist in the
1827 * pool. There may also exist new allocated blocks.
1828 * (TODO : link note [live tensors between iterations] when it exists). For
1829 * every block that is currently allocated but not allocated in the snapshot,
1830 * we will return a pointer to its block.
1831 *
1832 *
1833 *
1834 * ---------------> A ---------------> B ---------------> C
1835 * |
1836 * |
1837 * |
1838 * |
1839 * ╰ ---------------> D
1840 */
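// A minimal sketch of the call sequence the Note above describes, assuming an
// allocator instance and a MempoolId_t `id` for the shared private pool (the
// variable names here are illustrative only):
//
//   // after each capture finishes (A, then B, ...):
//   std::unique_ptr<PrivatePoolState> state_after_B = getCheckpointState(id);
//   ...
//   // before recording D on top of a replayed A -> B prefix:
//   RestoreResult rr = setCheckpointPoolState(*state_after_B);
//   // rr.allocations_freed lists pointers whose blocks were freed by the
//   // restore; rr.allocations_created lists blocks that became allocated.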
1841 RestoreResult setCheckpointPoolState(PrivatePoolState& pps) {
1842 // To reset the caching allocator state we will
1843 // - Free all the blocks currently allocated to the pool (see [live tensors
1844 // between iterations])
1845 // - Allocate all the blocks in a checkpointed segment, whether they are
1846 // live or not
1847 // - Free the blocks in a checkpointed segment which are not live
1848 // This could be optimized, but it nicely reuses existing APIs, and this
1849 // is not on the hot path.
1850
1851 // following `done outside the lock because we don't know what locks the
1852 // recorder needs to have...`
1853
1854 std::shared_ptr<GatheredContext> context =
1855 maybeGatherContext(RecordContext::STATE);
1856
1857 std::lock_guard<std::recursive_mutex> lock(mutex);
1858
1859 RestoreResult rr;
1860
1861 TORCH_CHECK(
1862 !graph_pools_freeable.count(pps.owner_id),
1863 "Not expected to checkpoint freeable graph");
1864
1865 auto pool = graph_pools.find(pps.owner_id);
1866 TORCH_CHECK(pool != graph_pools.end(), "Could not find private pool id");
1867
1868 PrivatePool* private_pool = pool->second.get();
1869
1870 freeBlocksAllocatedToPool(private_pool, rr);
1871
1872 std::unordered_map<void*, Block*> ptrs_to_blocks;
1873 // at this point, all of the blocks should be free, so they will all be in
1874 // the block set
1875 for (Block* block : private_pool->small_blocks.blocks) {
1876 ptrs_to_blocks[block->ptr] = block;
1877 }
1878 for (Block* block : private_pool->large_blocks.blocks) {
1879 ptrs_to_blocks[block->ptr] = block;
1880 }
1881
1882 for (auto& segment : pps.segments) {
1883 auto ptr = segment.blocks.at(0).ptr;
1884 TORCH_CHECK(ptrs_to_blocks.count(ptr), " could not find ", ptr)
1885 auto block = ptrs_to_blocks[ptr];
1886
1887 setSegmentStateToCheckpoint(block, segment, context, rr);
1888 }
1889 return rr;
1890 }
1891
1892 /** Dump a complete snapshot of the memory held by the allocator. Potentially
1893 * VERY expensive. **/
1894 std::vector<SegmentInfo> snapshot() {
1895 std::lock_guard<std::recursive_mutex> lock(mutex);
1896
1897 std::unordered_map<PrivatePool*, MempoolId_t> pool_to_id;
1898 pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size());
1899 for (const auto& pair : graph_pools) {
1900 pool_to_id[pair.second.get()] = pair.first;
1901 }
1902 for (const auto& pair : graph_pools_freeable) {
1903 pool_to_id[pair.second] = pair.first;
1904 }
1905
1906 size_t total_active = 0;
1907 std::vector<SegmentInfo> result;
1908 const auto all_blocks = get_all_blocks();
1909
1910 for (const Block* const head_block : all_blocks) {
1911 // For expandable segments, we report one segment for each contiguous
1912 // mapped range of memory
1913 if (head_block->prev && head_block->prev->mapped) {
1914 continue;
1915 }
1916 result.emplace_back();
1917 SegmentInfo& segment_info = result.back();
1918 segment_info.device = head_block->device;
1919 segment_info.address = reinterpret_cast<size_t>(head_block->ptr);
1920 segment_info.stream = head_block->stream;
1921 segment_info.is_large = (!head_block->pool->is_small);
1922 segment_info.is_expandable = head_block->expandable_segment_;
1923 segment_info.context_when_allocated =
1924 head_block->context_when_segment_allocated;
1925 auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool);
1926 if (mempool_id != pool_to_id.end()) {
1927 segment_info.owner_private_pool_id = mempool_id->second;
1928 }
1929
1930 const Block* block = head_block;
1931 while (block != nullptr && block->mapped) {
1932 segment_info.blocks.emplace_back();
1933 BlockInfo& block_info = segment_info.blocks.back();
1934
1935 block_info.size = block->size;
1936 block_info.requested_size = block->requested_size;
1937 block_info.allocated = block->allocated;
1938 block_info.active = block->allocated || (block->event_count > 0) ||
1939 !block->stream_uses.empty();
1940
1941 segment_info.total_size += block_info.size;
1942 if (block_info.allocated) {
1943 segment_info.allocated_size += block_info.size;
1944 }
1945 if (block_info.active) {
1946 segment_info.active_size += block_info.size;
1947 segment_info.requested_size += block_info.requested_size;
1948 }
1949 block_info.context_when_allocated = block->context_when_allocated;
1950 block = block->next;
1951 }
1952 total_active += segment_info.active_size;
1953 }
1954
1955 std::sort(
1956 result.begin(),
1957 result.end(),
1958 [](const SegmentInfo& a, const SegmentInfo& b) {
1959 return a.address < b.address;
1960 });
1961
1962 record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr);
1963 return result;
1964 }
1965
1966 std::vector<TraceEntry> trace(
1967 const std::function<time_t(approx_time_t)>& tsc_to_us) {
1968 std::lock_guard<std::recursive_mutex> lock(mutex);
1969 std::vector<TraceEntry> result;
1970 alloc_buffer.getEntries(result);
1971
1972 // Convert all the timestamps from tsc to epoch time in microseconds.
1973 for (auto& te : result) {
1974 te.time_.t_ = tsc_to_us(te.time_.approx_t_);
1975 }
1976 return result;
1977 }
1978
1979 // This function takes the size and number of divisions argument and rounds
1980 // up the size argument for the nearest power-of-2 division.
1981 // For example, if we need to round-up 1200 and number of divisions is 4,
1982 // the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
1983 // them, the values are 1024, 1280, 1536, and 1792. So the function will
1984 // return 1280 as the nearest power-of-2 division ceiling; this example is traced step by step after the function below.
1985 static size_t roundup_power2_next_division(size_t size, size_t divisions) {
1986 if (llvm::isPowerOf2_64(size)) {
1987 return size;
1988 }
1989
1990 TORCH_CHECK(divisions >= 2, "Only 2 or more divisions are supported");
1991
1992 // divide the space between these two powers of 2 into equal divisions.
1993 // If division is zero, return the power-of-2 ceiling.
1994 size_t power2_floor = llvm::PowerOf2Floor(size);
1995 size_t power2_divison =
1996 power2_floor >> (63 - llvm::countLeadingZeros(divisions));
1997 if (C10_UNLIKELY(power2_divison == 0)) {
1998 return (power2_floor << 1);
1999 }
2000 size_t round_size_floor = size & (~(power2_divison - 1));
2001 return (round_size_floor == size) ? size
2002 : round_size_floor + power2_divison;
2003 }
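// Tracing the 1200-with-4-divisions example from the comment above through
// this code: power2_floor is 1024, the division width is 1024 >> 2 = 256,
// round_size_floor is 1200 & ~255 = 1024, and since 1024 != 1200 the result
// is 1024 + 256 = 1280, matching the stated answer.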
2004
2005 static size_t round_size(size_t size) {
2006 if (size < kMinBlockSize) {
2007 return kMinBlockSize;
2008 } else {
2009 auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
2010 if (divisions > 1 && size > (kMinBlockSize * divisions)) {
2011 return roundup_power2_next_division(size, divisions);
2012 } else {
2013 return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
2014 }
2015 }
2016 }
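// Worked examples, assuming the 512-byte kMinBlockSize defined earlier in this
// file: round_size(1) == 512 and round_size(1000) == 1024 when power-of-2
// division rounding is disabled (the default). With
// roundup_power2_divisions = 4, the division path applies only when
// size > kMinBlockSize * divisions, e.g. round_size(3000) ==
// roundup_power2_next_division(3000, 4) == 3072.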
2017
2018 // See Note [Interaction with CUDA graph capture]
2019
2020 // Called by CUDAGraph::capture_begin
2021 void beginAllocateToPool(
2022 MempoolId_t mempool_id,
2023 std::function<bool(cudaStream_t)> filter) {
2024 std::lock_guard<std::recursive_mutex> lock(mutex);
2025 auto it = graph_pools.find(mempool_id);
2026 if (it == graph_pools.end()) {
2027 // mempool_id does not reference an existing pool. Make a new pool for
2028 // this capture.
2029 graph_pools.emplace(mempool_id, std::make_unique<PrivatePool>());
2030 } else {
2031 // mempool_id references an existing pool, which the current capture will
2032 // share. Check this pool is live (at least one other capture already
2033 // references it).
2034 TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
2035 it->second->use_count++;
2036 }
2037 for (auto it2 = captures_underway.begin(); it2 != captures_underway.end();
2038 ++it2) {
2039 TORCH_CHECK(
2040 it2->first != mempool_id,
2041 "beginAllocateToPool: already recording to mempool_id");
2042 }
2043 captures_underway.emplace_back(mempool_id, std::move(filter));
2044 }
2045
2046 // Called by CUDAGraph::capture_end
2047 void endAllocateToPool(MempoolId_t mempool_id) {
2048 std::lock_guard<std::recursive_mutex> lock(mutex);
2049 for (auto it = captures_underway.begin(); it != captures_underway.end();
2050 ++it) {
2051 if (it->first == mempool_id) {
2052 captures_underway.erase(it);
2053 return;
2054 }
2055 }
2056 TORCH_CHECK(
2057 false, "endAllocateToPool: not currently recording to mempool_id");
2058 }
2059
2060 // Called by CUDAGraph::reset
2061 void releasePool(MempoolId_t mempool_id) {
2062 std::lock_guard<std::recursive_mutex> lock(mutex);
2063 // The instantiated cudaGraphExec_t has been destroyed. We can't blindly
2064 // delete and cudaFree the mempool its capture used, because
2065 // 1. other graph(s) might share the same pool
2066 // 2. the user might still hold references to output tensors allocated
2067 // during capture.
2068 // To handle 1 and 2, we track the number of graphs using this particular
2069 // mempool. When the count reaches 0, we tell free_cached_blocks it may now
2070 // cudaFree blocks from this graph's pool when it discovers they're unused
2071 // (unsplit).
2072 auto it = graph_pools.find(mempool_id);
2073 TORCH_INTERNAL_ASSERT(it != graph_pools.end());
2074 auto uc = --(it->second->use_count);
2075 TORCH_INTERNAL_ASSERT(uc >= 0);
2076 if (uc == 0) {
2077 // Allows free_cached_blocks to begin cudaFreeing this pool's memory,
2078 // and makes sure this pool wasn't somehow made freeable already.
2079 // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
2080 bool inserted =
2081 graph_pools_freeable.insert({mempool_id, it->second.get()}).second;
2082 TORCH_INTERNAL_ASSERT(inserted);
2083 }
2084 }
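// A brief sketch of how these three entry points fit together (capture is
// driven from CUDAGraph, summarized here only for orientation): capture_begin
// calls beginAllocateToPool(id, filter); while that capture is underway,
// get_pool() below runs the filter and routes matching allocations into the
// private pool; capture_end calls endAllocateToPool(id); and CUDAGraph::reset
// eventually calls releasePool(id), which moves the pool into
// graph_pools_freeable once its use_count reaches zero so that
// release_cached_blocks() may later cudaFree its unused blocks.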
2085
2086 void addPeerAccess(c10::DeviceIndex dev_to_access) {
2087 std::lock_guard<std::recursive_mutex> lock(mutex);
2088 if (std::find(
2089 devices_with_peer_access_.begin(),
2090 devices_with_peer_access_.end(),
2091 dev_to_access) != devices_with_peer_access_.end()) {
2092 return;
2093 }
2094 devices_with_peer_access_.push_back(dev_to_access);
2095 for (auto& es : expandable_segments_) {
2096 es->addPeer(dev_to_access);
2097 }
2098 }
2099 std::vector<c10::DeviceIndex> peers() const {
2100 std::lock_guard<std::recursive_mutex> lock(mutex);
2101 return devices_with_peer_access_;
2102 }
2103
2104 bool hasAllocatedExpandableSegments() const {
2105 return !expandable_segments_.empty();
2106 }
2107
2108 private:
2109 // All private methods do not acquire the allocator mutex.
2110
2111 std::vector<const Block*> get_all_blocks() const {
2112 std::vector<const Block*> blocks;
2113 blocks.insert(
2114 blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end());
2115 blocks.insert(
2116 blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end());
2117 for (const auto& gp : graph_pools) {
2118 blocks.insert(
2119 blocks.end(),
2120 gp.second->small_blocks.blocks.begin(),
2121 gp.second->small_blocks.blocks.end());
2122 blocks.insert(
2123 blocks.end(),
2124 gp.second->large_blocks.blocks.begin(),
2125 gp.second->large_blocks.blocks.end());
2126 }
2127 blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end());
2128 return blocks;
2129 }
2130
2131 std::vector<Block*> get_private_pool_head_blocks(PrivatePool* pool) const {
2132 std::vector<Block*> blocks;
2133 for (Block* b : active_blocks) {
2134 if ((b->pool == &pool->small_blocks || b->pool == &pool->large_blocks) &&
2135 b->prev == nullptr) {
2136 blocks.push_back(b);
2137 }
2138 }
2139
2140 for (Block* b : pool->small_blocks.blocks) {
2141 if (b->prev == nullptr) {
2142 blocks.push_back(b);
2143 }
2144 }
2145 for (Block* b : pool->large_blocks.blocks) {
2146 if (b->prev == nullptr) {
2147 blocks.push_back(b);
2148 }
2149 }
2150
2151 return blocks;
2152 }
2153
2154 // returns the smallest possible address in any segment
2155 // where there is enough free address space to fit size
2156 // may be composed of free and unmapped segments
2157 Block* find_expandable_block(
2158 c10::DeviceIndex device,
2159 cudaStream_t stream,
2160 BlockPool* pool,
2161 size_t size) {
2162 Block key(device, stream, 0);
2163
2164 auto allocatable = [](Block* b) {
2165 return b && !b->allocated && b->event_count == 0 &&
2166 b->stream_uses.empty();
2167 };
2168 auto has_available_address_space = [&](Block* b) {
2169 size_t bytes = 0;
2170 while (bytes < size && allocatable(b)) {
2171 bytes += b->size;
2172 b = b->next;
2173 }
2174 return bytes >= size;
2175 };
2176 for (auto it = pool->unmapped.lower_bound(&key);
2177 it != pool->unmapped.end() && (*it)->stream == stream;
2178 ++it) {
2179 Block* c = *it;
2180 // we found the lowest address of an unmapped segment
2181 // but there might be a free segment we can also use
2182 // right before it
2183 if (allocatable(c->prev)) {
2184 c = c->prev;
2185 }
2186 if (has_available_address_space(c)) {
2187 return c;
2188 }
2189 }
2190 auto segment_size = pool->is_small ? kSmallBuffer : kLargeBuffer;
2191 cudaDeviceProp prop{};
2192 C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
2193 // we allocate enough address space for 1 1/8 of the total memory on the GPU.
2194 // This allows for some cases where we have to unmap pages earlier in the
2195 // segment to put them at the end.
2196 size_t address_space_size = prop.totalGlobalMem + prop.totalGlobalMem / 8;
2197
2198 expandable_segments_.emplace_back(new ExpandableSegment(
2199 device,
2200 stream,
2201 address_space_size,
2202 segment_size,
2203 devices_with_peer_access_));
2204
2205 ExpandableSegment* es = expandable_segments_.back();
2206 Block* candidate = new Block(device, stream, es->size(), pool, es->ptr());
2207 candidate->mapped = false;
2208 candidate->expandable_segment_ = es;
2209 pool->unmapped.insert(candidate);
2210 return candidate;
2211 }
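// Illustrative sizing only: on a device whose totalGlobalMem is 40 GiB, each
// new ExpandableSegment reserves roughly 45 GiB (1 1/8 of device memory) of
// virtual address space up front, but only the pages actually mapped by
// map_block() below are counted in total_allocated_memory / reserved_bytes.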
2212
2213 bool map_block(
2214 Block* to_map,
2215 size_t size,
2216 const std::shared_ptr<GatheredContext>& ctx) {
2217 TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size);
2218 TORCH_INTERNAL_ASSERT(
2219 !to_map->context_when_allocated); // unmapped blocks should not keep
2220 // history
2221 auto mapped_range =
2222 to_map->expandable_segment_->map(SegmentRange{to_map->ptr, size});
2223 // failed to map the memory
2224 if (mapped_range.size == 0) {
2225 return false;
2226 }
2227 TORCH_INTERNAL_ASSERT(
2228 mapped_range.ptr == to_map->ptr && mapped_range.size >= size);
2229
2230 BlockPool& pool = *to_map->pool;
2231 pool.unmapped.erase(to_map);
2232 to_map->mapped = true;
2233
2234 if (mapped_range.size < to_map->size) {
2235 // to_map -> remaining -> to_map->next(?)
2236 Block* remaining = new Block(
2237 to_map->device,
2238 to_map->stream,
2239 to_map->size - mapped_range.size,
2240 &pool,
2241 static_cast<char*>(to_map->ptr) + mapped_range.size);
2242 remaining->mapped = false;
2243 remaining->expandable_segment_ = to_map->expandable_segment_;
2244 remaining->splice(to_map, to_map->next);
2245 pool.unmapped.insert(remaining);
2246 to_map->size = mapped_range.size;
2247 }
2248
2249 try_merge_blocks(to_map, to_map->prev, pool);
2250 try_merge_blocks(to_map, to_map->next, pool);
2251
2252 pool.insert_into_blocks(to_map);
2253
2254 // update statistics
2255 total_allocated_memory += mapped_range.size;
2256 StatTypes stat_types = get_stat_types_for_pool(*to_map->pool);
2257 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2258 stats.reserved_bytes[stat_type].increase(mapped_range.size);
2259 });
2260 auto reserved_bytes_gauge =
2261 STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2262 reserved_bytes_gauge.record(
2263 stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2264 .current);
2265
2266 stats.num_device_alloc++;
2267 record_trace(
2268 TraceEntry::SEGMENT_MAP,
2269 int64_t(mapped_range.ptr),
2270 mapped_range.size,
2271 to_map->stream,
2272 to_map->device,
2273 ctx);
2274 if (!to_map->prev && !to_map->context_when_segment_allocated) {
2275 to_map->context_when_segment_allocated = ctx;
2276 }
2277
2278 return true;
2279 }
2280
2281 Block* try_allocate_expandable_block(
2282 c10::DeviceIndex device,
2283 cudaStream_t stream,
2284 BlockPool* pool,
2285 size_t size,
2286 const std::shared_ptr<GatheredContext>& ctx) {
2287 Block* candidate = find_expandable_block(device, stream, pool, size);
2288 // Candidate is now the start of a list of free/unmapped blocks with at least size room:
2289 // unmapped -> null
2290 // unmapped -> free -> *
2291 // free -> unmapped -> *
2292
2293 if (!candidate->mapped &&
2294 !map_block(candidate, std::min(candidate->size, size), ctx)) {
2295 return nullptr;
2296 }
2297 TORCH_INTERNAL_ASSERT(candidate->mapped);
2298
2299 while (candidate->size < size) {
2300 // invariant: free -> unmapped -> *
2301 // map_block will map some of unmapped and merge with free
2302 auto remaining = size - candidate->size;
2303 auto new_candidate = candidate->next;
2304 if (!map_block(
2305 new_candidate, std::min(remaining, candidate->next->size), ctx)) {
2306 return nullptr;
2307 }
2308 candidate = new_candidate;
2309 }
2310 pool->blocks.erase(candidate);
2311 return candidate;
2312 }
2313
2314 /** moves a block into a pool of cached free blocks */
2315 void free_block(
2316 Block* block,
2317 const std::shared_ptr<GatheredContext>& context) {
2318 TORCH_INTERNAL_ASSERT(
2319 !block->allocated && block->event_count == 0 &&
2320 block->stream_uses.empty());
2321
2322 record_trace(
2323 TraceEntry::FREE_COMPLETED,
2324 int64_t(block->ptr),
2325 block->requested_size,
2326 block->stream,
2327 block->device,
2328 context ? context : block->context_when_allocated);
2329
2330 block->context_when_allocated = nullptr;
2331 size_t original_block_size = block->size;
2332 size_t requested_size = block->requested_size;
2333
2334 auto& pool = *block->pool;
2335 int64_t net_change_inactive_split_blocks = 0;
2336 int64_t net_change_inactive_split_size = 0;
2337
2338 const std::array<Block*, 2> merge_candidates = {block->prev, block->next};
2339 for (Block* merge_candidate : merge_candidates) {
2340 const auto subsumed_size = try_merge_blocks(block, merge_candidate, pool);
2341 if (subsumed_size > 0) {
2342 net_change_inactive_split_blocks -= 1;
2343 net_change_inactive_split_size -= static_cast<int64_t>(subsumed_size);
2344 }
2345 }
2346
2347 active_blocks.erase(block);
2348 // Makes sure the Block* isn't already present in the pool we're freeing it
2349 // back into.
2350 // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
2351 bool inserted = pool.insert_into_blocks(block).second;
2352 TORCH_INTERNAL_ASSERT(inserted);
2353
2354 if (block->is_split()) {
2355 net_change_inactive_split_blocks += 1;
2356 net_change_inactive_split_size += static_cast<int64_t>(block->size);
2357 }
2358
2359 StatTypes stat_types = get_stat_types_for_pool(pool);
2360
2361 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2362 // inactive_split tries to capture the idea that blocks
2363 // cannot be freed when requested, but fully free pages
2364 // of expandable blocks can always be freed.
2365 // The logic to track this as statistic is pretty involved,
2366 // so we simply just exclude expandable segments from
2367 // inactive_split
2368 if (!block->expandable_segment_) {
2369 if (net_change_inactive_split_blocks > 0) {
2370 stats.inactive_split[stat_type].increase(
2371 static_cast<size_t>(net_change_inactive_split_blocks));
2372 } else if (net_change_inactive_split_blocks < 0) {
2373 stats.inactive_split[stat_type].decrease(
2374 static_cast<size_t>(-net_change_inactive_split_blocks));
2375 }
2376 if (net_change_inactive_split_size > 0) {
2377 stats.inactive_split_bytes[stat_type].increase(
2378 static_cast<size_t>(net_change_inactive_split_size));
2379 } else if (net_change_inactive_split_size < 0) {
2380 stats.inactive_split_bytes[stat_type].decrease(
2381 static_cast<size_t>(-net_change_inactive_split_size));
2382 }
2383 }
2384 stats.active[stat_type].decrease(1);
2385 stats.active_bytes[stat_type].decrease(original_block_size);
2386 stats.requested_bytes[stat_type].decrease(requested_size);
2387 });
2388 }
2389
2390 /** combine previously split blocks. returns the size of the subsumed block,
2391 * or 0 on failure. */
2392 size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
2393 if (!src || src->allocated || src->event_count > 0 ||
2394 !src->stream_uses.empty() || dst->mapped != src->mapped) {
2395 return 0;
2396 }
2397
2398 AT_ASSERT(dst->is_split() && src->is_split());
2399
2400 if (dst->prev == src) { // [src dst]
2401 dst->ptr = src->ptr;
2402 dst->prev = src->prev;
2403 if (dst->prev) {
2404 dst->prev->next = dst;
2405 }
2406 dst->context_when_segment_allocated =
2407 std::move(src->context_when_segment_allocated);
2408 } else { // [dest src]
2409 dst->next = src->next;
2410 if (dst->next) {
2411 dst->next->prev = dst;
2412 }
2413 }
2414 const size_t subsumed_size = src->size;
2415 dst->size += subsumed_size;
2416 // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
2417 auto erased =
2418 src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src);
2419 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(erased == 1);
2420 delete src;
2421
2422 return subsumed_size;
2423 }
2424
2425 BlockPool& get_pool(size_t size, cudaStream_t stream) {
2426 // captures_underway is a conservative guess that the current stream may be
2427 // capturing. It's only non-empty if some thread has begun and not yet ended
2428 // a capture, so it's usually empty, and we can short-circuit
2429 // cudaStreamCaptureStatus (which does a TLS lookup).
2430 if (C10_UNLIKELY(!captures_underway.empty())) {
2431 for (auto& entry : captures_underway) {
2432 if (entry.second(stream)) {
2433 auto it1 = graph_pools.find(entry.first);
2434 TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
2435 if (size <= kSmallSize) {
2436 return it1->second->small_blocks;
2437 } else {
2438 return it1->second->large_blocks;
2439 }
2440 }
2441 }
2442 }
2443 if (size <= kSmallSize) {
2444 return small_blocks;
2445 } else {
2446 return large_blocks;
2447 }
2448 }
2449
2450 StatTypes get_stat_types_for_pool(const BlockPool& pool) {
2451 StatTypes stat_types = {false};
2452 stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
2453 stat_types[static_cast<size_t>(
2454 pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true;
2455 return stat_types;
2456 }
2457
2458 bool should_split(const Block* block, size_t size) {
2459 size_t remaining = block->size - size;
2460 if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
2461 return remaining >= kMinBlockSize;
2462 } else {
2463 return (size < CUDAAllocatorConfig::max_split_size()) &&
2464 (remaining > kSmallSize);
2465 }
2466 }
2467
2468 static size_t get_allocation_size(size_t size) {
2469 if (size <= kSmallSize) {
2470 return kSmallBuffer;
2471 } else if (size < kMinLargeAlloc) {
2472 return kLargeBuffer;
2473 } else {
2474 return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
2475 }
2476 }
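// Worked examples, assuming the default size constants defined earlier in this
// file (kSmallSize = 1 MiB, kSmallBuffer = 2 MiB, kMinLargeAlloc = 10 MiB,
// kRoundLarge = 2 MiB, and kLargeBuffer = 20 MiB as above): a 512 KiB request
// reserves a 2 MiB segment, a 4 MiB request reserves a 20 MiB segment, and an
// 11 MiB request is rounded up to a 12 MiB segment (next multiple of 2 MiB).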
2477
2478 bool get_free_block(AllocParams& p) {
2479 BlockPool& pool = *p.pool;
2480
2481 if (C10_UNLIKELY(
2482 set_fraction &&
2483 CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
2484 // Track block reuse interval only when garbage collection is enabled.
2485 ++pool.get_free_blocks_call_count;
2486 }
2487 auto it = pool.blocks.lower_bound(&p.search_key);
2488 if (it == pool.blocks.end() || (*it)->stream != p.stream())
2489 return false;
2490
2491 if ((*it)->expandable_segment_) {
2492 if (CUDAAllocatorConfig::expandable_segments()) {
2493 // If we are allocating from the expandable part of a block, then for the
2494 // purposes of "best fit" we consider its size to be the size it can
2495 // expand to, not the size it currently is. This means that we sometimes
2496 // have to search for blocks with a bigger 'size' before choosing this
2497 // segment.
2498 auto expandable_size = [](Block* b) {
2499 return b->size + (b->next && !b->next->mapped ? b->next->size : 0);
2500 };
2501 auto next = it;
2502 next++;
2503 while ((*it)->expandable_segment_ && next != pool.blocks.end() &&
2504 (*next)->stream == p.stream() &&
2505 expandable_size(*next) < expandable_size(*it)) {
2506 it = next++;
2507 }
2508 } else {
2509 // Rarely, expandable segments have been turned off after we have
2510 // already allocated some blocks as expandable. For instance,
2511 // since we cannot share expandable memory via IPC, someone might
2512 // temporarily disable it. In this case we need to honor the request
2513 // by only returning non-expandable blocks.
2514 do {
2515 it++;
2516 } while (it != pool.blocks.end() && (*it)->expandable_segment_ &&
2517 (*it)->stream == p.stream());
2518 if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
2519 return false;
2520 }
2521 }
2522 }
2523
2524 // Do not return an oversized block for a large request
2525 if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
2526 ((*it)->size >= CUDAAllocatorConfig::max_split_size()))
2527 return false;
2528 // Allow oversized block size to be rounded up but within a limit
2529 if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
2530 ((*it)->size >= p.size() + kLargeBuffer))
2531 return false;
2532 p.block = *it;
2533 pool.blocks.erase(it);
2534 return true;
2535 }
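// An illustrative consequence of the two max_split_size checks above, assuming
// max_split_size has been configured to 200 MiB: a 10 MiB request will not be
// served from a cached 250 MiB oversize block, while a 210 MiB request may
// reuse a cached oversize block only if that block is smaller than
// p.size() + kLargeBuffer, i.e. under 230 MiB.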
2536
2537 bool trigger_free_memory_callbacks(AllocParams& p) {
2538 bool freed_memory = false;
2539 for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) {
2540 freed_memory |=
2541 FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute();
2542 }
2543 return freed_memory;
2544 }
2545
2546 void garbage_collect_cached_blocks(
2547 const std::shared_ptr<GatheredContext>& context) {
2548 // Free unused cached blocks to reclaim GPU memory.
2549 // Unlike release_cached_blocks(), this does not enforce synchronization and
2550 // therefore should incur less overhead.
2551
2552 size_t gc_threshold = static_cast<size_t>(
2553 CUDAAllocatorConfig::garbage_collection_threshold() *
2554 static_cast<double>(allowed_memory_maximum));
2555 // No need to trigger GC yet
2556 if (total_allocated_memory <= gc_threshold) {
2557 return;
2558 }
2559 const auto target_size = total_allocated_memory - gc_threshold;
2560 size_t gc_reclaimed = 0;
2561
2562 // Calculate the total age of the free-able blocks. We'll use it later to
2563 // get "avg age" threshold.
2564 size_t total_age = 0;
2565 int freeable_block_count = 0;
2566 for (auto& b : large_blocks.blocks) {
2567 if (!b->is_split()) {
2568 total_age += b->gc_count();
2569 ++freeable_block_count;
2570 }
2571 }
2572 // No free-able blocks?
2573 if (freeable_block_count == 0) {
2574 return;
2575 }
2576
2577 // Repeat GC until we reach reclaim > target size.
2578 bool block_freed = true;
2579 while (gc_reclaimed < target_size && block_freed == true &&
2580 freeable_block_count > 0) {
2581 // Free blocks exceeding this age threshold first.
2582 double age_threshold =
2583 static_cast<double>(total_age) / freeable_block_count;
2584 // Stop iteration if we can no longer free a block.
2585 block_freed = false;
2586
2587 // Free blocks of > avg age. Don't stop upon reaching the target_size,
2588 // we don't want this GC to be triggered frequently.
2589 auto it = large_blocks.blocks.begin();
2590 while (it != large_blocks.blocks.end()) {
2591 Block* block = *it;
2592 ++it;
2593 if (!block->is_split() && !block->expandable_segment_ &&
2594 static_cast<double>(block->gc_count()) >= age_threshold) {
2595 block_freed = true;
2596 gc_reclaimed += block->size;
2597 total_age -= block->gc_count(); // Decrement the age
2598 freeable_block_count--; // One less block that can be freed
2599 release_block(block, context);
2600 }
2601 }
2602 }
2603 }
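// An illustrative run of the loop above (numbers are an example only): with
// garbage_collection_threshold = 0.8 and allowed_memory_maximum = 10 GiB, GC
// begins once total_allocated_memory exceeds 8 GiB and repeatedly frees
// non-split, non-expandable large blocks whose gc_count is at or above the
// running average age, until at least the excess over the threshold has been
// reclaimed or no further block can be freed.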
2604
2605 // This function assumes the global lock has been taken while calling into
2606 // it. The synchronous cudaMalloc call made here can be expensive while the
2607 // lock is held, so the lock is passed in and temporarily released around
2608 // the cudaMalloc call (when release_lock_on_cudamalloc is enabled), then
2609 // reacquired afterwards so that other threads don't get blocked.
2610 bool alloc_block(
2611 AllocParams& p,
2612 bool isRetry,
2613 const std::shared_ptr<GatheredContext>& ctx,
2614 std::unique_lock<std::recursive_mutex>& lock) {
2615 // Defensively checks for preexisting CUDA error state.
2616 C10_CUDA_CHECK(cudaGetLastError());
2617
2618 size_t size = p.alloc_size;
2619 void* ptr = nullptr;
2620
2621 if (isRetry) {
2622 stats.num_alloc_retries += 1;
2623 }
2624 #ifdef FBCODE_CAFFE2
2625 bool in_fbcode = true;
2626 #else
2627 bool in_fbcode = false;
2628 #endif
2629
2630 if (set_fraction &&
2631 total_allocated_memory + size > allowed_memory_maximum) {
2632 p.err = cudaErrorMemoryAllocation;
2633 return false;
2634 // Temporarily disable checkpointing & cudagraphs internally
2635 } else if (
2636 CUDAAllocatorConfig::expandable_segments() &&
2637 !(in_fbcode && p.pool->owner_PrivatePool)) {
2638 p.block = try_allocate_expandable_block(
2639 p.device(), p.stream(), p.pool, p.size(), ctx);
2640 if (p.block) {
2641 p.err = cudaSuccess;
2642 if (p.pool->owner_PrivatePool) {
2643 // The block is for a CUDA graph's PrivatePool.
2644 p.pool->owner_PrivatePool->cudaMalloc_count++;
2645 }
2646 } else {
2647 p.err = cudaErrorMemoryAllocation;
2648 }
2649 return bool(p.block);
2650 } else {
2651 if (CUDAAllocatorConfig::release_lock_on_cudamalloc()) {
2652 // At scope exit, acquire the lock again. This provides safety against
2653 // any potential exceptions in the cudaMallocMaybeCapturing function.
2654 auto sg = c10::make_scope_exit([&]() { lock.lock(); });
2655 lock.unlock();
2656 }
2657 auto active_pool = MemPoolContext::getActiveMemPool();
2658 if (active_pool && active_pool->allocator() &&
2659 p.pool->owner_PrivatePool) {
2660 ptr = active_pool->allocator()->raw_alloc(size);
2661 p.err = ptr ? cudaSuccess : cudaErrorMemoryAllocation;
2662 } else {
2663 p.err = cudaMallocMaybeCapturing(&ptr, size);
2664 }
2665 if (CUDAAllocatorConfig::release_lock_on_cudamalloc()) {
2666 TORCH_CHECK(
2667 lock.owns_lock(), "Failed to acquire lock after cudaMalloc");
2668 }
2669
2670 if (p.err != cudaSuccess) {
2671 if (p.err == cudaErrorMemoryAllocation) {
2672 // If this is the first attempt (!isRetry), we can forgive and clear
2673 // CUDA's internal error state.
2674 //
2675 // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH
2676 // will take over to throw a helpful exception. The user can choose
2677 // to catch the exception, free some stuff in their script, and
2678 // attempt the allocation again. In this case, we can also forgive and
2679 // clear CUDA's internal error state.
2680 (void)cudaGetLastError();
2681 } else {
2682 // If the error's unrelated to memory allocation, we should throw
2683 // immediately.
2684 C10_CUDA_CHECK(p.err);
2685 }
2686 return false;
2687 }
2688 }
2689
2690 if (p.pool->owner_PrivatePool) {
2691 // The block is for a CUDA graph's PrivatePool.
2692 p.pool->owner_PrivatePool->cudaMalloc_count++;
2693 }
2694
2695 total_allocated_memory += size;
2696 p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
2697 for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
2698 stats.segment[stat_type].increase(1);
2699 stats.reserved_bytes[stat_type].increase(size);
2700 });
2701 if (size >= CUDAAllocatorConfig::max_split_size())
2702 stats.oversize_segments.increase(1);
2703 auto reserved_bytes_gauge =
2704 STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2705 reserved_bytes_gauge.record(
2706 stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2707 .current);
2708
2709 // p.block came from new, not cudaMalloc. It should not be nullptr here.
2710 TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
2711 stats.num_device_alloc++;
2712 record_trace(
2713 TraceEntry::SEGMENT_ALLOC,
2714 int64_t(p.block->ptr),
2715 p.block->size,
2716 p.stream(),
2717 p.device(),
2718 ctx);
2719 p.block->context_when_segment_allocated = ctx;
2720 return true;
2721 }
2722
2723 /** Free one or more oversize blocks to the system allocator, but only
2724 * enough to satisfy the target size. **/
2726 bool release_available_cached_blocks(
2727 const AllocParams& p,
2728 const std::shared_ptr<GatheredContext>& context) {
2729 if (CUDAAllocatorConfig::max_split_size() ==
2730 std::numeric_limits<size_t>::max())
2731 return false;
2732 BlockPool& pool = *p.pool;
2733
2734 // because of std::unique_ptr, block cannot be trivially copied
2735 // Use constructor for search key.
2736 Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
2737 key.size = (key.size < CUDAAllocatorConfig::max_split_size())
2738 ? CUDAAllocatorConfig::max_split_size()
2739 : key.size;
2740 auto it = pool.blocks.lower_bound(&key);
2741 if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
2742 (*it)->expandable_segment_) {
2743 // No single block is large enough; free multiple oversize blocks,
2744 // starting with the largest
2745 if (it == pool.blocks.begin())
2746 return false;
2747 size_t totalReleased = 0;
2748 --it; // Back up one item. Now on the largest block for the correct
2749 // stream
2750 while ((totalReleased < key.size) &&
2751 ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
2752 ((*it)->stream == p.stream())) {
2753 auto cur = it;
2754 bool is_first = cur == pool.blocks.begin();
2755 if (!is_first) {
2756 --it;
2757 }
2758 if (!(*cur)->expandable_segment_) {
2759 release_block(*cur, context);
2760 totalReleased += (*cur)->size;
2761 }
2762 if (is_first) {
2763 break;
2764 }
2765 }
2766 if (totalReleased < key.size)
2767 return false;
2768 } else {
2769 release_block(*it, context);
2770 }
2771 return true;
2772 }
2773
2774 bool release_cached_blocks(const std::shared_ptr<GatheredContext>& context) {
2775 // First ensure that all blocks that can't currently be allocated due to
2776 // outstanding events are returned to the pool.
2777 synchronize_and_free_events(context);
2778
2779 // Free all non-split cached blocks to system allocator
2780 release_blocks(large_blocks, context);
2781 release_blocks(small_blocks, context);
2782
2783 for (auto it = graph_pools_freeable.begin();
2784 it != graph_pools_freeable.end();) {
2785 // See notifyCaptureDestroy for the strategy here.
2786 TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
2787 release_blocks(it->second->small_blocks, context);
2788 release_blocks(it->second->large_blocks, context);
2789 if (it->second->cudaMalloc_count == 0) {
2790 auto erase_count = graph_pools.erase(it->first);
2791 TORCH_INTERNAL_ASSERT(erase_count == 1);
2792 it = graph_pools_freeable.erase(it);
2793 } else {
2794 ++it;
2795 }
2796 }
2797
2798 return true;
2799 }
2800
2801 void release_expandable_segment(Block* block) {
2802 TORCH_INTERNAL_ASSERT(
2803 block->size == block->expandable_segment_->size(),
2804 "block disagrees with segment");
2805 TORCH_INTERNAL_ASSERT(!block->mapped);
2806 auto it = std::find(
2807 expandable_segments_.begin(),
2808 expandable_segments_.end(),
2809 block->expandable_segment_);
2810 TORCH_INTERNAL_ASSERT(it != expandable_segments_.end());
2811 expandable_segments_.erase(it);
2812 block->pool->unmapped.erase(block);
2813 delete block->expandable_segment_;
2814 delete block;
2815 }
2816
2817 void release_block(
2818 Block* block,
2819 const std::shared_ptr<GatheredContext>& context) {
2820 TORCH_INTERNAL_ASSERT(!block->expandable_segment_);
2821 stats.num_device_free++;
2822 record_trace(
2823 TraceEntry::SEGMENT_FREE,
2824 int64_t(block->ptr),
2825 block->size,
2826 block->stream,
2827 block->device,
2828 context ? context : block->context_when_segment_allocated);
2829
2830 C10_CUDA_CHECK(cudaFree((void*)block->ptr));
2831 total_allocated_memory -= block->size;
2832
2833 auto* pool = block->pool;
2834 if (pool->owner_PrivatePool) {
2835 // The cudaFreed block belonged to a CUDA graph's PrivatePool.
2836 TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->cudaMalloc_count > 0);
2837 pool->owner_PrivatePool->cudaMalloc_count--;
2838 }
2839
2840 StatTypes stat_types = get_stat_types_for_pool(*pool);
2841 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2842 stats.segment[stat_type].decrease(1);
2843 stats.reserved_bytes[stat_type].decrease(block->size);
2844 });
2845 auto reserved_bytes_gauge =
2846 STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2847 reserved_bytes_gauge.record(
2848 stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2849 .current);
2850
2851 if (block->size >= CUDAAllocatorConfig::max_split_size())
2852 stats.oversize_segments.decrease(1);
2853 pool->blocks.erase(block);
2854 delete block;
2855 }
2856
2857 void unmap_block(
2858 Block* block,
2859 const std::shared_ptr<GatheredContext>& context) {
2860 auto unmapped = block->expandable_segment_->unmap(
2861 SegmentRange{block->ptr, block->size});
2862 if (unmapped.size == 0) {
2863 return;
2864 }
2865 block->pool->blocks.erase(block);
2866
2867 ptrdiff_t before_size =
2868 static_cast<char*>(unmapped.ptr) - static_cast<char*>(block->ptr);
2869 if (before_size > 0) {
2870 // prev? -> before_free -> block
2871 Block* before_free = new Block(
2872 block->device, block->stream, before_size, block->pool, block->ptr);
2873 before_free->expandable_segment_ = block->expandable_segment_;
2874 before_free->splice(block->prev, block);
2875 block->pool->insert_into_blocks(before_free);
2876 }
2877
2878 auto after_size = block->size - (before_size + unmapped.size);
2879 if (after_size > 0) {
2880 // block -> after_free -> next?
2881 Block* after_free = new Block(
2882 block->device,
2883 block->stream,
2884 after_size,
2885 block->pool,
2886 static_cast<char*>(unmapped.ptr) + unmapped.size);
2887 after_free->expandable_segment_ = block->expandable_segment_;
2888 after_free->splice(block, block->next);
2889 block->pool->insert_into_blocks(after_free);
2890 }
2891
2892 block->ptr = unmapped.ptr;
2893 block->size = unmapped.size;
2894 block->mapped = false;
2895
2896 try_merge_blocks(block, block->prev, *block->pool);
2897 try_merge_blocks(block, block->next, *block->pool);
2898 block->pool->unmapped.insert(block);
2899
2900 // update statistics
2901 total_allocated_memory -= unmapped.size;
2902 StatTypes stat_types = get_stat_types_for_pool(*block->pool);
2903 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
2904 stats.reserved_bytes[stat_type].decrease(unmapped.size);
2905 });
2906 auto reserved_bytes_gauge =
2907 STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
2908 reserved_bytes_gauge.record(
2909 stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
2910 .current);
2911
2912 if (block->pool->owner_PrivatePool) {
2913 // The cudaFreed block belonged to a CUDA graph's PrivatePool.
2914 TORCH_INTERNAL_ASSERT(
2915 block->pool->owner_PrivatePool->cudaMalloc_count > 0);
2916 block->pool->owner_PrivatePool->cudaMalloc_count--;
2917 }
2918
2919 stats.num_device_free++;
2920 record_trace(
2921 TraceEntry::SEGMENT_UNMAP,
2922 int64_t(unmapped.ptr),
2923 unmapped.size,
2924 block->stream,
2925 block->device,
2926 context ? context : block->context_when_segment_allocated);
2927 }
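  // Illustrative arithmetic for the splitting above (a sketch, not extra
  // logic): if the block spans [ptr, ptr + size) and unmap() trims the request
  // to the page-aligned range [ptr + a, ptr + a + u), then
  //   before_free covers [ptr, ptr + a)              (still mapped),
  //   this block becomes [ptr + a, ptr + a + u)      (now unmapped),
  //   after_free covers  [ptr + a + u, ptr + size)   (still mapped),
  // where a == before_size and u == unmapped.size.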
2928 void release_blocks(
2929 BlockPool& pool,
2930 const std::shared_ptr<GatheredContext>& context) {
2931 std::vector<Block*> to_unmap;
2932 // Frees all non-split blocks
2933 auto it = pool.blocks.begin();
2934 while (it != pool.blocks.end()) {
2935 Block* block = *it;
2936 ++it;
2937 if (block->expandable_segment_) {
2938 // unmapping will mutate the free pool
2939 // so just gather what needs to be freed
2940 // to avoid invalidating the iterator
2941 to_unmap.push_back(block);
2942 } else if (!block->prev && !block->next) {
2943 release_block(block, context);
2944 }
2945 }
2946 for (Block* block : to_unmap) {
2947 unmap_block(block, context);
2948 if (!block->prev && !block->next) {
2949 release_expandable_segment(block);
2950 }
2951 }
2952 }
2953
2954 EventPool::Event create_event_internal(c10::DeviceIndex idx) {
2955 // Leak the event pool to avoid shutdown issues.
2956 static auto* event_pool = new EventPool();
2957 return event_pool->get(idx);
2958 }
2959
2960 void synchronize_and_free_events(
2961 const std::shared_ptr<GatheredContext>& context) {
2962 // Synchronize on outstanding events and then free associated blocks.
2963 stats.num_sync_all_streams++;
2964
2965 // This function syncs, so capture should not be underway. Might as well
2966 // make sure capture-deferred end of life events get processed too.
2967 TORCH_INTERNAL_ASSERT(captures_underway.empty());
2968 insert_events_deferred_until_no_capture(context);
2969
2970 for (auto& st : cuda_events) {
2971 for (auto& e : st.second) {
2972 EventPool::Event event = std::move(e.first);
2973 Block* block = e.second;
2974
2975 C10_CUDA_CHECK(cudaEventSynchronize(*event));
2976
2977 block->event_count--;
2978 if (block->event_count == 0) {
2979 free_block(block, context);
2980 }
2981 }
2982 }
2983
2984 cuda_events.clear();
2985 }
2986
2987 void remove_cudagraph_stream_uses(Block* block) {
2988 // remove stream uses added during cudagraph capture
2989 // (i.e., block->stream_uses - block->cudagraph_stream_uses)
2990 if (C10_UNLIKELY(
2991 block_to_cudagraph_stream_uses.find(block) !=
2992 block_to_cudagraph_stream_uses.end())) {
2993 stream_set streams(std::move(block->stream_uses));
2994 AT_ASSERT(block->stream_uses.empty());
2995 for (auto& stream : streams) {
2996 if (block_to_cudagraph_stream_uses[block].find(stream) ==
2997 block_to_cudagraph_stream_uses[block].end()) {
2998 block->stream_uses.insert(stream);
2999 }
3000 }
3001 block_to_cudagraph_stream_uses.erase(block);
3002 }
3003 }
3004
3005 void insert_events(Block* block) {
3006 c10::DeviceIndex prev_device = 0;
3007 C10_CUDA_CHECK(c10::cuda::GetDevice(&prev_device));
3008
3009 stream_set streams(std::move(block->stream_uses));
3010 AT_ASSERT(block->stream_uses.empty());
3011 for (auto& stream : streams) {
3012 C10_CUDA_CHECK(c10::cuda::SetDevice(stream.device_index()));
3013
3014 EventPool::Event event = create_event_internal(stream.device_index());
3015 C10_CUDA_CHECK(cudaEventRecord(*event, stream.stream()));
3016
3017 block->event_count++;
3018 cuda_events[stream].emplace_back(std::move(event), block);
3019 }
3020
3021 C10_CUDA_CHECK(c10::cuda::MaybeSetDevice(prev_device));
3022 }
3023
3024 void insert_events_deferred_until_no_capture(
3025 const std::shared_ptr<GatheredContext>& context) {
3026 if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) {
3027 for (auto* block : needs_events_deferred_until_no_capture) {
3028 TORCH_INTERNAL_ASSERT(!block->stream_uses.empty());
3029 // only streams recorded before cudagraph capture will be used to insert
3030 // events, since we know all work on streams recorded during capture must
3031 // have completed (refer to Section 3.2.8.7.3.1, Cross-stream Dependencies
3032 // and Events, in the CUDA Programming Guide).
3033 remove_cudagraph_stream_uses(block);
3034 insert_events(block);
3035 if (block->event_count == 0) {
3036 free_block(block, context);
3037 }
3038 }
3039 needs_events_deferred_until_no_capture.clear();
3040 }
3041 }
3042
3043 void process_events(const std::shared_ptr<GatheredContext>& context) {
3044 insert_events_deferred_until_no_capture(context);
3045
3046 // Process outstanding cudaEvents. Events that are completed are
3047 // removed from the queue, and the 'event_count' for the
3048 // corresponding allocation is decremented. We maintain a separate
3049 // list of events per stream to avoid head-of-line delays if one
3050 // or more streams have long-running operations.
3051
3052 // Iterate over different streams.
3053 for (auto it = cuda_events.begin(); it != cuda_events.end();) {
3054 // Iterate over this stream's (event, block) pairs.
3055 while (!it->second.empty()) {
3056 auto& e = it->second.front();
3057 EventPool::Event event = std::move(e.first);
3058 Block* block = e.second;
3059
3060 cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(*event));
3061 if (err == cudaErrorNotReady) {
3062 // ignore and clear the error if not ready
3063 (void)cudaGetLastError();
3064 // Return the ownership of the Event (unique ptr)
3065 e.first = std::move(event);
3066 break;
3067 } else if (err != cudaSuccess) {
3068 C10_CUDA_CHECK(err);
3069 }
3070
3071 block->event_count--;
3072 if (block->event_count == 0) {
3073 free_block(block, context);
3074 }
3075 it->second.pop_front();
3076 }
3077
3078 if (it->second.empty()) {
3079 it = cuda_events.erase(it);
3080 } else {
3081 it++;
3082 }
3083 }
3084 }
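  // Note on the loop above: each per-stream deque is processed in FIFO order
  // and we stop at the first event that is not ready. Since events recorded on
  // one stream complete in the order they were recorded, nothing behind a
  // pending event can be ready yet, so breaking early is safe and keeps each
  // call cheap.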
3085
3086 // Iterates over the sizes of all memory blocks for the given device in the given pool
3087 void cache_info_aux(const BlockPool& pool, size_t* largest) {
3088 for (const auto& block : pool.blocks) {
3089 const auto blocksize = block->size;
3090 if (blocksize > *largest) {
3091 *largest = blocksize;
3092 }
3093 }
3094 }
3095
3096 void record_trace(
3097 TraceEntry::Action action,
3098 size_t addr,
3099 size_t size,
3100 cudaStream_t stream,
3101 c10::DeviceIndex device,
3102 std::shared_ptr<GatheredContext> context) {
3103 if (!record_history && trace_trackers_.empty())
3104 return;
3105
3106 auto te = TraceEntry(
3107 action,
3108 device,
3109 addr,
3110 size,
3111 stream,
3112 getApproximateTime(),
3113 record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr);
3114
3115 // Callbacks should not include any PyTorch call
3116 for (const auto& cb : trace_trackers_) {
3117 cb(te);
3118 }
3119
3120 if (record_history) {
3121 alloc_buffer.insertEntries(te);
3122 }
3123 }
3124 };
3125
3126 // Returns whether to force all allocations to bypass the caching allocator and
3127 // go straight to cudaMalloc. This setting is useful when debugging GPU memory
3128 // errors, since the caching allocator foils cuda-memcheck.
3129 bool forceUncachedAllocator() {
3130 static bool force_uncached =
3131 getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr;
3132 return force_uncached;
3133 }
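// Usage sketch (illustrative): running a workload with
//   PYTORCH_NO_CUDA_MEMORY_CACHING=1
// makes every allocation go straight to cudaMalloc/cudaFree. Note that only
// the presence of the variable is checked, so even setting it to "0"
// currently takes the uncached path.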
3134
3135 static void uncached_delete(void* ptr) {
3136 if (TORCH_SDT_IS_ENABLED(free)) {
3137 TORCH_SDT_WITH_SEMAPHORE(free, ptr);
3138 }
3139
3140 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3141 if (C10_UNLIKELY(interp)) {
3142 (*interp)->trace_gpu_memory_deallocation(
3143 c10::kCUDA, reinterpret_cast<uintptr_t>(ptr));
3144 }
3145 C10_CUDA_CHECK(cudaFree(ptr));
3146 }
3147
3148 void local_raw_delete(void* ptr);
3149
3150 class NativeCachingAllocator : public CUDAAllocator {
3151 private:
3152 // Shard allocation region to have independent mutexes to reduce contention.
3153 static constexpr size_t kNumMutexShard = 67;
3154
3155 // TODO: use std::hardware_destructive_interference_size once available
3156 struct alignas(64) AlignedMutex {
3157 std::mutex m;
3158 };
3159
3160 std::array<AlignedMutex, kNumMutexShard> mutex;
3161
3162 // allocated blocks by device pointer
3163 std::array<ska::flat_hash_map<void*, Block*>, kNumMutexShard>
3164 allocated_blocks;
3165
3166 static size_t get_mutex_shard_id(void* ptr) {
3167 return twang_mix64((size_t)ptr) % kNumMutexShard;
3168 }
3169
3170 void add_allocated_block(Block* block) {
3171 // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
3172 const auto mutex_shard_id = get_mutex_shard_id(block->ptr);
3173 std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
3174 allocated_blocks[mutex_shard_id][block->ptr] = block;
3175 }
3176
3177 // Variables used by the memory snapshot
3178 c10::ApproximateClockToUnixTimeConverter clock_converter;
3179 bool record_history = false;
3180 RingBuffer<AnnotationEntry> annotation_buffer;
3181
3182 public:
3183 std::vector<std::unique_ptr<DeviceCachingAllocator>> device_allocator;
3184
3185 Block* get_allocated_block(void* ptr, bool remove = false) {
3186 const auto mutex_shard_id = get_mutex_shard_id(ptr);
3187 std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
3188 auto it = allocated_blocks[mutex_shard_id].find(ptr);
3189 if (it == allocated_blocks[mutex_shard_id].end()) {
3190 return nullptr;
3191 }
3192 Block* block = it->second;
3193 if (remove) {
3194 allocated_blocks[mutex_shard_id].erase(it);
3195 }
3196 return block;
3197 }
3198
3199 void init(int device_count) override {
3200 const auto size = static_cast<int64_t>(device_allocator.size());
3201 if (size < device_count) {
3202 device_allocator.resize(device_count);
3203 for (const auto i : c10::irange(size, device_count)) {
3204 device_allocator[i] = std::make_unique<DeviceCachingAllocator>();
3205 }
3206 }
3207 }
3208
3209 bool initialized() override {
3210 return !device_allocator.empty();
3211 }
3212
3213 /** allocates a block which is safe to use from the provided stream */
3214 void malloc(
3215 void** devPtr,
3216 c10::DeviceIndex device,
3217 size_t size,
3218 cudaStream_t stream) {
3219 TORCH_INTERNAL_ASSERT(
3220 0 <= device && static_cast<size_t>(device) < device_allocator.size(),
3221 "Allocator not initialized for device ",
3222 device,
3223 ": did you call init?");
3224 Block* block = device_allocator[device]->malloc(device, size, stream);
3225 add_allocated_block(block);
3226 *devPtr = (void*)block->ptr;
3227 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3228 if (C10_UNLIKELY(interp)) {
3229 (*interp)->trace_gpu_memory_allocation(
3230 c10::kCUDA, reinterpret_cast<uintptr_t>(*devPtr));
3231 }
3232 }
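  // A minimal caller-side sketch of the malloc/free pair above, assuming the
  // free-function wrappers in CUDACachingAllocator.h that forward to the
  // active (native) allocator:
  //
  //   void* buf = c10::cuda::CUDACachingAllocator::raw_alloc(1 << 20);
  //   // ... launch work on the current stream that uses buf ...
  //   c10::cuda::CUDACachingAllocator::raw_delete(buf);
  //
  // The freed block is cached for reuse on the same stream rather than being
  // returned to the driver immediately.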
3233
3234 void free(void* ptr) {
3235 if (!ptr) {
3236 return;
3237 }
3238 Block* block = get_allocated_block(ptr, true /* remove */);
3239 if (!block) {
3240 TORCH_CHECK(false, "invalid device pointer: ", ptr);
3241 }
3242 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3243 if (C10_UNLIKELY(interp)) {
3244 (*interp)->trace_gpu_memory_deallocation(
3245 c10::kCUDA, reinterpret_cast<uintptr_t>(block->ptr));
3246 }
3247 device_allocator[block->device]->free(block);
3248 }
3249
3250 void setMemoryFraction(double fraction, c10::DeviceIndex device) override {
3251 TORCH_INTERNAL_ASSERT(
3252 0 <= device && static_cast<size_t>(device) < device_allocator.size(),
3253 "Allocator not initialized for device ",
3254 device,
3255 ": did you call init?");
3256 TORCH_INTERNAL_ASSERT(
3257 0 <= fraction && fraction <= 1,
3258 "invalid fraction: ",
3259 fraction,
3260 ". Please set within [0, 1].");
3261 C10_CUDA_CHECK(c10::cuda::SetDevice(device));
3262 device_allocator[device]->setMemoryFraction(fraction);
3263 }
3264
3265 void recordHistory(
3266 bool enabled,
3267 CreateContextFn context_recorder,
3268 size_t alloc_buffer_max_entries,
3269 RecordContext when) override {
3270 record_history = enabled;
3271 annotation_buffer.setMaxEntries(alloc_buffer_max_entries);
3272 annotation_buffer.clear();
3273 for (auto& allocator : device_allocator) {
3274 allocator->recordHistory(
3275 enabled, context_recorder, alloc_buffer_max_entries, when);
3276 }
3277 }
3278
3279 void recordAnnotation(
3280 const std::vector<std::pair<std::string, std::string>>& md) override {
3281 if (!record_history) {
3282 return;
3283 }
3284 c10::DeviceIndex device = 0;
3285 C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3286 auto ae = AnnotationEntry(
3287 /*device=*/device,
3288 /*time=*/getApproximateTime());
3289 for (const auto& md_pair : md) {
3290 ae.recordUserMetadata(md_pair.first, md_pair.second);
3291 }
3292 annotation_buffer.insertEntries(ae);
3293 }
3294
3295 bool isHistoryEnabled() override {
3296 c10::DeviceIndex device = 0;
3297 C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3298 return device_allocator[device]->isHistoryEnabled();
3299 }
3300
3301 bool checkPoolLiveAllocations(
3302 c10::DeviceIndex device,
3303 MempoolId_t mempool_id,
3304 const std::unordered_set<void*>& expected_live_allocations) override {
3305 return device_allocator[device]->checkPoolLiveAllocations(
3306 mempool_id, expected_live_allocations);
3307 }
3308
3309 void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override {
3310 for (auto& allocator : device_allocator) {
3311 allocator->attachOutOfMemoryObserver(observer);
3312 }
3313 }
3314
3315 void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) override {
3316 for (auto& allocator : device_allocator) {
3317 allocator->attachAllocatorTraceTracker(tracker);
3318 }
3319 }
3320
3321 void emptyCache() override {
3322 for (auto& da : device_allocator)
3323 da->emptyCache();
3324 }
3325
3326 void* getBaseAllocation(void* ptr, size_t* outSize) override {
3327 Block* block = get_allocated_block(ptr);
3328 if (!block) {
3329 TORCH_CHECK(false, "invalid device pointer: ", ptr);
3330 }
3331 return device_allocator[block->device]->getBaseAllocation(block, outSize);
3332 }
3333
3334 ShareableHandle shareIpcHandle(void* ptr) override {
3335 Block* block = get_allocated_block(ptr);
3336 if (!block) {
3337 TORCH_CHECK(false, "invalid device pointer: ", ptr);
3338 }
3339 return device_allocator[block->device]->shareIpcHandle(block);
3340 }
3341
3342 void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override {
3343 // Empty tensor's storage().data() might be a null ptr. As there are no
3344 // blocks associated with those tensors, it is fine to do nothing here.
3345 if (!ptr.get()) {
3346 return;
3347 }
3348
3349 // If a tensor is not allocated by this instance, simply skip.
3350 // This usually happens when CUDA tensors are shared across processes; we
3351 // have implemented a reference-counting-based sharing mechanism to
3352 // guarantee tensors won't be accidentally freed by one process while they
3353 // are still being used in another.
3354 if (ptr.get_deleter() != &local_raw_delete)
3355 return;
3356
3357 Block* block = get_allocated_block(ptr.get());
3358 // block must not be null when we reach this point
3359 TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found");
3360 device_allocator[block->device]->recordStream(block, stream);
3361 }
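  // A minimal sketch of the multi-stream pattern that recordStream above
  // protects (assuming the wrappers in CUDACachingAllocator.h and
  // CUDAStream.h):
  //
  //   c10::DataPtr ptr = c10::cuda::CUDACachingAllocator::get()->allocate(1024);
  //   c10::cuda::CUDAStream side = c10::cuda::getStreamFromPool();
  //   // ... enqueue work on `side` that reads or writes ptr.get() ...
  //   c10::cuda::CUDACachingAllocator::recordStream(ptr, side);
  //   // When ptr is released, its block is not reused until the work already
  //   // queued on `side` has completed.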
3362
3363 SnapshotInfo snapshot() override {
3364 // Set-up converter to convert timestamps from tsc to microseconds.
3365 auto tsc_to_ns = clock_converter.makeConverter();
3366 auto tsc_to_us = [=](approx_time_t t_approx) {
3367 return tsc_to_ns(t_approx) / 1000;
3368 };
3369
3370 SnapshotInfo result;
3371
3372 // Get AnnotationEntry list and convert the timestamps.
3373 annotation_buffer.getEntries(result.external_annotations);
3374 for (auto& ae : result.external_annotations) {
3375 ae.time_.t_ = tsc_to_us(ae.time_.approx_t_);
3376 }
3377
3378 // Get the device_traces' TraceEntry lists.
3379 for (auto& da : device_allocator) {
3380 result.device_traces.emplace_back(da->trace(tsc_to_us));
3381 auto snap = da->snapshot();
3382 result.segments.insert(result.segments.end(), snap.begin(), snap.end());
3383 }
3384
3385 auto& md = result.config_metadata;
3386 md.garbage_collection_threshold =
3387 CUDAAllocatorConfig::garbage_collection_threshold();
3388 md.max_split_size = CUDAAllocatorConfig::max_split_size();
3389 md.pinned_num_register_threads =
3390 CUDAAllocatorConfig::pinned_num_register_threads();
3391 md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
3392 md.release_lock_on_malloc =
3393 CUDAAllocatorConfig::release_lock_on_cudamalloc();
3394 md.pinned_use_host_register =
3395 CUDAAllocatorConfig::pinned_use_cuda_host_register();
3396 md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
3397 md.roundup_power2_divisions =
3398 CUDAAllocatorConfig::roundup_power2_divisions();
3399
3400 return result;
3401 }
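  // The SnapshotInfo returned above is what the Python-side memory snapshot
  // tooling (e.g. torch.cuda.memory._snapshot()) is built from. Timestamps are
  // stored as cheap approximate clock readings at record time and only
  // converted to microseconds here, presumably to keep recording off the hot
  // path.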
3402
3403 std::shared_ptr<AllocatorState> getCheckpointState(
3404 c10::DeviceIndex device,
3405 MempoolId_t id) override {
3406 return device_allocator[device]->getCheckpointState(id);
3407 }
3408
3409 /**
3410 * @brief Checkpoint the private pool state identified in `as` to its prior
3411 * state
3412 *
3413 * @param device - device of the pool to manipulate
3414 * @param as - allocator state
3415 * @param stale_live_storages - storages of tensors which are currently
3416 * allocated but which will not be allocated after the checkpoint is set.
3417 * For these storages we will remove their deleter function.
3418 * @return CheckpointDelta - Freed Pointers and DataPtrs that contain deleter
3419 * functions for all allocated blocks in the new checkpoint state.
3420 */
3421 CheckpointDelta setCheckpointPoolState(
3422 c10::DeviceIndex device,
3423 std::shared_ptr<AllocatorState> as) override {
3424 std::shared_ptr<PrivatePoolState> pps =
3425 std::dynamic_pointer_cast<PrivatePoolState>(as);
3426
3427 TORCH_CHECK(pps, "Expected PrivatePoolState");
3428
3429 auto rr = device_allocator[device]->setCheckpointPoolState(*pps);
3430
3431 CheckpointDelta cpd;
3432 for (void* ptr : rr.allocations_freed) {
3433 get_allocated_block(ptr, /*remove*/ true);
3434 cpd.ptrs_freed.push_back(ptr);
3435 }
3436 for (Block* block : rr.allocations_created) {
3437 add_allocated_block(block);
3438 cpd.dataptrs_allocd.emplace_back(
3439 block->ptr,
3440 block->ptr,
3441 &local_raw_delete,
3442 Device(DeviceType::CUDA, device));
3443 }
3444
3445 return cpd;
3446 }
3447
3448 DataPtr allocate(size_t size) override {
3449 constexpr size_t one_exa_bytes = 1152921504606846976ULL;
3450 TORCH_CHECK_WITH(
3451 OutOfMemoryError,
3452 size < one_exa_bytes,
3453 "CUDA out of memory. Tried to allocate more than 1EB memory.");
3454 c10::DeviceIndex device = 0;
3455 C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3456 void* devPtr = nullptr;
3457 void (*deleteFunc)(void*) = &local_raw_delete;
3458 CUDAStream stream = cuda::getCurrentCUDAStream(device);
3459
3460 if (forceUncachedAllocator()) {
3461 deleteFunc = &uncached_delete;
3462
3463 // Deliberately don't use cudaMallocMaybeCapturing here, to force an error
3464 // if someone tries to use forceUncachedAllocator while capturing.
3465 C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
3466 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
3467 if (C10_UNLIKELY(interp)) {
3468 (*interp)->trace_gpu_memory_allocation(
3469 c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
3470 }
3471 } else {
3472 if (size != 0) {
3473 this->malloc(&devPtr, device, size, stream);
3474 }
3475 }
3476
3477 if (size && TORCH_SDT_IS_ENABLED(malloc)) {
3478 TORCH_SDT_WITH_SEMAPHORE(malloc, devPtr, device, size, stream.id());
3479 }
3480
3481 return {devPtr, devPtr, deleteFunc, Device(DeviceType::CUDA, device)};
3482 }
3483 DeleterFnPtr raw_deleter() const override {
3484 if (forceUncachedAllocator()) {
3485 return &uncached_delete;
3486 } else {
3487 return &local_raw_delete;
3488 }
3489 }
3490 void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override {
3491 device_allocator[device]->cacheInfo(largestBlock);
3492 }
3493 void assertValidDevice(c10::DeviceIndex device) {
3494 const auto device_num = device_allocator.size();
3495 TORCH_CHECK(
3496 0 <= device && device < static_cast<int64_t>(device_num),
3497 "Invalid device argument ",
3498 device,
3499 ": did you call init?");
3500 }
3501
3502 DeviceStats getDeviceStats(c10::DeviceIndex device) override {
3503 assertValidDevice(device);
3504 return device_allocator[device]->getStats();
3505 }
3506
3507 void resetAccumulatedStats(c10::DeviceIndex device) override {
3508 assertValidDevice(device);
3509 device_allocator[device]->resetAccumulatedStats();
3510 }
3511
3512 void resetPeakStats(c10::DeviceIndex device) override {
3513 assertValidDevice(device);
3514 device_allocator[device]->resetPeakStats();
3515 }
3516 // CUDAGraph interactions
3517 void beginAllocateToPool(
3518 c10::DeviceIndex device,
3519 MempoolId_t mempool_id,
3520 std::function<bool(cudaStream_t)> filter) override {
3521 assertValidDevice(device);
3522 device_allocator[device]->beginAllocateToPool(
3523 std::move(mempool_id), std::move(filter));
3524 }
3525
3526 void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id)
3527 override {
3528 assertValidDevice(device);
3529 device_allocator[device]->endAllocateToPool(mempool_id);
3530 }
3531
3532 void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override {
3533 assertValidDevice(device);
3534 device_allocator[device]->releasePool(std::move(mempool_id));
3535 }
3536
3537 void* raw_alloc(size_t nbytes) override {
3538 if (nbytes == 0) {
3539 return nullptr;
3540 }
3541 c10::DeviceIndex device = 0;
3542 C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3543 void* r = nullptr;
3544 malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
3545 return r;
3546 }
3547
3548 void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override {
3549 if (nbytes == 0) {
3550 return nullptr;
3551 }
3552 c10::DeviceIndex device = 0;
3553 C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
3554 void* r = nullptr;
3555 malloc(&r, device, nbytes, stream);
3556 return r;
3557 }
3558
3559 void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access)
3560 override {
3561 c10::cuda::CUDAGuard device_guard(dev);
3562 cudaError_t err = cudaDeviceEnablePeerAccess(dev_to_access, 0);
3563 if (err == cudaErrorPeerAccessAlreadyEnabled) {
3564 // ignore and clear the error if access was already enabled
3565 (void)cudaGetLastError();
3566 } else {
3567 C10_CUDA_CHECK(err);
3568 }
3569 device_allocator[dev_to_access]->addPeerAccess(dev);
3570 std::lock_guard<std::mutex> lock(IpcMutex);
3571 for (auto& entry : ipcMemHandle_to_devptr) {
3572 if (entry.second.device_ == dev_to_access &&
3573 entry.second.expandable_segment_) {
3574 entry.second.expandable_segment_->addPeer(dev);
3575 }
3576 }
3577 }
3578
3579 cudaError_t memcpyAsync(
3580 void* dst,
3581 int dstDevice,
3582 const void* src,
3583 int srcDevice,
3584 size_t count,
3585 cudaStream_t stream,
3586 bool p2p_enabled) override {
3587 if (p2p_enabled || // memcpy ok because memory is mapped in both devices
3588 srcDevice == dstDevice || // memcpy ok on a single device
3589 // memcpy ok because both dst and src must have come from cudaMalloc
3590 (!device_allocator[dstDevice]->hasAllocatedExpandableSegments() &&
3591 !device_allocator[srcDevice]->hasAllocatedExpandableSegments())) {
3592 return cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, stream);
3593 }
3594 // when p2p is not enabled, only cudaMemcpyPeerAsync correctly handles
3595 // memory not allocated via cudaMalloc
3596 return cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream);
3597 }
3598
3599 void raw_delete(void* ptr) override {
3600 this->free(ptr);
3601 }
3602
3603 // In CUDA IPC, the sender sends a tensor to the receiver via shareIpcHandle;
3604 // getIpcDevPtr is called by the receiving process to map the CUDA memory from
3605 // the sending process into its own address space.
3606
3607 // When allocated with cudaMalloc we use the cudaIPCMemHandle_t APIs.
3608 // These APIs only allow sharing a big memory block associated with a
3609 // cudaIpcMemHandle_t and it can be opened only **once** per context per
3610 // process. There can be multiple types of storage in the same IPC mem block,
3611 // so we must cache the device ptr to construct typed storage as it comes.
3612
3613 // When using cuMemCreate, via expandable segments, we use
3614 // cuMemExportToShareableHandle to create a file descriptor that can be sent
3615 // to the other process to import the object. Then we recreate the part of
3616 // the expandable segment necessary to load the allocation.
3617
3618 // ipcMemHandle_to_devptr caches the mapping from shareable handle to
3619 // this process' memory mapping information for that share to ensure we do not
3620 // create it twice. When the shared_ptr is no longer in use we clean up the
3621 // cache.
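  // As implemented by MemHandleCacheEntry below, the serialized handle takes
  // roughly one of two layouts:
  //   - legacy: exactly CUDA_IPC_HANDLE_SIZE bytes holding a raw
  //     cudaIpcMemHandle_t (treated as SHAREABLE_CUDA_MALLOC), or
  //   - versioned: a one-byte version (<= SHAREABLE_HANDLE_VERSION), a
  //     one-byte type tag (SHAREABLE_CUDA_MALLOC or
  //     SHAREABLE_CUDA_EXPANDABLE_SEGMENT), followed by the type-specific
  //     payload.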
3622
3623 std::mutex IpcMutex;
3624 struct MemHandleCacheEntry {
3625 MemHandleCacheEntry(
3626 c10::DeviceIndex device,
3627 std::string& handle,
3628 const DeviceCachingAllocator& allocator)
3629 : device_(device),
3630 expandable_segment_(nullptr),
3631 cuda_ipc_ptr_(nullptr) {
3632 int type = SHAREABLE_CUDA_MALLOC;
3633 std::istringstream ss(handle);
3634 if (handle.size() != CUDA_IPC_HANDLE_SIZE) {
3635 auto version = ss.get();
3636 TORCH_CHECK(
3637 version <= SHAREABLE_HANDLE_VERSION,
3638 "received a shareable handle from a future version of torch that this version does not know how to handle");
3639 type = ss.get();
3640 } // otherwise this is coming from an old PyTorch where it has to be a raw
3641 // SHAREABLE_CUDA_MALLOC
3642 if (type == SHAREABLE_CUDA_MALLOC) {
3643 cudaIpcMemHandle_t cuda_handle;
3644 ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE);
3645 C10_CUDA_CHECK(cudaIpcOpenMemHandle(
3646 &cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess));
3647 } else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) {
3648 expandable_segment_ =
3649 ExpandableSegment::fromShared(device, allocator.peers(), ss)
3650 .release();
3651 } else {
3652 TORCH_INTERNAL_ASSERT(
3653 false, "unexpected or illformed shareable handle type");
3654 }
3655 }
3656 // this struct expects that clear is explicitly called to
3657 // free resources, because we only want this code running when
3658 // the shared pointer to this entry is destructed, not during
3659 // deinitialization when cuda may already have been shutdown.
3660 // This replicates the previous behavior of this map when it
3661 // stored raw cuda_ipc_ptr_ handles.
3662 void clear() {
3663 if (cuda_ipc_ptr_) {
3664 cuda::CUDAGuard device_guard(device_);
3665 C10_CUDA_CHECK(cudaIpcCloseMemHandle(cuda_ipc_ptr_));
3666 cuda_ipc_ptr_ = nullptr;
3667 }
3668 if (expandable_segment_) {
3669 delete expandable_segment_;
3670 expandable_segment_ = nullptr;
3671 }
3672 }
3673 void* ptr() {
3674 if (cuda_ipc_ptr_) {
3675 return cuda_ipc_ptr_;
3676 } else {
3677 return expandable_segment_->ptr();
3678 }
3679 }
3680 c10::DeviceIndex device_;
3681 ExpandableSegment* expandable_segment_;
3682 void* cuda_ipc_ptr_; // nullptr if expandable_segment_ is not null
3683 std::weak_ptr<void> wp_;
3684 };
3685
3686 ska::flat_hash_map<std::string, MemHandleCacheEntry> ipcMemHandle_to_devptr;
3687 std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
3688 std::lock_guard<std::mutex> lock(IpcMutex);
3689
3690 auto iter = ipcMemHandle_to_devptr.find(handle);
3691 if (iter != ipcMemHandle_to_devptr.end()) {
3692 auto devptr = iter->second.wp_.lock();
3693 // the weak_ptr should always be valid because we delete the entry from
3694 // the cache when the shared_ptr is destructed, so we should never get
3695 // here.
3696 TORCH_INTERNAL_ASSERT(devptr, "entry in cache has missing shared_ptr");
3697 return devptr;
3698 }
3699 c10::DeviceIndex curr_device = 0;
3700 C10_CUDA_CHECK(c10::cuda::GetDevice(&curr_device));
3701 auto inserted = ipcMemHandle_to_devptr.insert(
3702 iter,
3703 {handle,
3704 MemHandleCacheEntry(
3705 curr_device, handle, *device_allocator[curr_device])});
3706 auto sp = std::shared_ptr<void>(
3707 inserted->second.ptr(), [handle, this](void* ptr) {
3708 std::lock_guard<std::mutex> deleter_lock(IpcMutex);
3709 auto it = ipcMemHandle_to_devptr.find(handle);
3710 TORCH_INTERNAL_ASSERT(it != ipcMemHandle_to_devptr.end());
3711 it->second.clear();
3712 ipcMemHandle_to_devptr.erase(it);
3713 });
3714 inserted->second.wp_ = sp;
3715 return sp;
3716 }
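  // Lifecycle sketch for the cache above: the first call for a handle builds a
  // MemHandleCacheEntry (opening the IPC handle or importing the expandable
  // segment) and caches a weak_ptr to the returned shared_ptr; later calls
  // with the same handle just lock that weak_ptr. The shared_ptr's deleter,
  // run under IpcMutex, calls clear() and erases the cache entry, so the
  // mapping is torn down exactly when the last user releases it.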
3717
3718 std::string name() override {
3719 return "native";
3720 }
3721 void copy_data(void* dest, const void* src, std::size_t count) const final {
3722 C10_CUDA_CHECK(
3723 cudaMemcpy(dest, src, count, cudaMemcpyKind::cudaMemcpyDeviceToDevice));
3724 }
3725 };
3726
3727 NativeCachingAllocator allocator;
3728
3729 void local_raw_delete(void* ptr) {
3730 if (TORCH_SDT_IS_ENABLED(free)) {
3731 TORCH_SDT_WITH_SEMAPHORE(free, ptr);
3732 }
3733
3734 allocator.free(ptr);
3735 }
3736
3737 } // namespace Native
3738
3739 namespace CudaMallocAsync {
3740 // If this is put in its own header file, it gets incorrectly renamed in HIPify.
3741 CUDAAllocator* allocator();
3742
3743 } // namespace CudaMallocAsync
3744
3745 struct BackendStaticInitializer {
3746 // Parses env for backend at load time, duplicating some logic from
3747 // CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
3748 // runtime). Defers verbose exceptions and error checks, including Cuda
3749 // version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
3750 // works, maybe we should move all of CUDAAllocatorConfig here?
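  // For illustration, a config of the form
  //   PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
  // (options separated by commas or whitespace, key and value separated by
  // ':') selects the async backend here; "backend:native" or an unset
  // variable falls back to the native allocator below. Other keys are left
  // for CUDAAllocatorConfig to interpret later.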
3751 CUDAAllocator* parseEnvForBackend() {
3752 const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF");
3753 if (val != nullptr) {
3754 const std::string config(val);
3755
3756 std::regex exp("[\\s,]+");
3757 std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
3758 std::sregex_token_iterator end;
3759 std::vector<std::string> options(it, end);
3760
3761 for (auto option : options) {
3762 std::regex exp2("[:]+");
3763 std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
3764 std::sregex_token_iterator end2;
3765 std::vector<std::string> kv(it2, end2);
3766 if (kv.size() >= 2) {
3767 if (kv[0] == "backend") {
3768 if (kv[1] == "cudaMallocAsync")
3769 return CudaMallocAsync::allocator();
3770 if (kv[1] == "native")
3771 return &Native::allocator;
3772 }
3773 }
3774 }
3775 }
3776 return &Native::allocator;
3777 }
3778
3779 BackendStaticInitializer() {
3780 auto r = parseEnvForBackend();
3781 allocator.store(r);
3782 }
3783 };
3784
3785 std::atomic<CUDAAllocator*> allocator;
3786 BackendStaticInitializer backend_static_initializer;
3787 } // namespace cuda::CUDACachingAllocator
3788 } // namespace c10
3789
3790 namespace c10::cuda {
3791
3792 // uid_ is incremented when a user creates a MemPool,
3793 // for example: using graph_pool_handle() or c10::cuda::MemPool().
3794 //
3795 // uuid_ is incremented when CUDAGraph creates a MemPool
3796 // as a result of a user not providing a pool.
3797 //
3798 // MempoolId_t of {0, 0} is used to denote when no MemPool has been
3799 // passed to a function, either by user or CUDAGraphs. For example,
3800 // default value of MempoolId_t for capture_begin function is {0, 0}.
3801 // That's why uid_ and uuid_ start at 1.
3802 std::atomic<CaptureId_t> MemPool::uid_{1};
3803 std::atomic<CaptureId_t> MemPool::uuid_{1};
3804
3805 MemPool::MemPool(
3806 CUDACachingAllocator::CUDAAllocator* allocator,
3807 bool is_user_created)
3808 : allocator_(allocator), is_user_created_(is_user_created) {
3809 if (is_user_created_) {
3810 id_ = {0, uid_++};
3811 } else {
3812 id_ = {uuid_++, 0};
3813 }
3814 }
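// For example (following the counters above): the first user-created MemPool
// gets id_ == {0, 1}, the first CUDAGraph-created pool gets id_ == {1, 0},
// and {0, 0} stays reserved to mean "no pool was given".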
3815
3816 MempoolId_t MemPool::id() {
3817 return id_;
3818 }
3819
3820 CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
3821 return allocator_;
3822 }
3823
3824 // Note that active_mempool_ is a global variable here
3825 // and not inside MemPoolContext class, because in windows we
3826 // can't use __declspec(dllexport) and __declspec(thread)
3827 // together: https://stackoverflow.com/a/50967977
3828 static thread_local MemPool* active_mempool_ = nullptr;
3829
3830 MemPoolContext::MemPoolContext(MemPool* mempool)
3831 : prev_mempool_(active_mempool_) {
3832 active_mempool_ = mempool;
3833 }
3834
3835 MemPoolContext::~MemPoolContext() {
3836 active_mempool_ = prev_mempool_;
3837 }
3838
3839 MemPool* MemPoolContext::getActiveMemPool() {
3840 return active_mempool_;
3841 }
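// A minimal RAII usage sketch for MemPoolContext above (illustrative only):
//
//   c10::cuda::MemPool pool(/*allocator=*/nullptr, /*is_user_created=*/true);
//   {
//     c10::cuda::MemPoolContext ctx(&pool);
//     // code that calls MemPoolContext::getActiveMemPool() in this scope
//     // sees &pool
//   } // the destructor restores whatever pool was active before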
3842
3843 } // namespace c10::cuda
3844