#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <set>
#include <vector>

namespace c10 {

namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};

struct MemBlock {
  uint64_t start_offset, end_offset;
  MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
  bool operator<(const MemBlock& other) const {
    return start_offset < other.start_offset;
  }
};

enum class EventType { Allocate = 0, Free, Invalid };

struct MemEvent {
  uint64_t time;
  uint64_t allocation_id;
  uint64_t size;
  EventType type{EventType::Invalid};
  MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
      : time(t), allocation_id(id), size(s), type(e) {}
};

bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if
  // |---a--------|--------------b--------|
  // start_a     end_a <= start_b       end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}

bool validate_allocation_plan(
    const std::vector<MemEvent>& alloc_events,
    const std::vector<uint64_t>& allocation_offsets) {
  std::set<MemBlock> allocations;
  for (const auto& event : alloc_events) {
    auto alloc_id = event.allocation_id;
    // Skip allocations not managed by AllocationPlan
    if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto start_offset = allocation_offsets[alloc_id];
    auto end_offset = allocation_offsets[alloc_id] + event.size;
    MemBlock mem_block(start_offset, end_offset);
    if (event.type == EventType::Allocate) {
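      // Blocks already in the set are pairwise non-overlapping and ordered by
      // start_offset, so only the immediate neighbors of the new block (the
      // lower_bound result and its predecessor) can possibly overlap it.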
      auto it = allocations.lower_bound(mem_block);
      if (it != allocations.end()) {
        auto next_block = *it;
        if (overlaps(next_block, mem_block)) {
          return false;
        }
      }
      if (it != allocations.begin()) {
        auto prev_block = *(--it);
        if (overlaps(prev_block, mem_block)) {
          return false;
        }
      }
      allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          (*it).end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
    } else {
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
    }
  }
  return true;
}

std::vector<MemEvent> create_and_sort_mem_events(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  std::vector<MemEvent> events;
  for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then it is not managed by the AllocationPlan.
    if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
    events.emplace_back(
        allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
  }
  std::sort(
      events.begin(),
      events.end(),
      [](const MemEvent& a, const MemEvent& b) -> bool {
        return a.time < b.time;
      });
  return events;
}

std::vector<uint64_t> formulate_greedy_allocation_plan(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //  2.1 If allocate event:
  //      Find all candidates in the free_size_to_offset map.
  //      Greedily pick the first one.
  //      Remove the entry from the free_size_to_offset map.
  //      new_offset = offset + request_size
  //      new_size = size - request_size
  //      Add a new entry to both maps.
  //  2.2 If free event:
  //      Check if the returned offset merges with another chunk.
  //      If so, merge until no more merging is possible.
  //      If the returned offset does not merge, then
  //      just return it as a chunk.

  // lower_bound on this map will get all candidates of
  // the right size for allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_start_offset_to_size_iter;
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_end_offset_to_size_iter;
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists, merge the freed allocation.
  // Also find the corresponding offset in free_size_to_offset,
  // remove that entry, and update it with the new size and offset.
  // If end_ptr does not exist, then just insert (offset, size)
  // in one map and, correspondingly, (size, offset) in the other map.
  // Merging should always be done recursively until no more chunks
  // can be merged.
  // After the last free we should have only one entry left in these maps.

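  // A small worked example (illustrative only; the sizes and ids below are
  // made up, not taken from any recorded plan):
  //   alloc id=0 size=32 -> no free block, offset 0,  max_offset becomes 32
  //   alloc id=1 size=64 -> no free block, offset 32, max_offset becomes 96
  //   free  id=0         -> free chunk [0, 32)
  //   alloc id=2 size=16 -> lower_bound(16) finds [0, 32); reuse offset 0 and
  //                         put the remainder [16, 32) back into the maps
  //   free  id=1         -> chunk [32, 96); its start matches the end of
  //                         [16, 32), so the two merge into [16, 96)
  //   free  id=2         -> chunk [0, 16); its end matches the start of
  //                         [16, 96), so everything merges into [0, 96)
  //   Resulting offsets: {0, 32, 0}; the plan needs 96 bytes in total.
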
  std::vector<uint64_t> allocation_offsets(
      allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
  auto mem_events =
      create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
  uint64_t max_offset{0};
  for (const auto& mem_event : mem_events) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t alloc_offset;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t new_offset, new_size;
    if (mem_event.type == EventType::Allocate) {
      auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no contiguous block of the requested size,
        // allocate a new one.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // If we have found a block of the size we want:
        // 1. Carve the allocation out of the block.
        //    1.1 Erase the entire block.
        //    1.2 Erase the reverse map entries.
        // 2. If the block still has space left, insert the remainder back
        //    into the map, including the reverse map entries.
        alloc_offset = it->second;
        // Capture the block's size before erasing it; the iterator is
        // invalidated by the erase below.
        auto block_size = it->first;
        new_offset = alloc_offset + mem_event.size;
        new_size = block_size - mem_event.size;
        free_size_to_offset.erase(it);
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + block_size);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
      allocation_offsets[mem_event.allocation_id] = alloc_offset;
    } else {
      // 1. Check if the freed block is adjacent to an existing free block
      //    at its end boundary. This is done by looking up the freed
      //    block's end offset in free_start_offset_to_size_iter.
      //    If we find such a block, remove it and adjust the size of
      //    the block being freed.
      // 2. Similarly, check if the freed block is adjacent to an existing
      //    free block at its start boundary. This is done by looking up the
      //    freed block's start offset in free_end_offset_to_size_iter.
      //    If we find such a block, remove it and adjust the size and offset
      //    of the block being freed.
      // 3. Insert the freed block in the maps.
      auto freed_offset = allocation_offsets[mem_event.allocation_id];
      auto freed_size = mem_event.size;
      auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
      auto end_it = free_start_offset_to_size_iter.find(end_offset);
      if (end_it != free_start_offset_to_size_iter.end()) {
        auto merge_block_iter = end_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_start_offset_to_size_iter.erase(end_it);
        // If the block is being merged, then also remove it from
        // free_end_offset_to_size_iter.
        free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
      }
      // Merge when the freed block sits at the end of another free block.
      auto start_it = free_end_offset_to_size_iter.find(freed_offset);
      if (start_it != free_end_offset_to_size_iter.end()) {
        auto merge_block_iter = start_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        freed_offset -= merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_end_offset_to_size_iter.erase(start_it);
        // If the block is being merged, then also remove it from
        // free_start_offset_to_size_iter.
        free_start_offset_to_size_iter.erase(freed_offset);
      }
      auto freed_block_it =
          free_size_to_offset.emplace(freed_size, freed_offset).first;
      free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
      free_end_offset_to_size_iter.emplace(
          freed_offset + freed_size, freed_block_it);
    }
  }
  TORCH_CHECK(
      validate_allocation_plan(mem_events, allocation_offsets),
      "ProfilingAllocator: Allocation plan invalid.");
  return allocation_offsets;
}

} // namespace

void AllocationPlan::clear() {
  allocation_sizes.clear();
  allocation_lifetimes.clear();
  allocation_offsets.clear();
}

void AllocationPlanner::record_allocation(
    const uint64_t size,
    const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_allocation(size, ptr);
    return;
  }
  allocation_plan_->allocation_sizes.push_back(size);
  allocation_plan_->allocation_lifetimes.push_back(
      std::numeric_limits<uint64_t>::max());
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
}

void AllocationPlanner::record_free(const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_free(ptr);
    return;
  }
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // The pointer being freed was allocated outside of
    // WithProfileAllocationsGuard.
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during record_allocation.");
  allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}

bool AllocationPlanner::validate_allocation(
    const uint64_t size,
    const void* ptr) {
  // Split the checks so the warning never indexes allocation_sizes out of
  // bounds when more allocations are observed than were recorded.
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Number of recorded allocations:",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Recorded size of the requested allocation:",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got:",
        size);
    return false;
  }
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return true;
}

bool AllocationPlanner::validate_free(const void* ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Allocation that was made outside the validation scope is being freed here
    return true;
  }
  auto id = (*it).second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during validate_allocation.");
  auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
  return (lifetime_id == allocation_id_);
}

void AllocationPlanner::formulate_plan() {
  allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
      allocation_plan_->allocation_sizes,
      allocation_plan_->allocation_lifetimes);
  allocation_plan_->total_size = 0;
  for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
    if (allocation_plan_->allocation_lifetimes[i] ==
        std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto limit = allocation_plan_->allocation_offsets[i] +
        allocation_plan_->allocation_sizes[i];
    allocation_plan_->total_size =
        std::max(allocation_plan_->total_size, limit);
  }
}

void AllocationPlanner::clear() {
  allocation_plan_->clear();
  allocation_ptr_to_id_.clear();
}

void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
  plan_ = plan;
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  if (current_size_ < plan->total_size) {
    // Free existing memory and reallocate for larger size.
    c10::free_cpu(blob_);
    blob_ = c10::alloc_cpu(plan->total_size);
    current_size_ = plan->total_size;
  }
}

void CPUProfilingAllocator::unset_plan() {
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  plan_ = nullptr;
}

void* CPUProfilingAllocator::allocate(const size_t bytes) {
  TORCH_CHECK(
      bytes == plan_->allocation_sizes[allocation_id_],
      "Got allocation request that does not match with the plan.");
  if (plan_->allocation_lifetimes[allocation_id_] ==
      std::numeric_limits<uint64_t>::max()) {
    // This allocation is not managed by ProfilingAllocator.
    allocation_id_++;
    return c10::alloc_cpu(bytes);
  }
  void* ptr = reinterpret_cast<uint8_t*>(blob_) +
      plan_->allocation_offsets[allocation_id_];
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return ptr;
}

void CPUProfilingAllocator::free(void* const ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation that was made outside the validation scope is being
    //    freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed. Example of the second type:
    //    Tensor out;
    //    for (....) {
    //      {
    //        WithProfilingAllocatorGuard used here
    //        out = ...some op (this also frees the previous memory held by out)
    //      }
    //      out is used..
    //    }
    c10::free_cpu(ptr);
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing an allocation that is not part of the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected:",
      lifetime_id,
      ", got:",
      allocation_id_);
}

CPUProfilingAllocator::~CPUProfilingAllocator() {
  c10::free_cpu(blob_);
}

WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan);
  planner_->clear();
  allocation_planner = planner_.get();
}

WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
  planner_->formulate_plan();
  allocation_planner = nullptr;
}

WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
    AllocationPlan* plan,
    bool* success) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan, true);
  success_ = success;
  allocation_planner = planner_.get();
}

WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
  *success_ = planner_->validation_success;
  allocation_planner = nullptr;
}

AllocationPlanner* GetThreadLocalAllocationPlanner() {
  return allocation_planner;
}

WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
    CPUProfilingAllocator* allocator,
    const AllocationPlan* plan) {
  // Nesting of profiling allocator is not supported.
  TORCH_CHECK(
      profiling_allocator == nullptr,
      "Nesting profiling allocators is not supported.");
  profiling_allocator = allocator;
  profiling_allocator->set_plan(plan);
}

WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
  profiling_allocator->unset_plan();
  profiling_allocator = nullptr;
}

CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
  return profiling_allocator;
}

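// Typical three-phase usage (a minimal sketch; run_model() stands in for the
// client workload and is not defined in this file):
//
//   AllocationPlan plan;
//   {
//     WithProfileAllocationsGuard profile_guard(&plan);
//     run_model();  // record allocation sizes and lifetimes into the plan
//   }
//   bool plan_valid = false;
//   {
//     WithValidateAllocationPlanGuard validate_guard(&plan, &plan_valid);
//     run_model();  // re-run to confirm the same allocation pattern
//   }
//   if (plan_valid) {
//     CPUProfilingAllocator profiling_allocator;
//     WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan);
//     run_model();  // managed allocations now come out of one preallocated blob
//   }
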
} // namespace c10