#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <limits>
#include <map>
#include <set>

namespace c10 {

namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};

struct MemBlock {
  uint64_t start_offset, end_offset;
  MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
  bool operator<(const MemBlock& other) const {
    return start_offset < other.start_offset;
  }
};

enum class EventType { Allocate = 0, Free, Invalid };

struct MemEvent {
  uint64_t time;
  uint64_t allocation_id;
  uint64_t size;
  EventType type{EventType::Invalid};
  MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
      : time(t), allocation_id(id), size(s), type(e) {}
};

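// Returns true when the two half-open blocks [start_offset, end_offset)
// share at least one byte. Illustrative example: [0, 16) and [16, 32) only
// touch at offset 16 and do not overlap, while [0, 16) and [8, 24) do.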
bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if
  // |---a--------|--------------b--------|
  // start_a    end_a <= start_b        end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}

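// Replays the recorded allocate/free events against the proposed offsets and
// checks that no two managed blocks that are live at the same time overlap.
// Allocations whose offset is the uint64_t max sentinel are not managed by
// the plan and are skipped.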
bool validate_allocation_plan(
    const std::vector<MemEvent>& alloc_events,
    const std::vector<uint64_t>& allocation_offsets) {
  std::set<MemBlock> allocations;
  for (const auto& event : alloc_events) {
    auto alloc_id = event.allocation_id;
    // Skip allocations not managed by the AllocationPlan.
    if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto start_offset = allocation_offsets[alloc_id];
    auto end_offset = allocation_offsets[alloc_id] + event.size;
    MemBlock mem_block(start_offset, end_offset);
    if (event.type == EventType::Allocate) {
      auto it = allocations.lower_bound(mem_block);
      if (it != allocations.end()) {
        auto next_block = *it;
        if (overlaps(next_block, mem_block)) {
          return false;
        }
      }
      if (it != allocations.begin()) {
        auto prev_block = *(--it);
        if (overlaps(prev_block, mem_block)) {
          return false;
        }
      }
      allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          (*it).end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
    } else {
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
    }
  }
  return true;
}

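// Builds one Allocate and one Free event per managed allocation: the
// allocation index doubles as the Allocate timestamp and the recorded
// lifetime as the Free timestamp. The events are then sorted by timestamp.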
std::vector<MemEvent> create_and_sort_mem_events(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  std::vector<MemEvent> events;
  for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then it is not managed by the AllocationPlan.
    if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
    events.emplace_back(
        allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
  }
  std::sort(
      events.begin(),
      events.end(),
      [](const MemEvent& a, const MemEvent& b) -> bool {
        return a.time < b.time;
      });
  return events;
}

std::vector<uint64_t> formulate_greedy_allocation_plan(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //   2.1 If allocate event:
  //       Find a candidate in the free_size_to_offset map and
  //       greedily pick the first one that is large enough.
  //       Remove the entry from the free_size_to_offset map.
  //       new_offset = offset + request_size
  //       new_size = size - request_size
  //       Add the new entry to both maps.
  //   2.2 If free event:
  //       Check if the freed chunk merges with another free chunk.
  //       If so, merge until no more merging is possible.
  //       If it does not merge, just insert it as a free chunk.

  // lower_bound on this map finds the first free chunk that is
  // large enough for the requested allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_start_offset_to_size_iter;
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_end_offset_to_size_iter;
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists, merge the freed allocation with its neighbour.
  // Also find the corresponding offset in free_size_to_offset,
  // remove that entry and update it with the new size and offset.
  // If end_ptr does not exist, then just insert offset, size
  // in the map and correspondingly size, offset in the other map.
  // Merging should always be done until no more mergeable chunks
  // can be found.
  // After the last free we should have only one entry left in these maps.
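  //
  // Worked example (illustrative, not taken from a recorded trace):
  // three managed allocations A(64), B(32), C(64), where A is freed before
  // C is allocated:
  //   Alloc A(64): no free chunk        -> offset 0,  max_offset = 64
  //   Alloc B(32): no free chunk        -> offset 64, max_offset = 96
  //   Free  A    : free chunk [0, 64)
  //   Alloc C(64): lower_bound finds [0, 64) -> offset 0 is reused
  // The resulting plan needs 96 bytes instead of the 160 bytes that
  // allocating every block at a fresh offset would require.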

  std::vector<uint64_t> allocation_offsets(
      allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
  auto mem_events =
      create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
  uint64_t max_offset{0};
  for (const auto& mem_event : mem_events) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t alloc_offset;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t new_offset, new_size;
    if (mem_event.type == EventType::Allocate) {
      auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no contiguous block of the requested size,
        // allocate a new one.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // We have found a block of the size we want:
        // 1. Carve the allocation out of the block.
        //    1.1 Erase the entire block.
        //    1.2 Erase the reverse map entries.
        // 2. If the block still has space left, insert the remainder back in
        //    the map, including reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
      allocation_offsets[mem_event.allocation_id] = alloc_offset;
    } else {
      // 1. Check if the freed block is adjacent to an existing free block
      //    at its end boundary. This is done by looking up end_offset in
      //    free_start_offset_to_size_iter.
      //    If we find such a block, remove it and adjust the size of
      //    the block being freed.
      // 2. Similarly, check if the freed block is adjacent to an existing
      //    free block at its start boundary. This is done by looking up
      //    freed_offset in free_end_offset_to_size_iter.
      //    If we find such a block, remove it and adjust the size of
      //    the block being freed.
      // 3. Insert the (possibly merged) freed block in the maps.
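      //
      // Illustrative example: with free chunks [0, 16) and [32, 64) already
      // recorded, freeing [16, 32) merges with both neighbours and leaves a
      // single free chunk [0, 64) in the maps.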
      auto freed_offset = allocation_offsets[mem_event.allocation_id];
      auto freed_size = mem_event.size;
      auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
      auto end_it = free_start_offset_to_size_iter.find(end_offset);
      if (end_it != free_start_offset_to_size_iter.end()) {
        auto merge_block_iter = end_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_start_offset_to_size_iter.erase(end_it);
        // If the block is being merged then also remove it from
        // free_end_offset_to_size_iter.
        free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
      }
      // Merge when this freed block sits at the end of another free block.
      auto start_it = free_end_offset_to_size_iter.find(freed_offset);
      if (start_it != free_end_offset_to_size_iter.end()) {
        auto merge_block_iter = start_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        freed_offset -= merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_end_offset_to_size_iter.erase(start_it);
        // If the block is being merged then also remove it from
        // free_start_offset_to_size_iter.
        free_start_offset_to_size_iter.erase(freed_offset);
      }
      auto freed_block_it =
          free_size_to_offset.emplace(freed_size, freed_offset).first;
      free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
      free_end_offset_to_size_iter.emplace(
          freed_offset + freed_size, freed_block_it);
    }
  }
  TORCH_CHECK(
      validate_allocation_plan(mem_events, allocation_offsets),
      "ProfilingAllocator: Allocation plan invalid.");
  return allocation_offsets;
}

} // namespace

void AllocationPlan::clear() {
  allocation_sizes.clear();
  allocation_lifetimes.clear();
  allocation_offsets.clear();
}

void AllocationPlanner::record_allocation(
    const uint64_t size,
    const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_allocation(size, ptr);
    return;
  }
  allocation_plan_->allocation_sizes.push_back(size);
  allocation_plan_->allocation_lifetimes.push_back(
      std::numeric_limits<uint64_t>::max());
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
}

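// The recorded lifetime is the value of allocation_id_ at the time of the
// free, i.e. the id that the next recorded allocation would receive. This
// value later serves as the Free timestamp when the events are replayed in
// create_and_sort_mem_events.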
void AllocationPlanner::record_free(const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_free(ptr);
    return;
  }
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // The free being recorded was allocated outside of
    // WithProfileAllocationsGuard.
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during record_allocation.");
  allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}

bool AllocationPlanner::validate_allocation(
    const uint64_t size,
    const void* ptr) {
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size() ||
      allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Number of recorded allocations:",
        allocation_plan_->allocation_sizes.size(),
        ", Recorded size of the requested allocation:",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got:",
        size);

    return false;
  }
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return true;
}

bool AllocationPlanner::validate_free(const void* ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // An allocation that was made outside the validation scope is being
    // freed here.
    return true;
  }
  auto id = (*it).second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during validate_allocation.");
  auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
  return (lifetime_id == allocation_id_);
}

void AllocationPlanner::formulate_plan() {
  allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
      allocation_plan_->allocation_sizes,
      allocation_plan_->allocation_lifetimes);
  allocation_plan_->total_size = 0;
  for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
    if (allocation_plan_->allocation_lifetimes[i] ==
        std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto limit = allocation_plan_->allocation_offsets[i] +
        allocation_plan_->allocation_sizes[i];
    allocation_plan_->total_size =
        std::max(allocation_plan_->total_size, limit);
  }
}

void AllocationPlanner::clear() {
  allocation_plan_->clear();
  allocation_ptr_to_id_.clear();
}

void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
  plan_ = plan;
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  if (current_size_ < plan->total_size) {
    // Free existing memory and reallocate for the larger size.
    c10::free_cpu(blob_);
    blob_ = c10::alloc_cpu(plan->total_size);
    current_size_ = plan->total_size;
  }
}

void CPUProfilingAllocator::unset_plan() {
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  plan_ = nullptr;
}

void* CPUProfilingAllocator::allocate(const size_t bytes) {
  TORCH_CHECK(
      bytes == plan_->allocation_sizes[allocation_id_],
      "Got allocation request that does not match with the plan.");
  if (plan_->allocation_lifetimes[allocation_id_] ==
      std::numeric_limits<uint64_t>::max()) {
    // This allocation is not managed by ProfilingAllocator.
    allocation_id_++;
    return c10::alloc_cpu(bytes);
  }
  void* ptr = reinterpret_cast<uint8_t*>(blob_) +
      plan_->allocation_offsets[allocation_id_];
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return ptr;
}

void CPUProfilingAllocator::free(void* const ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation that was made outside the profiling scope is being
    //    freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed. Example of the second type:
    //      Tensor out;
    //      for (....) {
    //        {
    //          WithProfilingAllocatorGuard guard(...);
    //          out = ...some op (this also frees previous memory held by out)
    //        }
    //        out is used..
    //      }
    c10::free_cpu(ptr);
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing allocation that is not according to the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected:",
      lifetime_id,
      ", got:",
      allocation_id_);
}

CPUProfilingAllocator::~CPUProfilingAllocator() {
  c10::free_cpu(blob_);
}

WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan);
  planner_->clear();
  allocation_planner = planner_.get();
}

WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
  planner_->formulate_plan();
  allocation_planner = nullptr;
}

WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
    AllocationPlan* plan,
    bool* success) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan, true);
  success_ = success;
  allocation_planner = planner_.get();
}

WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
  *success_ = planner_->validation_success;
  allocation_planner = nullptr;
}

AllocationPlanner* GetThreadLocalAllocationPlanner() {
  return allocation_planner;
}

WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
    CPUProfilingAllocator* allocator,
    const AllocationPlan* plan) {
  // Nesting of profiling allocators is not supported.
  TORCH_CHECK(
      profiling_allocator == nullptr,
      "Nesting profiling allocators is not supported.");
  profiling_allocator = allocator;
  profiling_allocator->set_plan(plan);
}

WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
  profiling_allocator->unset_plan();
  profiling_allocator = nullptr;
}

CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
  return profiling_allocator;
}
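
// Illustrative usage sketch of the guards defined above. run_workload is a
// hypothetical function standing in for the code whose allocations are being
// planned; it must produce the same sequence of allocations on every run.
//
//   AllocationPlan plan;
//   {
//     WithProfileAllocationsGuard profile_guard(&plan);
//     run_workload();
//   }
//   bool plan_is_valid = false;
//   {
//     WithValidateAllocationPlanGuard validate_guard(&plan, &plan_is_valid);
//     run_workload();
//   }
//   if (plan_is_valid) {
//     CPUProfilingAllocator profiling_allocator;
//     WithProfilingAllocatorGuard guard(&profiling_allocator, &plan);
//     // Allocator consumers that check GetThreadLocalProfilingAllocator()
//     // now serve allocations out of the planned blob.
//     run_workload();
//   }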

} // namespace c10