// /aosp_15_r20/external/pytorch/c10/core/CPUAllocator.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#include <c10/core/Allocator.h>
#include <c10/core/CPUAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/Logging.h>

// TODO: rename flag to C10
C10_DEFINE_bool(
    caffe2_report_cpu_memory_usage,
    false,
    "If set, print out detailed memory usage");

namespace c10 {

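// The default (non-mobile) CPU allocator: a thin wrapper around
// c10::alloc_cpu()/c10::free_cpu() that reports every allocation,
// deallocation, and out-of-memory event to the ProfiledCPUMemoryReporter.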
struct C10_API DefaultCPUAllocator final : at::Allocator {
  DefaultCPUAllocator() = default;
  at::DataPtr allocate(size_t nbytes) override {
    void* data = nullptr;
    try {
      data = c10::alloc_cpu(nbytes);
    } catch (c10::Error& e) {
      profiledCPUMemoryReporter().OutOfMemory(nbytes);
      throw e;
    }
    profiledCPUMemoryReporter().New(data, nbytes);
    return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
  }

  static void ReportAndDelete(void* ptr) {
    if (!ptr) {
      return;
    }
    profiledCPUMemoryReporter().Delete(ptr);
    free_cpu(ptr);
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &ReportAndDelete;
  }

  void copy_data(void* dest, const void* src, std::size_t count) const final {
    default_copy_data(dest, src, count);
  }
};

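// Process-wide reporter instance shared by the allocators in this file;
// the function-local static is initialized lazily and thread-safely.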
ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() {
  static ProfiledCPUMemoryReporter reporter_;
  return reporter_;
}

// QNNPACK and XNNPACK may access the input and/or output tensors out of
// bounds. This is by design, and chosen to make the implementation of
// micro-kernels both simpler and faster, as a result of not having to
// individually handle the corner cases where the number of processed elements
// is not a multiple of the SIMD register width. This behavior will trigger
// ASAN though, and may result in a segfault if the accessed memory location
// just so happens to fall on a page the current process has no read access
// to. Here we define a custom allocator that allocates the extra storage
// required to keep this behavior safe. This allocator could have been
// restricted to QNNPACK and XNNPACK only, but that would have negative
// performance ramifications, as input tensors would then have to be
// reallocated and copied over whenever a tensor was not allocated with this
// allocator to begin with. Making this allocator the default on mobile builds
// minimizes the probability of unnecessary reallocations and copies, and also
// enables acceleration of operations where the output tensor is allocated
// outside of the function doing the implementation, wherein the implementation
// cannot simply re-allocate the output with the guarding allocator.
//
// PreGuardBytes: Number of guard bytes to allocate before the allocation.
// PostGuardBytes: Number of guard bytes to allocate after the allocation.

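// As a rough sketch of the layout produced by the instantiation used later in
// this file (PreGuardBytes = gAlignment, PostGuardBytes = 16), a request for
// nbytes allocates
//
//   [ PreGuardBytes | nbytes usable | PostGuardBytes ]
//
// The pointer handed back to the caller points just past the pre-guard, while
// the DataPtr context keeps the base pointer so deleter() can free the whole
// block.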
template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
class DefaultMobileCPUAllocator final : public at::Allocator {
 public:
  DefaultMobileCPUAllocator() = default;
  ~DefaultMobileCPUAllocator() override = default;

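  // Frees a pointer previously returned by allocate(). The free is routed to
  // the thread-local caching or profiling allocator if one is installed;
  // otherwise the block is released with c10::free_cpu() and the free is
  // recorded for the caching allocator and allocation planner.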
  static void deleter(void* const pointer) {
    if (C10_UNLIKELY(!pointer)) {
      return;
    }
    // TODO: enable with better TLS support on mobile
    // profiledCPUMemoryReporter().Delete(pointer);
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      allocator_ptr->free(pointer);
    } else if (profiling_allocator_ptr != nullptr) {
      profiling_allocator_ptr->free(pointer);
    } else {
      c10::free_cpu(pointer);
      // record_free() adds extra cost to freeing memory in the default case,
      // i.e. when the caching allocator is not enabled.
      // NOLINTNEXTLINE(clang-analyzer-unix.Malloc)
      CPUCachingAllocator::record_free(pointer);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_free(pointer);
      }
    }
  }

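  // Zero-byte requests return an empty DataPtr. Otherwise the request is
  // padded with the pre/post guard bytes and, when present, routed through
  // the thread-local caching or profiling allocator. The returned data
  // pointer is offset past the pre-guard, with the unadjusted base pointer
  // stored as the DataPtr context.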
  DataPtr allocate(const size_t nbytes) override {
    if (C10_UNLIKELY(0u == nbytes)) {
      return {
          nullptr,
          nullptr,
          &deleter,
          at::Device(DeviceType::CPU),
      };
    }

    auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    void* data;
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      data = allocator_ptr->allocate(alloc_size);
    } else if (profiling_allocator_ptr != nullptr) {
      data = profiling_allocator_ptr->allocate(alloc_size);
    } else {
      try {
        data = c10::alloc_cpu(alloc_size);
      } catch (c10::Error& e) {
        profiledCPUMemoryReporter().OutOfMemory(alloc_size);
        throw e;
      }
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_allocation(alloc_size, data);
      }
    }
    profiledCPUMemoryReporter().New(data, alloc_size);
    return {
        reinterpret_cast<uint8_t*>(data) + PreGuardBytes,
        data,
        &deleter,
        at::Device(DeviceType::CPU),
    };
  }

  DeleterFnPtr raw_deleter() const override {
    return deleter;
  }

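  // A DataPtr produced by this allocator is "simple" when its data pointer is
  // exactly the stored context (the base allocation) plus PreGuardBytes, i.e.
  // it has not been re-pointed elsewhere within the allocation.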
  bool is_simple_data_ptr(const c10::DataPtr& data_ptr) const final {
    return reinterpret_cast<const uint8_t*>(data_ptr.get()) ==
        reinterpret_cast<const uint8_t*>(data_ptr.get_context()) +
        PreGuardBytes;
  }

  void copy_data(void* dest, const void* src, std::size_t count) const final {
    default_copy_data(dest, src, count);
  }
};

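// A deleter that intentionally does nothing, for DataPtrs that wrap memory
// they do not own.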
void NoDelete(void*) {}

at::Allocator* GetCPUAllocator() {
  return GetAllocator(DeviceType::CPU);
}

void SetCPUAllocator(at::Allocator* alloc, uint8_t priority) {
  SetAllocator(DeviceType::CPU, alloc, priority);
}

// The mobile CPU allocator must always be present, even on non-mobile builds,
// because QNNPACK and XNNPACK are not mobile-specific.
//
// Pre-guard: 8 bytes for QNNPACK, but set to gAlignment to ensure SIMD
//            alignment, not of the allocated memory, but of the memory
//            location returned to the user.
// Post-guard: 16 bytes for XNNPACK.

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
static DefaultMobileCPUAllocator<gAlignment, 16u> g_mobile_cpu_allocator;

at::Allocator* GetDefaultMobileCPUAllocator() {
  return &g_mobile_cpu_allocator;
}

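// On mobile builds the guarded mobile allocator doubles as the default CPU
// allocator; on all other builds the plain DefaultCPUAllocator is registered.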
#ifdef C10_MOBILE

at::Allocator* GetDefaultCPUAllocator() {
  return GetDefaultMobileCPUAllocator();
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_mobile_cpu_allocator);

#else

// Global default CPU Allocator
static DefaultCPUAllocator g_cpu_alloc;

at::Allocator* GetDefaultCPUAllocator() {
  return &g_cpu_alloc;
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);

#endif /* C10_MOBILE */

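// Records a successful allocation. The per-pointer size table and running
// total are only maintained (under the mutex) when flag-based logging or the
// memory profiler is enabled; otherwise this is effectively a no-op.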
void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) {
  if (nbytes == 0) {
    return;
  }
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    size_table_[ptr] = nbytes;
    allocated_ += nbytes;
    allocated = allocated_;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

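// Records a deallocation. The pointer's size is looked up in the table
// populated by New(); blocks allocated before profiling started are unknown
// and only trigger an occasional warning.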
void ProfiledCPUMemoryReporter::Delete(void* ptr) {
  size_t nbytes = 0;
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = size_table_.find(ptr);
    if (it != size_table_.end()) {
      allocated_ -= it->second;
      allocated = allocated_;
      nbytes = it->second;
      size_table_.erase(it);
    } else {
      // C10_LOG_EVERY_MS might log every time in some builds,
      // so we use a simple counter to avoid spammy logs.
      if (log_cnt_++ % 1000 == 0) {
        LOG(WARNING) << "Memory block of unknown size was allocated before "
                     << "the profiling started, profiler results will not "
                     << "include the deallocation event";
      }
    }
  }
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 deleted " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        -static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

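// Called when alloc_cpu() fails; reports the size of the failed request
// together with the running total of tracked allocations.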
void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) {
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);

    allocated = allocated_;
  }
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 Out of Memory. Trying to allocate " << nbytes
              << " bytes, total alloc " << allocated << " bytes.";
  }
  if (profile_memory) {
    reportOutOfMemoryToProfiler(
        static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

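// An optionally registered CPU caching allocator. Registration is
// priority-based: a new allocator wins only if its priority is greater than
// or equal to that of the currently registered one.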
C10_API at::Allocator* cpu_caching_alloc = nullptr;
C10_API uint8_t cpu_caching_alloc_priority = 0;

void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority) {
  if (priority >= cpu_caching_alloc_priority) {
    cpu_caching_alloc = alloc;
    cpu_caching_alloc_priority = priority;
  }
}

Allocator* GetCPUCachingAllocator() {
  if (cpu_caching_alloc == nullptr) {
    VLOG(1)
        << "There is no caching allocator registered for CPU; using the default allocator instead.";
    return GetAllocator(DeviceType::CPU);
  }
  return cpu_caching_alloc;
}

} // namespace c10