#include <c10/core/Allocator.h>
#include <c10/core/CPUAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/Logging.h>

// TODO: rename flag to C10
C10_DEFINE_bool(
    caffe2_report_cpu_memory_usage,
    false,
    "If set, print out detailed memory usage");

namespace c10 {

struct C10_API DefaultCPUAllocator final : at::Allocator {
  DefaultCPUAllocator() = default;
  at::DataPtr allocate(size_t nbytes) override {
    void* data = nullptr;
    try {
      data = c10::alloc_cpu(nbytes);
    } catch (c10::Error&) {
      profiledCPUMemoryReporter().OutOfMemory(nbytes);
      // Rethrow the original exception rather than a copy, which would
      // slice any derived exception type.
      throw;
    }
    profiledCPUMemoryReporter().New(data, nbytes);
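    // The DataPtr aggregate below is (data, context, deleter, device); data
    // and context coincide here because the caller-visible pointer is not
    // offset into the block.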
    return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
  }

  static void ReportAndDelete(void* ptr) {
    if (!ptr) {
      return;
    }
    profiledCPUMemoryReporter().Delete(ptr);
    free_cpu(ptr);
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &ReportAndDelete;
  }

  void copy_data(void* dest, const void* src, std::size_t count) const final {
    default_copy_data(dest, src, count);
  }
};

ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() {
  static ProfiledCPUMemoryReporter reporter_;
  return reporter_;
}

// QNNPACK and XNNPACK may access the input and/or output tensors out of
// bounds. This is by design: it keeps the micro-kernels simpler and faster,
// since they need not individually handle the corner cases where the number
// of processed elements is not a multiple of the SIMD register width. This
// behavior will trigger ASAN, however, and may result in a segfault if the
// accessed memory happens to fall on a page the current process has no read
// access to. Here we define a custom allocator that reserves the extra
// storage required to keep such accesses safe. This allocator could have
// been restricted to QNNPACK and XNNPACK only, but that would hurt
// performance: any input tensor not allocated with it to begin with would
// have to be reallocated and copied over. Making this allocator the default
// on mobile builds minimizes the probability of unnecessary reallocations
// and copies, and also covers operations whose output tensor is allocated
// outside the implementing function, where the implementation cannot simply
// re-allocate the output with the guarding allocator.
//
// PreGuardBytes: Number of guard bytes to allocate before the allocation.
// PostGuardBytes: Number of guard bytes to allocate after the allocation.
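//
// A worked example with illustrative numbers: with PreGuardBytes = 8 and
// PostGuardBytes = 16, a request for 100 bytes allocates 8 + 100 + 16 = 124
// bytes and hands the caller a pointer 8 bytes past the start of the block,
// so a micro-kernel may read up to 8 bytes before and 16 bytes past the
// logical tensor without leaving memory owned by this process.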

template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
class DefaultMobileCPUAllocator final : public at::Allocator {
 public:
  DefaultMobileCPUAllocator() = default;
  ~DefaultMobileCPUAllocator() override = default;

  static void deleter(void* const pointer) {
    if (C10_UNLIKELY(!pointer)) {
      return;
    }
    // TODO: enable with better TLS support on mobile
    // profiledCPUMemoryReporter().Delete(pointer);
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      allocator_ptr->free(pointer);
    } else if (profiling_allocator_ptr != nullptr) {
      profiling_allocator_ptr->free(pointer);
    } else {
      c10::free_cpu(pointer);
      // This adds extra cost to the default free path, taken when the
      // caching allocator is not enabled.
      // NOLINTNEXTLINE(clang-analyzer-unix.Malloc)
      CPUCachingAllocator::record_free(pointer);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_free(pointer);
      }
    }
  }

  DataPtr allocate(const size_t nbytes) override {
    if (C10_UNLIKELY(0u == nbytes)) {
      return {
          nullptr,
          nullptr,
          &deleter,
          at::Device(DeviceType::CPU),
      };
    }

    auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    void* data;
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      data = allocator_ptr->allocate(alloc_size);
    } else if (profiling_allocator_ptr != nullptr) {
      data = profiling_allocator_ptr->allocate(alloc_size);
    } else {
      try {
        data = c10::alloc_cpu(alloc_size);
      } catch (c10::Error&) {
        profiledCPUMemoryReporter().OutOfMemory(alloc_size);
        throw;
      }
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_allocation(alloc_size, data);
      }
    }
    profiledCPUMemoryReporter().New(data, alloc_size);
    return {
        reinterpret_cast<uint8_t*>(data) + PreGuardBytes,
        data,
        &deleter,
        at::Device(DeviceType::CPU),
    };
  }

  DeleterFnPtr raw_deleter() const override {
    return deleter;
  }

  bool is_simple_data_ptr(const c10::DataPtr& data_ptr) const final {
    return reinterpret_cast<const uint8_t*>(data_ptr.get()) ==
        reinterpret_cast<const uint8_t*>(data_ptr.get_context()) +
        PreGuardBytes;
  }

  void copy_data(void* dest, const void* src, std::size_t count) const final {
    default_copy_data(dest, src, count);
  }
};
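
// Usage sketch (hypothetical caller, illustrating the guard-byte layout and
// the is_simple_data_ptr() invariant):
//
//   DefaultMobileCPUAllocator<8u, 16u> alloc;
//   at::DataPtr p = alloc.allocate(100);  // backing block spans 124 bytes
//   assert(static_cast<uint8_t*>(p.get()) ==
//          static_cast<uint8_t*>(p.get_context()) + 8);
//   assert(alloc.is_simple_data_ptr(p));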

void NoDelete(void*) {}

at::Allocator* GetCPUAllocator() {
  return GetAllocator(DeviceType::CPU);
}

void SetCPUAllocator(at::Allocator* alloc, uint8_t priority) {
  SetAllocator(DeviceType::CPU, alloc, priority);
}

// The mobile CPU allocator must always be present even on non-mobile builds
// because QNNPACK and XNNPACK are not mobile specific.
//
// Pre-guard: QNNPACK needs 8 bytes, but we use gAlignment so that the
//            pointer returned to the user, not just the underlying
//            allocation, is SIMD-aligned.
// Post-guard: 16 bytes for XNNPACK.
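//
// For example, assuming gAlignment is 64 on a given build: alloc_cpu()
// returns a gAlignment-aligned block, so the user pointer (block + 64) is
// also 64-byte aligned, while still leaving at least the 8 guard bytes
// QNNPACK may read before it.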

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
static DefaultMobileCPUAllocator<gAlignment, 16u> g_mobile_cpu_allocator;

at::Allocator* GetDefaultMobileCPUAllocator() {
  return &g_mobile_cpu_allocator;
}

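// REGISTER_ALLOCATOR installs the chosen allocator in the global
// per-device-type table that GetAllocator(DeviceType::CPU) consults above.
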
#ifdef C10_MOBILE

at::Allocator* GetDefaultCPUAllocator() {
  return GetDefaultMobileCPUAllocator();
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_mobile_cpu_allocator);

#else

// Global default CPU allocator
static DefaultCPUAllocator g_cpu_alloc;

at::Allocator* GetDefaultCPUAllocator() {
  return &g_cpu_alloc;
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);

#endif /* C10_MOBILE */

void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) {
  if (nbytes == 0) {
    return;
  }
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    size_table_[ptr] = nbytes;
    allocated_ += nbytes;
    allocated = allocated_;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

void ProfiledCPUMemoryReporter::Delete(void* ptr) {
  size_t nbytes = 0;
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = size_table_.find(ptr);
    if (it != size_table_.end()) {
      allocated_ -= it->second;
      allocated = allocated_;
      nbytes = it->second;
      size_table_.erase(it);
    } else {
      // C10_LOG_EVERY_MS might log every time in some builds;
      // use a simple counter to avoid spammy logs.
      if (log_cnt_++ % 1000 == 0) {
        LOG(WARNING) << "Memory block of unknown size was allocated before "
                     << "profiling started; profiler results will not "
                     << "include the deallocation event.";
      }
    }
  }
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 deleted " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        -static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) {
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    allocated = allocated_;
  }
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 Out of Memory. Trying to allocate " << nbytes
              << " bytes, total alloc " << allocated << " bytes.";
  }
  if (profile_memory) {
    reportOutOfMemoryToProfiler(
        static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

C10_API at::Allocator* cpu_caching_alloc = nullptr;
C10_API uint8_t cpu_caching_alloc_priority = 0;

void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority) {
  if (priority >= cpu_caching_alloc_priority) {
    cpu_caching_alloc = alloc;
    cpu_caching_alloc_priority = priority;
  }
}
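
// Example with a hypothetical allocator type: registration takes effect only
// when the new priority is at least the currently registered one.
//
//   static MyCachingAllocator g_my_caching_alloc;  // hypothetical type
//   SetCPUCachingAllocator(&g_my_caching_alloc, /*priority=*/1);
//   // A later call with priority 0 would be ignored, since 0 < 1.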

Allocator* GetCPUCachingAllocator() {
  if (cpu_caching_alloc == nullptr) {
    VLOG(1)
        << "No caching allocator is registered for CPU; using the default allocator instead.";
    return GetAllocator(DeviceType::CPU);
  }
  return cpu_caching_alloc;
}

} // namespace c10