1 #include <c10/core/impl/alloc_cpu.h>
2
3 #include <c10/core/alignment.h>
4 #include <c10/util/Flags.h>
5 #include <c10/util/Logging.h>
6 #include <c10/util/irange.h>
7 #include <c10/util/numa.h>
8
9 #ifdef USE_MIMALLOC
10 #include <mimalloc.h>
11 #endif
12
13 #ifdef __linux__
14 #include <sys/mman.h>
15 #include <unistd.h>
16 #endif
17
18 // TODO: rename flags to C10
19 C10_DEFINE_bool(
20 caffe2_cpu_allocator_do_zero_fill,
21 false,
22 "If set, do memory zerofilling when allocating on CPU");
23
24 C10_DEFINE_bool(
25 caffe2_cpu_allocator_do_junk_fill,
26 false,
27 "If set, fill memory with deterministic junk when allocating on CPU");
28
29 namespace c10 {
30
31 namespace {
32
// Overwrites the first `num` bytes starting at `data` with a fixed garbage
// pattern: the 32-bit word 0x7fedbeef repeated back to back. That word is a
// NaN when read as a 32-bit float and a very large value when read as an
// integer, which makes use of unfilled memory easy to spot.
//
// data: start of the region to fill; it is written through an int64_t
//       pointer, so it is expected to be suitably aligned for int64_t.
// num:  number of bytes to fill; need not be a multiple of 8.
void memset_junk(void* data, size_t num) {
  static constexpr int32_t kJunkWord = 0x7fedbeef;
  // Two copies of the word packed into one 64-bit value so the bulk of the
  // region can be written eight bytes at a time.
  static constexpr int64_t kJunkQuad =
      (static_cast<int64_t>(kJunkWord) << 32) | kJunkWord;

  const size_t whole_quads = num / sizeof(kJunkQuad);
  const size_t tail_bytes = num % sizeof(kJunkQuad);

  int64_t* cursor = static_cast<int64_t*>(data);
  int64_t* const quads_end = cursor + whole_quads;
  while (cursor != quads_end) {
    *cursor = kJunkQuad;
    ++cursor;
  }

  // Fewer than eight bytes are left over: copy just the leading bytes of the
  // pattern so nothing past `num` is touched.
  if (tail_bytes != 0) {
    memcpy(cursor, &kJunkQuad, tail_bytes);
  }
}
52
53 #if defined(__linux__) && !defined(__ANDROID__)
// Reports whether the THP_MEM_ALLOC_ENABLE environment variable asks for
// THP-aware allocation. The variable is read only on the first call and the
// answer is cached in a function-local static, so later changes to the
// environment are not observed. An unset variable, or one that std::atoi
// parses to 0, yields false; any other parsed value yields true.
static inline bool is_thp_alloc_enabled() {
  static const bool enabled = [] {
    const char* env = std::getenv("THP_MEM_ALLOC_ENABLE");
    if (env == nullptr) {
      return false;
    }
    return std::atoi(env) != 0;
  }();
  return enabled;
}
61
c10_compute_alignment(size_t nbytes)62 inline size_t c10_compute_alignment(size_t nbytes) {
63 static const auto pagesize = sysconf(_SC_PAGESIZE);
64 // for kernels that don't provide page size, default it to 4K
65 const size_t thp_alignment = (pagesize < 0 ? gPagesize : pagesize);
66 return (is_thp_alloc_enabled() ? thp_alignment : gAlignment);
67 }
68
is_thp_alloc(size_t nbytes)69 inline bool is_thp_alloc(size_t nbytes) {
70 // enable thp (transparent huge pages) for larger buffers
71 return (is_thp_alloc_enabled() && (nbytes >= gAlloc_threshold_thp));
72 }
73 #elif !defined(__ANDROID__) && !defined(_MSC_VER)
c10_compute_alignment(C10_UNUSED size_t nbytes)74 constexpr size_t c10_compute_alignment(C10_UNUSED size_t nbytes) {
75 return gAlignment;
76 }
77
is_thp_alloc(C10_UNUSED size_t nbytes)78 constexpr bool is_thp_alloc(C10_UNUSED size_t nbytes) {
79 return false;
80 }
81 #endif
82 } // namespace
83
alloc_cpu(size_t nbytes)84 void* alloc_cpu(size_t nbytes) {
85 if (nbytes == 0) {
86 return nullptr;
87 }
88 // We might have clowny upstream code that tries to alloc a negative number
89 // of bytes. Let's catch it early.
90 CAFFE_ENFORCE(
91 ((ptrdiff_t)nbytes) >= 0,
92 "alloc_cpu() seems to have been called with negative number: ",
93 nbytes);
94
95 // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
96 void* data;
97 #ifdef __ANDROID__
98 data = memalign(gAlignment, nbytes);
99 CAFFE_ENFORCE(
100 data,
101 "DefaultCPUAllocator: not enough memory: you tried to allocate ",
102 nbytes,
103 " bytes.");
104 #elif defined(_MSC_VER)
105 #ifdef USE_MIMALLOC
106 data = mi_malloc_aligned(nbytes, gAlignment);
107 #else
108 data = _aligned_malloc(nbytes, gAlignment);
109 #endif
110 CAFFE_ENFORCE(
111 data,
112 "DefaultCPUAllocator: not enough memory: you tried to allocate ",
113 nbytes,
114 " bytes.");
115 #else
116 int err = posix_memalign(&data, c10_compute_alignment(nbytes), nbytes);
117 CAFFE_ENFORCE(
118 err == 0,
119 "DefaultCPUAllocator: can't allocate memory: you tried to allocate ",
120 nbytes,
121 " bytes. Error code ",
122 err,
123 " (",
124 strerror(err),
125 ")");
126 if (is_thp_alloc(nbytes)) {
127 #ifdef __linux__
128 // MADV_HUGEPAGE advise is available only for linux.
129 // general posix compliant systems can check POSIX_MADV_SEQUENTIAL advise.
130 int ret = madvise(data, nbytes, MADV_HUGEPAGE);
131 if (ret != 0) {
132 TORCH_WARN_ONCE("thp madvise for HUGEPAGE failed with ", strerror(errno));
133 }
134 #endif
135 }
136 #endif
137
138 // move data to a thread's NUMA node
139 NUMAMove(data, nbytes, GetCurrentNUMANode());
140 CHECK(
141 !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
142 !FLAGS_caffe2_cpu_allocator_do_junk_fill)
143 << "Cannot request both zero-fill and junk-fill at the same time";
144 if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
145 memset(data, 0, nbytes);
146 } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
147 memset_junk(data, nbytes);
148 }
149
150 return data;
151 }
152
// Releases a buffer using the deallocator that matches the allocator
// alloc_cpu() uses on the same platform: mi_free for MSVC builds with
// USE_MIMALLOC, _aligned_free for other MSVC builds, and free everywhere else.
//
// data: pointer to release.
void free_cpu(void* data) {
#if defined(_MSC_VER) && defined(USE_MIMALLOC)
  mi_free(data);
#elif defined(_MSC_VER)
  _aligned_free(data);
#else
  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
  free(data);
#endif
}
165
166 } // namespace c10
167