1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <cstdio>
7 #include <cstdlib>
8 #include <cstring>
9 #include <mutex>
10
11 #ifdef __linux__
12 #include <sched.h>
13 #endif
14 #if defined(__ANDROID__) || defined(_WIN32) || defined(__CYGWIN__)
15 #include <malloc.h>
16 #endif
17 #if defined(__SSE__) || defined(__x86_64__)
18 #include <xmmintrin.h>
19 #endif
20
21 #include <cpuinfo.h>
22
23 #include <xnnpack.h>
24 #include <xnnpack/allocator.h>
25
26 #include "bench/utils.h"
27
// Buffer that WipeCache() reads through; null until InitWipeBuffer() assigns it.
static void* wipe_buffer{nullptr};
// Number of bytes in wipe_buffer; set by InitWipeBuffer().
static size_t wipe_buffer_size{0};

// Passed to std::call_once so that InitWipeBuffer() runs a single time.
static std::once_flag wipe_buffer_guard;
32
InitWipeBuffer()33 static void InitWipeBuffer() {
34 // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
35 wipe_buffer_size = 128 * 1024 * 1024;
36 if (cpuinfo_initialize()) {
37 wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
38 }
39 #if defined(_WIN32)
40 wipe_buffer = _aligned_malloc(wipe_buffer_size, 128);
41 #elif defined(__ANDROID__) || defined(__CYGWIN__)
42 // memalign is obsolete, but it is the only option on Android until API level 17.
43 wipe_buffer = memalign(128, wipe_buffer_size);
44 #else
45 (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
46 #endif
47 if (wipe_buffer != nullptr) {
48 memset(wipe_buffer, 0xA5, wipe_buffer_size);
49 }
50 }
51
52 namespace benchmark {
53 namespace utils {
54
PrefetchToL1(const void * ptr,size_t size)55 uint32_t PrefetchToL1(const void* ptr, size_t size) {
56 uint32_t step = 16;
57 if (cpuinfo_initialize()) {
58 step = cpuinfo_get_l1d_cache(0)->line_size;
59 }
60 const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
61 // Compute and return sum of data to prevent compiler from removing data reads.
62 uint32_t sum = 0;
63 while (size >= step) {
64 sum += uint32_t(*u8_ptr);
65 u8_ptr += step;
66 size -= step;
67 }
68 return sum;
69 }
70
WipeCache()71 uint32_t WipeCache() {
72 std::call_once(wipe_buffer_guard, InitWipeBuffer);
73 return PrefetchToL1(wipe_buffer, wipe_buffer_size);
74 }
75
// Sets the flush-denormals control bits of the floating-point control
// register on the architectures handled below; on any other target the
// function does nothing.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  // MXCSR: bit 15 is flush-to-zero, bit 6 is denormals-are-zero.
  const unsigned int kFlushToZero = 0x8000;
  const unsigned int kDenormalsAreZero = 0x0040;
  const unsigned int mxcsr = _mm_getcsr();
  _mm_setcsr(mxcsr | kFlushToZero | kDenormalsAreZero);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  // 32-bit ARM with hardware floating point: set bit 24 of FPSCR.
  uint32_t fpscr;
  #if defined(__thumb__) && !defined(__thumb2__)
    // Thumb-1 variant: the mask is passed in a low register and combined with
    // the flag-setting ORRS, hence the "cc" clobber.
    __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORRS %[fpscr], %[bitmask]\n"
      "VMSR fpscr, %[fpscr]\n"
      : [fpscr] "=l" (fpscr)
      : [bitmask] "l" (0x1000000)
      : "cc");
  #else
    __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORR %[fpscr], #0x1000000\n"
      "VMSR fpscr, %[fpscr]\n"
      : [fpscr] "=r" (fpscr));
  #endif
#elif defined(__aarch64__)
  // AArch64: set bits 24 and 19 of FPCR.
  uint64_t fpcr;
  __asm__ __volatile__(
    "MRS %[fpcr], fpcr\n"
    "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
    "ORR %w[fpcr], %w[fpcr], 0x80000\n"
    "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
106
// Returns the current clock rate in Hz of the CPU that sched_getcpu() reports
// for the caller.
//
// On Linux the value is read from that CPU's cpufreq/scaling_cur_freq sysfs
// file and multiplied by 1000. Returns 0 when the value is unavailable: on
// non-Linux targets, when sched_getcpu() fails, when the file cannot be
// opened, or when it does not contain a positive integer.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu >= 0) {
    char cpuinfo_name[512];
    snprintf(cpuinfo_name, sizeof(cpuinfo_name),
             "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

    FILE* f = fopen(cpuinfo_name, "r");
    if (f != nullptr) {
      long long freq = 0;
      // fscanf returns the number of converted fields, or EOF (a non-zero
      // value) on input failure, so success must be tested as "== 1".
      const int fields = fscanf(f, "%lld", &freq);
      fclose(f);
      if (fields == 1 && freq > 0) {
        return uint64_t(freq) * 1000;
      }
    }
  }
#endif  // __linux__
  return 0;
}
127
GetMaxCacheSize()128 size_t GetMaxCacheSize() {
129 if (!cpuinfo_initialize()) {
130 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
131 // DynamIQ max: 4 MB
132 return 4 * 1024 * 1024;
133 #else
134 // Intel eDRAM max: 128 MB
135 return 128 * 1024 * 1024;
136 #endif
137 }
138 return cpuinfo_get_max_cache_size();
139 }
140
MultiThreadingParameters(benchmark::internal::Benchmark * benchmark)141 void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
142 benchmark->ArgName("T");
143
144 // Disabled thread pool (execution on the caller thread only).
145 benchmark->Arg(1);
146
147 if (cpuinfo_initialize()) {
148 // All cores except the little ones.
149 uint32_t max_cores = cpuinfo_get_cores_count();
150 if (cpuinfo_get_clusters_count() > 1) {
151 max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
152 }
153 for (uint32_t t = 2; t <= max_cores; t++) {
154 benchmark->Arg(t);
155 }
156
157 // All cores (if more than one cluster).
158 if (cpuinfo_get_cores_count() > max_cores) {
159 benchmark->Arg(cpuinfo_get_cores_count());
160 }
161
162 // All cores + hyperthreads (only if hyperthreading supported).
163 if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) {
164 benchmark->Arg(cpuinfo_get_processors_count());
165 }
166 }
167 }
168
169
CheckVFP(benchmark::State & state)170 bool CheckVFP(benchmark::State& state) {
171 if (!cpuinfo_initialize() || !(cpuinfo_has_arm_vfpv2() || cpuinfo_has_arm_vfpv3())) {
172 state.SkipWithError("no VFP extension");
173 return false;
174 }
175 return true;
176 }
177
CheckARMV6(benchmark::State & state)178 bool CheckARMV6(benchmark::State& state) {
179 if (!cpuinfo_initialize() || !cpuinfo_has_arm_v6()) {
180 state.SkipWithError("no ARMv6 extension");
181 return false;
182 }
183 return true;
184 }
185
CheckNEON(benchmark::State & state)186 bool CheckNEON(benchmark::State& state) {
187 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
188 state.SkipWithError("no NEON extension");
189 return false;
190 }
191 return true;
192 }
193
CheckNEONFP16(benchmark::State & state)194 bool CheckNEONFP16(benchmark::State& state) {
195 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16()) {
196 state.SkipWithError("no NEON-FP16 extension");
197 return false;
198 }
199 return true;
200 }
201
CheckNEONFMA(benchmark::State & state)202 bool CheckNEONFMA(benchmark::State& state) {
203 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
204 state.SkipWithError("no NEON-FMA extension");
205 return false;
206 }
207 return true;
208 }
209
CheckNEONV8(benchmark::State & state)210 bool CheckNEONV8(benchmark::State& state) {
211 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_v8()) {
212 state.SkipWithError("no NEON-V8 extension");
213 return false;
214 }
215 return true;
216 }
217
CheckNEONFP16ARITH(benchmark::State & state)218 bool CheckNEONFP16ARITH(benchmark::State& state) {
219 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16_arith()) {
220 state.SkipWithError("no NEON-FP16-ARITH extension");
221 return false;
222 }
223 return true;
224 }
225
CheckNEONBF16(benchmark::State & state)226 bool CheckNEONBF16(benchmark::State& state) {
227 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_bf16()) {
228 state.SkipWithError("no NEON-BF16 extension");
229 return false;
230 }
231 return true;
232 }
233
CheckNEONDOT(benchmark::State & state)234 bool CheckNEONDOT(benchmark::State& state) {
235 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_dot()) {
236 state.SkipWithError("no NEON-DOT extension");
237 return false;
238 }
239 return true;
240 }
241
CheckSSSE3(benchmark::State & state)242 bool CheckSSSE3(benchmark::State& state) {
243 if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
244 state.SkipWithError("no SSSE3 extension");
245 return false;
246 }
247 return true;
248 }
249
CheckSSE41(benchmark::State & state)250 bool CheckSSE41(benchmark::State& state) {
251 if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
252 state.SkipWithError("no SSE4.1 extension");
253 return false;
254 }
255 return true;
256 }
257
CheckAVX(benchmark::State & state)258 bool CheckAVX(benchmark::State& state) {
259 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
260 state.SkipWithError("no AVX extension");
261 return false;
262 }
263 return true;
264 }
265
CheckF16C(benchmark::State & state)266 bool CheckF16C(benchmark::State& state) {
267 if (!cpuinfo_initialize() || !cpuinfo_has_x86_f16c()) {
268 state.SkipWithError("no F16C extension");
269 return false;
270 }
271 return true;
272 }
273
CheckXOP(benchmark::State & state)274 bool CheckXOP(benchmark::State& state) {
275 if (!cpuinfo_initialize() || !cpuinfo_has_x86_xop()) {
276 state.SkipWithError("no XOP extension");
277 return false;
278 }
279 return true;
280 }
281
CheckFMA3(benchmark::State & state)282 bool CheckFMA3(benchmark::State& state) {
283 if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
284 state.SkipWithError("no FMA3 extension");
285 return false;
286 }
287 return true;
288 }
289
CheckAVX2(benchmark::State & state)290 bool CheckAVX2(benchmark::State& state) {
291 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
292 state.SkipWithError("no AVX2 extension");
293 return false;
294 }
295 return true;
296 }
297
CheckAVX512F(benchmark::State & state)298 bool CheckAVX512F(benchmark::State& state) {
299 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
300 state.SkipWithError("no AVX512F extension");
301 return false;
302 }
303 return true;
304 }
305
CheckAVX512SKX(benchmark::State & state)306 bool CheckAVX512SKX(benchmark::State& state) {
307 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f() ||
308 !cpuinfo_has_x86_avx512cd() || !cpuinfo_has_x86_avx512bw() ||
309 !cpuinfo_has_x86_avx512dq() || !cpuinfo_has_x86_avx512vl())
310 {
311 state.SkipWithError("no AVX512 SKX extensions");
312 return false;
313 }
314 return true;
315 }
316
CodeMemoryHelper()317 CodeMemoryHelper::CodeMemoryHelper() {
318 status = xnn_allocate_code_memory(&buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
319 }
320
~CodeMemoryHelper()321 CodeMemoryHelper::~CodeMemoryHelper() {
322 if (status == xnn_status_success) {
323 xnn_release_code_memory(&buffer);
324 }
325 }
326
327 } // namespace utils
328 } // namespace benchmark
329