xref: /aosp_15_r20/external/XNNPACK/bench/utils.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <cstdio>
7 #include <cstdlib>
8 #include <cstring>
9 #include <mutex>
10 
11 #ifdef __linux__
12   #include <sched.h>
13 #endif
14 #if defined(__ANDROID__) || defined(_WIN32) || defined(__CYGWIN__)
15   #include <malloc.h>
16 #endif
17 #if defined(__SSE__) || defined(__x86_64__)
18   #include <xmmintrin.h>
19 #endif
20 
21 #include <cpuinfo.h>
22 
23 #include <xnnpack.h>
24 #include <xnnpack/allocator.h>
25 
26 #include "bench/utils.h"
27 
// File-local state backing WipeCache(): the scratch buffer that gets read to
// evict benchmark data from the caches, its size in bytes, and the flag that
// makes sure the buffer is set up only once.
namespace {

void* wipe_buffer = nullptr;
size_t wipe_buffer_size = 0;
std::once_flag wipe_buffer_guard;

}  // namespace
32 
InitWipeBuffer()33 static void InitWipeBuffer() {
34   // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
35   wipe_buffer_size = 128 * 1024 * 1024;
36   if (cpuinfo_initialize()) {
37     wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
38   }
39 #if defined(_WIN32)
40   wipe_buffer = _aligned_malloc(wipe_buffer_size, 128);
41 #elif defined(__ANDROID__) || defined(__CYGWIN__)
42   // memalign is obsolete, but it is the only option on Android until API level 17.
43   wipe_buffer = memalign(128, wipe_buffer_size);
44 #else
45   (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
46 #endif
47   if (wipe_buffer != nullptr) {
48     memset(wipe_buffer, 0xA5, wipe_buffer_size);
49   }
50 }
51 
52 namespace benchmark {
53 namespace utils {
54 
PrefetchToL1(const void * ptr,size_t size)55 uint32_t PrefetchToL1(const void* ptr, size_t size) {
56   uint32_t step = 16;
57   if (cpuinfo_initialize()) {
58     step = cpuinfo_get_l1d_cache(0)->line_size;
59   }
60   const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
61   // Compute and return sum of data to prevent compiler from removing data reads.
62   uint32_t sum = 0;
63   while (size >= step) {
64     sum += uint32_t(*u8_ptr);
65     u8_ptr += step;
66     size -= step;
67   }
68   return sum;
69 }
70 
WipeCache()71 uint32_t WipeCache() {
72   std::call_once(wipe_buffer_guard, InitWipeBuffer);
73   return PrefetchToL1(wipe_buffer, wipe_buffer_size);
74 }
75 
// Turns on flush-to-zero handling of denormal numbers by setting bits in the
// floating-point control register of the target architecture:
//   - x86 (SSE):  MXCSR bits 0x8000 (flush-to-zero) and 0x0040 (denormals-are-zero)
//   - 32-bit ARM: FPSCR bit 0x1000000 (flush-to-zero)
//   - AArch64:    FPCR bits 0x1000000 (flush-to-zero) and 0x80000 (half-precision flush-to-zero)
// On any other target the function does nothing.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  const unsigned int current_mxcsr = _mm_getcsr();
  _mm_setcsr(current_mxcsr | 0x8000u | 0x0040u);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  uint32_t fpscr;
  #if defined(__thumb__) && !defined(__thumb2__)
    // Thumb-1 variant: the mask is passed in a low register and combined with
    // the flag-setting ORRS form, hence the "cc" clobber.
    __asm__ __volatile__(
        "VMRS %[fpscr], fpscr\n"
        "ORRS %[fpscr], %[bitmask]\n"
        "VMSR fpscr, %[fpscr]\n"
        : [fpscr] "=l" (fpscr)
        : [bitmask] "l" (0x1000000)
        : "cc");
  #else
    // ARM / Thumb-2 variant: the mask is encoded as an immediate operand.
    __asm__ __volatile__(
        "VMRS %[fpscr], fpscr\n"
        "ORR %[fpscr], #0x1000000\n"
        "VMSR fpscr, %[fpscr]\n"
        : [fpscr] "=r" (fpscr));
  #endif
#elif defined(__aarch64__)
  uint64_t fpcr;
  // Read FPCR, set both mask bits through the 32-bit view of the register,
  // then write the result back.
  __asm__ __volatile__(
      "MRS %[fpcr], fpcr\n"
      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
      "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
106 
// Returns the current clock rate, in Hz, of the CPU the calling thread is
// running on.
//
// On Linux the value is read from the CPU's cpufreq `scaling_cur_freq` sysfs
// entry and multiplied by 1000. Returns 0 when the current CPU cannot be
// determined, the entry cannot be opened, it does not hold a positive
// integer, or the platform is not Linux.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu >= 0) {
    char cpuinfo_name[512];
    snprintf(cpuinfo_name, sizeof(cpuinfo_name),
      "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

    FILE* f = fopen(cpuinfo_name, "r");
    if (f != nullptr) {
      long long freq = 0;
      // fscanf returns the number of converted fields, or EOF (a non-zero
      // value) on input failure, so success must be tested as "== 1".
      const int fields_read = fscanf(f, "%lld", &freq);
      fclose(f);
      // Reject non-positive values so they cannot wrap into a huge frequency.
      if (fields_read == 1 && freq > 0) {
        return uint64_t(freq) * 1000;
      }
    }
  }
#endif  // __linux__
  return 0;
}
127 
GetMaxCacheSize()128 size_t GetMaxCacheSize() {
129   if (!cpuinfo_initialize()) {
130     #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
131       // DynamIQ max: 4 MB
132       return 4 * 1024 * 1024;
133     #else
134       // Intel eDRAM max: 128 MB
135       return 128 * 1024 * 1024;
136     #endif
137   }
138   return cpuinfo_get_max_cache_size();
139 }
140 
MultiThreadingParameters(benchmark::internal::Benchmark * benchmark)141 void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
142   benchmark->ArgName("T");
143 
144   // Disabled thread pool (execution on the caller thread only).
145   benchmark->Arg(1);
146 
147   if (cpuinfo_initialize()) {
148     // All cores except the little ones.
149     uint32_t max_cores = cpuinfo_get_cores_count();
150     if (cpuinfo_get_clusters_count() > 1) {
151       max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
152     }
153     for (uint32_t t = 2; t <= max_cores; t++) {
154       benchmark->Arg(t);
155     }
156 
157     // All cores (if more than one cluster).
158     if (cpuinfo_get_cores_count() > max_cores) {
159       benchmark->Arg(cpuinfo_get_cores_count());
160     }
161 
162     // All cores + hyperthreads (only if hyperthreading supported).
163     if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) {
164       benchmark->Arg(cpuinfo_get_processors_count());
165     }
166   }
167 }
168 
169 
CheckVFP(benchmark::State & state)170 bool CheckVFP(benchmark::State& state) {
171   if (!cpuinfo_initialize() || !(cpuinfo_has_arm_vfpv2() || cpuinfo_has_arm_vfpv3())) {
172     state.SkipWithError("no VFP extension");
173     return false;
174   }
175   return true;
176 }
177 
CheckARMV6(benchmark::State & state)178 bool CheckARMV6(benchmark::State& state) {
179   if (!cpuinfo_initialize() || !cpuinfo_has_arm_v6()) {
180     state.SkipWithError("no ARMv6 extension");
181     return false;
182   }
183   return true;
184 }
185 
CheckNEON(benchmark::State & state)186 bool CheckNEON(benchmark::State& state) {
187   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
188     state.SkipWithError("no NEON extension");
189     return false;
190   }
191   return true;
192 }
193 
CheckNEONFP16(benchmark::State & state)194 bool CheckNEONFP16(benchmark::State& state) {
195   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16()) {
196     state.SkipWithError("no NEON-FP16 extension");
197     return false;
198   }
199   return true;
200 }
201 
CheckNEONFMA(benchmark::State & state)202 bool CheckNEONFMA(benchmark::State& state) {
203   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
204     state.SkipWithError("no NEON-FMA extension");
205     return false;
206   }
207   return true;
208 }
209 
CheckNEONV8(benchmark::State & state)210 bool CheckNEONV8(benchmark::State& state) {
211   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_v8()) {
212     state.SkipWithError("no NEON-V8 extension");
213     return false;
214   }
215   return true;
216 }
217 
CheckNEONFP16ARITH(benchmark::State & state)218 bool CheckNEONFP16ARITH(benchmark::State& state) {
219   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16_arith()) {
220     state.SkipWithError("no NEON-FP16-ARITH extension");
221     return false;
222   }
223   return true;
224 }
225 
CheckNEONBF16(benchmark::State & state)226 bool CheckNEONBF16(benchmark::State& state) {
227   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_bf16()) {
228     state.SkipWithError("no NEON-BF16 extension");
229     return false;
230   }
231   return true;
232 }
233 
CheckNEONDOT(benchmark::State & state)234 bool CheckNEONDOT(benchmark::State& state) {
235   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_dot()) {
236     state.SkipWithError("no NEON-DOT extension");
237     return false;
238   }
239   return true;
240 }
241 
CheckSSSE3(benchmark::State & state)242 bool CheckSSSE3(benchmark::State& state) {
243   if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
244     state.SkipWithError("no SSSE3 extension");
245     return false;
246   }
247   return true;
248 }
249 
CheckSSE41(benchmark::State & state)250 bool CheckSSE41(benchmark::State& state) {
251   if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
252     state.SkipWithError("no SSE4.1 extension");
253     return false;
254   }
255   return true;
256 }
257 
CheckAVX(benchmark::State & state)258 bool CheckAVX(benchmark::State& state) {
259   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
260     state.SkipWithError("no AVX extension");
261     return false;
262   }
263   return true;
264 }
265 
CheckF16C(benchmark::State & state)266 bool CheckF16C(benchmark::State& state) {
267   if (!cpuinfo_initialize() || !cpuinfo_has_x86_f16c()) {
268     state.SkipWithError("no F16C extension");
269     return false;
270   }
271   return true;
272 }
273 
CheckXOP(benchmark::State & state)274 bool CheckXOP(benchmark::State& state) {
275   if (!cpuinfo_initialize() || !cpuinfo_has_x86_xop()) {
276     state.SkipWithError("no XOP extension");
277     return false;
278   }
279   return true;
280 }
281 
CheckFMA3(benchmark::State & state)282 bool CheckFMA3(benchmark::State& state) {
283   if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
284     state.SkipWithError("no FMA3 extension");
285     return false;
286   }
287   return true;
288 }
289 
CheckAVX2(benchmark::State & state)290 bool CheckAVX2(benchmark::State& state) {
291   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
292     state.SkipWithError("no AVX2 extension");
293     return false;
294   }
295   return true;
296 }
297 
CheckAVX512F(benchmark::State & state)298 bool CheckAVX512F(benchmark::State& state) {
299   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
300     state.SkipWithError("no AVX512F extension");
301     return false;
302   }
303   return true;
304 }
305 
CheckAVX512SKX(benchmark::State & state)306 bool CheckAVX512SKX(benchmark::State& state) {
307   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f() ||
308       !cpuinfo_has_x86_avx512cd() || !cpuinfo_has_x86_avx512bw() ||
309       !cpuinfo_has_x86_avx512dq() || !cpuinfo_has_x86_avx512vl())
310   {
311     state.SkipWithError("no AVX512 SKX extensions");
312     return false;
313   }
314   return true;
315 }
316 
CodeMemoryHelper()317 CodeMemoryHelper::CodeMemoryHelper() {
318   status = xnn_allocate_code_memory(&buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
319 }
320 
~CodeMemoryHelper()321 CodeMemoryHelper::~CodeMemoryHelper() {
322   if (status == xnn_status_success) {
323     xnn_release_code_memory(&buffer);
324   }
325 }
326 
327 }  // namespace utils
328 }  // namespace benchmark
329