1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cfloat>
8 #include <cmath>
9 #include <functional>
10 #include <memory>
11 #include <numeric>
12 #include <random>
13 #include <vector>
14
15 #include <cpuinfo.h>
16 #include <pthreadpool.h>
17
18 #include <benchmark/benchmark.h>
19 #include <fp16/fp16.h>
20
21 #include "bench/utils.h"
22 #include <xnnpack/aligned-allocator.h>
23 #include <xnnpack/common.h>
24 #include <xnnpack/math.h>
25 #include <xnnpack/math-stubs.h>
26
27
// Arguments shared by all ComputeError tasks that process one block of data.
// All three pointers refer to arrays owned by the caller; ComputeError indexes
// them with the same element index.
struct ComputeErrorContext {
  // Inputs that were passed to the function under test.
  const float* input = nullptr;
  // Outputs produced by the function under test for `input`.
  const float* output = nullptr;
  // Destination for the per-element error, measured in ULPs.
  float* error = nullptr;
};
33
ComputeError(struct ComputeErrorContext * context,size_t start,size_t range)34 static void ComputeError(
35 struct ComputeErrorContext* context,
36 size_t start,
37 size_t range)
38 {
39 const float* input = context->input;
40 const float* output = context->output;
41 float* error = context->error;
42 for (size_t i = start; i < start + range; i++) {
43 const double input_val = input[i];
44 double output_ref = 0.0;
45 if (input_val < 0.0) {
46 const double exp_val = std::exp(input_val);
47 output_ref = exp_val / (1.0 + exp_val);
48 } else {
49 output_ref = 1.0 / (1.0 + std::exp(-input_val));
50 }
51 const double abs_error = std::abs(output_ref - double(output[i]));
52 const float output_abs = std::abs(output_ref);
53 const float output_ulp = uint32_as_float(float_as_uint32(output_abs) + 1) - output_abs;
54 error[i] = float(abs_error / output_ulp);
55 }
56 }
57
SigmoidError(benchmark::State & state,xnn_f32_unary_math_function sigmoid,benchmark::utils::IsaCheckFunction isa_check=nullptr)58 static void SigmoidError(benchmark::State& state,
59 xnn_f32_unary_math_function sigmoid,
60 benchmark::utils::IsaCheckFunction isa_check = nullptr)
61 {
62 if (!cpuinfo_initialize()) {
63 state.SkipWithError("failed cpuinfo init");
64 return;
65 }
66 if (isa_check && !isa_check(state)) {
67 return;
68 }
69
70 // The smallest x for which sigmoidf(x) is normalized (-0x1.5D589Ep+6f).
71 const uint32_t min_input = 0xC2AEAC4F;
72 // The largest x for which sigmoidf(x) is not 1.0f (0x1.154244p+4f).
73 const uint32_t max_input = 0x418AA122;
74 // Number of elements in one block of inputs/outputs.
75 // Combining multiple elements in a block reduce function call overhead.
76 const size_t block_size = 16384;
77 // Number of elements in one parallelization tile. Worker threads process this many elements in each task.
78 const size_t tile_size = 64;
79
80 uint32_t num_threads = cpuinfo_get_cores_count();
81 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
82 // Use all cores except for the least performant cluster
83 if (cpuinfo_get_clusters_count() > 1) {
84 num_threads -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
85 }
86 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
87
88 std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool(
89 pthreadpool_create(num_threads), pthreadpool_destroy);
90
91 std::vector<float, AlignedAllocator<float, 64>> x(block_size);
92 std::vector<float, AlignedAllocator<float, 64>> y(block_size);
93 std::vector<float> ulp_error(block_size);
94 float max_ulp_error = 0.0f;
95
96 ComputeErrorContext context;
97 context.input = x.data();
98 context.output = y.data();
99 context.error = ulp_error.data();
100 for (auto _ : state) {
101 for (uint32_t n = min_input; int32_t(n) < 0; n -= block_size) {
102 for (uint32_t i = 0; i < block_size; i++) {
103 x[i] = uint32_as_float(std::max<uint32_t>(n - i, 0x80000000));
104 }
105 std::fill(y.begin(), y.end(), std::nanf(""));
106
107 sigmoid(block_size * sizeof(float), x.data(), y.data());
108
109 pthreadpool_parallelize_1d_tile_1d(
110 threadpool.get(),
111 reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(ComputeError),
112 static_cast<void*>(&context),
113 block_size, tile_size, 0 /* flags */);
114
115 max_ulp_error = std::accumulate(ulp_error.cbegin(), ulp_error.cend(), max_ulp_error,
116 static_cast<const float& (*)(const float&, const float&)>(std::max<float>));
117 }
118 for (uint32_t n = 0; n < max_input; n += block_size) {
119 for (uint32_t i = 0; i < block_size; i++) {
120 x[i] = uint32_as_float(std::min<uint32_t>(n + i, max_input));
121 }
122 std::fill(y.begin(), y.end(), std::nanf(""));
123
124 sigmoid(block_size * sizeof(float), x.data(), y.data());
125
126 pthreadpool_parallelize_1d_tile_1d(
127 threadpool.get(),
128 reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(ComputeError),
129 static_cast<void*>(&context),
130 block_size, tile_size, 0 /* flags */);
131
132 max_ulp_error = std::accumulate(ulp_error.cbegin(), ulp_error.cend(), max_ulp_error,
133 static_cast<const float& (*)(const float&, const float&)>(std::max<float>));
134 }
135 }
136
137 state.counters["ULPERROR"] = benchmark::Counter(max_ulp_error);
138 }
139
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Registers SigmoidError for xnn_math_f32_sigmoid__<variant> under the name
  // <variant>, guarded by isa_check, reported in milliseconds, and run for
  // exactly one iteration.
  #define BENCHMARK_SIGMOID_ERROR_ISA(variant, isa_check)                    \
    BENCHMARK_CAPTURE(SigmoidError, variant,                                 \
                      xnn_math_f32_sigmoid__##variant,                       \
                      isa_check)                                             \
      ->Unit(benchmark::kMillisecond)                                        \
      ->Iterations(1)

  // NEON with FMA.
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_lut64_p2_nr2recps, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_lut64_p2_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_lut64_p2_nr2fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_lut64_p2_nr2recps, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_lut64_p2_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_lut64_p2_nr2fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_lut2048_p1_nr2recps, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_lut2048_p1_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_lut2048_p1_nr2fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_lut2048_p1_nr2recps, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_lut2048_p1_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_lut2048_p1_nr2fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_p5_nr2recps, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_p5_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr1_p5_nr2fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_p5_nr2recps, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_p5_nr1recps1fma, benchmark::utils::CheckNEONFMA);
  BENCHMARK_SIGMOID_ERROR_ISA(neonfma_rr2_p5_nr2fma, benchmark::utils::CheckNEONFMA);

  // NEON without FMA.
  BENCHMARK_SIGMOID_ERROR_ISA(neon_rr2_lut64_p2_nr2recps, benchmark::utils::CheckNEON);
  BENCHMARK_SIGMOID_ERROR_ISA(neon_rr2_lut2048_p1_nr2recps, benchmark::utils::CheckNEON);
  BENCHMARK_SIGMOID_ERROR_ISA(neon_rr2_p5_nr2recps, benchmark::utils::CheckNEON);

  #undef BENCHMARK_SIGMOID_ERROR_ISA
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
248
#if XNN_ARCH_ARM64
  // Registers SigmoidError for xnn_math_f32_sigmoid__<variant> under the name
  // <variant>, without an ISA check, reported in milliseconds, and run for
  // exactly one iteration.
  #define BENCHMARK_SIGMOID_ERROR(variant)                                   \
    BENCHMARK_CAPTURE(SigmoidError, variant,                                 \
                      xnn_math_f32_sigmoid__##variant)                       \
      ->Unit(benchmark::kMillisecond)                                        \
      ->Iterations(1)

  BENCHMARK_SIGMOID_ERROR(neonfma_rr1_lut2048_p1_div);
  BENCHMARK_SIGMOID_ERROR(neonfma_rr2_lut2048_p1_div);
  BENCHMARK_SIGMOID_ERROR(neonfma_rr1_lut64_p2_div);
  BENCHMARK_SIGMOID_ERROR(neonfma_rr2_lut64_p2_div);
  BENCHMARK_SIGMOID_ERROR(neonfma_rr1_p5_div);
  BENCHMARK_SIGMOID_ERROR(neonfma_rr2_p5_div);

  #undef BENCHMARK_SIGMOID_ERROR
#endif  // XNN_ARCH_ARM64
275
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers SigmoidError for xnn_math_f32_sigmoid__<variant> under the name
  // <variant>, guarded by isa_check, reported in milliseconds, and run for
  // exactly one iteration.
  #define BENCHMARK_SIGMOID_ERROR_ISA(variant, isa_check)                    \
    BENCHMARK_CAPTURE(SigmoidError, variant,                                 \
                      xnn_math_f32_sigmoid__##variant,                       \
                      isa_check)                                             \
      ->Unit(benchmark::kMillisecond)                                        \
      ->Iterations(1)

  // Same as BENCHMARK_SIGMOID_ERROR_ISA, but without an ISA check.
  #define BENCHMARK_SIGMOID_ERROR(variant)                                   \
    BENCHMARK_CAPTURE(SigmoidError, variant,                                 \
                      xnn_math_f32_sigmoid__##variant)                       \
      ->Unit(benchmark::kMillisecond)                                        \
      ->Iterations(1)

  // AVX512F.
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut16_p3_perm_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut16_p3_perm_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut16_p3_perm_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut16_p3_perm_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut16_p3_perm_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut16_p3_perm_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut32_p2_perm2_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut32_p2_perm2_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut32_p2_perm2_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut32_p2_perm2_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut32_p2_perm2_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut32_p2_perm2_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut64_p2_gather_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut64_p2_gather_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_lut64_p2_gather_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut64_p2_gather_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut64_p2_gather_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_lut64_p2_gather_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_p5_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_p5_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr1_p5_scalef_div, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_p5_scalef_nr1fma, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_p5_scalef_nr1fma1adj, benchmark::utils::CheckAVX512F);
  BENCHMARK_SIGMOID_ERROR_ISA(avx512f_rr2_p5_scalef_div, benchmark::utils::CheckAVX512F);

  // AVX2.
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_lut64_p2_gather_nr1fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_lut64_p2_gather_nr2fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_lut64_p2_gather_nr2fma1adj, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_lut64_p2_gather_div, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_lut64_p2_gather_nr1fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_lut64_p2_gather_nr2fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_lut64_p2_gather_nr2fma1adj, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_lut64_p2_gather_div, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_p5_nr1fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_p5_nr2fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr1_p5_div, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_p5_nr1fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_p5_nr2fma, benchmark::utils::CheckAVX2);
  BENCHMARK_SIGMOID_ERROR_ISA(avx2_rr2_p5_div, benchmark::utils::CheckAVX2);

  // AVX.
  BENCHMARK_SIGMOID_ERROR_ISA(avx_rr2_lut64_p2_div, benchmark::utils::CheckAVX);
  BENCHMARK_SIGMOID_ERROR_ISA(avx_rr2_p5_nr1, benchmark::utils::CheckAVX);
  BENCHMARK_SIGMOID_ERROR_ISA(avx_rr2_p5_nr2, benchmark::utils::CheckAVX);
  BENCHMARK_SIGMOID_ERROR_ISA(avx_rr2_p5_div, benchmark::utils::CheckAVX);

  // SSE2 (registered without an ISA check).
  BENCHMARK_SIGMOID_ERROR(sse2_rr2_lut64_p2_nr1);
  BENCHMARK_SIGMOID_ERROR(sse2_rr2_lut64_p2_nr2);
  BENCHMARK_SIGMOID_ERROR(sse2_rr2_lut64_p2_div);
  BENCHMARK_SIGMOID_ERROR(sse2_rr2_p5_nr1);
  BENCHMARK_SIGMOID_ERROR(sse2_rr2_p5_nr2);
  BENCHMARK_SIGMOID_ERROR(sse2_rr2_p5_div);

  #undef BENCHMARK_SIGMOID_ERROR
  #undef BENCHMARK_SIGMOID_ERROR_ISA
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
515
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // Registers SigmoidError for xnn_math_f32_sigmoid__<variant> under the name
  // <variant>, without an ISA check, reported in milliseconds, and run for
  // exactly one iteration.
  #define BENCHMARK_SIGMOID_ERROR(variant)                                   \
    BENCHMARK_CAPTURE(SigmoidError, variant,                                 \
                      xnn_math_f32_sigmoid__##variant)                       \
      ->Unit(benchmark::kMillisecond)                                        \
      ->Iterations(1)

  BENCHMARK_SIGMOID_ERROR(wasmsimd_rr2_lut64_p2_div);
  BENCHMARK_SIGMOID_ERROR(wasmsimd_rr2_p5_div);

  #undef BENCHMARK_SIGMOID_ERROR
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
526
527 BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_lut64_p2_div,
528 xnn_math_f32_sigmoid__scalar_rr2_lut64_p2_div)
529 ->Unit(benchmark::kMillisecond)
530 ->Iterations(1);
531 BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_lut2048_p1_div,
532 xnn_math_f32_sigmoid__scalar_rr2_lut2048_p1_div)
533 ->Unit(benchmark::kMillisecond)
534 ->Iterations(1);
535 BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_p5_div,
536 xnn_math_f32_sigmoid__scalar_rr2_p5_div)
537 ->Unit(benchmark::kMillisecond)
538 ->Iterations(1);
539
540 #ifndef XNNPACK_BENCHMARK_NO_MAIN
541 BENCHMARK_MAIN();
542 #endif
543