xref: /aosp_15_r20/external/XNNPACK/eval/f32-sigmoid-ulp.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cfloat>
8 #include <cmath>
9 #include <functional>
10 #include <memory>
11 #include <numeric>
12 #include <random>
13 #include <vector>
14 
15 #include <cpuinfo.h>
16 #include <pthreadpool.h>
17 
18 #include <benchmark/benchmark.h>
19 #include <fp16/fp16.h>
20 
21 #include "bench/utils.h"
22 #include <xnnpack/aligned-allocator.h>
23 #include <xnnpack/common.h>
24 #include <xnnpack/math.h>
25 #include <xnnpack/math-stubs.h>
26 
27 
// Arguments shared by all ComputeError tasks that score one block of kernel results.
// All three arrays are indexed with the same element index.
struct ComputeErrorContext {
  // Values that were passed to the kernel under evaluation.
  const float* input = nullptr;
  // Values the kernel produced for the corresponding inputs.
  const float* output = nullptr;
  // Destination for the per-element error, written by ComputeError.
  float* error = nullptr;
};
33 
ComputeError(struct ComputeErrorContext * context,size_t start,size_t range)34 static void ComputeError(
35   struct ComputeErrorContext* context,
36   size_t start,
37   size_t range)
38 {
39   const float* input = context->input;
40   const float* output = context->output;
41   float* error = context->error;
42   for (size_t i = start; i < start + range; i++) {
43     const double input_val = input[i];
44     double output_ref = 0.0;
45     if (input_val < 0.0) {
46       const double exp_val = std::exp(input_val);
47       output_ref = exp_val / (1.0 + exp_val);
48     } else {
49       output_ref = 1.0 / (1.0 + std::exp(-input_val));
50     }
51     const double abs_error = std::abs(output_ref - double(output[i]));
52     const float output_abs = std::abs(output_ref);
53     const float output_ulp = uint32_as_float(float_as_uint32(output_abs) + 1) - output_abs;
54     error[i] = float(abs_error / output_ulp);
55   }
56 }
57 
SigmoidError(benchmark::State & state,xnn_f32_unary_math_function sigmoid,benchmark::utils::IsaCheckFunction isa_check=nullptr)58 static void SigmoidError(benchmark::State& state,
59   xnn_f32_unary_math_function sigmoid,
60   benchmark::utils::IsaCheckFunction isa_check = nullptr)
61 {
62   if (!cpuinfo_initialize()) {
63     state.SkipWithError("failed cpuinfo init");
64     return;
65   }
66   if (isa_check && !isa_check(state)) {
67     return;
68   }
69 
70   // The smallest x for which sigmoidf(x) is normalized (-0x1.5D589Ep+6f).
71   const uint32_t min_input = 0xC2AEAC4F;
72   // The largest x for which sigmoidf(x) is not 1.0f (0x1.154244p+4f).
73   const uint32_t max_input = 0x418AA122;
74   // Number of elements in one block of inputs/outputs.
75   // Combining multiple elements in a block reduce function call overhead.
76   const size_t block_size = 16384;
77   // Number of elements in one parallelization tile. Worker threads process this many elements in each task.
78   const size_t tile_size = 64;
79 
80   uint32_t num_threads = cpuinfo_get_cores_count();
81   #if XNN_ARCH_ARM || XNN_ARCH_ARM64
82     // Use all cores except for the least performant cluster
83     if (cpuinfo_get_clusters_count() > 1) {
84       num_threads -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
85     }
86   #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
87 
88   std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool(
89     pthreadpool_create(num_threads), pthreadpool_destroy);
90 
91   std::vector<float, AlignedAllocator<float, 64>> x(block_size);
92   std::vector<float, AlignedAllocator<float, 64>> y(block_size);
93   std::vector<float> ulp_error(block_size);
94   float max_ulp_error = 0.0f;
95 
96   ComputeErrorContext context;
97   context.input = x.data();
98   context.output = y.data();
99   context.error = ulp_error.data();
100   for (auto _ : state) {
101     for (uint32_t n = min_input; int32_t(n) < 0; n -= block_size) {
102       for (uint32_t i = 0; i < block_size; i++) {
103         x[i] = uint32_as_float(std::max<uint32_t>(n - i, 0x80000000));
104       }
105       std::fill(y.begin(), y.end(), std::nanf(""));
106 
107       sigmoid(block_size * sizeof(float), x.data(), y.data());
108 
109       pthreadpool_parallelize_1d_tile_1d(
110           threadpool.get(),
111           reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(ComputeError),
112           static_cast<void*>(&context),
113           block_size, tile_size, 0 /* flags */);
114 
115       max_ulp_error = std::accumulate(ulp_error.cbegin(), ulp_error.cend(), max_ulp_error,
116         static_cast<const float& (*)(const float&, const float&)>(std::max<float>));
117     }
118     for (uint32_t n = 0; n < max_input; n += block_size) {
119       for (uint32_t i = 0; i < block_size; i++) {
120         x[i] = uint32_as_float(std::min<uint32_t>(n + i, max_input));
121       }
122       std::fill(y.begin(), y.end(), std::nanf(""));
123 
124       sigmoid(block_size * sizeof(float), x.data(), y.data());
125 
126       pthreadpool_parallelize_1d_tile_1d(
127           threadpool.get(),
128           reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(ComputeError),
129           static_cast<void*>(&context),
130           block_size, tile_size, 0 /* flags */);
131 
132       max_ulp_error = std::accumulate(ulp_error.cbegin(), ulp_error.cend(), max_ulp_error,
133         static_cast<const float& (*)(const float&, const float&)>(std::max<float>));
134     }
135   }
136 
137   state.counters["ULPERROR"] = benchmark::Counter(max_ulp_error);
138 }
139 
140 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
141   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut64_p2_nr2recps,
142                     xnn_math_f32_sigmoid__neonfma_rr1_lut64_p2_nr2recps,
143                     benchmark::utils::CheckNEONFMA)
144     ->Unit(benchmark::kMillisecond)
145     ->Iterations(1);
146   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut64_p2_nr1recps1fma,
147                     xnn_math_f32_sigmoid__neonfma_rr1_lut64_p2_nr1recps1fma,
148                     benchmark::utils::CheckNEONFMA)
149     ->Unit(benchmark::kMillisecond)
150     ->Iterations(1);
151   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut64_p2_nr2fma,
152                     xnn_math_f32_sigmoid__neonfma_rr1_lut64_p2_nr2fma,
153                     benchmark::utils::CheckNEONFMA)
154     ->Unit(benchmark::kMillisecond)
155     ->Iterations(1);
156   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut64_p2_nr2recps,
157                     xnn_math_f32_sigmoid__neonfma_rr2_lut64_p2_nr2recps,
158                     benchmark::utils::CheckNEONFMA)
159     ->Unit(benchmark::kMillisecond)
160     ->Iterations(1);
161   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut64_p2_nr1recps1fma,
162                     xnn_math_f32_sigmoid__neonfma_rr2_lut64_p2_nr1recps1fma,
163                     benchmark::utils::CheckNEONFMA)
164     ->Unit(benchmark::kMillisecond)
165     ->Iterations(1);
166   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut64_p2_nr2fma,
167                     xnn_math_f32_sigmoid__neonfma_rr2_lut64_p2_nr2fma,
168                     benchmark::utils::CheckNEONFMA)
169     ->Unit(benchmark::kMillisecond)
170     ->Iterations(1);
171   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut2048_p1_nr2recps,
172                     xnn_math_f32_sigmoid__neonfma_rr1_lut2048_p1_nr2recps,
173                     benchmark::utils::CheckNEONFMA)
174     ->Unit(benchmark::kMillisecond)
175     ->Iterations(1);
176   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut2048_p1_nr1recps1fma,
177                     xnn_math_f32_sigmoid__neonfma_rr1_lut2048_p1_nr1recps1fma,
178                     benchmark::utils::CheckNEONFMA)
179     ->Unit(benchmark::kMillisecond)
180     ->Iterations(1);
181   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut2048_p1_nr2fma,
182                     xnn_math_f32_sigmoid__neonfma_rr1_lut2048_p1_nr2fma,
183                     benchmark::utils::CheckNEONFMA)
184     ->Unit(benchmark::kMillisecond)
185     ->Iterations(1);
186   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut2048_p1_nr2recps,
187                     xnn_math_f32_sigmoid__neonfma_rr2_lut2048_p1_nr2recps,
188                     benchmark::utils::CheckNEONFMA)
189     ->Unit(benchmark::kMillisecond)
190     ->Iterations(1);
191   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut2048_p1_nr1recps1fma,
192                     xnn_math_f32_sigmoid__neonfma_rr2_lut2048_p1_nr1recps1fma,
193                     benchmark::utils::CheckNEONFMA)
194     ->Unit(benchmark::kMillisecond)
195     ->Iterations(1);
196   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut2048_p1_nr2fma,
197                     xnn_math_f32_sigmoid__neonfma_rr2_lut2048_p1_nr2fma,
198                     benchmark::utils::CheckNEONFMA)
199     ->Unit(benchmark::kMillisecond)
200     ->Iterations(1);
201   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_p5_nr2recps,
202                     xnn_math_f32_sigmoid__neonfma_rr1_p5_nr2recps,
203                     benchmark::utils::CheckNEONFMA)
204     ->Unit(benchmark::kMillisecond)
205     ->Iterations(1);
206   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_p5_nr1recps1fma,
207                     xnn_math_f32_sigmoid__neonfma_rr1_p5_nr1recps1fma,
208                     benchmark::utils::CheckNEONFMA)
209     ->Unit(benchmark::kMillisecond)
210     ->Iterations(1);
211   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_p5_nr2fma,
212                     xnn_math_f32_sigmoid__neonfma_rr1_p5_nr2fma,
213                     benchmark::utils::CheckNEONFMA)
214     ->Unit(benchmark::kMillisecond)
215     ->Iterations(1);
216   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_p5_nr2recps,
217                     xnn_math_f32_sigmoid__neonfma_rr2_p5_nr2recps,
218                     benchmark::utils::CheckNEONFMA)
219     ->Unit(benchmark::kMillisecond)
220     ->Iterations(1);
221   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_p5_nr1recps1fma,
222                     xnn_math_f32_sigmoid__neonfma_rr2_p5_nr1recps1fma,
223                     benchmark::utils::CheckNEONFMA)
224     ->Unit(benchmark::kMillisecond)
225     ->Iterations(1);
226   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_p5_nr2fma,
227                     xnn_math_f32_sigmoid__neonfma_rr2_p5_nr2fma,
228                     benchmark::utils::CheckNEONFMA)
229     ->Unit(benchmark::kMillisecond)
230     ->Iterations(1);
231 
232   BENCHMARK_CAPTURE(SigmoidError, neon_rr2_lut64_p2_nr2recps,
233                     xnn_math_f32_sigmoid__neon_rr2_lut64_p2_nr2recps,
234                     benchmark::utils::CheckNEON)
235     ->Unit(benchmark::kMillisecond)
236     ->Iterations(1);
237   BENCHMARK_CAPTURE(SigmoidError, neon_rr2_lut2048_p1_nr2recps,
238                     xnn_math_f32_sigmoid__neon_rr2_lut2048_p1_nr2recps,
239                     benchmark::utils::CheckNEON)
240     ->Unit(benchmark::kMillisecond)
241     ->Iterations(1);
242   BENCHMARK_CAPTURE(SigmoidError, neon_rr2_p5_nr2recps,
243                     xnn_math_f32_sigmoid__neon_rr2_p5_nr2recps,
244                     benchmark::utils::CheckNEON)
245     ->Unit(benchmark::kMillisecond)
246     ->Iterations(1);
247 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
248 
249 #if XNN_ARCH_ARM64
250   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut2048_p1_div,
251                     xnn_math_f32_sigmoid__neonfma_rr1_lut2048_p1_div)
252     ->Unit(benchmark::kMillisecond)
253     ->Iterations(1);
254   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut2048_p1_div,
255                     xnn_math_f32_sigmoid__neonfma_rr2_lut2048_p1_div)
256     ->Unit(benchmark::kMillisecond)
257     ->Iterations(1);
258   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_lut64_p2_div,
259                     xnn_math_f32_sigmoid__neonfma_rr1_lut64_p2_div)
260     ->Unit(benchmark::kMillisecond)
261     ->Iterations(1);
262   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_lut64_p2_div,
263                     xnn_math_f32_sigmoid__neonfma_rr2_lut64_p2_div)
264     ->Unit(benchmark::kMillisecond)
265     ->Iterations(1);
266   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr1_p5_div,
267                     xnn_math_f32_sigmoid__neonfma_rr1_p5_div)
268     ->Unit(benchmark::kMillisecond)
269     ->Iterations(1);
270   BENCHMARK_CAPTURE(SigmoidError, neonfma_rr2_p5_div,
271                     xnn_math_f32_sigmoid__neonfma_rr2_p5_div)
272     ->Unit(benchmark::kMillisecond)
273     ->Iterations(1);
274 #endif  // XNN_ARCH_ARM64
275 
276 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
277   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut16_p3_perm_scalef_nr1fma,
278                     xnn_math_f32_sigmoid__avx512f_rr1_lut16_p3_perm_scalef_nr1fma,
279                     benchmark::utils::CheckAVX512F)
280     ->Unit(benchmark::kMillisecond)
281     ->Iterations(1);
282   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut16_p3_perm_scalef_nr1fma1adj,
283                     xnn_math_f32_sigmoid__avx512f_rr1_lut16_p3_perm_scalef_nr1fma1adj,
284                     benchmark::utils::CheckAVX512F)
285     ->Unit(benchmark::kMillisecond)
286     ->Iterations(1);
287   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut16_p3_perm_scalef_div,
288                     xnn_math_f32_sigmoid__avx512f_rr1_lut16_p3_perm_scalef_div,
289                     benchmark::utils::CheckAVX512F)
290     ->Unit(benchmark::kMillisecond)
291     ->Iterations(1);
292   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut16_p3_perm_scalef_nr1fma,
293                     xnn_math_f32_sigmoid__avx512f_rr2_lut16_p3_perm_scalef_nr1fma,
294                     benchmark::utils::CheckAVX512F)
295     ->Unit(benchmark::kMillisecond)
296     ->Iterations(1);
297   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut16_p3_perm_scalef_nr1fma1adj,
298                     xnn_math_f32_sigmoid__avx512f_rr2_lut16_p3_perm_scalef_nr1fma1adj,
299                     benchmark::utils::CheckAVX512F)
300     ->Unit(benchmark::kMillisecond)
301     ->Iterations(1);
302   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut16_p3_perm_scalef_div,
303                     xnn_math_f32_sigmoid__avx512f_rr2_lut16_p3_perm_scalef_div,
304                     benchmark::utils::CheckAVX512F)
305     ->Unit(benchmark::kMillisecond)
306     ->Iterations(1);
307   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut32_p2_perm2_scalef_nr1fma,
308                     xnn_math_f32_sigmoid__avx512f_rr1_lut32_p2_perm2_scalef_nr1fma,
309                     benchmark::utils::CheckAVX512F)
310     ->Unit(benchmark::kMillisecond)
311     ->Iterations(1);
312   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut32_p2_perm2_scalef_nr1fma1adj,
313                     xnn_math_f32_sigmoid__avx512f_rr1_lut32_p2_perm2_scalef_nr1fma1adj,
314                     benchmark::utils::CheckAVX512F)
315     ->Unit(benchmark::kMillisecond)
316     ->Iterations(1);
317   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut32_p2_perm2_scalef_div,
318                     xnn_math_f32_sigmoid__avx512f_rr1_lut32_p2_perm2_scalef_div,
319                     benchmark::utils::CheckAVX512F)
320     ->Unit(benchmark::kMillisecond)
321     ->Iterations(1);
322   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut32_p2_perm2_scalef_nr1fma,
323                     xnn_math_f32_sigmoid__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma,
324                     benchmark::utils::CheckAVX512F)
325     ->Unit(benchmark::kMillisecond)
326     ->Iterations(1);
327   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut32_p2_perm2_scalef_nr1fma1adj,
328                     xnn_math_f32_sigmoid__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma1adj,
329                     benchmark::utils::CheckAVX512F)
330     ->Unit(benchmark::kMillisecond)
331     ->Iterations(1);
332   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut32_p2_perm2_scalef_div,
333                     xnn_math_f32_sigmoid__avx512f_rr2_lut32_p2_perm2_scalef_div,
334                     benchmark::utils::CheckAVX512F)
335     ->Unit(benchmark::kMillisecond)
336     ->Iterations(1);
337   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut64_p2_gather_scalef_nr1fma,
338                     xnn_math_f32_sigmoid__avx512f_rr1_lut64_p2_gather_scalef_nr1fma,
339                     benchmark::utils::CheckAVX512F)
340     ->Unit(benchmark::kMillisecond)
341     ->Iterations(1);
342   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut64_p2_gather_scalef_nr1fma1adj,
343                     xnn_math_f32_sigmoid__avx512f_rr1_lut64_p2_gather_scalef_nr1fma1adj,
344                     benchmark::utils::CheckAVX512F)
345     ->Unit(benchmark::kMillisecond)
346     ->Iterations(1);
347   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_lut64_p2_gather_scalef_div,
348                     xnn_math_f32_sigmoid__avx512f_rr1_lut64_p2_gather_scalef_div,
349                     benchmark::utils::CheckAVX512F)
350     ->Unit(benchmark::kMillisecond)
351     ->Iterations(1);
352   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut64_p2_gather_scalef_nr1fma,
353                     xnn_math_f32_sigmoid__avx512f_rr2_lut64_p2_gather_scalef_nr1fma,
354                     benchmark::utils::CheckAVX512F)
355     ->Unit(benchmark::kMillisecond)
356     ->Iterations(1);
357   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut64_p2_gather_scalef_nr1fma1adj,
358                     xnn_math_f32_sigmoid__avx512f_rr2_lut64_p2_gather_scalef_nr1fma1adj,
359                     benchmark::utils::CheckAVX512F)
360     ->Unit(benchmark::kMillisecond)
361     ->Iterations(1);
362   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_lut64_p2_gather_scalef_div,
363                     xnn_math_f32_sigmoid__avx512f_rr2_lut64_p2_gather_scalef_div,
364                     benchmark::utils::CheckAVX512F)
365     ->Unit(benchmark::kMillisecond)
366     ->Iterations(1);
367   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_p5_scalef_nr1fma,
368                     xnn_math_f32_sigmoid__avx512f_rr1_p5_scalef_nr1fma,
369                     benchmark::utils::CheckAVX512F)
370     ->Unit(benchmark::kMillisecond)
371     ->Iterations(1);
372   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_p5_scalef_nr1fma1adj,
373                     xnn_math_f32_sigmoid__avx512f_rr1_p5_scalef_nr1fma1adj,
374                     benchmark::utils::CheckAVX512F)
375     ->Unit(benchmark::kMillisecond)
376     ->Iterations(1);
377   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr1_p5_scalef_div,
378                     xnn_math_f32_sigmoid__avx512f_rr1_p5_scalef_div,
379                     benchmark::utils::CheckAVX512F)
380     ->Unit(benchmark::kMillisecond)
381     ->Iterations(1);
382   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_p5_scalef_nr1fma,
383                     xnn_math_f32_sigmoid__avx512f_rr2_p5_scalef_nr1fma,
384                     benchmark::utils::CheckAVX512F)
385     ->Unit(benchmark::kMillisecond)
386     ->Iterations(1);
387   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_p5_scalef_nr1fma1adj,
388                     xnn_math_f32_sigmoid__avx512f_rr2_p5_scalef_nr1fma1adj,
389                     benchmark::utils::CheckAVX512F)
390     ->Unit(benchmark::kMillisecond)
391     ->Iterations(1);
392   BENCHMARK_CAPTURE(SigmoidError, avx512f_rr2_p5_scalef_div,
393                     xnn_math_f32_sigmoid__avx512f_rr2_p5_scalef_div,
394                     benchmark::utils::CheckAVX512F)
395     ->Unit(benchmark::kMillisecond)
396     ->Iterations(1);
397 
398   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_lut64_p2_gather_nr1fma,
399                     xnn_math_f32_sigmoid__avx2_rr1_lut64_p2_gather_nr1fma,
400                     benchmark::utils::CheckAVX2)
401     ->Unit(benchmark::kMillisecond)
402     ->Iterations(1);
403   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_lut64_p2_gather_nr2fma,
404                     xnn_math_f32_sigmoid__avx2_rr1_lut64_p2_gather_nr2fma,
405                     benchmark::utils::CheckAVX2)
406     ->Unit(benchmark::kMillisecond)
407     ->Iterations(1);
408   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_lut64_p2_gather_nr2fma1adj,
409                     xnn_math_f32_sigmoid__avx2_rr1_lut64_p2_gather_nr2fma1adj,
410                     benchmark::utils::CheckAVX2)
411     ->Unit(benchmark::kMillisecond)
412     ->Iterations(1);
413   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_lut64_p2_gather_div,
414                     xnn_math_f32_sigmoid__avx2_rr1_lut64_p2_gather_div,
415                     benchmark::utils::CheckAVX2)
416     ->Unit(benchmark::kMillisecond)
417     ->Iterations(1);
418   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_lut64_p2_gather_nr1fma,
419                     xnn_math_f32_sigmoid__avx2_rr2_lut64_p2_gather_nr1fma,
420                     benchmark::utils::CheckAVX2)
421     ->Unit(benchmark::kMillisecond)
422     ->Iterations(1);
423   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_lut64_p2_gather_nr2fma,
424                     xnn_math_f32_sigmoid__avx2_rr2_lut64_p2_gather_nr2fma,
425                     benchmark::utils::CheckAVX2)
426     ->Unit(benchmark::kMillisecond)
427     ->Iterations(1);
428   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_lut64_p2_gather_nr2fma1adj,
429                     xnn_math_f32_sigmoid__avx2_rr2_lut64_p2_gather_nr2fma1adj,
430                     benchmark::utils::CheckAVX2)
431     ->Unit(benchmark::kMillisecond)
432     ->Iterations(1);
433   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_lut64_p2_gather_div,
434                     xnn_math_f32_sigmoid__avx2_rr2_lut64_p2_gather_div,
435                     benchmark::utils::CheckAVX2)
436     ->Unit(benchmark::kMillisecond)
437     ->Iterations(1);
438   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_p5_nr1fma,
439                     xnn_math_f32_sigmoid__avx2_rr1_p5_nr1fma,
440                     benchmark::utils::CheckAVX2)
441     ->Unit(benchmark::kMillisecond)
442     ->Iterations(1);
443   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_p5_nr2fma,
444                     xnn_math_f32_sigmoid__avx2_rr1_p5_nr2fma,
445                     benchmark::utils::CheckAVX2)
446     ->Unit(benchmark::kMillisecond)
447     ->Iterations(1);
448   BENCHMARK_CAPTURE(SigmoidError, avx2_rr1_p5_div,
449                     xnn_math_f32_sigmoid__avx2_rr1_p5_div,
450                     benchmark::utils::CheckAVX2)
451     ->Unit(benchmark::kMillisecond)
452     ->Iterations(1);
453   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_p5_nr1fma,
454                     xnn_math_f32_sigmoid__avx2_rr2_p5_nr1fma,
455                     benchmark::utils::CheckAVX2)
456     ->Unit(benchmark::kMillisecond)
457     ->Iterations(1);
458   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_p5_nr2fma,
459                     xnn_math_f32_sigmoid__avx2_rr2_p5_nr2fma,
460                     benchmark::utils::CheckAVX2)
461     ->Unit(benchmark::kMillisecond)
462     ->Iterations(1);
463   BENCHMARK_CAPTURE(SigmoidError, avx2_rr2_p5_div,
464                     xnn_math_f32_sigmoid__avx2_rr2_p5_div,
465                     benchmark::utils::CheckAVX2)
466     ->Unit(benchmark::kMillisecond)
467     ->Iterations(1);
468 
469   BENCHMARK_CAPTURE(SigmoidError, avx_rr2_lut64_p2_div,
470                     xnn_math_f32_sigmoid__avx_rr2_lut64_p2_div,
471                     benchmark::utils::CheckAVX)
472     ->Unit(benchmark::kMillisecond)
473     ->Iterations(1);
474   BENCHMARK_CAPTURE(SigmoidError, avx_rr2_p5_nr1,
475                     xnn_math_f32_sigmoid__avx_rr2_p5_nr1,
476                     benchmark::utils::CheckAVX)
477     ->Unit(benchmark::kMillisecond)
478     ->Iterations(1);
479   BENCHMARK_CAPTURE(SigmoidError, avx_rr2_p5_nr2,
480                     xnn_math_f32_sigmoid__avx_rr2_p5_nr2,
481                     benchmark::utils::CheckAVX)
482     ->Unit(benchmark::kMillisecond)
483     ->Iterations(1);
484   BENCHMARK_CAPTURE(SigmoidError, avx_rr2_p5_div,
485                     xnn_math_f32_sigmoid__avx_rr2_p5_div,
486                     benchmark::utils::CheckAVX)
487     ->Unit(benchmark::kMillisecond)
488     ->Iterations(1);
489 
490   BENCHMARK_CAPTURE(SigmoidError, sse2_rr2_lut64_p2_nr1,
491                     xnn_math_f32_sigmoid__sse2_rr2_lut64_p2_nr1)
492     ->Unit(benchmark::kMillisecond)
493     ->Iterations(1);
494   BENCHMARK_CAPTURE(SigmoidError, sse2_rr2_lut64_p2_nr2,
495                     xnn_math_f32_sigmoid__sse2_rr2_lut64_p2_nr2)
496     ->Unit(benchmark::kMillisecond)
497     ->Iterations(1);
498   BENCHMARK_CAPTURE(SigmoidError, sse2_rr2_lut64_p2_div,
499                     xnn_math_f32_sigmoid__sse2_rr2_lut64_p2_div)
500     ->Unit(benchmark::kMillisecond)
501     ->Iterations(1);
502   BENCHMARK_CAPTURE(SigmoidError, sse2_rr2_p5_nr1,
503                     xnn_math_f32_sigmoid__sse2_rr2_p5_nr1)
504     ->Unit(benchmark::kMillisecond)
505     ->Iterations(1);
506   BENCHMARK_CAPTURE(SigmoidError, sse2_rr2_p5_nr2,
507                     xnn_math_f32_sigmoid__sse2_rr2_p5_nr2)
508     ->Unit(benchmark::kMillisecond)
509     ->Iterations(1);
510   BENCHMARK_CAPTURE(SigmoidError, sse2_rr2_p5_div,
511                     xnn_math_f32_sigmoid__sse2_rr2_p5_div)
512     ->Unit(benchmark::kMillisecond)
513     ->Iterations(1);
514 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
515 
516 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
517   BENCHMARK_CAPTURE(SigmoidError, wasmsimd_rr2_lut64_p2_div,
518                     xnn_math_f32_sigmoid__wasmsimd_rr2_lut64_p2_div)
519     ->Unit(benchmark::kMillisecond)
520     ->Iterations(1);
521   BENCHMARK_CAPTURE(SigmoidError, wasmsimd_rr2_p5_div,
522                     xnn_math_f32_sigmoid__wasmsimd_rr2_p5_div)
523     ->Unit(benchmark::kMillisecond)
524     ->Iterations(1);
525 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
526 
527 BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_lut64_p2_div,
528                   xnn_math_f32_sigmoid__scalar_rr2_lut64_p2_div)
529   ->Unit(benchmark::kMillisecond)
530   ->Iterations(1);
531 BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_lut2048_p1_div,
532                   xnn_math_f32_sigmoid__scalar_rr2_lut2048_p1_div)
533   ->Unit(benchmark::kMillisecond)
534   ->Iterations(1);
535 BENCHMARK_CAPTURE(SigmoidError, scalar_rr2_p5_div,
536                   xnn_math_f32_sigmoid__scalar_rr2_p5_div)
537   ->Unit(benchmark::kMillisecond)
538   ->Iterations(1);
539 
540 #ifndef XNNPACK_BENCHMARK_NO_MAIN
541 BENCHMARK_MAIN();
542 #endif
543