xref: /aosp_15_r20/external/XNNPACK/bench/f32-softmax.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8 
9 #include <benchmark/benchmark.h>
10 #ifdef BENCHMARK_INTEL_DNNL
11 #include <dnnl.h>
12 #endif  // BENCHMARK_INTEL_DNNL
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/common.h>
17 #include <xnnpack/microfnptr.h>
18 #include <xnnpack/microparams-init.h>
19 #include <xnnpack/raddexpminusmax.h>
20 #include <xnnpack/raddextexp.h>
21 #include <xnnpack/raddstoreexpminusmax.h>
22 #include <xnnpack/rmax.h>
23 #include <xnnpack/vbinary.h>
24 #include <xnnpack/vscaleexpminusmax.h>
25 #include <xnnpack/vscaleextexp.h>
26 
27 
28 #ifdef BENCHMARK_INTEL_DNNL
DNNLSoftArgMax(benchmark::State & state)29 static void DNNLSoftArgMax(
30   benchmark::State& state)
31 {
32   const size_t elements = state.range(0);
33   const size_t cache_line_size_max = 128;
34   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
35 
36   std::random_device random_device;
37   auto rng = std::mt19937(random_device());
38   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
39 
40   const size_t num_buffers = 1 +
41     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
42   std::vector<float> x(elements);
43   std::vector<float> y(packed_elements * num_buffers);
44 
45   std::generate(x.begin(), x.end(), std::ref(f32rng));
46 
47   dnnl_engine_t engine;
48   if (dnnl_engine_create(&engine, dnnl_cpu, 0) != dnnl_success) {
49     state.SkipWithError("failed to create CPU engine");
50     return;
51   }
52 
53   dnnl_dim_t input_output_shape[1] = { static_cast<int>(elements) };
54 
55   dnnl_memory_desc_t memory_descriptor = { 0 };
56   if (dnnl_memory_desc_init_by_tag(
57     &memory_descriptor, 1, input_output_shape, dnnl_f32, dnnl_x) != dnnl_success)
58   {
59     state.SkipWithError("failed to create input memory descriptor");
60     return;
61   }
62 
63   dnnl_memory_t input_memory = nullptr;
64   if (dnnl_memory_create(
65     &input_memory, &memory_descriptor, engine, x.data()) != dnnl_success)
66   {
67     state.SkipWithError("failed to create input memory");
68     return;
69   }
70 
71   dnnl_memory_t output_memory = nullptr;
72   if (dnnl_memory_create(
73     &output_memory, &memory_descriptor, engine, y.data()) != dnnl_success)
74   {
75     state.SkipWithError("failed to create output memory");
76     return;
77   }
78 
79   dnnl_softmax_desc_t softmax_forward_descriptor = {};
80   if (dnnl_softmax_forward_desc_init(
81     &softmax_forward_descriptor, dnnl_forward_inference,
82     &memory_descriptor, 0) != dnnl_success)
83   {
84     state.SkipWithError("failed to create SoftMax forward descriptor");
85     return;
86   }
87 
88   dnnl_primitive_desc_t softmax_primitive_descriptor = nullptr;
89   if (dnnl_primitive_desc_create(
90     &softmax_primitive_descriptor, &softmax_forward_descriptor,
91     nullptr /* primitive attributes */, engine, nullptr /* hint */) != dnnl_success)
92   {
93     state.SkipWithError("failed to create SoftMax primitive descriptor");
94     return;
95   }
96 
97   dnnl_primitive_t softmax_primitive = nullptr;
98   if (dnnl_primitive_create(
99     &softmax_primitive, softmax_primitive_descriptor) != dnnl_success)
100   {
101     state.SkipWithError("failed to create SoftMax primitive");
102     return;
103   }
104 
105   dnnl_exec_arg_t softmax_args[2] = {
106     {DNNL_ARG_SRC, input_memory},
107     {DNNL_ARG_DST, output_memory},
108   };
109 
110   dnnl_stream_t stream = nullptr;
111   if (dnnl_stream_create(&stream, engine, dnnl_stream_default_flags) != dnnl_success) {
112     state.SkipWithError("failed to create stream");
113     return;
114   }
115 
116   size_t buffer_index = 0;
117   for (auto _ : state) {
118     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
119     if (++buffer_index == num_buffers) {
120       buffer_index = 0;
121     }
122 
123     const auto start = std::chrono::high_resolution_clock::now();
124     if (dnnl_primitive_execute(
125       softmax_primitive, stream, 2, softmax_args) != dnnl_success)
126     {
127       state.SkipWithError("failed to execute SoftMax");
128       return;
129     }
130     const auto end = std::chrono::high_resolution_clock::now();
131 
132     const auto elapsed_seconds =
133       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
134     state.SetIterationTime(elapsed_seconds.count());
135   }
136 
137   if (dnnl_stream_destroy(stream) != dnnl_success) {
138     state.SkipWithError("failed to destroy stream");
139     return;
140   }
141 
142   if (dnnl_primitive_desc_destroy(softmax_primitive_descriptor) != dnnl_success) {
143     state.SkipWithError("failed to destroy SoftMax primitive descriptor");
144     return;
145   }
146 
147   if (dnnl_primitive_destroy(softmax_primitive) != dnnl_success) {
148     state.SkipWithError("failed to destroy SoftMax primitive");
149     return;
150   }
151 
152   if (dnnl_memory_destroy(input_memory) != dnnl_success) {
153     state.SkipWithError("failed to destroy input memory");
154     return;
155   }
156 
157   if (dnnl_memory_destroy(output_memory) != dnnl_success) {
158     state.SkipWithError("failed to destroy output memory");
159     return;
160   }
161 
162   if (dnnl_engine_destroy(engine) != dnnl_success) {
163     state.SkipWithError("failed to destroy engine");
164     return;
165   }
166 
167   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
168   if (cpu_frequency != 0) {
169     state.counters["cpufreq"] = cpu_frequency;
170   }
171 
172   const size_t elements_per_iteration = elements;
173   state.counters["elements"] =
174     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
175 
176   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
177   state.counters["bytes"] =
178     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
179 }
180 #endif  // BENCHMARK_INTEL_DNNL
181 
ThreePassSoftMaxWithRecomputing(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)182 static void ThreePassSoftMaxWithRecomputing(
183   benchmark::State& state,
184   xnn_f32_rmax_ukernel_function rmax,
185   xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
186   xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
187   benchmark::utils::IsaCheckFunction isa_check = nullptr)
188 {
189   if (isa_check && !isa_check(state)) {
190     return;
191   }
192 
193   const size_t elements = state.range(0);
194   const size_t cache_line_size_max = 128;
195   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
196 
197   std::random_device random_device;
198   auto rng = std::mt19937(random_device());
199   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
200 
201   const size_t num_buffers = 1 +
202     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
203   std::vector<float> x(elements);
204   std::vector<float> y(packed_elements * num_buffers);
205 
206   std::generate(x.begin(), x.end(), std::ref(f32rng));
207 
208   benchmark::utils::DisableDenormals();
209 
210   size_t buffer_index = 0;
211   for (auto _ : state) {
212     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
213     if (++buffer_index == num_buffers) {
214       buffer_index = 0;
215     }
216 
217     const auto start = std::chrono::high_resolution_clock::now();
218     float x_max = nanf("");
219     rmax(elements * sizeof(float), x.data(), &x_max);
220     float y_sum = nanf("");
221     raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
222     vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
223     const auto end = std::chrono::high_resolution_clock::now();
224 
225     const auto elapsed_seconds =
226       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
227     state.SetIterationTime(elapsed_seconds.count());
228   }
229 
230   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
231   if (cpu_frequency != 0) {
232     state.counters["cpufreq"] = cpu_frequency;
233   }
234 
235   const size_t elements_per_iteration = elements;
236   state.counters["elements"] =
237     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
238 
239   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
240   state.counters["bytes"] =
241     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
242 }
243 
ThreePassSoftMaxWithReloading(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f32_expminus_params_fn init_expminus_params,xnn_f32_vbinary_minmax_ukernel_function vmulc,xnn_init_f32_minmax_params_fn init_minmax_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)244 static void ThreePassSoftMaxWithReloading(
245   benchmark::State& state,
246   xnn_f32_rmax_ukernel_function rmax,
247   xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
248   xnn_init_f32_expminus_params_fn init_expminus_params,
249   xnn_f32_vbinary_minmax_ukernel_function vmulc,
250   xnn_init_f32_minmax_params_fn init_minmax_params,
251   benchmark::utils::IsaCheckFunction isa_check = nullptr)
252 {
253   if (isa_check && !isa_check(state)) {
254     return;
255   }
256 
257   const size_t elements = state.range(0);
258   const size_t cache_line_size_max = 128;
259   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
260 
261   std::random_device random_device;
262   auto rng = std::mt19937(random_device());
263   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
264 
265   const size_t num_buffers = 1 +
266     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
267   std::vector<float> x(elements);
268   std::vector<float> y(packed_elements * num_buffers);
269 
270   std::generate(x.begin(), x.end(), std::ref(f32rng));
271 
272   benchmark::utils::DisableDenormals();
273 
274   xnn_f32_expminus_params expminus_params;
275   xnn_f32_minmax_params minmax_params;
276   init_expminus_params(&expminus_params);
277   init_minmax_params(&minmax_params, -INFINITY, INFINITY);
278 
279   size_t buffer_index = 0;
280   for (auto _ : state) {
281     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
282     if (++buffer_index == num_buffers) {
283       buffer_index = 0;
284     }
285 
286     const auto start = std::chrono::high_resolution_clock::now();
287     float x_max = nanf("");
288     rmax(elements * sizeof(float), x.data(), &x_max);
289     float y_sum = nanf("");
290     raddstoreexpminusmax(elements * sizeof(float), x.data(), &x_max, y.data() + packed_elements * buffer_index, &y_sum, &expminus_params);
291     const float inv_y_sum = 1.0f / y_sum;
292     vmulc(elements * sizeof(float), y.data() + packed_elements * buffer_index, &inv_y_sum, y.data() + packed_elements * buffer_index, &minmax_params);
293     const auto end = std::chrono::high_resolution_clock::now();
294 
295     const auto elapsed_seconds =
296       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
297     state.SetIterationTime(elapsed_seconds.count());
298   }
299 
300   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
301   if (cpu_frequency != 0) {
302     state.counters["cpufreq"] = cpu_frequency;
303   }
304 
305   const size_t elements_per_iteration = elements;
306   state.counters["elements"] =
307     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
308 
309   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
310   state.counters["bytes"] =
311     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
312 }
313 
TwoPassSoftMax(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)314 static void TwoPassSoftMax(
315   benchmark::State& state,
316   xnn_f32_raddextexp_ukernel_function raddextexp,
317   xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
318   benchmark::utils::IsaCheckFunction isa_check = nullptr)
319 {
320   if (isa_check && !isa_check(state)) {
321     return;
322   }
323 
324   const size_t elements = state.range(0);
325   const size_t cache_line_size_max = 128;
326   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
327 
328   std::random_device random_device;
329   auto rng = std::mt19937(random_device());
330   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
331 
332   const size_t num_buffers = 1 +
333     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
334   std::vector<float> x(elements);
335   std::vector<float> y(packed_elements * num_buffers);
336 
337   std::generate(x.begin(), x.end(), std::ref(f32rng));
338 
339   benchmark::utils::DisableDenormals();
340 
341   size_t buffer_index = 0;
342   for (auto _ : state) {
343     benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
344     if (++buffer_index == num_buffers) {
345       buffer_index = 0;
346     }
347 
348     const auto start = std::chrono::high_resolution_clock::now();
349     float scale[2];
350     raddextexp(elements * sizeof(float), x.data(), scale);
351     vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, 1.0f / scale[0], -scale[1]);
352     const auto end = std::chrono::high_resolution_clock::now();
353 
354     const auto elapsed_seconds =
355       std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
356     state.SetIterationTime(elapsed_seconds.count());
357   }
358 
359   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
360   if (cpu_frequency != 0) {
361     state.counters["cpufreq"] = cpu_frequency;
362   }
363 
364   const size_t elements_per_iteration = elements;
365   state.counters["elements"] =
366     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
367 
368   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
369   state.counters["bytes"] =
370     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
371 }
372 
CharacteristicArguments(benchmark::internal::Benchmark * b)373 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
374   for (int32_t n = 1000; n <= 100000000; n *= 10) {
375     b->Arg(n);
376     b->Arg(3 * n);
377   }
378 }
379 
380 #ifdef BENCHMARK_INTEL_DNNL
381   BENCHMARK(DNNLSoftArgMax)->Apply(CharacteristicArguments)->UseManualTime();
382 #endif
383 
384 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
385   BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_p5,
386     xnn_f32_raddextexp_ukernel__avx2_p5_x96,
387     xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
388     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
389   BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_p5,
390     xnn_f32_rmax_ukernel__avx,
391     xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
392     xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
393     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
394   BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_p5,
395     xnn_f32_rmax_ukernel__avx,
396     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64_acc2,
397     xnn_init_f32_expminus_avx2_rr1_p5_params,
398     xnn_f32_vmulc_minmax_ukernel__avx_x16,
399     xnn_init_f32_minmax_avx_params,
400     benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime();
401 
402   BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef,
403     xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3,
404     xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
405     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
406   BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef,
407     xnn_f32_rmax_ukernel__avx512f,
408     xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
409     xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
410     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
411   BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef,
412     xnn_f32_rmax_ukernel__avx512f,
413     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc2,
414     xnn_init_f32_expminus_avx512_rr1_p5_params,
415     xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
416     xnn_init_f32_minmax_scalar_params,
417     benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime();
418 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
419 
420 #ifndef XNNPACK_BENCHMARK_NO_MAIN
421 BENCHMARK_MAIN();
422 #endif
423