1 #include <algorithm>
2 #include <cfloat>
3 #include <chrono>
4 #include <cmath>
5 #include <functional>
6 #include <random>
7 #include <vector>
8
9 #include <benchmark/benchmark.h>
10 #ifdef BENCHMARK_INTEL_DNNL
11 #include <dnnl.h>
12 #endif // BENCHMARK_INTEL_DNNL
13 #include "bench/utils.h"
14
15 #include <xnnpack.h>
16 #include <xnnpack/common.h>
17 #include <xnnpack/microfnptr.h>
18 #include <xnnpack/microparams-init.h>
19 #include <xnnpack/raddexpminusmax.h>
20 #include <xnnpack/raddextexp.h>
21 #include <xnnpack/raddstoreexpminusmax.h>
22 #include <xnnpack/rmax.h>
23 #include <xnnpack/vbinary.h>
24 #include <xnnpack/vscaleexpminusmax.h>
25 #include <xnnpack/vscaleextexp.h>
26
27
28 #ifdef BENCHMARK_INTEL_DNNL
DNNLSoftArgMax(benchmark::State & state)29 static void DNNLSoftArgMax(
30 benchmark::State& state)
31 {
32 const size_t elements = state.range(0);
33 const size_t cache_line_size_max = 128;
34 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
39
40 const size_t num_buffers = 1 +
41 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
42 std::vector<float> x(elements);
43 std::vector<float> y(packed_elements * num_buffers);
44
45 std::generate(x.begin(), x.end(), std::ref(f32rng));
46
47 dnnl_engine_t engine;
48 if (dnnl_engine_create(&engine, dnnl_cpu, 0) != dnnl_success) {
49 state.SkipWithError("failed to create CPU engine");
50 return;
51 }
52
53 dnnl_dim_t input_output_shape[1] = { static_cast<int>(elements) };
54
55 dnnl_memory_desc_t memory_descriptor = { 0 };
56 if (dnnl_memory_desc_init_by_tag(
57 &memory_descriptor, 1, input_output_shape, dnnl_f32, dnnl_x) != dnnl_success)
58 {
59 state.SkipWithError("failed to create input memory descriptor");
60 return;
61 }
62
63 dnnl_memory_t input_memory = nullptr;
64 if (dnnl_memory_create(
65 &input_memory, &memory_descriptor, engine, x.data()) != dnnl_success)
66 {
67 state.SkipWithError("failed to create input memory");
68 return;
69 }
70
71 dnnl_memory_t output_memory = nullptr;
72 if (dnnl_memory_create(
73 &output_memory, &memory_descriptor, engine, y.data()) != dnnl_success)
74 {
75 state.SkipWithError("failed to create output memory");
76 return;
77 }
78
79 dnnl_softmax_desc_t softmax_forward_descriptor = {};
80 if (dnnl_softmax_forward_desc_init(
81 &softmax_forward_descriptor, dnnl_forward_inference,
82 &memory_descriptor, 0) != dnnl_success)
83 {
84 state.SkipWithError("failed to create SoftMax forward descriptor");
85 return;
86 }
87
88 dnnl_primitive_desc_t softmax_primitive_descriptor = nullptr;
89 if (dnnl_primitive_desc_create(
90 &softmax_primitive_descriptor, &softmax_forward_descriptor,
91 nullptr /* primitive attributes */, engine, nullptr /* hint */) != dnnl_success)
92 {
93 state.SkipWithError("failed to create SoftMax primitive descriptor");
94 return;
95 }
96
97 dnnl_primitive_t softmax_primitive = nullptr;
98 if (dnnl_primitive_create(
99 &softmax_primitive, softmax_primitive_descriptor) != dnnl_success)
100 {
101 state.SkipWithError("failed to create SoftMax primitive");
102 return;
103 }
104
105 dnnl_exec_arg_t softmax_args[2] = {
106 {DNNL_ARG_SRC, input_memory},
107 {DNNL_ARG_DST, output_memory},
108 };
109
110 dnnl_stream_t stream = nullptr;
111 if (dnnl_stream_create(&stream, engine, dnnl_stream_default_flags) != dnnl_success) {
112 state.SkipWithError("failed to create stream");
113 return;
114 }
115
116 size_t buffer_index = 0;
117 for (auto _ : state) {
118 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
119 if (++buffer_index == num_buffers) {
120 buffer_index = 0;
121 }
122
123 const auto start = std::chrono::high_resolution_clock::now();
124 if (dnnl_primitive_execute(
125 softmax_primitive, stream, 2, softmax_args) != dnnl_success)
126 {
127 state.SkipWithError("failed to execute SoftMax");
128 return;
129 }
130 const auto end = std::chrono::high_resolution_clock::now();
131
132 const auto elapsed_seconds =
133 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
134 state.SetIterationTime(elapsed_seconds.count());
135 }
136
137 if (dnnl_stream_destroy(stream) != dnnl_success) {
138 state.SkipWithError("failed to destroy stream");
139 return;
140 }
141
142 if (dnnl_primitive_desc_destroy(softmax_primitive_descriptor) != dnnl_success) {
143 state.SkipWithError("failed to destroy SoftMax primitive descriptor");
144 return;
145 }
146
147 if (dnnl_primitive_destroy(softmax_primitive) != dnnl_success) {
148 state.SkipWithError("failed to destroy SoftMax primitive");
149 return;
150 }
151
152 if (dnnl_memory_destroy(input_memory) != dnnl_success) {
153 state.SkipWithError("failed to destroy input memory");
154 return;
155 }
156
157 if (dnnl_memory_destroy(output_memory) != dnnl_success) {
158 state.SkipWithError("failed to destroy output memory");
159 return;
160 }
161
162 if (dnnl_engine_destroy(engine) != dnnl_success) {
163 state.SkipWithError("failed to destroy engine");
164 return;
165 }
166
167 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
168 if (cpu_frequency != 0) {
169 state.counters["cpufreq"] = cpu_frequency;
170 }
171
172 const size_t elements_per_iteration = elements;
173 state.counters["elements"] =
174 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
175
176 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
177 state.counters["bytes"] =
178 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
179 }
180 #endif // BENCHMARK_INTEL_DNNL
181
ThreePassSoftMaxWithRecomputing(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,benchmark::utils::IsaCheckFunction isa_check=nullptr)182 static void ThreePassSoftMaxWithRecomputing(
183 benchmark::State& state,
184 xnn_f32_rmax_ukernel_function rmax,
185 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
186 xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
187 benchmark::utils::IsaCheckFunction isa_check = nullptr)
188 {
189 if (isa_check && !isa_check(state)) {
190 return;
191 }
192
193 const size_t elements = state.range(0);
194 const size_t cache_line_size_max = 128;
195 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
196
197 std::random_device random_device;
198 auto rng = std::mt19937(random_device());
199 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
200
201 const size_t num_buffers = 1 +
202 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
203 std::vector<float> x(elements);
204 std::vector<float> y(packed_elements * num_buffers);
205
206 std::generate(x.begin(), x.end(), std::ref(f32rng));
207
208 benchmark::utils::DisableDenormals();
209
210 size_t buffer_index = 0;
211 for (auto _ : state) {
212 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
213 if (++buffer_index == num_buffers) {
214 buffer_index = 0;
215 }
216
217 const auto start = std::chrono::high_resolution_clock::now();
218 float x_max = nanf("");
219 rmax(elements * sizeof(float), x.data(), &x_max);
220 float y_sum = nanf("");
221 raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
222 vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
223 const auto end = std::chrono::high_resolution_clock::now();
224
225 const auto elapsed_seconds =
226 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
227 state.SetIterationTime(elapsed_seconds.count());
228 }
229
230 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
231 if (cpu_frequency != 0) {
232 state.counters["cpufreq"] = cpu_frequency;
233 }
234
235 const size_t elements_per_iteration = elements;
236 state.counters["elements"] =
237 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
238
239 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
240 state.counters["bytes"] =
241 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
242 }
243
ThreePassSoftMaxWithReloading(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f32_expminus_params_fn init_expminus_params,xnn_f32_vbinary_minmax_ukernel_function vmulc,xnn_init_f32_minmax_params_fn init_minmax_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)244 static void ThreePassSoftMaxWithReloading(
245 benchmark::State& state,
246 xnn_f32_rmax_ukernel_function rmax,
247 xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
248 xnn_init_f32_expminus_params_fn init_expminus_params,
249 xnn_f32_vbinary_minmax_ukernel_function vmulc,
250 xnn_init_f32_minmax_params_fn init_minmax_params,
251 benchmark::utils::IsaCheckFunction isa_check = nullptr)
252 {
253 if (isa_check && !isa_check(state)) {
254 return;
255 }
256
257 const size_t elements = state.range(0);
258 const size_t cache_line_size_max = 128;
259 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
260
261 std::random_device random_device;
262 auto rng = std::mt19937(random_device());
263 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
264
265 const size_t num_buffers = 1 +
266 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
267 std::vector<float> x(elements);
268 std::vector<float> y(packed_elements * num_buffers);
269
270 std::generate(x.begin(), x.end(), std::ref(f32rng));
271
272 benchmark::utils::DisableDenormals();
273
274 xnn_f32_expminus_params expminus_params;
275 xnn_f32_minmax_params minmax_params;
276 init_expminus_params(&expminus_params);
277 init_minmax_params(&minmax_params, -INFINITY, INFINITY);
278
279 size_t buffer_index = 0;
280 for (auto _ : state) {
281 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
282 if (++buffer_index == num_buffers) {
283 buffer_index = 0;
284 }
285
286 const auto start = std::chrono::high_resolution_clock::now();
287 float x_max = nanf("");
288 rmax(elements * sizeof(float), x.data(), &x_max);
289 float y_sum = nanf("");
290 raddstoreexpminusmax(elements * sizeof(float), x.data(), &x_max, y.data() + packed_elements * buffer_index, &y_sum, &expminus_params);
291 const float inv_y_sum = 1.0f / y_sum;
292 vmulc(elements * sizeof(float), y.data() + packed_elements * buffer_index, &inv_y_sum, y.data() + packed_elements * buffer_index, &minmax_params);
293 const auto end = std::chrono::high_resolution_clock::now();
294
295 const auto elapsed_seconds =
296 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
297 state.SetIterationTime(elapsed_seconds.count());
298 }
299
300 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
301 if (cpu_frequency != 0) {
302 state.counters["cpufreq"] = cpu_frequency;
303 }
304
305 const size_t elements_per_iteration = elements;
306 state.counters["elements"] =
307 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
308
309 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
310 state.counters["bytes"] =
311 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
312 }
313
TwoPassSoftMax(benchmark::State & state,xnn_f32_raddextexp_ukernel_function raddextexp,xnn_f32_vscaleextexp_ukernel_function vscaleextexp,benchmark::utils::IsaCheckFunction isa_check=nullptr)314 static void TwoPassSoftMax(
315 benchmark::State& state,
316 xnn_f32_raddextexp_ukernel_function raddextexp,
317 xnn_f32_vscaleextexp_ukernel_function vscaleextexp,
318 benchmark::utils::IsaCheckFunction isa_check = nullptr)
319 {
320 if (isa_check && !isa_check(state)) {
321 return;
322 }
323
324 const size_t elements = state.range(0);
325 const size_t cache_line_size_max = 128;
326 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
327
328 std::random_device random_device;
329 auto rng = std::mt19937(random_device());
330 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
331
332 const size_t num_buffers = 1 +
333 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
334 std::vector<float> x(elements);
335 std::vector<float> y(packed_elements * num_buffers);
336
337 std::generate(x.begin(), x.end(), std::ref(f32rng));
338
339 benchmark::utils::DisableDenormals();
340
341 size_t buffer_index = 0;
342 for (auto _ : state) {
343 benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float));
344 if (++buffer_index == num_buffers) {
345 buffer_index = 0;
346 }
347
348 const auto start = std::chrono::high_resolution_clock::now();
349 float scale[2];
350 raddextexp(elements * sizeof(float), x.data(), scale);
351 vscaleextexp(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, 1.0f / scale[0], -scale[1]);
352 const auto end = std::chrono::high_resolution_clock::now();
353
354 const auto elapsed_seconds =
355 std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
356 state.SetIterationTime(elapsed_seconds.count());
357 }
358
359 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
360 if (cpu_frequency != 0) {
361 state.counters["cpufreq"] = cpu_frequency;
362 }
363
364 const size_t elements_per_iteration = elements;
365 state.counters["elements"] =
366 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
367
368 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
369 state.counters["bytes"] =
370 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
371 }
372
CharacteristicArguments(benchmark::internal::Benchmark * b)373 static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
374 for (int32_t n = 1000; n <= 100000000; n *= 10) {
375 b->Arg(n);
376 b->Arg(3 * n);
377 }
378 }
379
// Reference baseline; only built when Intel DNNL support is enabled.
#if defined(BENCHMARK_INTEL_DNNL)
  BENCHMARK(DNNLSoftArgMax)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
#endif  // defined(BENCHMARK_INTEL_DNNL)
383
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // ---- AVX2 kernels (skipped at run time unless CheckAVX2 passes) ----
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_p5,
                    xnn_f32_raddextexp_ukernel__avx2_p5_x96,
                    xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx2_p5,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
                    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx2_p5,
                    xnn_f32_rmax_ukernel__avx,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64_acc2,
                    xnn_init_f32_expminus_avx2_rr1_p5_params,
                    xnn_f32_vmulc_minmax_ukernel__avx_x16,
                    xnn_init_f32_minmax_avx_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // ---- AVX512F kernels (skipped at run time unless CheckAVX512F passes) ----
  BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef,
                    xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3,
                    xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef,
                    xnn_f32_rmax_ukernel__avx512f,
                    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
                    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef,
                    xnn_f32_rmax_ukernel__avx512f,
                    xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc2,
                    xnn_init_f32_expminus_avx512_rr1_p5_params,
                    xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
                    xnn_init_f32_minmax_scalar_params,
                    benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
419
420 #ifndef XNNPACK_BENCHMARK_NO_MAIN
421 BENCHMARK_MAIN();
422 #endif
423