1*412f47f9SXin Li /*
2*412f47f9SXin Li * Microbenchmark for math functions.
3*412f47f9SXin Li *
4*412f47f9SXin Li * Copyright (c) 2018-2023, Arm Limited.
5*412f47f9SXin Li * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li */
7*412f47f9SXin Li
8*412f47f9SXin Li #undef _GNU_SOURCE
9*412f47f9SXin Li #define _GNU_SOURCE 1
10*412f47f9SXin Li #include <stdint.h>
11*412f47f9SXin Li #include <stdlib.h>
12*412f47f9SXin Li #include <stdio.h>
13*412f47f9SXin Li #include <string.h>
14*412f47f9SXin Li #include <time.h>
15*412f47f9SXin Li #include <math.h>
16*412f47f9SXin Li #include "mathlib.h"
17*412f47f9SXin Li
/* Default number of timed measurements per benchmark; the smallest
   time observed is the one that gets reported.  */
#define MEASURE 60
/* Number of elements in the input arrays A and Af.  */
#define N 8000
/* Default number of passes over the input array per measurement.  */
#define ITER 125

/* Trace values loaded by readtrace, and how many entries Trace holds.  */
static double *Trace;
static size_t trace_size;
/* Input buffers read by the double and float benchmark loops.  */
static double A[N];
static float Af[N];
/* Run-time values of MEASURE and ITER; main overrides them for -m / -c.  */
static long measurecount = MEASURE;
static long itercount = ITER;
31*412f47f9SXin Li
#ifdef __vpcs
#include <arm_neon.h>

/* 128-bit AdvSIMD vectors: two doubles or four floats.  */
typedef float64x2_t v_double;
typedef float32x4_t v_float;

#define v_double_len() 2
#define v_float_len() 4

/* Load the two consecutive doubles starting at P.  */
static inline v_double
v_double_load (const double *p)
{
  return vld1q_f64 (p);
}

/* Broadcast X to both lanes.  */
static inline v_double
v_double_dup (double x)
{
  return vdupq_n_f64 (x);
}

/* Load the four consecutive floats starting at P.  */
static inline v_float
v_float_load (const float *p)
{
  return vld1q_f32 (p);
}

/* Broadcast X to all four lanes.  */
static inline v_float
v_float_dup (float x)
{
  return vdupq_n_f32 (x);
}
#else
/* Scalar stand-ins so that code using the v_* names still compiles when
   __vpcs is not defined.  */
typedef double v_double;
typedef float v_float;
#define v_double_len(x) 1
#define v_double_load(x) ((x)[0])
#define v_double_dup(x) (x)
#define v_float_len(x) 1
#define v_float_load(x) ((x)[0])
#define v_float_dup(x) (x)

#endif
77*412f47f9SXin Li
#if WANT_SVE_MATH
#include <arm_sve.h>

/* Scalable (SVE) predicate and vector types.  */
typedef svbool_t sv_bool;
typedef svfloat64_t sv_double;
typedef svfloat32_t sv_float;

/* Lane counts are only known at run time.  */
#define sv_double_len() svcntd()
#define sv_float_len() svcntw()

/* Load one full vector of doubles starting at P (all lanes active).  */
static inline sv_double
sv_double_load (const double *p)
{
  return svld1_f64 (svptrue_b64 (), p);
}

/* Broadcast X to every double lane.  */
static inline sv_double
sv_double_dup (double x)
{
  return svdup_n_f64 (x);
}

/* Load one full vector of floats starting at P (all lanes active).  */
static inline sv_float
sv_float_load (const float *p)
{
  return svld1_f32 (svptrue_b32 (), p);
}

/* Broadcast X to every float lane.  */
static inline sv_float
sv_float_dup (float x)
{
  return svdup_n_f32 (x);
}
#else
/* Placeholders so that bench1 compiles without SVE support.  */
#define sv_double_len(x) 1
#define sv_float_len(x) 1
#endif
119*412f47f9SXin Li
/* Identity on doubles.  Registered in funtab as "dummy"; presumably a
   baseline that exposes the cost of the benchmark loop itself.  */
static double
dummy (double x)
{
  const double unchanged = x;
  return unchanged;
}
125*412f47f9SXin Li
/* Identity on floats.  Registered in funtab as "dummyf"; presumably a
   baseline that exposes the cost of the benchmark loop itself.  */
static float
dummyf (float x)
{
  const float unchanged = x;
  return unchanged;
}
#ifdef __vpcs
/* Identity on an AdvSIMD double vector, declared with the __vpcs
   attribute like the vector routines in funtab.  */
__vpcs static v_double
__vn_dummy (v_double x)
{
  v_double unchanged = x;
  return unchanged;
}

/* Identity on an AdvSIMD float vector, declared with the __vpcs
   attribute like the vector routines in funtab.  */
__vpcs static v_float
__vn_dummyf (v_float x)
{
  v_float unchanged = x;
  return unchanged;
}
#endif
#if WANT_SVE_MATH
/* Identity on an SVE double vector; the predicate PG is accepted only to
   match the signature stored in funtab and is not used.  */
static sv_double
__sv_dummy (sv_double x, sv_bool pg)
{
  sv_double unchanged = x;
  return unchanged;
}

/* Identity on an SVE float vector; the predicate PG is accepted only to
   match the signature stored in funtab and is not used.  */
static sv_float
__sv_dummyf (sv_float x, sv_bool pg)
{
  sv_float unchanged = x;
  return unchanged;
}

#endif
158*412f47f9SXin Li
159*412f47f9SXin Li #include "test/mathbench_wrappers.h"
160*412f47f9SXin Li
161*412f47f9SXin Li static const struct fun
162*412f47f9SXin Li {
163*412f47f9SXin Li const char *name;
164*412f47f9SXin Li int prec;
165*412f47f9SXin Li int vec;
166*412f47f9SXin Li double lo;
167*412f47f9SXin Li double hi;
168*412f47f9SXin Li union
169*412f47f9SXin Li {
170*412f47f9SXin Li double (*d) (double);
171*412f47f9SXin Li float (*f) (float);
172*412f47f9SXin Li #ifdef __vpcs
173*412f47f9SXin Li __vpcs v_double (*vnd) (v_double);
174*412f47f9SXin Li __vpcs v_float (*vnf) (v_float);
175*412f47f9SXin Li #endif
176*412f47f9SXin Li #if WANT_SVE_MATH
177*412f47f9SXin Li sv_double (*svd) (sv_double, sv_bool);
178*412f47f9SXin Li sv_float (*svf) (sv_float, sv_bool);
179*412f47f9SXin Li #endif
180*412f47f9SXin Li } fun;
181*412f47f9SXin Li } funtab[] = {
182*412f47f9SXin Li #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
183*412f47f9SXin Li #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
184*412f47f9SXin Li #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
185*412f47f9SXin Li #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
186*412f47f9SXin Li #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
187*412f47f9SXin Li #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
188*412f47f9SXin Li D (dummy, 1.0, 2.0)
189*412f47f9SXin Li F (dummyf, 1.0, 2.0)
190*412f47f9SXin Li #ifdef __vpcs
191*412f47f9SXin Li VND (__vn_dummy, 1.0, 2.0)
192*412f47f9SXin Li VNF (__vn_dummyf, 1.0, 2.0)
193*412f47f9SXin Li #endif
194*412f47f9SXin Li #if WANT_SVE_MATH
195*412f47f9SXin Li SVD (__sv_dummy, 1.0, 2.0)
196*412f47f9SXin Li SVF (__sv_dummyf, 1.0, 2.0)
197*412f47f9SXin Li #endif
198*412f47f9SXin Li #include "test/mathbench_funcs.h"
199*412f47f9SXin Li {0},
200*412f47f9SXin Li #undef F
201*412f47f9SXin Li #undef D
202*412f47f9SXin Li #undef VNF
203*412f47f9SXin Li #undef VND
204*412f47f9SXin Li #undef SVF
205*412f47f9SXin Li #undef SVD
206*412f47f9SXin Li };
207*412f47f9SXin Li
208*412f47f9SXin Li static void
gen_linear(double lo,double hi)209*412f47f9SXin Li gen_linear (double lo, double hi)
210*412f47f9SXin Li {
211*412f47f9SXin Li for (int i = 0; i < N; i++)
212*412f47f9SXin Li A[i] = (lo * (N - i) + hi * i) / N;
213*412f47f9SXin Li }
214*412f47f9SXin Li
215*412f47f9SXin Li static void
genf_linear(double lo,double hi)216*412f47f9SXin Li genf_linear (double lo, double hi)
217*412f47f9SXin Li {
218*412f47f9SXin Li for (int i = 0; i < N; i++)
219*412f47f9SXin Li Af[i] = (float)(lo * (N - i) + hi * i) / N;
220*412f47f9SXin Li }
221*412f47f9SXin Li
/* Reinterpret the 64 bits of I as a double (no numeric conversion).  */
static inline double
asdouble (uint64_t i)
{
  double value;
  memcpy (&value, &i, sizeof value);
  return value;
}
232*412f47f9SXin Li
/* State of the pseudo-random generator behind frand.  The fixed initial
   value makes every run produce the same sequence.  */
static uint64_t seed = 0x0123456789abcdef;

/* Advance the generator one step and return LO + (HI - LO) * u, where u
   is a double in [0, 1) built from the upper 52 bits of the new state.  */
static double
frand (double lo, double hi)
{
  seed = 6364136223846793005ULL * seed + 1;
  /* Exponent field 0x3ff places the value in [1, 2); the state's upper
     bits become the mantissa.  */
  const uint64_t bits = (seed >> 12) | (0x3ffULL << 52);
  const double unit = asdouble (bits) - 1.0;
  return lo + (hi - lo) * unit;
}
241*412f47f9SXin Li
242*412f47f9SXin Li static void
gen_rand(double lo,double hi)243*412f47f9SXin Li gen_rand (double lo, double hi)
244*412f47f9SXin Li {
245*412f47f9SXin Li for (int i = 0; i < N; i++)
246*412f47f9SXin Li A[i] = frand (lo, hi);
247*412f47f9SXin Li }
248*412f47f9SXin Li
249*412f47f9SXin Li static void
genf_rand(double lo,double hi)250*412f47f9SXin Li genf_rand (double lo, double hi)
251*412f47f9SXin Li {
252*412f47f9SXin Li for (int i = 0; i < N; i++)
253*412f47f9SXin Li Af[i] = (float)frand (lo, hi);
254*412f47f9SXin Li }
255*412f47f9SXin Li
256*412f47f9SXin Li static void
gen_trace(int index)257*412f47f9SXin Li gen_trace (int index)
258*412f47f9SXin Li {
259*412f47f9SXin Li for (int i = 0; i < N; i++)
260*412f47f9SXin Li A[i] = Trace[index + i];
261*412f47f9SXin Li }
262*412f47f9SXin Li
263*412f47f9SXin Li static void
genf_trace(int index)264*412f47f9SXin Li genf_trace (int index)
265*412f47f9SXin Li {
266*412f47f9SXin Li for (int i = 0; i < N; i++)
267*412f47f9SXin Li Af[i] = (float)Trace[index + i];
268*412f47f9SXin Li }
269*412f47f9SXin Li
270*412f47f9SXin Li static void
run_thruput(double f (double))271*412f47f9SXin Li run_thruput (double f (double))
272*412f47f9SXin Li {
273*412f47f9SXin Li for (int i = 0; i < N; i++)
274*412f47f9SXin Li f (A[i]);
275*412f47f9SXin Li }
276*412f47f9SXin Li
277*412f47f9SXin Li static void
runf_thruput(float f (float))278*412f47f9SXin Li runf_thruput (float f (float))
279*412f47f9SXin Li {
280*412f47f9SXin Li for (int i = 0; i < N; i++)
281*412f47f9SXin Li f (Af[i]);
282*412f47f9SXin Li }
283*412f47f9SXin Li
284*412f47f9SXin Li volatile double zero = 0;
285*412f47f9SXin Li
286*412f47f9SXin Li static void
run_latency(double f (double))287*412f47f9SXin Li run_latency (double f (double))
288*412f47f9SXin Li {
289*412f47f9SXin Li double z = zero;
290*412f47f9SXin Li double prev = z;
291*412f47f9SXin Li for (int i = 0; i < N; i++)
292*412f47f9SXin Li prev = f (A[i] + prev * z);
293*412f47f9SXin Li }
294*412f47f9SXin Li
295*412f47f9SXin Li static void
runf_latency(float f (float))296*412f47f9SXin Li runf_latency (float f (float))
297*412f47f9SXin Li {
298*412f47f9SXin Li float z = (float)zero;
299*412f47f9SXin Li float prev = z;
300*412f47f9SXin Li for (int i = 0; i < N; i++)
301*412f47f9SXin Li prev = f (Af[i] + prev * z);
302*412f47f9SXin Li }
303*412f47f9SXin Li
#ifdef __vpcs
/* Call F on A one AdvSIMD double vector at a time, discarding results.  */
static void
run_vn_thruput (__vpcs v_double f (v_double))
{
  const int lanes = v_double_len ();
  for (int base = 0; base < N; base += lanes)
    f (v_double_load (A + base));
}

/* Call F on Af one AdvSIMD float vector at a time, discarding results.  */
static void
runf_vn_thruput (__vpcs v_float f (v_float))
{
  const int lanes = v_float_len ();
  for (int base = 0; base < N; base += lanes)
    f (v_float_load (Af + base));
}

/* Call F on A one vector at a time, routing each argument through a
   bitwise select with the previous result.  The mask is all zero bits,
   read through a volatile object, so the freshly loaded vector is what
   reaches F while the call still depends on the previous result.  */
static void
run_vn_latency (__vpcs v_double f (v_double))
{
  volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
  const uint64x2_t mask = vsel;
  const int lanes = v_double_len ();
  v_double chain = v_double_dup (0);
  for (int base = 0; base < N; base += lanes)
    {
      const v_double input = v_double_load (A + base);
      chain = f (vbslq_f64 (mask, chain, input));
    }
}

/* Float counterpart of run_vn_latency, reading from Af.  */
static void
runf_vn_latency (__vpcs v_float f (v_float))
{
  volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
  const uint32x4_t mask = vsel;
  const int lanes = v_float_len ();
  v_float chain = v_float_dup (0);
  for (int base = 0; base < N; base += lanes)
    {
      const v_float input = v_float_load (Af + base);
      chain = f (vbslq_f32 (mask, chain, input));
    }
}
#endif
339*412f47f9SXin Li
#if WANT_SVE_MATH
/* Call F on A one SVE double vector at a time with an all-true
   predicate, discarding the results.  */
static void
run_sv_thruput (sv_double f (sv_double, sv_bool))
{
  int base = 0;
  while (base < N)
    {
      f (sv_double_load (A + base), svptrue_b64 ());
      base += sv_double_len ();
    }
}

/* Call F on Af one SVE float vector at a time with an all-true
   predicate, discarding the results.  */
static void
runf_sv_thruput (sv_float f (sv_float, sv_bool))
{
  int base = 0;
  while (base < N)
    {
      f (sv_float_load (Af + base), svptrue_b32 ());
      base += sv_float_len ();
    }
}

/* Call F on A one vector at a time, routing each argument through a
   select with the previous result.  The predicate is all true, read
   through a volatile object, so the freshly loaded vector is what
   reaches F while the call still depends on the previous result.  */
static void
run_sv_latency (sv_double f (sv_double, sv_bool))
{
  volatile sv_bool vsel = svptrue_b64 ();
  sv_bool pick = vsel;
  sv_double chain = sv_double_dup (0);
  int base = 0;
  while (base < N)
    {
      chain = f (svsel_f64 (pick, sv_double_load (A + base), chain),
                 svptrue_b64 ());
      base += sv_double_len ();
    }
}

/* Float counterpart of run_sv_latency, reading from Af.  */
static void
runf_sv_latency (sv_float f (sv_float, sv_bool))
{
  volatile sv_bool vsel = svptrue_b32 ();
  sv_bool pick = vsel;
  sv_float chain = sv_float_dup (0);
  int base = 0;
  while (base < N)
    {
      chain = f (svsel_f32 (pick, sv_float_load (Af + base), chain),
                 svptrue_b32 ());
      base += sv_float_len ();
    }
}
#endif
375*412f47f9SXin Li
376*412f47f9SXin Li static uint64_t
tic(void)377*412f47f9SXin Li tic (void)
378*412f47f9SXin Li {
379*412f47f9SXin Li struct timespec ts;
380*412f47f9SXin Li if (clock_gettime (CLOCK_REALTIME, &ts))
381*412f47f9SXin Li abort ();
382*412f47f9SXin Li return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
383*412f47f9SXin Li }
384*412f47f9SXin Li
/* Time RUN (F) and store the best result in the caller's uint64_t `dt'.

   RUN (F) is executed once to warm up, then measurecount measurements
   are taken, each timing itercount back-to-back executions with tic ().
   dt ends up as the smallest elapsed time seen, in tic () units; it
   stays at the maximum uint64_t value if measurecount is not positive.

   The loop counters are long, matching measurecount and itercount, so
   that counts above INT_MAX cannot overflow the counter.  */
#define TIMEIT(run, f) do { \
  dt = -1; \
  run (f); /* Warm up.  */ \
  for (long j = 0; j < measurecount; j++) \
    { \
      uint64_t t0 = tic (); \
      for (long i = 0; i < itercount; i++) \
        run (f); \
      uint64_t t1 = tic (); \
      if (t1 - t0 < dt) \
        dt = t1 - t0; \
    } \
} while (0)
398*412f47f9SXin Li
399*412f47f9SXin Li static void
bench1(const struct fun * f,int type,double lo,double hi)400*412f47f9SXin Li bench1 (const struct fun *f, int type, double lo, double hi)
401*412f47f9SXin Li {
402*412f47f9SXin Li uint64_t dt = 0;
403*412f47f9SXin Li uint64_t ns100;
404*412f47f9SXin Li const char *s = type == 't' ? "rthruput" : "latency";
405*412f47f9SXin Li int vlen = 1;
406*412f47f9SXin Li
407*412f47f9SXin Li if (f->vec == 'n')
408*412f47f9SXin Li vlen = f->prec == 'd' ? v_double_len() : v_float_len();
409*412f47f9SXin Li else if (f->vec == 's')
410*412f47f9SXin Li vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
411*412f47f9SXin Li
412*412f47f9SXin Li if (f->prec == 'd' && type == 't' && f->vec == 0)
413*412f47f9SXin Li TIMEIT (run_thruput, f->fun.d);
414*412f47f9SXin Li else if (f->prec == 'd' && type == 'l' && f->vec == 0)
415*412f47f9SXin Li TIMEIT (run_latency, f->fun.d);
416*412f47f9SXin Li else if (f->prec == 'f' && type == 't' && f->vec == 0)
417*412f47f9SXin Li TIMEIT (runf_thruput, f->fun.f);
418*412f47f9SXin Li else if (f->prec == 'f' && type == 'l' && f->vec == 0)
419*412f47f9SXin Li TIMEIT (runf_latency, f->fun.f);
420*412f47f9SXin Li #ifdef __vpcs
421*412f47f9SXin Li else if (f->prec == 'd' && type == 't' && f->vec == 'n')
422*412f47f9SXin Li TIMEIT (run_vn_thruput, f->fun.vnd);
423*412f47f9SXin Li else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
424*412f47f9SXin Li TIMEIT (run_vn_latency, f->fun.vnd);
425*412f47f9SXin Li else if (f->prec == 'f' && type == 't' && f->vec == 'n')
426*412f47f9SXin Li TIMEIT (runf_vn_thruput, f->fun.vnf);
427*412f47f9SXin Li else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
428*412f47f9SXin Li TIMEIT (runf_vn_latency, f->fun.vnf);
429*412f47f9SXin Li #endif
430*412f47f9SXin Li #if WANT_SVE_MATH
431*412f47f9SXin Li else if (f->prec == 'd' && type == 't' && f->vec == 's')
432*412f47f9SXin Li TIMEIT (run_sv_thruput, f->fun.svd);
433*412f47f9SXin Li else if (f->prec == 'd' && type == 'l' && f->vec == 's')
434*412f47f9SXin Li TIMEIT (run_sv_latency, f->fun.svd);
435*412f47f9SXin Li else if (f->prec == 'f' && type == 't' && f->vec == 's')
436*412f47f9SXin Li TIMEIT (runf_sv_thruput, f->fun.svf);
437*412f47f9SXin Li else if (f->prec == 'f' && type == 'l' && f->vec == 's')
438*412f47f9SXin Li TIMEIT (runf_sv_latency, f->fun.svf);
439*412f47f9SXin Li #endif
440*412f47f9SXin Li
441*412f47f9SXin Li if (type == 't')
442*412f47f9SXin Li {
443*412f47f9SXin Li ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
444*412f47f9SXin Li printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
445*412f47f9SXin Li f->name, s,
446*412f47f9SXin Li (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
447*412f47f9SXin Li (unsigned long long) dt, lo, hi, vlen);
448*412f47f9SXin Li }
449*412f47f9SXin Li else if (type == 'l')
450*412f47f9SXin Li {
451*412f47f9SXin Li ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
452*412f47f9SXin Li printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
453*412f47f9SXin Li f->name, s,
454*412f47f9SXin Li (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
455*412f47f9SXin Li (unsigned long long) dt, lo, hi, vlen);
456*412f47f9SXin Li }
457*412f47f9SXin Li fflush (stdout);
458*412f47f9SXin Li }
459*412f47f9SXin Li
460*412f47f9SXin Li static void
bench(const struct fun * f,double lo,double hi,int type,int gen)461*412f47f9SXin Li bench (const struct fun *f, double lo, double hi, int type, int gen)
462*412f47f9SXin Li {
463*412f47f9SXin Li if (f->prec == 'd' && gen == 'r')
464*412f47f9SXin Li gen_rand (lo, hi);
465*412f47f9SXin Li else if (f->prec == 'd' && gen == 'l')
466*412f47f9SXin Li gen_linear (lo, hi);
467*412f47f9SXin Li else if (f->prec == 'd' && gen == 't')
468*412f47f9SXin Li gen_trace (0);
469*412f47f9SXin Li else if (f->prec == 'f' && gen == 'r')
470*412f47f9SXin Li genf_rand (lo, hi);
471*412f47f9SXin Li else if (f->prec == 'f' && gen == 'l')
472*412f47f9SXin Li genf_linear (lo, hi);
473*412f47f9SXin Li else if (f->prec == 'f' && gen == 't')
474*412f47f9SXin Li genf_trace (0);
475*412f47f9SXin Li
476*412f47f9SXin Li if (gen == 't')
477*412f47f9SXin Li hi = trace_size / N;
478*412f47f9SXin Li
479*412f47f9SXin Li if (type == 'b' || type == 't')
480*412f47f9SXin Li bench1 (f, 't', lo, hi);
481*412f47f9SXin Li
482*412f47f9SXin Li if (type == 'b' || type == 'l')
483*412f47f9SXin Li bench1 (f, 'l', lo, hi);
484*412f47f9SXin Li
485*412f47f9SXin Li for (int i = N; i < trace_size; i += N)
486*412f47f9SXin Li {
487*412f47f9SXin Li if (f->prec == 'd')
488*412f47f9SXin Li gen_trace (i);
489*412f47f9SXin Li else
490*412f47f9SXin Li genf_trace (i);
491*412f47f9SXin Li
492*412f47f9SXin Li lo = i / N;
493*412f47f9SXin Li if (type == 'b' || type == 't')
494*412f47f9SXin Li bench1 (f, 't', lo, hi);
495*412f47f9SXin Li
496*412f47f9SXin Li if (type == 'b' || type == 'l')
497*412f47f9SXin Li bench1 (f, 'l', lo, hi);
498*412f47f9SXin Li }
499*412f47f9SXin Li }
500*412f47f9SXin Li
501*412f47f9SXin Li static void
readtrace(const char * name)502*412f47f9SXin Li readtrace (const char *name)
503*412f47f9SXin Li {
504*412f47f9SXin Li int n = 0;
505*412f47f9SXin Li FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
506*412f47f9SXin Li if (!f)
507*412f47f9SXin Li {
508*412f47f9SXin Li printf ("openning \"%s\" failed: %m\n", name);
509*412f47f9SXin Li exit (1);
510*412f47f9SXin Li }
511*412f47f9SXin Li for (;;)
512*412f47f9SXin Li {
513*412f47f9SXin Li if (n >= trace_size)
514*412f47f9SXin Li {
515*412f47f9SXin Li trace_size += N;
516*412f47f9SXin Li Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
517*412f47f9SXin Li if (Trace == NULL)
518*412f47f9SXin Li {
519*412f47f9SXin Li printf ("out of memory\n");
520*412f47f9SXin Li exit (1);
521*412f47f9SXin Li }
522*412f47f9SXin Li }
523*412f47f9SXin Li if (fscanf (f, "%lf", Trace + n) != 1)
524*412f47f9SXin Li break;
525*412f47f9SXin Li n++;
526*412f47f9SXin Li }
527*412f47f9SXin Li if (ferror (f) || n == 0)
528*412f47f9SXin Li {
529*412f47f9SXin Li printf ("reading \"%s\" failed: %m\n", name);
530*412f47f9SXin Li exit (1);
531*412f47f9SXin Li }
532*412f47f9SXin Li fclose (f);
533*412f47f9SXin Li if (n % N == 0)
534*412f47f9SXin Li trace_size = n;
535*412f47f9SXin Li for (int i = 0; n < trace_size; n++, i++)
536*412f47f9SXin Li Trace[n] = Trace[i];
537*412f47f9SXin Li }
538*412f47f9SXin Li
539*412f47f9SXin Li static void
usage(void)540*412f47f9SXin Li usage (void)
541*412f47f9SXin Li {
542*412f47f9SXin Li printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
543*412f47f9SXin Li "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
544*412f47f9SXin Li "[func2 ..]\n");
545*412f47f9SXin Li printf ("func:\n");
546*412f47f9SXin Li printf ("%7s [run all benchmarks]\n", "all");
547*412f47f9SXin Li for (const struct fun *f = funtab; f->name; f++)
548*412f47f9SXin Li printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
549*412f47f9SXin Li exit (1);
550*412f47f9SXin Li }
551*412f47f9SXin Li
552*412f47f9SXin Li int
main(int argc,char * argv[])553*412f47f9SXin Li main (int argc, char *argv[])
554*412f47f9SXin Li {
555*412f47f9SXin Li int usergen = 0, gen = 'r', type = 'b', all = 0;
556*412f47f9SXin Li double lo = 0, hi = 0;
557*412f47f9SXin Li const char *tracefile = "-";
558*412f47f9SXin Li
559*412f47f9SXin Li argv++;
560*412f47f9SXin Li argc--;
561*412f47f9SXin Li for (;;)
562*412f47f9SXin Li {
563*412f47f9SXin Li if (argc <= 0)
564*412f47f9SXin Li usage ();
565*412f47f9SXin Li if (argv[0][0] != '-')
566*412f47f9SXin Li break;
567*412f47f9SXin Li else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
568*412f47f9SXin Li {
569*412f47f9SXin Li usergen = 1;
570*412f47f9SXin Li lo = strtod (argv[1], 0);
571*412f47f9SXin Li hi = strtod (argv[2], 0);
572*412f47f9SXin Li argv += 3;
573*412f47f9SXin Li argc -= 3;
574*412f47f9SXin Li }
575*412f47f9SXin Li else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
576*412f47f9SXin Li {
577*412f47f9SXin Li measurecount = strtol (argv[1], 0, 0);
578*412f47f9SXin Li argv += 2;
579*412f47f9SXin Li argc -= 2;
580*412f47f9SXin Li }
581*412f47f9SXin Li else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
582*412f47f9SXin Li {
583*412f47f9SXin Li itercount = strtol (argv[1], 0, 0);
584*412f47f9SXin Li argv += 2;
585*412f47f9SXin Li argc -= 2;
586*412f47f9SXin Li }
587*412f47f9SXin Li else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
588*412f47f9SXin Li {
589*412f47f9SXin Li gen = argv[1][0];
590*412f47f9SXin Li if (strchr ("rlt", gen) == 0)
591*412f47f9SXin Li usage ();
592*412f47f9SXin Li argv += 2;
593*412f47f9SXin Li argc -= 2;
594*412f47f9SXin Li }
595*412f47f9SXin Li else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
596*412f47f9SXin Li {
597*412f47f9SXin Li gen = 't'; /* -f implies -g trace. */
598*412f47f9SXin Li tracefile = argv[1];
599*412f47f9SXin Li argv += 2;
600*412f47f9SXin Li argc -= 2;
601*412f47f9SXin Li }
602*412f47f9SXin Li else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
603*412f47f9SXin Li {
604*412f47f9SXin Li type = argv[1][0];
605*412f47f9SXin Li if (strchr ("ltb", type) == 0)
606*412f47f9SXin Li usage ();
607*412f47f9SXin Li argv += 2;
608*412f47f9SXin Li argc -= 2;
609*412f47f9SXin Li }
610*412f47f9SXin Li else
611*412f47f9SXin Li usage ();
612*412f47f9SXin Li }
613*412f47f9SXin Li if (gen == 't')
614*412f47f9SXin Li {
615*412f47f9SXin Li readtrace (tracefile);
616*412f47f9SXin Li lo = hi = 0;
617*412f47f9SXin Li usergen = 1;
618*412f47f9SXin Li }
619*412f47f9SXin Li while (argc > 0)
620*412f47f9SXin Li {
621*412f47f9SXin Li int found = 0;
622*412f47f9SXin Li all = strcmp (argv[0], "all") == 0;
623*412f47f9SXin Li for (const struct fun *f = funtab; f->name; f++)
624*412f47f9SXin Li if (all || strcmp (argv[0], f->name) == 0)
625*412f47f9SXin Li {
626*412f47f9SXin Li found = 1;
627*412f47f9SXin Li if (!usergen)
628*412f47f9SXin Li {
629*412f47f9SXin Li lo = f->lo;
630*412f47f9SXin Li hi = f->hi;
631*412f47f9SXin Li }
632*412f47f9SXin Li bench (f, lo, hi, type, gen);
633*412f47f9SXin Li if (usergen && !all)
634*412f47f9SXin Li break;
635*412f47f9SXin Li }
636*412f47f9SXin Li if (!found)
637*412f47f9SXin Li printf ("unknown function: %s\n", argv[0]);
638*412f47f9SXin Li argv++;
639*412f47f9SXin Li argc--;
640*412f47f9SXin Li }
641*412f47f9SXin Li return 0;
642*412f47f9SXin Li }
643