1 #include <benchmark/benchmark.h>
2
3 #include <fp16.h>
4 #ifndef EMSCRIPTEN
5 #include <fp16/psimd.h>
6 #endif
7
8 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
9 #include <immintrin.h>
10 #endif
11
12 #ifdef FP16_COMPARATIVE_BENCHMARKS
13 #include <third-party/THHalf.h>
14 #include <third-party/npy-halffloat.h>
15 #include <third-party/eigen-half.h>
16 #include <third-party/float16-compressor.h>
17 #include <third-party/half.hpp>
18 #endif
19
/*
 * Advances a 16-bit xorshift state by one step.
 *
 * Applies three shift-and-xor rounds (right by 8, left by 9, right by 5) and
 * returns the resulting state. Each round is narrowed back to 16 bits, so bits
 * shifted above bit 15 are discarded. A zero state maps to zero.
 */
static inline uint16_t next_xorshift16(uint16_t x) {
	x = static_cast<uint16_t>(x ^ (x >> 8));
	x = static_cast<uint16_t>(x ^ (x << 9));
	return static_cast<uint16_t>(x ^ (x >> 5));
}
26
/*
 * Advances a 32-bit xorshift state by one step.
 *
 * Applies three shift-and-xor rounds (right by 13, left by 17, right by 5) and
 * returns the resulting state. A zero state maps to zero.
 */
static inline uint32_t next_xorshift32(uint32_t x) {
	x = x ^ (x >> 13);
	x = x ^ (x << 17);
	return x ^ (x >> 5);
}
33 #ifndef EMSCRIPTEN
next_xorshift16_psimd(psimd_u16 x)34 PSIMD_INTRINSIC psimd_u16 next_xorshift16_psimd(psimd_u16 x) {
35 x ^= x >> psimd_splat_u16(8);
36 x ^= x << psimd_splat_u16(9);
37 x ^= x >> psimd_splat_u16(5);
38 return x;
39 }
40 #endif
41
42
43 /* Conversion from IEEE FP16 to IEEE FP32 */
44
fp16_ieee_to_fp32_bits(benchmark::State & state)45 static void fp16_ieee_to_fp32_bits(benchmark::State& state) {
46 uint16_t fp16 = UINT16_C(0x7C00);
47 while (state.KeepRunning()) {
48 const uint32_t fp32 = fp16_ieee_to_fp32_bits(fp16);
49
50 fp16 = next_xorshift16(fp16);
51 benchmark::DoNotOptimize(fp32);
52 }
53 }
54 BENCHMARK(fp16_ieee_to_fp32_bits);
55
fp16_ieee_to_fp32_value(benchmark::State & state)56 static void fp16_ieee_to_fp32_value(benchmark::State& state) {
57 uint16_t fp16 = UINT16_C(0x7C00);
58 while (state.KeepRunning()) {
59 const float fp32 = fp16_ieee_to_fp32_value(fp16);
60
61 fp16 = next_xorshift16(fp16);
62 benchmark::DoNotOptimize(fp32);
63 }
64 }
65 BENCHMARK(fp16_ieee_to_fp32_value);
66
67 #ifndef EMSCRIPTEN
fp16_ieee_to_fp32_psimd(benchmark::State & state)68 static void fp16_ieee_to_fp32_psimd(benchmark::State& state) {
69 psimd_u16 fp16 = (psimd_u16) { 0x7C00, 0x7C01, 0x7C02, 0x7C03 };
70 while (state.KeepRunning()) {
71 const psimd_f32 fp32 = fp16_ieee_to_fp32_psimd(fp16);
72
73 fp16 = next_xorshift16_psimd(fp16);
74 benchmark::DoNotOptimize(fp32);
75 }
76 }
77 BENCHMARK(fp16_ieee_to_fp32_psimd);
78
fp16_ieee_to_fp32x2_psimd(benchmark::State & state)79 static void fp16_ieee_to_fp32x2_psimd(benchmark::State& state) {
80 psimd_u16 fp16 =
81 (psimd_u16) { 0x7C00, 0x7C01, 0x7C02, 0x7C03, 0x7C04, 0x7C05, 0x7C06, 0x7C07 };
82 while (state.KeepRunning()) {
83 const psimd_f32x2 fp32 = fp16_ieee_to_fp32x2_psimd(fp16);
84
85 fp16 = next_xorshift16_psimd(fp16);
86 benchmark::DoNotOptimize(fp32);
87 }
88 }
89 BENCHMARK(fp16_ieee_to_fp32x2_psimd);
90 #endif
91
92 #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_halfbits2float(benchmark::State & state)93 static void TH_halfbits2float(benchmark::State& state) {
94 uint16_t fp16 = UINT16_C(0x7C00);
95 while (state.KeepRunning()) {
96 float fp32;
97 TH_halfbits2float(&fp16, &fp32);
98
99 fp16 = next_xorshift16(fp16);
100 benchmark::DoNotOptimize(fp32);
101 }
102 }
103 BENCHMARK(TH_halfbits2float);
104
npy_halfbits_to_floatbits(benchmark::State & state)105 static void npy_halfbits_to_floatbits(benchmark::State& state) {
106 uint16_t fp16 = UINT16_C(0x7C00);
107 while (state.KeepRunning()) {
108 const uint32_t fp32 = npy_halfbits_to_floatbits(fp16);
109
110 fp16 = next_xorshift16(fp16);
111 benchmark::DoNotOptimize(fp32);
112 }
113 }
114 BENCHMARK(npy_halfbits_to_floatbits);
115
Eigen_half_to_float(benchmark::State & state)116 static void Eigen_half_to_float(benchmark::State& state) {
117 uint16_t fp16 = UINT16_C(0x7C00);
118 while (state.KeepRunning()) {
119 const float fp32 =
120 Eigen::half_impl::half_to_float(
121 Eigen::half_impl::raw_uint16_to_half(fp16));
122
123 fp16 = next_xorshift16(fp16);
124 benchmark::DoNotOptimize(fp32);
125 }
126 }
127 BENCHMARK(Eigen_half_to_float);
128
Float16Compressor_decompress(benchmark::State & state)129 static void Float16Compressor_decompress(benchmark::State& state) {
130 uint16_t fp16 = UINT16_C(0x7C00);
131 while (state.KeepRunning()) {
132 const float fp32 = Float16Compressor::decompress(fp16);
133
134 fp16 = next_xorshift16(fp16);
135 benchmark::DoNotOptimize(fp32);
136 }
137 }
138 BENCHMARK(Float16Compressor_decompress);
139
half_float_detail_half2float_table(benchmark::State & state)140 static void half_float_detail_half2float_table(benchmark::State& state) {
141 uint16_t fp16 = UINT16_C(0x7C00);
142 while (state.KeepRunning()) {
143 const float fp32 =
144 half_float::detail::half2float_impl(fp16,
145 half_float::detail::true_type());
146
147 fp16 = next_xorshift16(fp16);
148 benchmark::DoNotOptimize(fp32);
149 }
150 }
151 BENCHMARK(half_float_detail_half2float_table);
152
half_float_detail_half2float_branch(benchmark::State & state)153 static void half_float_detail_half2float_branch(benchmark::State& state) {
154 uint16_t fp16 = UINT16_C(0x7C00);
155 while (state.KeepRunning()) {
156 const float fp32 =
157 half_float::detail::half2float_impl(fp16,
158 half_float::detail::false_type());
159
160 fp16 = next_xorshift16(fp16);
161 benchmark::DoNotOptimize(fp32);
162 }
163 }
164 BENCHMARK(half_float_detail_half2float_branch);
165 #endif
166
167 /* Conversion from IEEE FP32 to IEEE FP16 */
168
fp16_ieee_from_fp32_value(benchmark::State & state)169 static void fp16_ieee_from_fp32_value(benchmark::State& state) {
170 uint32_t fp32 = UINT32_C(0x7F800000);
171 while (state.KeepRunning()) {
172 const uint16_t fp16 = fp16_ieee_from_fp32_value(fp32_from_bits(fp32));
173
174 fp32 = next_xorshift32(fp32);
175 benchmark::DoNotOptimize(fp16);
176 }
177 }
178 BENCHMARK(fp16_ieee_from_fp32_value);
179
180 #if (defined(__i386__) || defined(__x86_64__)) && defined(__F16C__)
fp16_ieee_from_fp32_hardware(benchmark::State & state)181 static void fp16_ieee_from_fp32_hardware(benchmark::State& state) {
182 uint32_t fp32 = UINT32_C(0x7F800000);
183 while (state.KeepRunning()) {
184 const uint16_t fp16 = static_cast<uint16_t>(
185 _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(fp32), _MM_FROUND_CUR_DIRECTION)));
186
187 fp32 = next_xorshift32(fp32);
188 benchmark::DoNotOptimize(fp16);
189 }
190 }
191 BENCHMARK(fp16_ieee_from_fp32_hardware);
192 #endif
193
194 #ifdef FP16_COMPARATIVE_BENCHMARKS
TH_float2halfbits(benchmark::State & state)195 static void TH_float2halfbits(benchmark::State& state) {
196 uint32_t fp32 = UINT32_C(0x7F800000);
197 while (state.KeepRunning()) {
198 uint16_t fp16;
199 float fp32_value = fp32_from_bits(fp32);
200 TH_float2halfbits(&fp32_value, &fp16);
201
202 fp32 = next_xorshift32(fp32);
203 benchmark::DoNotOptimize(fp16);
204 }
205 }
206 BENCHMARK(TH_float2halfbits);
207
npy_floatbits_to_halfbits(benchmark::State & state)208 static void npy_floatbits_to_halfbits(benchmark::State& state) {
209 uint32_t fp32 = UINT32_C(0x7F800000);
210 while (state.KeepRunning()) {
211 const uint16_t fp16 = npy_floatbits_to_halfbits(fp32);
212
213 fp32 = next_xorshift32(fp32);
214 benchmark::DoNotOptimize(fp16);
215 }
216 }
217 BENCHMARK(npy_floatbits_to_halfbits);
218
Eigen_float_to_half_rtne(benchmark::State & state)219 static void Eigen_float_to_half_rtne(benchmark::State& state) {
220 uint32_t fp32 = UINT32_C(0x7F800000);
221 while (state.KeepRunning()) {
222 const Eigen::half_impl::__half fp16 =
223 Eigen::half_impl::float_to_half_rtne(
224 fp32_from_bits(fp32));
225
226 fp32 = next_xorshift32(fp32);
227 benchmark::DoNotOptimize(fp16);
228 }
229 }
230 BENCHMARK(Eigen_float_to_half_rtne);
231
Float16Compressor_compress(benchmark::State & state)232 static void Float16Compressor_compress(benchmark::State& state) {
233 uint32_t fp32 = UINT32_C(0x7F800000);
234 while (state.KeepRunning()) {
235 const uint16_t fp16 = Float16Compressor::compress(fp32_from_bits(fp32));
236
237 fp32 = next_xorshift32(fp32);
238 benchmark::DoNotOptimize(fp16);
239 }
240 }
241 BENCHMARK(Float16Compressor_compress);
242
half_float_detail_float2half_table(benchmark::State & state)243 static void half_float_detail_float2half_table(benchmark::State& state) {
244 uint32_t fp32 = UINT32_C(0x7F800000);
245 while (state.KeepRunning()) {
246 const uint16_t fp16 =
247 half_float::detail::float2half_impl<std::round_to_nearest>(
248 fp32_from_bits(fp32),
249 half_float::detail::true_type());
250
251 fp32 = next_xorshift32(fp32);
252 benchmark::DoNotOptimize(fp16);
253 }
254 }
255 BENCHMARK(half_float_detail_float2half_table);
256
half_float_detail_float2half_branch(benchmark::State & state)257 static void half_float_detail_float2half_branch(benchmark::State& state) {
258 uint32_t fp32 = UINT32_C(0x7F800000);
259 while (state.KeepRunning()) {
260 const uint16_t fp16 =
261 half_float::detail::float2half_impl<std::round_to_nearest>(
262 fp32_from_bits(fp32),
263 half_float::detail::false_type());
264
265 fp32 = next_xorshift32(fp32);
266 benchmark::DoNotOptimize(fp16);
267 }
268 }
269 BENCHMARK(half_float_detail_float2half_branch);
270 #endif
271
/* Google Benchmark macro that supplies the program's main() and runs the benchmarks registered above with BENCHMARK(). */
BENCHMARK_MAIN();
273