/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_TESTS_CHECKASM_CHECKASM_H
#define DAV1D_TESTS_CHECKASM_CHECKASM_H

#include "config.h"

#include <stdint.h>
#include <stdlib.h>

#ifdef _WIN32
#include <windows.h>
#if ARCH_X86_32
#include <setjmp.h>
typedef jmp_buf checkasm_context;
#define checkasm_save_context() setjmp(checkasm_context_buf)
#define checkasm_load_context() longjmp(checkasm_context_buf, 1)
#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
/* setjmp/longjmp on Windows on architectures using SEH (all except x86_32)
 * will try to use SEH to unwind the stack, which doesn't work for assembly
 * functions without unwind information. */
typedef struct { CONTEXT c; int status; } checkasm_context;
#define checkasm_save_context() \
    (checkasm_context_buf.status = 0, \
     RtlCaptureContext(&checkasm_context_buf.c), \
     checkasm_context_buf.status)
#define checkasm_load_context() \
    (checkasm_context_buf.status = 1, \
     RtlRestoreContext(&checkasm_context_buf.c, NULL))
#else
typedef void* checkasm_context;
#define checkasm_save_context() 0
#define checkasm_load_context() do {} while (0)
#endif
#else
#include <setjmp.h>
typedef sigjmp_buf checkasm_context;
#define checkasm_save_context() sigsetjmp(checkasm_context_buf, 1)
#define checkasm_load_context() siglongjmp(checkasm_context_buf, 1)
#endif

#include "include/common/attributes.h"
#include "include/common/bitdepth.h"
#include "include/common/intops.h"

#if ARCH_ARM
#include "src/arm/arm-arch.h"
#endif

int xor128_rand(void);
#define rnd xor128_rand

#define decl_check_bitfns(name) \
name##_8bpc(void); \
name##_16bpc(void)

void checkasm_check_msac(void);
void checkasm_check_pal(void);
void checkasm_check_refmvs(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_filmgrain);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);
decl_check_bitfns(void checkasm_check_loopfilter);
decl_check_bitfns(void checkasm_check_looprestoration);
decl_check_bitfns(void checkasm_check_mc);

void *checkasm_check_func(void *func, const char *name, ...);
int checkasm_bench_func(void);
int checkasm_fail_func(const char *msg, ...);
void checkasm_update_bench(int iterations, uint64_t cycles);
void checkasm_report(const char *name, ...);
void checkasm_set_signal_handler_state(int enabled);
void checkasm_handle_signal(void);
extern checkasm_context checkasm_context_buf;

/* float compare utilities */
int float_near_ulp(float a, float b, unsigned max_ulp);
int float_near_abs_eps(float a, float b, float eps);
int float_near_abs_eps_ulp(float a, float b, float eps, unsigned max_ulp);
int float_near_ulp_array(const float *a, const float *b, unsigned max_ulp,
                         int len);
int float_near_abs_eps_array(const float *a, const float *b, float eps,
                             int len);
int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
                                 unsigned max_ulp, int len);
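
/* Illustrative (hypothetical) use: verify that computed floats are acceptably
 * close to the reference values, using a ULP bound, an absolute epsilon, or a
 * combination of the two (the variable names are invented for illustration):
 *
 *     if (!float_near_ulp(ref_val, new_val, 2)) fail();
 *     if (!float_near_abs_eps_array(ref_buf, new_buf, 1e-6f, len)) fail();
 */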

#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */

/* Decide whether or not the specified function needs to be tested */
#define check_func(func, ...)\
    (func_ref = checkasm_check_func((func_new = func), __VA_ARGS__))

/* Declare the function prototype. The first argument is the return value,
 * the remaining arguments are the function parameters. Naming parameters
 * is optional. */
#define declare_func(ret, ...)\
    declare_new(ret, __VA_ARGS__)\
    void *func_ref, *func_new;\
    typedef ret func_type(__VA_ARGS__);\
    if (checkasm_save_context()) checkasm_handle_signal()

/* Indicate that the current test has failed */
#define fail() checkasm_fail_func("%s:%d", __FILE__, __LINE__)

/* Print the test outcome */
#define report checkasm_report

/* Call the reference function */
#define call_ref(...)\
    (checkasm_set_signal_handler_state(1),\
     ((func_type *)func_ref)(__VA_ARGS__));\
    checkasm_set_signal_handler_state(0)
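
/* A minimal, hypothetical test sketch showing how the macros above are
 * typically combined with call_new() and bench_new() (defined further down);
 * the DSP context type, function name, signature and buffers are invented
 * for illustration:
 *
 *     static void check_example(Dav1dExampleDSPContext *const c) {
 *         ALIGN_STK_64(uint8_t, c_dst, 64 * 64,);
 *         ALIGN_STK_64(uint8_t, a_dst, 64 * 64,);
 *         declare_func(void, uint8_t *dst, ptrdiff_t stride, int w, int h);
 *
 *         if (check_func(c->example, "example_%dx%d", 64, 64)) {
 *             for (int i = 0; i < 64 * 64; i++)
 *                 c_dst[i] = a_dst[i] = rnd();
 *             call_ref(c_dst, 64, 64, 64);
 *             call_new(a_dst, 64, 64, 64);
 *             if (memcmp(c_dst, a_dst, sizeof(c_dst))) fail();
 *             bench_new(a_dst, 64, 64, 64);
 *         }
 *         report("example");
 *     }
 */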

#if HAVE_ASM
#if ARCH_X86
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#define readtime() (_mm_lfence(), __rdtsc())
#else
static inline uint64_t readtime(void) {
    uint32_t eax, edx;
    __asm__ __volatile__("lfence\nrdtsc" : "=a"(eax), "=d"(edx));
    return (((uint64_t)edx) << 32) | eax;
}
#define readtime readtime
#endif
#elif CONFIG_MACOS_KPERF
uint64_t checkasm_kperf_cycles(void);
#define readtime() checkasm_kperf_cycles()
#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
#include <mach/mach_time.h>
#define readtime() mach_absolute_time()
#elif ARCH_AARCH64
#ifdef _MSC_VER
#include <windows.h>
#define readtime() (_InstructionSynchronizationBarrier(), ReadTimeStampCounter())
#else
static inline uint64_t readtime(void) {
    uint64_t cycle_counter;
    /* This requires enabling user mode access to the cycle counter (which
     * can only be done from kernel space).
     * This could also read cntvct_el0 instead of pmccntr_el0; that register
     * might also be readable (depending on kernel version), but it has much
     * worse precision (it's a fixed 50 MHz timer). */
    __asm__ __volatile__("isb\nmrs %0, pmccntr_el0"
                         : "=r"(cycle_counter)
                         :: "memory");
    return cycle_counter;
}
#define readtime readtime
#endif
#elif ARCH_ARM && !defined(_MSC_VER) && __ARM_ARCH >= 7
static inline uint64_t readtime(void) {
    uint32_t cycle_counter;
    /* This requires enabling user mode access to the cycle counter (which
     * can only be done from kernel space). */
    __asm__ __volatile__("isb\nmrc p15, 0, %0, c9, c13, 0"
                         : "=r"(cycle_counter)
                         :: "memory");
    return cycle_counter;
}
#define readtime readtime
#elif ARCH_PPC64LE
static inline uint64_t readtime(void) {
    uint32_t tbu, tbl, temp;

    __asm__ __volatile__(
        "1:\n"
        "mfspr %2,269\n"
        "mfspr %0,268\n"
        "mfspr %1,269\n"
        "cmpw %2,%1\n"
        "bne 1b\n"
        : "=r"(tbl), "=r"(tbu), "=r"(temp)
        :
        : "cc");

    return (((uint64_t)tbu) << 32) | (uint64_t)tbl;
}
#define readtime readtime
#elif ARCH_RISCV
#include <time.h>
static inline uint64_t clock_gettime_nsec(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
    return ((uint64_t)ts.tv_sec*1000000000u) + (uint64_t)ts.tv_nsec;
}
#define readtime clock_gettime_nsec
#elif ARCH_LOONGARCH
static inline uint64_t readtime(void) {
#if ARCH_LOONGARCH64
    uint64_t a, id;
    __asm__ __volatile__("rdtime.d %0, %1"
                         : "=r"(a), "=r"(id)
                         :: );
    return a;
#else
    uint32_t a, id;
    __asm__ __volatile__("rdtimel.w %0, %1"
                         : "=r"(a), "=r"(id)
                         :: );
    return (uint64_t)a;
#endif
}
#define readtime readtime
#endif

/* Verifies that clobbered callee-saved registers
 * are properly saved and restored */
void checkasm_checked_call(void *func, ...);

#if ARCH_X86_64
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
 * been used for some period of time. When they are used there will be a
 * "warmup" period during which performance will be reduced and inconsistent,
 * which is problematic when trying to benchmark individual functions. We can
 * work around this by periodically issuing "dummy" instructions that use
 * those registers to keep them powered on. */
void checkasm_simd_warmup(void);

/* The upper 32 bits of registers holding 32-bit data types are undefined when
 * those values are passed as function parameters. In practice those bits
 * usually end up being zero, which may hide certain bugs, such as using a
 * register containing undefined bits as a pointer offset, so we want to
 * intentionally clobber those bits with junk to expose any issues. The
 * following set of macros automatically calculates a bitmask specifying which
 * parameters should have their upper halves clobbered. */
#ifdef _WIN32
/* Integer and floating-point parameters share "register slots". */
#define IGNORED_FP_ARGS 0
#else
/* Up to 8 floating-point parameters are passed in XMM registers, which are
 * handled orthogonally from integer parameters passed in GPR registers. */
#define IGNORED_FP_ARGS 8
#endif
#if HAVE_C11_GENERIC
#define clobber_type(arg) _Generic((void (*)(void*, arg))NULL,\
    void (*)(void*, int32_t ): clobber_mask |= 1 << mpos++,\
    void (*)(void*, uint32_t): clobber_mask |= 1 << mpos++,\
    void (*)(void*, float   ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\
    void (*)(void*, double  ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\
    default: mpos++)
#define init_clobber_mask(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, ...)\
    unsigned clobber_mask = 0;\
    {\
        int mpos = 0, fp_args = 0;\
        clobber_type(a); clobber_type(b); clobber_type(c); clobber_type(d);\
        clobber_type(e); clobber_type(f); clobber_type(g); clobber_type(h);\
        clobber_type(i); clobber_type(j); clobber_type(k); clobber_type(l);\
        clobber_type(m); clobber_type(n); clobber_type(o); clobber_type(p);\
    }
#else
/* Skip parameter clobbering on compilers without support for _Generic() */
#define init_clobber_mask(...) unsigned clobber_mask = 0
#endif
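
/* As a worked example (with a hypothetical signature): for
 *     void fn(pixel *dst, ptrdiff_t stride, int w, int h);
 * the pointer and ptrdiff_t parameters leave their mask bits clear, while the
 * two int parameters set bits 2 and 3, so clobber_mask ends up as 0xc and only
 * the upper halves of the registers holding w and h are filled with junk. */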
#define declare_new(ret, ...)\
    ret (*checked_call)(__VA_ARGS__, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int, int, int,\
                        void*, unsigned) =\
        (void*)checkasm_checked_call;\
    init_clobber_mask(__VA_ARGS__, void*, void*, void*, void*,\
                      void*, void*, void*, void*, void*, void*,\
                      void*, void*, void*, void*, void*);
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checkasm_simd_warmup(),\
     checked_call(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8,\
                  7, 6, 5, 4, 3, 2, 1, func_new, clobber_mask));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_X86_32
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int, int, int) =\
        (void *)checkasm_checked_call;
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\
                  11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_ARM
/* Use a dummy argument to offset the real parameters by 2 instead of only 1.
 * This makes sure that any 8-byte alignment of parameters is kept the same
 * even when the extra parameters have been removed. */
extern void (*checkasm_checked_call_ptr)(void *func, int dummy, ...);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
                        int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
        (void *)checkasm_checked_call_ptr;
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_AARCH64 && !defined(__APPLE__)
void checkasm_stack_clobber(uint64_t clobber, ...);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, int, int,\
                        __VA_ARGS__, int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
        (void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB),\
     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
                  7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_RISCV
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, int, int,\
                        __VA_ARGS__, int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
        (void *)checkasm_checked_call;
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
                  7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_LOONGARCH
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, int, int,\
                        __VA_ARGS__, int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
        (void *)checkasm_checked_call;
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
                  7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
    checkasm_set_signal_handler_state(0)
#else
#define declare_new(ret, ...)
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     ((func_type *)func_new)(__VA_ARGS__));\
    checkasm_set_signal_handler_state(0)
#endif
#else /* HAVE_ASM */
#define declare_new(ret, ...)
/* Call the function */
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     ((func_type *)func_new)(__VA_ARGS__));\
    checkasm_set_signal_handler_state(0)
#endif /* HAVE_ASM */

/* Benchmark the function */
#ifdef readtime
#define bench_new(...)\
    do {\
        if (checkasm_bench_func()) {\
            func_type *const tfunc = func_new;\
            checkasm_set_signal_handler_state(1);\
            uint64_t tsum = 0;\
            int tcount = 0;\
            for (int ti = 0; ti < BENCH_RUNS; ti++) {\
                uint64_t t = readtime();\
                int talt = 0; (void)talt;\
                tfunc(__VA_ARGS__);\
                talt = 1;\
                tfunc(__VA_ARGS__);\
                talt = 0;\
                tfunc(__VA_ARGS__);\
                talt = 1;\
                tfunc(__VA_ARGS__);\
                t = readtime() - t;\
                if (t*tcount <= tsum*4 && ti > 0) {\
                    tsum += t;\
                    tcount++;\
                }\
            }\
            checkasm_set_signal_handler_state(0);\
            checkasm_update_bench(tcount, tsum);\
        } else {\
            const int talt = 0; (void)talt;\
            call_new(__VA_ARGS__);\
        }\
    } while (0)
#else
#define bench_new(...) do {} while (0)
#endif

/* Alternates between two pointers. Intended to be used within bench_new()
 * calls for functions that modify their input buffer(s), to ensure that
 * throughput, and not latency, is measured. */
#define alternate(a, b) (talt ? (b) : (a))
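
/* Illustrative (hypothetical) use: when benchmarking an in-place function,
 * alternate between two independently prepared buffers so that each call does
 * not depend on the result of the previous one (buffer and size names are
 * invented for illustration):
 *
 *     bench_new(alternate(buf_a, buf_b), stride, w, h);
 */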

#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
#define PIXEL_RECT(name, w, h) \
    ALIGN_STK_64(pixel, name##_buf, ((h)+32)*(ROUND_UP(w,64)+64) + 64,); \
    ptrdiff_t name##_stride = sizeof(pixel)*(ROUND_UP(w,64)+64); \
    (void)name##_stride; \
    pixel *name = name##_buf + (ROUND_UP(w,64)+64)*16 + 64

#define CLEAR_PIXEL_RECT(name) \
    memset(name##_buf, 0x99, sizeof(name##_buf))
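
/* Illustrative (hypothetical) use: PIXEL_RECT declares an over-allocated,
 * 64-byte-aligned pixel buffer with padding around the nominal w x h area,
 * along with a matching stride variable, and CLEAR_PIXEL_RECT fills the whole
 * allocation with a canary value so out-of-bounds writes can be detected:
 *
 *     PIXEL_RECT(a_dst, 128, 128);
 *     CLEAR_PIXEL_RECT(a_dst);
 *     call_new(a_dst, a_dst_stride, 128, 128);
 */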

#define DECL_CHECKASM_CHECK_FUNC(type) \
int checkasm_check_##type(const char *const file, const int line, \
                          const type *const buf1, const ptrdiff_t stride1, \
                          const type *const buf2, const ptrdiff_t stride2, \
                          const int w, const int h, const char *const name, \
                          const int align_w, const int align_h, \
                          const int padding)

DECL_CHECKASM_CHECK_FUNC(int8_t);
DECL_CHECKASM_CHECK_FUNC(int16_t);
DECL_CHECKASM_CHECK_FUNC(int32_t);
DECL_CHECKASM_CHECK_FUNC(uint8_t);
DECL_CHECKASM_CHECK_FUNC(uint16_t);
DECL_CHECKASM_CHECK_FUNC(uint32_t);

#define CONCAT(a,b) a ## b

#define checkasm_check2(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__)
#define checkasm_check(prefix, ...) checkasm_check2(prefix, __VA_ARGS__, 0, 0, 0)

#ifdef BITDEPTH
#define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__)
#define checkasm_check_pixel_padded(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 1, 1, 8)
#define checkasm_check_pixel_padded_align(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 8)
#define checkasm_check_coef(...) checkasm_check(COEF_TYPE, __VA_ARGS__)
#endif
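
/* Illustrative (hypothetical) use: compare a w x h region of the new output
 * against the reference output and report a failure on mismatch; the buffer,
 * stride and size names are invented for illustration:
 *
 *     checkasm_check(uint8_t, c_dst, c_stride, a_dst, a_stride, w, h, "dst");
 *     checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
 *                                 w, h, "dst");
 */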

#endif /* DAV1D_TESTS_CHECKASM_CHECKASM_H */