xref: /aosp_15_r20/external/pthreadpool/src/threadpool-utils.h (revision b095b0533730c2930f947df924a4486d266faa1a)
1*b095b053SXin Li #pragma once
2*b095b053SXin Li 
3*b095b053SXin Li #include <stdint.h>
4*b095b053SXin Li #include <stddef.h>
5*b095b053SXin Li 
6*b095b053SXin Li /* SSE-specific headers */
7*b095b053SXin Li #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
8*b095b053SXin Li 	#include <xmmintrin.h>
9*b095b053SXin Li #endif
10*b095b053SXin Li 
11*b095b053SXin Li /* MSVC-specific headers */
12*b095b053SXin Li #if defined(_MSC_VER)
13*b095b053SXin Li 	#include <intrin.h>
14*b095b053SXin Li #endif
15*b095b053SXin Li 
16*b095b053SXin Li 
/*
 * Saved floating-point control register of the target CPU.
 *
 * Exactly one member is compiled in, chosen by the target:
 *  - x86 with SSE:        mxcsr (32-bit)
 *  - 32-bit ARM with FP:  fpscr (32-bit)
 *  - AArch64:             fpcr  (64-bit)
 *  - any other target:    a one-byte placeholder so the struct is never empty
 *
 * Values are produced by get_fpu_state() and consumed by set_fpu_state().
 */
struct fpu_state {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	uint32_t mxcsr;
#elif (defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)) || (defined(_MSC_VER) && defined(_M_ARM))
	uint32_t fpscr;
#elif (defined(__GNUC__) && defined(__aarch64__)) || (defined(_MSC_VER) && defined(_M_ARM64))
	uint64_t fpcr;
#else
	char unused;
#endif
};
28*b095b053SXin Li 
get_fpu_state()29*b095b053SXin Li static inline struct fpu_state get_fpu_state() {
30*b095b053SXin Li 	struct fpu_state state = { 0 };
31*b095b053SXin Li #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
32*b095b053SXin Li 	state.mxcsr = (uint32_t) _mm_getcsr();
33*b095b053SXin Li #elif defined(_MSC_VER) && defined(_M_ARM)
34*b095b053SXin Li 	state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0);
35*b095b053SXin Li #elif defined(_MSC_VER) && defined(_M_ARM64)
36*b095b053SXin Li 	state.fpcr = (uint64_t) _ReadStatusReg(0x5A20);
37*b095b053SXin Li #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
38*b095b053SXin Li 	__asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr));
39*b095b053SXin Li #elif defined(__GNUC__) && defined(__aarch64__)
40*b095b053SXin Li 	__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr));
41*b095b053SXin Li #endif
42*b095b053SXin Li 	return state;
43*b095b053SXin Li }
44*b095b053SXin Li 
/*
 * Write a previously captured struct fpu_state back to the floating-point
 * control register.
 *
 * The register written depends on the target: MXCSR on x86 with SSE, FPSCR on
 * 32-bit ARM, FPCR on AArch64. On any other target this is a no-op.
 *
 * state: value to install, normally obtained from get_fpu_state().
 */
static inline void set_fpu_state(const struct fpu_state state) {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	_mm_setcsr((unsigned int) state.mxcsr);
#elif defined(_MSC_VER) && defined(_M_ARM)
	/* NOTE(review): operands (10, 7, 1, 0, 0) presumably select FPSCR -- confirm against the MSVC ARM intrinsics docs */
	_MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
	/* NOTE(review): 0x5A20 presumably is the system-register id of FPCR -- confirm against the MSVC ARM64 intrinsics docs */
	_WriteStatusReg(0x5A20, (__int64) state.fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
	__asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr));
#elif defined(__GNUC__) && defined(__aarch64__)
	__asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr));
#endif
}
58*b095b053SXin Li 
/*
 * Turn off denormal (subnormal) handling in the floating-point control
 * register by OR-ing flush/zero bits into its current value. All other
 * control bits are left as they were.
 *
 * Bits set per target (bit names per the Intel and Arm architecture manuals):
 *  - x86 SSE MXCSR:  0x8040    = FTZ (bit 15) | DAZ (bit 6)
 *  - ARM FPSCR:      0x1000000 = FZ (bit 24)
 *  - AArch64 FPCR:   0x1080000 = FZ (bit 24) | FZ16 (bit 19)
 *
 * On targets matched by none of the branches this is a no-op.
 * Use get_fpu_state()/set_fpu_state() around the call to restore the
 * previous mode.
 */
static inline void disable_fpu_denormals(void) {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(_MSC_VER) && defined(_M_ARM)
	int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
	fpscr |= 0x1000000;
	_MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
	__int64 fpcr = _ReadStatusReg(0x5A20);
	fpcr |= 0x1080000;
	_WriteStatusReg(0x5A20, fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
	uint32_t fpscr;
	#if defined(__thumb__) && !defined(__thumb2__)
		/*
		 * Thumb-1 ORRS has no immediate form, so the mask is passed in a low
		 * register. The output is written by VMRS before the mask is read,
		 * hence the early-clobber ("&"): without it the compiler may assign
		 * both operands to the same register and the mask would be lost.
		 * ORRS updates the condition flags, hence the "cc" clobber.
		 */
		__asm__ __volatile__(
				"VMRS %[fpscr], fpscr\n"
				"ORRS %[fpscr], %[bitmask]\n"
				"VMSR fpscr, %[fpscr]\n"
			: [fpscr] "=&l" (fpscr)
			: [bitmask] "l" (0x1000000)
			: "cc");
	#else
		__asm__ __volatile__(
				"VMRS %[fpscr], fpscr\n"
				"ORR %[fpscr], #0x1000000\n"
				"VMSR fpscr, %[fpscr]\n"
			: [fpscr] "=r" (fpscr));
	#endif
#elif defined(__GNUC__) && defined(__aarch64__)
	uint64_t fpcr;
	/* The two bits are OR-ed in by separate instructions, each with its own immediate */
	__asm__ __volatile__(
			"MRS %[fpcr], fpcr\n"
			"ORR %w[fpcr], %w[fpcr], 0x1000000\n"
			"ORR %w[fpcr], %w[fpcr], 0x80000\n"
			"MSR fpcr, %[fpcr]\n"
		: [fpcr] "=r" (fpcr));
#endif
}
97*b095b053SXin Li 
/*
 * Step an index one position backwards on a ring of n slots.
 *
 * i: current index.
 * n: ring size.
 *
 * Returns: n - 1 when i is 0 (wrap-around), otherwise i - 1.
 *
 * No range check is performed: an i >= n simply yields i - 1, and
 * i == 0 with n == 0 wraps around to SIZE_MAX (unsigned arithmetic).
 */
static inline size_t modulo_decrement(size_t i, size_t n) {
	/* Substitute n for a zero index so the decrement lands on n - 1 */
	return (i == 0 ? n : i) - 1;
}
106*b095b053SXin Li 
/*
 * Integer division rounded towards positive infinity: ceil(dividend / divisor).
 *
 * dividend: value to divide.
 * divisor:  must be non-zero; division by zero is undefined behavior and is
 *           not checked here.
 *
 * Returns: the quotient, plus one when the division leaves a remainder.
 * Working from the remainder (rather than adding divisor - 1 to the dividend)
 * keeps the result correct for dividends close to SIZE_MAX.
 */
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
	const size_t quotient = dividend / divisor;
	return (dividend % divisor == 0) ? quotient : quotient + 1;
}
114*b095b053SXin Li 
/* Windows headers define min and max as macros; drop the min macro so the function below can use the name */
#ifdef min
	#undef min
#endif

/*
 * Returns the smaller of two size_t values (either one when they are equal).
 */
static inline size_t min(size_t a, size_t b) {
	return a < b ? a : b;
}
123