1*412f47f9SXin Li /*
2*412f47f9SXin Li * Single-precision scalar tan(x) function.
3*412f47f9SXin Li *
4*412f47f9SXin Li * Copyright (c) 2021-2023, Arm Limited.
5*412f47f9SXin Li * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li */
7*412f47f9SXin Li #include "math_config.h"
8*412f47f9SXin Li #include "pl_sig.h"
9*412f47f9SXin Li #include "pl_test.h"
10*412f47f9SXin Li #include "poly_scalar_f32.h"
11*412f47f9SXin Li
12*412f47f9SXin Li /* Useful constants. */
13*412f47f9SXin Li #define NegPio2_1 (-0x1.921fb6p+0f)
14*412f47f9SXin Li #define NegPio2_2 (0x1.777a5cp-25f)
15*412f47f9SXin Li #define NegPio2_3 (0x1.ee59dap-50f)
16*412f47f9SXin Li /* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */
17*412f47f9SXin Li #define RangeVal (0x1p17f)
18*412f47f9SXin Li #define InvPio2 ((0x1.45f306p-1f))
19*412f47f9SXin Li #define Shift (0x1.8p+23f)
20*412f47f9SXin Li #define AbsMask (0x7fffffff)
21*412f47f9SXin Li #define Pio4 (0x1.921fb6p-1)
22*412f47f9SXin Li /* 2PI * 2^-64. */
23*412f47f9SXin Li #define Pio2p63 (0x1.921FB54442D18p-62)
24*412f47f9SXin Li
25*412f47f9SXin Li static inline float
eval_P(float z)26*412f47f9SXin Li eval_P (float z)
27*412f47f9SXin Li {
28*412f47f9SXin Li return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan);
29*412f47f9SXin Li }
30*412f47f9SXin Li
31*412f47f9SXin Li static inline float
eval_Q(float z)32*412f47f9SXin Li eval_Q (float z)
33*412f47f9SXin Li {
34*412f47f9SXin Li return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan);
35*412f47f9SXin Li }
36*412f47f9SXin Li
37*412f47f9SXin Li /* Reduction of the input argument x using Cody-Waite approach, such that x = r
38*412f47f9SXin Li + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. */
39*412f47f9SXin Li static inline float
reduce(float x,int32_t * in)40*412f47f9SXin Li reduce (float x, int32_t *in)
41*412f47f9SXin Li {
42*412f47f9SXin Li /* n = rint(x/(pi/2)). */
43*412f47f9SXin Li float r = x;
44*412f47f9SXin Li float q = fmaf (InvPio2, r, Shift);
45*412f47f9SXin Li float n = q - Shift;
46*412f47f9SXin Li /* There is no rounding here, n is representable by a signed integer. */
47*412f47f9SXin Li *in = (int32_t) n;
48*412f47f9SXin Li /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
49*412f47f9SXin Li r = fmaf (NegPio2_1, n, r);
50*412f47f9SXin Li r = fmaf (NegPio2_2, n, r);
51*412f47f9SXin Li r = fmaf (NegPio2_3, n, r);
52*412f47f9SXin Li return r;
53*412f47f9SXin Li }
54*412f47f9SXin Li
55*412f47f9SXin Li /* Table with 4/PI to 192 bit precision. To avoid unaligned accesses
56*412f47f9SXin Li only 8 new bits are added per entry, making the table 4 times larger. */
57*412f47f9SXin Li static const uint32_t __inv_pio4[24]
58*412f47f9SXin Li = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44,
59*412f47f9SXin Li 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1,
60*412f47f9SXin Li 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62,
61*412f47f9SXin Li 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041};
62*412f47f9SXin Li
63*412f47f9SXin Li /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
64*412f47f9SXin Li XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
65*412f47f9SXin Li Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
66*412f47f9SXin Li Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit
67*412f47f9SXin Li multiply computes the exact 2.62-bit fixed-point modulo. Since the result
68*412f47f9SXin Li can have at most 29 leading zeros after the binary point, the double
69*412f47f9SXin Li precision result is accurate to 33 bits. */
70*412f47f9SXin Li static inline double
reduce_large(uint32_t xi,int * np)71*412f47f9SXin Li reduce_large (uint32_t xi, int *np)
72*412f47f9SXin Li {
73*412f47f9SXin Li const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
74*412f47f9SXin Li int shift = (xi >> 23) & 7;
75*412f47f9SXin Li uint64_t n, res0, res1, res2;
76*412f47f9SXin Li
77*412f47f9SXin Li xi = (xi & 0xffffff) | 0x800000;
78*412f47f9SXin Li xi <<= shift;
79*412f47f9SXin Li
80*412f47f9SXin Li res0 = xi * arr[0];
81*412f47f9SXin Li res1 = (uint64_t) xi * arr[4];
82*412f47f9SXin Li res2 = (uint64_t) xi * arr[8];
83*412f47f9SXin Li res0 = (res2 >> 32) | (res0 << 32);
84*412f47f9SXin Li res0 += res1;
85*412f47f9SXin Li
86*412f47f9SXin Li n = (res0 + (1ULL << 61)) >> 62;
87*412f47f9SXin Li res0 -= n << 62;
88*412f47f9SXin Li double x = (int64_t) res0;
89*412f47f9SXin Li *np = n;
90*412f47f9SXin Li return x * Pio2p63;
91*412f47f9SXin Li }
92*412f47f9SXin Li
93*412f47f9SXin Li /* Top 12 bits of the float representation with the sign bit cleared. */
94*412f47f9SXin Li static inline uint32_t
top12(float x)95*412f47f9SXin Li top12 (float x)
96*412f47f9SXin Li {
97*412f47f9SXin Li return (asuint (x) >> 20);
98*412f47f9SXin Li }
99*412f47f9SXin Li
100*412f47f9SXin Li /* Fast single-precision tan implementation.
101*412f47f9SXin Li Maximum ULP error: 3.293ulps.
102*412f47f9SXin Li tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */
103*412f47f9SXin Li float
tanf(float x)104*412f47f9SXin Li tanf (float x)
105*412f47f9SXin Li {
106*412f47f9SXin Li /* Get top words. */
107*412f47f9SXin Li uint32_t ix = asuint (x);
108*412f47f9SXin Li uint32_t ia = ix & AbsMask;
109*412f47f9SXin Li uint32_t ia12 = ia >> 20;
110*412f47f9SXin Li
111*412f47f9SXin Li /* Dispatch between no reduction (small numbers), fast reduction and
112*412f47f9SXin Li slow large numbers reduction. The reduction step determines r float
113*412f47f9SXin Li (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */
114*412f47f9SXin Li int32_t n;
115*412f47f9SXin Li float r;
116*412f47f9SXin Li if (ia12 < top12 (Pio4))
117*412f47f9SXin Li {
118*412f47f9SXin Li /* Optimize small values. */
119*412f47f9SXin Li if (unlikely (ia12 < top12 (0x1p-12f)))
120*412f47f9SXin Li {
121*412f47f9SXin Li if (unlikely (ia12 < top12 (0x1p-126f)))
122*412f47f9SXin Li /* Force underflow for tiny x. */
123*412f47f9SXin Li force_eval_float (x * x);
124*412f47f9SXin Li return x;
125*412f47f9SXin Li }
126*412f47f9SXin Li
127*412f47f9SXin Li /* tan (x) ~= x + x^3 * P(x^2). */
128*412f47f9SXin Li float x2 = x * x;
129*412f47f9SXin Li float y = eval_P (x2);
130*412f47f9SXin Li return fmaf (x2, x * y, x);
131*412f47f9SXin Li }
132*412f47f9SXin Li /* Similar to other trigonometric routines, fast inaccurate reduction is
133*412f47f9SXin Li performed for values of x from pi/4 up to RangeVal. In order to keep errors
134*412f47f9SXin Li below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for
135*412f47f9SXin Li other trigonometric routines. Above this value more advanced but slower
136*412f47f9SXin Li reduction techniques need to be implemented to reach a similar accuracy.
137*412f47f9SXin Li */
138*412f47f9SXin Li else if (ia12 < top12 (RangeVal))
139*412f47f9SXin Li {
140*412f47f9SXin Li /* Fast inaccurate reduction. */
141*412f47f9SXin Li r = reduce (x, &n);
142*412f47f9SXin Li }
143*412f47f9SXin Li else if (ia12 < 0x7f8)
144*412f47f9SXin Li {
145*412f47f9SXin Li /* Slow accurate reduction. */
146*412f47f9SXin Li uint32_t sign = ix & ~AbsMask;
147*412f47f9SXin Li double dar = reduce_large (ia, &n);
148*412f47f9SXin Li float ar = (float) dar;
149*412f47f9SXin Li r = asfloat (asuint (ar) ^ sign);
150*412f47f9SXin Li }
151*412f47f9SXin Li else
152*412f47f9SXin Li {
153*412f47f9SXin Li /* tan(Inf or NaN) is NaN. */
154*412f47f9SXin Li return __math_invalidf (x);
155*412f47f9SXin Li }
156*412f47f9SXin Li
157*412f47f9SXin Li /* If x lives in an interval where |tan(x)|
158*412f47f9SXin Li - is finite then use an approximation of tangent in the form
159*412f47f9SXin Li tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
160*412f47f9SXin Li - grows to infinity then use an approximation of cotangent in the form
161*412f47f9SXin Li cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early.
162*412f47f9SXin Li Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r),
163*412f47f9SXin Li we only need to change the sign of r to obtain tan(x) from cotan(r).
164*412f47f9SXin Li This 2-interval approach requires 2 different sets of coefficients P and
165*412f47f9SXin Li Q, where Q is a lower order polynomial than P. */
166*412f47f9SXin Li
167*412f47f9SXin Li /* Determine if x lives in an interval where |tan(x)| grows to infinity. */
168*412f47f9SXin Li uint32_t alt = (uint32_t) n & 1;
169*412f47f9SXin Li
170*412f47f9SXin Li /* Perform additional reduction if required. */
171*412f47f9SXin Li float z = alt ? -r : r;
172*412f47f9SXin Li
173*412f47f9SXin Li /* Prepare backward transformation. */
174*412f47f9SXin Li float z2 = r * r;
175*412f47f9SXin Li float offset = alt ? 1.0f / z : z;
176*412f47f9SXin Li float scale = alt ? z : z * z2;
177*412f47f9SXin Li
178*412f47f9SXin Li /* Evaluate polynomial approximation of tan or cotan. */
179*412f47f9SXin Li float p = alt ? eval_Q (z2) : eval_P (z2);
180*412f47f9SXin Li
181*412f47f9SXin Li /* A unified way of assembling the result on both interval types. */
182*412f47f9SXin Li return fmaf (scale, p, offset);
183*412f47f9SXin Li }
184*412f47f9SXin Li
185*412f47f9SXin Li PL_SIG (S, F, 1, tan, -3.1, 3.1)
186*412f47f9SXin Li PL_TEST_ULP (tanf, 2.80)
187*412f47f9SXin Li PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
188*412f47f9SXin Li PL_TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
189*412f47f9SXin Li PL_TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
190*412f47f9SXin Li PL_TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000)
191*412f47f9SXin Li PL_TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000)
192*412f47f9SXin Li PL_TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
193*412f47f9SXin Li PL_TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000)
194