/* xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/poly_generic.h
   (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)  */
/*
 * Generic helpers for evaluating polynomials with various schemes.
 *
 * Copyright (c) 2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

8*412f47f9SXin Li #ifndef VTYPE
9*412f47f9SXin Li # error Cannot use poly_generic without defining VTYPE
10*412f47f9SXin Li #endif
11*412f47f9SXin Li #ifndef VWRAP
12*412f47f9SXin Li # error Cannot use poly_generic without defining VWRAP
13*412f47f9SXin Li #endif
14*412f47f9SXin Li #ifndef FMA
15*412f47f9SXin Li # error Cannot use poly_generic without defining FMA
16*412f47f9SXin Li #endif
17*412f47f9SXin Li 
VWRAP(pairwise_poly_3)18*412f47f9SXin Li static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2,
19*412f47f9SXin Li 					     const VTYPE *poly)
20*412f47f9SXin Li {
21*412f47f9SXin Li   /* At order 3, Estrin and Pairwise Horner are identical.  */
22*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
23*412f47f9SXin Li   VTYPE p23 = FMA (poly[3], x, poly[2]);
24*412f47f9SXin Li   return FMA (p23, x2, p01);
25*412f47f9SXin Li }
26*412f47f9SXin Li 
VWRAP(estrin_4)27*412f47f9SXin Li static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4,
28*412f47f9SXin Li 				      const VTYPE *poly)
29*412f47f9SXin Li {
30*412f47f9SXin Li   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
31*412f47f9SXin Li   return FMA (poly[4], x4, p03);
32*412f47f9SXin Li }
VWRAP(estrin_5)33*412f47f9SXin Li static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4,
34*412f47f9SXin Li 				      const VTYPE *poly)
35*412f47f9SXin Li {
36*412f47f9SXin Li   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
37*412f47f9SXin Li   VTYPE p45 = FMA (poly[5], x, poly[4]);
38*412f47f9SXin Li   return FMA (p45, x4, p03);
39*412f47f9SXin Li }
VWRAP(estrin_6)40*412f47f9SXin Li static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4,
41*412f47f9SXin Li 				      const VTYPE *poly)
42*412f47f9SXin Li {
43*412f47f9SXin Li   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
44*412f47f9SXin Li   VTYPE p45 = FMA (poly[5], x, poly[4]);
45*412f47f9SXin Li   VTYPE p46 = FMA (poly[6], x2, p45);
46*412f47f9SXin Li   return FMA (p46, x4, p03);
47*412f47f9SXin Li }
VWRAP(estrin_7)48*412f47f9SXin Li static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4,
49*412f47f9SXin Li 				      const VTYPE *poly)
50*412f47f9SXin Li {
51*412f47f9SXin Li   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
52*412f47f9SXin Li   VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4);
53*412f47f9SXin Li   return FMA (p47, x4, p03);
54*412f47f9SXin Li }
VWRAP(estrin_8)55*412f47f9SXin Li static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
56*412f47f9SXin Li 				      const VTYPE *poly)
57*412f47f9SXin Li {
58*412f47f9SXin Li   return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly));
59*412f47f9SXin Li }
VWRAP(estrin_9)60*412f47f9SXin Li static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
61*412f47f9SXin Li 				      const VTYPE *poly)
62*412f47f9SXin Li {
63*412f47f9SXin Li   VTYPE p89 = FMA (poly[9], x, poly[8]);
64*412f47f9SXin Li   return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly));
65*412f47f9SXin Li }
VWRAP(estrin_10)66*412f47f9SXin Li static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
67*412f47f9SXin Li 				       const VTYPE *poly)
68*412f47f9SXin Li {
69*412f47f9SXin Li   VTYPE p89 = FMA (poly[9], x, poly[8]);
70*412f47f9SXin Li   VTYPE p8_10 = FMA (poly[10], x2, p89);
71*412f47f9SXin Li   return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly));
72*412f47f9SXin Li }
VWRAP(estrin_11)73*412f47f9SXin Li static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
74*412f47f9SXin Li 				       const VTYPE *poly)
75*412f47f9SXin Li {
76*412f47f9SXin Li   VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8);
77*412f47f9SXin Li   return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly));
78*412f47f9SXin Li }
VWRAP(estrin_12)79*412f47f9SXin Li static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
80*412f47f9SXin Li 				       const VTYPE *poly)
81*412f47f9SXin Li {
82*412f47f9SXin Li   return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8,
83*412f47f9SXin Li 	      VWRAP (estrin_7) (x, x2, x4, poly));
84*412f47f9SXin Li }
VWRAP(estrin_13)85*412f47f9SXin Li static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
86*412f47f9SXin Li 				       const VTYPE *poly)
87*412f47f9SXin Li {
88*412f47f9SXin Li   return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8,
89*412f47f9SXin Li 	      VWRAP (estrin_7) (x, x2, x4, poly));
90*412f47f9SXin Li }
VWRAP(estrin_14)91*412f47f9SXin Li static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
92*412f47f9SXin Li 				       const VTYPE *poly)
93*412f47f9SXin Li {
94*412f47f9SXin Li   return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8,
95*412f47f9SXin Li 	      VWRAP (estrin_7) (x, x2, x4, poly));
96*412f47f9SXin Li }
VWRAP(estrin_15)97*412f47f9SXin Li static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
98*412f47f9SXin Li 				       const VTYPE *poly)
99*412f47f9SXin Li {
100*412f47f9SXin Li   return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8,
101*412f47f9SXin Li 	      VWRAP (estrin_7) (x, x2, x4, poly));
102*412f47f9SXin Li }
VWRAP(estrin_16)103*412f47f9SXin Li static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
104*412f47f9SXin Li 				       VTYPE x16, const VTYPE *poly)
105*412f47f9SXin Li {
106*412f47f9SXin Li   return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
107*412f47f9SXin Li }
VWRAP(estrin_17)108*412f47f9SXin Li static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
109*412f47f9SXin Li 				       VTYPE x16, const VTYPE *poly)
110*412f47f9SXin Li {
111*412f47f9SXin Li   VTYPE p16_17 = FMA (poly[17], x, poly[16]);
112*412f47f9SXin Li   return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
113*412f47f9SXin Li }
VWRAP(estrin_18)114*412f47f9SXin Li static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
115*412f47f9SXin Li 				       VTYPE x16, const VTYPE *poly)
116*412f47f9SXin Li {
117*412f47f9SXin Li   VTYPE p16_17 = FMA (poly[17], x, poly[16]);
118*412f47f9SXin Li   VTYPE p16_18 = FMA (poly[18], x2, p16_17);
119*412f47f9SXin Li   return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
120*412f47f9SXin Li }
VWRAP(estrin_19)121*412f47f9SXin Li static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
122*412f47f9SXin Li 				       VTYPE x16, const VTYPE *poly)
123*412f47f9SXin Li {
124*412f47f9SXin Li   VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16);
125*412f47f9SXin Li   return FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
126*412f47f9SXin Li }
127*412f47f9SXin Li 
VWRAP(horner_2)128*412f47f9SXin Li static inline VTYPE VWRAP (horner_2) (VTYPE x, const VTYPE *poly)
129*412f47f9SXin Li {
130*412f47f9SXin Li   VTYPE p = FMA (poly[2], x, poly[1]);
131*412f47f9SXin Li   return FMA (x, p, poly[0]);
132*412f47f9SXin Li }
VWRAP(horner_3)133*412f47f9SXin Li static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly)
134*412f47f9SXin Li {
135*412f47f9SXin Li   VTYPE p = FMA (poly[3], x, poly[2]);
136*412f47f9SXin Li   p = FMA (x, p, poly[1]);
137*412f47f9SXin Li   p = FMA (x, p, poly[0]);
138*412f47f9SXin Li   return p;
139*412f47f9SXin Li }
VWRAP(horner_4)140*412f47f9SXin Li static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly)
141*412f47f9SXin Li {
142*412f47f9SXin Li   VTYPE p = FMA (poly[4], x, poly[3]);
143*412f47f9SXin Li   p = FMA (x, p, poly[2]);
144*412f47f9SXin Li   p = FMA (x, p, poly[1]);
145*412f47f9SXin Li   p = FMA (x, p, poly[0]);
146*412f47f9SXin Li   return p;
147*412f47f9SXin Li }
VWRAP(horner_5)148*412f47f9SXin Li static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly)
149*412f47f9SXin Li {
150*412f47f9SXin Li   return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]);
151*412f47f9SXin Li }
VWRAP(horner_6)152*412f47f9SXin Li static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly)
153*412f47f9SXin Li {
154*412f47f9SXin Li   return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]);
155*412f47f9SXin Li }
VWRAP(horner_7)156*412f47f9SXin Li static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly)
157*412f47f9SXin Li {
158*412f47f9SXin Li   return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]);
159*412f47f9SXin Li }
VWRAP(horner_8)160*412f47f9SXin Li static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly)
161*412f47f9SXin Li {
162*412f47f9SXin Li   return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]);
163*412f47f9SXin Li }
VWRAP(horner_9)164*412f47f9SXin Li static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly)
165*412f47f9SXin Li {
166*412f47f9SXin Li   return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]);
167*412f47f9SXin Li }
VWRAP(horner_10)168*412f47f9SXin Li static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly)
169*412f47f9SXin Li {
170*412f47f9SXin Li   return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]);
171*412f47f9SXin Li }
VWRAP(horner_11)172*412f47f9SXin Li static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly)
173*412f47f9SXin Li {
174*412f47f9SXin Li   return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]);
175*412f47f9SXin Li }
VWRAP(horner_12)176*412f47f9SXin Li static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly)
177*412f47f9SXin Li {
178*412f47f9SXin Li   return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]);
179*412f47f9SXin Li }
180*412f47f9SXin Li 
VWRAP(pw_horner_4)181*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly)
182*412f47f9SXin Li {
183*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
184*412f47f9SXin Li   VTYPE p23 = FMA (poly[3], x, poly[2]);
185*412f47f9SXin Li   VTYPE p;
186*412f47f9SXin Li   p = FMA (x2, poly[4], p23);
187*412f47f9SXin Li   p = FMA (x2, p, p01);
188*412f47f9SXin Li   return p;
189*412f47f9SXin Li }
VWRAP(pw_horner_5)190*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly)
191*412f47f9SXin Li {
192*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
193*412f47f9SXin Li   VTYPE p23 = FMA (poly[3], x, poly[2]);
194*412f47f9SXin Li   VTYPE p45 = FMA (poly[5], x, poly[4]);
195*412f47f9SXin Li   VTYPE p;
196*412f47f9SXin Li   p = FMA (x2, p45, p23);
197*412f47f9SXin Li   p = FMA (x2, p, p01);
198*412f47f9SXin Li   return p;
199*412f47f9SXin Li }
VWRAP(pw_horner_6)200*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly)
201*412f47f9SXin Li {
202*412f47f9SXin Li   VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2);
203*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
204*412f47f9SXin Li   return FMA (x2, p26, p01);
205*412f47f9SXin Li }
VWRAP(pw_horner_7)206*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly)
207*412f47f9SXin Li {
208*412f47f9SXin Li   VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2);
209*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
210*412f47f9SXin Li   return FMA (x2, p27, p01);
211*412f47f9SXin Li }
VWRAP(pw_horner_8)212*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly)
213*412f47f9SXin Li {
214*412f47f9SXin Li   VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2);
215*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
216*412f47f9SXin Li   return FMA (x2, p28, p01);
217*412f47f9SXin Li }
VWRAP(pw_horner_9)218*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly)
219*412f47f9SXin Li {
220*412f47f9SXin Li   VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2);
221*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
222*412f47f9SXin Li   return FMA (x2, p29, p01);
223*412f47f9SXin Li }
VWRAP(pw_horner_10)224*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly)
225*412f47f9SXin Li {
226*412f47f9SXin Li   VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2);
227*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
228*412f47f9SXin Li   return FMA (x2, p2_10, p01);
229*412f47f9SXin Li }
VWRAP(pw_horner_11)230*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly)
231*412f47f9SXin Li {
232*412f47f9SXin Li   VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2);
233*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
234*412f47f9SXin Li   return FMA (x2, p2_11, p01);
235*412f47f9SXin Li }
VWRAP(pw_horner_12)236*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly)
237*412f47f9SXin Li {
238*412f47f9SXin Li   VTYPE p2_12 = VWRAP (pw_horner_10) (x, x2, poly + 2);
239*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
240*412f47f9SXin Li   return FMA (x2, p2_12, p01);
241*412f47f9SXin Li }
VWRAP(pw_horner_13)242*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly)
243*412f47f9SXin Li {
244*412f47f9SXin Li   VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2);
245*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
246*412f47f9SXin Li   return FMA (x2, p2_13, p01);
247*412f47f9SXin Li }
VWRAP(pw_horner_14)248*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly)
249*412f47f9SXin Li {
250*412f47f9SXin Li   VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2);
251*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
252*412f47f9SXin Li   return FMA (x2, p2_14, p01);
253*412f47f9SXin Li }
VWRAP(pw_horner_15)254*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly)
255*412f47f9SXin Li {
256*412f47f9SXin Li   VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2);
257*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
258*412f47f9SXin Li   return FMA (x2, p2_15, p01);
259*412f47f9SXin Li }
VWRAP(pw_horner_16)260*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly)
261*412f47f9SXin Li {
262*412f47f9SXin Li   VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2);
263*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
264*412f47f9SXin Li   return FMA (x2, p2_16, p01);
265*412f47f9SXin Li }
VWRAP(pw_horner_17)266*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly)
267*412f47f9SXin Li {
268*412f47f9SXin Li   VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2);
269*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
270*412f47f9SXin Li   return FMA (x2, p2_17, p01);
271*412f47f9SXin Li }
VWRAP(pw_horner_18)272*412f47f9SXin Li static inline VTYPE VWRAP (pw_horner_18) (VTYPE x, VTYPE x2, const VTYPE *poly)
273*412f47f9SXin Li {
274*412f47f9SXin Li   VTYPE p2_18 = VWRAP (pw_horner_16) (x, x2, poly + 2);
275*412f47f9SXin Li   VTYPE p01 = FMA (poly[1], x, poly[0]);
276*412f47f9SXin Li   return FMA (x2, p2_18, p01);
277*412f47f9SXin Li }
278