xref: /aosp_15_r20/external/arm-optimized-routines/pl/math/sv_logf_inline.h (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Single-precision vector log function - inline version
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2024, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #include "sv_math.h"
9*412f47f9SXin Li 
/* Constants consumed by sv_logf_inline: polynomial coefficients, the ln2
   multiplier, and the integer constants used to split the input bits.  */
struct sv_logf_data
{
  /* These four must stay adjacent and in this order: sv_logf_inline loads
     them as a single quadword starting at p1 and picks them by lane 0..3.  */
  float p1;
  float p3;
  float p5;
  float p6;
  /* These three are broadcast one at a time as accumulator seeds.  */
  float p0;
  float p2;
  float p4;
  /* Multiplier applied to the extracted exponent n.  */
  float ln2;
  /* Subtracted from, then added back to, the raw bit pattern of x.  */
  uint32_t off;
  /* ANDed with the rebased bits to isolate the mantissa field.  */
  uint32_t mantissa_mask;
};
16*412f47f9SXin Li 
/* Brace-enclosed designated initializer for a struct sv_logf_data.
   Designators are used, so the listing order here is independent of the
   member order in the struct.  */
#define SV_LOGF_CONSTANTS                                                     \
  {                                                                           \
    .p0 = -0x1.ffffc8p-2f,                                                    \
    .p1 = 0x1.555d7cp-2f,                                                     \
    .p2 = -0x1.00187cp-2f,                                                    \
    .p3 = 0x1.961348p-3f,                                                     \
    .p4 = -0x1.4f9934p-3f,                                                    \
    .p5 = 0x1.5a9aa2p-3f,                                                     \
    .p6 = -0x1.3e737cp-3f,                                                    \
    .ln2 = 0x1.62e43p-1f,                                                     \
    .off = 0x3f2aaaab,                                                        \
    .mantissa_mask = 0x007fffff                                               \
  }
24*412f47f9SXin Li 
25*412f47f9SXin Li static inline svfloat32_t
sv_logf_inline(svbool_t pg,svfloat32_t x,const struct sv_logf_data * d)26*412f47f9SXin Li sv_logf_inline (svbool_t pg, svfloat32_t x, const struct sv_logf_data *d)
27*412f47f9SXin Li {
28*412f47f9SXin Li   svuint32_t u = svreinterpret_u32 (x);
29*412f47f9SXin Li 
30*412f47f9SXin Li   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
31*412f47f9SXin Li   u = svsub_x (pg, u, d->off);
32*412f47f9SXin Li   svfloat32_t n = svcvt_f32_s32_x (
33*412f47f9SXin Li       pg, svasr_x (pg, svreinterpret_s32_u32 (u), 23)); /* signextend.  */
34*412f47f9SXin Li   u = svand_x (pg, u, d->mantissa_mask);
35*412f47f9SXin Li   u = svadd_x (pg, u, d->off);
36*412f47f9SXin Li   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
37*412f47f9SXin Li 
38*412f47f9SXin Li   /* y = log(1+r) + n*ln2.  */
39*412f47f9SXin Li   svfloat32_t r2 = svmul_x (pg, r, r);
40*412f47f9SXin Li   /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
41*412f47f9SXin Li   svfloat32_t p1356 = svld1rq_f32 (svptrue_b32 (), &d->p1);
42*412f47f9SXin Li   svfloat32_t p = svmla_lane (sv_f32 (d->p4), r, p1356, 2);
43*412f47f9SXin Li   svfloat32_t q = svmla_lane (sv_f32 (d->p2), r, p1356, 1);
44*412f47f9SXin Li   svfloat32_t y = svmla_lane (sv_f32 (d->p0), r, p1356, 0);
45*412f47f9SXin Li   p = svmla_lane (p, r2, p1356, 3);
46*412f47f9SXin Li   q = svmla_x (pg, q, p, r2);
47*412f47f9SXin Li   y = svmla_x (pg, y, q, r2);
48*412f47f9SXin Li   p = svmla_x (pg, r, n, d->ln2);
49*412f47f9SXin Li 
50*412f47f9SXin Li   return svmla_x (pg, p, y, r2);
51*412f47f9SXin Li }
52