xref: /aosp_15_r20/external/libopus/celt/mathops.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li /* Copyright (c) 2002-2008 Jean-Marc Valin
2*a58d3d2aSXin Li    Copyright (c) 2007-2008 CSIRO
3*a58d3d2aSXin Li    Copyright (c) 2007-2009 Xiph.Org Foundation
4*a58d3d2aSXin Li    Written by Jean-Marc Valin */
5*a58d3d2aSXin Li /**
6*a58d3d2aSXin Li    @file mathops.h
7*a58d3d2aSXin Li    @brief Various math functions
8*a58d3d2aSXin Li */
9*a58d3d2aSXin Li /*
10*a58d3d2aSXin Li    Redistribution and use in source and binary forms, with or without
11*a58d3d2aSXin Li    modification, are permitted provided that the following conditions
12*a58d3d2aSXin Li    are met:
13*a58d3d2aSXin Li 
14*a58d3d2aSXin Li    - Redistributions of source code must retain the above copyright
15*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer.
16*a58d3d2aSXin Li 
17*a58d3d2aSXin Li    - Redistributions in binary form must reproduce the above copyright
18*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer in the
19*a58d3d2aSXin Li    documentation and/or other materials provided with the distribution.
20*a58d3d2aSXin Li 
21*a58d3d2aSXin Li    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22*a58d3d2aSXin Li    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23*a58d3d2aSXin Li    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24*a58d3d2aSXin Li    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25*a58d3d2aSXin Li    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26*a58d3d2aSXin Li    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27*a58d3d2aSXin Li    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28*a58d3d2aSXin Li    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29*a58d3d2aSXin Li    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30*a58d3d2aSXin Li    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31*a58d3d2aSXin Li    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32*a58d3d2aSXin Li */
33*a58d3d2aSXin Li 
34*a58d3d2aSXin Li #ifdef HAVE_CONFIG_H
35*a58d3d2aSXin Li #include "config.h"
36*a58d3d2aSXin Li #endif
37*a58d3d2aSXin Li 
38*a58d3d2aSXin Li #include "mathops.h"
39*a58d3d2aSXin Li 
40*a58d3d2aSXin Li /*Compute floor(sqrt(_val)) with exact arithmetic.
41*a58d3d2aSXin Li   _val must be greater than 0.
42*a58d3d2aSXin Li   This has been tested on all possible 32-bit inputs greater than 0.*/
isqrt32(opus_uint32 _val)43*a58d3d2aSXin Li unsigned isqrt32(opus_uint32 _val){
44*a58d3d2aSXin Li   unsigned b;
45*a58d3d2aSXin Li   unsigned g;
46*a58d3d2aSXin Li   int      bshift;
47*a58d3d2aSXin Li   /*Uses the second method from
48*a58d3d2aSXin Li      http://www.azillionmonkeys.com/qed/sqroot.html
49*a58d3d2aSXin Li     The main idea is to search for the largest binary digit b such that
50*a58d3d2aSXin Li      (g+b)*(g+b) <= _val, and add it to the solution g.*/
51*a58d3d2aSXin Li   g=0;
52*a58d3d2aSXin Li   bshift=(EC_ILOG(_val)-1)>>1;
53*a58d3d2aSXin Li   b=1U<<bshift;
54*a58d3d2aSXin Li   do{
55*a58d3d2aSXin Li     opus_uint32 t;
56*a58d3d2aSXin Li     t=(((opus_uint32)g<<1)+b)<<bshift;
57*a58d3d2aSXin Li     if(t<=_val){
58*a58d3d2aSXin Li       g+=b;
59*a58d3d2aSXin Li       _val-=t;
60*a58d3d2aSXin Li     }
61*a58d3d2aSXin Li     b>>=1;
62*a58d3d2aSXin Li     bshift--;
63*a58d3d2aSXin Li   }
64*a58d3d2aSXin Li   while(bshift>=0);
65*a58d3d2aSXin Li   return g;
66*a58d3d2aSXin Li }
67*a58d3d2aSXin Li 
68*a58d3d2aSXin Li #ifdef FIXED_POINT
69*a58d3d2aSXin Li 
frac_div32(opus_val32 a,opus_val32 b)70*a58d3d2aSXin Li opus_val32 frac_div32(opus_val32 a, opus_val32 b)
71*a58d3d2aSXin Li {
72*a58d3d2aSXin Li    opus_val16 rcp;
73*a58d3d2aSXin Li    opus_val32 result, rem;
74*a58d3d2aSXin Li    int shift = celt_ilog2(b)-29;
75*a58d3d2aSXin Li    a = VSHR32(a,shift);
76*a58d3d2aSXin Li    b = VSHR32(b,shift);
77*a58d3d2aSXin Li    /* 16-bit reciprocal */
78*a58d3d2aSXin Li    rcp = ROUND16(celt_rcp(ROUND16(b,16)),3);
79*a58d3d2aSXin Li    result = MULT16_32_Q15(rcp, a);
80*a58d3d2aSXin Li    rem = PSHR32(a,2)-MULT32_32_Q31(result, b);
81*a58d3d2aSXin Li    result = ADD32(result, SHL32(MULT16_32_Q15(rcp, rem),2));
82*a58d3d2aSXin Li    if (result >= 536870912)       /*  2^29 */
83*a58d3d2aSXin Li       return 2147483647;          /*  2^31 - 1 */
84*a58d3d2aSXin Li    else if (result <= -536870912) /* -2^29 */
85*a58d3d2aSXin Li       return -2147483647;         /* -2^31 */
86*a58d3d2aSXin Li    else
87*a58d3d2aSXin Li       return SHL32(result, 2);
88*a58d3d2aSXin Li }
89*a58d3d2aSXin Li 
90*a58d3d2aSXin Li /** Reciprocal sqrt approximation in the range [0.25,1) (Q16 in, Q14 out) */
celt_rsqrt_norm(opus_val32 x)91*a58d3d2aSXin Li opus_val16 celt_rsqrt_norm(opus_val32 x)
92*a58d3d2aSXin Li {
93*a58d3d2aSXin Li    opus_val16 n;
94*a58d3d2aSXin Li    opus_val16 r;
95*a58d3d2aSXin Li    opus_val16 r2;
96*a58d3d2aSXin Li    opus_val16 y;
97*a58d3d2aSXin Li    /* Range of n is [-16384,32767] ([-0.5,1) in Q15). */
98*a58d3d2aSXin Li    n = x-32768;
99*a58d3d2aSXin Li    /* Get a rough initial guess for the root.
100*a58d3d2aSXin Li       The optimal minimax quadratic approximation (using relative error) is
101*a58d3d2aSXin Li        r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485).
102*a58d3d2aSXin Li       Coefficients here, and the final result r, are Q14.*/
103*a58d3d2aSXin Li    r = ADD16(23557, MULT16_16_Q15(n, ADD16(-13490, MULT16_16_Q15(n, 6713))));
104*a58d3d2aSXin Li    /* We want y = x*r*r-1 in Q15, but x is 32-bit Q16 and r is Q14.
105*a58d3d2aSXin Li       We can compute the result from n and r using Q15 multiplies with some
106*a58d3d2aSXin Li        adjustment, carefully done to avoid overflow.
107*a58d3d2aSXin Li       Range of y is [-1564,1594]. */
108*a58d3d2aSXin Li    r2 = MULT16_16_Q15(r, r);
109*a58d3d2aSXin Li    y = SHL16(SUB16(ADD16(MULT16_16_Q15(r2, n), r2), 16384), 1);
110*a58d3d2aSXin Li    /* Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5).
111*a58d3d2aSXin Li       This yields the Q14 reciprocal square root of the Q16 x, with a maximum
112*a58d3d2aSXin Li        relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a
113*a58d3d2aSXin Li        peak absolute error of 2.26591/16384. */
114*a58d3d2aSXin Li    return ADD16(r, MULT16_16_Q15(r, MULT16_16_Q15(y,
115*a58d3d2aSXin Li               SUB16(MULT16_16_Q15(y, 12288), 16384))));
116*a58d3d2aSXin Li }
117*a58d3d2aSXin Li 
118*a58d3d2aSXin Li /** Sqrt approximation (QX input, QX/2 output) */
celt_sqrt(opus_val32 x)119*a58d3d2aSXin Li opus_val32 celt_sqrt(opus_val32 x)
120*a58d3d2aSXin Li {
121*a58d3d2aSXin Li    int k;
122*a58d3d2aSXin Li    opus_val16 n;
123*a58d3d2aSXin Li    opus_val32 rt;
124*a58d3d2aSXin Li    static const opus_val16 C[5] = {23175, 11561, -3011, 1699, -664};
125*a58d3d2aSXin Li    if (x==0)
126*a58d3d2aSXin Li       return 0;
127*a58d3d2aSXin Li    else if (x>=1073741824)
128*a58d3d2aSXin Li       return 32767;
129*a58d3d2aSXin Li    k = (celt_ilog2(x)>>1)-7;
130*a58d3d2aSXin Li    x = VSHR32(x, 2*k);
131*a58d3d2aSXin Li    n = x-32768;
132*a58d3d2aSXin Li    rt = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2],
133*a58d3d2aSXin Li               MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, (C[4])))))))));
134*a58d3d2aSXin Li    rt = VSHR32(rt,7-k);
135*a58d3d2aSXin Li    return rt;
136*a58d3d2aSXin Li }
137*a58d3d2aSXin Li 
138*a58d3d2aSXin Li #define L1 32767
139*a58d3d2aSXin Li #define L2 -7651
140*a58d3d2aSXin Li #define L3 8277
141*a58d3d2aSXin Li #define L4 -626
142*a58d3d2aSXin Li 
_celt_cos_pi_2(opus_val16 x)143*a58d3d2aSXin Li static OPUS_INLINE opus_val16 _celt_cos_pi_2(opus_val16 x)
144*a58d3d2aSXin Li {
145*a58d3d2aSXin Li    opus_val16 x2;
146*a58d3d2aSXin Li 
147*a58d3d2aSXin Li    x2 = MULT16_16_P15(x,x);
148*a58d3d2aSXin Li    return ADD16(1,MIN16(32766,ADD32(SUB16(L1,x2), MULT16_16_P15(x2, ADD32(L2, MULT16_16_P15(x2, ADD32(L3, MULT16_16_P15(L4, x2
149*a58d3d2aSXin Li                                                                                 ))))))));
150*a58d3d2aSXin Li }
151*a58d3d2aSXin Li 
152*a58d3d2aSXin Li #undef L1
153*a58d3d2aSXin Li #undef L2
154*a58d3d2aSXin Li #undef L3
155*a58d3d2aSXin Li #undef L4
156*a58d3d2aSXin Li 
celt_cos_norm(opus_val32 x)157*a58d3d2aSXin Li opus_val16 celt_cos_norm(opus_val32 x)
158*a58d3d2aSXin Li {
159*a58d3d2aSXin Li    x = x&0x0001ffff;
160*a58d3d2aSXin Li    if (x>SHL32(EXTEND32(1), 16))
161*a58d3d2aSXin Li       x = SUB32(SHL32(EXTEND32(1), 17),x);
162*a58d3d2aSXin Li    if (x&0x00007fff)
163*a58d3d2aSXin Li    {
164*a58d3d2aSXin Li       if (x<SHL32(EXTEND32(1), 15))
165*a58d3d2aSXin Li       {
166*a58d3d2aSXin Li          return _celt_cos_pi_2(EXTRACT16(x));
167*a58d3d2aSXin Li       } else {
168*a58d3d2aSXin Li          return NEG16(_celt_cos_pi_2(EXTRACT16(65536-x)));
169*a58d3d2aSXin Li       }
170*a58d3d2aSXin Li    } else {
171*a58d3d2aSXin Li       if (x&0x0000ffff)
172*a58d3d2aSXin Li          return 0;
173*a58d3d2aSXin Li       else if (x&0x0001ffff)
174*a58d3d2aSXin Li          return -32767;
175*a58d3d2aSXin Li       else
176*a58d3d2aSXin Li          return 32767;
177*a58d3d2aSXin Li    }
178*a58d3d2aSXin Li }
179*a58d3d2aSXin Li 
180*a58d3d2aSXin Li /** Reciprocal approximation (Q15 input, Q16 output) */
celt_rcp(opus_val32 x)181*a58d3d2aSXin Li opus_val32 celt_rcp(opus_val32 x)
182*a58d3d2aSXin Li {
183*a58d3d2aSXin Li    int i;
184*a58d3d2aSXin Li    opus_val16 n;
185*a58d3d2aSXin Li    opus_val16 r;
186*a58d3d2aSXin Li    celt_sig_assert(x>0);
187*a58d3d2aSXin Li    i = celt_ilog2(x);
188*a58d3d2aSXin Li    /* n is Q15 with range [0,1). */
189*a58d3d2aSXin Li    n = VSHR32(x,i-15)-32768;
190*a58d3d2aSXin Li    /* Start with a linear approximation:
191*a58d3d2aSXin Li       r = 1.8823529411764706-0.9411764705882353*n.
192*a58d3d2aSXin Li       The coefficients and the result are Q14 in the range [15420,30840].*/
193*a58d3d2aSXin Li    r = ADD16(30840, MULT16_16_Q15(-15420, n));
194*a58d3d2aSXin Li    /* Perform two Newton iterations:
195*a58d3d2aSXin Li       r -= r*((r*n)-1.Q15)
196*a58d3d2aSXin Li          = r*((r*n)+(r-1.Q15)). */
197*a58d3d2aSXin Li    r = SUB16(r, MULT16_16_Q15(r,
198*a58d3d2aSXin Li              ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768))));
199*a58d3d2aSXin Li    /* We subtract an extra 1 in the second iteration to avoid overflow; it also
200*a58d3d2aSXin Li        neatly compensates for truncation error in the rest of the process. */
201*a58d3d2aSXin Li    r = SUB16(r, ADD16(1, MULT16_16_Q15(r,
202*a58d3d2aSXin Li              ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768)))));
203*a58d3d2aSXin Li    /* r is now the Q15 solution to 2/(n+1), with a maximum relative error
204*a58d3d2aSXin Li        of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute
205*a58d3d2aSXin Li        error of 1.24665/32768. */
206*a58d3d2aSXin Li    return VSHR32(EXTEND32(r),i-16);
207*a58d3d2aSXin Li }
208*a58d3d2aSXin Li 
209*a58d3d2aSXin Li #endif
210