xref: /aosp_15_r20/external/pffft/pf_mixer.cpp (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1*3f1979aaSAndroid Build Coastguard Worker /*
2*3f1979aaSAndroid Build Coastguard Worker This software is part of pffft/pfdsp, a set of simple DSP routines.
3*3f1979aaSAndroid Build Coastguard Worker 
4*3f1979aaSAndroid Build Coastguard Worker Copyright (c) 2014, Andras Retzler <[email protected]>
5*3f1979aaSAndroid Build Coastguard Worker Copyright (c) 2020  Hayati Ayguen <[email protected]>
6*3f1979aaSAndroid Build Coastguard Worker All rights reserved.
7*3f1979aaSAndroid Build Coastguard Worker 
8*3f1979aaSAndroid Build Coastguard Worker Redistribution and use in source and binary forms, with or without
9*3f1979aaSAndroid Build Coastguard Worker modification, are permitted provided that the following conditions are met:
10*3f1979aaSAndroid Build Coastguard Worker     * Redistributions of source code must retain the above copyright
11*3f1979aaSAndroid Build Coastguard Worker       notice, this list of conditions and the following disclaimer.
12*3f1979aaSAndroid Build Coastguard Worker     * Redistributions in binary form must reproduce the above copyright
13*3f1979aaSAndroid Build Coastguard Worker       notice, this list of conditions and the following disclaimer in the
14*3f1979aaSAndroid Build Coastguard Worker       documentation and/or other materials provided with the distribution.
15*3f1979aaSAndroid Build Coastguard Worker     * Neither the name of the copyright holder nor the
16*3f1979aaSAndroid Build Coastguard Worker       names of its contributors may be used to endorse or promote products
17*3f1979aaSAndroid Build Coastguard Worker       derived from this software without specific prior written permission.
18*3f1979aaSAndroid Build Coastguard Worker 
19*3f1979aaSAndroid Build Coastguard Worker THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20*3f1979aaSAndroid Build Coastguard Worker ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21*3f1979aaSAndroid Build Coastguard Worker WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22*3f1979aaSAndroid Build Coastguard Worker DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
23*3f1979aaSAndroid Build Coastguard Worker DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24*3f1979aaSAndroid Build Coastguard Worker (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25*3f1979aaSAndroid Build Coastguard Worker LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26*3f1979aaSAndroid Build Coastguard Worker ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27*3f1979aaSAndroid Build Coastguard Worker (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28*3f1979aaSAndroid Build Coastguard Worker SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*3f1979aaSAndroid Build Coastguard Worker */
30*3f1979aaSAndroid Build Coastguard Worker 
31*3f1979aaSAndroid Build Coastguard Worker /* include own header first, to see missing includes */
32*3f1979aaSAndroid Build Coastguard Worker #include "pf_mixer.h"
33*3f1979aaSAndroid Build Coastguard Worker #include "fmv.h"
34*3f1979aaSAndroid Build Coastguard Worker 
35*3f1979aaSAndroid Build Coastguard Worker #include <math.h>
36*3f1979aaSAndroid Build Coastguard Worker #include <stdlib.h>
37*3f1979aaSAndroid Build Coastguard Worker #include <assert.h>
38*3f1979aaSAndroid Build Coastguard Worker 
//M_PI is not part of ISO C (it is a POSIX/XSI extension of <math.h>), so define our own single-precision constant:
40*3f1979aaSAndroid Build Coastguard Worker #define PI ((float)3.14159265358979323846)
41*3f1979aaSAndroid Build Coastguard Worker 
42*3f1979aaSAndroid Build Coastguard Worker //apply to pointers:
43*3f1979aaSAndroid Build Coastguard Worker #define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)))
44*3f1979aaSAndroid Build Coastguard Worker #define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1))
45*3f1979aaSAndroid Build Coastguard Worker 
46*3f1979aaSAndroid Build Coastguard Worker #define USE_ALIGNED_ADDRESSES  0
47*3f1979aaSAndroid Build Coastguard Worker 
48*3f1979aaSAndroid Build Coastguard Worker 
49*3f1979aaSAndroid Build Coastguard Worker 
50*3f1979aaSAndroid Build Coastguard Worker /*
51*3f1979aaSAndroid Build Coastguard Worker   _____   _____ _____      __                  _   _
52*3f1979aaSAndroid Build Coastguard Worker  |  __ \ / ____|  __ \    / _|                | | (_)
53*3f1979aaSAndroid Build Coastguard Worker  | |  | | (___ | |__) |  | |_ _   _ _ __   ___| |_ _  ___  _ __  ___
54*3f1979aaSAndroid Build Coastguard Worker  | |  | |\___ \|  ___/   |  _| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
55*3f1979aaSAndroid Build Coastguard Worker  | |__| |____) | |       | | | |_| | | | | (__| |_| | (_) | | | \__ \
56*3f1979aaSAndroid Build Coastguard Worker  |_____/|_____/|_|       |_|  \__,_|_| |_|\___|\__|_|\___/|_| |_|___/
57*3f1979aaSAndroid Build Coastguard Worker 
58*3f1979aaSAndroid Build Coastguard Worker */
59*3f1979aaSAndroid Build Coastguard Worker 
60*3f1979aaSAndroid Build Coastguard Worker 
61*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__)
62*3f1979aaSAndroid Build Coastguard Worker #  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
63*3f1979aaSAndroid Build Coastguard Worker #  define RESTRICT __restrict
64*3f1979aaSAndroid Build Coastguard Worker #elif defined(_MSC_VER)
65*3f1979aaSAndroid Build Coastguard Worker #  define ALWAYS_INLINE(return_type) __forceinline return_type
66*3f1979aaSAndroid Build Coastguard Worker #  define RESTRICT __restrict
67*3f1979aaSAndroid Build Coastguard Worker #endif
68*3f1979aaSAndroid Build Coastguard Worker 
69*3f1979aaSAndroid Build Coastguard Worker 
70*3f1979aaSAndroid Build Coastguard Worker #ifndef PFFFT_SIMD_DISABLE
71*3f1979aaSAndroid Build Coastguard Worker #if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
72*3f1979aaSAndroid Build Coastguard Worker   #pragma message "Manual SSE x86/x64 optimizations are ON"
73*3f1979aaSAndroid Build Coastguard Worker   #include <xmmintrin.h>
74*3f1979aaSAndroid Build Coastguard Worker   #define HAVE_SSE_INTRINSICS 1
75*3f1979aaSAndroid Build Coastguard Worker 
76*3f1979aaSAndroid Build Coastguard Worker #elif defined(PFFFT_ENABLE_NEON) && defined(__arm__)
77*3f1979aaSAndroid Build Coastguard Worker   #pragma message "Manual NEON (arm32) optimizations are ON"
78*3f1979aaSAndroid Build Coastguard Worker   #include "sse2neon.h"
79*3f1979aaSAndroid Build Coastguard Worker   #define HAVE_SSE_INTRINSICS 1
80*3f1979aaSAndroid Build Coastguard Worker 
81*3f1979aaSAndroid Build Coastguard Worker #elif defined(PFFFT_ENABLE_NEON) && defined(__aarch64__)
82*3f1979aaSAndroid Build Coastguard Worker   #pragma message "Manual NEON (aarch64) optimizations are ON"
83*3f1979aaSAndroid Build Coastguard Worker   #include "sse2neon.h"
84*3f1979aaSAndroid Build Coastguard Worker   #define HAVE_SSE_INTRINSICS 1
85*3f1979aaSAndroid Build Coastguard Worker 
86*3f1979aaSAndroid Build Coastguard Worker #endif
87*3f1979aaSAndroid Build Coastguard Worker #endif
88*3f1979aaSAndroid Build Coastguard Worker 
89*3f1979aaSAndroid Build Coastguard Worker #ifdef HAVE_SSE_INTRINSICS
90*3f1979aaSAndroid Build Coastguard Worker 
91*3f1979aaSAndroid Build Coastguard Worker typedef __m128 v4sf;
92*3f1979aaSAndroid Build Coastguard Worker #  define SIMD_SZ 4
93*3f1979aaSAndroid Build Coastguard Worker 
94*3f1979aaSAndroid Build Coastguard Worker typedef union v4_union {
95*3f1979aaSAndroid Build Coastguard Worker   __m128  v;
96*3f1979aaSAndroid Build Coastguard Worker   float f[4];
97*3f1979aaSAndroid Build Coastguard Worker } v4_union;
98*3f1979aaSAndroid Build Coastguard Worker 
99*3f1979aaSAndroid Build Coastguard Worker #define VMUL(a,b)                 _mm_mul_ps(a,b)
100*3f1979aaSAndroid Build Coastguard Worker #define VDIV(a,b)                 _mm_div_ps(a,b)
101*3f1979aaSAndroid Build Coastguard Worker #define VADD(a,b)                 _mm_add_ps(a,b)
102*3f1979aaSAndroid Build Coastguard Worker #define VSUB(a,b)                 _mm_sub_ps(a,b)
103*3f1979aaSAndroid Build Coastguard Worker #define LD_PS1(s)                 _mm_set1_ps(s)
104*3f1979aaSAndroid Build Coastguard Worker #define VLOAD_UNALIGNED(ptr)      _mm_loadu_ps((const float *)(ptr))
105*3f1979aaSAndroid Build Coastguard Worker #define VLOAD_ALIGNED(ptr)        _mm_load_ps((const float *)(ptr))
106*3f1979aaSAndroid Build Coastguard Worker #define VSTORE_UNALIGNED(ptr, v)  _mm_storeu_ps((float*)(ptr), v)
107*3f1979aaSAndroid Build Coastguard Worker #define VSTORE_ALIGNED(ptr, v)    _mm_store_ps((float*)(ptr), v)
108*3f1979aaSAndroid Build Coastguard Worker #define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
109*3f1979aaSAndroid Build Coastguard Worker #define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
110*3f1979aaSAndroid Build Coastguard Worker 
111*3f1979aaSAndroid Build Coastguard Worker #if USE_ALIGNED_ADDRESSES
112*3f1979aaSAndroid Build Coastguard Worker   #define VLOAD(ptr)              _mm_load_ps((const float *)(ptr))
113*3f1979aaSAndroid Build Coastguard Worker   #define VSTORE(ptr, v)          _mm_store_ps((float*)(ptr), v)
114*3f1979aaSAndroid Build Coastguard Worker #else
115*3f1979aaSAndroid Build Coastguard Worker   #define VLOAD(ptr)              _mm_loadu_ps((const float *)(ptr))
116*3f1979aaSAndroid Build Coastguard Worker   #define VSTORE(ptr, v)          _mm_storeu_ps((float*)(ptr), v)
117*3f1979aaSAndroid Build Coastguard Worker #endif
118*3f1979aaSAndroid Build Coastguard Worker 
119*3f1979aaSAndroid Build Coastguard Worker 
/* Build-time capability query: this definition is the one compiled when
 * HAVE_SSE_INTRINSICS is defined, so it reports that the SIMD code path
 * is present (non-zero). */
int have_sse_shift_mixer_impl()
{
    enum { SIMD_SHIFT_MIXER_PRESENT = 1 };
    return SIMD_SHIFT_MIXER_PRESENT;
}
124*3f1979aaSAndroid Build Coastguard Worker 
125*3f1979aaSAndroid Build Coastguard Worker #else
126*3f1979aaSAndroid Build Coastguard Worker 
/* Build-time capability query: this definition is the one compiled when
 * HAVE_SSE_INTRINSICS is NOT defined, so it reports that no SIMD code
 * path is present (zero). */
int have_sse_shift_mixer_impl()
{
    enum { SIMD_SHIFT_MIXER_PRESENT = 0 };
    return SIMD_SHIFT_MIXER_PRESENT;
}
131*3f1979aaSAndroid Build Coastguard Worker 
132*3f1979aaSAndroid Build Coastguard Worker #endif
133*3f1979aaSAndroid Build Coastguard Worker 
134*3f1979aaSAndroid Build Coastguard Worker 
135*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
136*3f1979aaSAndroid Build Coastguard Worker 
137*3f1979aaSAndroid Build Coastguard Worker /**************/
138*3f1979aaSAndroid Build Coastguard Worker /*** ALGO A ***/
139*3f1979aaSAndroid Build Coastguard Worker /**************/
140*3f1979aaSAndroid Build Coastguard Worker 
141*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_math_cc(complexf * input,complexf * output,int input_size,float rate,float starting_phase)142*3f1979aaSAndroid Build Coastguard Worker float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase)
143*3f1979aaSAndroid Build Coastguard Worker {
144*3f1979aaSAndroid Build Coastguard Worker     rate*=2;
145*3f1979aaSAndroid Build Coastguard Worker     //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath.
146*3f1979aaSAndroid Build Coastguard Worker     float phase=starting_phase;
147*3f1979aaSAndroid Build Coastguard Worker     float phase_increment=rate*PI;
148*3f1979aaSAndroid Build Coastguard Worker     float cosval, sinval;
149*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<input_size; i++)
150*3f1979aaSAndroid Build Coastguard Worker     {
151*3f1979aaSAndroid Build Coastguard Worker         cosval=cos(phase);
152*3f1979aaSAndroid Build Coastguard Worker         sinval=sin(phase);
153*3f1979aaSAndroid Build Coastguard Worker         //we multiply two complex numbers.
154*3f1979aaSAndroid Build Coastguard Worker         //how? enter this to maxima (software) for explanation:
155*3f1979aaSAndroid Build Coastguard Worker         //   (a+b*%i)*(c+d*%i), rectform;
156*3f1979aaSAndroid Build Coastguard Worker         iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
157*3f1979aaSAndroid Build Coastguard Worker         qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
158*3f1979aaSAndroid Build Coastguard Worker         phase+=phase_increment;
159*3f1979aaSAndroid Build Coastguard Worker         while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
160*3f1979aaSAndroid Build Coastguard Worker         while(phase<0) phase+=2*PI;
161*3f1979aaSAndroid Build Coastguard Worker     }
162*3f1979aaSAndroid Build Coastguard Worker     return phase;
163*3f1979aaSAndroid Build Coastguard Worker }
164*3f1979aaSAndroid Build Coastguard Worker 
165*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
166*3f1979aaSAndroid Build Coastguard Worker 
167*3f1979aaSAndroid Build Coastguard Worker /**************/
168*3f1979aaSAndroid Build Coastguard Worker /*** ALGO B ***/
169*3f1979aaSAndroid Build Coastguard Worker /**************/
170*3f1979aaSAndroid Build Coastguard Worker 
shift_table_init(int table_size)171*3f1979aaSAndroid Build Coastguard Worker shift_table_data_t shift_table_init(int table_size)
172*3f1979aaSAndroid Build Coastguard Worker {
173*3f1979aaSAndroid Build Coastguard Worker     shift_table_data_t output;
174*3f1979aaSAndroid Build Coastguard Worker     output.table=(float*)malloc(sizeof(float)*table_size);
175*3f1979aaSAndroid Build Coastguard Worker     output.table_size=table_size;
176*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<table_size;i++)
177*3f1979aaSAndroid Build Coastguard Worker     {
178*3f1979aaSAndroid Build Coastguard Worker         output.table[i]=sin(((float)i/table_size)*(PI/2));
179*3f1979aaSAndroid Build Coastguard Worker     }
180*3f1979aaSAndroid Build Coastguard Worker     return output;
181*3f1979aaSAndroid Build Coastguard Worker }
182*3f1979aaSAndroid Build Coastguard Worker 
shift_table_deinit(shift_table_data_t table_data)183*3f1979aaSAndroid Build Coastguard Worker void shift_table_deinit(shift_table_data_t table_data)
184*3f1979aaSAndroid Build Coastguard Worker {
185*3f1979aaSAndroid Build Coastguard Worker     free(table_data.table);
186*3f1979aaSAndroid Build Coastguard Worker }
187*3f1979aaSAndroid Build Coastguard Worker 
188*3f1979aaSAndroid Build Coastguard Worker 
189*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_table_cc(complexf * input,complexf * output,int input_size,float rate,shift_table_data_t table_data,float starting_phase)190*3f1979aaSAndroid Build Coastguard Worker float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase)
191*3f1979aaSAndroid Build Coastguard Worker {
192*3f1979aaSAndroid Build Coastguard Worker     rate*=2;
193*3f1979aaSAndroid Build Coastguard Worker     //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table.
194*3f1979aaSAndroid Build Coastguard Worker     float phase=starting_phase;
195*3f1979aaSAndroid Build Coastguard Worker     float phase_increment=rate*PI;
196*3f1979aaSAndroid Build Coastguard Worker     float cosval, sinval;
197*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<input_size; i++) //@shift_math_cc
198*3f1979aaSAndroid Build Coastguard Worker     {
199*3f1979aaSAndroid Build Coastguard Worker         int sin_index, cos_index, temp_index, sin_sign, cos_sign;
200*3f1979aaSAndroid Build Coastguard Worker         int quadrant=phase/(PI/2); //between 0 and 3
201*3f1979aaSAndroid Build Coastguard Worker         float vphase=phase-quadrant*(PI/2);
202*3f1979aaSAndroid Build Coastguard Worker         sin_index=(vphase/(PI/2))*table_data.table_size;
203*3f1979aaSAndroid Build Coastguard Worker         cos_index=table_data.table_size-1-sin_index;
204*3f1979aaSAndroid Build Coastguard Worker         if(quadrant&1) //in quadrant 1 and 3
205*3f1979aaSAndroid Build Coastguard Worker         {
206*3f1979aaSAndroid Build Coastguard Worker             temp_index=sin_index;
207*3f1979aaSAndroid Build Coastguard Worker             sin_index=cos_index;
208*3f1979aaSAndroid Build Coastguard Worker             cos_index=temp_index;
209*3f1979aaSAndroid Build Coastguard Worker         }
210*3f1979aaSAndroid Build Coastguard Worker         sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3
211*3f1979aaSAndroid Build Coastguard Worker         cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2
212*3f1979aaSAndroid Build Coastguard Worker         sinval=sin_sign*table_data.table[sin_index];
213*3f1979aaSAndroid Build Coastguard Worker         cosval=cos_sign*table_data.table[cos_index];
214*3f1979aaSAndroid Build Coastguard Worker         //we multiply two complex numbers.
215*3f1979aaSAndroid Build Coastguard Worker         //how? enter this to maxima (software) for explanation:
216*3f1979aaSAndroid Build Coastguard Worker         //   (a+b*%i)*(c+d*%i), rectform;
217*3f1979aaSAndroid Build Coastguard Worker         iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i);
218*3f1979aaSAndroid Build Coastguard Worker         qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i);
219*3f1979aaSAndroid Build Coastguard Worker         phase+=phase_increment;
220*3f1979aaSAndroid Build Coastguard Worker         while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase
221*3f1979aaSAndroid Build Coastguard Worker         while(phase<0) phase+=2*PI;
222*3f1979aaSAndroid Build Coastguard Worker     }
223*3f1979aaSAndroid Build Coastguard Worker     return phase;
224*3f1979aaSAndroid Build Coastguard Worker }
225*3f1979aaSAndroid Build Coastguard Worker 
226*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
227*3f1979aaSAndroid Build Coastguard Worker 
228*3f1979aaSAndroid Build Coastguard Worker /**************/
229*3f1979aaSAndroid Build Coastguard Worker /*** ALGO C ***/
230*3f1979aaSAndroid Build Coastguard Worker /**************/
231*3f1979aaSAndroid Build Coastguard Worker 
shift_addfast_init(float rate)232*3f1979aaSAndroid Build Coastguard Worker shift_addfast_data_t shift_addfast_init(float rate)
233*3f1979aaSAndroid Build Coastguard Worker {
234*3f1979aaSAndroid Build Coastguard Worker     shift_addfast_data_t output;
235*3f1979aaSAndroid Build Coastguard Worker     output.phase_increment=2*rate*PI;
236*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<4;i++)
237*3f1979aaSAndroid Build Coastguard Worker     {
238*3f1979aaSAndroid Build Coastguard Worker         output.dsin[i]=sin(output.phase_increment*(i+1));
239*3f1979aaSAndroid Build Coastguard Worker         output.dcos[i]=cos(output.phase_increment*(i+1));
240*3f1979aaSAndroid Build Coastguard Worker     }
241*3f1979aaSAndroid Build Coastguard Worker     return output;
242*3f1979aaSAndroid Build Coastguard Worker }
243*3f1979aaSAndroid Build Coastguard Worker 
244*3f1979aaSAndroid Build Coastguard Worker #define SADF_L1(j) \
245*3f1979aaSAndroid Build Coastguard Worker     cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \
246*3f1979aaSAndroid Build Coastguard Worker     sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j;
247*3f1979aaSAndroid Build Coastguard Worker #define SADF_L2(j) \
248*3f1979aaSAndroid Build Coastguard Worker     iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \
249*3f1979aaSAndroid Build Coastguard Worker     qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j);
250*3f1979aaSAndroid Build Coastguard Worker 
251*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_addfast_cc(complexf * input,complexf * output,int input_size,shift_addfast_data_t * d,float starting_phase)252*3f1979aaSAndroid Build Coastguard Worker float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
253*3f1979aaSAndroid Build Coastguard Worker {
254*3f1979aaSAndroid Build Coastguard Worker     //input_size should be multiple of 4
255*3f1979aaSAndroid Build Coastguard Worker     //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
256*3f1979aaSAndroid Build Coastguard Worker     float cos_start=cos(starting_phase);
257*3f1979aaSAndroid Build Coastguard Worker     float sin_start=sin(starting_phase);
258*3f1979aaSAndroid Build Coastguard Worker     float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
259*3f1979aaSAndroid Build Coastguard Worker         sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
260*3f1979aaSAndroid Build Coastguard Worker         dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
261*3f1979aaSAndroid Build Coastguard Worker         dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
262*3f1979aaSAndroid Build Coastguard Worker 
263*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<input_size/4; i++)
264*3f1979aaSAndroid Build Coastguard Worker     {
265*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(0)
266*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(1)
267*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(2)
268*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(3)
269*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(0)
270*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(1)
271*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(2)
272*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(3)
273*3f1979aaSAndroid Build Coastguard Worker         cos_start = cos_vals_3;
274*3f1979aaSAndroid Build Coastguard Worker         sin_start = sin_vals_3;
275*3f1979aaSAndroid Build Coastguard Worker     }
276*3f1979aaSAndroid Build Coastguard Worker     starting_phase+=input_size*d->phase_increment;
277*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase>PI) starting_phase-=2*PI;
278*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase<-PI) starting_phase+=2*PI;
279*3f1979aaSAndroid Build Coastguard Worker     return starting_phase;
280*3f1979aaSAndroid Build Coastguard Worker }
281*3f1979aaSAndroid Build Coastguard Worker 
282*3f1979aaSAndroid Build Coastguard Worker #undef SADF_L2
283*3f1979aaSAndroid Build Coastguard Worker 
284*3f1979aaSAndroid Build Coastguard Worker 
285*3f1979aaSAndroid Build Coastguard Worker #define SADF_L2(j) \
286*3f1979aaSAndroid Build Coastguard Worker     tmp_inp_cos = iof(in_out,4*i+j); \
287*3f1979aaSAndroid Build Coastguard Worker     tmp_inp_sin = qof(in_out,4*i+j); \
288*3f1979aaSAndroid Build Coastguard Worker     iof(in_out,4*i+j)=(cos_vals_ ## j)*tmp_inp_cos - (sin_vals_ ## j)*tmp_inp_sin; \
289*3f1979aaSAndroid Build Coastguard Worker     qof(in_out,4*i+j)=(sin_vals_ ## j)*tmp_inp_cos + (cos_vals_ ## j)*tmp_inp_sin;
290*3f1979aaSAndroid Build Coastguard Worker 
291*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_addfast_inp_c(complexf * in_out,int N_cplx,shift_addfast_data_t * d,float starting_phase)292*3f1979aaSAndroid Build Coastguard Worker float shift_addfast_inp_c(complexf *in_out, int N_cplx, shift_addfast_data_t* d, float starting_phase)
293*3f1979aaSAndroid Build Coastguard Worker {
294*3f1979aaSAndroid Build Coastguard Worker     //input_size should be multiple of 4
295*3f1979aaSAndroid Build Coastguard Worker     //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
296*3f1979aaSAndroid Build Coastguard Worker     float cos_start=cos(starting_phase);
297*3f1979aaSAndroid Build Coastguard Worker     float sin_start=sin(starting_phase);
298*3f1979aaSAndroid Build Coastguard Worker     float register tmp_inp_cos, tmp_inp_sin,
299*3f1979aaSAndroid Build Coastguard Worker         cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3,
300*3f1979aaSAndroid Build Coastguard Worker         sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3,
301*3f1979aaSAndroid Build Coastguard Worker         dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3],
302*3f1979aaSAndroid Build Coastguard Worker         dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3];
303*3f1979aaSAndroid Build Coastguard Worker 
304*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<N_cplx/4; i++)
305*3f1979aaSAndroid Build Coastguard Worker     {
306*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(0)
307*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(1)
308*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(2)
309*3f1979aaSAndroid Build Coastguard Worker         SADF_L1(3)
310*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(0)
311*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(1)
312*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(2)
313*3f1979aaSAndroid Build Coastguard Worker         SADF_L2(3)
314*3f1979aaSAndroid Build Coastguard Worker         cos_start = cos_vals_3;
315*3f1979aaSAndroid Build Coastguard Worker         sin_start = sin_vals_3;
316*3f1979aaSAndroid Build Coastguard Worker     }
317*3f1979aaSAndroid Build Coastguard Worker     starting_phase+=N_cplx*d->phase_increment;
318*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase>PI) starting_phase-=2*PI;
319*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase<-PI) starting_phase+=2*PI;
320*3f1979aaSAndroid Build Coastguard Worker     return starting_phase;
321*3f1979aaSAndroid Build Coastguard Worker }
322*3f1979aaSAndroid Build Coastguard Worker 
323*3f1979aaSAndroid Build Coastguard Worker #undef SADF_L1
324*3f1979aaSAndroid Build Coastguard Worker #undef SADF_L2
325*3f1979aaSAndroid Build Coastguard Worker 
326*3f1979aaSAndroid Build Coastguard Worker 
327*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
328*3f1979aaSAndroid Build Coastguard Worker 
329*3f1979aaSAndroid Build Coastguard Worker /**************/
330*3f1979aaSAndroid Build Coastguard Worker /*** ALGO D ***/
331*3f1979aaSAndroid Build Coastguard Worker /**************/
332*3f1979aaSAndroid Build Coastguard Worker 
shift_unroll_init(float rate,int size)333*3f1979aaSAndroid Build Coastguard Worker shift_unroll_data_t shift_unroll_init(float rate, int size)
334*3f1979aaSAndroid Build Coastguard Worker {
335*3f1979aaSAndroid Build Coastguard Worker     shift_unroll_data_t output;
336*3f1979aaSAndroid Build Coastguard Worker     output.phase_increment=2*rate*PI;
337*3f1979aaSAndroid Build Coastguard Worker     output.size = size;
338*3f1979aaSAndroid Build Coastguard Worker     output.dsin=(float*)malloc(sizeof(float)*size);
339*3f1979aaSAndroid Build Coastguard Worker     output.dcos=(float*)malloc(sizeof(float)*size);
340*3f1979aaSAndroid Build Coastguard Worker     float myphase = 0;
341*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<size;i++)
342*3f1979aaSAndroid Build Coastguard Worker     {
343*3f1979aaSAndroid Build Coastguard Worker         myphase += output.phase_increment;
344*3f1979aaSAndroid Build Coastguard Worker         while(myphase>PI) myphase-=2*PI;
345*3f1979aaSAndroid Build Coastguard Worker         while(myphase<-PI) myphase+=2*PI;
346*3f1979aaSAndroid Build Coastguard Worker         output.dsin[i]=sin(myphase);
347*3f1979aaSAndroid Build Coastguard Worker         output.dcos[i]=cos(myphase);
348*3f1979aaSAndroid Build Coastguard Worker     }
349*3f1979aaSAndroid Build Coastguard Worker     return output;
350*3f1979aaSAndroid Build Coastguard Worker }
351*3f1979aaSAndroid Build Coastguard Worker 
shift_unroll_deinit(shift_unroll_data_t * d)352*3f1979aaSAndroid Build Coastguard Worker void shift_unroll_deinit(shift_unroll_data_t* d)
353*3f1979aaSAndroid Build Coastguard Worker {
354*3f1979aaSAndroid Build Coastguard Worker     if (!d)
355*3f1979aaSAndroid Build Coastguard Worker         return;
356*3f1979aaSAndroid Build Coastguard Worker     free(d->dsin);
357*3f1979aaSAndroid Build Coastguard Worker     free(d->dcos);
358*3f1979aaSAndroid Build Coastguard Worker     d->dsin = NULL;
359*3f1979aaSAndroid Build Coastguard Worker     d->dcos = NULL;
360*3f1979aaSAndroid Build Coastguard Worker }
361*3f1979aaSAndroid Build Coastguard Worker 
362*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_unroll_cc(complexf * input,complexf * output,int input_size,shift_unroll_data_t * d,float starting_phase)363*3f1979aaSAndroid Build Coastguard Worker float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
364*3f1979aaSAndroid Build Coastguard Worker {
365*3f1979aaSAndroid Build Coastguard Worker     //input_size should be multiple of 4
366*3f1979aaSAndroid Build Coastguard Worker     //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
367*3f1979aaSAndroid Build Coastguard Worker     float cos_start = cos(starting_phase);
368*3f1979aaSAndroid Build Coastguard Worker     float sin_start = sin(starting_phase);
369*3f1979aaSAndroid Build Coastguard Worker     register float cos_val = cos_start, sin_val = sin_start;
370*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<input_size; i++)
371*3f1979aaSAndroid Build Coastguard Worker     {
372*3f1979aaSAndroid Build Coastguard Worker         iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i);
373*3f1979aaSAndroid Build Coastguard Worker         qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i);
374*3f1979aaSAndroid Build Coastguard Worker         // calculate complex phasor for next iteration
375*3f1979aaSAndroid Build Coastguard Worker         cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
376*3f1979aaSAndroid Build Coastguard Worker         sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
377*3f1979aaSAndroid Build Coastguard Worker     }
378*3f1979aaSAndroid Build Coastguard Worker     starting_phase+=input_size*d->phase_increment;
379*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase>PI) starting_phase-=2*PI;
380*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase<-PI) starting_phase+=2*PI;
381*3f1979aaSAndroid Build Coastguard Worker     return starting_phase;
382*3f1979aaSAndroid Build Coastguard Worker }
383*3f1979aaSAndroid Build Coastguard Worker 
384*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_unroll_inp_c(complexf * in_out,int size,shift_unroll_data_t * d,float starting_phase)385*3f1979aaSAndroid Build Coastguard Worker float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase)
386*3f1979aaSAndroid Build Coastguard Worker {
387*3f1979aaSAndroid Build Coastguard Worker     float cos_start = cos(starting_phase);
388*3f1979aaSAndroid Build Coastguard Worker     float sin_start = sin(starting_phase);
389*3f1979aaSAndroid Build Coastguard Worker     register float cos_val = cos_start, sin_val = sin_start;
390*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<size; i++)
391*3f1979aaSAndroid Build Coastguard Worker     {
392*3f1979aaSAndroid Build Coastguard Worker         register float inp_i = iof(in_out,i);
393*3f1979aaSAndroid Build Coastguard Worker         register float inp_q = qof(in_out,i);
394*3f1979aaSAndroid Build Coastguard Worker         iof(in_out,i) = cos_val*inp_i - sin_val*inp_q;
395*3f1979aaSAndroid Build Coastguard Worker         qof(in_out,i) = sin_val*inp_i + cos_val*inp_q;
396*3f1979aaSAndroid Build Coastguard Worker         // calculate complex phasor for next iteration
397*3f1979aaSAndroid Build Coastguard Worker         cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
398*3f1979aaSAndroid Build Coastguard Worker         sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
399*3f1979aaSAndroid Build Coastguard Worker     }
400*3f1979aaSAndroid Build Coastguard Worker     starting_phase += size * d->phase_increment;
401*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase>PI) starting_phase-=2*PI;
402*3f1979aaSAndroid Build Coastguard Worker     while(starting_phase<-PI) starting_phase+=2*PI;
403*3f1979aaSAndroid Build Coastguard Worker     return starting_phase;
404*3f1979aaSAndroid Build Coastguard Worker }
405*3f1979aaSAndroid Build Coastguard Worker 
406*3f1979aaSAndroid Build Coastguard Worker 
407*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
408*3f1979aaSAndroid Build Coastguard Worker 
409*3f1979aaSAndroid Build Coastguard Worker /**************/
410*3f1979aaSAndroid Build Coastguard Worker /*** ALGO E ***/
411*3f1979aaSAndroid Build Coastguard Worker /**************/
412*3f1979aaSAndroid Build Coastguard Worker 
shift_limited_unroll_init(float rate)413*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_data_t shift_limited_unroll_init(float rate)
414*3f1979aaSAndroid Build Coastguard Worker {
415*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_data_t output;
416*3f1979aaSAndroid Build Coastguard Worker     output.phase_increment=2*rate*PI;
417*3f1979aaSAndroid Build Coastguard Worker     float myphase = 0;
418*3f1979aaSAndroid Build Coastguard Worker     for(int i=0; i < PF_SHIFT_LIMITED_UNROLL_SIZE; i++)
419*3f1979aaSAndroid Build Coastguard Worker     {
420*3f1979aaSAndroid Build Coastguard Worker         myphase += output.phase_increment;
421*3f1979aaSAndroid Build Coastguard Worker         while(myphase>PI) myphase-=2*PI;
422*3f1979aaSAndroid Build Coastguard Worker         while(myphase<-PI) myphase+=2*PI;
423*3f1979aaSAndroid Build Coastguard Worker         output.dcos[i] = cos(myphase);
424*3f1979aaSAndroid Build Coastguard Worker         output.dsin[i] = sin(myphase);
425*3f1979aaSAndroid Build Coastguard Worker     }
426*3f1979aaSAndroid Build Coastguard Worker     output.complex_phase.i = 1.0F;
427*3f1979aaSAndroid Build Coastguard Worker     output.complex_phase.q = 0.0F;
428*3f1979aaSAndroid Build Coastguard Worker     return output;
429*3f1979aaSAndroid Build Coastguard Worker }
430*3f1979aaSAndroid Build Coastguard Worker 
431*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_limited_unroll_cc(const complexf * input,complexf * output,int size,shift_limited_unroll_data_t * d)432*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d)
433*3f1979aaSAndroid Build Coastguard Worker {
434*3f1979aaSAndroid Build Coastguard Worker     float cos_start = d->complex_phase.i;
435*3f1979aaSAndroid Build Coastguard Worker     float sin_start = d->complex_phase.q;
436*3f1979aaSAndroid Build Coastguard Worker     register float cos_val = cos_start, sin_val = sin_start, mag;
437*3f1979aaSAndroid Build Coastguard Worker     while (size > 0)
438*3f1979aaSAndroid Build Coastguard Worker     {
439*3f1979aaSAndroid Build Coastguard Worker         int N = (size >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : size;
440*3f1979aaSAndroid Build Coastguard Worker         for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
441*3f1979aaSAndroid Build Coastguard Worker         {
442*3f1979aaSAndroid Build Coastguard Worker             for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
443*3f1979aaSAndroid Build Coastguard Worker             {
444*3f1979aaSAndroid Build Coastguard Worker                 iof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
445*3f1979aaSAndroid Build Coastguard Worker                 qof(output,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,PF_SHIFT_LIMITED_SIMD_SZ*i+j);
446*3f1979aaSAndroid Build Coastguard Worker                 // calculate complex phasor for next iteration
447*3f1979aaSAndroid Build Coastguard Worker                 cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
448*3f1979aaSAndroid Build Coastguard Worker                 sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
449*3f1979aaSAndroid Build Coastguard Worker             }
450*3f1979aaSAndroid Build Coastguard Worker         }
451*3f1979aaSAndroid Build Coastguard Worker         // "starts := vals := vals / |vals|"
452*3f1979aaSAndroid Build Coastguard Worker         mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
453*3f1979aaSAndroid Build Coastguard Worker         cos_val /= mag;
454*3f1979aaSAndroid Build Coastguard Worker         sin_val /= mag;
455*3f1979aaSAndroid Build Coastguard Worker         cos_start = cos_val;
456*3f1979aaSAndroid Build Coastguard Worker         sin_start = sin_val;
457*3f1979aaSAndroid Build Coastguard Worker 
458*3f1979aaSAndroid Build Coastguard Worker         input += PF_SHIFT_LIMITED_UNROLL_SIZE;
459*3f1979aaSAndroid Build Coastguard Worker         output += PF_SHIFT_LIMITED_UNROLL_SIZE;
460*3f1979aaSAndroid Build Coastguard Worker         size -= PF_SHIFT_LIMITED_UNROLL_SIZE;
461*3f1979aaSAndroid Build Coastguard Worker     }
462*3f1979aaSAndroid Build Coastguard Worker     d->complex_phase.i = cos_val;
463*3f1979aaSAndroid Build Coastguard Worker     d->complex_phase.q = sin_val;
464*3f1979aaSAndroid Build Coastguard Worker }
465*3f1979aaSAndroid Build Coastguard Worker 
466*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_limited_unroll_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_data_t * d)467*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_data_t* d)
468*3f1979aaSAndroid Build Coastguard Worker {
469*3f1979aaSAndroid Build Coastguard Worker     float inp_i[PF_SHIFT_LIMITED_SIMD_SZ];
470*3f1979aaSAndroid Build Coastguard Worker     float inp_q[PF_SHIFT_LIMITED_SIMD_SZ];
471*3f1979aaSAndroid Build Coastguard Worker     // "vals := starts := phase_state"
472*3f1979aaSAndroid Build Coastguard Worker     float cos_start = d->complex_phase.i;
473*3f1979aaSAndroid Build Coastguard Worker     float sin_start = d->complex_phase.q;
474*3f1979aaSAndroid Build Coastguard Worker     register float cos_val = cos_start, sin_val = sin_start, mag;
475*3f1979aaSAndroid Build Coastguard Worker     while (N_cplx)
476*3f1979aaSAndroid Build Coastguard Worker     {
477*3f1979aaSAndroid Build Coastguard Worker         int N = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
478*3f1979aaSAndroid Build Coastguard Worker         for(int i=0;i<N/PF_SHIFT_LIMITED_SIMD_SZ; i++ )
479*3f1979aaSAndroid Build Coastguard Worker         {
480*3f1979aaSAndroid Build Coastguard Worker             for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
481*3f1979aaSAndroid Build Coastguard Worker                 inp_i[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].i;
482*3f1979aaSAndroid Build Coastguard Worker             for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
483*3f1979aaSAndroid Build Coastguard Worker                 inp_q[j] = in_out[PF_SHIFT_LIMITED_SIMD_SZ*i+j].q;
484*3f1979aaSAndroid Build Coastguard Worker             for(int j=0; j<PF_SHIFT_LIMITED_SIMD_SZ; j++)
485*3f1979aaSAndroid Build Coastguard Worker             {
486*3f1979aaSAndroid Build Coastguard Worker                 // "out[] = inp[] * vals"
487*3f1979aaSAndroid Build Coastguard Worker                 iof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j];
488*3f1979aaSAndroid Build Coastguard Worker                 qof(in_out,PF_SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j];
489*3f1979aaSAndroid Build Coastguard Worker                 // calculate complex phasor for next iteration
490*3f1979aaSAndroid Build Coastguard Worker                 // "vals :=  d[] * starts"
491*3f1979aaSAndroid Build Coastguard Worker                 cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
492*3f1979aaSAndroid Build Coastguard Worker                 sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
493*3f1979aaSAndroid Build Coastguard Worker             }
494*3f1979aaSAndroid Build Coastguard Worker         }
495*3f1979aaSAndroid Build Coastguard Worker         // "starts := vals := vals / |vals|"
496*3f1979aaSAndroid Build Coastguard Worker         mag = sqrtf(cos_val * cos_val + sin_val * sin_val);
497*3f1979aaSAndroid Build Coastguard Worker         cos_val /= mag;
498*3f1979aaSAndroid Build Coastguard Worker         sin_val /= mag;
499*3f1979aaSAndroid Build Coastguard Worker         cos_start = cos_val;
500*3f1979aaSAndroid Build Coastguard Worker         sin_start = sin_val;
501*3f1979aaSAndroid Build Coastguard Worker 
502*3f1979aaSAndroid Build Coastguard Worker         in_out += PF_SHIFT_LIMITED_UNROLL_SIZE;
503*3f1979aaSAndroid Build Coastguard Worker         N_cplx -= PF_SHIFT_LIMITED_UNROLL_SIZE;
504*3f1979aaSAndroid Build Coastguard Worker     }
505*3f1979aaSAndroid Build Coastguard Worker     // "phase_state := starts"
506*3f1979aaSAndroid Build Coastguard Worker     d->complex_phase.i = cos_start;
507*3f1979aaSAndroid Build Coastguard Worker     d->complex_phase.q = sin_start;
508*3f1979aaSAndroid Build Coastguard Worker }
509*3f1979aaSAndroid Build Coastguard Worker 
510*3f1979aaSAndroid Build Coastguard Worker 
511*3f1979aaSAndroid Build Coastguard Worker #ifdef HAVE_SSE_INTRINSICS
512*3f1979aaSAndroid Build Coastguard Worker 
513*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
514*3f1979aaSAndroid Build Coastguard Worker 
515*3f1979aaSAndroid Build Coastguard Worker /**************/
516*3f1979aaSAndroid Build Coastguard Worker /*** ALGO F ***/
517*3f1979aaSAndroid Build Coastguard Worker /**************/
518*3f1979aaSAndroid Build Coastguard Worker 
shift_limited_unroll_A_sse_init(float relative_freq,float phase_start_rad)519*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad)
520*3f1979aaSAndroid Build Coastguard Worker {
521*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_A_sse_data_t output;
522*3f1979aaSAndroid Build Coastguard Worker     float myphase;
523*3f1979aaSAndroid Build Coastguard Worker 
524*3f1979aaSAndroid Build Coastguard Worker     output.phase_increment = 2*relative_freq*PI;
525*3f1979aaSAndroid Build Coastguard Worker 
526*3f1979aaSAndroid Build Coastguard Worker     myphase = 0.0F;
527*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
528*3f1979aaSAndroid Build Coastguard Worker     {
529*3f1979aaSAndroid Build Coastguard Worker         for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
530*3f1979aaSAndroid Build Coastguard Worker         {
531*3f1979aaSAndroid Build Coastguard Worker             myphase += output.phase_increment;
532*3f1979aaSAndroid Build Coastguard Worker             while(myphase>PI) myphase-=2*PI;
533*3f1979aaSAndroid Build Coastguard Worker             while(myphase<-PI) myphase+=2*PI;
534*3f1979aaSAndroid Build Coastguard Worker         }
535*3f1979aaSAndroid Build Coastguard Worker         output.dcos[i] = cos(myphase);
536*3f1979aaSAndroid Build Coastguard Worker         output.dsin[i] = sin(myphase);
537*3f1979aaSAndroid Build Coastguard Worker         for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
538*3f1979aaSAndroid Build Coastguard Worker         {
539*3f1979aaSAndroid Build Coastguard Worker             output.dcos[i+k] = output.dcos[i];
540*3f1979aaSAndroid Build Coastguard Worker             output.dsin[i+k] = output.dsin[i];
541*3f1979aaSAndroid Build Coastguard Worker         }
542*3f1979aaSAndroid Build Coastguard Worker     }
543*3f1979aaSAndroid Build Coastguard Worker 
544*3f1979aaSAndroid Build Coastguard Worker     output.dcos_blk = 0.0F;
545*3f1979aaSAndroid Build Coastguard Worker     output.dsin_blk = 0.0F;
546*3f1979aaSAndroid Build Coastguard Worker 
547*3f1979aaSAndroid Build Coastguard Worker     myphase = phase_start_rad;
548*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
549*3f1979aaSAndroid Build Coastguard Worker     {
550*3f1979aaSAndroid Build Coastguard Worker         output.phase_state_i[i] = cos(myphase);
551*3f1979aaSAndroid Build Coastguard Worker         output.phase_state_q[i] = sin(myphase);
552*3f1979aaSAndroid Build Coastguard Worker         myphase += output.phase_increment;
553*3f1979aaSAndroid Build Coastguard Worker         while(myphase>PI) myphase-=2*PI;
554*3f1979aaSAndroid Build Coastguard Worker         while(myphase<-PI) myphase+=2*PI;
555*3f1979aaSAndroid Build Coastguard Worker     }
556*3f1979aaSAndroid Build Coastguard Worker     return output;
557*3f1979aaSAndroid Build Coastguard Worker }
558*3f1979aaSAndroid Build Coastguard Worker 
559*3f1979aaSAndroid Build Coastguard Worker 
560*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_limited_unroll_A_sse_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_A_sse_data_t * d)561*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d)
562*3f1979aaSAndroid Build Coastguard Worker {
563*3f1979aaSAndroid Build Coastguard Worker     // "vals := starts := phase_state"
564*3f1979aaSAndroid Build Coastguard Worker     __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
565*3f1979aaSAndroid Build Coastguard Worker     __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
566*3f1979aaSAndroid Build Coastguard Worker     __m128 cos_vals = cos_starts;
567*3f1979aaSAndroid Build Coastguard Worker     __m128 sin_vals = sin_starts;
568*3f1979aaSAndroid Build Coastguard Worker     __m128 inp_re, inp_im;
569*3f1979aaSAndroid Build Coastguard Worker     __m128 product_re, product_im;
570*3f1979aaSAndroid Build Coastguard Worker     __m128 interl_prod_a, interl_prod_b;
571*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT p_trig_cos_tab;
572*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT p_trig_sin_tab;
573*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT u = (__m128*)in_out;
574*3f1979aaSAndroid Build Coastguard Worker 
575*3f1979aaSAndroid Build Coastguard Worker     while (N_cplx)
576*3f1979aaSAndroid Build Coastguard Worker     {
577*3f1979aaSAndroid Build Coastguard Worker         const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
578*3f1979aaSAndroid Build Coastguard Worker         int B = NB;
579*3f1979aaSAndroid Build Coastguard Worker         p_trig_cos_tab = (__m128*)( &d->dcos[0] );
580*3f1979aaSAndroid Build Coastguard Worker         p_trig_sin_tab = (__m128*)( &d->dsin[0] );
581*3f1979aaSAndroid Build Coastguard Worker         while (B)
582*3f1979aaSAndroid Build Coastguard Worker         {
583*3f1979aaSAndroid Build Coastguard Worker             // complex multiplication of 4 complex values from/to in_out[]
584*3f1979aaSAndroid Build Coastguard Worker             // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
585*3f1979aaSAndroid Build Coastguard Worker             // "out[] = inp[] * vals"
586*3f1979aaSAndroid Build Coastguard Worker             UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
587*3f1979aaSAndroid Build Coastguard Worker             product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
588*3f1979aaSAndroid Build Coastguard Worker             product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
589*3f1979aaSAndroid Build Coastguard Worker             INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
590*3f1979aaSAndroid Build Coastguard Worker             VSTORE(u, interl_prod_a);
591*3f1979aaSAndroid Build Coastguard Worker             VSTORE(u+1, interl_prod_b);
592*3f1979aaSAndroid Build Coastguard Worker             u += 2;
593*3f1979aaSAndroid Build Coastguard Worker             // calculate complex phasor for next iteration
594*3f1979aaSAndroid Build Coastguard Worker             // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
595*3f1979aaSAndroid Build Coastguard Worker             // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
596*3f1979aaSAndroid Build Coastguard Worker             // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
597*3f1979aaSAndroid Build Coastguard Worker             // "vals :=  d[] * starts"
598*3f1979aaSAndroid Build Coastguard Worker             inp_re = VLOAD(p_trig_cos_tab);
599*3f1979aaSAndroid Build Coastguard Worker             inp_im = VLOAD(p_trig_sin_tab);
600*3f1979aaSAndroid Build Coastguard Worker             cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
601*3f1979aaSAndroid Build Coastguard Worker             sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
602*3f1979aaSAndroid Build Coastguard Worker             ++p_trig_cos_tab;
603*3f1979aaSAndroid Build Coastguard Worker             ++p_trig_sin_tab;
604*3f1979aaSAndroid Build Coastguard Worker             B -= 4;
605*3f1979aaSAndroid Build Coastguard Worker         }
606*3f1979aaSAndroid Build Coastguard Worker         N_cplx -= NB;
607*3f1979aaSAndroid Build Coastguard Worker         /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
608*3f1979aaSAndroid Build Coastguard Worker         /* re-use product_re[]/product_im[] for normalization */
609*3f1979aaSAndroid Build Coastguard Worker         // "starts := vals := vals / |vals|"
610*3f1979aaSAndroid Build Coastguard Worker         product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
611*3f1979aaSAndroid Build Coastguard Worker #if 0
612*3f1979aaSAndroid Build Coastguard Worker         // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
613*3f1979aaSAndroid Build Coastguard Worker         // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
614*3f1979aaSAndroid Build Coastguard Worker         product_im = _mm_rsqrt_ps(product_re);
615*3f1979aaSAndroid Build Coastguard Worker         cos_starts = cos_vals = VMUL(cos_vals, product_im);
616*3f1979aaSAndroid Build Coastguard Worker         sin_starts = sin_vals = VMUL(sin_vals, product_im);
617*3f1979aaSAndroid Build Coastguard Worker #else
618*3f1979aaSAndroid Build Coastguard Worker         // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
619*3f1979aaSAndroid Build Coastguard Worker         // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
620*3f1979aaSAndroid Build Coastguard Worker         product_im = _mm_sqrt_ps(product_re);
621*3f1979aaSAndroid Build Coastguard Worker         cos_starts = cos_vals = VDIV(cos_vals, product_im);
622*3f1979aaSAndroid Build Coastguard Worker         sin_starts = sin_vals = VDIV(sin_vals, product_im);
623*3f1979aaSAndroid Build Coastguard Worker #endif
624*3f1979aaSAndroid Build Coastguard Worker     }
625*3f1979aaSAndroid Build Coastguard Worker     // "phase_state := starts"
626*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &d->phase_state_i[0], cos_starts );
627*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &d->phase_state_q[0], sin_starts );
628*3f1979aaSAndroid Build Coastguard Worker }
629*3f1979aaSAndroid Build Coastguard Worker 
630*3f1979aaSAndroid Build Coastguard Worker 
631*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
632*3f1979aaSAndroid Build Coastguard Worker 
633*3f1979aaSAndroid Build Coastguard Worker /**************/
634*3f1979aaSAndroid Build Coastguard Worker /*** ALGO G ***/
635*3f1979aaSAndroid Build Coastguard Worker /**************/
636*3f1979aaSAndroid Build Coastguard Worker 
shift_limited_unroll_B_sse_init(float relative_freq,float phase_start_rad)637*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad)
638*3f1979aaSAndroid Build Coastguard Worker {
639*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_B_sse_data_t output;
640*3f1979aaSAndroid Build Coastguard Worker     float myphase;
641*3f1979aaSAndroid Build Coastguard Worker 
642*3f1979aaSAndroid Build Coastguard Worker     output.phase_increment = 2*relative_freq*PI;
643*3f1979aaSAndroid Build Coastguard Worker 
644*3f1979aaSAndroid Build Coastguard Worker     myphase = 0.0F;
645*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
646*3f1979aaSAndroid Build Coastguard Worker     {
647*3f1979aaSAndroid Build Coastguard Worker         for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
648*3f1979aaSAndroid Build Coastguard Worker         {
649*3f1979aaSAndroid Build Coastguard Worker             myphase += output.phase_increment;
650*3f1979aaSAndroid Build Coastguard Worker             while(myphase>PI) myphase-=2*PI;
651*3f1979aaSAndroid Build Coastguard Worker             while(myphase<-PI) myphase+=2*PI;
652*3f1979aaSAndroid Build Coastguard Worker         }
653*3f1979aaSAndroid Build Coastguard Worker         output.dtrig[i+0] = cos(myphase);
654*3f1979aaSAndroid Build Coastguard Worker         output.dtrig[i+1] = sin(myphase);
655*3f1979aaSAndroid Build Coastguard Worker         output.dtrig[i+2] = output.dtrig[i+0];
656*3f1979aaSAndroid Build Coastguard Worker         output.dtrig[i+3] = output.dtrig[i+1];
657*3f1979aaSAndroid Build Coastguard Worker     }
658*3f1979aaSAndroid Build Coastguard Worker 
659*3f1979aaSAndroid Build Coastguard Worker     output.dcos_blk = 0.0F;
660*3f1979aaSAndroid Build Coastguard Worker     output.dsin_blk = 0.0F;
661*3f1979aaSAndroid Build Coastguard Worker 
662*3f1979aaSAndroid Build Coastguard Worker     myphase = phase_start_rad;
663*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
664*3f1979aaSAndroid Build Coastguard Worker     {
665*3f1979aaSAndroid Build Coastguard Worker         output.phase_state_i[i] = cos(myphase);
666*3f1979aaSAndroid Build Coastguard Worker         output.phase_state_q[i] = sin(myphase);
667*3f1979aaSAndroid Build Coastguard Worker         myphase += output.phase_increment;
668*3f1979aaSAndroid Build Coastguard Worker         while(myphase>PI) myphase-=2*PI;
669*3f1979aaSAndroid Build Coastguard Worker         while(myphase<-PI) myphase+=2*PI;
670*3f1979aaSAndroid Build Coastguard Worker     }
671*3f1979aaSAndroid Build Coastguard Worker     return output;
672*3f1979aaSAndroid Build Coastguard Worker }
673*3f1979aaSAndroid Build Coastguard Worker 
674*3f1979aaSAndroid Build Coastguard Worker 
675*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_limited_unroll_B_sse_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_B_sse_data_t * d)676*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d)
677*3f1979aaSAndroid Build Coastguard Worker {
678*3f1979aaSAndroid Build Coastguard Worker     // "vals := starts := phase_state"
679*3f1979aaSAndroid Build Coastguard Worker     __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
680*3f1979aaSAndroid Build Coastguard Worker     __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
681*3f1979aaSAndroid Build Coastguard Worker     __m128 cos_vals = cos_starts;
682*3f1979aaSAndroid Build Coastguard Worker     __m128 sin_vals = sin_starts;
683*3f1979aaSAndroid Build Coastguard Worker     __m128 inp_re, inp_im;
684*3f1979aaSAndroid Build Coastguard Worker     __m128 product_re, product_im;
685*3f1979aaSAndroid Build Coastguard Worker     __m128 interl_prod_a, interl_prod_b;
686*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT p_trig_tab;
687*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT u = (__m128*)in_out;
688*3f1979aaSAndroid Build Coastguard Worker 
689*3f1979aaSAndroid Build Coastguard Worker     while (N_cplx)
690*3f1979aaSAndroid Build Coastguard Worker     {
691*3f1979aaSAndroid Build Coastguard Worker         const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
692*3f1979aaSAndroid Build Coastguard Worker         int B = NB;
693*3f1979aaSAndroid Build Coastguard Worker         p_trig_tab = (__m128*)( &d->dtrig[0] );
694*3f1979aaSAndroid Build Coastguard Worker         while (B)
695*3f1979aaSAndroid Build Coastguard Worker         {
696*3f1979aaSAndroid Build Coastguard Worker             // complex multiplication of 4 complex values from/to in_out[]
697*3f1979aaSAndroid Build Coastguard Worker             // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
698*3f1979aaSAndroid Build Coastguard Worker             // "out[] = inp[] * vals"
699*3f1979aaSAndroid Build Coastguard Worker             UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
700*3f1979aaSAndroid Build Coastguard Worker             product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
701*3f1979aaSAndroid Build Coastguard Worker             product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
702*3f1979aaSAndroid Build Coastguard Worker             INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
703*3f1979aaSAndroid Build Coastguard Worker             VSTORE(u, interl_prod_a);
704*3f1979aaSAndroid Build Coastguard Worker             VSTORE(u+1, interl_prod_b);
705*3f1979aaSAndroid Build Coastguard Worker             u += 2;
706*3f1979aaSAndroid Build Coastguard Worker             // calculate complex phasor for next iteration
707*3f1979aaSAndroid Build Coastguard Worker             // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
708*3f1979aaSAndroid Build Coastguard Worker             // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
709*3f1979aaSAndroid Build Coastguard Worker             // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
710*3f1979aaSAndroid Build Coastguard Worker             // "vals :=  d[] * starts"
711*3f1979aaSAndroid Build Coastguard Worker             product_re = VLOAD(p_trig_tab);
712*3f1979aaSAndroid Build Coastguard Worker             UNINTERLEAVE2(product_re, product_re, inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
713*3f1979aaSAndroid Build Coastguard Worker             cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
714*3f1979aaSAndroid Build Coastguard Worker             sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
715*3f1979aaSAndroid Build Coastguard Worker             ++p_trig_tab;
716*3f1979aaSAndroid Build Coastguard Worker             B -= 4;
717*3f1979aaSAndroid Build Coastguard Worker         }
718*3f1979aaSAndroid Build Coastguard Worker         N_cplx -= NB;
719*3f1979aaSAndroid Build Coastguard Worker         /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
720*3f1979aaSAndroid Build Coastguard Worker         /* re-use product_re[]/product_im[] for normalization */
721*3f1979aaSAndroid Build Coastguard Worker         // "starts := vals := vals / |vals|"
722*3f1979aaSAndroid Build Coastguard Worker         product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
723*3f1979aaSAndroid Build Coastguard Worker #if 0
724*3f1979aaSAndroid Build Coastguard Worker         // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
725*3f1979aaSAndroid Build Coastguard Worker         // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
726*3f1979aaSAndroid Build Coastguard Worker         product_im = _mm_rsqrt_ps(product_re);
727*3f1979aaSAndroid Build Coastguard Worker         cos_starts = cos_vals = VMUL(cos_vals, product_im);
728*3f1979aaSAndroid Build Coastguard Worker         sin_starts = sin_vals = VMUL(sin_vals, product_im);
729*3f1979aaSAndroid Build Coastguard Worker #else
730*3f1979aaSAndroid Build Coastguard Worker         // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
731*3f1979aaSAndroid Build Coastguard Worker         // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
732*3f1979aaSAndroid Build Coastguard Worker         product_im = _mm_sqrt_ps(product_re);
733*3f1979aaSAndroid Build Coastguard Worker         cos_starts = cos_vals = VDIV(cos_vals, product_im);
734*3f1979aaSAndroid Build Coastguard Worker         sin_starts = sin_vals = VDIV(sin_vals, product_im);
735*3f1979aaSAndroid Build Coastguard Worker #endif
736*3f1979aaSAndroid Build Coastguard Worker     }
737*3f1979aaSAndroid Build Coastguard Worker     // "phase_state := starts"
738*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &d->phase_state_i[0], cos_starts );
739*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &d->phase_state_q[0], sin_starts );
740*3f1979aaSAndroid Build Coastguard Worker }
741*3f1979aaSAndroid Build Coastguard Worker 
742*3f1979aaSAndroid Build Coastguard Worker 
743*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
744*3f1979aaSAndroid Build Coastguard Worker 
745*3f1979aaSAndroid Build Coastguard Worker 
746*3f1979aaSAndroid Build Coastguard Worker /**************/
747*3f1979aaSAndroid Build Coastguard Worker /*** ALGO H ***/
748*3f1979aaSAndroid Build Coastguard Worker /**************/
749*3f1979aaSAndroid Build Coastguard Worker 
shift_limited_unroll_C_sse_init(float relative_freq,float phase_start_rad)750*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad)
751*3f1979aaSAndroid Build Coastguard Worker {
752*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_C_sse_data_t output;
753*3f1979aaSAndroid Build Coastguard Worker     float myphase;
754*3f1979aaSAndroid Build Coastguard Worker 
755*3f1979aaSAndroid Build Coastguard Worker     output.phase_increment = 2*relative_freq*PI;
756*3f1979aaSAndroid Build Coastguard Worker 
757*3f1979aaSAndroid Build Coastguard Worker     myphase = 0.0F;
758*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < PF_SHIFT_LIMITED_UNROLL_SIZE + PF_SHIFT_LIMITED_SIMD_SZ; i += PF_SHIFT_LIMITED_SIMD_SZ)
759*3f1979aaSAndroid Build Coastguard Worker     {
760*3f1979aaSAndroid Build Coastguard Worker         for (int k = 0; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
761*3f1979aaSAndroid Build Coastguard Worker         {
762*3f1979aaSAndroid Build Coastguard Worker             myphase += output.phase_increment;
763*3f1979aaSAndroid Build Coastguard Worker             while(myphase>PI) myphase-=2*PI;
764*3f1979aaSAndroid Build Coastguard Worker             while(myphase<-PI) myphase+=2*PI;
765*3f1979aaSAndroid Build Coastguard Worker         }
766*3f1979aaSAndroid Build Coastguard Worker         output.dinterl_trig[2*i] = cos(myphase);
767*3f1979aaSAndroid Build Coastguard Worker         output.dinterl_trig[2*i+4] = sin(myphase);
768*3f1979aaSAndroid Build Coastguard Worker         for (int k = 1; k < PF_SHIFT_LIMITED_SIMD_SZ; k++)
769*3f1979aaSAndroid Build Coastguard Worker         {
770*3f1979aaSAndroid Build Coastguard Worker             output.dinterl_trig[2*i+k] = output.dinterl_trig[2*i];
771*3f1979aaSAndroid Build Coastguard Worker             output.dinterl_trig[2*i+k+4] = output.dinterl_trig[2*i+4];
772*3f1979aaSAndroid Build Coastguard Worker         }
773*3f1979aaSAndroid Build Coastguard Worker     }
774*3f1979aaSAndroid Build Coastguard Worker 
775*3f1979aaSAndroid Build Coastguard Worker     output.dcos_blk = 0.0F;
776*3f1979aaSAndroid Build Coastguard Worker     output.dsin_blk = 0.0F;
777*3f1979aaSAndroid Build Coastguard Worker 
778*3f1979aaSAndroid Build Coastguard Worker     myphase = phase_start_rad;
779*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < PF_SHIFT_LIMITED_SIMD_SZ; i++)
780*3f1979aaSAndroid Build Coastguard Worker     {
781*3f1979aaSAndroid Build Coastguard Worker         output.phase_state_i[i] = cos(myphase);
782*3f1979aaSAndroid Build Coastguard Worker         output.phase_state_q[i] = sin(myphase);
783*3f1979aaSAndroid Build Coastguard Worker         myphase += output.phase_increment;
784*3f1979aaSAndroid Build Coastguard Worker         while(myphase>PI) myphase-=2*PI;
785*3f1979aaSAndroid Build Coastguard Worker         while(myphase<-PI) myphase+=2*PI;
786*3f1979aaSAndroid Build Coastguard Worker     }
787*3f1979aaSAndroid Build Coastguard Worker     return output;
788*3f1979aaSAndroid Build Coastguard Worker }
789*3f1979aaSAndroid Build Coastguard Worker 
790*3f1979aaSAndroid Build Coastguard Worker 
791*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_limited_unroll_C_sse_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_C_sse_data_t * d)792*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d)
793*3f1979aaSAndroid Build Coastguard Worker {
794*3f1979aaSAndroid Build Coastguard Worker     // "vals := starts := phase_state"
795*3f1979aaSAndroid Build Coastguard Worker     __m128 cos_starts = VLOAD( &d->phase_state_i[0] );
796*3f1979aaSAndroid Build Coastguard Worker     __m128 sin_starts = VLOAD( &d->phase_state_q[0] );
797*3f1979aaSAndroid Build Coastguard Worker     __m128 cos_vals = cos_starts;
798*3f1979aaSAndroid Build Coastguard Worker     __m128 sin_vals = sin_starts;
799*3f1979aaSAndroid Build Coastguard Worker     __m128 inp_re, inp_im;
800*3f1979aaSAndroid Build Coastguard Worker     __m128 product_re, product_im;
801*3f1979aaSAndroid Build Coastguard Worker     __m128 interl_prod_a, interl_prod_b;
802*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT p_trig_tab;
803*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT u = (__m128*)in_out;
804*3f1979aaSAndroid Build Coastguard Worker 
805*3f1979aaSAndroid Build Coastguard Worker     while (N_cplx)
806*3f1979aaSAndroid Build Coastguard Worker     {
807*3f1979aaSAndroid Build Coastguard Worker         const int NB = (N_cplx >= PF_SHIFT_LIMITED_UNROLL_SIZE) ? PF_SHIFT_LIMITED_UNROLL_SIZE : N_cplx;
808*3f1979aaSAndroid Build Coastguard Worker         int B = NB;
809*3f1979aaSAndroid Build Coastguard Worker         p_trig_tab = (__m128*)( &d->dinterl_trig[0] );
810*3f1979aaSAndroid Build Coastguard Worker         while (B)
811*3f1979aaSAndroid Build Coastguard Worker         {
812*3f1979aaSAndroid Build Coastguard Worker             // complex multiplication of 4 complex values from/to in_out[]
813*3f1979aaSAndroid Build Coastguard Worker             // ==  u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]):
814*3f1979aaSAndroid Build Coastguard Worker             // "out[] = inp[] * vals"
815*3f1979aaSAndroid Build Coastguard Worker             UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
816*3f1979aaSAndroid Build Coastguard Worker             product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) );
817*3f1979aaSAndroid Build Coastguard Worker             product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) );
818*3f1979aaSAndroid Build Coastguard Worker             INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
819*3f1979aaSAndroid Build Coastguard Worker             VSTORE(u, interl_prod_a);
820*3f1979aaSAndroid Build Coastguard Worker             VSTORE(u+1, interl_prod_b);
821*3f1979aaSAndroid Build Coastguard Worker             u += 2;
822*3f1979aaSAndroid Build Coastguard Worker             // calculate complex phasor for next iteration
823*3f1979aaSAndroid Build Coastguard Worker             // cos_val = cos_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
824*3f1979aaSAndroid Build Coastguard Worker             // sin_val = sin_start * d->dcos[PF_SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[PF_SHIFT_LIMITED_SIMD_SZ*i+j];
825*3f1979aaSAndroid Build Coastguard Worker             // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-)
826*3f1979aaSAndroid Build Coastguard Worker             // "vals :=  d[] * starts"
827*3f1979aaSAndroid Build Coastguard Worker             inp_re = VLOAD(p_trig_tab);
828*3f1979aaSAndroid Build Coastguard Worker             inp_im = VLOAD(p_trig_tab+1);
829*3f1979aaSAndroid Build Coastguard Worker             cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) );
830*3f1979aaSAndroid Build Coastguard Worker             sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) );
831*3f1979aaSAndroid Build Coastguard Worker             p_trig_tab += 2;
832*3f1979aaSAndroid Build Coastguard Worker             B -= 4;
833*3f1979aaSAndroid Build Coastguard Worker         }
834*3f1979aaSAndroid Build Coastguard Worker         N_cplx -= NB;
835*3f1979aaSAndroid Build Coastguard Worker         /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */
836*3f1979aaSAndroid Build Coastguard Worker         /* re-use product_re[]/product_im[] for normalization */
837*3f1979aaSAndroid Build Coastguard Worker         // "starts := vals := vals / |vals|"
838*3f1979aaSAndroid Build Coastguard Worker         product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) );
839*3f1979aaSAndroid Build Coastguard Worker #if 0
840*3f1979aaSAndroid Build Coastguard Worker         // more spikes in spectrum! at PF_SHIFT_LIMITED_UNROLL_SIZE = 64
841*3f1979aaSAndroid Build Coastguard Worker         // higher spikes in spectrum at PF_SHIFT_LIMITED_UNROLL_SIZE = 16
842*3f1979aaSAndroid Build Coastguard Worker         product_im = _mm_rsqrt_ps(product_re);
843*3f1979aaSAndroid Build Coastguard Worker         cos_starts = cos_vals = VMUL(cos_vals, product_im);
844*3f1979aaSAndroid Build Coastguard Worker         sin_starts = sin_vals = VMUL(sin_vals, product_im);
845*3f1979aaSAndroid Build Coastguard Worker #else
846*3f1979aaSAndroid Build Coastguard Worker         // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 64 - but slower!
847*3f1979aaSAndroid Build Coastguard Worker         // spectrally comparable to shift_match_cc() with PF_SHIFT_LIMITED_UNROLL_SIZE = 128 - fast again
848*3f1979aaSAndroid Build Coastguard Worker         product_im = _mm_sqrt_ps(product_re);
849*3f1979aaSAndroid Build Coastguard Worker         cos_starts = cos_vals = VDIV(cos_vals, product_im);
850*3f1979aaSAndroid Build Coastguard Worker         sin_starts = sin_vals = VDIV(sin_vals, product_im);
851*3f1979aaSAndroid Build Coastguard Worker #endif
852*3f1979aaSAndroid Build Coastguard Worker     }
853*3f1979aaSAndroid Build Coastguard Worker     // "phase_state := starts"
854*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &d->phase_state_i[0], cos_starts );
855*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &d->phase_state_q[0], sin_starts );
856*3f1979aaSAndroid Build Coastguard Worker }
857*3f1979aaSAndroid Build Coastguard Worker 
858*3f1979aaSAndroid Build Coastguard Worker 
859*3f1979aaSAndroid Build Coastguard Worker #else
860*3f1979aaSAndroid Build Coastguard Worker 
861*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
862*3f1979aaSAndroid Build Coastguard Worker 
shift_limited_unroll_A_sse_init(float relative_freq,float phase_start_rad)863*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_A_sse_data_t shift_limited_unroll_A_sse_init(float relative_freq, float phase_start_rad) {
864*3f1979aaSAndroid Build Coastguard Worker     assert(0);
865*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_A_sse_data_t r;
866*3f1979aaSAndroid Build Coastguard Worker     return r;
867*3f1979aaSAndroid Build Coastguard Worker }
shift_limited_unroll_B_sse_init(float relative_freq,float phase_start_rad)868*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_B_sse_data_t shift_limited_unroll_B_sse_init(float relative_freq, float phase_start_rad) {
869*3f1979aaSAndroid Build Coastguard Worker     assert(0);
870*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_B_sse_data_t r;
871*3f1979aaSAndroid Build Coastguard Worker     return r;
872*3f1979aaSAndroid Build Coastguard Worker }
shift_limited_unroll_C_sse_init(float relative_freq,float phase_start_rad)873*3f1979aaSAndroid Build Coastguard Worker shift_limited_unroll_C_sse_data_t shift_limited_unroll_C_sse_init(float relative_freq, float phase_start_rad) {
874*3f1979aaSAndroid Build Coastguard Worker     assert(0);
875*3f1979aaSAndroid Build Coastguard Worker     shift_limited_unroll_C_sse_data_t r;
876*3f1979aaSAndroid Build Coastguard Worker     return r;
877*3f1979aaSAndroid Build Coastguard Worker }
878*3f1979aaSAndroid Build Coastguard Worker 
shift_limited_unroll_A_sse_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_A_sse_data_t * d)879*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_A_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_A_sse_data_t* d) {
880*3f1979aaSAndroid Build Coastguard Worker     assert(0);
881*3f1979aaSAndroid Build Coastguard Worker }
shift_limited_unroll_B_sse_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_B_sse_data_t * d)882*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_B_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_B_sse_data_t* d) {
883*3f1979aaSAndroid Build Coastguard Worker     assert(0);
884*3f1979aaSAndroid Build Coastguard Worker }
shift_limited_unroll_C_sse_inp_c(complexf * in_out,int N_cplx,shift_limited_unroll_C_sse_data_t * d)885*3f1979aaSAndroid Build Coastguard Worker void shift_limited_unroll_C_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_C_sse_data_t* d) {
886*3f1979aaSAndroid Build Coastguard Worker     assert(0);
887*3f1979aaSAndroid Build Coastguard Worker }
888*3f1979aaSAndroid Build Coastguard Worker 
889*3f1979aaSAndroid Build Coastguard Worker #endif
890*3f1979aaSAndroid Build Coastguard Worker 
891*3f1979aaSAndroid Build Coastguard Worker 
892*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
893*3f1979aaSAndroid Build Coastguard Worker 
894*3f1979aaSAndroid Build Coastguard Worker /**************/
895*3f1979aaSAndroid Build Coastguard Worker /*** ALGO I ***/
896*3f1979aaSAndroid Build Coastguard Worker /**************/
897*3f1979aaSAndroid Build Coastguard Worker 
shift_recursive_osc_update_rate(float rate,shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state)898*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state)
899*3f1979aaSAndroid Build Coastguard Worker {
900*3f1979aaSAndroid Build Coastguard Worker     // constants for single phase step
901*3f1979aaSAndroid Build Coastguard Worker     float phase_increment_s = rate*PI;
902*3f1979aaSAndroid Build Coastguard Worker     float k1 = tan(0.5*phase_increment_s);
903*3f1979aaSAndroid Build Coastguard Worker     float k2 = 2*k1 /(1 + k1 * k1);
904*3f1979aaSAndroid Build Coastguard Worker     for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SZ; j++)
905*3f1979aaSAndroid Build Coastguard Worker     {
906*3f1979aaSAndroid Build Coastguard Worker         float tmp;
907*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[j] = state->u_cos[j-1];
908*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[j] = state->v_sin[j-1];
909*3f1979aaSAndroid Build Coastguard Worker         // small steps
910*3f1979aaSAndroid Build Coastguard Worker         tmp = state->u_cos[j] - k1 * state->v_sin[j];
911*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[j] += k2 * tmp;
912*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[j] = tmp - k1 * state->v_sin[j];
913*3f1979aaSAndroid Build Coastguard Worker     }
914*3f1979aaSAndroid Build Coastguard Worker 
915*3f1979aaSAndroid Build Coastguard Worker     // constants for PF_SHIFT_RECURSIVE_SIMD_SZ times phase step
916*3f1979aaSAndroid Build Coastguard Worker     float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SZ;
917*3f1979aaSAndroid Build Coastguard Worker     while(phase_increment_b > PI) phase_increment_b-=2*PI;
918*3f1979aaSAndroid Build Coastguard Worker     while(phase_increment_b < -PI) phase_increment_b+=2*PI;
919*3f1979aaSAndroid Build Coastguard Worker     conf->k1 = tan(0.5*phase_increment_b);
920*3f1979aaSAndroid Build Coastguard Worker     conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
921*3f1979aaSAndroid Build Coastguard Worker }
922*3f1979aaSAndroid Build Coastguard Worker 
shift_recursive_osc_init(float rate,float starting_phase,shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state)923*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state)
924*3f1979aaSAndroid Build Coastguard Worker {
925*3f1979aaSAndroid Build Coastguard Worker     if (starting_phase != 0.0F)
926*3f1979aaSAndroid Build Coastguard Worker     {
927*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[0] = cos(starting_phase);
928*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[0] = sin(starting_phase);
929*3f1979aaSAndroid Build Coastguard Worker     }
930*3f1979aaSAndroid Build Coastguard Worker     else
931*3f1979aaSAndroid Build Coastguard Worker     {
932*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[0] = 1.0F;
933*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[0] = 0.0F;
934*3f1979aaSAndroid Build Coastguard Worker     }
935*3f1979aaSAndroid Build Coastguard Worker     shift_recursive_osc_update_rate(rate, conf, state);
936*3f1979aaSAndroid Build Coastguard Worker }
937*3f1979aaSAndroid Build Coastguard Worker 
938*3f1979aaSAndroid Build Coastguard Worker 
939*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_recursive_osc_cc(const complexf * input,complexf * output,int size,const shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state_ext)940*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_cc(const complexf *input, complexf* output,
941*3f1979aaSAndroid Build Coastguard Worker     int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
942*3f1979aaSAndroid Build Coastguard Worker {
943*3f1979aaSAndroid Build Coastguard Worker     float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
944*3f1979aaSAndroid Build Coastguard Worker     float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
945*3f1979aaSAndroid Build Coastguard Worker     float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
946*3f1979aaSAndroid Build Coastguard Worker     shift_recursive_osc_t state = *state_ext;
947*3f1979aaSAndroid Build Coastguard Worker     const float k1 = conf->k1;
948*3f1979aaSAndroid Build Coastguard Worker     const float k2 = conf->k2;
949*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_cc
950*3f1979aaSAndroid Build Coastguard Worker     {
951*3f1979aaSAndroid Build Coastguard Worker         //we multiply two complex numbers - similar to shift_math_cc
952*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
953*3f1979aaSAndroid Build Coastguard Worker         {
954*3f1979aaSAndroid Build Coastguard Worker             inp_i[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
955*3f1979aaSAndroid Build Coastguard Worker             inp_q[j] = input[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
956*3f1979aaSAndroid Build Coastguard Worker         }
957*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
958*3f1979aaSAndroid Build Coastguard Worker         {
959*3f1979aaSAndroid Build Coastguard Worker             iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
960*3f1979aaSAndroid Build Coastguard Worker             qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
961*3f1979aaSAndroid Build Coastguard Worker         }
962*3f1979aaSAndroid Build Coastguard Worker         // update complex phasor - like incrementing phase
963*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
964*3f1979aaSAndroid Build Coastguard Worker             tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
965*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
966*3f1979aaSAndroid Build Coastguard Worker             state.v_sin[j] += k2 * tmp[j];
967*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
968*3f1979aaSAndroid Build Coastguard Worker             state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
969*3f1979aaSAndroid Build Coastguard Worker     }
970*3f1979aaSAndroid Build Coastguard Worker     *state_ext = state;
971*3f1979aaSAndroid Build Coastguard Worker }
972*3f1979aaSAndroid Build Coastguard Worker 
973*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_recursive_osc_inp_c(complexf * in_out,int size,const shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state_ext)974*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_inp_c(complexf* in_out,
975*3f1979aaSAndroid Build Coastguard Worker     int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
976*3f1979aaSAndroid Build Coastguard Worker {
977*3f1979aaSAndroid Build Coastguard Worker     float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
978*3f1979aaSAndroid Build Coastguard Worker     float inp_i[PF_SHIFT_RECURSIVE_SIMD_SZ];
979*3f1979aaSAndroid Build Coastguard Worker     float inp_q[PF_SHIFT_RECURSIVE_SIMD_SZ];
980*3f1979aaSAndroid Build Coastguard Worker     shift_recursive_osc_t state = *state_ext;
981*3f1979aaSAndroid Build Coastguard Worker     const float k1 = conf->k1;
982*3f1979aaSAndroid Build Coastguard Worker     const float k2 = conf->k2;
983*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@shift_recursive_osc_inp_c
984*3f1979aaSAndroid Build Coastguard Worker     {
985*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
986*3f1979aaSAndroid Build Coastguard Worker         {
987*3f1979aaSAndroid Build Coastguard Worker             inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].i;
988*3f1979aaSAndroid Build Coastguard Worker             inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SZ*i+j].q;
989*3f1979aaSAndroid Build Coastguard Worker         }
990*3f1979aaSAndroid Build Coastguard Worker         //we multiply two complex numbers - similar to shift_math_cc
991*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
992*3f1979aaSAndroid Build Coastguard Worker         {
993*3f1979aaSAndroid Build Coastguard Worker             iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
994*3f1979aaSAndroid Build Coastguard Worker             qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
995*3f1979aaSAndroid Build Coastguard Worker         }
996*3f1979aaSAndroid Build Coastguard Worker         // update complex phasor - like incrementing phase
997*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
998*3f1979aaSAndroid Build Coastguard Worker             tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
999*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1000*3f1979aaSAndroid Build Coastguard Worker             state.v_sin[j] += k2 * tmp[j];
1001*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1002*3f1979aaSAndroid Build Coastguard Worker             state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1003*3f1979aaSAndroid Build Coastguard Worker     }
1004*3f1979aaSAndroid Build Coastguard Worker     *state_ext = state;
1005*3f1979aaSAndroid Build Coastguard Worker }
1006*3f1979aaSAndroid Build Coastguard Worker 
1007*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
gen_recursive_osc_c(complexf * output,int size,const shift_recursive_osc_conf_t * conf,shift_recursive_osc_t * state_ext)1008*3f1979aaSAndroid Build Coastguard Worker void gen_recursive_osc_c(complexf* output,
1009*3f1979aaSAndroid Build Coastguard Worker     int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext)
1010*3f1979aaSAndroid Build Coastguard Worker {
1011*3f1979aaSAndroid Build Coastguard Worker     float tmp[PF_SHIFT_RECURSIVE_SIMD_SZ];
1012*3f1979aaSAndroid Build Coastguard Worker     shift_recursive_osc_t state = *state_ext;
1013*3f1979aaSAndroid Build Coastguard Worker     const float k1 = conf->k1;
1014*3f1979aaSAndroid Build Coastguard Worker     const float k2 = conf->k2;
1015*3f1979aaSAndroid Build Coastguard Worker     for(int i=0;i<size/PF_SHIFT_RECURSIVE_SIMD_SZ; i++) //@gen_recursive_osc_c
1016*3f1979aaSAndroid Build Coastguard Worker     {
1017*3f1979aaSAndroid Build Coastguard Worker         // output complex oscillator value
1018*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1019*3f1979aaSAndroid Build Coastguard Worker         {
1020*3f1979aaSAndroid Build Coastguard Worker             iof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.u_cos[j];
1021*3f1979aaSAndroid Build Coastguard Worker             qof(output,PF_SHIFT_RECURSIVE_SIMD_SZ*i+j) = state.v_sin[j];
1022*3f1979aaSAndroid Build Coastguard Worker         }
1023*3f1979aaSAndroid Build Coastguard Worker         // update complex phasor - like incrementing phase
1024*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1025*3f1979aaSAndroid Build Coastguard Worker             tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
1026*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1027*3f1979aaSAndroid Build Coastguard Worker             state.v_sin[j] += k2 * tmp[j];
1028*3f1979aaSAndroid Build Coastguard Worker         for (int j=0;j<PF_SHIFT_RECURSIVE_SIMD_SZ;j++)
1029*3f1979aaSAndroid Build Coastguard Worker             state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1030*3f1979aaSAndroid Build Coastguard Worker     }
1031*3f1979aaSAndroid Build Coastguard Worker     *state_ext = state;
1032*3f1979aaSAndroid Build Coastguard Worker }
1033*3f1979aaSAndroid Build Coastguard Worker 
1034*3f1979aaSAndroid Build Coastguard Worker 
1035*3f1979aaSAndroid Build Coastguard Worker #ifdef HAVE_SSE_INTRINSICS
1036*3f1979aaSAndroid Build Coastguard Worker 
1037*3f1979aaSAndroid Build Coastguard Worker /*********************************************************************/
1038*3f1979aaSAndroid Build Coastguard Worker 
1039*3f1979aaSAndroid Build Coastguard Worker /**************/
1040*3f1979aaSAndroid Build Coastguard Worker /*** ALGO J ***/
1041*3f1979aaSAndroid Build Coastguard Worker /**************/
1042*3f1979aaSAndroid Build Coastguard Worker 
shift_recursive_osc_sse_update_rate(float rate,shift_recursive_osc_sse_conf_t * conf,shift_recursive_osc_sse_t * state)1043*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
1044*3f1979aaSAndroid Build Coastguard Worker {
1045*3f1979aaSAndroid Build Coastguard Worker     // constants for single phase step
1046*3f1979aaSAndroid Build Coastguard Worker     float phase_increment_s = rate*PI;
1047*3f1979aaSAndroid Build Coastguard Worker     float k1 = tan(0.5*phase_increment_s);
1048*3f1979aaSAndroid Build Coastguard Worker     float k2 = 2*k1 /(1 + k1 * k1);
1049*3f1979aaSAndroid Build Coastguard Worker     for (int j=1; j<PF_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++)
1050*3f1979aaSAndroid Build Coastguard Worker     {
1051*3f1979aaSAndroid Build Coastguard Worker         float tmp;
1052*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[j] = state->u_cos[j-1];
1053*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[j] = state->v_sin[j-1];
1054*3f1979aaSAndroid Build Coastguard Worker         // small steps
1055*3f1979aaSAndroid Build Coastguard Worker         tmp = state->u_cos[j] - k1 * state->v_sin[j];
1056*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[j] += k2 * tmp;
1057*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[j] = tmp - k1 * state->v_sin[j];
1058*3f1979aaSAndroid Build Coastguard Worker     }
1059*3f1979aaSAndroid Build Coastguard Worker 
1060*3f1979aaSAndroid Build Coastguard Worker     // constants for PF_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step
1061*3f1979aaSAndroid Build Coastguard Worker     float phase_increment_b = phase_increment_s * PF_SHIFT_RECURSIVE_SIMD_SSE_SZ;
1062*3f1979aaSAndroid Build Coastguard Worker     while(phase_increment_b > PI) phase_increment_b-=2*PI;
1063*3f1979aaSAndroid Build Coastguard Worker     while(phase_increment_b < -PI) phase_increment_b+=2*PI;
1064*3f1979aaSAndroid Build Coastguard Worker     conf->k1 = tan(0.5*phase_increment_b);
1065*3f1979aaSAndroid Build Coastguard Worker     conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1);
1066*3f1979aaSAndroid Build Coastguard Worker }
1067*3f1979aaSAndroid Build Coastguard Worker 
1068*3f1979aaSAndroid Build Coastguard Worker 
shift_recursive_osc_sse_init(float rate,float starting_phase,shift_recursive_osc_sse_conf_t * conf,shift_recursive_osc_sse_t * state)1069*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
1070*3f1979aaSAndroid Build Coastguard Worker {
1071*3f1979aaSAndroid Build Coastguard Worker     if (starting_phase != 0.0F)
1072*3f1979aaSAndroid Build Coastguard Worker     {
1073*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[0] = cos(starting_phase);
1074*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[0] = sin(starting_phase);
1075*3f1979aaSAndroid Build Coastguard Worker     }
1076*3f1979aaSAndroid Build Coastguard Worker     else
1077*3f1979aaSAndroid Build Coastguard Worker     {
1078*3f1979aaSAndroid Build Coastguard Worker         state->u_cos[0] = 1.0F;
1079*3f1979aaSAndroid Build Coastguard Worker         state->v_sin[0] = 0.0F;
1080*3f1979aaSAndroid Build Coastguard Worker     }
1081*3f1979aaSAndroid Build Coastguard Worker     shift_recursive_osc_sse_update_rate(rate, conf, state);
1082*3f1979aaSAndroid Build Coastguard Worker }
1083*3f1979aaSAndroid Build Coastguard Worker 
1084*3f1979aaSAndroid Build Coastguard Worker 
1085*3f1979aaSAndroid Build Coastguard Worker PF_TARGET_CLONES
shift_recursive_osc_sse_inp_c(complexf * in_out,int N_cplx,const shift_recursive_osc_sse_conf_t * conf,shift_recursive_osc_sse_t * state_ext)1086*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_sse_inp_c(complexf* in_out,
1087*3f1979aaSAndroid Build Coastguard Worker     int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
1088*3f1979aaSAndroid Build Coastguard Worker {
1089*3f1979aaSAndroid Build Coastguard Worker     const __m128 k1 = LD_PS1( conf->k1 );
1090*3f1979aaSAndroid Build Coastguard Worker     const __m128 k2 = LD_PS1( conf->k2 );
1091*3f1979aaSAndroid Build Coastguard Worker     __m128 u_cos = VLOAD( &state_ext->u_cos[0] );
1092*3f1979aaSAndroid Build Coastguard Worker     __m128 v_sin = VLOAD( &state_ext->v_sin[0] );
1093*3f1979aaSAndroid Build Coastguard Worker     __m128 inp_re, inp_im;
1094*3f1979aaSAndroid Build Coastguard Worker     __m128 product_re, product_im;
1095*3f1979aaSAndroid Build Coastguard Worker     __m128 interl_prod_a, interl_prod_b;
1096*3f1979aaSAndroid Build Coastguard Worker     __m128 * RESTRICT u = (__m128*)in_out;
1097*3f1979aaSAndroid Build Coastguard Worker 
1098*3f1979aaSAndroid Build Coastguard Worker     while (N_cplx)
1099*3f1979aaSAndroid Build Coastguard Worker     {
1100*3f1979aaSAndroid Build Coastguard Worker         //inp_i[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i;
1101*3f1979aaSAndroid Build Coastguard Worker         //inp_q[j] = in_out[PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q;
1102*3f1979aaSAndroid Build Coastguard Worker         UNINTERLEAVE2(VLOAD(u), VLOAD(u+1), inp_re, inp_im);  /* inp_re = all reals; inp_im = all imags */
1103*3f1979aaSAndroid Build Coastguard Worker 
1104*3f1979aaSAndroid Build Coastguard Worker         //we multiply two complex numbers - similar to shift_math_cc
1105*3f1979aaSAndroid Build Coastguard Worker         //iof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j];
1106*3f1979aaSAndroid Build Coastguard Worker         //qof(in_out,PF_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j];
1107*3f1979aaSAndroid Build Coastguard Worker         product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) );
1108*3f1979aaSAndroid Build Coastguard Worker         product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) );
1109*3f1979aaSAndroid Build Coastguard Worker         INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b);
1110*3f1979aaSAndroid Build Coastguard Worker         VSTORE(u, interl_prod_a);
1111*3f1979aaSAndroid Build Coastguard Worker         VSTORE(u+1, interl_prod_b);
1112*3f1979aaSAndroid Build Coastguard Worker         u += 2;
1113*3f1979aaSAndroid Build Coastguard Worker 
1114*3f1979aaSAndroid Build Coastguard Worker         // update complex phasor - like incrementing phase
1115*3f1979aaSAndroid Build Coastguard Worker         // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j];
1116*3f1979aaSAndroid Build Coastguard Worker         product_re = VSUB( u_cos, VMUL(k1, v_sin) );
1117*3f1979aaSAndroid Build Coastguard Worker         // state.v_sin[j] += k2 * tmp[j];
1118*3f1979aaSAndroid Build Coastguard Worker         v_sin = VADD( v_sin, VMUL(k2, product_re) );
1119*3f1979aaSAndroid Build Coastguard Worker         // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j];
1120*3f1979aaSAndroid Build Coastguard Worker         u_cos = VSUB( product_re, VMUL(k1, v_sin) );
1121*3f1979aaSAndroid Build Coastguard Worker 
1122*3f1979aaSAndroid Build Coastguard Worker         N_cplx -= 4;
1123*3f1979aaSAndroid Build Coastguard Worker     }
1124*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &state_ext->u_cos[0], u_cos );
1125*3f1979aaSAndroid Build Coastguard Worker     VSTORE( &state_ext->v_sin[0], v_sin );
1126*3f1979aaSAndroid Build Coastguard Worker }
1127*3f1979aaSAndroid Build Coastguard Worker 
1128*3f1979aaSAndroid Build Coastguard Worker #else
1129*3f1979aaSAndroid Build Coastguard Worker 
/*
 * Placeholder for builds without HAVE_SSE_INTRINSICS. Calling it is a usage
 * error and trips assert(0); with assert() compiled out (NDEBUG) the call
 * returns without writing conf or state.
 */
void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state)
{
    (void)rate;
    (void)conf;
    (void)state;
    assert(0);
}
1134*3f1979aaSAndroid Build Coastguard Worker 
/*
 * Placeholder for builds without HAVE_SSE_INTRINSICS. Calling it is a usage
 * error and trips assert(0); with assert() compiled out (NDEBUG) the call
 * returns without writing conf or state.
 */
void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state)
{
    (void)rate;
    (void)starting_phase;
    (void)conf;
    (void)state;
    assert(0);
}
1139*3f1979aaSAndroid Build Coastguard Worker 
1140*3f1979aaSAndroid Build Coastguard Worker 
shift_recursive_osc_sse_inp_c(complexf * in_out,int N_cplx,const shift_recursive_osc_sse_conf_t * conf,shift_recursive_osc_sse_t * state_ext)1141*3f1979aaSAndroid Build Coastguard Worker void shift_recursive_osc_sse_inp_c(complexf* in_out,
1142*3f1979aaSAndroid Build Coastguard Worker     int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext)
1143*3f1979aaSAndroid Build Coastguard Worker {
1144*3f1979aaSAndroid Build Coastguard Worker     assert(0);
1145*3f1979aaSAndroid Build Coastguard Worker }
1146*3f1979aaSAndroid Build Coastguard Worker 
1147*3f1979aaSAndroid Build Coastguard Worker #endif
1148*3f1979aaSAndroid Build Coastguard Worker 
1149