/* Copyright (c) 2013  Julien Pommier ( [email protected] )
   Copyright (c) 2020  Hayati Ayguen ( [email protected] )
   Copyright (c) 2020  Dario Mambro ( [email protected] )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
*/

/* this file requires architecture specific preprocessor definitions
 * it's only for library internal use
 */

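/* Illustrative sketch only (not provided by this file): the architecture
 * specific headers are expected to define a small SIMD abstraction along
 * these lines; a scalar fallback would roughly look like
 *
 *   typedef float v4sf;          (one element per "vector")
 *   #define SIMD_SZ   1          (floats per v4sf)
 *   #define VZERO()   0.f
 *   #define VADD(a,b) ((a) + (b))
 *   #define VSUB(a,b) ((a) - (b))
 *   #define VMUL(a,b) ((a) * (b))
 *   #define LD_PS1(s) (s)        (broadcast a scalar)
 *
 * plus VMADD, SVMUL, VCPLXMUL / VCPLXMULCONJ (complex multiplies),
 * VTRANSPOSE4, INTERLEAVE2 / UNINTERLEAVE2, VSWAPHL, VALIGNED and the
 * RESTRICT / NEVER_INLINE / ALWAYS_INLINE annotations used below.
 * The real (SSE/NEON/...) definitions live in the library's SIMD headers.
 */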

/* define own constants required to turn off g++ extensions .. */
#ifndef M_PI
  #define M_PI    3.14159265358979323846  /* pi */
#endif

#ifndef M_SQRT2
  #define M_SQRT2 1.41421356237309504880  /* sqrt(2) */
#endif


int FUNC_SIMD_SIZE() { return SIMD_SZ; }

const char * FUNC_SIMD_ARCH() { return VARCH; }


/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
  int k, i;
  int l1ido = l1*ido;
  if (ido <= 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
      ch[0]         = VADD(cc[0], cc[ido+0]);
      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
      ch[1]         = VADD(cc[1], cc[ido+1]);
      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      for (i=0; i<ido-1; i+=2) {
        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
        VCPLXMUL(tr2, ti2, wr, wi);
        ch[i+l1ido]   = tr2;
        ch[i+l1ido+1] = ti2;
      }
    }
  }
}

/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, float fsign) {
  static const float taur = -0.5f;
  float taui = 0.866025403784439f*fsign;
  int i, k;
  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
  int l1ido = l1*ido;
  float wr1, wi1, wr2, wi2;
  assert(ido > 2);
  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
    for (i=0; i<ido-1; i+=2) {
      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
      cr2 = VADD(cc[i], SVMUL(taur,tr2));
      ch[i]    = VADD(cc[i], tr2);
      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
      ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
      ch[i+1]  = VADD(cc[i+1], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch[i+l1ido] = dr2;
      ch[i+l1ido + 1] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch[i+2*l1ido] = dr3;
      ch[i+2*l1ido+1] = di3;
    }
  }
} /* passf3 */

static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
  /* isign == -1 for forward transform and +1 for backward transform */

  int i, k;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  int l1ido = l1*ido;
  if (ido == 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
      tr1 = VSUB(cc[0], cc[2*ido + 0]);
      tr2 = VADD(cc[0], cc[2*ido + 0]);
      ti1 = VSUB(cc[1], cc[2*ido + 1]);
      ti2 = VADD(cc[1], cc[2*ido + 1]);
      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);

      ch[0*l1ido + 0] = VADD(tr2, tr3);
      ch[0*l1ido + 1] = VADD(ti2, ti3);
      ch[1*l1ido + 0] = VADD(tr1, tr4);
      ch[1*l1ido + 1] = VADD(ti1, ti4);
      ch[2*l1ido + 0] = VSUB(tr2, tr3);
      ch[2*l1ido + 1] = VSUB(ti2, ti3);
      ch[3*l1ido + 0] = VSUB(tr1, tr4);
      ch[3*l1ido + 1] = VSUB(ti1, ti4);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
      for (i=0; i<ido-1; i+=2) {
        float wr1, wi1, wr2, wi2, wr3, wi3;
        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);

        ch[i] = VADD(tr2, tr3);
        cr3    = VSUB(tr2, tr3);
        ch[i + 1] = VADD(ti2, ti3);
        ci3 = VSUB(ti2, ti3);

        cr2 = VADD(tr1, tr4);
        cr4 = VSUB(tr1, tr4);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        wr1=wa1[i], wi1=fsign*wa1[i+1];
        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
        wr2=wa2[i], wi2=fsign*wa2[i+1];
        ch[i + l1ido] = cr2;
        ch[i + l1ido + 1] = ci2;

        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
        wr3=wa3[i], wi3=fsign*wa3[i+1];
        ch[i + 2*l1ido] = cr3;
        ch[i + 2*l1ido + 1] = ci3;

        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
        ch[i + 3*l1ido] = cr4;
        ch[i + 3*l1ido + 1] = ci4;
      }
    }
  }
} /* passf4 */

/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2,
                                    const float *wa3, const float *wa4, float fsign) {
  static const float tr11 = .309016994374947f;
  const float ti11 = .951056516295154f*fsign;
  static const float tr12 = -.809016994374947f;
  const float ti12 = .587785252292473f*fsign;

  /* Local variables */
  int i, k;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;

  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;

#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]

  assert(ido > 2);
  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
    for (i = 0; i < ido-1; i += 2) {
      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch_ref(i - 1, 2) = dr2;
      ch_ref(i, 2)     = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch_ref(i - 1, 3) = dr3;
      ch_ref(i, 3)     = di3;
      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
      ch_ref(i - 1, 4) = dr4;
      ch_ref(i, 4)     = di4;
      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
      ch_ref(i - 1, 5) = dr5;
      ch_ref(i, 5)     = di5;
    }
  }
#undef ch_ref
#undef cc_ref
}

static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
  static const float minus_one = -1.f;
  int i, k, l1ido = l1*ido;
  for (k=0; k < l1ido; k += ido) {
    v4sf a = cc[k], b = cc[k + l1ido];
    ch[2*k] = VADD(a, b);
    ch[2*(k+ido)-1] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k=0; k < l1ido; k += ido) {
      for (i=2; i<ido; i+=2) {
        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
        v4sf br = cc[i - 1 + k], bi = cc[i + k];
        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i + 2*k] = VADD(bi, ti2);
        ch[2*(k+ido) - i] = VSUB(ti2, bi);
        ch[i - 1 + 2*k] = VADD(br, tr2);
        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k += ido) {
    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
    ch[2*k + ido-1] = cc[k + ido-1];
  }
} /* radf2 */


static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
  static const float minus_two=-2;
  int i, k, l1ido = l1*ido;
  v4sf a,b,c,d, tr2, ti2;
  for (k=0; k < l1ido; k += ido) {
    a = cc[2*k]; b = cc[2*(k+ido) - 1];
    ch[k] = VADD(a, b);
    ch[k + l1ido] =VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      for (i = 2; i < ido; i += 2) {
        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
        ch[i-1 + k] = VADD(a, b);
        tr2 = VSUB(a, b);
        ch[i+0 + k] = VSUB(c, d);
        ti2 = VADD(c, d);
        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i-1 + k + l1ido] = tr2;
        ch[i+0 + k + l1ido] = ti2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k = 0; k < l1ido; k += ido) {
    a = cc[2*k + ido-1]; b = cc[2*k + ido];
    ch[k + ido-1] = VADD(a,a);
    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
  }
} /* radb2 */

static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2) {
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  int i, k, ic;
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
  for (k=0; k<l1; k++) {
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
      VCPLXMULCONJ(dr2, di2, wr1, wi1);

      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
      VCPLXMULCONJ(dr3, di3, wr2, wi2);

      cr2 = VADD(dr2, dr3);
      ci2 = VADD(di2, di3);
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
      tr3 = SVMUL(taui, VSUB(di2, di3));
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
    }
  }
} /* radf3 */


static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  static const float taui_2 = 0.866025403784439f*2;
  int i, k, ic;
  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
  for (k=0; k<l1; k++) {
    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
    ch[k*ido] = VADD(cc[3*k*ido], tr2);
    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
    ch[(k + l1)*ido] = VSUB(cr2, ci3);
    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
      ch[i - 1 + (k + l1)*ido] = dr2;
      ch[i + (k + l1)*ido] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
      ch[i - 1 + (k + 2*l1)*ido] = dr3;
      ch[i + (k + 2*l1)*ido] = di3;
    }
  }
} /* radb3 */

static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
{
  static const float minus_hsqt2 = (float)-0.7071067811865475;
  int i, k, l1ido = l1*ido;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
    v4sf * RESTRICT ch_ = ch;
    while (cc < cc_end) {
      /* this loop represents between 25% and 40% of total radf4_ps cost ! */
      v4sf a0 = cc[0], a1 = cc[l1ido];
      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
      v4sf tr1 = VADD(a1, a3);
      v4sf tr2 = VADD(a0, a2);
      ch[2*ido-1] = VSUB(a0, a2);
      ch[2*ido  ] = VSUB(a3, a1);
      ch[0      ] = VADD(tr1, tr2);
      ch[4*ido-1] = VSUB(tr2, tr1);
      cc += ido; ch += 4*ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
      for (i=2; i<ido; i += 2, pc += 2) {
        int ic = ido - i;
        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;

        cr2 = pc[1*l1ido+0];
        ci2 = pc[1*l1ido+1];
        wr=LD_PS1(wa1[i - 2]);
        wi=LD_PS1(wa1[i - 1]);
        VCPLXMULCONJ(cr2,ci2,wr,wi);

        cr3 = pc[2*l1ido+0];
        ci3 = pc[2*l1ido+1];
        wr = LD_PS1(wa2[i-2]);
        wi = LD_PS1(wa2[i-1]);
        VCPLXMULCONJ(cr3, ci3, wr, wi);

        cr4 = pc[3*l1ido];
        ci4 = pc[3*l1ido+1];
        wr = LD_PS1(wa3[i-2]);
        wi = LD_PS1(wa3[i-1]);
        VCPLXMULCONJ(cr4, ci4, wr, wi);

        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

        tr1 = VADD(cr2,cr4);
        tr4 = VSUB(cr4,cr2);
        tr2 = VADD(pc[0],cr3);
        tr3 = VSUB(pc[0],cr3);
        ch[i - 1 + 4*k] = VADD(tr1,tr2);
        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); /* at this point tr1 and tr2 can be disposed */
        ti1 = VADD(ci2,ci4);
        ti4 = VSUB(ci2,ci4);
        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); /* dispose tr3, ti4 */
        ti2 = VADD(pc[1],ci3);
        ti3 = VSUB(pc[1],ci3);
        ch[i + 4*k] = VADD(ti1, ti2);
        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k<l1ido; k += ido) {
    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
    ch[ido-1 + 4*k] = VADD(tr1, c);
    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
    ch[4*k + 1*ido] = VSUB(ti1, d);
    ch[4*k + 3*ido] = VADD(ti1, d);
  }
} /* radf4 */


static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{
  static const float minus_sqrt2 = (float)-1.414213562373095;
  static const float two = 2.f;
  int i, k, l1ido = l1*ido;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
    v4sf *ch_ = ch;
    while (ch < ch_end) {
      v4sf a = cc[0], b = cc[4*ido-1];
      v4sf c = cc[2*ido], d = cc[2*ido-1];
      tr3 = SVMUL(two,d);
      tr2 = VADD(a,b);
      tr1 = VSUB(a,b);
      tr4 = SVMUL(two,c);
      ch[0*l1ido] = VADD(tr2, tr3);
      ch[2*l1ido] = VSUB(tr2, tr3);
      ch[1*l1ido] = VSUB(tr1, tr4);
      ch[3*l1ido] = VADD(tr1, tr4);

      cc += 4*ido; ch += ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
      for (i = 2; i < ido; i += 2) {

        tr1 = VSUB(pc[i], pc[4*ido - i]);
        tr2 = VADD(pc[i], pc[4*ido - i]);
        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
        ph[0] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);

        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        cr2 = VSUB(tr1, tr4);
        cr4 = VADD(tr1, tr4);

        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);

        ph[1] = VADD(ti2, ti3); ph += l1ido;
        ci3 = VSUB(ti2, ti3);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
        ph[0] = cr2;
        ph[1] = ci2; ph += l1ido;
        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
        ph[0] = cr3;
        ph[1] = ci3; ph += l1ido;
        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
        ph[0] = cr4;
        ph[1] = ci4; ph = ph - 3*l1ido + 2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k+=ido) {
    int i0 = 4*k + ido;
    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
    tr1 = VSUB(c,d);
    tr2 = VADD(c,d);
    ti1 = VADD(b,a);
    ti2 = VSUB(b,a);
    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
  }
} /* radb4 */

static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* System generated locals */
  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
  int idp2;


#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * 6;
  ch -= ch_offset;
  cc_offset = 1 + ido * (1 + l1);
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
    /* printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); */
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
      cr2 = VADD(dr2, dr5);
      ci5 = VSUB(dr5, dr2);
      cr5 = VSUB(di2, di5);
      ci2 = VADD(di2, di5);
      cr3 = VADD(dr3, dr4);
      ci4 = VSUB(dr4, dr3);
      cr4 = VSUB(di3, di4);
      ci3 = VADD(di3, di4);
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
      ch_ref(i, 3, k) = VADD(ti2, ti5);
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
      ch_ref(i, 5, k) = VADD(ti3, ti4);
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
    }
  }
#undef cc_ref
#undef ch_ref
} /* radf5 */

static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
  int idp2;

#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * (1 + l1);
  ch -= ch_offset;
  cc_offset = 1 + ido * 6;
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
    ch_ref(1, k, 4) = VADD(cr3, ci4);
    ch_ref(1, k, 5) = VADD(cr2, ci5);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));

      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
    }
  }
#undef cc_ref
#undef ch_ref
} /* radb5 */

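/*
  rfftf1_ps / rfftb1_ps below (and cfftf1_ps further down) drive one complete
  transform: they walk the factor list stored in ifac, call the matching
  radix kernel for each factor, and ping-pong between work1 and work2.
  The returned pointer is whichever of the two buffers ends up holding the
  result, which is why callers compare it against their own buffers.
*/
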
static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l2 = n;
  int iw = n-1;
  assert(in != out && work1 != work2);
  for (k1 = 1; k1 <= nf; ++k1) {
    int kh = nf - k1;
    int ip = ifac[kh + 2];
    int l1 = l2 / ip;
    int ido = n / l2;
    iw -= (ip - 1)*ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l2 = l1;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
} /* rfftf1 */

static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out);
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1 + 1];
    int l2 = ip*l1;
    int ido = n / l2;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l1 = l2;
    iw += (ip - 1)*ido;

    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
}

static int decompose(int n, int *ifac, const int *ntryh) {
  int nl = n, nf = 0, i, j = 0;
  for (j=0; ntryh[j]; ++j) {
    int ntry = ntryh[j];
    while (nl != 1) {
      int nq = nl / ntry;
      int nr = nl - ntry * nq;
      if (nr == 0) {
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
          for (i = 2; i <= nf; ++i) {
            int ib = nf - i + 2;
            ifac[ib + 1] = ifac[ib];
          }
          ifac[2] = 2;
        }
      } else break;
    }
  }
  ifac[0] = n;
  ifac[1] = nf;
  return nf;
}
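
/*
  decompose() fills ifac in FFTPACK style: ifac[0] = n, ifac[1] = number of
  factors, ifac[2..] = the factors themselves, with a factor of 2 (when one
  remains) moved to the front of the list. For example, with the table
  { 4,2,3,5,0 } used by rffti1_ps, n = 24 decomposes as 2 * 4 * 3, i.e.
  ifac = { 24, 3, 2, 4, 3, ... }.
*/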



static void rffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 4,2,3,5,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*(float)M_PI) / n;
  int is = 0;
  int nfm1 = nf - 1;
  int l1 = 1;
  for (k1 = 1; k1 <= nfm1; k1++) {
    int ip = ifac[k1 + 1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int ipm = ip - 1;
    for (j = 1; j <= ipm; ++j) {
      float argld;
      int i = is, fi=0;
      ld += l1;
      argld = ld*argh;
      for (ii = 3; ii <= ido; ii += 2) {
        i += 2;
        fi += 1;
        wa[i - 2] = FUNC_COS(fi*argld);
        wa[i - 1] = FUNC_SIN(fi*argld);
      }
      is += ido;
    }
    l1 = l2;
  }
} /* rffti1 */

static void cffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 5,3,4,2,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*(float)M_PI) / n;
  int i = 1;
  int l1 = 1;
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1+1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int idot = ido + ido + 2;
    int ipm = ip - 1;
    for (j=1; j<=ipm; j++) {
      float argld;
      int i1 = i, fi = 0;
      wa[i-1] = 1;
      wa[i] = 0;
      ld += l1;
      argld = ld*argh;
      for (ii = 4; ii <= idot; ii += 2) {
        i += 2;
        fi += 1;
        wa[i-1] = FUNC_COS(fi*argld);
        wa[i] = FUNC_SIN(fi*argld);
      }
      if (ip > 5) {
        wa[i1-1] = wa[i-1];
        wa[i1] = wa[i];
      }
    }
    l1 = l2;
  }
} /* cffti1 */


static v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out && work1 != work2);
  for (k1=2; k1<=nf+1; k1++) {
    int ip = ifac[k1];
    int l2 = ip*l1;
    int ido = n / l2;
    int idot = ido + ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
      } break;
      case 4: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
      } break;
      case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
      } break;
      case 3: {
        int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
      } break;
      default:
        assert(0);
    }
    l1 = l2;
    iw += (ip - 1)*idot;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }

  return in; /* this is in fact the output .. */
}


struct SETUP_STRUCT {
  int     N;
  int     Ncvec;  /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
  int ifac[15];
  pffft_transform_t transform;
  v4sf *data;     /* allocated room for twiddle coefs */
  float *e;       /* points into 'data', N/4*3 elements */
  float *twiddle; /* points into 'data', N/4 elements */
};

SETUP_STRUCT *FUNC_NEW_SETUP(int N, pffft_transform_t transform) {
  SETUP_STRUCT *s = (SETUP_STRUCT*)malloc(sizeof(SETUP_STRUCT));
  int k, m;
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
  /* assert((N % 32) == 0); */
  s->N = N;
  s->transform = transform;
  /* nb of complex simd vectors */
  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)FUNC_ALIGNED_MALLOC(2*s->Ncvec * sizeof(v4sf));
  s->e = (float*)s->data;
  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);

  if (transform == PFFFT_REAL) {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*(float)M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = FUNC_COS(A);
        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = FUNC_SIN(A);
      }
    }
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*(float)M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = FUNC_COS(A);
        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = FUNC_SIN(A);
      }
    }
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }

  /* check that N is decomposable with allowed prime factors */
  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
  if (m != N/SIMD_SZ) {
    FUNC_DESTROY(s); s = 0;
  }

  return s;
}


void FUNC_DESTROY(SETUP_STRUCT *s) {
  FUNC_ALIGNED_FREE(s->data);
  free(s);
}
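
/*
  Illustrative only (not part of the library): a typical sequence over the
  entry points defined in this file, assuming SIMD-aligned float buffers and
  a size that satisfies the asserts in FUNC_NEW_SETUP:

    SETUP_STRUCT *s = FUNC_NEW_SETUP(1024, PFFFT_REAL);
    if (s) {
      FUNC_TRANSFORM_INTERNAL(s, input, output, 0, PFFFT_FORWARD, 1);
      FUNC_DESTROY(s);
    }

  Passing scratch == 0 makes FUNC_TRANSFORM_INTERNAL use a stack buffer, and
  ordered == 1 requests canonically ordered output (see FUNC_ZREORDER).
*/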

#if ( SIMD_SZ == 4 )    /* !defined(PFFFT_SIMD_DISABLE) */

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
  v4sf g0, g1;
  int k;
  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;

  *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */
  for (k=1; k < N; ++k) {
    v4sf h0, h1;
    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
    *--out = VSWAPHL(g1, h0);
    *--out = VSWAPHL(h0, h1);
    g1 = h1;
  }
  *--out = VSWAPHL(g1, g0);
}

static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
  v4sf g0, g1, h0, h1;
  int k;
  g0 = g1 = in[0]; ++in;
  for (k=1; k < N; ++k) {
    h0 = *in++; h1 = *in++;
    g1 = VSWAPHL(g1, h0);
    h0 = VSWAPHL(h0, h1);
    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
    g1 = h1;
  }
  h0 = *in++; h1 = g0;
  g1 = VSWAPHL(g1, h0);
  h0 = VSWAPHL(h0, h1);
  UNINTERLEAVE2(h0, g1, out[0], out[1]);
}

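/*
  FUNC_ZREORDER below converts between the internal (SIMD-interleaved)
  frequency-domain layout produced by the unordered transform and the
  canonical order: PFFFT_FORWARD goes internal -> canonical, PFFFT_BACKWARD
  the other way. in and out must not alias (see the assert).
*/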
void FUNC_ZREORDER(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N, Ncvec = setup->Ncvec;
  const v4sf *vin = (const v4sf*)in;
  v4sf *vout = (v4sf*)out;
  assert(in != out);
  if (setup->transform == PFFFT_REAL) {
    int k, dk = N/32;
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < dk; ++k) {
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
      }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
    } else {
      for (k=0; k < dk; ++k) {
        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
      }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
    }
  } else {
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
      }
    } else {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
      }
    }
  }
}

void FUNC_CPLX_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];
    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);
    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    /*
      transformation for each column is:

      [1   1   1   1   0   0   0   0]   [r0]
      [1   0  -1   0   0  -1   0   1]   [r1]
      [1  -1   1  -1   0   0   0   0]   [r2]
      [1   0  -1   0   0   1   0  -1]   [r3]
      [0   0   0   0   1   1   1   1] * [i0]
      [0   1   0  -1   1   0  -1   0]   [i1]
      [0   0   0   0   1  -1   1  -1]   [i2]
      [0  -1   0   1   1   0  -1   0]   [i3]
    */

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}

void FUNC_CPLX_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);

    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);

    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}


static ALWAYS_INLINE(void) FUNC_REAL_FINALIZE_4X4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                            const v4sf *e, v4sf *out) {
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  r0 = *in0; i0 = *in1;
  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0  -1   0   0  -1   0   1]   [r1]
    [1   0  -1   0   0   1   0  -1]   [r2]
    [1  -1   1  -1   0   0   0   0]   [r3]
    [0   0   0   0   1   1   1   1] * [i0]
    [0  -1   0   1  -1   0   1   0]   [i1]
    [0  -1   0   1   1   0  -1   0]   [i2]
    [0   0   0   0  -1   1  -1   1]   [i3]
  */

  /* cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
  /* cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */

  VCPLXMUL(r1,i1,e[0],e[1]);
  VCPLXMUL(r2,i2,e[2],e[3]);
  VCPLXMUL(r3,i3,e[4],e[5]);

  /* cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
  /* cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */

  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);

  r0 = VADD(sr0, sr1);
  r3 = VSUB(sr0, sr1);
  i0 = VADD(si0, si1);
  i3 = VSUB(si1, si0);
  r1 = VADD(dr0, di1);
  r2 = VSUB(dr0, di1);
  i1 = VSUB(dr1, di0);
  i2 = VADD(dr1, di0);

  *out++ = r0;
  *out++ = i0;
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;

}

static NEVER_INLINE(void) FUNC_REAL_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union cr, ci, *uout = (v4sf_union*)out;
  v4sf save = in[7], zero=VZERO();
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
  static const float s = (float)M_SQRT2/2;

  cr.v = in[0]; ci.v = in[Ncvec*2-1];
  assert(in != out);
  FUNC_REAL_FINALIZE_4X4(&zero, &zero, in+1, e, out);

  /*
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

    [Xr(1)]  ] [1   1   1   1   0   0   0   0]
    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
  */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    FUNC_REAL_FINALIZE_4X4(&save, &in[8*k+0], in + 8*k+1,
                           e + k*6, out + k*8);
    save = save_next;
  }

}

static ALWAYS_INLINE(void) FUNC_REAL_PREPROCESS_4X4(const v4sf *in,
                                             const v4sf *e, v4sf *out, int first) {
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0   0  -1   0  -1  -1   0]   [r1]
    [1  -1  -1   1   0   0   0   0]   [r2]
    [1   0   0  -1   0   1   1   0]   [r3]
    [0   0   0   0   1  -1   1  -1] * [i0]
    [0  -1   1   0   1   0   0   1]   [i1]
    [0   0   0   0   1   1  -1  -1]   [i2]
    [0   1  -1   0   1   0   0   1]   [i3]
  */

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

static NEVER_INLINE(void) FUNC_REAL_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = (float)M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  FUNC_REAL_PREPROCESS_4X4(in, e, out+1, 1); /* will write only 6 values */

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1   0   2   0   1   0   0   0]
    [cr1] [1   0   0   0  -1   0  -2   0]
    [cr2] [1   0  -2   0   1   0   0   0]
    [cr3] [1   0   0   0  -1   0   2   0]
    [ci0] [0   2   0   2   0   0   0   0]
    [ci1] [0   s   0  -s   0  -s   0  -s]
    [ci2] [0   0   0   0   0  -2   0   2]
    [ci3] [0  -s   0   s   0  -s   0  -s]
  */
  for (k=1; k < dk; ++k) {
    FUNC_REAL_PREPROCESS_4X4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
}


FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT * setup,const float * finput,float * foutput,v4sf * scratch,pffft_direction_t direction,int ordered)1416 void FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT *setup, const float *finput, float *foutput, v4sf *scratch,
1417                              pffft_direction_t direction, int ordered) {
1418   int k, Ncvec   = setup->Ncvec;
1419   int nf_odd = (setup->ifac[1] & 1);
1420 
1421   /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
1422   int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1423   VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
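  /* note: when a scratch buffer is supplied, the VLA gets a dummy size of 1
     so that the declaration stays valid while the stack copy goes unused */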
1424 
1425   const v4sf *vinput = (const v4sf*)finput;
1426   v4sf *voutput      = (v4sf*)foutput;
1427   v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
1428   int ib = (nf_odd ^ ordered ? 1 : 0);
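  /* ib picks which ping-pong buffer the final result is expected to land in:
     every fftpack pass swaps buffers, so the choice depends on the parity of
     the factor count (nf_odd), and the optional reorder pass swaps once more */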
1429 
1430   assert(VALIGNED(finput) && VALIGNED(foutput));
1431 
1432   /* assert(finput != foutput); */
1433   if (direction == PFFFT_FORWARD) {
1434     ib = !ib;
1435     if (setup->transform == PFFFT_REAL) {
1436       ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1437                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1438       FUNC_REAL_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1439     } else {
1440       v4sf *tmp = buff[ib];
1441       for (k=0; k < Ncvec; ++k) {
1442         UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1443       }
1444       ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
1445                       setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1446       FUNC_CPLX_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1447     }
1448     if (ordered) {
1449       FUNC_ZREORDER(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
1450     } else ib = !ib;
1451   } else {
1452     if (vinput == buff[ib]) {
1453       ib = !ib; /* may happen when finput == foutput */
1454     }
1455     if (ordered) {
1456       FUNC_ZREORDER(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
1457       vinput = buff[ib]; ib = !ib;
1458     }
1459     if (setup->transform == PFFFT_REAL) {
1460       FUNC_REAL_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1461       ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
1462                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1463     } else {
1464       FUNC_CPLX_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1465       ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
1466                       setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1467       for (k=0; k < Ncvec; ++k) {
1468         INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1469       }
1470     }
1471   }
1472 
1473   if (buff[ib] != voutput) {
1474     /* extra copy required -- this situation should only happen when finput == foutput */
1475     assert(finput==foutput);
1476     for (k=0; k < Ncvec; ++k) {
1477       v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1478       voutput[2*k] = a; voutput[2*k+1] = b;
1479     }
1480     ib = !ib;
1481   }
1482   assert(buff[ib] == voutput);
1483 }
1484 
1485 void FUNC_ZCONVOLVE_ACCUMULATE(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) {
1486   int Ncvec = s->Ncvec;
1487   const v4sf * RESTRICT va = (const v4sf*)a;
1488   const v4sf * RESTRICT vb = (const v4sf*)b;
1489   v4sf * RESTRICT vab = (v4sf*)ab;
1490 
1491 #ifdef __arm__
1492   __builtin_prefetch(va);
1493   __builtin_prefetch(vb);
1494   __builtin_prefetch(vab);
1495   __builtin_prefetch(va+2);
1496   __builtin_prefetch(vb+2);
1497   __builtin_prefetch(vab+2);
1498   __builtin_prefetch(va+4);
1499   __builtin_prefetch(vb+4);
1500   __builtin_prefetch(vab+4);
1501   __builtin_prefetch(va+6);
1502   __builtin_prefetch(vb+6);
1503   __builtin_prefetch(vab+6);
1504 # ifndef __clang__
1505 #   define ZCONVOLVE_USING_INLINE_NEON_ASM
1506 # endif
1507 #endif
1508 
1509   float ar, ai, br, bi, abr, abi;
1510 #ifndef ZCONVOLVE_USING_INLINE_NEON_ASM
1511   v4sf vscal = LD_PS1(scaling);
1512   int i;
1513 #endif
1514 
1515   assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1516   ar = ((v4sf_union*)va)[0].f[0];
1517   ai = ((v4sf_union*)va)[1].f[0];
1518   br = ((v4sf_union*)vb)[0].f[0];
1519   bi = ((v4sf_union*)vb)[1].f[0];
1520   abr = ((v4sf_union*)vab)[0].f[0];
1521   abi = ((v4sf_union*)vab)[1].f[0];
1522 
1523 #ifdef ZCONVOLVE_USING_INLINE_NEON_ASM
1524   /* inline asm version -- unfortunately miscompiled by clang 3.2
1525    * (at least on Ubuntu), so it is restricted to gcc */
1526   const float *a_ = a, *b_ = b; float *ab_ = ab;
1527   int N = Ncvec;
1528   asm volatile("mov         r8, %2                  \n"
1529                "vdup.f32    q15, %4                 \n"
1530                "1:                                  \n"
1531                "pld         [%0,#64]                \n"
1532                "pld         [%1,#64]                \n"
1533                "pld         [%2,#64]                \n"
1534                "pld         [%0,#96]                \n"
1535                "pld         [%1,#96]                \n"
1536                "pld         [%2,#96]                \n"
1537                "vld1.f32    {q0,q1},   [%0,:128]!         \n"
1538                "vld1.f32    {q4,q5},   [%1,:128]!         \n"
1539                "vld1.f32    {q2,q3},   [%0,:128]!         \n"
1540                "vld1.f32    {q6,q7},   [%1,:128]!         \n"
1541                "vld1.f32    {q8,q9},   [r8,:128]!          \n"
1542 
1543                "vmul.f32    q10, q0, q4             \n"
1544                "vmul.f32    q11, q0, q5             \n"
1545                "vmul.f32    q12, q2, q6             \n"
1546                "vmul.f32    q13, q2, q7             \n"
1547                "vmls.f32    q10, q1, q5             \n"
1548                "vmla.f32    q11, q1, q4             \n"
1549                "vld1.f32    {q0,q1}, [r8,:128]!     \n"
1550                "vmls.f32    q12, q3, q7             \n"
1551                "vmla.f32    q13, q3, q6             \n"
1552                "vmla.f32    q8, q10, q15            \n"
1553                "vmla.f32    q9, q11, q15            \n"
1554                "vmla.f32    q0, q12, q15            \n"
1555                "vmla.f32    q1, q13, q15            \n"
1556                "vst1.f32    {q8,q9},[%2,:128]!    \n"
1557                "vst1.f32    {q0,q1},[%2,:128]!    \n"
1558                "subs        %3, #2                  \n"
1559                "bne         1b                      \n"
1560                : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
1561 #else
1562   /* default routine, works fine for non-arm cpus with current compilers */
1563   for (i=0; i < Ncvec; i += 2) {
1564     v4sf ar, ai, br, bi;
1565     ar = va[2*i+0]; ai = va[2*i+1];
1566     br = vb[2*i+0]; bi = vb[2*i+1];
1567     VCPLXMUL(ar, ai, br, bi);
1568     vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1569     vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1570     ar = va[2*i+2]; ai = va[2*i+3];
1571     br = vb[2*i+2]; bi = vb[2*i+3];
1572     VCPLXMUL(ar, ai, br, bi);
1573     vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1574     vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1575   }
1576 #endif
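  /* for real transforms, the first float of the first two SIMD vectors holds
     the purely real DC and Nyquist bins (pffft's internal layout); the complex
     multiply above treated them as one complex pair, so restore them here from
     the scalar values saved before the loop */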
1577   if (s->transform == PFFFT_REAL) {
1578     ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1579     ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1580   }
1581 }
1582 
1583 void FUNC_ZCONVOLVE_NO_ACCU(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) {
1584   v4sf vscal = LD_PS1(scaling);
1585   const v4sf * RESTRICT va = (const v4sf*)a;
1586   const v4sf * RESTRICT vb = (const v4sf*)b;
1587   v4sf * RESTRICT vab = (v4sf*)ab;
1588   float sar, sai, sbr, sbi;
1589   const int NcvecMulTwo = 2*s->Ncvec;  /* was: int Ncvec = s->Ncvec; */
1590   int k; /* was "i", but it was always used as "2*i" except in the for() header */
1591 
1592 #ifdef __arm__
1593   __builtin_prefetch(va);
1594   __builtin_prefetch(vb);
1595   __builtin_prefetch(vab);
1596   __builtin_prefetch(va+2);
1597   __builtin_prefetch(vb+2);
1598   __builtin_prefetch(vab+2);
1599   __builtin_prefetch(va+4);
1600   __builtin_prefetch(vb+4);
1601   __builtin_prefetch(vab+4);
1602   __builtin_prefetch(va+6);
1603   __builtin_prefetch(vb+6);
1604   __builtin_prefetch(vab+6);
1605 # ifndef __clang__
1606 #   define ZCONVOLVE_USING_INLINE_NEON_ASM
1607 # endif
1608 #endif
1609 
1610   assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
1611   sar = ((v4sf_union*)va)[0].f[0];
1612   sai = ((v4sf_union*)va)[1].f[0];
1613   sbr = ((v4sf_union*)vb)[0].f[0];
1614   sbi = ((v4sf_union*)vb)[1].f[0];
1615 
1616   /* default routine, works fine for non-arm cpus with current compilers */
1617   for (k=0; k < NcvecMulTwo; k += 4) {
1618     v4sf var, vai, vbr, vbi;
1619     var = va[k+0]; vai = va[k+1];
1620     vbr = vb[k+0]; vbi = vb[k+1];
1621     VCPLXMUL(var, vai, vbr, vbi);
1622     vab[k+0] = VMUL(var, vscal);
1623     vab[k+1] = VMUL(vai, vscal);
1624     var = va[k+2]; vai = va[k+3];
1625     vbr = vb[k+2]; vbi = vb[k+3];
1626     VCPLXMUL(var, vai, vbr, vbi);
1627     vab[k+2] = VMUL(var, vscal);
1628     vab[k+3] = VMUL(vai, vscal);
1629   }
1630 
1631   if (s->transform == PFFFT_REAL) {
1632     ((v4sf_union*)vab)[0].f[0] = sar*sbr*scaling;
1633     ((v4sf_union*)vab)[1].f[0] = sai*sbi*scaling;
1634   }
1635 }
1636 
1637 
1638 #else  /* #if ( SIMD_SZ == 4 ) && !defined(PFFFT_SIMD_DISABLE) */
1639 
1640 /* standard routine using scalar floats, without SIMD stuff. */
1641 
1642 #define pffft_zreorder_nosimd FUNC_ZREORDER
1643 void pffft_zreorder_nosimd(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) {
1644   int k, N = setup->N;
1645   if (setup->transform == PFFFT_COMPLEX) {
1646     for (k=0; k < 2*N; ++k) out[k] = in[k];
1647     return;
1648   }
1649   else if (direction == PFFFT_FORWARD) {
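    /* fftpack packs the real spectrum as [F0.r, F1.r, F1.i, ..., F(N/2).r];
       the ordered layout moves the Nyquist term from slot N-1 to slot 1 and
       shifts the remaining bins up by one */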
1650     float x_N = in[N-1];
1651     for (k=N-1; k > 1; --k) out[k] = in[k-1];
1652     out[0] = in[0];
1653     out[1] = x_N;
1654   } else {
1655     float x_N = in[1];
1656     for (k=1; k < N-1; ++k) out[k] = in[k+1];
1657     out[0] = in[0];
1658     out[N-1] = x_N;
1659   }
1660 }
1661 
1662 #define pffft_transform_internal_nosimd FUNC_TRANSFORM_INTERNAL
1663 void pffft_transform_internal_nosimd(SETUP_STRUCT *setup, const float *input, float *output, float *scratch,
1664                                     pffft_direction_t direction, int ordered) {
1665   int Ncvec   = setup->Ncvec;
1666   int nf_odd = (setup->ifac[1] & 1);
1667 
1668   /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
1669   int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1670   VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1671   float *buff[2];
1672   int ib;
1673   if (scratch == 0) scratch = scratch_on_stack;
1674   buff[0] = output; buff[1] = scratch;
1675 
1676   if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
1677   ib = (nf_odd ^ ordered ? 1 : 0);
1678 
1679   if (direction == PFFFT_FORWARD) {
1680     if (setup->transform == PFFFT_REAL) {
1681       ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1682                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1683     } else {
1684       ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1685                       setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1686     }
1687     if (ordered) {
1688       FUNC_ZREORDER(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
1689     }
1690   } else {
1691     if (input == buff[ib]) {
1692       ib = !ib; /* may happen when input == output */
1693     }
1694     if (ordered) {
1695       FUNC_ZREORDER(setup, input, buff[!ib], PFFFT_BACKWARD);
1696       input = buff[!ib];
1697     }
1698     if (setup->transform == PFFFT_REAL) {
1699       ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1700                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1701     } else {
1702       ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1703                       setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1704     }
1705   }
1706   if (buff[ib] != output) {
1707     int k;
1708     /* extra copy required -- this situation should only happen when input == output */
1709     assert(input==output);
1710     for (k=0; k < Ncvec; ++k) {
1711       float a = buff[ib][2*k], b = buff[ib][2*k+1];
1712       output[2*k] = a; output[2*k+1] = b;
1713     }
1714     ib = !ib;
1715   }
1716   assert(buff[ib] == output);
1717 }
1718 
1719 #define pffft_zconvolve_accumulate_nosimd FUNC_ZCONVOLVE_ACCUMULATE
1720 void pffft_zconvolve_accumulate_nosimd(SETUP_STRUCT *s, const float *a, const float *b,
1721                                        float *ab, float scaling) {
1722   int NcvecMulTwo = 2*s->Ncvec;  /* was: int Ncvec = s->Ncvec; */
1723   int k; /* was "i", but it was always used as "2*i" except in the for() header */
1724 
1725   if (s->transform == PFFFT_REAL) {
1726     /* take care of the fftpack ordering */
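    /* the DC term sits at index 0 and the Nyquist term at index 2*Ncvec-1 in
       fftpack real ordering; both are purely real, so they are handled as
       plain scalar products before the complex loop below */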
1727     ab[0] += a[0]*b[0]*scaling;
1728     ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling;
1729     ++ab; ++a; ++b; NcvecMulTwo -= 2;
1730   }
1731   for (k=0; k < NcvecMulTwo; k += 2) {
1732     float ar, ai, br, bi;
1733     ar = a[k+0]; ai = a[k+1];
1734     br = b[k+0]; bi = b[k+1];
1735     VCPLXMUL(ar, ai, br, bi);
1736     ab[k+0] += ar*scaling;
1737     ab[k+1] += ai*scaling;
1738   }
1739 }
1740 
1741 #define pffft_zconvolve_no_accu_nosimd FUNC_ZCONVOLVE_NO_ACCU
1742 void pffft_zconvolve_no_accu_nosimd(SETUP_STRUCT *s, const float *a, const float *b,
1743                                     float *ab, float scaling) {
1744   int NcvecMulTwo = 2*s->Ncvec;  /* was: int Ncvec = s->Ncvec; */
1745   int k; /* was "i", but it was always used as "2*i" except in the for() header */
1746 
1747   if (s->transform == PFFFT_REAL) {
1748     /* take care of the fftpack ordering */
1749     ab[0] = a[0]*b[0]*scaling;
1750     ab[NcvecMulTwo-1] = a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling;
1751     ++ab; ++a; ++b; NcvecMulTwo -= 2;
1752   }
1753   for (k=0; k < NcvecMulTwo; k += 2) {
1754     float ar, ai, br, bi;
1755     ar = a[k+0]; ai = a[k+1];
1756     br = b[k+0]; bi = b[k+1];
1757     VCPLXMUL(ar, ai, br, bi);
1758     ab[k+0] = ar*scaling;
1759     ab[k+1] = ai*scaling;
1760   }
1761 }
1762 
1763 
1764 #endif /* #if ( SIMD_SZ == 4 ) && !defined(PFFFT_SIMD_DISABLE) */
1765 
1766 
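/* the two wrappers below back the public transform entry points: the
   unordered variant keeps the frequency-domain data in the internal SIMD
   layout (sufficient for zreorder/zconvolve), while the ordered variant also
   reorders into the canonical interleaved order described in pffft.h */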
1767 void FUNC_TRANSFORM_UNORDRD(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1768   FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 0);
1769 }
1770 
1771 void FUNC_TRANSFORM_ORDERED(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
1772   FUNC_TRANSFORM_INTERNAL(setup, input, output, (v4sf*)work, direction, 1);
1773 }
1774 
1775 
1776 #if ( SIMD_SZ == 4 )
1777 
1778 #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
1779 
1780 /* detect bugs with the vector support macros */
1781 void FUNC_VALIDATE_SIMD_A() {
1782   float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
1783   v4sf_union a0, a1, a2, a3, t, u;
1784   memcpy(a0.f, f, 4*sizeof(float));
1785   memcpy(a1.f, f+4, 4*sizeof(float));
1786   memcpy(a2.f, f+8, 4*sizeof(float));
1787   memcpy(a3.f, f+12, 4*sizeof(float));
1788 
1789   t = a0; u = a1; t.v = VZERO();
1790   printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
1791   t.v = VADD(a1.v, a2.v);
1792   printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
1793   t.v = VMUL(a1.v, a2.v);
1794   printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
1795   t.v = VMADD(a1.v, a2.v,a0.v);
1796   printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
1797 
1798   INTERLEAVE2(a1.v,a2.v,t.v,u.v);
1799   printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
1800   assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
1801   UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
1802   printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
1803   assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
1804 
1805   t.v=LD_PS1(f[15]);
1806   printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
1807   assertv4(t, 15, 15, 15, 15);
1808   t.v = VSWAPHL(a1.v, a2.v);
1809   printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
1810   assertv4(t, 8, 9, 6, 7);
1811   VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
1812   printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
1813          a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
1814          a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
1815   assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
1816 }
1817 
1818 
1819 static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo )
1820 {
1821   if ( !( fabsf( result - ref ) < 0.01F ) )
1822   {
1823     fprintf(stderr, "%s: assert for %s at %s(%d)\n  expected %f  value %f\n", functxt, vartxt, f, lineNo, ref, result);
1824     ++(*numErrs);
1825   }
1826 }
1827 
1828 static void pffft_assert4(  vsfscalar v0, vsfscalar v1, vsfscalar v2, vsfscalar v3,
1829   float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo )
1830 {
1831   pffft_assert1( v0, a, "[0]", functxt, numErrs, f, lineNo );
1832   pffft_assert1( v1, b, "[1]", functxt, numErrs, f, lineNo );
1833   pffft_assert1( v2, c, "[2]", functxt, numErrs, f, lineNo );
1834   pffft_assert1( v3, d, "[3]", functxt, numErrs, f, lineNo );
1835 }
1836 
1837 #define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT )  pffft_assert4( (V).f[0], (V).f[1], (V).f[2], (V).f[3], a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ )
1838 
1839 
1840 int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
1841 {
1842   int numErrs = 0;
1843 
1844   {
1845     v4sf_union C;
1846     int k;
1847     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1848 
1849     if (DbgOut) {
1850       fprintf(DbgOut, "\ninput: { }\n" );
1851     }
1852     C.v = VZERO();
1853     if (DbgOut) {
1854       fprintf(DbgOut, "C = VZERO() => {\n" );
1855       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1856       fprintf(DbgOut, "}\n" );
1857     }
1858     PFFFT_ASSERT4( C, 0.0F, 0.0F, 0.0F, 0.0F, "VZERO() Out C" );
1859   }
1860 
1861   {
1862     v4sf_union C;
1863     float a = 42.0F;
1864     int k;
1865     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1866 
1867     if (DbgOut) {
1868       fprintf(DbgOut, "\ninput: a = {\n" );
1869       fprintf(DbgOut, "  Inp a:  %f\n", a );
1870       fprintf(DbgOut, "}\n" );
1871     }
1872     C.v = LD_PS1(a);
1873     if (DbgOut) {
1874       fprintf(DbgOut, "C = LD_PS1(a) => {\n" );
1875       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1876       fprintf(DbgOut, "}\n" );
1877     }
1878     PFFFT_ASSERT4( C, 42.0F, 42.0F, 42.0F, 42.0F, "LD_PS1() Out C" );
1879   }
1880 
1881   {
1882     v4sf_union C;
1883     float a[16];
1884     int numAligned = 0, numUnaligned = 0;
1885     int k;
1886     const char * pUn;
1887     for ( k = 0; k < 16; ++k ) a[k] = k+1;
1888 
1889     for ( k = 0; k + 3 < 16; ++k )
1890     {
1891       const float * ptr = &a[k];
1892       if (DbgOut)
1893         fprintf(DbgOut, "\ninput: a = [ %f, %f, %f, %f ]\n", ptr[0], ptr[1], ptr[2], ptr[3] );
1894       if ( VALIGNED(ptr) )
1895       {
1896         C.v = VLOAD_ALIGNED( ptr );
1897         pUn = "";
1898         ++numAligned;
1899       }
1900       else
1901       {
1902         C.v = VLOAD_UNALIGNED( ptr );
1903         pUn = "UN";
1904         ++numUnaligned;
1905       }
1906       if (DbgOut) {
1907         fprintf(DbgOut, "C = VLOAD_%sALIGNED(&a[%d]) => {\n", pUn, k );
1908         fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1909         fprintf(DbgOut, "}\n" );
1910       }
1911       //PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" );
1912 
1913       if ( numAligned >= 1 && numUnaligned >= 4 )
1914         break;
1915     }
1916     if ( numAligned < 1 ) {
1917       fprintf(stderr, "VALIGNED() should have found at least 1 occurrence!\n");
1918       ++numErrs;
1919     }
1920     if ( numUnaligned < 4 ) {
1921       fprintf(stderr, "!VALIGNED() should have found at least 4 occurrences!\n");
1922       ++numErrs;
1923     }
1924   }
1925 
1926   {
1927     v4sf_union A, B, C;
1928     int k;
1929     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
1930     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
1931     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1932 
1933     if (DbgOut) {
1934       fprintf(DbgOut, "\ninput: A,B = {\n" );
1935       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
1936       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
1937       fprintf(DbgOut, "}\n" );
1938     }
1939     C.v = VADD(A.v, B.v);
1940     if (DbgOut) {
1941       fprintf(DbgOut, "C = VADD(A,B) => {\n" );
1942       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1943       fprintf(DbgOut, "}\n" );
1944     }
1945     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VADD(): Inp A" );
1946     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VADD(): Inp B" );
1947     PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" );
1948   }
1949 
1950   {
1951     v4sf_union A, B, C;
1952     int k;
1953     for ( k = 0; k < 4; ++k )  A.f[k] = 20 + 2*k+1;
1954     for ( k = 0; k < 4; ++k )  B.f[k] = 10 + k+1;
1955     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1956 
1957     if (DbgOut) {
1958       fprintf(DbgOut, "\ninput: A,B = {\n" );
1959       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
1960       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
1961       fprintf(DbgOut, "}\n" );
1962     }
1963     C.v = VSUB(A.v, B.v);
1964     if (DbgOut) {
1965       fprintf(DbgOut, "C = VSUB(A,B) => {\n" );
1966       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1967       fprintf(DbgOut, "}\n" );
1968     }
1969     PFFFT_ASSERT4( A, 21.0F, 23.0F, 25.0F, 27.0F, "VSUB(): Inp A" );
1970     PFFFT_ASSERT4( B, 11.0F, 12.0F, 13.0F, 14.0F, "VSUB(): Inp B" );
1971     PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VSUB(): Out C" );
1972   }
1973 
1974   {
1975     v4sf_union A, B, C;
1976     int k;
1977     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
1978     for ( k = 0; k < 4; ++k )  B.f[k] = k+1;
1979     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1980 
1981     if (DbgOut) {
1982       fprintf(DbgOut, "\ninput: A,B = {\n" );
1983       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
1984       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
1985       fprintf(DbgOut, "}\n" );
1986     }
1987     C.v = VMUL(A.v, B.v);
1988     if (DbgOut) {
1989       fprintf(DbgOut, "C = VMUL(A,B) => {\n" );
1990       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1991       fprintf(DbgOut, "}\n" );
1992     }
1993     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMUL(): Inp A" );
1994     PFFFT_ASSERT4( B,  1.0F,  2.0F,  3.0F,  4.0F, "VMUL(): Inp B" );
1995     PFFFT_ASSERT4( C, 11.0F, 24.0F, 39.0F, 56.0F, "VMUL(): Out C" );
1996   }
1997 
1998   {
1999     v4sf_union A, B, C, D;
2000     int k;
2001     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2002     for ( k = 0; k < 4; ++k )  B.f[k] = k+1;
2003     for ( k = 0; k < 4; ++k )  C.f[k] = 10 + k;
2004     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2005 
2006     if (DbgOut) {
2007       fprintf(DbgOut, "\ninput: A,B,C = {\n" );
2008       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2009       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2010       fprintf(DbgOut, "  Inp C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2011       fprintf(DbgOut, "}\n" );
2012     }
2013     D.v = VMADD(A.v, B.v, C.v);
2014     if (DbgOut) {
2015       fprintf(DbgOut, "D = VMADD(A,B,C) => {\n" );
2016       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2017       fprintf(DbgOut, "}\n" );
2018     }
2019     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMADD(): Inp A" );
2020     PFFFT_ASSERT4( B,  1.0F,  2.0F,  3.0F,  4.0F, "VMADD(): Inp B" );
2021     PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VMADD(): Inp C" );
2022     PFFFT_ASSERT4( D, 21.0F, 35.0F, 51.0F, 69.0F, "VMADD(): Out D" );
2023   }
2024 
2025   {
2026     v4sf_union A, B, C, D;
2027     int k;
2028     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2029     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2030     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2031     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2032 
2033     if (DbgOut) {
2034       fprintf(DbgOut, "\ninput: A,B = {\n" );
2035       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2036       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2037       fprintf(DbgOut, "}\n" );
2038     }
2039     INTERLEAVE2(A.v, B.v, C.v, D.v);
2040     if (DbgOut) {
2041       fprintf(DbgOut, "INTERLEAVE2(A,B, => C,D) => {\n" );
2042       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2043       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2044       fprintf(DbgOut, "}\n" );
2045     }
2046     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "INTERLEAVE2() Inp A" );
2047     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "INTERLEAVE2() Inp B" );
2048     PFFFT_ASSERT4( C, 11.0F, 21.0F, 12.0F, 22.0F, "INTERLEAVE2() Out C" );
2049     PFFFT_ASSERT4( D, 13.0F, 23.0F, 14.0F, 24.0F, "INTERLEAVE2() Out D" );
2050   }
2051 
2052   {
2053     v4sf_union A, B, C, D;
2054     int k;
2055     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2056     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2057     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2058     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2059 
2060     if (DbgOut) {
2061       fprintf(DbgOut, "\ninput: A,B = {\n" );
2062       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2063       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2064       fprintf(DbgOut, "}\n" );
2065     }
2066     UNINTERLEAVE2(A.v, B.v, C.v, D.v);
2067     if (DbgOut) {
2068       fprintf(DbgOut, "UNINTERLEAVE2(A,B, => C,D) => {\n" );
2069       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2070       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2071       fprintf(DbgOut, "}\n" );
2072     }
2073     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "UNINTERLEAVE2() Inp A" );
2074     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "UNINTERLEAVE2() Inp B" );
2075     PFFFT_ASSERT4( C, 11.0F, 13.0F, 21.0F, 23.0F, "UNINTERLEAVE2() Out C" );
2076     PFFFT_ASSERT4( D, 12.0F, 14.0F, 22.0F, 24.0F, "UNINTERLEAVE2() Out D" );
2077   }
2078 
2079   {
2080     v4sf_union A, B, C, D;
2081     int k;
2082     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2083     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2084     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2085     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2086 
2087     if (DbgOut) {
2088       fprintf(DbgOut, "\ninput: A,B,C,D = {\n" );
2089       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2090       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2091       fprintf(DbgOut, "  Inp C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2092       fprintf(DbgOut, "  Inp D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2093       fprintf(DbgOut, "}\n" );
2094     }
2095     VTRANSPOSE4(A.v, B.v, C.v, D.v);
2096     if (DbgOut) {
2097       fprintf(DbgOut, "VTRANSPOSE4(A,B,C,D) => {\n" );
2098       fprintf(DbgOut, "  Out A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2099       fprintf(DbgOut, "  Out B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2100       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2101       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2102       fprintf(DbgOut, "}\n" );
2103     }
2104     PFFFT_ASSERT4( A, 11.0F, 21.0F, 31.0F, 41.0F, "VTRANSPOSE4(): Out A" );
2105     PFFFT_ASSERT4( B, 12.0F, 22.0F, 32.0F, 42.0F, "VTRANSPOSE4(): Out B" );
2106     PFFFT_ASSERT4( C, 13.0F, 23.0F, 33.0F, 43.0F, "VTRANSPOSE4(): Out C" );
2107     PFFFT_ASSERT4( D, 14.0F, 24.0F, 34.0F, 44.0F, "VTRANSPOSE4(): Out D" );
2108   }
2109 
2110   {
2111     v4sf_union A, B, C;
2112     int k;
2113     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2114     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2115     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2116 
2117     if (DbgOut) {
2118       fprintf(DbgOut, "\ninput: A,B = {\n" );
2119       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2120       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2121       fprintf(DbgOut, "}\n" );
2122     }
2123     C.v = VSWAPHL(A.v, B.v);
2124     if (DbgOut) {
2125       fprintf(DbgOut, "C = VSWAPHL(A,B) => {\n" );
2126       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2127       fprintf(DbgOut, "}\n" );
2128     }
2129     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VSWAPHL(): Inp A" );
2130     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VSWAPHL(): Inp B" );
2131     PFFFT_ASSERT4( C, 21.0F, 22.0F, 13.0F, 14.0F, "VSWAPHL(): Out C" );
2132   }
2133 
2134   {
2135     v4sf_union A, C;
2136     int k;
2137     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2138     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2139 
2140     if (DbgOut) {
2141       fprintf(DbgOut, "\ninput: A = {\n" );
2142       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2143       fprintf(DbgOut, "}\n" );
2144     }
2145     C.v = VREV_S(A.v);
2146     if (DbgOut) {
2147       fprintf(DbgOut, "C = VREV_S(A) => {\n" );
2148       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2149       fprintf(DbgOut, "}\n" );
2150     }
2151     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_S(): Inp A" );
2152     PFFFT_ASSERT4( C, 14.0F, 13.0F, 12.0F, 11.0F, "VREV_S(): Out C" );
2153   }
2154 
2155   {
2156     v4sf_union A, C;
2157     int k;
2158     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2159 
2160     if (DbgOut) {
2161       fprintf(DbgOut, "\ninput: A = {\n" );
2162       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2163       fprintf(DbgOut, "}\n" );
2164     }
2165     C.v = VREV_C(A.v);
2166     if (DbgOut) {
2167       fprintf(DbgOut, "C = VREV_C(A) => {\n" );
2168       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2169       fprintf(DbgOut, "}\n" );
2170     }
2171     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_C(): Inp A" );
2172     PFFFT_ASSERT4( C, 13.0F, 14.0F, 11.0F, 12.0F, "VREV_C(): Out C" );
2173   }
2174 
2175   return numErrs;
2176 }
2177 
2178 #else  /* if ( SIMD_SZ == 4 ) */
2179 
2180 void FUNC_VALIDATE_SIMD_A()
2181 {
2182 }
2183 
2184 int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
2185 {
2186   (void)DbgOut;
2187   return -1;
2188 }
2189 
2190 #endif  /* end if ( SIMD_SZ == 4 ) */
2191 
2192