1 /* Copyright (C) 2007 Hong Zhiqian */
2 /**
3 @file kiss_fft_tm.h
4 @author Hong Zhiqian
5 @brief Various compatibility routines for Speex (TriMedia version)
6 */
7 /*
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions
10 are met:
11
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14
15 - Redistributions in binary form must reproduce the above copyright
16 notice, this list of conditions and the following disclaimer in the
17 documentation and/or other materials provided with the distribution.
18
19 - Neither the name of the Xiph.org Foundation nor the names of its
20 contributors may be used to endorse or promote products derived from
21 this software without specific prior written permission.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
27 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 #include "_kiss_fft_guts_tm.h"
37
38 #ifdef TM_ASM
39
40 #include "profile_tm.h"
41
42 #ifdef FIXED_POINT
43
44 #define OVERRIDE_KFBFLY2
/**
 * Radix-2 butterfly, fixed-point TriMedia variant.
 *
 * For each n in [0, m) the sample at Fout[n] is combined with the sample at
 * Fout[m + n], using one twiddle word read from st->twiddles (advanced by
 * fstride words per step); both results are stored back in place.  Samples
 * and twiddles are accessed as single 32-bit words through int pointers
 * (presumably a packed 16-bit real/imaginary pair -- confirm against the
 * kiss_fft_cpx definition).
 *
 * @param Fout     buffer transformed in place
 * @param fstride  twiddle step, in 32-bit words, between butterflies
 * @param st       FFT state; its twiddles and inverse fields are read
 * @param m        number of butterflies; also the distance between the halves
 */
static void kf_bfly2(
    kiss_fft_cpx *Fout,
    const int fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register int * restrict upper_half;
    register int * restrict twid_ptr = (int*)st->twiddles;
    register int n, offset;
    /* Non-zero when st->inverse is zero, i.e. scaling is applied on that path. */
    register int forward = !st->inverse;

    upper_half = (int*)Fout + m;

    n      = 0;
    offset = 0;     /* stays equal to 4*n; passed to st32d (presumably a byte displacement) */

    while ( n < m )
    {
        register int twid_word, samp_lo, samp_hi;

        samp_lo   = ld32x(Fout, n);
        samp_hi   = ld32x(upper_half, n);
        twid_word = ld32(twid_ptr);

        /* Both inputs are shifted right by one before being combined. */
        if ( forward )
        {
            TM_SHR(samp_hi, samp_hi, 1);
            TM_SHR(samp_lo, samp_lo, 1);
        }

        TM_MUL(twid_word, twid_word, samp_hi);
        TM_SUB(samp_hi, samp_lo, twid_word);
        TM_ADD(samp_lo, samp_lo, twid_word);

        st32d(offset, upper_half, samp_hi);
        st32d(offset, Fout, samp_lo);

        ++n;
        offset   += 4;
        twid_ptr += fstride;
    }
}
79
80 #define OVERRIDE_KFBFLY4
/**
 * Radix-4 butterfly, fixed-point TriMedia variant.
 *
 * Processes m groups of four samples taken from Fout[n], Fout[m + n],
 * Fout[2m + n] and Fout[3m + n].  The three upper samples are multiplied by
 * twiddles read at strides fstride, 2*fstride and 3*fstride, and the four
 * results are written back in place.  Data is handled as 32-bit words
 * (presumably packed 16-bit real/imaginary pairs -- confirm against the
 * kiss_fft_cpx definition).
 *
 * @param Fout     buffer transformed in place
 * @param fstride  base twiddle step, in 32-bit words
 * @param st       FFT state; its twiddles and inverse fields are read
 * @param m        number of butterflies; also the distance between quarters
 */
static void kf_bfly4(
    kiss_fft_cpx *Fout,
    const int fstride,
    const kiss_fft_cfg st,
    const int m
    )
{
    register int * restrict twid1;
    register int * restrict twid2;
    register int * restrict twid3;
    register int * restrict quarter1;
    register int * restrict quarter2;
    register int * restrict quarter3;
    register int n, offset;
    register int step2, step3;
    /* Non-zero when st->inverse is zero. */
    register int forward = !st->inverse;

    twid3 = twid2 = twid1 = (int*)st->twiddles;
    step2 = fstride << 1;
    step3 = fstride * 3;

    quarter1 = (int*)Fout + m;
    quarter2 = (int*)Fout + (m << 1);
    quarter3 = (int*)Fout + (m * 3);

    n      = 0;
    offset = 0;     /* stays equal to 4*n; passed to st32d (presumably a byte displacement) */

    while ( n < m )
    {
        register int bin0, bin1, bin2, bin3;
        register int rot1, rot2, rot3;
        register int diff02, sum13, diff13;
        register int mix_add, mix_sub;
        register int out1, out3;

        bin1 = ld32x(quarter1, n);
        rot1 = ld32(twid1);
        bin2 = ld32x(quarter2, n);
        rot2 = ld32(twid2);
        bin3 = ld32x(quarter3, n);
        rot3 = ld32(twid3);
        bin0 = ld32x(Fout, n);

        if ( forward )
        {
            /* Add 0x00020002, then shift right by two: this looks like a
               rounded divide-by-four of each 16-bit half (assumption). */
            TM_ADD(bin1, bin1, 0x00020002);
            TM_ADD(bin2, bin2, 0x00020002);
            TM_ADD(bin3, bin3, 0x00020002);
            TM_ADD(bin0, bin0, 0x00020002);
            TM_SHR(bin1, bin1, 2);
            TM_SHR(bin2, bin2, 2);
            TM_SHR(bin3, bin3, 2);
            TM_SHR(bin0, bin0, 2);
        }

        TM_MUL(bin1, bin1, rot1);
        TM_MUL(bin2, bin2, rot2);
        TM_MUL(bin3, bin3, rot3);
        TM_SUB(diff02, bin0, bin2);
        TM_ADD(bin0, bin0, bin2);
        TM_ADD(sum13, bin1, bin3);
        TM_SUB(diff13, bin1, bin3);
        TM_SUB(bin2, bin0, sum13);
        TM_ADD(bin0, bin0, sum13);

        st32d(offset, quarter2, bin2);
        st32d(offset, Fout, bin0);

        /* funshift2(x, x) presumably exchanges the two 16-bit halves of x. */
        diff02 = funshift2(diff02, diff02);

        /* The transform direction decides which of the two outputs gets the
           sum and which gets the difference. */
        if ( forward )
        {
            TM_ADD(mix_add, diff02, diff13);
            TM_SUB(mix_sub, diff02, diff13);
        }
        else
        {
            TM_ADD(mix_sub, diff02, diff13);
            TM_SUB(mix_add, diff02, diff13);
        }

        out1 = funshift2(mix_sub, mix_add);
        out3 = funshift2(mix_add, mix_sub);

        st32d(offset, quarter1, out1);
        st32d(offset, quarter3, out3);

        ++n;
        offset += 4;
        twid1  += fstride;
        twid2  += step2;
        twid3  += step3;
    }
}
161
162
163 #define OVERRIDE_KFBFLY3
/**
 * Radix-3 butterfly, fixed-point TriMedia variant.
 *
 * Processes m groups of three samples taken from Fout[n], Fout[m + n] and
 * Fout[2m + n].  The two upper samples are multiplied by twiddles read at
 * strides fstride and 2*fstride, and a constant derived from the twiddle
 * word at index fstride*m is applied to their difference.  Results are
 * written back in place.  Data is handled as 32-bit words (presumably packed
 * 16-bit real/imaginary pairs -- confirm against the kiss_fft_cpx
 * definition).
 *
 * @param Fout     buffer transformed in place
 * @param fstride  base twiddle step, in 32-bit words
 * @param st       FFT state; its twiddles and inverse fields are read
 * @param m        number of butterflies; also the distance between thirds
 */
static void kf_bfly3(
    kiss_fft_cpx *Fout,
    const int fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register int * restrict twid1;
    register int * restrict twid2;
    register int * restrict third1;
    register int * restrict third2;
    register int epi_pair;
    register int n, offset;
    register int step2;
    /* Non-zero when st->inverse is zero. */
    register int forward = !st->inverse;

    twid1  = twid2 = (int*)st->twiddles;
    third1 = (int*)Fout + m;
    third2 = (int*)Fout + (m << 1);

    /* Twiddle word at fstride*m, with (presumably) its low 16 bits copied
       into both halves by pack16lsb. */
    epi_pair = twid1[fstride*m];
    epi_pair = pack16lsb(epi_pair, epi_pair);
    step2    = fstride << 1;

    n      = 0;
    offset = 0;     /* stays equal to 4*n; passed to st32d (presumably a byte displacement) */

    while ( n < m )
    {
        register int bin0, bin1, bin2;
        register int rot1, rot2;
        register int sum12, diff12, half12, centre;
        register int comb_add, comb_sub;
        register int out1, out2;

        bin1 = ld32x(third1, n);
        bin2 = ld32x(third2, n);
        rot1 = ld32(twid1);
        rot2 = ld32(twid2);
        bin0 = ld32x(Fout, n);

        if ( forward )
        {
            TM_DIV(bin1, bin1, 3);
            TM_DIV(bin2, bin2, 3);
            TM_DIV(bin0, bin0, 3);
        }

        TM_MUL(bin1, bin1, rot1);
        TM_MUL(bin2, bin2, rot2);
        TM_ADD(sum12, bin1, bin2);
        TM_SUB(diff12, bin1, bin2);
        TM_SHR(half12, sum12, 1);
        TM_SUB(centre, bin0, half12);

        diff12 = dspidualmul(diff12, epi_pair);
        diff12 = funshift2(diff12, diff12);

        TM_ADD(bin0, bin0, sum12);
        TM_ADD(comb_add, centre, diff12);
        TM_SUB(comb_sub, centre, diff12);

        /* Recombine halves of the sum/difference words into the two outputs. */
        out1 = funshift2(comb_add, comb_sub);
        out2 = funshift2(comb_sub, comb_add);
        out2 = funshift2(out2, out2);

        st32d(offset, third1, out1);
        st32d(offset, Fout, bin0);
        st32d(offset, third2, out2);

        ++n;
        offset += 4;
        twid1  += fstride;
        twid2  += step2;
    }
}
227
228
229 #define OVERRIDE_KFBFLY5
/**
 * Radix-5 butterfly, fixed-point TriMedia variant.
 *
 * Processes m groups of five samples taken from Fout[n], Fout[m + n],
 * Fout[2m + n], Fout[3m + n] and Fout[4m + n].  The four upper samples are
 * multiplied by twiddles read at strides fstride .. 4*fstride; two further
 * twiddle words (indices fstride*m and fstride*2*m) are pre-packed into
 * constants used inside the loop.  Results are written back in place.  Data
 * is handled as 32-bit words (presumably packed 16-bit real/imaginary pairs
 * -- confirm against the kiss_fft_cpx definition).
 *
 * @param Fout     buffer transformed in place
 * @param fstride  base twiddle step, in 32-bit words
 * @param st       FFT state; its twiddles and inverse fields are read
 * @param m        number of butterflies; also the distance between fifths
 */
static void kf_bfly5(
    kiss_fft_cpx *Fout,
    const int fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register int * restrict twid1;
    register int * restrict twid2;
    register int * restrict twid3;
    register int * restrict twid4;
    register int * restrict fifth1;
    register int * restrict fifth2;
    register int * restrict fifth3;
    register int * restrict fifth4;
    register int step2, step3, step4;
    register int n, offset;
    register int word_ya, word_yb;
    register int yab_msb, yab_lsb, yba_msb, yba_lsb;
    /* Non-zero when st->inverse is zero. */
    register int forward = !st->inverse;

    fifth1 = (int*)Fout + m;
    fifth2 = (int*)Fout + (m << 1);
    fifth3 = (int*)Fout + (3 * m);
    fifth4 = (int*)Fout + (m << 2);

    twid1 = twid2 = twid3 = twid4 = (int*)st->twiddles;

    /* Pack the two constant twiddle words into the operand layouts consumed
       by the ifir16 calls in the loop below. */
    word_ya = twid1[fstride*m];
    word_yb = twid1[fstride*(m << 1)];
    yab_msb = pack16msb(word_ya, word_yb);
    yab_lsb = pack16lsb(word_ya, word_yb);
    yba_msb = funshift2(-sex16(yab_msb), yab_msb);
    yba_lsb = funshift2(yab_lsb, yab_lsb);

    step2 = fstride << 1;
    step3 = fstride * 3;
    step4 = fstride << 2;

    n      = 0;
    offset = 0;     /* stays equal to 4*n; passed to st32d (presumably a byte displacement) */

    while ( n < m )
    {
        register int bin0, bin1, bin2, bin3, bin4;
        register int rot1, rot2, rot3, rot4;
        register int sum14, diff14, sum23, diff23;
        register int total;
        register int sums_msb, sums_lsb, diffs_msb, diffs_lsb;
        register int part_a, part_b, part_c, part_d;

        bin0 = ld32x(Fout, n);
        bin1 = ld32x(fifth1, n);
        bin2 = ld32x(fifth2, n);
        bin3 = ld32x(fifth3, n);
        bin4 = ld32x(fifth4, n);
        rot1 = ld32(twid1);
        rot2 = ld32(twid2);
        rot3 = ld32(twid3);
        rot4 = ld32(twid4);

        if ( forward )
        {
            TM_DIV(bin0, bin0, 5);
            TM_DIV(bin1, bin1, 5);
            TM_DIV(bin2, bin2, 5);
            TM_DIV(bin3, bin3, 5);
            TM_DIV(bin4, bin4, 5);
        }

        total = bin0;

        TM_MUL(bin1, bin1, rot1);
        TM_MUL(bin2, bin2, rot2);
        TM_MUL(bin3, bin3, rot3);
        TM_MUL(bin4, bin4, rot4);
        TM_ADD(sum14, bin1, bin4);
        TM_SUB(diff14, bin1, bin4);
        TM_ADD(sum23, bin2, bin3);
        TM_SUB(diff23, bin2, bin3);

        /* Output 0: input 0 plus both pair sums. */
        TM_ADD(total, total, sum14);
        TM_ADD(total, total, sum23);
        st32d(offset, Fout, total);

        sums_msb  = pack16msb(sum14, sum23);
        sums_lsb  = pack16lsb(sum14, sum23);
        diffs_msb = pack16msb(diff14, diff23);
        diffs_lsb = pack16lsb(diff14, diff23);

        /* Outputs 1 and 4. */
        part_a = pack16lsb( sround(ifir16(sums_msb, yab_lsb)), sround(ifir16(sums_lsb, yab_lsb)));
        part_b = pack16lsb(-sround(ifir16(diffs_lsb, yab_msb)), sround(ifir16(diffs_msb, yab_msb)));

        TM_ADD(part_a, part_a, bin0);
        TM_SUB(bin1, part_a, part_b);
        TM_ADD(bin4, part_a, part_b);
        st32d(offset, fifth1, bin1);
        st32d(offset, fifth4, bin4);

        /* Outputs 2 and 3. */
        part_c = pack16lsb( sround(ifir16(sums_msb, yba_lsb)), sround(ifir16(sums_lsb, yba_lsb)));
        part_d = pack16lsb(-sround(ifir16(diffs_lsb, yba_msb)), sround(ifir16(diffs_msb, yba_msb)));

        TM_ADD(part_c, part_c, bin0);
        TM_ADD(bin2, part_c, part_d);
        TM_SUB(bin3, part_c, part_d);
        st32d(offset, fifth2, bin2);
        st32d(offset, fifth3, bin3);

        ++n;
        offset += 4;
        twid1  += fstride;
        twid2  += step2;
        twid3  += step3;
        twid4  += step4;
    }
}
333
334
335 #define OVERRIDE_KF_BFLY_GENERIC
/**
 * Generic radix-p butterfly, fixed-point TriMedia variant.
 *
 * For each of the m columns, the p samples Fout[col], Fout[col + m], ... are
 * copied into scratchbuf (divided by p via TM_DIV when st->inverse is zero).
 * Each output is then rebuilt as scratchbuf[0] plus the sum of
 * scratchbuf[term] times a twiddle, for term = 1 .. p-1, and stored back to
 * Fout.  scratchbuf and nscratchbuf are not declared here; they are expected
 * to be visible from the enclosing file and are handed to CHECKBUF first.
 *
 * @param Fout     buffer transformed in place
 * @param fstride  twiddle step multiplier
 * @param st       FFT state; its twiddles, inverse and nfft fields are read
 * @param m        number of columns; also the distance between a column's samples
 * @param p        radix (number of samples per column)
 */
static void kf_bfly_generic(
    kiss_fft_cpx * restrict Fout,
    const size_t fstride,
    const kiss_fft_cfg st,
    int m,
    int p
    )
{
    register int forward = !st->inverse;    /* non-zero when st->inverse is zero */
    register int col, q, pos, term;
    register int * restrict twid_tab = (int*)st->twiddles;
    register int table_len = st->nfft;

    CHECKBUF(scratchbuf,nscratchbuf,p);

    for ( col = 0; col < m; ++col )
    {
        register int first_word;

        /* Gather this column's p inputs into the scratch buffer. */
        q   = 0;
        pos = col;
        while ( q < p )
        {
            register int word;

            word = ld32x(Fout, pos);

            if ( forward )
            {
                TM_DIV(word, word, p);
            }

            st32d(q << 2, scratchbuf, word);

            ++q;
            pos += m;
        }

        /* Rebuild each of the p outputs from the gathered inputs. */
        q          = 0;
        pos        = col;
        first_word = ld32(scratchbuf);
        while ( q < p )
        {
            register int twid_index = 0;
            register int accum;

            accum = first_word;
            for ( term = 1; term < p; ++term )
            {
                register int twid_word, scratch_word;

                /* Step the twiddle index and wrap it once past the table length. */
                twid_index += fstride * pos;
                if ( twid_index >= table_len )
                {
                    twid_index -= table_len;
                }

                scratch_word = ld32x(scratchbuf, term);
                twid_word    = ld32x(twid_tab, twid_index);

                TM_MUL(scratch_word, scratch_word, twid_word);
                TM_ADD(accum, accum, scratch_word);
            }
            st32d(pos << 2, Fout, accum);

            ++q;
            pos += m;
        }
    }
}
389
390 #else
391
392 #define OVERRIDE_KFBFLY2
/**
 * Radix-2 butterfly, floating-point TriMedia variant.
 *
 * Combines Fout[n] with Fout[m + n] for successive n, using the twiddle at
 * st->twiddles[n * fstride], and writes both results back in place.  The
 * loop body runs before m is tested, so callers must pass m >= 1.
 *
 * @param Fout     buffer transformed in place
 * @param fstride  twiddle step, in entries, between butterflies
 * @param st       FFT state; its twiddles field is read
 * @param m        number of butterflies; also the distance between the halves
 */
static void kf_bfly2(
    kiss_fft_cpx * Fout,
    const size_t fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register kiss_fft_cpx * restrict upper_half = Fout + m;
    register kiss_fft_cpx * restrict twid_ptr   = st->twiddles;

    do
    {
        register kiss_fft_cpx hi_val, lo_val, rotated;

        /* Work on local copies so each sample is read and written once. */
        hi_val = *upper_half;
        lo_val = *Fout;

        C_MUL (rotated, hi_val, *twid_ptr);
        C_SUB (hi_val, lo_val, rotated);
        C_ADD (lo_val, lo_val, rotated);

        *upper_half = hi_val;
        *Fout       = lo_val;

        twid_ptr += fstride;
        ++upper_half;
        ++Fout;

    } while ( --m );
}
425
426 #define OVERRIDE_KFBFLY4
/**
 * Radix-4 butterfly, floating-point TriMedia variant.
 *
 * Combines Fout[n], Fout[m + n], Fout[2m + n] and Fout[3m + n] for
 * successive n.  The three upper samples are multiplied by twiddles read at
 * strides fstride, 2*fstride and 3*fstride, and all four results are written
 * back in place.  The loop body runs before m is tested, so callers must
 * pass m >= 1.
 *
 * @param Fout     buffer transformed in place
 * @param fstride  base twiddle step, in entries
 * @param st       FFT state; its twiddles and inverse fields are read
 * @param m        number of butterflies; also the distance between quarters
 */
static void kf_bfly4(
    kiss_fft_cpx * Fout,
    const int fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register kiss_fft_cpx * restrict twid1, * restrict twid2, * restrict twid3;
    register kiss_fft_cpx * restrict quarter1, * restrict quarter2, * restrict quarter3;
    register int forward = !st->inverse;    /* non-zero when st->inverse is zero */

    twid3 = twid2 = twid1 = st->twiddles;

    quarter1 = Fout + m;
    quarter2 = Fout + (m << 1);
    quarter3 = Fout + (m * 3);

    do {

        register kiss_fft_cpx bin0;
        register kiss_fft_cpx prod1, prod2, prod3, sum13, diff13, diff02;

        bin0 = *Fout;

        C_MUL( prod1, *quarter1, *twid1);
        C_MUL( prod2, *quarter2, *twid2);
        C_MUL( prod3, *quarter3, *twid3);
        C_SUB( diff02, bin0, prod2);
        C_ADD( bin0, bin0, prod2);
        C_ADD( sum13, prod1, prod3);
        C_SUB( diff13, prod1, prod3);
        C_SUB(*quarter2, bin0, sum13);
        C_ADD( *Fout, bin0, sum13);

        /* Outputs 1 and 3 are diff02 with diff13 rotated by a quarter turn;
           the transform direction selects the sense of the rotation. */
        if ( forward )
        {
            quarter1->r = diff02.r + diff13.i;
            quarter1->i = diff02.i - diff13.r;
            quarter3->r = diff02.r - diff13.i;
            quarter3->i = diff02.i + diff13.r;
        }
        else
        {
            quarter1->r = diff02.r - diff13.i;
            quarter1->i = diff02.i + diff13.r;
            quarter3->r = diff02.r + diff13.i;
            quarter3->i = diff02.i - diff13.r;
        }

        twid1 += fstride;
        twid2 += (fstride << 1);
        twid3 += (fstride * 3);

        ++Fout;
        ++quarter1;
        ++quarter2;
        ++quarter3;

    } while ( --m );
}
484
485 #define OVERRIDE_KFBFLY3
/**
 * Radix-3 butterfly, floating-point TriMedia variant.
 *
 * Combines Fout[n], Fout[m + n] and Fout[2m + n] for successive n.  The two
 * upper samples are multiplied by twiddles read at strides fstride and
 * 2*fstride; the imaginary part of the twiddle at index fstride*m scales
 * their difference.  All three results are written back in place.  The loop
 * body runs before m is tested, so callers must pass m >= 1.
 *
 * @param Fout     buffer transformed in place
 * @param fstride  base twiddle step, in entries
 * @param st       FFT state; its twiddles field is read
 * @param m        number of butterflies; also the distance between thirds
 */
static void kf_bfly3(
    kiss_fft_cpx * Fout,
    const int fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register kiss_fft_cpx * restrict Fout1, * restrict Fout2;
    register kiss_fft_cpx * restrict tw1, * restrict tw2;
    register float epi;

    tw1 = tw2 = st->twiddles;
    epi = st->twiddles[fstride*m].i;
    Fout1 = Fout + m;
    Fout2 = Fout + (m << 1);

    do {

        register kiss_fft_cpx _fout;
        register kiss_fft_cpx sc0, sc1, sc2, sc3;

        _fout = *Fout;

        C_MUL( sc1, *Fout1, *tw1);
        C_MUL( sc2, *Fout2, *tw2);
        C_ADD( sc3, sc1, sc2);
        C_SUB( sc0, sc1, sc2);
        tw1 += fstride;
        tw2 += (fstride << 1);

        /* sc1 becomes the common term: input 0 minus half the twiddled sum. */
        sc1.r = _fout.r - HALF_OF(sc3.r);
        sc1.i = _fout.i - HALF_OF(sc3.i);

        C_MULBYSCALAR(sc0, epi);
        C_ADD(*Fout, _fout, sc3);

        /* Outputs 2 and 1 are the common term minus / plus the scaled
           difference rotated by a quarter turn: (re, im) -> (-im, re).
           Real parts pair with sc0.i and imaginary parts with sc0.r. */
        Fout2->r = sc1.r + sc0.i;
        Fout2->i = sc1.i - sc0.r;

        Fout1->r = sc1.r - sc0.i;
        Fout1->i = sc1.i + sc0.r;

        ++Fout; ++Fout1; ++Fout2;

    } while(--m);
}
532
533 #define OVERRIDE_KFBFLY5
/**
 * Radix-5 butterfly, floating-point TriMedia variant.
 *
 * Combines Fout[u], Fout[m + u], Fout[2m + u], Fout[3m + u] and Fout[4m + u]
 * for u in [0, m).  The four upper samples are multiplied by twiddles at
 * indices u*fstride .. 4*u*fstride; the twiddles at indices fstride*m and
 * fstride*2*m supply the constants ya and yb.  All five results are written
 * back in place.
 *
 * @param Fout     buffer transformed in place
 * @param fstride  base twiddle step, in entries
 * @param st       FFT state; its twiddles field is read
 * @param m        number of butterflies; also the distance between fifths
 */
static void kf_bfly5(
    kiss_fft_cpx * Fout,
    const size_t fstride,
    const kiss_fft_cfg st,
    int m
    )
{
    register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3, * restrict Fout4;
    register int u;
    register kiss_fft_cpx *tw;
    register float yar, yai, ybr, ybi;

    Fout1 = Fout + m;
    Fout2 = Fout + (m << 1);
    Fout3 = Fout + (m * 3);
    Fout4 = Fout + (m << 2);

    tw = st->twiddles;
    yar = tw[fstride*m].r;
    yai = tw[fstride*m].i;
    ybr = tw[fstride*2*m].r;
    ybi = tw[fstride*2*m].i;

    for ( u = 0; u < m; ++u )
    {
        register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12;

        sc0 = *Fout;

        C_MUL( sc1, *Fout1, tw[u*fstride]);
        C_MUL( sc2, *Fout2, tw[2*u*fstride]);
        C_MUL( sc3, *Fout3, tw[3*u*fstride]);
        C_MUL( sc4, *Fout4, tw[4*u*fstride]);

        C_ADD( sc7, sc1, sc4);
        C_SUB( sc10, sc1, sc4);
        C_ADD( sc8, sc2, sc3);
        C_SUB( sc9, sc2, sc3);

        /* Output 0: input 0 plus both pair sums. */
        Fout->r = sc0.r + sc7.r + sc8.r;
        Fout->i = sc0.i + sc7.i + sc8.i;

        /* Outputs 1 and 4. */
        sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr);
        sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr);

        sc6.r =  S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi);
        sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi);

        C_SUB(*Fout1, sc5, sc6);
        C_ADD(*Fout4, sc5, sc6);

        /* Outputs 2 and 3. */
        sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar);
        sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar);
        sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai);
        sc12.i =   S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai);
        C_ADD(*Fout2, sc11, sc12);
        C_SUB(*Fout3, sc11, sc12);

        /* All five pointers advance together; Fout must move with the others
           so that each butterfly reads and writes its own first element. */
        ++Fout; ++Fout1; ++Fout2; ++Fout3; ++Fout4;
    }
}
595
596
597 #endif
598
599 #endif
600