1 /******************************************************************************
2 *
3 * Copyright 2014 The Android Open Source Project
4 * Copyright 2003 - 2004 Open Interface North America, Inc. All rights
5 * reserved.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at:
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 ******************************************************************************/
20
21 /*******************************************************************************
22 $Revision: #1 $
23 ******************************************************************************/
24
25 /** @file
26
27 This file, along with synthesis-generated.c, contains the synthesis
28 filterbank routines. The operations performed correspond to the
29 operations described in A2DP Appendix B, Figure 12.3. Several
30 mathematical optimizations are performed, particularly for the
31 8-subband case.
32
33 One important optimization is to note that the "matrixing" operation
34 can be decomposed into the product of a type II discrete cosine kernel
35 and another, sparse matrix.
36
37 According to Fig 12.3, in the 8-subband case,
38 @code
39 N[k][i] = cos((i+0.5)*(k+4)*pi/8), k = 0..15 and i = 0..7
40 @endcode
41
42 N can be factored as R * C2, where C2 is an 8-point type II discrete
43 cosine kernel given by
44 @code
45 C2[k][i] = cos((i+0.5)*k*pi/8)), k = 0..7 and i = 0..7
46 @endcode
47
48 R turns out to be a sparse 16x8 matrix with the following non-zero
49 entries:
50 @code
51 R[k][k+4] = 1, k = 0..3
52 R[k][abs(12-k)] = -1, k = 5..15
53 @endcode
54
55 The spec describes computing V[0..15] as N * R.
56 @code
57 V[0..15] = N * R = (R * C2) * R = R * (C2 * R)
58 @endcode
59
60 C2 * R corresponds to computing the discrete cosine transform of R, so
61 V[0..15] can be computed by taking the DCT of R followed by assignment
62 and selective negation of the DCT result into V.
63
64 Although this was derived empirically using GNU Octave, it is
65 formally demonstrated in, e.g., Liu, Chi-Min and Lee,
66 Wen-Chieh. "A Unified Fast Algorithm for Cosine Modulated
67 Filter Banks in Current Audio Coding Standards." Journal of
68 the AES 47 (December 1999): 1061.
69
70 Given the shift operation performed prior to computing V[0..15], it is
71 clear that V[0..159] represents a rolling history of the 10 most
72 recent groups of blocks input to the synthesis operation. Interpreting
73 the matrix N in light of its factorization into C2 and R, R's
74 sparseness has implications for interpreting the values in V. In
75 particular, there is considerable redundancy in the values stored in
76 V. Furthermore, since R[4][0..7] are all zeros, one out of every 16
77 values in V will be zero regardless of the input data. Within each
78 block of 16 values in V, fully half of them are redundant or
79 irrelevant:
80
81 @code
82 V[ 0] = DCT[4]
83 V[ 1] = DCT[5]
84 V[ 2] = DCT[6]
85 V[ 3] = DCT[7]
86 V[ 4] = 0
87 V[ 5] = -DCT[7] = -V[3] (redundant)
88 V[ 6] = -DCT[6] = -V[2] (redundant)
89 V[ 7] = -DCT[5] = -V[1] (redundant)
90 V[ 8] = -DCT[4] = -V[0] (redundant)
91 V[ 9] = -DCT[3]
92 V[10] = -DCT[2]
93 V[11] = -DCT[1]
94 V[12] = -DCT[0]
95 V[13] = -DCT[1] = V[11] (redundant)
96 V[14] = -DCT[2] = V[10] (redundant)
97 V[15] = -DCT[3] = V[ 9] (redundant)
98 @endcode
99
100 Since the elements of V beyond 15 were originally computed the same
101 way during a previous run, what holds true for V[x] also holds true
102 for V[x+16]. Thus, so long as care is taken to maintain the mapping,
103 we need only actually store the unique values, which correspond to the
104 output of the DCT, in some cases inverted. In fact, instead of storing
105 V[0..159], we could store DCT[0..79] which would contain a history of
106 DCT results. More on this in a bit.
107
108 Going back to figure 12.3 in the spec, it should be clear that the
109 vector U need not actually be explicitly constructed, but that with
110 suitable indexing into V during the window operation, the same end can
111 be accomplished. In the same spirit of the pseudocode shown in the
112 figure, the following is the construction of W without using U:
113
114 @code
115 for i=0 to 79 do
116 W[i] = D[i]*VSIGN(i)*V[remap_V(i)] where remap_V(i) = 32*(int(i/16)) +
117 (i % 16) + (i % 16 >= 8 ? 16 : 0)
118 and VSIGN(i) maps i%16 into {1, 1,
119 1, 1, 0, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1 }
120 These values correspond to the
121 signs of the redundant values as
122 shown in the explanation three
123 paragraphs above.
124 @endcode
125
126 We saw above how V[4..8,13..15] (and by extension
127 V[(4..8,13..15)+16*n]) can be defined in terms of other elements
128 within the subblock of V. V[0..3,9..12] correspond to DCT elements.
129
130 @code
131 for i=0 to 79 do
132 W[i] = D[i]*DSIGN(i)*DCT[remap_DCT(i)]
133 @endcode
134
135 The DCT is calculated using the Arai-Agui-Nakajima factorization,
136 which saves some computation by producing output that needs to be
137 multiplied by scaling factors before being used.
138
139 @code
140 for i=0 to 79 do
141 W[i] = D[i]*SCALE[i%8]*AAN_DCT[remap_DCT(i)]
142 @endcode
143
144 D can be premultiplied with the DCT scaling factors to yield
145
146 @code
147 for i=0 to 79 do
148 W[i] = DSCALED[i]*AAN_DCT[remap_DCT(i)] where DSCALED[i] =
149 D[i]*SCALE[i%8]
150 @endcode
151
152 The output samples X[0..7] are defined as sums of W:
153
154 @code
155 X[j] = sum{i=0..9}(W[j+8*i])
156 @endcode
157
158 @ingroup codec_internal
159 */
160
161 /**
162 @addtogroup codec_internal
163 @{
164 */
165
166 #include "oi_codec_sbc_private.h"
167
168 const int32_t dec_window_4[21] = {
169 0, /* +0.00000000E+00 */
170 97, /* +5.36548976E-04 */
171 270, /* +1.49188357E-03 */
172 495, /* +2.73370904E-03 */
173 694, /* +3.83720193E-03 */
174 704, /* +3.89205149E-03 */
175 338, /* +1.86581691E-03 */
176 -554, /* -3.06012286E-03 */
177 1974, /* +1.09137620E-02 */
178 3697, /* +2.04385087E-02 */
179 5224, /* +2.88757392E-02 */
180 5824, /* +3.21939290E-02 */
181 4681, /* +2.58767811E-02 */
182 1109, /* +6.13245186E-03 */
183 -5214, /* -2.88217274E-02 */
184 -14047, /* -7.76463494E-02 */
185 24529, /* +1.35593274E-01 */
186 35274, /* +1.94987841E-01 */
187 44618, /* +2.46636662E-01 */
188 50984, /* +2.81828203E-01 */
189 53243, /* +2.94315332E-01 */
190 };
191
192 #define DCTII_4_K06_FIX (11585) /* S1.14 11585 0.707107*/
193
194 #define DCTII_4_K08_FIX (21407) /* S1.14 21407 1.306563*/
195
196 #define DCTII_4_K09_FIX (-15137) /* S1.14 -15137 -0.923880*/
197
198 #define DCTII_4_K10_FIX (-8867) /* S1.14 -8867 -0.541196*/
199
200 /** Scales x by y bits to the right, adding a rounding factor.
201 */
202 #ifndef SCALE
203 #define SCALE(x, y) (((x) + (1 << ((y) - 1))) >> (y))
204 #endif
205
206 #ifndef CLIP_INT16
207 #define CLIP_INT16(x) \
208 do { \
209 if ((x) > OI_INT16_MAX) { \
210 (x) = OI_INT16_MAX; \
211 } else if ((x) < OI_INT16_MIN) { \
212 (x) = OI_INT16_MIN; \
213 } \
214 } while (0)
215 #endif
216
217 /**
218 * Default C language implementation of a 16x32->32 multiply. This function may
219 * be replaced by a platform-specific version for speed.
220 *
221 * @param u A signed 16-bit multiplicand
222 * @param v A signed 32-bit multiplier
223
224 * @return A signed 32-bit value corresponding to the 32 most significant bits
225 * of the 48-bit product of u and v.
226 */
default_mul_16s_32s_hi(int16_t u,int32_t v)227 INLINE int32_t default_mul_16s_32s_hi(int16_t u, int32_t v) {
228 uint16_t v0;
229 int16_t v1;
230
231 int32_t w, x;
232
233 v0 = (uint16_t)(v & 0xffff);
234 v1 = (int16_t)(v >> 16);
235
236 w = v1 * u;
237 x = u * v0;
238
239 return w + (x >> 16);
240 }
241
242 #define MUL_16S_32S_HI(_x, _y) default_mul_16s_32s_hi(_x, _y)
243
244 #define LONG_MULT_DCT(K, sample) (MUL_16S_32S_HI(K, sample) << 2)
245
246 PRIVATE void SynthWindow80_generated(int16_t* pcm, SBC_BUFFER_T const* RESTRICT buffer,
247 OI_UINT strideShift);
248 PRIVATE void SynthWindow112_generated(int16_t* pcm, SBC_BUFFER_T const* RESTRICT buffer,
249 OI_UINT strideShift);
250 PRIVATE void dct2_8(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT x);
251
252 typedef void (*SYNTH_FRAME)(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm, OI_UINT blkstart,
253 OI_UINT blkcount);
254
255 #ifndef COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS
256 #define COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(dest, src) \
257 do { \
258 shift_buffer(dest, src, 72); \
259 } while (0)
260 #endif
261
262 #ifndef DCT2_8
263 #define DCT2_8(dst, src) dct2_8(dst, src)
264 #endif
265
266 #ifndef SYNTH80
267 #define SYNTH80 SynthWindow80_generated
268 #endif
269
270 #ifndef SYNTH112
271 #define SYNTH112 SynthWindow112_generated
272 #endif
273
OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)274 PRIVATE void OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
275 OI_UINT blkstart, OI_UINT blkcount) {
276 OI_UINT blk;
277 OI_UINT ch;
278 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
279 OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
280 OI_UINT offset = context->common.filterBufferOffset;
281 int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
282 OI_UINT blkstop = blkstart + blkcount;
283
284 for (blk = blkstart; blk < blkstop; blk++) {
285 if (offset == 0) {
286 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
287 context->common.filterBuffer[0] + context->common.filterBufferLen - 72,
288 context->common.filterBuffer[0]);
289 if (nrof_channels == 2) {
290 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
291 context->common.filterBuffer[1] + context->common.filterBufferLen - 72,
292 context->common.filterBuffer[1]);
293 }
294 offset = context->common.filterBufferLen - 80;
295 } else {
296 offset -= 1 * 8;
297 }
298
299 for (ch = 0; ch < nrof_channels; ch++) {
300 DCT2_8(context->common.filterBuffer[ch] + offset, s);
301 SYNTH80(pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
302 s += 8;
303 }
304 pcm += (8 << pcmStrideShift);
305 }
306 context->common.filterBufferOffset = offset;
307 }
308
OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)309 PRIVATE void OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
310 OI_UINT blkstart, OI_UINT blkcount) {
311 OI_UINT blk;
312 OI_UINT ch;
313 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
314 OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
315 OI_UINT offset = context->common.filterBufferOffset;
316 int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
317 OI_UINT blkstop = blkstart + blkcount;
318
319 for (blk = blkstart; blk < blkstop; blk++) {
320 if (offset == 0) {
321 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
322 context->common.filterBuffer[0] + context->common.filterBufferLen - 72,
323 context->common.filterBuffer[0]);
324 if (nrof_channels == 2) {
325 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
326 context->common.filterBuffer[1] + context->common.filterBufferLen - 72,
327 context->common.filterBuffer[1]);
328 }
329 offset = context->common.filterBufferLen - 80;
330 } else {
331 offset -= 8;
332 }
333 for (ch = 0; ch < nrof_channels; ch++) {
334 cosineModulateSynth4(context->common.filterBuffer[ch] + offset, s);
335 SynthWindow40_int32_int32_symmetry_with_sum(
336 pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
337 s += 4;
338 }
339 pcm += (4 << pcmStrideShift);
340 }
341 context->common.filterBufferOffset = offset;
342 }
343
344 #ifdef SBC_ENHANCED
345
OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)346 PRIVATE void OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
347 OI_UINT blkstart, OI_UINT blkcount) {
348 OI_UINT blk;
349 OI_UINT ch;
350 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
351 OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
352 OI_UINT offset = context->common.filterBufferOffset;
353 int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
354 OI_UINT blkstop = blkstart + blkcount;
355
356 for (blk = blkstart; blk < blkstop; blk++) {
357 if (offset == 0) {
358 COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(
359 context->common.filterBuffer[0] + context->common.filterBufferLen - 104,
360 context->common.filterBuffer[0]);
361 if (nrof_channels == 2) {
362 COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(
363 context->common.filterBuffer[1] + context->common.filterBufferLen - 104,
364 context->common.filterBuffer[1]);
365 }
366 offset = context->common.filterBufferLen - 112;
367 } else {
368 offset -= 8;
369 }
370 for (ch = 0; ch < nrof_channels; ++ch) {
371 DCT2_8(context->common.filterBuffer[ch] + offset, s);
372 SYNTH112(pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
373 s += 8;
374 }
375 pcm += (8 << pcmStrideShift);
376 }
377 context->common.filterBufferOffset = offset;
378 }
379
380 static const SYNTH_FRAME SynthFrameEnhanced[] = {
381 NULL, /* invalid */
382 OI_SBC_SynthFrame_Enhanced, /* mono */
383 OI_SBC_SynthFrame_Enhanced /* stereo */
384 };
385
386 #endif
387
388 static const SYNTH_FRAME SynthFrame8SB[] = {
389 NULL, /* invalid */
390 OI_SBC_SynthFrame_80, /* mono */
391 OI_SBC_SynthFrame_80 /* stereo */
392 };
393
394 static const SYNTH_FRAME SynthFrame4SB[] = {
395 NULL, /* invalid */
396 OI_SBC_SynthFrame_4SB, /* mono */
397 OI_SBC_SynthFrame_4SB /* stereo */
398 };
399
OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT start_block,OI_UINT nrof_blocks)400 PRIVATE void OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
401 OI_UINT start_block, OI_UINT nrof_blocks) {
402 OI_UINT nrof_subbands = context->common.frameInfo.nrof_subbands;
403 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
404
405 OI_ASSERT(nrof_subbands == 4 || nrof_subbands == 8);
406 if (nrof_subbands == 4) {
407 SynthFrame4SB[nrof_channels](context, pcm, start_block, nrof_blocks);
408 #ifdef SBC_ENHANCED
409 } else if (context->common.frameInfo.enhanced) {
410 SynthFrameEnhanced[nrof_channels](context, pcm, start_block, nrof_blocks);
411 #endif /* SBC_ENHANCED */
412 } else {
413 SynthFrame8SB[nrof_channels](context, pcm, start_block, nrof_blocks);
414 }
415 }
416
SynthWindow40_int32_int32_symmetry_with_sum(int16_t * pcm,SBC_BUFFER_T buffer[80],OI_UINT strideShift)417 void SynthWindow40_int32_int32_symmetry_with_sum(int16_t* pcm, SBC_BUFFER_T buffer[80],
418 OI_UINT strideShift) {
419 int32_t pa;
420 int32_t pb;
421
422 /* These values should be zero, since out[2] of the 4-band cosine modulation
423 * is always zero. */
424 OI_ASSERT(buffer[2] == 0);
425 OI_ASSERT(buffer[10] == 0);
426 OI_ASSERT(buffer[18] == 0);
427 OI_ASSERT(buffer[26] == 0);
428 OI_ASSERT(buffer[34] == 0);
429 OI_ASSERT(buffer[42] == 0);
430 OI_ASSERT(buffer[50] == 0);
431 OI_ASSERT(buffer[58] == 0);
432 OI_ASSERT(buffer[66] == 0);
433 OI_ASSERT(buffer[74] == 0);
434
435 pa = dec_window_4[4] * (buffer[12] + buffer[76]);
436 pa += dec_window_4[8] * (buffer[16] - buffer[64]);
437 pa += dec_window_4[12] * (buffer[28] + buffer[60]);
438 pa += dec_window_4[16] * (buffer[32] - buffer[48]);
439 pa += dec_window_4[20] * buffer[44];
440 pa = SCALE(-pa, 15);
441 CLIP_INT16(pa);
442 pcm[0 << strideShift] = (int16_t)pa;
443
444 pa = dec_window_4[1] * buffer[1];
445 pb = dec_window_4[1] * buffer[79];
446 pb += dec_window_4[3] * buffer[3];
447 pa += dec_window_4[3] * buffer[77];
448 pa += dec_window_4[5] * buffer[13];
449 pb += dec_window_4[5] * buffer[67];
450 pb += dec_window_4[7] * buffer[15];
451 pa += dec_window_4[7] * buffer[65];
452 pa += dec_window_4[9] * buffer[17];
453 pb += dec_window_4[9] * buffer[63];
454 pb += dec_window_4[11] * buffer[19];
455 pa += dec_window_4[11] * buffer[61];
456 pa += dec_window_4[13] * buffer[29];
457 pb += dec_window_4[13] * buffer[51];
458 pb += dec_window_4[15] * buffer[31];
459 pa += dec_window_4[15] * buffer[49];
460 pa += dec_window_4[17] * buffer[33];
461 pb += dec_window_4[17] * buffer[47];
462 pb += dec_window_4[19] * buffer[35];
463 pa += dec_window_4[19] * buffer[45];
464 pa = SCALE(-pa, 15);
465 CLIP_INT16(pa);
466 pcm[1 << strideShift] = (int16_t)(pa);
467 pb = SCALE(-pb, 15);
468 CLIP_INT16(pb);
469 pcm[3 << strideShift] = (int16_t)(pb);
470
471 pa = dec_window_4[2] * (/*buffer[ 2] + */ buffer[78]); /* buffer[ 2] is always zero */
472 pa += dec_window_4[6] * (buffer[14] /* + buffer[66]*/); /* buffer[66] is always zero */
473 pa += dec_window_4[10] * (/*buffer[18] + */ buffer[62]); /* buffer[18] is always zero */
474 pa += dec_window_4[14] * (buffer[30] /* + buffer[50]*/); /* buffer[50] is always zero */
475 pa += dec_window_4[18] * (/*buffer[34] + */ buffer[46]); /* buffer[34] is always zero */
476 pa = SCALE(-pa, 15);
477 CLIP_INT16(pa);
478 pcm[2 << strideShift] = (int16_t)(pa);
479 }
480
481 /**
482 This routine implements the cosine modulation matrix for 4-subband
483 synthesis. This is called "matrixing" in the SBC specification. This
484 matrix, M4, can be factored into an 8-point Type II Discrete Cosine
485 Transform, DCTII_4 and a matrix S4, given here:
486
487 @code
488 __ __
489 | 0 0 1 0 |
490 | 0 0 0 1 |
491 | 0 0 0 0 |
492 | 0 0 0 -1 |
493 S4 = | 0 0 -1 0 |
494 | 0 -1 0 0 |
495 | -1 0 0 0 |
496 |__ 0 -1 0 0 __|
497
498 M4 * in = S4 * (DCTII_4 * in)
499 @endcode
500
501 (DCTII_4 * in) is computed using a Fast Cosine Transform. The algorithm
502 here is based on an implementation computed by the SPIRAL computer
503 algebra system, manually converted to fixed-point arithmetic. S4 can be
504 implemented using only assignment and negation.
505 */
cosineModulateSynth4(SBC_BUFFER_T * RESTRICT out,int32_t const * RESTRICT in)506 PRIVATE void cosineModulateSynth4(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT in) {
507 int32_t f0, f1, f2, f3, f4, f7, f8, f9, f10;
508 int32_t y0, y1, y2, y3;
509
510 f0 = (in[0] - in[3]);
511 f1 = (in[0] + in[3]);
512 f2 = (in[1] - in[2]);
513 f3 = (in[1] + in[2]);
514
515 f4 = f1 - f3;
516
517 y0 = -SCALE(f1 + f3, DCT_SHIFT);
518 y2 = -SCALE(LONG_MULT_DCT(DCTII_4_K06_FIX, f4), DCT_SHIFT);
519 f7 = f0 + f2;
520 f8 = LONG_MULT_DCT(DCTII_4_K08_FIX, f0);
521 f9 = LONG_MULT_DCT(DCTII_4_K09_FIX, f7);
522 f10 = LONG_MULT_DCT(DCTII_4_K10_FIX, f2);
523 y3 = -SCALE(f8 + f9, DCT_SHIFT);
524 y1 = -SCALE(f10 - f9, DCT_SHIFT);
525
526 out[0] = (int16_t)-y2;
527 out[1] = (int16_t)-y3;
528 out[2] = (int16_t)0;
529 out[3] = (int16_t)y3;
530 out[4] = (int16_t)y2;
531 out[5] = (int16_t)y1;
532 out[6] = (int16_t)y0;
533 out[7] = (int16_t)y1;
534 }
535
536 /**
537 @}
538 */
539