1 /******************************************************************************
2  *
3  *  Copyright 2014 The Android Open Source Project
4  *  Copyright 2003 - 2004 Open Interface North America, Inc. All rights
5  *                        reserved.
6  *
7  *  Licensed under the Apache License, Version 2.0 (the "License");
8  *  you may not use this file except in compliance with the License.
9  *  You may obtain a copy of the License at:
10  *
11  *  http://www.apache.org/licenses/LICENSE-2.0
12  *
13  *  Unless required by applicable law or agreed to in writing, software
14  *  distributed under the License is distributed on an "AS IS" BASIS,
15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  *  See the License for the specific language governing permissions and
17  *  limitations under the License.
18  *
19  ******************************************************************************/
20 
21 /*******************************************************************************
22   $Revision: #1 $
23  ******************************************************************************/
24 
25 /** @file
26 
27 This file, along with synthesis-generated.c, contains the synthesis
28 filterbank routines. The operations performed correspond to the
29 operations described in A2DP Appendix B, Figure 12.3. Several
30 mathematical optimizations are performed, particularly for the
31 8-subband case.
32 
33 One important optimization is to note that the "matrixing" operation
34 can be decomposed into the product of a type II discrete cosine kernel
35 and another, sparse matrix.
36 
37 According to Fig 12.3, in the 8-subband case,
38 @code
39     N[k][i] = cos((i+0.5)*(k+4)*pi/8), k = 0..15 and i = 0..7
40 @endcode
41 
42 N can be factored as R * C2, where C2 is an 8-point type II discrete
43 cosine kernel given by
44 @code
45     C2[k][i] = cos((i+0.5)*k*pi/8)), k = 0..7 and i = 0..7
46 @endcode
47 
48 R turns out to be a sparse 16x8 matrix with the following non-zero
49 entries:
50 @code
51     R[k][k+4]        =  1,   k = 0..3
52     R[k][abs(12-k)]  = -1,   k = 5..15
53 @endcode
54 
55 The spec describes computing V[0..15] as N * R.
56 @code
57     V[0..15] = N * R = (R * C2) * R = R * (C2 * R)
58 @endcode
59 
60 C2 * R corresponds to computing the discrete cosine transform of R, so
61 V[0..15] can be computed by taking the DCT of R followed by assignment
62 and selective negation of the DCT result into V.
63 
64         Although this was derived empirically using GNU Octave, it is
65         formally demonstrated in, e.g., Liu, Chi-Min and Lee,
66         Wen-Chieh. "A Unified Fast Algorithm for Cosine Modulated
67         Filter Banks in Current Audio Coding Standards." Journal of
68         the AES 47 (December 1999): 1061.
69 
70 Given the shift operation performed prior to computing V[0..15], it is
71 clear that V[0..159] represents a rolling history of the 10 most
72 recent groups of blocks input to the synthesis operation. Interpreting
73 the matrix N in light of its factorization into C2 and R, R's
74 sparseness has implications for interpreting the values in V. In
75 particular, there is considerable redundancy in the values stored in
76 V. Furthermore, since R[4][0..7] are all zeros, one out of every 16
77 values in V will be zero regardless of the input data. Within each
78 block of 16 values in V, fully half of them are redundant or
79 irrelevant:
80 
81 @code
82     V[ 0] =  DCT[4]
83     V[ 1] =  DCT[5]
84     V[ 2] =  DCT[6]
85     V[ 3] =  DCT[7]
86     V[ 4] = 0
87     V[ 5] = -DCT[7] = -V[3] (redundant)
88     V[ 6] = -DCT[6] = -V[2] (redundant)
89     V[ 7] = -DCT[5] = -V[1] (redundant)
90     V[ 8] = -DCT[4] = -V[0] (redundant)
91     V[ 9] = -DCT[3]
92     V[10] = -DCT[2]
93     V[11] = -DCT[1]
94     V[12] = -DCT[0]
95     V[13] = -DCT[1] = V[11] (redundant)
96     V[14] = -DCT[2] = V[10] (redundant)
97     V[15] = -DCT[3] = V[ 9] (redundant)
98 @endcode
99 
100 Since the elements of V beyond 15 were originally computed the same
101 way during a previous run, what holds true for V[x] also holds true
102 for V[x+16]. Thus, so long as care is taken to maintain the mapping,
103 we need only actually store the unique values, which correspond to the
104 output of the DCT, in some cases inverted. In fact, instead of storing
105 V[0..159], we could store DCT[0..79] which would contain a history of
106 DCT results. More on this in a bit.
107 
108 Going back to figure 12.3 in the spec, it should be clear that the
109 vector U need not actually be explicitly constructed, but that with
110 suitable indexing into V during the window operation, the same end can
111 be accomplished. In the same spirit of the pseudocode shown in the
112 figure, the following is the construction of W without using U:
113 
114 @code
115     for i=0 to 79 do
116         W[i] = D[i]*VSIGN(i)*V[remap_V(i)] where remap_V(i) = 32*(int(i/16)) +
117 (i % 16) + (i % 16 >= 8 ? 16 : 0)
118                                              and VSIGN(i) maps i%16 into {1, 1,
119 1, 1, 0, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1 }
120                                              These values correspond to the
121                                              signs of the redundant values as
122                                              shown in the explanation three
123                                              paragraphs above.
124 @endcode
125 
126 We saw above how V[4..8,13..15] (and by extension
127 V[(4..8,13..15)+16*n]) can be defined in terms of other elements
128 within the subblock of V. V[0..3,9..12] correspond to DCT elements.
129 
130 @code
131     for i=0 to 79 do
132         W[i] = D[i]*DSIGN(i)*DCT[remap_DCT(i)]
133 @endcode
134 
135 The DCT is calculated using the Arai-Agui-Nakajima factorization,
136 which saves some computation by producing output that needs to be
137 multiplied by scaling factors before being used.
138 
139 @code
140     for i=0 to 79 do
141         W[i] = D[i]*SCALE[i%8]*AAN_DCT[remap_DCT(i)]
142 @endcode
143 
144 D can be premultiplied with the DCT scaling factors to yield
145 
146 @code
147     for i=0 to 79 do
148         W[i] = DSCALED[i]*AAN_DCT[remap_DCT(i)] where DSCALED[i] =
149 D[i]*SCALE[i%8]
150 @endcode
151 
152 The output samples X[0..7] are defined as sums of W:
153 
154 @code
155         X[j] = sum{i=0..9}(W[j+8*i])
156 @endcode
157 
158 @ingroup codec_internal
159 */
160 
161 /**
162 @addtogroup codec_internal
163 @{
164 */
165 
166 #include "oi_codec_sbc_private.h"
167 
168 const int32_t dec_window_4[21] = {
169         0,      /* +0.00000000E+00 */
170         97,     /* +5.36548976E-04 */
171         270,    /* +1.49188357E-03 */
172         495,    /* +2.73370904E-03 */
173         694,    /* +3.83720193E-03 */
174         704,    /* +3.89205149E-03 */
175         338,    /* +1.86581691E-03 */
176         -554,   /* -3.06012286E-03 */
177         1974,   /* +1.09137620E-02 */
178         3697,   /* +2.04385087E-02 */
179         5224,   /* +2.88757392E-02 */
180         5824,   /* +3.21939290E-02 */
181         4681,   /* +2.58767811E-02 */
182         1109,   /* +6.13245186E-03 */
183         -5214,  /* -2.88217274E-02 */
184         -14047, /* -7.76463494E-02 */
185         24529,  /* +1.35593274E-01 */
186         35274,  /* +1.94987841E-01 */
187         44618,  /* +2.46636662E-01 */
188         50984,  /* +2.81828203E-01 */
189         53243,  /* +2.94315332E-01 */
190 };
191 
192 #define DCTII_4_K06_FIX (11585) /* S1.14      11585   0.707107*/
193 
194 #define DCTII_4_K08_FIX (21407) /* S1.14      21407   1.306563*/
195 
196 #define DCTII_4_K09_FIX (-15137) /* S1.14     -15137  -0.923880*/
197 
198 #define DCTII_4_K10_FIX (-8867) /* S1.14      -8867  -0.541196*/
199 
200 /** Scales x by y bits to the right, adding a rounding factor.
201  */
202 #ifndef SCALE
203 #define SCALE(x, y) (((x) + (1 << ((y) - 1))) >> (y))
204 #endif
205 
206 #ifndef CLIP_INT16
207 #define CLIP_INT16(x)                \
208   do {                               \
209     if ((x) > OI_INT16_MAX) {        \
210       (x) = OI_INT16_MAX;            \
211     } else if ((x) < OI_INT16_MIN) { \
212       (x) = OI_INT16_MIN;            \
213     }                                \
214   } while (0)
215 #endif
216 
217 /**
218  * Default C language implementation of a 16x32->32 multiply. This function may
219  * be replaced by a platform-specific version for speed.
220  *
221  * @param u A signed 16-bit multiplicand
222  * @param v A signed 32-bit multiplier
223 
224  * @return  A signed 32-bit value corresponding to the 32 most significant bits
225  * of the 48-bit product of u and v.
226  */
default_mul_16s_32s_hi(int16_t u,int32_t v)227 INLINE int32_t default_mul_16s_32s_hi(int16_t u, int32_t v) {
228   uint16_t v0;
229   int16_t v1;
230 
231   int32_t w, x;
232 
233   v0 = (uint16_t)(v & 0xffff);
234   v1 = (int16_t)(v >> 16);
235 
236   w = v1 * u;
237   x = u * v0;
238 
239   return w + (x >> 16);
240 }
241 
242 #define MUL_16S_32S_HI(_x, _y) default_mul_16s_32s_hi(_x, _y)
243 
244 #define LONG_MULT_DCT(K, sample) (MUL_16S_32S_HI(K, sample) << 2)
245 
246 PRIVATE void SynthWindow80_generated(int16_t* pcm, SBC_BUFFER_T const* RESTRICT buffer,
247                                      OI_UINT strideShift);
248 PRIVATE void SynthWindow112_generated(int16_t* pcm, SBC_BUFFER_T const* RESTRICT buffer,
249                                       OI_UINT strideShift);
250 PRIVATE void dct2_8(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT x);
251 
252 typedef void (*SYNTH_FRAME)(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm, OI_UINT blkstart,
253                             OI_UINT blkcount);
254 
255 #ifndef COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS
256 #define COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(dest, src) \
257   do {                                                      \
258     shift_buffer(dest, src, 72);                            \
259   } while (0)
260 #endif
261 
262 #ifndef DCT2_8
263 #define DCT2_8(dst, src) dct2_8(dst, src)
264 #endif
265 
266 #ifndef SYNTH80
267 #define SYNTH80 SynthWindow80_generated
268 #endif
269 
270 #ifndef SYNTH112
271 #define SYNTH112 SynthWindow112_generated
272 #endif
273 
OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)274 PRIVATE void OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
275                                   OI_UINT blkstart, OI_UINT blkcount) {
276   OI_UINT blk;
277   OI_UINT ch;
278   OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
279   OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
280   OI_UINT offset = context->common.filterBufferOffset;
281   int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
282   OI_UINT blkstop = blkstart + blkcount;
283 
284   for (blk = blkstart; blk < blkstop; blk++) {
285     if (offset == 0) {
286       COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
287               context->common.filterBuffer[0] + context->common.filterBufferLen - 72,
288               context->common.filterBuffer[0]);
289       if (nrof_channels == 2) {
290         COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
291                 context->common.filterBuffer[1] + context->common.filterBufferLen - 72,
292                 context->common.filterBuffer[1]);
293       }
294       offset = context->common.filterBufferLen - 80;
295     } else {
296       offset -= 1 * 8;
297     }
298 
299     for (ch = 0; ch < nrof_channels; ch++) {
300       DCT2_8(context->common.filterBuffer[ch] + offset, s);
301       SYNTH80(pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
302       s += 8;
303     }
304     pcm += (8 << pcmStrideShift);
305   }
306   context->common.filterBufferOffset = offset;
307 }
308 
OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)309 PRIVATE void OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
310                                    OI_UINT blkstart, OI_UINT blkcount) {
311   OI_UINT blk;
312   OI_UINT ch;
313   OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
314   OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
315   OI_UINT offset = context->common.filterBufferOffset;
316   int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
317   OI_UINT blkstop = blkstart + blkcount;
318 
319   for (blk = blkstart; blk < blkstop; blk++) {
320     if (offset == 0) {
321       COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
322               context->common.filterBuffer[0] + context->common.filterBufferLen - 72,
323               context->common.filterBuffer[0]);
324       if (nrof_channels == 2) {
325         COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
326                 context->common.filterBuffer[1] + context->common.filterBufferLen - 72,
327                 context->common.filterBuffer[1]);
328       }
329       offset = context->common.filterBufferLen - 80;
330     } else {
331       offset -= 8;
332     }
333     for (ch = 0; ch < nrof_channels; ch++) {
334       cosineModulateSynth4(context->common.filterBuffer[ch] + offset, s);
335       SynthWindow40_int32_int32_symmetry_with_sum(
336               pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
337       s += 4;
338     }
339     pcm += (4 << pcmStrideShift);
340   }
341   context->common.filterBufferOffset = offset;
342 }
343 
344 #ifdef SBC_ENHANCED
345 
OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)346 PRIVATE void OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
347                                         OI_UINT blkstart, OI_UINT blkcount) {
348   OI_UINT blk;
349   OI_UINT ch;
350   OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
351   OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
352   OI_UINT offset = context->common.filterBufferOffset;
353   int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
354   OI_UINT blkstop = blkstart + blkcount;
355 
356   for (blk = blkstart; blk < blkstop; blk++) {
357     if (offset == 0) {
358       COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(
359               context->common.filterBuffer[0] + context->common.filterBufferLen - 104,
360               context->common.filterBuffer[0]);
361       if (nrof_channels == 2) {
362         COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(
363                 context->common.filterBuffer[1] + context->common.filterBufferLen - 104,
364                 context->common.filterBuffer[1]);
365       }
366       offset = context->common.filterBufferLen - 112;
367     } else {
368       offset -= 8;
369     }
370     for (ch = 0; ch < nrof_channels; ++ch) {
371       DCT2_8(context->common.filterBuffer[ch] + offset, s);
372       SYNTH112(pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
373       s += 8;
374     }
375     pcm += (8 << pcmStrideShift);
376   }
377   context->common.filterBufferOffset = offset;
378 }
379 
380 static const SYNTH_FRAME SynthFrameEnhanced[] = {
381         NULL,                       /* invalid */
382         OI_SBC_SynthFrame_Enhanced, /* mono */
383         OI_SBC_SynthFrame_Enhanced  /* stereo */
384 };
385 
386 #endif
387 
388 static const SYNTH_FRAME SynthFrame8SB[] = {
389         NULL,                 /* invalid */
390         OI_SBC_SynthFrame_80, /* mono */
391         OI_SBC_SynthFrame_80  /* stereo */
392 };
393 
394 static const SYNTH_FRAME SynthFrame4SB[] = {
395         NULL,                  /* invalid */
396         OI_SBC_SynthFrame_4SB, /* mono */
397         OI_SBC_SynthFrame_4SB  /* stereo */
398 };
399 
OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT start_block,OI_UINT nrof_blocks)400 PRIVATE void OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
401                                OI_UINT start_block, OI_UINT nrof_blocks) {
402   OI_UINT nrof_subbands = context->common.frameInfo.nrof_subbands;
403   OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
404 
405   OI_ASSERT(nrof_subbands == 4 || nrof_subbands == 8);
406   if (nrof_subbands == 4) {
407     SynthFrame4SB[nrof_channels](context, pcm, start_block, nrof_blocks);
408 #ifdef SBC_ENHANCED
409   } else if (context->common.frameInfo.enhanced) {
410     SynthFrameEnhanced[nrof_channels](context, pcm, start_block, nrof_blocks);
411 #endif /* SBC_ENHANCED */
412   } else {
413     SynthFrame8SB[nrof_channels](context, pcm, start_block, nrof_blocks);
414   }
415 }
416 
SynthWindow40_int32_int32_symmetry_with_sum(int16_t * pcm,SBC_BUFFER_T buffer[80],OI_UINT strideShift)417 void SynthWindow40_int32_int32_symmetry_with_sum(int16_t* pcm, SBC_BUFFER_T buffer[80],
418                                                  OI_UINT strideShift) {
419   int32_t pa;
420   int32_t pb;
421 
422   /* These values should be zero, since out[2] of the 4-band cosine modulation
423    * is always zero. */
424   OI_ASSERT(buffer[2] == 0);
425   OI_ASSERT(buffer[10] == 0);
426   OI_ASSERT(buffer[18] == 0);
427   OI_ASSERT(buffer[26] == 0);
428   OI_ASSERT(buffer[34] == 0);
429   OI_ASSERT(buffer[42] == 0);
430   OI_ASSERT(buffer[50] == 0);
431   OI_ASSERT(buffer[58] == 0);
432   OI_ASSERT(buffer[66] == 0);
433   OI_ASSERT(buffer[74] == 0);
434 
435   pa = dec_window_4[4] * (buffer[12] + buffer[76]);
436   pa += dec_window_4[8] * (buffer[16] - buffer[64]);
437   pa += dec_window_4[12] * (buffer[28] + buffer[60]);
438   pa += dec_window_4[16] * (buffer[32] - buffer[48]);
439   pa += dec_window_4[20] * buffer[44];
440   pa = SCALE(-pa, 15);
441   CLIP_INT16(pa);
442   pcm[0 << strideShift] = (int16_t)pa;
443 
444   pa = dec_window_4[1] * buffer[1];
445   pb = dec_window_4[1] * buffer[79];
446   pb += dec_window_4[3] * buffer[3];
447   pa += dec_window_4[3] * buffer[77];
448   pa += dec_window_4[5] * buffer[13];
449   pb += dec_window_4[5] * buffer[67];
450   pb += dec_window_4[7] * buffer[15];
451   pa += dec_window_4[7] * buffer[65];
452   pa += dec_window_4[9] * buffer[17];
453   pb += dec_window_4[9] * buffer[63];
454   pb += dec_window_4[11] * buffer[19];
455   pa += dec_window_4[11] * buffer[61];
456   pa += dec_window_4[13] * buffer[29];
457   pb += dec_window_4[13] * buffer[51];
458   pb += dec_window_4[15] * buffer[31];
459   pa += dec_window_4[15] * buffer[49];
460   pa += dec_window_4[17] * buffer[33];
461   pb += dec_window_4[17] * buffer[47];
462   pb += dec_window_4[19] * buffer[35];
463   pa += dec_window_4[19] * buffer[45];
464   pa = SCALE(-pa, 15);
465   CLIP_INT16(pa);
466   pcm[1 << strideShift] = (int16_t)(pa);
467   pb = SCALE(-pb, 15);
468   CLIP_INT16(pb);
469   pcm[3 << strideShift] = (int16_t)(pb);
470 
471   pa = dec_window_4[2] * (/*buffer[ 2] + */ buffer[78]);   /* buffer[ 2] is always zero */
472   pa += dec_window_4[6] * (buffer[14] /* + buffer[66]*/);  /* buffer[66] is always zero */
473   pa += dec_window_4[10] * (/*buffer[18] + */ buffer[62]); /* buffer[18] is always zero */
474   pa += dec_window_4[14] * (buffer[30] /* + buffer[50]*/); /* buffer[50] is always zero */
475   pa += dec_window_4[18] * (/*buffer[34] + */ buffer[46]); /* buffer[34] is always zero */
476   pa = SCALE(-pa, 15);
477   CLIP_INT16(pa);
478   pcm[2 << strideShift] = (int16_t)(pa);
479 }
480 
481 /**
482   This routine implements the cosine modulation matrix for 4-subband
483   synthesis. This is called "matrixing" in the SBC specification. This
484   matrix, M4,  can be factored into an 8-point Type II Discrete Cosine
485   Transform, DCTII_4 and a matrix S4, given here:
486 
487   @code
488         __               __
489        |   0   0   1   0   |
490        |   0   0   0   1   |
491        |   0   0   0   0   |
492        |   0   0   0  -1   |
493   S4 = |   0   0  -1   0   |
494        |   0  -1   0   0   |
495        |  -1   0   0   0   |
496        |__ 0  -1   0   0 __|
497 
498   M4 * in = S4 * (DCTII_4 * in)
499   @endcode
500 
501   (DCTII_4 * in) is computed using a Fast Cosine Transform. The algorithm
502   here is based on an implementation computed by the SPIRAL computer
503   algebra system, manually converted to fixed-point arithmetic. S4 can be
504   implemented using only assignment and negation.
505   */
cosineModulateSynth4(SBC_BUFFER_T * RESTRICT out,int32_t const * RESTRICT in)506 PRIVATE void cosineModulateSynth4(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT in) {
507   int32_t f0, f1, f2, f3, f4, f7, f8, f9, f10;
508   int32_t y0, y1, y2, y3;
509 
510   f0 = (in[0] - in[3]);
511   f1 = (in[0] + in[3]);
512   f2 = (in[1] - in[2]);
513   f3 = (in[1] + in[2]);
514 
515   f4 = f1 - f3;
516 
517   y0 = -SCALE(f1 + f3, DCT_SHIFT);
518   y2 = -SCALE(LONG_MULT_DCT(DCTII_4_K06_FIX, f4), DCT_SHIFT);
519   f7 = f0 + f2;
520   f8 = LONG_MULT_DCT(DCTII_4_K08_FIX, f0);
521   f9 = LONG_MULT_DCT(DCTII_4_K09_FIX, f7);
522   f10 = LONG_MULT_DCT(DCTII_4_K10_FIX, f2);
523   y3 = -SCALE(f8 + f9, DCT_SHIFT);
524   y1 = -SCALE(f10 - f9, DCT_SHIFT);
525 
526   out[0] = (int16_t)-y2;
527   out[1] = (int16_t)-y3;
528   out[2] = (int16_t)0;
529   out[3] = (int16_t)y3;
530   out[4] = (int16_t)y2;
531   out[5] = (int16_t)y1;
532   out[6] = (int16_t)y0;
533   out[7] = (int16_t)y1;
534 }
535 
536 /**
537 @}
538 */
539