1 /* Copyright (c) 2023 Amazon
2 Written by Jan Buethe */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define OSCE_SPEC_WINDOW_SIZE 320
33 #define OSCE_SPEC_NUM_FREQS 161
34
35
36 /*DEBUG*/
37 /*#define WRITE_FEATURES*/
38 /*#define DEBUG_PRING*/
39 /*******/
40
41 #include "stack_alloc.h"
42 #include "osce_features.h"
43 #include "kiss_fft.h"
44 #include "os_support.h"
45 #include "osce.h"
46 #include "freq.h"
47
48
49 #if defined(WRITE_FEATURES) || defined(DEBUG_PRING)
50 #include <stdio.h>
51 #include <stdlib.h>
52 #endif
53
/* Center bins of the 64 bands used for the clean (LPC based) spectrum.
   Entries are indices into the 161-bin one-sided spectrum, increase
   monotonically and run from bin 0 to bin 160. Passed to apply_filterbank()
   together with band_weights_clean. */
static const int center_bins_clean[64] = {
      0,   2,   5,   8,  10,  12,  15,  18,  20,  22,  25,  28,  30,  33,  35,  38,
     40,  42,  45,  48,  50,  52,  55,  58,  60,  62,  65,  68,  70,  73,  75,  78,
     80,  82,  85,  88,  90,  92,  95,  98, 100, 102, 105, 108, 110, 112, 115, 118,
    120, 122, 125, 128, 130, 132, 135, 138, 140, 142, 145, 148, 150, 152, 155, 160
};
64
/* Center bins of the 18 bands used for the noisy (signal based) spectrum.
   Entries are indices into the 161-bin one-sided spectrum; the spacing
   grows from 4 bins at the low end to 24 bins at the high end. Passed to
   apply_filterbank() together with band_weights_noisy. */
static const int center_bins_noisy[18] = {
     0,   4,   8,  12,  16,  20,
    24,  28,  32,  40,  48,  56,
    64,  80,  96, 112, 136, 160
};
70
/* Per-band scaling factors for the 64 clean-spectrum bands; entry b is the
   factor apply_filterbank() multiplies into every contribution to band b. */
static const float band_weights_clean[64] = {
    0.666666666667f, 0.4f, 0.333333333333f, 0.4f, 0.5f, 0.4f, 0.333333333333f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.5f, 0.4f, 0.333333333333f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.5f, 0.4f, 0.333333333333f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.5f, 0.4f, 0.333333333333f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.5f, 0.4f, 0.333333333333f, 0.4f,
    0.5f, 0.4f, 0.333333333333f, 0.4f, 0.5f, 0.4f, 0.25f, 0.333333333333f
};
89
/* Per-band scaling factors for the 18 noisy-spectrum bands; entry b is the
   factor apply_filterbank() multiplies into every contribution to band b. */
static const float band_weights_noisy[18] = {
    0.4f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f,
    0.25f, 0.25f, 0.166666666667f, 0.125f, 0.125f, 0.125f,
    0.083333333333f, 0.0625f, 0.0625f, 0.05f, 0.041666666667f, 0.08f
};
97
98 static float osce_window[OSCE_SPEC_WINDOW_SIZE] = {
99 0.004908718808f, 0.014725683311f, 0.024541228523f, 0.034354408400f, 0.044164277127f,
100 0.053969889210f, 0.063770299562f, 0.073564563600f, 0.083351737332f, 0.093130877450f,
101 0.102901041421f, 0.112661287575f, 0.122410675199f, 0.132148264628f, 0.141873117332f,
102 0.151584296010f, 0.161280864678f, 0.170961888760f, 0.180626435180f, 0.190273572448f,
103 0.199902370753f, 0.209511902052f, 0.219101240157f, 0.228669460829f, 0.238215641862f,
104 0.247738863176f, 0.257238206902f, 0.266712757475f, 0.276161601717f, 0.285583828929f,
105 0.294978530977f, 0.304344802381f, 0.313681740399f, 0.322988445118f, 0.332264019538f,
106 0.341507569661f, 0.350718204573f, 0.359895036535f, 0.369037181064f, 0.378143757022f,
107 0.387213886697f, 0.396246695891f, 0.405241314005f, 0.414196874117f, 0.423112513073f,
108 0.431987371563f, 0.440820594212f, 0.449611329655f, 0.458358730621f, 0.467061954019f,
109 0.475720161014f, 0.484332517110f, 0.492898192230f, 0.501416360796f, 0.509886201809f,
110 0.518306898929f, 0.526677640552f, 0.534997619887f, 0.543266035038f, 0.551482089078f,
111 0.559644990127f, 0.567753951426f, 0.575808191418f, 0.583806933818f, 0.591749407690f,
112 0.599634847523f, 0.607462493302f, 0.615231590581f, 0.622941390558f, 0.630591150148f,
113 0.638180132051f, 0.645707604824f, 0.653172842954f, 0.660575126926f, 0.667913743292f,
114 0.675187984742f, 0.682397150168f, 0.689540544737f, 0.696617479953f, 0.703627273726f,
115 0.710569250438f, 0.717442741007f, 0.724247082951f, 0.730981620454f, 0.737645704427f,
116 0.744238692572f, 0.750759949443f, 0.757208846506f, 0.763584762206f, 0.769887082016f,
117 0.776115198508f, 0.782268511401f, 0.788346427627f, 0.794348361383f, 0.800273734191f,
118 0.806121974951f, 0.811892519997f, 0.817584813152f, 0.823198305781f, 0.828732456844f,
119 0.834186732948f, 0.839560608398f, 0.844853565250f, 0.850065093356f, 0.855194690420f,
120 0.860241862039f, 0.865206121757f, 0.870086991109f, 0.874883999665f, 0.879596685080f,
121 0.884224593137f, 0.888767277786f, 0.893224301196f, 0.897595233788f, 0.901879654283f,
122 0.906077149740f, 0.910187315596f, 0.914209755704f, 0.918144082372f, 0.921989916403f,
123 0.925746887127f, 0.929414632439f, 0.932992798835f, 0.936481041442f, 0.939879024058f,
124 0.943186419177f, 0.946402908026f, 0.949528180593f, 0.952561935658f, 0.955503880820f,
125 0.958353732530f, 0.961111216112f, 0.963776065795f, 0.966348024735f, 0.968826845041f,
126 0.971212287799f, 0.973504123096f, 0.975702130039f, 0.977806096779f, 0.979815820533f,
127 0.981731107599f, 0.983551773378f, 0.985277642389f, 0.986908548290f, 0.988444333892f,
128 0.989884851171f, 0.991229961288f, 0.992479534599f, 0.993633450666f, 0.994691598273f,
129 0.995653875433f, 0.996520189401f, 0.997290456679f, 0.997964603026f, 0.998542563469f,
130 0.999024282300f, 0.999409713092f, 0.999698818696f, 0.999891571247f, 0.999987952167f,
131 0.999987952167f, 0.999891571247f, 0.999698818696f, 0.999409713092f, 0.999024282300f,
132 0.998542563469f, 0.997964603026f, 0.997290456679f, 0.996520189401f, 0.995653875433f,
133 0.994691598273f, 0.993633450666f, 0.992479534599f, 0.991229961288f, 0.989884851171f,
134 0.988444333892f, 0.986908548290f, 0.985277642389f, 0.983551773378f, 0.981731107599f,
135 0.979815820533f, 0.977806096779f, 0.975702130039f, 0.973504123096f, 0.971212287799f,
136 0.968826845041f, 0.966348024735f, 0.963776065795f, 0.961111216112f, 0.958353732530f,
137 0.955503880820f, 0.952561935658f, 0.949528180593f, 0.946402908026f, 0.943186419177f,
138 0.939879024058f, 0.936481041442f, 0.932992798835f, 0.929414632439f, 0.925746887127f,
139 0.921989916403f, 0.918144082372f, 0.914209755704f, 0.910187315596f, 0.906077149740f,
140 0.901879654283f, 0.897595233788f, 0.893224301196f, 0.888767277786f, 0.884224593137f,
141 0.879596685080f, 0.874883999665f, 0.870086991109f, 0.865206121757f, 0.860241862039f,
142 0.855194690420f, 0.850065093356f, 0.844853565250f, 0.839560608398f, 0.834186732948f,
143 0.828732456844f, 0.823198305781f, 0.817584813152f, 0.811892519997f, 0.806121974951f,
144 0.800273734191f, 0.794348361383f, 0.788346427627f, 0.782268511401f, 0.776115198508f,
145 0.769887082016f, 0.763584762206f, 0.757208846506f, 0.750759949443f, 0.744238692572f,
146 0.737645704427f, 0.730981620454f, 0.724247082951f, 0.717442741007f, 0.710569250438f,
147 0.703627273726f, 0.696617479953f, 0.689540544737f, 0.682397150168f, 0.675187984742f,
148 0.667913743292f, 0.660575126926f, 0.653172842954f, 0.645707604824f, 0.638180132051f,
149 0.630591150148f, 0.622941390558f, 0.615231590581f, 0.607462493302f, 0.599634847523f,
150 0.591749407690f, 0.583806933818f, 0.575808191418f, 0.567753951426f, 0.559644990127f,
151 0.551482089078f, 0.543266035038f, 0.534997619887f, 0.526677640552f, 0.518306898929f,
152 0.509886201809f, 0.501416360796f, 0.492898192230f, 0.484332517110f, 0.475720161014f,
153 0.467061954019f, 0.458358730621f, 0.449611329655f, 0.440820594212f, 0.431987371563f,
154 0.423112513073f, 0.414196874117f, 0.405241314005f, 0.396246695891f, 0.387213886697f,
155 0.378143757022f, 0.369037181064f, 0.359895036535f, 0.350718204573f, 0.341507569661f,
156 0.332264019538f, 0.322988445118f, 0.313681740399f, 0.304344802381f, 0.294978530977f,
157 0.285583828929f, 0.276161601717f, 0.266712757475f, 0.257238206902f, 0.247738863176f,
158 0.238215641862f, 0.228669460829f, 0.219101240157f, 0.209511902052f, 0.199902370753f,
159 0.190273572448f, 0.180626435180f, 0.170961888760f, 0.161280864678f, 0.151584296010f,
160 0.141873117332f, 0.132148264628f, 0.122410675199f, 0.112661287575f, 0.102901041421f,
161 0.093130877450f, 0.083351737332f, 0.073564563600f, 0.063770299562f, 0.053969889210f,
162 0.044164277127f, 0.034354408400f, 0.024541228523f, 0.014725683311f, 0.004908718808f
163 };
164
/* Accumulates spectral bins into overlapping bands.

   Every bin i with center_bins[b] <= i < center_bins[b+1] is shared between
   band b and band b+1: band b receives the fraction that falls linearly from
   1 at center_bins[b] towards 0 at center_bins[b+1], band b+1 receives the
   remainder. Each contribution is additionally scaled by the receiving
   band's entry in band_weights. The bin located at the last band center is
   added to the last band only.

   x_out         O   num_bands accumulated band values
   x_in          I   input bins, read up to index center_bins[num_bands - 1]
   center_bins   I   bin index of every band center, in increasing order
   band_weights  I   scaling factor of every band
   num_bands     I   number of bands

   x_out and x_in must not be the same buffer. */
static void apply_filterbank(float *x_out, float *x_in, const int *center_bins, const float* band_weights, int num_bands)
{
    int band, bin;
    int lower, upper;
    float share;

    celt_assert(x_in != x_out);

    /* band 0 is cleared here, band b+1 right before its first contribution */
    x_out[0] = 0;
    for (band = 0; band < num_bands - 1; band++)
    {
        x_out[band + 1] = 0;
        lower = center_bins[band];
        upper = center_bins[band + 1];
        for (bin = lower; bin < upper; bin++)
        {
            /* part of this bin that belongs to the lower of the two bands */
            share = (float) (upper - bin) / (upper - lower);
            x_out[band]     += band_weights[band] * share * x_in[bin];
            x_out[band + 1] += band_weights[band + 1] * (1 - share) * x_in[bin];
        }
    }

    /* the inner loops stop one bin short of the last center */
    x_out[num_bands - 1] += band_weights[num_bands - 1] * x_in[center_bins[num_bands - 1]];

#ifdef DEBUG_PRINT
    for (band = 0; band < num_bands; band++)
    {
        printf("band[%d]: %f\n", band, x_out[band]);
    }
#endif
}
192
193
mag_spec_320_onesided(float * out,float * in)194 static void mag_spec_320_onesided(float *out, float *in)
195 {
196 celt_assert(OSCE_SPEC_WINDOW_SIZE == 320);
197 kiss_fft_cpx buffer[OSCE_SPEC_WINDOW_SIZE];
198 int k;
199 forward_transform(buffer, in);
200
201 for (k = 0; k < OSCE_SPEC_NUM_FREQS; k++)
202 {
203 out[k] = OSCE_SPEC_WINDOW_SIZE * sqrt(buffer[k].r * buffer[k].r + buffer[k].i * buffer[k].i);
204 #ifdef DEBUG_PRINT
205 printf("magspec[%d]: %f\n", k, out[k]);
206 #endif
207 }
208 }
209
210
/* Derives the clean log spectrum feature from a set of LPC coefficients.

   The sequence 1, -a[0], -a[1], ... (coefficients converted from Q12) is
   zero padded to the window size, its magnitude spectrum is inverted bin by
   bin, accumulated into OSCE_CLEAN_SPEC_NUM_BANDS bands and finally mapped
   to 0.3 * log().

   spec       O   OSCE_CLEAN_SPEC_NUM_BANDS log band values
   a_q12      I   LPC coefficients in Q12
   lpc_order  I   number of coefficients in a_q12 */
static void calculate_log_spectrum_from_lpc(float *spec, opus_int16 *a_q12, int lpc_order)
{
    float work[OSCE_SPEC_WINDOW_SIZE] = {0};
    int idx;

    /* coefficient sequence; everything past lpc_order stays zero */
    work[0] = 1;
    for (idx = 1; idx <= lpc_order; idx++)
    {
        work[idx] = - (float)a_q12[idx - 1] / (1U << 12);
    }

    /* magnitude response (in place) ... */
    mag_spec_320_onesided(work, work);

    /* ... and its inverse; the offset avoids a division by zero */
    for (idx = 0; idx < OSCE_SPEC_NUM_FREQS; idx++)
    {
        work[idx] = 1.f / (work[idx] + 1e-9f);
    }

    /* band accumulation */
    apply_filterbank(spec, work, center_bins_clean, band_weights_clean, OSCE_CLEAN_SPEC_NUM_BANDS);

    /* scaled logarithm; the offset keeps the argument positive */
    for (idx = 0; idx < OSCE_CLEAN_SPEC_NUM_BANDS; idx++)
    {
        spec[idx] = 0.3f * log(spec[idx] + 1e-9f);
    }
}
240
calculate_cepstrum(float * cepstrum,float * signal)241 static void calculate_cepstrum(float *cepstrum, float *signal)
242 {
243 float buffer[OSCE_SPEC_WINDOW_SIZE];
244 float *spec = &buffer[OSCE_SPEC_NUM_FREQS + 3];
245 int n;
246
247 celt_assert(cepstrum != signal)
248
249 for (n = 0; n < OSCE_SPEC_WINDOW_SIZE; n++)
250 {
251 buffer[n] = osce_window[n] * signal[n];
252 }
253
254 /* calculate magnitude spectrum */
255 mag_spec_320_onesided(buffer, buffer);
256
257 /* accumulate bands */
258 apply_filterbank(spec, buffer, center_bins_noisy, band_weights_noisy, OSCE_NOISY_SPEC_NUM_BANDS);
259
260 /* log domain conversion */
261 for (n = 0; n < OSCE_NOISY_SPEC_NUM_BANDS; n++)
262 {
263 spec[n] = log(spec[n] + 1e-9f);
264 #ifdef DEBUG_PRINT
265 printf("logspec[%d]: %f\n", n, spec[n]);
266 #endif
267 }
268
269 /* DCT-II (orthonormal) */
270 celt_assert(OSCE_NOISY_SPEC_NUM_BANDS == NB_BANDS);
271 dct(cepstrum, spec);
272 }
273
/* Normalized cross-correlation between an 80-sample subframe and the signal
   one pitch period earlier, evaluated for the five offsets -2 .. +2 around
   the given lag.

   acorr   O   5 correlation values; acorr[k + 2] belongs to offset k
   signal  I   start of the current subframe; samples from
               signal[-lag - 2] up to signal[79 - lag + 2] are read as well,
               so the caller has to provide sufficient history
   lag     I   pitch lag in samples

   acorr must not overlap the signal samples that are read. */
static void calculate_acorr(float *acorr, float *signal, int lag)
{
    int n, k;
    int offset;
    float xx, xy, yy;

    celt_assert(acorr != signal);

    /* the energy of the current subframe is the same for every offset,
       so it is accumulated once instead of once per offset */
    xx = 0;
    for (n = 0; n < 80; n++)
    {
        xx += signal[n] * signal[n];
    }

    for (k = -2; k <= 2; k++)
    {
        offset = k - lag;
        xy = 0;
        yy = 0;
        for (n = 0; n < 80; n++)
        {
            yy += signal[n + offset] * signal[n + offset];
            xy += signal[n] * signal[n + offset];
        }
        /* the offset under the root avoids a division by zero for silent input */
        acorr[k+2] = xy / sqrt(xx * yy + 1e-9f);
    }
}
295
/* Maps the decoded pitch lag of a subframe to the lag used for feature
   calculation and keeps the pitch tracking state up to date.

   Voiced frames pass their lag through and store it in last_lag. All other
   frames yield OSCE_NO_PITCH_VALUE, unless the hangover logic is compiled in
   (OSCE_HANGOVER_BUGFIX defined), in which case last_lag may be repeated for
   a limited number of calls after a voiced frame.

   psFeatures  I/O  feature state (last_lag, last_type, pitch_hangover_count)
   lag         I    decoded pitch lag
   type        I    signal type of the frame, compared against TYPE_VOICED

   Returns the lag to use; it is asserted to be non-zero. */
static int pitch_postprocessing(OSCEFeatureState *psFeatures, int lag, int type)
{
    int new_lag;
    int modulus;

    /* hangover is currently disabled to reflect a bug in the python code.
       ToDo: re-evaluate hangover */
#ifdef OSCE_HANGOVER_BUGFIX
#define TESTBIT 1
#else
#define TESTBIT 0
#endif

    /* a hangover length of 0 must not end up as modulo operand */
    modulus = OSCE_PITCH_HANGOVER;
    if (modulus == 0) modulus ++;

    if (type == TYPE_VOICED)
    {
        /* voiced frame: update last_lag */
        new_lag = lag;
        psFeatures->last_lag = lag;
        psFeatures->pitch_hangover_count = 0;
    }
    else if (TESTBIT && psFeatures->last_type == TYPE_VOICED)
    {
        /* enter hangover */
        if (psFeatures->pitch_hangover_count < OSCE_PITCH_HANGOVER)
        {
            new_lag = psFeatures->last_lag;
            psFeatures->pitch_hangover_count = (psFeatures->pitch_hangover_count + 1) % modulus;
        }
        else
        {
            new_lag = OSCE_NO_PITCH_VALUE;
        }
    }
    else if (TESTBIT && psFeatures->pitch_hangover_count)
    {
        /* continue hangover */
        new_lag = psFeatures->last_lag;
        psFeatures->pitch_hangover_count = (psFeatures->pitch_hangover_count + 1) % modulus;
    }
    else
    {
        /* unvoiced frame after hangover */
        new_lag = OSCE_NO_PITCH_VALUE;
        psFeatures->pitch_hangover_count = 0;
    }

    /* remember the type for the next call */
    psFeatures->last_type = type;

    /* with the current setup this should never happen (but who knows...) */
    celt_assert(new_lag);

    return new_lag;
}
349
/* Assembles the OSCE input features for all subframes of a decoded frame.

   For every 80-sample subframe k a vector of OSCE_FEATURE_DIM floats is
   written to features + k * OSCE_FEATURE_DIM. It holds the clean log
   spectrum derived from the LPC coefficients, the noisy cepstrum of the
   decoded signal, the auto-correlation around the pitch lag, the LTP
   coefficients and the log gain. Spectrum and cepstrum are recomputed on
   even subframes only and copied from the preceding subframe otherwise.
   The feature state (smoothed bit count, pitch tracking, signal history) is
   updated as a side effect. */
void osce_calculate_features(
    silk_decoder_state          *psDec,         /* I/O  Decoder state                           */
    silk_decoder_control        *psDecCtrl,     /* I    Decoder control                         */
    float                       *features,      /* O    input features                          */
    float                       *numbits,       /* O    numbits and smoothed numbits            */
    int                         *periods,       /* O    pitch lags on subframe basis            */
    const opus_int16            xq[],           /* I    Decoded speech                          */
    opus_int32                  num_bits        /* I    Size of SILK payload in bits            */
)
{
    int num_subframes, num_samples;
    float buffer[OSCE_FEATURES_MAX_HISTORY + OSCE_MAX_FEATURE_FRAMES * 80];
    float *frame, *pfeatures;
    OSCEFeatureState *psFeatures;
    int i, n, k;
#ifdef WRITE_FEATURES
    static FILE *f_feat = NULL;
    if (f_feat == NULL)
    {
        f_feat = fopen("assembled_features.f32", "wb");
    }
#endif

    memset(buffer, 0, sizeof(buffer));

    num_subframes = psDec->nb_subfr;
    num_samples = num_subframes * 80;
    psFeatures = &psDec->osce.features;

    /* smooth bit count */
    psFeatures->numbits_smooth = 0.9f * psFeatures->numbits_smooth + 0.1f * num_bits;
    numbits[0] = num_bits;
    numbits[1] = psFeatures->numbits_smooth;

    /* working signal: history from the previous call followed by the new
       samples, scaled from 16 bit integers to floats */
    for (n = 0; n < num_samples; n++)
    {
        buffer[OSCE_FEATURES_MAX_HISTORY + n] = (float) xq[n] / (1U<<15);
    }
    OPUS_COPY(buffer, psFeatures->signal_history, OSCE_FEATURES_MAX_HISTORY);

    for (k = 0; k < num_subframes; k++)
    {
        pfeatures = features + k * OSCE_FEATURE_DIM;
        frame = &buffer[OSCE_FEATURES_MAX_HISTORY + k * 80];

        /* precaution: clear the complete feature vector. memset() takes a
           size in bytes, so the element count has to be scaled. */
        memset(pfeatures, 0, OSCE_FEATURE_DIM * sizeof(*pfeatures));

        /* clean spectrum from lpcs (update every other frame) */
        if (k % 2 == 0)
        {
            calculate_log_spectrum_from_lpc(pfeatures + OSCE_CLEAN_SPEC_START, psDecCtrl->PredCoef_Q12[k >> 1], psDec->LPC_order);
        }
        else
        {
            OPUS_COPY(pfeatures + OSCE_CLEAN_SPEC_START, pfeatures + OSCE_CLEAN_SPEC_START - OSCE_FEATURE_DIM, OSCE_CLEAN_SPEC_LENGTH);
        }

        /* noisy cepstrum from signal (update every other frame);
           the 320-sample window starts 160 samples before the subframe */
        if (k % 2 == 0)
        {
            calculate_cepstrum(pfeatures + OSCE_NOISY_CEPSTRUM_START, frame - 160);
        }
        else
        {
            OPUS_COPY(pfeatures + OSCE_NOISY_CEPSTRUM_START, pfeatures + OSCE_NOISY_CEPSTRUM_START - OSCE_FEATURE_DIM, OSCE_NOISY_CEPSTRUM_LENGTH);
        }

        /* pitch hangover and zero value replacement */
        periods[k] = pitch_postprocessing(psFeatures, psDecCtrl->pitchL[k], psDec->indices.signalType);

        /* auto-correlation around pitch lag */
        calculate_acorr(pfeatures + OSCE_ACORR_START, frame, periods[k]);

        /* ltp */
        celt_assert(OSCE_LTP_LENGTH == LTP_ORDER);
        for (i = 0; i < OSCE_LTP_LENGTH; i++)
        {
            pfeatures[OSCE_LTP_START + i] = (float) psDecCtrl->LTPCoef_Q14[k * LTP_ORDER + i] / (1U << 14);
        }

        /* frame gain */
        pfeatures[OSCE_LOG_GAIN_START] = log((float) psDecCtrl->Gains_Q16[k] / (1UL << 16) + 1e-9f);

#ifdef WRITE_FEATURES
        /* NOTE(review): the hard-coded 93 presumably equals OSCE_FEATURE_DIM -- confirm */
        if (f_feat != NULL)
        {
            fwrite(pfeatures, sizeof(*pfeatures), 93, f_feat);
        }
#endif
    }

    /* buffer update: keep the most recent samples as history for the next call */
    OPUS_COPY(psFeatures->signal_history, &buffer[num_samples], OSCE_FEATURES_MAX_HISTORY);
}
441
442
osce_cross_fade_10ms(float * x_enhanced,float * x_in,int length)443 void osce_cross_fade_10ms(float *x_enhanced, float *x_in, int length)
444 {
445 int i;
446 celt_assert(length >= 160);
447
448 for (i = 0; i < 160; i++)
449 {
450 x_enhanced[i] = osce_window[i] * x_enhanced[i] + (1.f - osce_window[i]) * x_in[i];
451 }
452
453
454 }
455