1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define ANALYSIS_C
33
34 #ifdef MLP_TRAINING
35 #include <stdio.h>
36 #endif
37
38 #include "mathops.h"
39 #include "kiss_fft.h"
40 #include "celt.h"
41 #include "modes.h"
42 #include "arch.h"
43 #include "quant_bands.h"
44 #include "analysis.h"
45 #include "mlp.h"
46 #include "stack_alloc.h"
47 #include "float_cast.h"
48
49 #ifndef M_PI
50 #define M_PI 3.141592653
51 #endif
52
53 #ifndef DISABLE_FLOAT_API
54
55 #define TRANSITION_PENALTY 10
56
57 static const float dct_table[128] = {
58 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
59 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
60 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
61 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
62 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
63 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
64 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
65 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
66 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
67 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
68 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
69 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
70 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
71 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
72 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
73 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
74 };
75
76 static const float analysis_window[240] = {
77 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
78 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
79 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
80 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
81 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
82 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
83 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
84 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
85 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
86 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
87 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
88 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
89 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
90 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
91 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
92 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
93 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
94 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
95 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
96 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
97 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
98 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
99 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
100 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
101 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
102 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
103 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
104 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
105 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
106 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
107 };
108
109 static const int tbands[NB_TBANDS+1] = {
110 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
111 };
112
113 #define NB_TONAL_SKIP_BANDS 9
114
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)115 static opus_val32 silk_resampler_down2_hp(
116 opus_val32 *S, /* I/O State vector [ 2 ] */
117 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
118 const opus_val32 *in, /* I Input signal [ len ] */
119 int inLen /* I Number of input samples */
120 )
121 {
122 int k, len2 = inLen/2;
123 opus_val32 in32, out32, out32_hp, Y, X;
124 opus_val64 hp_ener = 0;
125 /* Internal variables and state are in Q10 format */
126 for( k = 0; k < len2; k++ ) {
127 /* Convert to Q10 */
128 in32 = in[ 2 * k ];
129
130 /* All-pass section for even input sample */
131 Y = SUB32( in32, S[ 0 ] );
132 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
133 out32 = ADD32( S[ 0 ], X );
134 S[ 0 ] = ADD32( in32, X );
135 out32_hp = out32;
136 /* Convert to Q10 */
137 in32 = in[ 2 * k + 1 ];
138
139 /* All-pass section for odd input sample, and add to output of previous section */
140 Y = SUB32( in32, S[ 1 ] );
141 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
142 out32 = ADD32( out32, S[ 1 ] );
143 out32 = ADD32( out32, X );
144 S[ 1 ] = ADD32( in32, X );
145
146 Y = SUB32( -in32, S[ 2 ] );
147 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
148 out32_hp = ADD32( out32_hp, S[ 2 ] );
149 out32_hp = ADD32( out32_hp, X );
150 S[ 2 ] = ADD32( -in32, X );
151
152 if(__builtin_add_overflow(hp_ener, out32_hp*(opus_val64)out32_hp, &hp_ener))
153 {
154 hp_ener = UINT64_MAX;
155 }
156 /* Add, convert back to int16 and store to output */
157 out[ k ] = HALF32(out32);
158 }
159 #ifdef FIXED_POINT
160 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
161 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
162 #endif
163 return (opus_val32)hp_ener;
164 }
165
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)166 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
167 {
168 VARDECL(opus_val32, tmp);
169 opus_val32 scale;
170 int j;
171 opus_val32 ret = 0;
172 SAVE_STACK;
173
174 if (subframe==0) return 0;
175 if (Fs == 48000)
176 {
177 subframe *= 2;
178 offset *= 2;
179 } else if (Fs == 16000) {
180 subframe = subframe*2/3;
181 offset = offset*2/3;
182 }
183 ALLOC(tmp, subframe, opus_val32);
184
185 downmix(_x, tmp, subframe, offset, c1, c2, C);
186 #ifdef FIXED_POINT
187 scale = (1<<SIG_SHIFT);
188 #else
189 scale = 1.f/32768;
190 #endif
191 if (c2==-2)
192 scale /= C;
193 else if (c2>-1)
194 scale /= 2;
195 for (j=0;j<subframe;j++)
196 tmp[j] *= scale;
197 if (Fs == 48000)
198 {
199 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
200 } else if (Fs == 24000) {
201 OPUS_COPY(y, tmp, subframe);
202 } else if (Fs == 16000) {
203 VARDECL(opus_val32, tmp3x);
204 ALLOC(tmp3x, 3*subframe, opus_val32);
205 /* Don't do this at home! This resampler is horrible and it's only (barely)
206 usable for the purpose of the analysis because we don't care about all
207 the aliasing between 8 kHz and 12 kHz. */
208 for (j=0;j<subframe;j++)
209 {
210 tmp3x[3*j] = tmp[j];
211 tmp3x[3*j+1] = tmp[j];
212 tmp3x[3*j+2] = tmp[j];
213 }
214 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
215 }
216 RESTORE_STACK;
217 return ret;
218 }
219
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)220 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
221 {
222 /* Initialize reusable fields. */
223 tonal->arch = opus_select_arch();
224 tonal->Fs = Fs;
225 /* Clear remaining fields. */
226 tonality_analysis_reset(tonal);
227 }
228
tonality_analysis_reset(TonalityAnalysisState * tonal)229 void tonality_analysis_reset(TonalityAnalysisState *tonal)
230 {
231 /* Clear non-reusable fields. */
232 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
233 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
234 }
235
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)236 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
237 {
238 int pos;
239 int curr_lookahead;
240 float tonality_max;
241 float tonality_avg;
242 int tonality_count;
243 int i;
244 int pos0;
245 float prob_avg;
246 float prob_count;
247 float prob_min, prob_max;
248 float vad_prob;
249 int mpos, vpos;
250 int bandwidth_span;
251
252 pos = tonal->read_pos;
253 curr_lookahead = tonal->write_pos-tonal->read_pos;
254 if (curr_lookahead<0)
255 curr_lookahead += DETECT_SIZE;
256
257 tonal->read_subframe += len/(tonal->Fs/400);
258 while (tonal->read_subframe>=8)
259 {
260 tonal->read_subframe -= 8;
261 tonal->read_pos++;
262 }
263 if (tonal->read_pos>=DETECT_SIZE)
264 tonal->read_pos-=DETECT_SIZE;
265
266 /* On long frames, look at the second analysis window rather than the first. */
267 if (len > tonal->Fs/50 && pos != tonal->write_pos)
268 {
269 pos++;
270 if (pos==DETECT_SIZE)
271 pos=0;
272 }
273 if (pos == tonal->write_pos)
274 pos--;
275 if (pos<0)
276 pos = DETECT_SIZE-1;
277 pos0 = pos;
278 OPUS_COPY(info_out, &tonal->info[pos], 1);
279 if (!info_out->valid)
280 return;
281 tonality_max = tonality_avg = info_out->tonality;
282 tonality_count = 1;
283 /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
284 bandwidth_span = 6;
285 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
286 for (i=0;i<3;i++)
287 {
288 pos++;
289 if (pos==DETECT_SIZE)
290 pos = 0;
291 if (pos == tonal->write_pos)
292 break;
293 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
294 tonality_avg += tonal->info[pos].tonality;
295 tonality_count++;
296 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
297 bandwidth_span--;
298 }
299 pos = pos0;
300 /* Look back in time to see if any has a wider bandwidth than the current frame. */
301 for (i=0;i<bandwidth_span;i++)
302 {
303 pos--;
304 if (pos < 0)
305 pos = DETECT_SIZE-1;
306 if (pos == tonal->write_pos)
307 break;
308 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
309 }
310 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
311
312 mpos = vpos = pos0;
313 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
314 ~1 frame delay in the VAD prob. */
315 if (curr_lookahead > 15)
316 {
317 mpos += 5;
318 if (mpos>=DETECT_SIZE)
319 mpos -= DETECT_SIZE;
320 vpos += 1;
321 if (vpos>=DETECT_SIZE)
322 vpos -= DETECT_SIZE;
323 }
324
325 /* The following calculations attempt to minimize a "badness function"
326 for the transition. When switching from speech to music, the badness
327 of switching at frame k is
328 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
329 where
330 v_i is the activity probability (VAD) at frame i,
331 p_i is the music probability at frame i
332 T is the probability threshold for switching
333 S is the penalty for switching during active audio rather than silence
334 the current frame has index i=0
335
336 Rather than apply badness to directly decide when to switch, what we compute
337 instead is the threshold for which the optimal switching point is now. When
338 considering whether to switch now (frame 0) or at frame k, we have:
339 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
340 which gives us:
341 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
342 We take the min threshold across all positive values of k (up to the maximum
343 amount of lookahead we have) to give us the threshold for which the current
344 frame is the optimal switch point.
345
346 The last step is that we need to consider whether we want to switch at all.
347 For that we use the average of the music probability over the entire window.
348 If the threshold is higher than that average we're not going to
349 switch, so we compute a min with the average as well. The result of all these
350 min operations is music_prob_min, which gives the threshold for switching to music
351 if we're currently encoding for speech.
352
353 We do the exact opposite to compute music_prob_max which is used for switching
354 from music to speech.
355 */
356 prob_min = 1.f;
357 prob_max = 0.f;
358 vad_prob = tonal->info[vpos].activity_probability;
359 prob_count = MAX16(.1f, vad_prob);
360 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
361 while (1)
362 {
363 float pos_vad;
364 mpos++;
365 if (mpos==DETECT_SIZE)
366 mpos = 0;
367 if (mpos == tonal->write_pos)
368 break;
369 vpos++;
370 if (vpos==DETECT_SIZE)
371 vpos = 0;
372 if (vpos == tonal->write_pos)
373 break;
374 pos_vad = tonal->info[vpos].activity_probability;
375 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
376 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
377 prob_count += MAX16(.1f, pos_vad);
378 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
379 }
380 info_out->music_prob = prob_avg/prob_count;
381 prob_min = MIN16(prob_avg/prob_count, prob_min);
382 prob_max = MAX16(prob_avg/prob_count, prob_max);
383 prob_min = MAX16(prob_min, 0.f);
384 prob_max = MIN16(prob_max, 1.f);
385
386 /* If we don't have enough look-ahead, do our best to make a decent decision. */
387 if (curr_lookahead < 10)
388 {
389 float pmin, pmax;
390 pmin = prob_min;
391 pmax = prob_max;
392 pos = pos0;
393 /* Look for min/max in the past. */
394 for (i=0;i<IMIN(tonal->count-1, 15);i++)
395 {
396 pos--;
397 if (pos < 0)
398 pos = DETECT_SIZE-1;
399 pmin = MIN16(pmin, tonal->info[pos].music_prob);
400 pmax = MAX16(pmax, tonal->info[pos].music_prob);
401 }
402 /* Bias against switching on active audio. */
403 pmin = MAX16(0.f, pmin - .1f*vad_prob);
404 pmax = MIN16(1.f, pmax + .1f*vad_prob);
405 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
406 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
407 }
408 info_out->music_prob_min = prob_min;
409 info_out->music_prob_max = prob_max;
410
411 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
412 }
413
414 static const float std_feature_bias[9] = {
415 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
416 2.163313f, 1.260756f, 1.116868f, 1.918795f
417 };
418
419 #define LEAKAGE_OFFSET 2.5f
420 #define LEAKAGE_SLOPE 2.f
421
422 #ifdef FIXED_POINT
423 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
424 compensate for that in the energy. */
425 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
426 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
427 #else
428 #define SCALE_ENER(e) (e)
429 #endif
430
431 #ifdef FIXED_POINT
is_digital_silence32(const opus_val32 * pcm,int frame_size,int channels,int lsb_depth)432 static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
433 {
434 int silence = 0;
435 opus_val32 sample_max = 0;
436 #ifdef MLP_TRAINING
437 return 0;
438 #endif
439 sample_max = celt_maxabs32(pcm, frame_size*channels);
440
441 silence = (sample_max == 0);
442 (void)lsb_depth;
443 return silence;
444 }
445 #else
446 #define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
447 #endif
448
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)449 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
450 {
451 int i, b;
452 const kiss_fft_state *kfft;
453 VARDECL(kiss_fft_cpx, in);
454 VARDECL(kiss_fft_cpx, out);
455 int N = 480, N2=240;
456 float * OPUS_RESTRICT A = tonal->angle;
457 float * OPUS_RESTRICT dA = tonal->d_angle;
458 float * OPUS_RESTRICT d2A = tonal->d2_angle;
459 VARDECL(float, tonality);
460 VARDECL(float, noisiness);
461 float band_tonality[NB_TBANDS];
462 float logE[NB_TBANDS];
463 float BFCC[8];
464 float features[25];
465 float frame_tonality;
466 float max_frame_tonality;
467 /*float tw_sum=0;*/
468 float frame_noisiness;
469 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
470 float slope=0;
471 float frame_stationarity;
472 float relativeE;
473 float frame_probs[2];
474 float alpha, alphaE, alphaE2;
475 float frame_loudness;
476 float bandwidth_mask;
477 int is_masked[NB_TBANDS+1];
478 int bandwidth=0;
479 float maxE = 0;
480 float noise_floor;
481 int remaining;
482 AnalysisInfo *info;
483 float hp_ener;
484 float tonality2[240];
485 float midE[8];
486 float spec_variability=0;
487 float band_log2[NB_TBANDS+1];
488 float leakage_from[NB_TBANDS+1];
489 float leakage_to[NB_TBANDS+1];
490 float layer_out[MAX_NEURONS];
491 float below_max_pitch;
492 float above_max_pitch;
493 int is_silence;
494 SAVE_STACK;
495
496 if (!tonal->initialized)
497 {
498 tonal->mem_fill = 240;
499 tonal->initialized = 1;
500 }
501 alpha = 1.f/IMIN(10, 1+tonal->count);
502 alphaE = 1.f/IMIN(25, 1+tonal->count);
503 /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
504 alphaE2 = 1.f/IMIN(100, 1+tonal->count);
505 if (tonal->count <= 1) alphaE2 = 1;
506
507 if (tonal->Fs == 48000)
508 {
509 /* len and offset are now at 24 kHz. */
510 len/= 2;
511 offset /= 2;
512 } else if (tonal->Fs == 16000) {
513 len = 3*len/2;
514 offset = 3*offset/2;
515 }
516
517 kfft = celt_mode->mdct.kfft[0];
518 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
519 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
520 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
521 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
522 {
523 tonal->mem_fill += len;
524 /* Don't have enough to update the analysis */
525 RESTORE_STACK;
526 return;
527 }
528 hp_ener = tonal->hp_ener_accum;
529 info = &tonal->info[tonal->write_pos++];
530 if (tonal->write_pos>=DETECT_SIZE)
531 tonal->write_pos-=DETECT_SIZE;
532
533 is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
534
535 ALLOC(in, 480, kiss_fft_cpx);
536 ALLOC(out, 480, kiss_fft_cpx);
537 ALLOC(tonality, 240, float);
538 ALLOC(noisiness, 240, float);
539 for (i=0;i<N2;i++)
540 {
541 float w = analysis_window[i];
542 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
543 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
544 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
545 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
546 }
547 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
548 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
549 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
550 &tonal->inmem[240], tonal->downmix_state, remaining,
551 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
552 tonal->mem_fill = 240 + remaining;
553 if (is_silence)
554 {
555 /* On silence, copy the previous analysis. */
556 int prev_pos = tonal->write_pos-2;
557 if (prev_pos < 0)
558 prev_pos += DETECT_SIZE;
559 OPUS_COPY(info, &tonal->info[prev_pos], 1);
560 RESTORE_STACK;
561 return;
562 }
563 opus_fft(kfft, in, out, tonal->arch);
564 #ifndef FIXED_POINT
565 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
566 if (celt_isnan(out[0].r))
567 {
568 info->valid = 0;
569 RESTORE_STACK;
570 return;
571 }
572 #endif
573
574 for (i=1;i<N2;i++)
575 {
576 float X1r, X2r, X1i, X2i;
577 float angle, d_angle, d2_angle;
578 float angle2, d_angle2, d2_angle2;
579 float mod1, mod2, avg_mod;
580 X1r = (float)out[i].r+out[N-i].r;
581 X1i = (float)out[i].i-out[N-i].i;
582 X2r = (float)out[i].i+out[N-i].i;
583 X2i = (float)out[N-i].r-out[i].r;
584
585 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
586 d_angle = angle - A[i];
587 d2_angle = d_angle - dA[i];
588
589 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
590 d_angle2 = angle2 - angle;
591 d2_angle2 = d_angle2 - d_angle;
592
593 mod1 = d2_angle - (float)float2int(d2_angle);
594 noisiness[i] = ABS16(mod1);
595 mod1 *= mod1;
596 mod1 *= mod1;
597
598 mod2 = d2_angle2 - (float)float2int(d2_angle2);
599 noisiness[i] += ABS16(mod2);
600 mod2 *= mod2;
601 mod2 *= mod2;
602
603 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
604 /* This introduces an extra delay of 2 frames in the detection. */
605 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
606 /* No delay on this detection, but it's less reliable. */
607 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
608
609 A[i] = angle2;
610 dA[i] = d_angle2;
611 d2A[i] = mod2;
612 }
613 for (i=2;i<N2-1;i++)
614 {
615 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
616 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
617 }
618 frame_tonality = 0;
619 max_frame_tonality = 0;
620 /*tw_sum = 0;*/
621 info->activity = 0;
622 frame_noisiness = 0;
623 frame_stationarity = 0;
624 if (!tonal->count)
625 {
626 for (b=0;b<NB_TBANDS;b++)
627 {
628 tonal->lowE[b] = 1e10;
629 tonal->highE[b] = -1e10;
630 }
631 }
632 relativeE = 0;
633 frame_loudness = 0;
634 /* The energy of the very first band is special because of DC. */
635 {
636 float E = 0;
637 float X1r, X2r;
638 X1r = 2*(float)out[0].r;
639 X2r = 2*(float)out[0].i;
640 E = X1r*X1r + X2r*X2r;
641 for (i=1;i<4;i++)
642 {
643 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
644 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
645 E += binE;
646 }
647 E = SCALE_ENER(E);
648 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
649 }
650 for (b=0;b<NB_TBANDS;b++)
651 {
652 float E=0, tE=0, nE=0;
653 float L1, L2;
654 float stationarity;
655 for (i=tbands[b];i<tbands[b+1];i++)
656 {
657 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
658 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
659 binE = SCALE_ENER(binE);
660 E += binE;
661 tE += binE*MAX32(0, tonality[i]);
662 nE += binE*2.f*(.5f-noisiness[i]);
663 }
664 #ifndef FIXED_POINT
665 /* Check for extreme band energies that could cause NaNs later. */
666 if (!(E<1e9f) || celt_isnan(E))
667 {
668 info->valid = 0;
669 RESTORE_STACK;
670 return;
671 }
672 #endif
673
674 tonal->E[tonal->E_count][b] = E;
675 frame_noisiness += nE/(1e-15f+E);
676
677 frame_loudness += (float)sqrt(E+1e-10f);
678 logE[b] = (float)log(E+1e-10f);
679 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
680 tonal->logE[tonal->E_count][b] = logE[b];
681 if (tonal->count==0)
682 tonal->highE[b] = tonal->lowE[b] = logE[b];
683 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
684 {
685 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
686 tonal->highE[b] -= .01f;
687 else
688 tonal->lowE[b] += .01f;
689 }
690 if (logE[b] > tonal->highE[b])
691 {
692 tonal->highE[b] = logE[b];
693 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
694 } else if (logE[b] < tonal->lowE[b])
695 {
696 tonal->lowE[b] = logE[b];
697 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
698 }
699 relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b]));
700
701 L1=L2=0;
702 for (i=0;i<NB_FRAMES;i++)
703 {
704 L1 += (float)sqrt(tonal->E[i][b]);
705 L2 += tonal->E[i][b];
706 }
707
708 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
709 stationarity *= stationarity;
710 stationarity *= stationarity;
711 frame_stationarity += stationarity;
712 /*band_tonality[b] = tE/(1e-15+E)*/;
713 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
714 #if 0
715 if (b>=NB_TONAL_SKIP_BANDS)
716 {
717 frame_tonality += tweight[b]*band_tonality[b];
718 tw_sum += tweight[b];
719 }
720 #else
721 frame_tonality += band_tonality[b];
722 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
723 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
724 #endif
725 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
726 slope += band_tonality[b]*(b-8);
727 /*printf("%f %f ", band_tonality[b], stationarity);*/
728 tonal->prev_band_tonality[b] = band_tonality[b];
729 }
730
731 leakage_from[0] = band_log2[0];
732 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
733 for (b=1;b<NB_TBANDS+1;b++)
734 {
735 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
736 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
737 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
738 }
739 for (b=NB_TBANDS-2;b>=0;b--)
740 {
741 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
742 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
743 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
744 }
745 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
746 for (b=0;b<NB_TBANDS+1;b++)
747 {
748 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
749 represents the boost needed to overcome the amount of analysis leakage
750 cause in a weaker band b by louder neighbouring bands.
751 The second, based on leakage_from[], applies to a loud band b for
752 which the quantization noise causes synthesis leakage to the weaker
753 neighbouring bands. */
754 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
755 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
756 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
757 }
758 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
759
760 for (i=0;i<NB_FRAMES;i++)
761 {
762 int j;
763 float mindist = 1e15f;
764 for (j=0;j<NB_FRAMES;j++)
765 {
766 int k;
767 float dist=0;
768 for (k=0;k<NB_TBANDS;k++)
769 {
770 float tmp;
771 tmp = tonal->logE[i][k] - tonal->logE[j][k];
772 dist += tmp*tmp;
773 }
774 if (j!=i)
775 mindist = MIN32(mindist, dist);
776 }
777 spec_variability += mindist;
778 }
779 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
780 bandwidth_mask = 0;
781 bandwidth = 0;
782 maxE = 0;
783 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
784 noise_floor *= noise_floor;
785 below_max_pitch=0;
786 above_max_pitch=0;
787 for (b=0;b<NB_TBANDS;b++)
788 {
789 float E=0;
790 float Em;
791 int band_start, band_end;
792 /* Keep a margin of 300 Hz for aliasing */
793 band_start = tbands[b];
794 band_end = tbands[b+1];
795 for (i=band_start;i<band_end;i++)
796 {
797 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
798 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
799 E += binE;
800 }
801 E = SCALE_ENER(E);
802 maxE = MAX32(maxE, E);
803 if (band_start < 64)
804 {
805 below_max_pitch += E;
806 } else {
807 above_max_pitch += E;
808 }
809 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
810 Em = MAX32(E, tonal->meanE[b]);
811 /* Consider the band "active" only if all these conditions are met:
812 1) less than 90 dB below the peak band (maximal masking possible considering
813 both the ATH and the loudness-dependent slope of the spreading function)
814 2) above the PCM quantization noise floor
815 We use b+1 because the first CELT band isn't included in tbands[]
816 */
817 if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
818 bandwidth = b+1;
819 /* Check if the band is masked (see below). */
820 is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask;
821 /* Use a simple follower with 13 dB/Bark slope for spreading function. */
822 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
823 }
824 /* Special case for the last two bands, for which we don't have spectrum but only
825 the energy above 12 kHz. The difficulty here is that the high-pass we use
826 leaks some LF energy, so we need to increase the threshold without accidentally cutting
827 off the band. */
828 if (tonal->Fs == 48000) {
829 float noise_ratio;
830 float Em;
831 float E = hp_ener*(1.f/(60*60));
832 noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
833
834 #ifdef FIXED_POINT
835 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
836 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
837 #endif
838 above_max_pitch += E;
839 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
840 Em = MAX32(E, tonal->meanE[b]);
841 if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
842 bandwidth = 20;
843 /* Check if the band is masked (see below). */
844 is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask;
845 }
846 if (above_max_pitch > below_max_pitch)
847 info->max_pitch_ratio = below_max_pitch/above_max_pitch;
848 else
849 info->max_pitch_ratio = 1;
850 /* In some cases, resampling aliasing can create a small amount of energy in the first band
851 being cut. So if the last band is masked, we don't include it. */
852 if (bandwidth == 20 && is_masked[NB_TBANDS])
853 bandwidth-=2;
854 else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
855 bandwidth--;
856 if (tonal->count<=2)
857 bandwidth = 20;
858 frame_loudness = 20*(float)log10(frame_loudness);
859 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
860 tonal->lowECount *= (1-alphaE);
861 if (frame_loudness < tonal->Etracker-30)
862 tonal->lowECount += alphaE;
863
864 for (i=0;i<8;i++)
865 {
866 float sum=0;
867 for (b=0;b<16;b++)
868 sum += dct_table[i*16+b]*logE[b];
869 BFCC[i] = sum;
870 }
871 for (i=0;i<8;i++)
872 {
873 float sum=0;
874 for (b=0;b<16;b++)
875 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
876 midE[i] = sum;
877 }
878
879 frame_stationarity /= NB_TBANDS;
880 relativeE /= NB_TBANDS;
881 if (tonal->count<10)
882 relativeE = .5f;
883 frame_noisiness /= NB_TBANDS;
884 #if 1
885 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
886 #else
887 info->activity = .5*(1+frame_noisiness-frame_stationarity);
888 #endif
889 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
890 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
891 tonal->prev_tonality = frame_tonality;
892
893 slope /= 8*8;
894 info->tonality_slope = slope;
895
896 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
897 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
898 info->tonality = frame_tonality;
899
900 for (i=0;i<4;i++)
901 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
902
903 for (i=0;i<4;i++)
904 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
905
906 for (i=0;i<4;i++)
907 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
908 for (i=0;i<3;i++)
909 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
910
911 if (tonal->count > 5)
912 {
913 for (i=0;i<9;i++)
914 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
915 }
916 for (i=0;i<4;i++)
917 features[i] = BFCC[i]-midE[i];
918
919 for (i=0;i<8;i++)
920 {
921 tonal->mem[i+24] = tonal->mem[i+16];
922 tonal->mem[i+16] = tonal->mem[i+8];
923 tonal->mem[i+8] = tonal->mem[i];
924 tonal->mem[i] = BFCC[i];
925 }
926 for (i=0;i<9;i++)
927 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
928 features[18] = spec_variability - 0.78f;
929 features[20] = info->tonality - 0.154723f;
930 features[21] = info->activity - 0.724643f;
931 features[22] = frame_stationarity - 0.743717f;
932 features[23] = info->tonality_slope + 0.069216f;
933 features[24] = tonal->lowECount - 0.067930f;
934
935 analysis_compute_dense(&layer0, layer_out, features);
936 analysis_compute_gru(&layer1, tonal->rnn_state, layer_out);
937 analysis_compute_dense(&layer2, frame_probs, tonal->rnn_state);
938
939 /* Probability of speech or music vs noise */
940 info->activity_probability = frame_probs[1];
941 info->music_prob = frame_probs[0];
942
943 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
944 #ifdef MLP_TRAINING
945 for (i=0;i<25;i++)
946 printf("%f ", features[i]);
947 printf("\n");
948 #endif
949
950 info->bandwidth = bandwidth;
951 tonal->prev_bandwidth = bandwidth;
952 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
953 info->noisiness = frame_noisiness;
954 info->valid = 1;
955 RESTORE_STACK;
956 }
957
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)958 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
959 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
960 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
961 {
962 int offset;
963 int pcm_len;
964
965 analysis_frame_size -= analysis_frame_size&1;
966 if (analysis_pcm != NULL)
967 {
968 /* Avoid overflow/wrap-around of the analysis buffer */
969 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
970
971 pcm_len = analysis_frame_size - analysis->analysis_offset;
972 offset = analysis->analysis_offset;
973 while (pcm_len>0) {
974 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
975 offset += Fs/50;
976 pcm_len -= Fs/50;
977 }
978 analysis->analysis_offset = analysis_frame_size;
979
980 analysis->analysis_offset -= frame_size;
981 }
982
983 tonality_get_info(analysis, analysis_info, frame_size);
984 }
985
986 #endif /* DISABLE_FLOAT_API */
987