xref: /aosp_15_r20/external/libopus/dnn/fwgan.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1 /* Copyright (c) 2023 Amazon */
2 /*
3    Redistribution and use in source and binary forms, with or without
4    modification, are permitted provided that the following conditions
5    are met:
6 
7    - Redistributions of source code must retain the above copyright
8    notice, this list of conditions and the following disclaimer.
9 
10    - Redistributions in binary form must reproduce the above copyright
11    notice, this list of conditions and the following disclaimer in the
12    documentation and/or other materials provided with the distribution.
13 
14    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
18    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
22    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26 
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30 
31 #include "fwgan.h"
32 #include "os_support.h"
33 #include "freq.h"
34 #include "fwgan_data.h"
35 #include "lpcnet.h"
36 #include "pitch.h"
37 #include "nnet.h"
38 #include "lpcnet_private.h"
39 
/* Number of floats in the per-subframe feat_in vector built by
   run_fwgan_subframe(): one quarter of the conditioning upsampler output
   plus FWGAN_FRAME_SIZE/2 pitch-embedding values. */
#define FEAT_IN_SIZE (BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 + FWGAN_FRAME_SIZE/2)

/* Number of features fed to the conditioning upsampler: one fewer than
   NB_FEATURES (checked against the layer's nb_inputs in run_fwgan_upsampler()). */
#define FWGAN_FEATURES (NB_FEATURES-1)
43 
pitch_embeddings(float * pembed,float * phase,double w0)44 static void pitch_embeddings(float *pembed, float *phase, double w0) {
45   int i;
46   float wreal, wimag;
47 #if 1
48   /* This Taylor expansion should be good enough since w0 is always small. */
49   float w2 = w0*w0;
50   wreal = 1 - .5*w2*(1.f - 0.083333333f*w2);
51   wimag = w0*(1 - 0.166666667f*w2*(1.f - 0.05f*w2));
52 #else
53   wreal = cos(w0);
54   wimag = sin(w0);
55 #endif
56   /* Speed-up phase reference by making phase a unit-norm complex value and rotating it
57      by exp(-i*w0) each sample.  */
58   for (i=0;i<SUBFRAME_SIZE;i++) {
59     float tmp;
60     tmp = phase[0]*wreal - phase[1]*wimag;
61     phase[1] = phase[0]*wimag + phase[1]*wreal;
62     phase[0] = tmp;
63     pembed[i] = phase[1];
64     pembed[SUBFRAME_SIZE+i] = phase[0];
65   }
66   /* Renormalize once per sub-frame, though we could probably do it even less frequently. */
67   {
68     float r = 1.f/sqrt(phase[0]*phase[0] + phase[1]*phase[1]);
69     phase[0] *= r;
70     phase[1] *= r;
71   }
72 }
73 
compute_wlpc(float lpc[LPC_ORDER],const float * features)74 static void compute_wlpc(float lpc[LPC_ORDER], const float *features) {
75   float lpc_weight;
76   int i;
77   lpc_from_cepstrum(lpc, features);
78   lpc_weight = 1.f;
79   for (i=0;i<LPC_ORDER;i++) {
80     lpc_weight *= FWGAN_GAMMA;
81     lpc[i] *= lpc_weight;
82   }
83 }
84 
/* Run the conditioning upsampler: a single dense layer with tanh activation.
   - st:       state whose model provides the bfcc_with_corr_upsampler_fc layer.
   - cond:     output, BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE floats.
   - features: input, FWGAN_FEATURES floats. */
static void run_fwgan_upsampler(FWGANState *st, float *cond, const float *features)
{
  FWGAN *model = &st->model;
  /* The compile-time sizes must agree with the dimensions of the loaded layer. */
  celt_assert(model->bfcc_with_corr_upsampler_fc.nb_inputs == FWGAN_FEATURES);
  celt_assert(model->bfcc_with_corr_upsampler_fc.nb_outputs == BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE);
  compute_generic_dense(&model->bfcc_with_corr_upsampler_fc, cond, features, ACTIVATION_TANH);
}
93 
static void fwgan_synthesize_impl(FWGANState *st, float *pcm, const float *lpc, const float *features);

/* Prime the synthesis state from a stretch of already-known signal so that
   synthesis can continue from it.
   - st:        state to prime; the layers in st->model and st->arch are used.
   - pcm0:      CONT_PCM_INPUTS samples of past signal.
   - features0: features used both to derive the weighted LPC filter and to
                synthesize the first frame.
   On return every recurrent/convolution state has been set from the
   continuation network, the filter memories are set, cont_initialized is
   non-zero and st->pcm_buf holds the tail of the first synthesized frame. */
void fwgan_cont(FWGANState *st, const float *pcm0, const float *features0)
{
  int i;
  float norm2, norm_1;
  float wpcm0[CONT_PCM_INPUTS];
  float cont_inputs[CONT_PCM_INPUTS+1];
  float tmp1[MAX_CONT_SIZE];
  float tmp2[MAX_CONT_SIZE];
  float lpc[LPC_ORDER];
  float new_pcm[FWGAN_FRAME_SIZE];
  FWGAN *model;
  /* Restart the pitch phase reference at exactly 1 + 0i. Both components must
     be reset: leaving a stale imaginary part from an earlier run would make
     the phase non-unit-norm until the next renormalization. */
  st->embed_phase[0] = 1;
  st->embed_phase[1] = 0;
  model = &st->model;
  compute_wlpc(lpc, features0);
  /* Deemphasis memory is just the last continuation sample. */
  st->deemph_mem = pcm0[CONT_PCM_INPUTS-1];

  /* Apply analysis filter, considering that the preemphasis and deemphasis filter
     cancel each other in this case since the LPC filter is constant across that boundary.
     */
  for (i=LPC_ORDER;i<CONT_PCM_INPUTS;i++) {
    int j;
    wpcm0[i] = pcm0[i];
    for (j=0;j<LPC_ORDER;j++) wpcm0[i] += lpc[j]*pcm0[i-j-1];
  }
  /* FIXME: Make this less stupid. */
  /* The first LPC_ORDER samples lack enough history for the filter, so they
     are filled with the first properly filtered sample. */
  for (i=0;i<LPC_ORDER;i++) wpcm0[i] = wpcm0[LPC_ORDER];

  /* The memory of the pre-emphasis is the last sample of the weighted signal
     (ignoring preemphasis+deemphasis combination). */
  st->preemph_mem = wpcm0[CONT_PCM_INPUTS-1];
  /* The memory of the synthesis filter is the pre-emphasized continuation. */
  for (i=0;i<LPC_ORDER;i++) st->syn_mem[i] = pcm0[CONT_PCM_INPUTS-1-i] - FWGAN_DEEMPHASIS*pcm0[CONT_PCM_INPUTS-2-i];

  /* Network input: log of the signal norm in slot 0, followed by the
     weighted signal scaled to (approximately) unit norm. */
  norm2 = celt_inner_prod(wpcm0, wpcm0, CONT_PCM_INPUTS, st->arch);
  norm_1 = 1.f/sqrt(1e-8f + norm2);
  for (i=0;i<CONT_PCM_INPUTS;i++) cont_inputs[i+1] = norm_1*wpcm0[i];
  cont_inputs[0] = log(sqrt(norm2) + 1e-7f);

  /* Continuation network: a chain of tanh dense layers ping-ponging between
     tmp1 and tmp2, with the last layer writing st->cont. */
  compute_generic_dense(&model->cont_net_0, tmp1, cont_inputs, ACTIVATION_TANH);
  compute_generic_dense(&model->cont_net_2, tmp2, tmp1, ACTIVATION_TANH);
  compute_generic_dense(&model->cont_net_4, tmp1, tmp2, ACTIVATION_TANH);
  compute_generic_dense(&model->cont_net_6, tmp2, tmp1, ACTIVATION_TANH);
  compute_generic_dense(&model->cont_net_8, tmp1, tmp2, ACTIVATION_TANH);
  celt_assert(CONT_NET_10_OUT_SIZE == model->cont_net_10.nb_outputs);
  compute_generic_dense(&model->cont_net_10, st->cont, tmp1, ACTIVATION_TANH);

  /* Computing continuation for each layer. */
  celt_assert(RNN_GRU_STATE_SIZE == model->rnn_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->rnn_cont_fc_0, st->rnn_state, st->cont, ACTIVATION_TANH);

  celt_assert(FWC1_STATE_SIZE == model->fwc1_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc1_cont_fc_0, st->fwc1_state, st->cont, ACTIVATION_TANH);
  celt_assert(FWC2_STATE_SIZE == model->fwc2_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc2_cont_fc_0, st->fwc2_state, st->cont, ACTIVATION_TANH);
  celt_assert(FWC3_STATE_SIZE == model->fwc3_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc3_cont_fc_0, st->fwc3_state, st->cont, ACTIVATION_TANH);
  celt_assert(FWC4_STATE_SIZE == model->fwc4_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc4_cont_fc_0, st->fwc4_state, st->cont, ACTIVATION_TANH);
  celt_assert(FWC5_STATE_SIZE == model->fwc5_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc5_cont_fc_0, st->fwc5_state, st->cont, ACTIVATION_TANH);
  celt_assert(FWC6_STATE_SIZE == model->fwc6_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc6_cont_fc_0, st->fwc6_state, st->cont, ACTIVATION_TANH);
  celt_assert(FWC7_STATE_SIZE == model->fwc7_cont_fc_0.nb_outputs);
  compute_generic_dense(&model->fwc7_cont_fc_0, st->fwc7_state, st->cont, ACTIVATION_TANH);

  /* Value 1 tells run_fwgan_subframe() that its next call is the very first
     subframe after a continuation. */
  st->cont_initialized = 1;
  /* Process the first frame, discard the first subframe, and keep the rest for the first
     synthesis call. */
  fwgan_synthesize_impl(st, new_pcm, lpc, features0);
  OPUS_COPY(st->pcm_buf, &new_pcm[SUBFRAME_SIZE], FWGAN_FRAME_SIZE-SUBFRAME_SIZE);
}
168 
apply_gain(float * pcm,float c0,float * last_gain)169 static void apply_gain(float *pcm, float c0, float *last_gain) {
170   int i;
171   float gain = pow(10.f, (0.5f*c0/sqrt(18.f)));
172   for (i=0;i<SUBFRAME_SIZE;i++) pcm[i] *= *last_gain;
173   *last_gain = gain;
174 }
175 
fwgan_lpc_syn(float * pcm,float * mem,const float * lpc,float last_lpc[LPC_ORDER])176 static void fwgan_lpc_syn(float *pcm, float *mem, const float *lpc, float last_lpc[LPC_ORDER]) {
177   int i;
178   for (i=0;i<SUBFRAME_SIZE;i++) {
179     int j;
180     for (j=0;j<LPC_ORDER;j++) pcm[i] -= mem[j]*last_lpc[j];
181     OPUS_MOVE(&mem[1], &mem[0], LPC_ORDER-1);
182     mem[0] = pcm[i];
183   }
184   OPUS_COPY(last_lpc, lpc, LPC_ORDER);
185 }
186 
fwgan_preemphasis(float * pcm,float * preemph_mem)187 static void fwgan_preemphasis(float *pcm, float *preemph_mem) {
188   int i;
189   for (i=0;i<SUBFRAME_SIZE;i++) {
190     float tmp = pcm[i];
191     pcm[i] -= FWGAN_DEEMPHASIS * *preemph_mem;
192     *preemph_mem = tmp;
193   }
194 }
195 
fwgan_deemphasis(float * pcm,float * deemph_mem)196 static void fwgan_deemphasis(float *pcm, float *deemph_mem) {
197   int i;
198   for (i=0;i<SUBFRAME_SIZE;i++) {
199     pcm[i] += FWGAN_DEEMPHASIS * *deemph_mem;
200     *deemph_mem = pcm[i];
201   }
202 }
203 
/* Synthesize one subframe.
   - st:   synthesis state (model layers, layer memories, filter memories).
   - pcm:  output, SUBFRAME_SIZE samples.
   - cond: conditioning for this subframe,
           BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 floats.
   - w0:   per-sample phase increment passed to pitch_embeddings().
   - lpc:  LPC_ORDER coefficients; stored in st->last_lpc for the next subframe.
   - c0:   value the output gain is derived from (see apply_gain()).
   When st->cont_initialized is 1 (the first subframe after fwgan_cont())
   only the input convolution is run and the output is all zeros. */
static void run_fwgan_subframe(FWGANState *st, float *pcm, const float *cond, double w0, const float *lpc, float c0)
{
  float tmp1[FWC1_FC_0_OUT_SIZE];
  float tmp2[IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE)];
  float feat_in[FEAT_IN_SIZE];
  float rnn_in[FEAT_IN_CONV1_CONV_OUT_SIZE];
  float pembed[FWGAN_FRAME_SIZE/2];
  FWGAN *model;
  model = &st->model;

  pitch_embeddings(pembed, st->embed_phase, w0);
  /* Concatenate pembed and the subframe conditioning in feat_in. The
     conditioning starts right after the FWGAN_FRAME_SIZE/2 pitch-embedding
     values, so the two parts fill FEAT_IN_SIZE exactly, with no overlap or
     gap, whatever the two sizes are. */
  OPUS_COPY(&feat_in[0], &pembed[0], FWGAN_FRAME_SIZE/2);
  OPUS_COPY(&feat_in[FWGAN_FRAME_SIZE/2], &cond[0], BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4);

  compute_generic_conv1d(&model->feat_in_conv1_conv, rnn_in, st->cont_conv1_mem, feat_in, FEAT_IN_CONV1_CONV_IN_SIZE, ACTIVATION_LINEAR);
  celt_assert(FEAT_IN_NL1_GATE_OUT_SIZE == model->feat_in_nl1_gate.nb_outputs);
  compute_gated_activation(&model->feat_in_nl1_gate, rnn_in, rnn_in, ACTIVATION_TANH);

  if (st->cont_initialized == 1) {
    /* On the very first subframe we stop here. We only want to run the feat_in layer since the
       others are initialized via the continuation network. */
    OPUS_CLEAR(pcm, SUBFRAME_SIZE);
    st->cont_initialized = 2;
    /* The output is all zeros; this call only records the gain for the next subframe. */
    apply_gain(pcm, c0, &st->last_gain);
    OPUS_COPY(st->last_lpc, lpc, LPC_ORDER);
    return;
  }

  compute_generic_gru(&model->rnn_gru_input, &model->rnn_gru_recurrent, st->rnn_state, rnn_in);
  celt_assert(IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE) >= model->rnn_nl_gate.nb_outputs);
  compute_gated_activation(&model->rnn_nl_gate, tmp2, st->rnn_state, ACTIVATION_TANH);

  /* Seven conv1d + gated-activation stages, alternating between tmp1 and tmp2
     as output buffers; the last gate writes straight into pcm. */
  compute_generic_conv1d(&model->fwc1_fc_0, tmp1, st->fwc1_state, tmp2, RNN_GRU_STATE_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc1_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);

  compute_generic_conv1d(&model->fwc2_fc_0, tmp2, st->fwc2_state, tmp1, FWC1_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc2_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);

  compute_generic_conv1d(&model->fwc3_fc_0, tmp1, st->fwc3_state, tmp2, FWC2_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc3_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);

  compute_generic_conv1d(&model->fwc4_fc_0, tmp2, st->fwc4_state, tmp1, FWC3_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc4_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);

  compute_generic_conv1d(&model->fwc5_fc_0, tmp1, st->fwc5_state, tmp2, FWC4_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc5_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);

  compute_generic_conv1d(&model->fwc6_fc_0, tmp2, st->fwc6_state, tmp1, FWC5_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc6_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);

  compute_generic_conv1d(&model->fwc7_fc_0, tmp1, st->fwc7_state, tmp2, FWC6_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
  compute_gated_activation(&model->fwc7_fc_1_gate, pcm, tmp1, ACTIVATION_TANH);

  /* Post-processing of the network output: gain, pre-emphasis, LPC synthesis,
     then de-emphasis. */
  apply_gain(pcm, c0, &st->last_gain);
  fwgan_preemphasis(pcm, &st->preemph_mem);
  fwgan_lpc_syn(pcm, st->syn_mem, lpc, st->last_lpc);
  fwgan_deemphasis(pcm, &st->deemph_mem);
}
263 
/* Reset the whole state to zero and set up the model from the fwgan_arrays
   weight table. Failure of init_fwgan() is only caught by an assertion. */
void fwgan_init(FWGANState *st)
{
  int status;
  OPUS_CLEAR(st, 1);
  status = init_fwgan(&st->model, fwgan_arrays);
  celt_assert(status == 0);
  /* FIXME: perform arch detection. */
}
272 
/* Set up the model in st from a serialized weight blob.
   - st:   state whose model is (re)initialized.
   - data: serialized weights, len bytes.
   - len:  size of data in bytes.
   Returns 0 on success and -1 if the blob yields no weight list or if
   init_fwgan() rejects it. */
int fwgan_load_model(FWGANState *st, const unsigned char *data, int len) {
  WeightArray *list = NULL;
  int ret;
  parse_weights(&list, data, len);
  /* Do not hand a missing weight list to init_fwgan(). Starting from NULL also
     guarantees list is never read uninitialized.
     NOTE(review): this assumes parse_weights() leaves *list NULL (or untouched)
     when parsing fails -- confirm against its implementation. */
  if (list == NULL) return -1;
  ret = init_fwgan(&st->model, list);
  opus_free(list);
  return (ret == 0) ? 0 : -1;
}
282 
/* Synthesize NB_SUBFRAMES subframes from one feature vector.
   - st:       synthesis state; fwgan_cont() must have run (asserted).
   - pcm:      output, NB_SUBFRAMES*SUBFRAME_SIZE samples.
   - lpc:      LPC_ORDER coefficients forwarded to every subframe.
   - features: NB_FEATURES input features. */
static void fwgan_synthesize_impl(FWGANState *st, float *pcm, const float *lpc, const float *features)
{
  float upsampled[BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE];
  float net_features[FWGAN_FEATURES];
  double w0;
  int period;
  int sf;
  celt_assert(st->cont_initialized);
  /* Network input: the first NB_FEATURES-2 features unchanged, followed by the
     last feature offset by 0.5. Feature NB_FEATURES-2 is not passed on. */
  OPUS_COPY(net_features, features, NB_FEATURES-2);
  net_features[NB_FEATURES-2] = features[NB_FEATURES-1]+.5;

  /* Map the feature at index NB_BANDS to an integer period, then to the
     per-sample phase increment. */
  period = (int)floor(.1 + 50*features[NB_BANDS]+100);
  w0 = 2*M_PI/period;
  run_fwgan_upsampler(st, upsampled, net_features);
  /* Each subframe consumes its own quarter of the upsampler output. */
  sf = 0;
  while (sf < NB_SUBFRAMES) {
    run_fwgan_subframe(st, &pcm[sf*SUBFRAME_SIZE], &upsampled[sf*BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4], w0, lpc, features[0]);
    sf++;
  }
}
303 
/* Produce one frame of float output.
   - st:       synthesis state, already primed by fwgan_cont().
   - pcm:      output, FWGAN_FRAME_SIZE samples.
   - features: feature vector for the frame being synthesized.
   The output is delayed: it consists of the FWGAN_FRAME_SIZE-SUBFRAME_SIZE
   samples buffered by the previous call followed by the first subframe of
   the newly synthesized frame. */
void fwgan_synthesize(FWGANState *st, float *pcm, const float *features)
{
  float frame[FWGAN_FRAME_SIZE];
  float wlpc[LPC_ORDER];
  compute_wlpc(wlpc, features);
  fwgan_synthesize_impl(st, frame, wlpc, features);
  /* Output = samples kept from the previous call + first new subframe. */
  OPUS_COPY(pcm, st->pcm_buf, FWGAN_FRAME_SIZE-SUBFRAME_SIZE);
  OPUS_COPY(pcm + (FWGAN_FRAME_SIZE-SUBFRAME_SIZE), frame, SUBFRAME_SIZE);
  /* Keep the remainder of the new frame for the next call. */
  OPUS_COPY(st->pcm_buf, frame + SUBFRAME_SIZE, FWGAN_FRAME_SIZE-SUBFRAME_SIZE);
}
315 
/* Produce one frame of 16-bit output.
   - st:       synthesis state, already primed by fwgan_cont().
   - pcm:      output, LPCNET_FRAME_SIZE samples.
   - features: feature vector for the frame being synthesized.
   Each float sample is scaled by 32768, clamped to [-32767, 32767] and
   rounded to the nearest integer. */
void fwgan_synthesize_int(FWGANState *st, opus_int16 *pcm, const float *features)
{
  int n;
  float fpcm[FWGAN_FRAME_SIZE];
  fwgan_synthesize(st, fpcm, features);
  /* NOTE(review): the loop bound is LPCNET_FRAME_SIZE while fpcm holds
     FWGAN_FRAME_SIZE samples; presumably the two are equal -- confirm. */
  for (n=0;n<LPCNET_FRAME_SIZE;n++) {
    float scaled = 32768.f*fpcm[n];
    float clamped = MIN32(32767, MAX32(-32767, scaled));
    pcm[n] = (int)floor(.5 + clamped);
  }
}
323