xref: /aosp_15_r20/external/libopus/dnn/training_tf2/encode_rdovae.py (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li#!/usr/bin/python3
2*a58d3d2aSXin Li'''Copyright (c) 2021-2022 Amazon
3*a58d3d2aSXin Li   Copyright (c) 2018-2019 Mozilla
4*a58d3d2aSXin Li
5*a58d3d2aSXin Li   Redistribution and use in source and binary forms, with or without
6*a58d3d2aSXin Li   modification, are permitted provided that the following conditions
7*a58d3d2aSXin Li   are met:
8*a58d3d2aSXin Li
9*a58d3d2aSXin Li   - Redistributions of source code must retain the above copyright
10*a58d3d2aSXin Li   notice, this list of conditions and the following disclaimer.
11*a58d3d2aSXin Li
12*a58d3d2aSXin Li   - Redistributions in binary form must reproduce the above copyright
13*a58d3d2aSXin Li   notice, this list of conditions and the following disclaimer in the
14*a58d3d2aSXin Li   documentation and/or other materials provided with the distribution.
15*a58d3d2aSXin Li
16*a58d3d2aSXin Li   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17*a58d3d2aSXin Li   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18*a58d3d2aSXin Li   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19*a58d3d2aSXin Li   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
20*a58d3d2aSXin Li   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21*a58d3d2aSXin Li   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22*a58d3d2aSXin Li   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23*a58d3d2aSXin Li   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24*a58d3d2aSXin Li   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25*a58d3d2aSXin Li   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*a58d3d2aSXin Li   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*a58d3d2aSXin Li'''
28*a58d3d2aSXin Li
29*a58d3d2aSXin Li# Train an LPCNet model
30*a58d3d2aSXin Li
import argparse
#from plc_loader import PLCLoader

# Command-line interface. Despite the description, this script runs the
# encoder of a trained RDO-VAE model over a feature file and dumps the
# intermediate tensors to .f32 files next to <output>.
parser = argparse.ArgumentParser(description='Train a PLC model')

parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
parser.add_argument('--model', metavar='<model>', default='rdovae', help='PLC model python definition (without .py)')
group1 = parser.add_mutually_exclusive_group()
group1.add_argument('--weights', metavar='<input weights>', help='model weights')
# Use %(default)d so the help text always matches the real default value.
# (The old text for --batch-size claimed "default 128" while the actual
# default was 1.)
parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default %(default)d)')
parser.add_argument('--batch-size', metavar='<batch size>', default=1, type=int, help='batch size to use (default %(default)d)')
parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default %(default)d)')
44*a58d3d2aSXin Li
45*a58d3d2aSXin Li
# Parse sys.argv; exits with a usage error if the two positionals are missing.
args = parser.parse_args()
47*a58d3d2aSXin Li
import importlib
# Load the model-definition module named by --model (default "rdovae").
rdovae = importlib.import_module(args.model)

# NOTE(review): the module name is hard-coded here even though the model
# module was chosen dynamically above — with a different --model,
# apply_dead_zone still comes from rdovae.py. Confirm this is intended.
from rdovae import apply_dead_zone
52*a58d3d2aSXin Li
53*a58d3d2aSXin Liimport sys
54*a58d3d2aSXin Liimport numpy as np
55*a58d3d2aSXin Lifrom tensorflow.keras.optimizers import Adam
56*a58d3d2aSXin Lifrom tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
57*a58d3d2aSXin Liimport tensorflow.keras.backend as K
58*a58d3d2aSXin Liimport h5py
59*a58d3d2aSXin Li
60*a58d3d2aSXin Liimport tensorflow as tf
61*a58d3d2aSXin Lifrom rdovae import pvq_quantize
62*a58d3d2aSXin Li
# Try reducing batch_size if you run out of memory on your GPU
batch_size = args.batch_size

# Build the RDO-VAE: 20 input features and 80 latent symbols per step.
# qembedding maps a quantizer id to per-symbol quantization parameters
# (consumed further below as scale / dead-zone halves).
model, encoder, decoder, qembedding = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size)
# NOTE(review): --weights is optional on the CLI but is passed here
# unconditionally; running without it will fail in load_weights.
model.load_weights(args.weights)

lpc_order = 16

feature_file = args.features
# Each frame in the file carries the used features plus lpc_order extra
# coefficients that are stripped off after reshaping.
nb_features = model.nb_used_features + lpc_order
nb_used_features = model.nb_used_features
sequence_size = args.seq_length
75*a58d3d2aSXin Li
# Memory-map the float32 feature file and cut it into whole batches of
# fixed-length sequences; anything beyond the last full batch is dropped.
features = np.memmap(feature_file, dtype='float32', mode='r')
floats_per_sequence = nb_features * sequence_size
nb_sequences = (len(features) // floats_per_sequence // batch_size) * batch_size
features = features[:nb_sequences * floats_per_sequence]

features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
print(features.shape)
# Keep only the used features; the trailing LPC coefficients are discarded.
features = features[:, :, :nb_used_features]
87*a58d3d2aSXin Li
# Run the encoder over all sequences: 'bits' are the unquantized latent
# symbols, 'gru_state_dec' is the decoder-state signal produced per step.
bits, gru_state_dec = encoder.predict([features], batch_size=batch_size)
(gru_state_dec).astype('float32').tofile(args.output + "-state.f32")


#dist = rdovae.feat_dist_loss(features, quant_out)
#rate = rdovae.sq1_rate_loss(features, model_bits)
#rate2 = rdovae.sq_rate_metric(features, model_bits)
#print(dist, rate, rate2)

print("shapes are:")
print(bits.shape)
print(gru_state_dec.shape)

# Dump the (truncated) input features and the raw symbols for offline use.
features.astype('float32').tofile(args.output + "-input.f32")
#quant_out.astype('float32').tofile(args.output + "-enc_dec.f32")
# Number of latent symbols per step; must match nb_bits used to build the
# model above (both are 80).
nbits=80
bits.astype('float32').tofile(args.output + "-syms.f32")
106*a58d3d2aSXin Li
# Fixed rate-distortion tradeoff lambda for every (half-rate) step.
# quant_id = round(3.8*log(lambda/0.0002)) maps lambda to a quantizer
# index; with lambda == 0.0002 everywhere this is index 0.
lambda_val = 0.0002 * np.ones((nb_sequences, sequence_size//2, 1))
quant_id = np.round(3.8*np.log(lambda_val/.0002)).astype('int16')
quant_id = quant_id[:,:,0]
# Look up per-symbol quantization parameters for that index: the first
# nbits channels are scales, the next nbits are dead-zone widths
# (softplus keeps both strictly positive).
quant_embed = qembedding(quant_id)
quant_scale = tf.math.softplus(quant_embed[:,:,:nbits])
dead_zone = tf.math.softplus(quant_embed[:, :, nbits : 2 * nbits])

# Quantize the symbols: scale up, shrink via the dead zone, round to
# integers, then undo the scaling so the decoder sees dequantized values.
bits = bits*quant_scale
bits = np.round(apply_dead_zone([bits, dead_zone]).numpy())
bits = bits/quant_scale

# Quantize the decoder initial state with PVQ (82 is presumably the
# number of pulses — confirm against rdovae.pvq_quantize), then keep
# only the state at the last step of each sequence.
gru_state_dec = pvq_quantize(gru_state_dec, 82)
#gru_state_dec = gru_state_dec/(1e-15+tf.norm(gru_state_dec, axis=-1,keepdims=True))
gru_state_dec = gru_state_dec[:,-1,:]
# Decode from every other step's symbols plus the final state.
dec_out = decoder([bits[:,1::2,:], gru_state_dec])

print(dec_out.shape)

dec_out.numpy().astype('float32').tofile(args.output + "-quant_out.f32")
126