xref: /aosp_15_r20/external/libopus/dnn/torch/neural-pitch/run_crepe.py (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1"""
2Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
31. Read in chunks and compute clean pitch first
42. Then add in augmentation (Noise/Level/Response)
5    - Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
6    - When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
73. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
8
9Notes: To ensure consistency with the discovered CREPE offset, we do the following
10- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
11- We pad the input audio to our feature computation with 160 zeros to center them
12"""
13
# Command-line interface.  Arguments are parsed up front, before the
# heavyweight imports further down, so that --gpu-index can be written into
# CUDA_VISIBLE_DEVICES prior to any CUDA-aware library being loaded.
import argparse
parser = argparse.ArgumentParser()

parser.add_argument('data', type=str, help='input raw audio data (headerless 16 kHz, 16-bit PCM)')
# Fixed: this is a single output file written via ndarray.tofile(), not a directory.
parser.add_argument('output', type=str, help='output file for float32 [pitch_hz, confidence] pairs')
parser.add_argument('--gpu-index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
parser.add_argument('--chunk-size-frames', type=int, help='Number of frames to process at a time',default = 100000,required = False)

args = parser.parse_args()
23
import os
# Restrict visible GPUs BEFORE importing crepe below: CUDA-aware backends
# typically bind to the visible devices at import time, so this assignment is
# deliberately placed ahead of the remaining imports (order is load-bearing).
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)

import numpy as np
import tqdm
import crepe  # CREPE pitch estimator; must be imported after the env var above
30
# Raw 16 kHz int16 PCM, memory-mapped read-only so arbitrarily large inputs
# can be processed without loading them into RAM.
data = np.memmap(args.data, dtype=np.int16, mode='r')

# Trackable pitch period range in samples at 16 kHz (used later to zero the
# confidence of out-of-range CREPE estimates): 16000/256 = 62.5 Hz up to
# 16000/32 = 500 Hz.
# (Removed: unused `list_cents`, `list_confidences`, `f_ref` and the
# commented-out `list_features` left over from an earlier approach.)
min_period = 32
max_period = 256

# One feature frame spans 160 samples (10 ms at 16 kHz).
chunk_size_frames = args.chunk_size_frames
chunk_size = chunk_size_frames * 160

# The +79 folds the 80-sample front padding into the rounding, and the
# trailing +1 adds a tail chunk so the samples pushed past the last full
# chunk by that padding are still covered.
nb_chunks = (data.shape[0] + 79) // chunk_size + 1

# Accumulates one float32 (pitch_hz, confidence) row per 10 ms frame.
output_data = np.zeros((0, 2), dtype='float32')
46
# Process the audio chunk by chunk.  Every chunk starts 80 samples early
# (or, for the first chunk, is front-padded with 80 zeros) so that CREPE's
# zero-centered frames line up with this project's own feature frames, as
# described in the module docstring.
for i in tqdm.trange(nb_chunks):
    if i==0:
        # First chunk: prepend 80 zeros rather than reading before sample 0.
        chunk = np.concatenate([np.zeros(80),data[:chunk_size-80]])
    elif i==nb_chunks-1:
        # Final chunk: take whatever samples remain (may be shorter).
        chunk = data[i*chunk_size-80:]
    else:
        # Interior chunk: exactly chunk_size samples, shifted left by 80.
        chunk = data[i*chunk_size-80:(i+1)*chunk_size-80]
    # Scale int16 PCM into [-1, 1) floats for CREPE.
    chunk = chunk/np.array(32767.,dtype='float32')

    # Clean Pitch/Confidence Estimate
    # Padding input to CREPE by 80 samples to ensure it aligns
    _, pitch, confidence, _ = crepe.predict(chunk, 16000, center=True, viterbi=True,verbose=0)
    # Keep at most one estimate per 10 ms frame of this chunk; the final,
    # shorter chunk simply yields fewer frames.
    pitch = pitch[:chunk_size_frames]
    confidence = confidence[:chunk_size_frames]


    # Filter out of range pitches/confidences
    # Zero the confidence wherever the estimate falls outside the trackable
    # range [16000/max_period, 16000/min_period] Hz (62.5-500 Hz); the pitch
    # value itself is left untouched, only its confidence is invalidated.
    confidence[pitch < 16000/max_period] = 0
    confidence[pitch > 16000/min_period] = 0
    pitch = np.reshape(pitch, (-1, 1))
    confidence = np.reshape(confidence, (-1, 1))
    # Stack into one float32 row per frame: [pitch_hz, confidence].
    # (np.concatenate's dtype= argument requires NumPy >= 1.20.)
    out = np.concatenate([pitch, confidence], axis=-1, dtype='float32')
    output_data = np.concatenate([output_data, out], axis=0)


# Raw little-endian float32 dump, two values per frame.  NOTE(review): this
# writes a single file at args.output, not into a directory.
output_data.tofile(args.output)
73