"""
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
1. Read in chunks and compute clean pitch first
2. Then add in augmentation (Noise/Level/Response)
 - Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
 - When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input

Notes: To ensure consistency with the discovered CREPE offset, we do the following
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
- We pad the input audio to our feature computation with 160 zeros to center them
"""

import argparse
parser = argparse.ArgumentParser()

parser.add_argument('data', type=str, help='input raw audio data')
parser.add_argument('output', type=str, help='output directory')
parser.add_argument('--gpu-index', type=int, help='GPU index to use if multiple GPUs', default=0, required=False)
parser.add_argument('--chunk-size-frames', type=int, help='Number of frames to process at a time', default=100000, required=False)

args = parser.parse_args()

# CUDA_VISIBLE_DEVICES must be exported before importing crepe (which pulls in
# its deep-learning backend) for the GPU selection to take effect.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)

import numpy as np
import tqdm
import crepe

# Raw 16-bit PCM samples, memory-mapped read-only so arbitrarily large
# input files can be processed without loading them into RAM.
data = np.memmap(args.data, dtype=np.int16, mode='r')

# Usable pitch-period range in samples at 16 kHz:
# min_period = 32 -> 500 Hz upper bound, max_period = 256 -> 62.5 Hz lower bound.
min_period = 32
max_period = 256
chunk_size_frames = args.chunk_size_frames
chunk_size = chunk_size_frames * 160  # 160 samples per 10 ms frame at 16 kHz

# Number of chunks = ceil((N + 80) / chunk_size): the 80-sample zero pad
# prepended to the first chunk counts toward the total length.
# ((N + 79) // chunk_size) + 1 == (N + 79 + chunk_size) // chunk_size, which is
# exactly that ceiling, so the final chunk is never empty.
nb_chunks = (data.shape[0] + 79) // chunk_size + 1

# Collect per-chunk results and concatenate once at the end; concatenating
# into a growing array inside the loop would copy quadratically.
chunk_outputs = []

for i in tqdm.trange(nb_chunks):
    # Every chunk starts 80 samples early so CREPE's zero-centered analysis
    # windows line up with our frame centers; the first chunk fabricates that
    # offset with 80 leading zeros, and the last chunk simply runs to the end.
    if i == 0:
        chunk = np.concatenate([np.zeros(80), data[:chunk_size - 80]])
    elif i == nb_chunks - 1:
        chunk = data[i * chunk_size - 80:]
    else:
        chunk = data[i * chunk_size - 80:(i + 1) * chunk_size - 80]
    # int16 -> float in [-1, 1]
    chunk = chunk / np.array(32767., dtype='float32')

    # Clean pitch/confidence estimate (Viterbi-smoothed CREPE), trimmed to at
    # most chunk_size_frames frames so chunks tile without overlap.
    _, pitch, confidence, _ = crepe.predict(chunk, 16000, center=True, viterbi=True, verbose=0)
    pitch = pitch[:chunk_size_frames]
    confidence = confidence[:chunk_size_frames]

    # Zero out the confidence wherever the estimated pitch falls outside the
    # supported range [62.5 Hz, 500 Hz] (periods of max_period..min_period samples).
    confidence[pitch < 16000 / max_period] = 0
    confidence[pitch > 16000 / min_period] = 0

    # One (pitch_hz, confidence) row per frame.
    chunk_outputs.append(np.stack([pitch, confidence], axis=-1).astype('float32'))

output_data = np.concatenate(chunk_outputs, axis=0)

# Flat float32 binary file: interleaved (pitch_hz, confidence) pairs,
# one pair per 10 ms frame.
output_data.tofile(args.output)