1""" 2Evaluation script to compute the Raw Pitch Accuracy 3Procedure: 4 - Look at all voiced frames in file 5 - Compute number of pitches in those frames that lie within a 50 cent threshold 6 RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed accross all files) 7""" 8 9import os 10os.environ["CUDA_VISIBLE_DEVICES"] = "0" 11 12from prettytable import PrettyTable 13import numpy as np 14import glob 15import random 16import tqdm 17import torch 18import librosa 19import json 20from utils import stft, random_filter, feature_xform 21import subprocess 22import crepe 23 24from models import PitchDNN, PitchDNNIF, PitchDNNXcorr 25 26device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 28def rca(reference,input,voicing,thresh = 25): 29 idx_voiced = np.where(voicing != 0)[0] 30 acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0] 31 return acc.shape[0] 32 33def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)): 34 l = [] 35 for i in ind_arr: 36 l.append(rca(reference,np.roll(input,i),voicing,thresh)) 37 l = np.array(l) 38 39 return np.max(l) 40 41def rpa(model,device = 'cpu',data_format = 'if'): 42 list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw') 43 dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/' 44 # random_shuffle = list(np.random.permutation(len(list_files))) 45 random.shuffle(list_files) 46 list_files = list_files[:1000] 47 48 C_all = 0 49 C_all_m = 0 50 C_all_f = 0 51 list_rca_model_all = [] 52 list_rca_male_all = [] 53 list_rca_female_all = [] 54 55 thresh = 50 56 N = 320 57 H = 160 58 freq_keep = 30 59 60 for idx in tqdm.trange(len(list_files)): 61 audio_file = list_files[idx] 62 file_name = os.path.basename(list_files[idx])[:-4] 63 64 audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1) 65 offset = 432 66 audio = audio[offset:] 67 rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160)) 68 69 spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T 70 phase_diff = spec*np.conj(np.roll(spec,1,axis = -1)) 71 phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8) 72 idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)]) 73 feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T 74 feature_if = feature[:,idx_save] 75 76 data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+') 77 data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16) 78 79 subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32']) 80 feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1) 81 ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1) 82 feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1) 83 # feature_xcorr = feature_xform(feature_xcorr) 84 85 os.remove('./temp.raw') 86 os.remove('./temp_xcorr.f32') 87 88 if data_format == 'if': 89 feature = feature_if 90 elif data_format == 'xcorr': 91 feature = feature_xcorr 92 else: 93 indmin = min(feature_if.shape[0],feature_xcorr.shape[0]) 94 feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1) 95 96 97 pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0" 98 pitch = 
def rpa(model, device='cpu', data_format='if'):
    list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
    dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
    random.shuffle(list_files)
    list_files = list_files[:1000]

    C_all = 0
    C_all_m = 0
    C_all_f = 0
    list_rca_model_all = []
    list_rca_male_all = []
    list_rca_female_all = []

    thresh = 50
    N = 320
    H = 160
    freq_keep = 30

    for idx in tqdm.trange(len(list_files)):
        audio_file = list_files[idx]
        file_name = os.path.basename(list_files[idx])[:-4]

        audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
        offset = 432
        audio = audio[offset:]
        rmse = np.squeeze(librosa.feature.rms(y=audio, frame_length=320, hop_length=160))

        # Instantaneous-frequency features: log magnitude plus the normalized
        # frame-to-frame phase difference, keeping only the lowest freq_keep bins.
        spec = stft(x=np.concatenate([np.zeros(160), audio]), w='boxcar', N=N, H=H).T
        phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
        idx_save = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
        feature_if = feature[:, idx_save]

        # Cross-correlation features from the external LPCNet extractor.
        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
        data_temp[:audio.shape[0]] = (audio/np.max(np.abs(audio))*(2**15 - 1)).astype(np.int16)

        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)
        # feature_xcorr = feature_xform(feature_xcorr)

        os.remove('./temp.raw')
        os.remove('./temp_xcorr.f32')

        if data_format == 'if':
            feature = feature_if
        elif data_format == 'xcorr':
            feature = feature_xcorr
        else:
            indmin = min(feature_if.shape[0], feature_xcorr.shape[0])
            feature = np.concatenate([feature_xcorr[:indmin, :], feature_if[:indmin, :]], -1)

        # Reference .f0 files: column 0 is per-frame pitch in Hz, column 1 the voicing flag.
        pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
        pitch = np.loadtxt(pitch_file_name)[:, 0]
        voicing = np.loadtxt(pitch_file_name)[:, 1]
        indmin = min(voicing.shape[0], rmse.shape[0], pitch.shape[0])
        pitch = pitch[:indmin]
        voicing = voicing[:indmin]
        rmse = rmse[:indmin]
        # Treat low-energy frames as unvoiced.
        voicing = voicing*(rmse > 0.05*np.max(rmse))
        if "mic_F" in audio_file:
            # For female speakers, discard implausibly low reference pitches.
            idx_correct = np.where(pitch < 125)
            voicing[idx_correct] = 0

        # Reference pitch in cents relative to 62.5 Hz (16000/256).
        cent = np.rint(1200*np.log2(np.divide(pitch, 16000/256, out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)).astype('int')

        # The model outputs a distribution over 20-cent bins; take the argmax.
        model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature, 0))).float().to(device))
        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()

        num_frames = min(cent.shape[0], model_cents.shape[0])
        pitch = pitch[:num_frames]
        cent = cent[:num_frames]
        voicing = voicing[:num_frames]
        model_cents = model_cents[:num_frames]

        voicing_all = np.copy(voicing)
        # Force frames with pitch below 65 Hz or above 500 Hz to unvoiced so the
        # comparison stays within the model's pitch range.
        force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 500))
        voicing_all[force_out_of_pitch] = 0
        C_all = C_all + np.where(voicing_all != 0)[0].shape[0]

        list_rca_model_all.append(rca(cent, model_cents, voicing_all, thresh))

        if "mic_M" in audio_file:
            list_rca_male_all.append(rca(cent, model_cents, voicing_all, thresh))
            C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0]
        else:
            list_rca_female_all.append(rca(cent, model_cents, voicing_all, thresh))
            C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0]

    list_rca_model_all = np.array(list_rca_model_all)
    list_rca_male_all = np.array(list_rca_male_all)
    list_rca_female_all = np.array(list_rca_female_all)

    x = PrettyTable()
    x.field_names = ["Experiment", "Mean RPA"]
    x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])
    x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])
    x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])

    print(x)

    return None

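# A minimal sketch (not used by the script) of inverting the cent scale above,
# assuming the same 62.5 Hz (16000/256) reference used in rpa()/cycle_eval():
def cents_to_hz(cents, fref=16000/256):
    # Inverse of cents = 1200 * log2(f0 / fref)
    return fref * 2.0 ** (np.asarray(cents) / 1200.0)
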
def cycle_eval(checkpoint_list, noise_type='synthetic', noise_dataset=None, list_snr=[-20, -15, -10, -5, 0, 5, 10, 15, 20], ptdb_dataset_path=None, fraction=0.1, thresh=50):
    """
    Cycle through the SNR evaluation for a list of checkpoints
    """
    list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
    dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
    random.shuffle(list_files)
    list_files = list_files[:int(fraction*len(list_files))]

    dict_models = {}
    # Copy before appending so the mutable default argument is not modified
    # across calls; np.inf stands for the clean (no added noise) condition.
    list_snr = list(list_snr) + [np.inf]

    for f in checkpoint_list:
        fname = f
        if (f != 'crepe') and (f != 'lpcnet'):
            checkpoint = torch.load(f, map_location='cpu')
            dict_params = checkpoint['config']
            if dict_params['data_format'] == 'if':
                pitch_nn = PitchDNNIF(dict_params['freq_keep']*3, dict_params['gru_dim'], dict_params['output_dim'])
            elif dict_params['data_format'] == 'xcorr':
                pitch_nn = PitchDNNXcorr(dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim'])
            else:
                pitch_nn = PitchDNN(dict_params['freq_keep']*3, dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim'])

            pitch_nn.load_state_dict(checkpoint['state_dict'])
            # Move the model to the same device the inputs are sent to below.
            pitch_nn = pitch_nn.to(device).eval()

            N = dict_params['window_size']
            H = dict_params['hop_factor']
            freq_keep = dict_params['freq_keep']

            list_mean = []
            for snr_dB in list_snr:
                C_all = 0
                C_correct = 0
                for idx in tqdm.trange(len(list_files)):
                    audio_file = list_files[idx]
                    file_name = os.path.basename(list_files[idx])[:-4]

                    audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
                    offset = 432
                    audio = audio[offset:]
                    rmse = np.squeeze(librosa.feature.rms(y=audio, frame_length=N, hop_length=H))

                    if noise_type != 'synthetic':
                        list_noisefiles = noise_dataset + '*.wav'
                        noise_file = random.choice(glob.glob(list_noisefiles))
                        n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
                        # Draw the start so the segment stays within the first
                        # 5 minutes; the last minute of noise is kept for testing.
                        rand_range = np.random.randint(low=0, high=(16000*60*5 - audio.shape[0]))
                        n = n[rand_range:rand_range + audio.shape[0]]
                    else:
                        n = np.random.randn(audio.shape[0])
                        n = random_filter(n)

                    # Scale the noise so that 10*log10(E_signal/E_noise) equals snr_dB.
                    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
                    audio = audio + snr_multiplier*n

                    spec = stft(x=np.concatenate([np.zeros(160), audio]), w='boxcar', N=N, H=H).T
                    phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
                    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
                    idx_save = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
                    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
                    feature_if = feature[:, idx_save]

                    data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
                    data_temp[:audio.shape[0]] = (audio*(2**15 - 1)).astype(np.int16)

                    subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
                    feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
                    ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
                    feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)

                    os.remove('./temp.raw')
                    os.remove('./temp_xcorr.f32')

                    if dict_params['data_format'] == 'if':
                        feature = feature_if
                    elif dict_params['data_format'] == 'xcorr':
                        feature = feature_xcorr
                    else:
                        indmin = min(feature_if.shape[0], feature_xcorr.shape[0])
                        feature = np.concatenate([feature_xcorr[:indmin, :], feature_if[:indmin, :]], -1)

                    pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
                    pitch = np.loadtxt(pitch_file_name)[:, 0]
                    voicing = np.loadtxt(pitch_file_name)[:, 1]
                    indmin = min(voicing.shape[0], rmse.shape[0], pitch.shape[0])
                    pitch = pitch[:indmin]
                    voicing = voicing[:indmin]
                    rmse = rmse[:indmin]
                    voicing = voicing*(rmse > 0.05*np.max(rmse))
                    if "mic_F" in audio_file:
                        idx_correct = np.where(pitch < 125)
                        voicing[idx_correct] = 0

                    cent = np.rint(1200*np.log2(np.divide(pitch, 16000/256, out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)).astype('int')

                    model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature, 0))).float().to(device))
                    model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()

                    num_frames = min(cent.shape[0], model_cents.shape[0])
                    pitch = pitch[:num_frames]
                    cent = cent[:num_frames]
                    voicing = voicing[:num_frames]
                    model_cents = model_cents[:num_frames]

                    voicing_all = np.copy(voicing)
                    # Force frames with pitch below 65 Hz or above 500 Hz to unvoiced
                    # so the comparison stays within the model's pitch range.
                    force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 500))
                    voicing_all[force_out_of_pitch] = 0
                    C_all = C_all + np.where(voicing_all != 0)[0].shape[0]

                    C_correct = C_correct + rca(cent, model_cents, voicing_all, thresh)
                list_mean.append(C_correct/C_all)
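        # The 'crepe' and 'lpcnet' entries are external baselines: CREPE via the
        # pip package's crepe.predict(), and the LPCNet pitch via the period
        # output of the same xcorr extractor binary used above.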
        else:
            list_mean = []
            for snr_dB in list_snr:
                C_all = 0
                C_correct = 0
                for idx in tqdm.trange(len(list_files)):
                    audio_file = list_files[idx]
                    file_name = os.path.basename(list_files[idx])[:-4]

                    audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
                    offset = 432
                    audio = audio[offset:]
                    rmse = np.squeeze(librosa.feature.rms(y=audio, frame_length=320, hop_length=160))

                    if noise_type != 'synthetic':
                        list_noisefiles = noise_dataset + '*.wav'
                        noise_file = random.choice(glob.glob(list_noisefiles))
                        n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
                        # Draw the start so the segment stays within the first
                        # 5 minutes; the last minute of noise is kept for testing.
                        rand_range = np.random.randint(low=0, high=(16000*60*5 - audio.shape[0]))
                        n = n[rand_range:rand_range + audio.shape[0]]
                    else:
                        n = np.random.randn(audio.shape[0])
                        n = random_filter(n)

                    # Scale the noise so that 10*log10(E_signal/E_noise) equals snr_dB.
                    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
                    audio = audio + snr_multiplier*n

                    if f == 'crepe':
                        _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80), audio]), 16000, viterbi=True, center=True, verbose=0)
                        model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
                    else:
                        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
                        data_temp[:audio.shape[0]] = (audio*(2**15 - 1)).astype(np.int16)

                        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
                        # The extractor's third output is the per-frame pitch period in
                        # samples; the epsilon guards against zero periods in unvoiced frames.
                        feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32')
                        model_cents = 1200*np.log2(256/(feature_xcorr + 1.0e-8) + 1.0e-8)

                        os.remove('./temp.raw')
                        os.remove('./temp_xcorr.f32')
                        os.remove('./temp_period.f32')

                    pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
                    pitch = np.loadtxt(pitch_file_name)[:, 0]
                    voicing = np.loadtxt(pitch_file_name)[:, 1]
                    indmin = min(voicing.shape[0], rmse.shape[0], pitch.shape[0])
                    pitch = pitch[:indmin]
                    voicing = voicing[:indmin]
                    rmse = rmse[:indmin]
                    voicing = voicing*(rmse > 0.05*np.max(rmse))
                    if "mic_F" in audio_file:
                        idx_correct = np.where(pitch < 125)
                        voicing[idx_correct] = 0

                    cent = np.rint(1200*np.log2(np.divide(pitch, 16000/256, out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)).astype('int')
                    num_frames = min(cent.shape[0], model_cents.shape[0])
                    pitch = pitch[:num_frames]
                    cent = cent[:num_frames]
                    voicing = voicing[:num_frames]
                    model_cents = model_cents[:num_frames]

                    voicing_all = np.copy(voicing)
                    # Force frames with pitch below 65 Hz or above 500 Hz to unvoiced
                    # so the comparison stays within the model's pitch range.
                    force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 500))
                    voicing_all[force_out_of_pitch] = 0
                    C_all = C_all + np.where(voicing_all != 0)[0].shape[0]

                    C_correct = C_correct + rca(cent, model_cents, voicing_all, thresh)
                list_mean.append(C_correct/C_all)
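        # Results layout (per checkpoint/baseline name): 'list_SNR' holds the
        # mean RPA for each finite SNR in list_snr order; 'inf' is the clean,
        # noise-free condition appended above.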
        dict_models[fname] = {}
        dict_models[fname]['list_SNR'] = list_mean[:-1]
        dict_models[fname]['inf'] = list_mean[-1]

    return dict_models
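
if __name__ == "__main__":
    # Minimal usage sketch. The checkpoint and dataset paths below are
    # placeholders, not files shipped with this script.
    results = cycle_eval(
        ['checkpoints/pitch_dnn.pth', 'crepe', 'lpcnet'],
        noise_type='synthetic',
        ptdb_dataset_path='/path/to/ptdb/',
        fraction=0.1,
        thresh=50,
    )
    print(json.dumps(results, indent=2))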