# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT

"""Automatic speech recognition with PyArmNN demo for processing audio clips to text."""

import sys
import os
import numpy as np

script_dir = os.path.dirname(__file__)
sys.path.insert(1, os.path.join(script_dir, '..', 'common'))

from argparse import ArgumentParser
from network_executor import ArmnnNetworkExecutor
from utils import prepare_input_data
from audio_capture import AudioCaptureParams, capture_audio
from audio_utils import decode_text, display_text
from wav2letter_mfcc import Wav2LetterMFCC, W2LAudioPreprocessor
from mfcc import MFCCParams

# Model Specific Labels
labels = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm',
          13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y',
          25: 'z', 26: "'", 27: ' ', 28: '$'}


def parse_args():
    parser = ArgumentParser(description="ASR with PyArmNN")
    parser.add_argument(
        "--audio_file_path",
        required=True,
        type=str,
        help="Path to the audio file to perform ASR",
    )
    parser.add_argument(
        "--model_file_path",
        required=True,
        type=str,
        help="Path to ASR model to use",
    )
    parser.add_argument(
        "--preferred_backends",
        type=str,
        nargs="+",
        default=["CpuAcc", "CpuRef"],
        help="""List of backends in order of preference for optimizing
        subgraphs, falling back to the next backend in the list on unsupported
        layers. Defaults to [CpuAcc, CpuRef]""",
    )
    return parser.parse_args()


def main(args):
    # Read command line args
    audio_file = args.audio_file_path

    # Create the ArmNN inference runner
    network = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)

    # Specify model specific audio data requirements
    audio_capture_params = AudioCaptureParams(dtype=np.float32, overlap=31712, min_samples=47712, sampling_freq=16000,
                                              mono=True)

    buffer = capture_audio(audio_file, audio_capture_params)

    # Extract features and create the preprocessor
    mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=128, mel_lo_freq=0, mel_hi_freq=8000,
                             num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_fft=512)

    wmfcc = Wav2LetterMFCC(mfcc_params)
    preprocessor = W2LAudioPreprocessor(wmfcc, model_input_size=296, stride=160)
    current_r_context = ""
    is_first_window = True

    print("Processing Audio Frames...")
    for audio_data in buffer:
        # Prepare the input Tensors
        input_data = prepare_input_data(audio_data, network.get_data_type(), network.get_input_quantization_scale(0),
                                        network.get_input_quantization_offset(0), preprocessor)

        # Run inference
        output_result = network.run([input_data])

        # Slice and Decode the text, and store the right context
        current_r_context, text = decode_text(is_first_window, labels, output_result)

        is_first_window = False

        display_text(text)

    print(current_r_context, flush=True)


if __name__ == "__main__":
    args = parse_args()
    main(args)
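
# Example invocation (an illustrative sketch; the script name and the audio and
# model paths below are hypothetical placeholders, not part of the demo itself):
#
#   python run_audio_file.py \
#       --audio_file_path samples/speech_sample.wav \
#       --model_file_path models/wav2letter.tflite \
#       --preferred_backends CpuAcc CpuRef
#
# capture_audio() yields overlapping windows of the clip (min_samples=47712
# samples with overlap=31712 at 16 kHz), so consecutive inference windows share
# context; decode_text() slices that shared right context out of each decoded
# string, and the final print emits the leftover right context of the last window.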