# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT

"""Utilities for speech recognition apps."""

import numpy as np


def decode(model_output: np.ndarray, labels: dict) -> str:
    """Decodes the integer encoded results from inference into a string.

    For every row of ``model_output`` the index of the largest value is looked
    up in ``labels``; the resulting sequence is then cleaned up by
    ``filter_characters``.

    Args:
        model_output: Results from running inference, iterated row by row.
        labels: Dictionary of labels keyed on the classification index.

    Returns:
        Decoded string.

    Raises:
        KeyError: If a winning index has no entry in ``labels``.
    """
    top1_results = [labels[np.argmax(row)] for row in model_output]
    return filter_characters(top1_results)


def filter_characters(results: list) -> str:
    """Filters unwanted and duplicate characters.

    Every "$" entry is dropped, and within a run of identical consecutive
    entries only the last one is kept. Because "$" entries are skipped rather
    than removed up front, two equal characters separated by "$" are both kept.

    Args:
        results: List of top 1 results from inference.

    Returns:
        Final output string to present to user.
    """
    kept = []
    last_index = len(results) - 1
    for index, char in enumerate(results):
        if char == "$":
            continue
        # Skip this entry when the next one repeats it; the final entry of the
        # run is the one that gets emitted.
        if index < last_index and char == results[index + 1]:
            continue
        kept.append(char)
    # Join once instead of growing a string with += inside the loop.
    return "".join(kept)


def display_text(text: str):
    """Presents the results on the console.

    The text is printed without a trailing newline and stdout is flushed
    immediately, so successive calls continue on the same console line.

    Args:
        text: Results of performing ASR on the input audio data.
    """
    print(text, sep="", end="", flush=True)


def decode_text(is_first_window: bool, labels: dict, output_result) -> tuple:
    """
    Slices the text appropriately depending on the window, and decodes for wav2letter output.
        * First run, take the left context, and inner context.
        * Every other run, take the inner context.
    Stores the current right context, and updates it for each inference. Will get used after last inference.

    Args:
        is_first_window: Boolean to show if it is the first window we are running inference on
        labels: the label set
        output_result: the output from the inference; the per-step rows are read
            from ``output_result[0][0][0]``
    Returns:
        current_r_context: the current right context
        text: the decoded text for this window
    """
    # For wav2letter with 148 output steps:
    # Left context is index 0-48, inner context 49-99, right context 100-147
    inner_context_start = 49
    inner_context_end = 99
    right_context_start = 100
    # NOTE(review): the slices below are half-open, so with these bounds row 99
    # is never decoded (neither as inner nor as right context) even though the
    # comment above lists it as inner context. Behaviour is kept as-is; confirm
    # against the window stride whether this is intended.

    # All slicing happens on the same nested element, so look it up once.
    steps = output_result[0][0][0]

    if is_first_window:
        # Since it's the first inference, keep the left context, and inner context, and decode
        text = decode(steps[:inner_context_end], labels)
    else:
        # Only decode the inner context
        text = decode(steps[inner_context_start:inner_context_end], labels)

    # Store the right context, we will need it after the last inference
    current_r_context = decode(steps[right_context_start:], labels)
    return current_r_context, text