# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT

"""Utilities for speech recognition apps."""

import numpy as np


def decode(model_output: np.ndarray, labels: dict) -> str:
    """Decodes the integer-encoded results from inference into a string.

    Args:
        model_output: Results from running inference.
        labels: Dictionary of labels keyed on the classification index.

    Returns:
        Decoded string.
    """
    top1_results = [labels[np.argmax(row)] for row in model_output]
    return filter_characters(top1_results)
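
# A minimal, hypothetical example of what decode() produces (the labels and scores
# below are made up for illustration, not taken from a real model): the argmax of
# each row selects a label, then filter_characters() drops the "$" symbol and
# collapses repeated characters.
#
#   >>> decode(np.array([[0.1, 0.8, 0.1],
#   ...                  [0.1, 0.8, 0.1],
#   ...                  [0.1, 0.1, 0.8]]), {0: "a", 1: "b", 2: "$"})
#   'b'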


def filter_characters(results: list) -> str:
    """Filters unwanted and duplicate characters.

    Args:
        results: List of top 1 results from inference.

    Returns:
        Final output string to present to user.
    """
    text = ""
    for i, result in enumerate(results):
        if result == "$":
            # Drop the "$" symbol (the unwanted character)
            continue
        if i + 1 < len(results) and result == results[i + 1]:
            # Collapse runs of repeated characters, keeping only the last one
            continue
        text += result
    return text
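
# A short illustrative example (hypothetical input, not taken from a real model run):
# repeated characters are collapsed and the "$" symbol is removed.
#
#   >>> filter_characters(["h", "h", "$", "e", "e", "$", "l", "l", "$", "l", "o"])
#   'hello'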


def display_text(text: str):
    """Presents the results on the console.

    Args:
        text: Results of performing ASR on the input audio data.
    """
    print(text, sep="", end="", flush=True)


def decode_text(is_first_window, labels, output_result):
    """Slices the text appropriately depending on the window, and decodes it for wav2letter output.

        * On the first run, take the left context and the inner context.
        * On every other run, take only the inner context.

    Stores the current right context and updates it on each inference; it is used after the last inference.

    Args:
        is_first_window: Boolean indicating whether this is the first window we are running inference on.
        labels: The label set.
        output_result: The output from the inference.

    Returns:
        current_r_context: The current right context.
        text: The current text string, with the latest output decoded and appended.
    """
    # For wav2letter with 148 output steps:
    # Left context is index 0-48, inner context 49-99, right context 100-147
    inner_context_start = 49
    inner_context_end = 100  # exclusive slice bound, so indices 49-99 form the inner context
    right_context_start = 100

    if is_first_window:
        # Since it's the first inference, keep the left context and the inner context, and decode them
        text = decode(output_result[0][0][0][0:inner_context_end], labels)
    else:
        # Only decode the inner context
        text = decode(output_result[0][0][0][inner_context_start:inner_context_end], labels)

    # Store the right context, we will need it after the last inference
    current_r_context = decode(output_result[0][0][0][right_context_start:], labels)
    return current_r_context, text

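
# Minimal usage sketch of how these helpers fit together. The window iterable,
# the inference call and the label set below are assumptions for illustration
# only; the real driver is the example's main script.
#
#   text = ""
#   current_r_context = ""
#   for i, window in enumerate(audio_windows):             # hypothetical preprocessed input windows
#       output_result = run_inference(window)               # hypothetical inference call
#       current_r_context, decoded = decode_text(i == 0, labels, output_result)
#       text += decoded
#   text += current_r_context                               # right context of the final window
#   display_text(text)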