xref: /aosp_15_r20/external/armnn/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp (revision 89c4ff92f2867872bb9e2354d150bf0c8c502810)
1 //
2 // Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 
6 #include "SpeechRecognitionPipeline.hpp"
7 #include "ArmnnNetworkExecutor.hpp"
8 
9 namespace asr
10 {
11 
ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,std::unique_ptr<Decoder> decoder,std::unique_ptr<Wav2LetterPreprocessor> preProcessor)12 ASRPipeline::ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
13                          std::unique_ptr<Decoder> decoder, std::unique_ptr<Wav2LetterPreprocessor> preProcessor) :
14         m_executor(std::move(executor)),
15         m_decoder(std::move(decoder)), m_preProcessor(std::move(preProcessor)) {}
16 
getInputSamplesSize()17 int ASRPipeline::getInputSamplesSize()
18 {
19     return this->m_preProcessor->m_windowLen +
20            ((this->m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) * this->m_preProcessor->m_windowStride);
21 }
22 
getSlidingWindowOffset()23 int ASRPipeline::getSlidingWindowOffset()
24 {
25     // Hardcoded for now until refactor
26     return ASRPipeline::SLIDING_WINDOW_OFFSET;
27 }
28 
PreProcessing(std::vector<float> & audio)29 std::vector<int8_t> ASRPipeline::PreProcessing(std::vector<float>& audio)
30 {
31     int audioDataToPreProcess = m_preProcessor->m_windowLen +
32                                 ((m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) *
33                                  m_preProcessor->m_windowStride);
34     int outputBufferSize = m_preProcessor->m_mfcc->m_params.m_numMfccVectors
35                            * m_preProcessor->m_mfcc->m_params.m_numMfccFeatures * 3;
36     std::vector<int8_t> outputBuffer(outputBufferSize);
37     m_preProcessor->Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(),
38                            m_executor->GetQuantizationScale());
39     return outputBuffer;
40 }
41 
CreatePipeline(common::PipelineOptions & config,std::map<int,std::string> & labels)42 IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels)
43 {
44     if (config.m_ModelName == "Wav2Letter")
45     {
46         // Wav2Letter ASR SETTINGS
47         int SAMP_FREQ = 16000;
48         int FRAME_LEN_MS = 32;
49         int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
50         int NUM_MFCC_FEATS = 13;
51         int MFCC_WINDOW_LEN = 512;
52         int MFCC_WINDOW_STRIDE = 160;
53         const int NUM_MFCC_VECTORS = 296;
54         int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS - 1) * MFCC_WINDOW_STRIDE);
55         int MEL_LO_FREQ = 0;
56         int MEL_HI_FREQ = 8000;
57         int NUM_FBANK_BIN = 128;
58         int INPUT_WINDOW_LEFT_CONTEXT = 98;
59         int INPUT_WINDOW_RIGHT_CONTEXT = 98;
60         int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
61                                          (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
62         int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
63 
64 
65         MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
66                               MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
67 
68         std::unique_ptr<Wav2LetterMFCC> mfccInst = std::make_unique<Wav2LetterMFCC>(mfccParams);
69 
70         auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(config.m_ModelFilePath,
71                                                                                config.m_backends);
72 
73         auto decoder = std::make_unique<asr::Decoder>(labels);
74 
75         auto preprocessor = std::make_unique<Wav2LetterPreprocessor>(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE,
76                                                                      std::move(mfccInst));
77 
78         auto ptr = std::make_unique<asr::ASRPipeline>(
79                 std::move(executor), std::move(decoder), std::move(preprocessor));
80 
81         ptr->SLIDING_WINDOW_OFFSET = SLIDING_WINDOW_OFFSET;
82 
83         return ptr;
84     }
85     else
86     {
87         throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " .");
88     }
89 }
90 
91 }// namespace asr