xref: /aosp_15_r20/external/armnn/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp (revision 89c4ff92f2867872bb9e2354d150bf0c8c502810)
1 //
2 // Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 #include "MathUtils.hpp"
6 #include <cstring>
7 #include <cmath>
8 #include <numeric>
9 #include <algorithm>
10 #include <memory>
11 #include "Wav2LetterPreprocessor.hpp"
12 #include "Wav2LetterMFCC.hpp"
13 
GetMean(Array2d<float> & vec)14 float Wav2LetterPreprocessor::GetMean(Array2d<float>& vec)
15 {
16     return MathUtils::MeanF32(vec.begin(), vec.totalSize());
17 }
18 
GetStdDev(Array2d<float> & vec,const float mean)19 float Wav2LetterPreprocessor::GetStdDev(Array2d<float>& vec, const float mean)
20 {
21     return MathUtils::StdDevF32(vec.begin(), vec.totalSize(), mean);
22 }
23 
NormaliseVec(Array2d<float> & vec)24 void Wav2LetterPreprocessor::NormaliseVec(Array2d<float>& vec)
25 {
26     auto mean = Wav2LetterPreprocessor::GetMean(vec);
27     auto stddev = Wav2LetterPreprocessor::GetStdDev(vec, mean);
28 
29     if (stddev == 0)
30     {
31         std::fill(vec.begin(), vec.end(), 0);
32     }
33     else
34     {
35         const float stddevInv = 1.f/stddev;
36         const float normalisedMean = mean/stddev;
37 
38         auto NormalisingFunction = [=](float &value) {
39             value = value * stddevInv - normalisedMean;
40         };
41         std::for_each(vec.begin(), vec.end(), NormalisingFunction);
42     }
43 }
44 
Normalise()45 void Wav2LetterPreprocessor::Normalise()
46 {
47     Wav2LetterPreprocessor::NormaliseVec(this->m_mfccBuf);
48     Wav2LetterPreprocessor::NormaliseVec(this->m_delta1Buf);
49     Wav2LetterPreprocessor::NormaliseVec(this->m_delta2Buf);
50 }
51 
GetQuantElem(const float elem,const float quantScale,const int quantOffset,const float minVal,const float maxVal)52 float Wav2LetterPreprocessor::GetQuantElem(
53         const float     elem,
54         const float     quantScale,
55         const int       quantOffset,
56         const float     minVal,
57         const float     maxVal)
58 {
59     float val = std::round((elem/quantScale) + quantOffset);
60     float returnVal = std::min<float>(std::max<float>(val, minVal), maxVal);
61     return returnVal;
62 }
63 
Invoke(const float * audioData,const uint32_t audioDataLen,std::vector<int8_t> & output,int quantOffset,float quantScale)64 bool Wav2LetterPreprocessor::Invoke(const float*  audioData, const uint32_t  audioDataLen, std::vector<int8_t>& output,
65                                      int quantOffset, float quantScale)
66 {
67     this->m_window = SlidingWindow<const float>(
68             audioData, audioDataLen,
69             this->m_windowLen, this->m_windowStride);
70 
71     uint32_t mfccBufIdx = 0;
72 
73     // Init buffers with 0
74     std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f);
75     std::fill(m_delta1Buf.begin(), m_delta1Buf.end(), 0.f);
76     std::fill(m_delta2Buf.begin(), m_delta2Buf.end(), 0.f);
77 
78     // While we can slide over the window
79     while (this->m_window.HasNext())
80     {
81         const float* mfccWindow = this->m_window.Next();
82         auto mfccAudioData = std::vector<float>(
83                 mfccWindow,
84                 mfccWindow + this->m_windowLen);
85 
86         auto mfcc = this->m_mfcc->MfccCompute(mfccAudioData);
87         for (size_t i = 0; i < this->m_mfccBuf.size(0); ++i)
88         {
89             this->m_mfccBuf(i, mfccBufIdx) = mfcc[i];
90         }
91         ++mfccBufIdx;
92     }
93 
94     // Pad MFCC if needed by repeating last feature vector
95     while (mfccBufIdx != this->m_mfcc->m_params.m_numMfccVectors)
96     {
97         memcpy(&this->m_mfccBuf(0, mfccBufIdx),
98                &this->m_mfccBuf(0, mfccBufIdx - 1), sizeof(float) * this->m_mfcc->m_params.m_numMfccFeatures);
99         ++mfccBufIdx;
100     }
101 
102     // Compute first and second order deltas from MFCCs
103     Wav2LetterPreprocessor::ComputeDeltas(this->m_mfccBuf,
104                         this->m_delta1Buf,
105                         this->m_delta2Buf);
106 
107     // Normalise
108     this->Normalise();
109 
110     return this->Quantise<int8_t>(output.data(), quantOffset, quantScale);
111 }
112 
ComputeDeltas(Array2d<float> & mfcc,Array2d<float> & delta1,Array2d<float> & delta2)113 bool Wav2LetterPreprocessor::ComputeDeltas(Array2d<float>& mfcc,
114                                            Array2d<float>& delta1,
115                                            Array2d<float>& delta2)
116 {
117     const std::vector <float> delta1Coeffs =
118             {6.66666667e-02,  5.00000000e-02,  3.33333333e-02,
119              1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
120              -3.33333333e-02, -5.00000000e-02, -6.66666667e-02};
121 
122     const std::vector <float> delta2Coeffs =
123             {0.06060606,      0.01515152,     -0.01731602,
124              -0.03679654,     -0.04329004,     -0.03679654,
125              -0.01731602,      0.01515152,      0.06060606};
126 
127     if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) ||
128         mfcc.size(0) == 0 || mfcc.size(1) == 0)
129     {
130         return false;
131     }
132 
133     // Get the middle index; coeff vec len should always be odd
134     const size_t coeffLen = delta1Coeffs.size();
135     const size_t fMidIdx = (coeffLen - 1)/2;
136     const size_t numFeatures = mfcc.size(0);
137     const size_t numFeatVectors = mfcc.size(1);
138 
139     // iterate through features in MFCC vector
140     for (size_t i = 0; i < numFeatures; ++i)
141     {
142         /* for each feature, iterate through time (t) samples representing feature evolution and
143         * calculate d/dt and d^2/dt^2, using 1d convolution with differential kernels.
144         * Convolution padding = valid, result size is `time length - kernel length + 1`.
145         * The result is padded with 0 from both sides to match the size of initial time samples data.
146         *
147         * For the small filter, conv1d implementation as a simple loop is efficient enough.
148         * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32.
149         */
150 
151         for (size_t j = fMidIdx; j < numFeatVectors - fMidIdx; ++j)
152         {
153             float d1 = 0;
154             float d2 = 0;
155             const size_t mfccStIdx = j - fMidIdx;
156 
157             for (size_t k = 0, m = coeffLen - 1; k < coeffLen; ++k, --m)
158             {
159 
160                 d1 +=  mfcc(i,mfccStIdx + k) * delta1Coeffs[m];
161                 d2 +=  mfcc(i,mfccStIdx + k) * delta2Coeffs[m];
162             }
163 
164             delta1(i,j) = d1;
165             delta2(i,j) = d2;
166         }
167     }
168 
169     return true;
170 }
171 
Wav2LetterPreprocessor(const uint32_t windowLen,const uint32_t windowStride,std::unique_ptr<Wav2LetterMFCC> mfccInst)172 Wav2LetterPreprocessor::Wav2LetterPreprocessor(const uint32_t  windowLen,
173                                                const uint32_t  windowStride,
174                                                std::unique_ptr<Wav2LetterMFCC> mfccInst):
175     m_mfcc(std::move(mfccInst)),
176     m_mfccBuf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors),
177     m_delta1Buf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors),
178     m_delta2Buf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors),
179     m_windowLen(windowLen),
180     m_windowStride(windowStride)
181 {
182     if (m_mfcc->m_params.m_numMfccFeatures > 0 && windowLen > 0)
183     {
184         this->m_mfcc->Init();
185     }
186     std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f);
187 }