xref: /aosp_15_r20/external/armnn/samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp (revision 89c4ff92f2867872bb9e2354d150bf0c8c502810)
1 //
2 // Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 #ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
6 #define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
7 
#include <cstdint>
#include <limits>
#include <memory>
#include <numeric>
#include <vector>

#include "DataStructures.hpp"
#include "SlidingWindow.hpp"
#include "MFCC.hpp"
#include "Wav2LetterMFCC.hpp"
13 // Class to facilitate pre-processing calculation for Wav2Letter model for ASR
14 using AudioWindow = SlidingWindow<const float>;
15 
16 class Wav2LetterPreprocessor
17 {
18 public:
19     Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride,
20                            std::unique_ptr<Wav2LetterMFCC> mfccInst);
21 
22     /**
23      * @brief       Calculates the features required from audio data. This
24      *              includes MFCC, first and second order deltas,
25      *              normalisation and finally, quantisation. The tensor is
26      *              populated with feature from a given window placed along
27      *              in a single row.
28      * @param[in]   audioData     pointer to the first element of audio data
29      * @param[in]   audioDataLen  number of elements in the audio data
30      * @param[in]   tensor        tensor to be populated
31      * @return      true if successful, false in case of error.
32      */
33     bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset,
34                 float quantScale);
35 
36     std::unique_ptr<MFCC> m_mfcc;
37 
38     // Actual buffers to be populated
39     Array2d<float> m_mfccBuf;         // Contiguous buffer 1D: MFCC
40     Array2d<float> m_delta1Buf;       // Contiguous buffer 1D: Delta 1
41     Array2d<float> m_delta2Buf;       // Contiguous buffer 1D: Delta 2
42 
43     uint32_t m_windowLen;       // Window length for MFCC
44     uint32_t m_windowStride;    // Window stride len for MFCC
45     AudioWindow m_window;       // Sliding window
46 
47 protected:
48     /**
49      * @brief Computes the first and second order deltas for the
50      *        MFCC buffers - they are assumed to be populated.
51      *
52      * @param[in]  mfcc   MFCC buffers
53      * @param[out] delta1 result of the first diff computation
54      * @param[out] delta2 result of the second diff computation
55      *
56      * @return true if successful, false otherwise
57      */
58     static bool ComputeDeltas(Array2d<float>& mfcc,
59                               Array2d<float>& delta1,
60                               Array2d<float>& delta2);
61 
62 protected:
63 
64     /**
65      * @brief      Given a 2D vector of floats, computes the mean
66      * @param[in]   vec      vector of vector of floats
67      * @return      mean value
68      */
69     static float GetMean(Array2d<float>& vec);
70 
71     /**
72      * @brief       Given a 2D vector of floats, computes the stddev
73      * @param[in]   vec   vector of vector of floats
74      * @param[in]   mean     mean value of the vector passed in
75      * @return      stddev value
76      */
77     static float GetStdDev(Array2d<float>& vec, float mean);
78 
79     /**
80      * @brief           Given a 2D vector of floats, normalises it using
81      *                  the mean and the stddev
82      * @param[in/out]   vec      vector of vector of floats
83      * @return
84      */
85     static void NormaliseVec(Array2d<float>& vec);
86 
87     /**
88      * @brief       Normalises the MFCC and delta buffers
89      * @return
90      */
91     void Normalise();
92 
93     /**
94      * @brief       Given the quantisation and data type limits, computes
95      *              the quantised values of a floating point input data.
96      * @param[in]   elem            Element to be quantised
97      * @param[in]   quantScale      Scale
98      * @param[in]   quantOffset     Offset
99      * @param[in]   minVal          Numerical limit - minimum
100      * @param[in]   maxVal          Numerical limit - maximum
101      * @return      floating point quantised value
102      */
103     static float GetQuantElem(
104             float elem,
105             float quantScale,
106             int quantOffset,
107             float minVal,
108             float maxVal);
109 
110     /**
111      * @brief       Quantises the MFCC and delta buffers, and places them
112      *              in the output buffer. While doing so, it transposes
113      *              the data. Reason: Buffers in this class are arranged
114      *              for "time" axis to be row major. Primary reason for
115      *              this being the convolution speed up (as we can use
116      *              contiguous memory). The output, however, requires the
117      *              time axis to be in column major arrangement.
118      * @param[in]   outputBuf       pointer to the output buffer
119      * @param[in]   outputBufSz     output buffer's size
120      * @param[in]   quantScale      quantisation scale
121      * @param[in]   quantOffset     quantisation offset
122      */
123     template<typename T>
Quantise(T * outputBuf,int quantOffset,float quantScale)124     bool Quantise(T*outputBuf, int quantOffset, float quantScale)
125     {
126         // Populate
127         T* outputBufMfcc = outputBuf;
128         T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures;
129         T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures;
130         const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // (3 vectors - 1 vector)
131 
132         const float minVal = std::numeric_limits<T>::min();
133         const float maxVal = std::numeric_limits<T>::max();
134 
135         // We need to do a transpose while copying and concatenating the tensor
136         for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j)
137         {
138             for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i)
139             {
140                 *outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
141                         this->m_mfccBuf(i, j), quantScale,
142                         quantOffset, minVal, maxVal));
143                 *outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
144                         this->m_delta1Buf(i, j), quantScale,
145                         quantOffset, minVal, maxVal));
146                 *outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
147                         this->m_delta2Buf(i, j), quantScale,
148                         quantOffset, minVal, maxVal));
149             }
150             outputBufMfcc += ptrIncr;
151             outputBufD1 += ptrIncr;
152             outputBufD2 += ptrIncr;
153         }
154         return true;
155     }
156 };
157 
158 #endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
159