1 // 2 // Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 3 // SPDX-License-Identifier: MIT 4 // 5 #ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP 6 #define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP 7 8 #include <numeric> 9 #include "DataStructures.hpp" 10 #include "SlidingWindow.hpp" 11 #include "MFCC.hpp" 12 #include "Wav2LetterMFCC.hpp" 13 // Class to facilitate pre-processing calculation for Wav2Letter model for ASR 14 using AudioWindow = SlidingWindow<const float>; 15 16 class Wav2LetterPreprocessor 17 { 18 public: 19 Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride, 20 std::unique_ptr<Wav2LetterMFCC> mfccInst); 21 22 /** 23 * @brief Calculates the features required from audio data. This 24 * includes MFCC, first and second order deltas, 25 * normalisation and finally, quantisation. The tensor is 26 * populated with feature from a given window placed along 27 * in a single row. 28 * @param[in] audioData pointer to the first element of audio data 29 * @param[in] audioDataLen number of elements in the audio data 30 * @param[in] tensor tensor to be populated 31 * @return true if successful, false in case of error. 32 */ 33 bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset, 34 float quantScale); 35 36 std::unique_ptr<MFCC> m_mfcc; 37 38 // Actual buffers to be populated 39 Array2d<float> m_mfccBuf; // Contiguous buffer 1D: MFCC 40 Array2d<float> m_delta1Buf; // Contiguous buffer 1D: Delta 1 41 Array2d<float> m_delta2Buf; // Contiguous buffer 1D: Delta 2 42 43 uint32_t m_windowLen; // Window length for MFCC 44 uint32_t m_windowStride; // Window stride len for MFCC 45 AudioWindow m_window; // Sliding window 46 47 protected: 48 /** 49 * @brief Computes the first and second order deltas for the 50 * MFCC buffers - they are assumed to be populated. 51 * 52 * @param[in] mfcc MFCC buffers 53 * @param[out] delta1 result of the first diff computation 54 * @param[out] delta2 result of the second diff computation 55 * 56 * @return true if successful, false otherwise 57 */ 58 static bool ComputeDeltas(Array2d<float>& mfcc, 59 Array2d<float>& delta1, 60 Array2d<float>& delta2); 61 62 protected: 63 64 /** 65 * @brief Given a 2D vector of floats, computes the mean 66 * @param[in] vec vector of vector of floats 67 * @return mean value 68 */ 69 static float GetMean(Array2d<float>& vec); 70 71 /** 72 * @brief Given a 2D vector of floats, computes the stddev 73 * @param[in] vec vector of vector of floats 74 * @param[in] mean mean value of the vector passed in 75 * @return stddev value 76 */ 77 static float GetStdDev(Array2d<float>& vec, float mean); 78 79 /** 80 * @brief Given a 2D vector of floats, normalises it using 81 * the mean and the stddev 82 * @param[in/out] vec vector of vector of floats 83 * @return 84 */ 85 static void NormaliseVec(Array2d<float>& vec); 86 87 /** 88 * @brief Normalises the MFCC and delta buffers 89 * @return 90 */ 91 void Normalise(); 92 93 /** 94 * @brief Given the quantisation and data type limits, computes 95 * the quantised values of a floating point input data. 96 * @param[in] elem Element to be quantised 97 * @param[in] quantScale Scale 98 * @param[in] quantOffset Offset 99 * @param[in] minVal Numerical limit - minimum 100 * @param[in] maxVal Numerical limit - maximum 101 * @return floating point quantised value 102 */ 103 static float GetQuantElem( 104 float elem, 105 float quantScale, 106 int quantOffset, 107 float minVal, 108 float maxVal); 109 110 /** 111 * @brief Quantises the MFCC and delta buffers, and places them 112 * in the output buffer. While doing so, it transposes 113 * the data. Reason: Buffers in this class are arranged 114 * for "time" axis to be row major. Primary reason for 115 * this being the convolution speed up (as we can use 116 * contiguous memory). The output, however, requires the 117 * time axis to be in column major arrangement. 118 * @param[in] outputBuf pointer to the output buffer 119 * @param[in] outputBufSz output buffer's size 120 * @param[in] quantScale quantisation scale 121 * @param[in] quantOffset quantisation offset 122 */ 123 template<typename T> Quantise(T * outputBuf,int quantOffset,float quantScale)124 bool Quantise(T*outputBuf, int quantOffset, float quantScale) 125 { 126 // Populate 127 T* outputBufMfcc = outputBuf; 128 T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures; 129 T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures; 130 const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // (3 vectors - 1 vector) 131 132 const float minVal = std::numeric_limits<T>::min(); 133 const float maxVal = std::numeric_limits<T>::max(); 134 135 // We need to do a transpose while copying and concatenating the tensor 136 for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j) 137 { 138 for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i) 139 { 140 *outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem( 141 this->m_mfccBuf(i, j), quantScale, 142 quantOffset, minVal, maxVal)); 143 *outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem( 144 this->m_delta1Buf(i, j), quantScale, 145 quantOffset, minVal, maxVal)); 146 *outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem( 147 this->m_delta2Buf(i, j), quantScale, 148 quantOffset, minVal, maxVal)); 149 } 150 outputBufMfcc += ptrIncr; 151 outputBufD1 += ptrIncr; 152 outputBufD2 += ptrIncr; 153 } 154 return true; 155 } 156 }; 157 158 #endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP 159