xref: /aosp_15_r20/external/webrtc/modules/audio_processing/agc2/speech_level_estimator.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 /*
2  *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/agc2/speech_level_estimator.h"
12 
13 #include "modules/audio_processing/agc2/agc2_common.h"
14 #include "modules/audio_processing/logging/apm_data_dumper.h"
15 #include "rtc_base/checks.h"
16 #include "rtc_base/logging.h"
17 #include "rtc_base/numerics/safe_minmax.h"
18 
19 namespace webrtc {
20 namespace {
21 
ClampLevelEstimateDbfs(float level_estimate_dbfs)22 float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
23   return rtc::SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
24 }
25 
26 // Returns the initial speech level estimate needed to apply the initial gain.
GetInitialSpeechLevelEstimateDbfs(const AudioProcessing::Config::GainController2::AdaptiveDigital & config)27 float GetInitialSpeechLevelEstimateDbfs(
28     const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
29   return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
30                                 config.initial_gain_db - config.headroom_db);
31 }
32 
33 }  // namespace
34 
operator ==(const SpeechLevelEstimator::LevelEstimatorState & b) const35 bool SpeechLevelEstimator::LevelEstimatorState::operator==(
36     const SpeechLevelEstimator::LevelEstimatorState& b) const {
37   return time_to_confidence_ms == b.time_to_confidence_ms &&
38          level_dbfs.numerator == b.level_dbfs.numerator &&
39          level_dbfs.denominator == b.level_dbfs.denominator;
40 }
41 
GetRatio() const42 float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
43   RTC_DCHECK_NE(denominator, 0.f);
44   return numerator / denominator;
45 }
46 
SpeechLevelEstimator(ApmDataDumper * apm_data_dumper,const AudioProcessing::Config::GainController2::AdaptiveDigital & config)47 SpeechLevelEstimator::SpeechLevelEstimator(
48     ApmDataDumper* apm_data_dumper,
49     const AudioProcessing::Config::GainController2::AdaptiveDigital& config)
50     : apm_data_dumper_(apm_data_dumper),
51       initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
52       adjacent_speech_frames_threshold_(
53           config.adjacent_speech_frames_threshold),
54       level_dbfs_(initial_speech_level_dbfs_) {
55   RTC_DCHECK(apm_data_dumper_);
56   RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
57   Reset();
58 }
59 
Update(float rms_dbfs,float peak_dbfs,float speech_probability)60 void SpeechLevelEstimator::Update(float rms_dbfs,
61                                   float peak_dbfs,
62                                   float speech_probability) {
63   RTC_DCHECK_GT(rms_dbfs, -150.0f);
64   RTC_DCHECK_LT(rms_dbfs, 50.0f);
65   RTC_DCHECK_GT(peak_dbfs, -150.0f);
66   RTC_DCHECK_LT(peak_dbfs, 50.0f);
67   RTC_DCHECK_GE(speech_probability, 0.0f);
68   RTC_DCHECK_LE(speech_probability, 1.0f);
69   if (speech_probability < kVadConfidenceThreshold) {
70     // Not a speech frame.
71     if (adjacent_speech_frames_threshold_ > 1) {
72       // When two or more adjacent speech frames are required in order to update
73       // the state, we need to decide whether to discard or confirm the updates
74       // based on the speech sequence length.
75       if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
76         // First non-speech frame after a long enough sequence of speech frames.
77         // Update the reliable state.
78         reliable_state_ = preliminary_state_;
79       } else if (num_adjacent_speech_frames_ > 0) {
80         // First non-speech frame after a too short sequence of speech frames.
81         // Reset to the last reliable state.
82         preliminary_state_ = reliable_state_;
83       }
84     }
85     num_adjacent_speech_frames_ = 0;
86   } else {
87     // Speech frame observed.
88     num_adjacent_speech_frames_++;
89 
90     // Update preliminary level estimate.
91     RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
92     const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
93     if (!buffer_is_full) {
94       preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
95     }
96     // Weighted average of levels with speech probability as weight.
97     RTC_DCHECK_GT(speech_probability, 0.0f);
98     const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
99     preliminary_state_.level_dbfs.numerator =
100         preliminary_state_.level_dbfs.numerator * leak_factor +
101         rms_dbfs * speech_probability;
102     preliminary_state_.level_dbfs.denominator =
103         preliminary_state_.level_dbfs.denominator * leak_factor +
104         speech_probability;
105 
106     const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
107 
108     if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
109       // `preliminary_state_` is now reliable. Update the last level estimation.
110       level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
111     }
112   }
113   DumpDebugData();
114 }
115 
IsConfident() const116 bool SpeechLevelEstimator::IsConfident() const {
117   if (adjacent_speech_frames_threshold_ == 1) {
118     // Ignore `reliable_state_` when a single frame is enough to update the
119     // level estimate (because it is not used).
120     return preliminary_state_.time_to_confidence_ms == 0;
121   }
122   // Once confident, it remains confident.
123   RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
124              preliminary_state_.time_to_confidence_ms == 0);
125   // During the first long enough speech sequence, `reliable_state_` must be
126   // ignored since `preliminary_state_` is used.
127   return reliable_state_.time_to_confidence_ms == 0 ||
128          (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
129           preliminary_state_.time_to_confidence_ms == 0);
130 }
131 
Reset()132 void SpeechLevelEstimator::Reset() {
133   ResetLevelEstimatorState(preliminary_state_);
134   ResetLevelEstimatorState(reliable_state_);
135   level_dbfs_ = initial_speech_level_dbfs_;
136   num_adjacent_speech_frames_ = 0;
137 }
138 
ResetLevelEstimatorState(LevelEstimatorState & state) const139 void SpeechLevelEstimator::ResetLevelEstimatorState(
140     LevelEstimatorState& state) const {
141   state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
142   state.level_dbfs.numerator = initial_speech_level_dbfs_;
143   state.level_dbfs.denominator = 1.0f;
144 }
145 
DumpDebugData() const146 void SpeechLevelEstimator::DumpDebugData() const {
147   apm_data_dumper_->DumpRaw(
148       "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
149       num_adjacent_speech_frames_);
150   apm_data_dumper_->DumpRaw(
151       "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
152       preliminary_state_.level_dbfs.numerator);
153   apm_data_dumper_->DumpRaw(
154       "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
155       preliminary_state_.level_dbfs.denominator);
156   apm_data_dumper_->DumpRaw(
157       "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
158       preliminary_state_.time_to_confidence_ms);
159   apm_data_dumper_->DumpRaw(
160       "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
161       reliable_state_.time_to_confidence_ms);
162 }
163 
164 }  // namespace webrtc
165