1 /*
2 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "modules/audio_processing/agc2/speech_level_estimator.h"
12
13 #include "modules/audio_processing/agc2/agc2_common.h"
14 #include "modules/audio_processing/logging/apm_data_dumper.h"
15 #include "rtc_base/checks.h"
16 #include "rtc_base/logging.h"
17 #include "rtc_base/numerics/safe_minmax.h"
18
19 namespace webrtc {
20 namespace {
21
ClampLevelEstimateDbfs(float level_estimate_dbfs)22 float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
23 return rtc::SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
24 }
25
26 // Returns the initial speech level estimate needed to apply the initial gain.
GetInitialSpeechLevelEstimateDbfs(const AudioProcessing::Config::GainController2::AdaptiveDigital & config)27 float GetInitialSpeechLevelEstimateDbfs(
28 const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
29 return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
30 config.initial_gain_db - config.headroom_db);
31 }
32
33 } // namespace
34
operator ==(const SpeechLevelEstimator::LevelEstimatorState & b) const35 bool SpeechLevelEstimator::LevelEstimatorState::operator==(
36 const SpeechLevelEstimator::LevelEstimatorState& b) const {
37 return time_to_confidence_ms == b.time_to_confidence_ms &&
38 level_dbfs.numerator == b.level_dbfs.numerator &&
39 level_dbfs.denominator == b.level_dbfs.denominator;
40 }
41
GetRatio() const42 float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
43 RTC_DCHECK_NE(denominator, 0.f);
44 return numerator / denominator;
45 }
46
SpeechLevelEstimator(ApmDataDumper * apm_data_dumper,const AudioProcessing::Config::GainController2::AdaptiveDigital & config)47 SpeechLevelEstimator::SpeechLevelEstimator(
48 ApmDataDumper* apm_data_dumper,
49 const AudioProcessing::Config::GainController2::AdaptiveDigital& config)
50 : apm_data_dumper_(apm_data_dumper),
51 initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
52 adjacent_speech_frames_threshold_(
53 config.adjacent_speech_frames_threshold),
54 level_dbfs_(initial_speech_level_dbfs_) {
55 RTC_DCHECK(apm_data_dumper_);
56 RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
57 Reset();
58 }
59
Update(float rms_dbfs,float peak_dbfs,float speech_probability)60 void SpeechLevelEstimator::Update(float rms_dbfs,
61 float peak_dbfs,
62 float speech_probability) {
63 RTC_DCHECK_GT(rms_dbfs, -150.0f);
64 RTC_DCHECK_LT(rms_dbfs, 50.0f);
65 RTC_DCHECK_GT(peak_dbfs, -150.0f);
66 RTC_DCHECK_LT(peak_dbfs, 50.0f);
67 RTC_DCHECK_GE(speech_probability, 0.0f);
68 RTC_DCHECK_LE(speech_probability, 1.0f);
69 if (speech_probability < kVadConfidenceThreshold) {
70 // Not a speech frame.
71 if (adjacent_speech_frames_threshold_ > 1) {
72 // When two or more adjacent speech frames are required in order to update
73 // the state, we need to decide whether to discard or confirm the updates
74 // based on the speech sequence length.
75 if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
76 // First non-speech frame after a long enough sequence of speech frames.
77 // Update the reliable state.
78 reliable_state_ = preliminary_state_;
79 } else if (num_adjacent_speech_frames_ > 0) {
80 // First non-speech frame after a too short sequence of speech frames.
81 // Reset to the last reliable state.
82 preliminary_state_ = reliable_state_;
83 }
84 }
85 num_adjacent_speech_frames_ = 0;
86 } else {
87 // Speech frame observed.
88 num_adjacent_speech_frames_++;
89
90 // Update preliminary level estimate.
91 RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
92 const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
93 if (!buffer_is_full) {
94 preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
95 }
96 // Weighted average of levels with speech probability as weight.
97 RTC_DCHECK_GT(speech_probability, 0.0f);
98 const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
99 preliminary_state_.level_dbfs.numerator =
100 preliminary_state_.level_dbfs.numerator * leak_factor +
101 rms_dbfs * speech_probability;
102 preliminary_state_.level_dbfs.denominator =
103 preliminary_state_.level_dbfs.denominator * leak_factor +
104 speech_probability;
105
106 const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
107
108 if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
109 // `preliminary_state_` is now reliable. Update the last level estimation.
110 level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
111 }
112 }
113 DumpDebugData();
114 }
115
IsConfident() const116 bool SpeechLevelEstimator::IsConfident() const {
117 if (adjacent_speech_frames_threshold_ == 1) {
118 // Ignore `reliable_state_` when a single frame is enough to update the
119 // level estimate (because it is not used).
120 return preliminary_state_.time_to_confidence_ms == 0;
121 }
122 // Once confident, it remains confident.
123 RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
124 preliminary_state_.time_to_confidence_ms == 0);
125 // During the first long enough speech sequence, `reliable_state_` must be
126 // ignored since `preliminary_state_` is used.
127 return reliable_state_.time_to_confidence_ms == 0 ||
128 (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
129 preliminary_state_.time_to_confidence_ms == 0);
130 }
131
Reset()132 void SpeechLevelEstimator::Reset() {
133 ResetLevelEstimatorState(preliminary_state_);
134 ResetLevelEstimatorState(reliable_state_);
135 level_dbfs_ = initial_speech_level_dbfs_;
136 num_adjacent_speech_frames_ = 0;
137 }
138
ResetLevelEstimatorState(LevelEstimatorState & state) const139 void SpeechLevelEstimator::ResetLevelEstimatorState(
140 LevelEstimatorState& state) const {
141 state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
142 state.level_dbfs.numerator = initial_speech_level_dbfs_;
143 state.level_dbfs.denominator = 1.0f;
144 }
145
DumpDebugData() const146 void SpeechLevelEstimator::DumpDebugData() const {
147 apm_data_dumper_->DumpRaw(
148 "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
149 num_adjacent_speech_frames_);
150 apm_data_dumper_->DumpRaw(
151 "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
152 preliminary_state_.level_dbfs.numerator);
153 apm_data_dumper_->DumpRaw(
154 "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
155 preliminary_state_.level_dbfs.denominator);
156 apm_data_dumper_->DumpRaw(
157 "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
158 preliminary_state_.time_to_confidence_ms);
159 apm_data_dumper_->DumpRaw(
160 "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
161 reliable_state_.time_to_confidence_ms);
162 }
163
164 } // namespace webrtc
165