xref: /aosp_15_r20/external/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_coding/codecs/cng/audio_encoder_cng.h"
12 
13 #include <cstdint>
14 #include <memory>
15 #include <utility>
16 
17 #include "absl/types/optional.h"
18 #include "api/units/time_delta.h"
19 #include "modules/audio_coding/codecs/cng/webrtc_cng.h"
20 #include "rtc_base/checks.h"
21 
22 namespace webrtc {
23 
24 namespace {
25 
// Largest packet duration, in milliseconds, accepted when VAD/CNG is in use.
constexpr int kMaxFrameSizeMs = 60;
27 
28 class AudioEncoderCng final : public AudioEncoder {
29  public:
30   explicit AudioEncoderCng(AudioEncoderCngConfig&& config);
31   ~AudioEncoderCng() override;
32 
33   // Not copyable or moveable.
34   AudioEncoderCng(const AudioEncoderCng&) = delete;
35   AudioEncoderCng(AudioEncoderCng&&) = delete;
36   AudioEncoderCng& operator=(const AudioEncoderCng&) = delete;
37   AudioEncoderCng& operator=(AudioEncoderCng&&) = delete;
38 
39   int SampleRateHz() const override;
40   size_t NumChannels() const override;
41   int RtpTimestampRateHz() const override;
42   size_t Num10MsFramesInNextPacket() const override;
43   size_t Max10MsFramesInAPacket() const override;
44   int GetTargetBitrate() const override;
45   EncodedInfo EncodeImpl(uint32_t rtp_timestamp,
46                          rtc::ArrayView<const int16_t> audio,
47                          rtc::Buffer* encoded) override;
48   void Reset() override;
49   bool SetFec(bool enable) override;
50   bool SetDtx(bool enable) override;
51   bool SetApplication(Application application) override;
52   void SetMaxPlaybackRate(int frequency_hz) override;
53   rtc::ArrayView<std::unique_ptr<AudioEncoder>> ReclaimContainedEncoders()
54       override;
55   void OnReceivedUplinkPacketLossFraction(
56       float uplink_packet_loss_fraction) override;
57   void OnReceivedUplinkBandwidth(
58       int target_audio_bitrate_bps,
59       absl::optional<int64_t> bwe_period_ms) override;
60   absl::optional<std::pair<TimeDelta, TimeDelta>> GetFrameLengthRange()
61       const override;
62 
63  private:
64   EncodedInfo EncodePassive(size_t frames_to_encode, rtc::Buffer* encoded);
65   EncodedInfo EncodeActive(size_t frames_to_encode, rtc::Buffer* encoded);
66   size_t SamplesPer10msFrame() const;
67 
68   std::unique_ptr<AudioEncoder> speech_encoder_;
69   const int cng_payload_type_;
70   const int num_cng_coefficients_;
71   const int sid_frame_interval_ms_;
72   std::vector<int16_t> speech_buffer_;
73   std::vector<uint32_t> rtp_timestamps_;
74   bool last_frame_active_;
75   std::unique_ptr<Vad> vad_;
76   std::unique_ptr<ComfortNoiseEncoder> cng_encoder_;
77 };
78 
AudioEncoderCng(AudioEncoderCngConfig && config)79 AudioEncoderCng::AudioEncoderCng(AudioEncoderCngConfig&& config)
80     : speech_encoder_((static_cast<void>([&] {
81                          RTC_CHECK(config.IsOk()) << "Invalid configuration.";
82                        }()),
83                        std::move(config.speech_encoder))),
84       cng_payload_type_(config.payload_type),
85       num_cng_coefficients_(config.num_cng_coefficients),
86       sid_frame_interval_ms_(config.sid_frame_interval_ms),
87       last_frame_active_(true),
88       vad_(config.vad ? std::unique_ptr<Vad>(config.vad)
89                       : CreateVad(config.vad_mode)),
90       cng_encoder_(new ComfortNoiseEncoder(SampleRateHz(),
91                                            sid_frame_interval_ms_,
92                                            num_cng_coefficients_)) {}
93 
94 AudioEncoderCng::~AudioEncoderCng() = default;
95 
SampleRateHz() const96 int AudioEncoderCng::SampleRateHz() const {
97   return speech_encoder_->SampleRateHz();
98 }
99 
NumChannels() const100 size_t AudioEncoderCng::NumChannels() const {
101   return 1;
102 }
103 
RtpTimestampRateHz() const104 int AudioEncoderCng::RtpTimestampRateHz() const {
105   return speech_encoder_->RtpTimestampRateHz();
106 }
107 
Num10MsFramesInNextPacket() const108 size_t AudioEncoderCng::Num10MsFramesInNextPacket() const {
109   return speech_encoder_->Num10MsFramesInNextPacket();
110 }
111 
Max10MsFramesInAPacket() const112 size_t AudioEncoderCng::Max10MsFramesInAPacket() const {
113   return speech_encoder_->Max10MsFramesInAPacket();
114 }
115 
GetTargetBitrate() const116 int AudioEncoderCng::GetTargetBitrate() const {
117   return speech_encoder_->GetTargetBitrate();
118 }
119 
EncodeImpl(uint32_t rtp_timestamp,rtc::ArrayView<const int16_t> audio,rtc::Buffer * encoded)120 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeImpl(
121     uint32_t rtp_timestamp,
122     rtc::ArrayView<const int16_t> audio,
123     rtc::Buffer* encoded) {
124   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
125   RTC_CHECK_EQ(speech_buffer_.size(),
126                rtp_timestamps_.size() * samples_per_10ms_frame);
127   rtp_timestamps_.push_back(rtp_timestamp);
128   RTC_DCHECK_EQ(samples_per_10ms_frame, audio.size());
129   speech_buffer_.insert(speech_buffer_.end(), audio.cbegin(), audio.cend());
130   const size_t frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
131   if (rtp_timestamps_.size() < frames_to_encode) {
132     return EncodedInfo();
133   }
134   RTC_CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
135       << "Frame size cannot be larger than " << kMaxFrameSizeMs
136       << " ms when using VAD/CNG.";
137 
138   // Group several 10 ms blocks per VAD call. Call VAD once or twice using the
139   // following split sizes:
140   // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
141   // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
142   size_t blocks_in_first_vad_call =
143       (frames_to_encode > 3 ? 3 : frames_to_encode);
144   if (frames_to_encode == 4)
145     blocks_in_first_vad_call = 2;
146   RTC_CHECK_GE(frames_to_encode, blocks_in_first_vad_call);
147   const size_t blocks_in_second_vad_call =
148       frames_to_encode - blocks_in_first_vad_call;
149 
150   // Check if all of the buffer is passive speech. Start with checking the first
151   // block.
152   Vad::Activity activity = vad_->VoiceActivity(
153       &speech_buffer_[0], samples_per_10ms_frame * blocks_in_first_vad_call,
154       SampleRateHz());
155   if (activity == Vad::kPassive && blocks_in_second_vad_call > 0) {
156     // Only check the second block if the first was passive.
157     activity = vad_->VoiceActivity(
158         &speech_buffer_[samples_per_10ms_frame * blocks_in_first_vad_call],
159         samples_per_10ms_frame * blocks_in_second_vad_call, SampleRateHz());
160   }
161 
162   EncodedInfo info;
163   switch (activity) {
164     case Vad::kPassive: {
165       info = EncodePassive(frames_to_encode, encoded);
166       last_frame_active_ = false;
167       break;
168     }
169     case Vad::kActive: {
170       info = EncodeActive(frames_to_encode, encoded);
171       last_frame_active_ = true;
172       break;
173     }
174     default: {
175       RTC_CHECK_NOTREACHED();
176     }
177   }
178 
179   speech_buffer_.erase(
180       speech_buffer_.begin(),
181       speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
182   rtp_timestamps_.erase(rtp_timestamps_.begin(),
183                         rtp_timestamps_.begin() + frames_to_encode);
184   return info;
185 }
186 
Reset()187 void AudioEncoderCng::Reset() {
188   speech_encoder_->Reset();
189   speech_buffer_.clear();
190   rtp_timestamps_.clear();
191   last_frame_active_ = true;
192   vad_->Reset();
193   cng_encoder_.reset(new ComfortNoiseEncoder(
194       SampleRateHz(), sid_frame_interval_ms_, num_cng_coefficients_));
195 }
196 
SetFec(bool enable)197 bool AudioEncoderCng::SetFec(bool enable) {
198   return speech_encoder_->SetFec(enable);
199 }
200 
SetDtx(bool enable)201 bool AudioEncoderCng::SetDtx(bool enable) {
202   return speech_encoder_->SetDtx(enable);
203 }
204 
SetApplication(Application application)205 bool AudioEncoderCng::SetApplication(Application application) {
206   return speech_encoder_->SetApplication(application);
207 }
208 
SetMaxPlaybackRate(int frequency_hz)209 void AudioEncoderCng::SetMaxPlaybackRate(int frequency_hz) {
210   speech_encoder_->SetMaxPlaybackRate(frequency_hz);
211 }
212 
213 rtc::ArrayView<std::unique_ptr<AudioEncoder>>
ReclaimContainedEncoders()214 AudioEncoderCng::ReclaimContainedEncoders() {
215   return rtc::ArrayView<std::unique_ptr<AudioEncoder>>(&speech_encoder_, 1);
216 }
217 
OnReceivedUplinkPacketLossFraction(float uplink_packet_loss_fraction)218 void AudioEncoderCng::OnReceivedUplinkPacketLossFraction(
219     float uplink_packet_loss_fraction) {
220   speech_encoder_->OnReceivedUplinkPacketLossFraction(
221       uplink_packet_loss_fraction);
222 }
223 
OnReceivedUplinkBandwidth(int target_audio_bitrate_bps,absl::optional<int64_t> bwe_period_ms)224 void AudioEncoderCng::OnReceivedUplinkBandwidth(
225     int target_audio_bitrate_bps,
226     absl::optional<int64_t> bwe_period_ms) {
227   speech_encoder_->OnReceivedUplinkBandwidth(target_audio_bitrate_bps,
228                                              bwe_period_ms);
229 }
230 
231 absl::optional<std::pair<TimeDelta, TimeDelta>>
GetFrameLengthRange() const232 AudioEncoderCng::GetFrameLengthRange() const {
233   return speech_encoder_->GetFrameLengthRange();
234 }
235 
EncodePassive(size_t frames_to_encode,rtc::Buffer * encoded)236 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
237     size_t frames_to_encode,
238     rtc::Buffer* encoded) {
239   bool force_sid = last_frame_active_;
240   bool output_produced = false;
241   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
242   AudioEncoder::EncodedInfo info;
243 
244   for (size_t i = 0; i < frames_to_encode; ++i) {
245     // It's important not to pass &info.encoded_bytes directly to
246     // WebRtcCng_Encode(), since later loop iterations may return zero in
247     // that value, in which case we don't want to overwrite any value from
248     // an earlier iteration.
249     size_t encoded_bytes_tmp =
250         cng_encoder_->Encode(rtc::ArrayView<const int16_t>(
251                                  &speech_buffer_[i * samples_per_10ms_frame],
252                                  samples_per_10ms_frame),
253                              force_sid, encoded);
254 
255     if (encoded_bytes_tmp > 0) {
256       RTC_CHECK(!output_produced);
257       info.encoded_bytes = encoded_bytes_tmp;
258       output_produced = true;
259       force_sid = false;
260     }
261   }
262 
263   info.encoded_timestamp = rtp_timestamps_.front();
264   info.payload_type = cng_payload_type_;
265   info.send_even_if_empty = true;
266   info.speech = false;
267   return info;
268 }
269 
EncodeActive(size_t frames_to_encode,rtc::Buffer * encoded)270 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(size_t frames_to_encode,
271                                                         rtc::Buffer* encoded) {
272   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
273   AudioEncoder::EncodedInfo info;
274   for (size_t i = 0; i < frames_to_encode; ++i) {
275     info =
276         speech_encoder_->Encode(rtp_timestamps_.front(),
277                                 rtc::ArrayView<const int16_t>(
278                                     &speech_buffer_[i * samples_per_10ms_frame],
279                                     samples_per_10ms_frame),
280                                 encoded);
281     if (i + 1 == frames_to_encode) {
282       RTC_CHECK_GT(info.encoded_bytes, 0) << "Encoder didn't deliver data.";
283     } else {
284       RTC_CHECK_EQ(info.encoded_bytes, 0)
285           << "Encoder delivered data too early.";
286     }
287   }
288   return info;
289 }
290 
SamplesPer10msFrame() const291 size_t AudioEncoderCng::SamplesPer10msFrame() const {
292   return rtc::CheckedDivExact(10 * SampleRateHz(), 1000);
293 }
294 
295 }  // namespace
296 
297 AudioEncoderCngConfig::AudioEncoderCngConfig() = default;
298 AudioEncoderCngConfig::AudioEncoderCngConfig(AudioEncoderCngConfig&&) = default;
299 AudioEncoderCngConfig::~AudioEncoderCngConfig() = default;
300 
IsOk() const301 bool AudioEncoderCngConfig::IsOk() const {
302   if (num_channels != 1)
303     return false;
304   if (!speech_encoder)
305     return false;
306   if (num_channels != speech_encoder->NumChannels())
307     return false;
308   if (sid_frame_interval_ms <
309       static_cast<int>(speech_encoder->Max10MsFramesInAPacket() * 10))
310     return false;
311   if (num_cng_coefficients > WEBRTC_CNG_MAX_LPC_ORDER ||
312       num_cng_coefficients <= 0)
313     return false;
314   return true;
315 }
316 
CreateComfortNoiseEncoder(AudioEncoderCngConfig && config)317 std::unique_ptr<AudioEncoder> CreateComfortNoiseEncoder(
318     AudioEncoderCngConfig&& config) {
319   return std::make_unique<AudioEncoderCng>(std::move(config));
320 }
321 
322 }  // namespace webrtc
323