xref: /aosp_15_r20/external/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 /*
2  *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/test/conversational_speech/multiend_call.h"
12 
13 #include <algorithm>
14 #include <iterator>
15 
16 #include "absl/strings/string_view.h"
17 #include "rtc_base/logging.h"
18 #include "test/testsupport/file_utils.h"
19 
20 namespace webrtc {
21 namespace test {
22 namespace conversational_speech {
23 
MultiEndCall(rtc::ArrayView<const Turn> timing,absl::string_view audiotracks_path,std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)24 MultiEndCall::MultiEndCall(
25     rtc::ArrayView<const Turn> timing,
26     absl::string_view audiotracks_path,
27     std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)
28     : timing_(timing),
29       audiotracks_path_(audiotracks_path),
30       wavreader_abstract_factory_(std::move(wavreader_abstract_factory)),
31       valid_(false) {
32   FindSpeakerNames();
33   if (CreateAudioTrackReaders())
34     valid_ = CheckTiming();
35 }
36 
37 MultiEndCall::~MultiEndCall() = default;
38 
FindSpeakerNames()39 void MultiEndCall::FindSpeakerNames() {
40   RTC_DCHECK(speaker_names_.empty());
41   for (const Turn& turn : timing_) {
42     speaker_names_.emplace(turn.speaker_name);
43   }
44 }
45 
CreateAudioTrackReaders()46 bool MultiEndCall::CreateAudioTrackReaders() {
47   RTC_DCHECK(audiotrack_readers_.empty());
48   sample_rate_hz_ = 0;  // Sample rate will be set when reading the first track.
49   for (const Turn& turn : timing_) {
50     auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
51     if (it != audiotrack_readers_.end())
52       continue;
53 
54     const std::string audiotrack_file_path =
55         test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name);
56 
57     // Map the audiotrack file name to a new instance of WavReaderInterface.
58     std::unique_ptr<WavReaderInterface> wavreader =
59         wavreader_abstract_factory_->Create(
60             test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name));
61 
62     if (sample_rate_hz_ == 0) {
63       sample_rate_hz_ = wavreader->SampleRate();
64     } else if (sample_rate_hz_ != wavreader->SampleRate()) {
65       RTC_LOG(LS_ERROR)
66           << "All the audio tracks should have the same sample rate.";
67       return false;
68     }
69 
70     if (wavreader->NumChannels() != 1) {
71       RTC_LOG(LS_ERROR) << "Only mono audio tracks supported.";
72       return false;
73     }
74 
75     audiotrack_readers_.emplace(turn.audiotrack_file_name,
76                                 std::move(wavreader));
77   }
78 
79   return true;
80 }
81 
CheckTiming()82 bool MultiEndCall::CheckTiming() {
83   struct Interval {
84     size_t begin;
85     size_t end;
86   };
87   size_t number_of_turns = timing_.size();
88   auto millisecond_to_samples = [](int ms, int sr) -> int {
89     // Truncation may happen if the sampling rate is not an integer multiple
90     // of 1000 (e.g., 44100).
91     return ms * sr / 1000;
92   };
93   auto in_interval = [](size_t value, const Interval& interval) {
94     return interval.begin <= value && value < interval.end;
95   };
96   total_duration_samples_ = 0;
97   speaking_turns_.clear();
98 
99   // Begin and end timestamps for the last two turns (unit: number of samples).
100   Interval second_last_turn = {0, 0};
101   Interval last_turn = {0, 0};
102 
103   // Initialize map to store speaking turn indices of each speaker (used to
104   // detect self cross-talk).
105   std::map<std::string, std::vector<size_t>> speaking_turn_indices;
106   for (const std::string& speaker_name : speaker_names_) {
107     speaking_turn_indices.emplace(std::piecewise_construct,
108                                   std::forward_as_tuple(speaker_name),
109                                   std::forward_as_tuple());
110   }
111 
112   // Parse turns.
113   for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
114     const Turn& turn = timing_[turn_index];
115     auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
116     RTC_CHECK(it != audiotrack_readers_.end())
117         << "Audio track reader not created";
118 
119     // Begin and end timestamps for the current turn.
120     int offset_samples =
121         millisecond_to_samples(turn.offset, it->second->SampleRate());
122     std::size_t begin_timestamp = last_turn.end + offset_samples;
123     std::size_t end_timestamp = begin_timestamp + it->second->NumSamples();
124     RTC_LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp << "-"
125                      << end_timestamp << " ms";
126 
127     // The order is invalid if the offset is negative and its absolute value is
128     // larger then the duration of the previous turn.
129     if (offset_samples < 0 &&
130         -offset_samples > static_cast<int>(last_turn.end - last_turn.begin)) {
131       RTC_LOG(LS_ERROR) << "invalid order";
132       return false;
133     }
134 
135     // Cross-talk with 3 or more speakers occurs when the beginning of the
136     // current interval falls in the last two turns.
137     if (turn_index > 1 && in_interval(begin_timestamp, last_turn) &&
138         in_interval(begin_timestamp, second_last_turn)) {
139       RTC_LOG(LS_ERROR) << "cross-talk with 3+ speakers";
140       return false;
141     }
142 
143     // Append turn.
144     speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
145                                  begin_timestamp, end_timestamp, turn.gain);
146 
147     // Save speaking turn index for self cross-talk detection.
148     RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
149     speaking_turn_indices[turn.speaker_name].push_back(turn_index);
150 
151     // Update total duration of the consversational speech.
152     if (total_duration_samples_ < end_timestamp)
153       total_duration_samples_ = end_timestamp;
154 
155     // Update and continue with next turn.
156     second_last_turn = last_turn;
157     last_turn.begin = begin_timestamp;
158     last_turn.end = end_timestamp;
159   }
160 
161   // Detect self cross-talk.
162   for (const std::string& speaker_name : speaker_names_) {
163     RTC_LOG(LS_INFO) << "checking self cross-talk for <" << speaker_name << ">";
164 
165     // Copy all turns for this speaker to new vector.
166     std::vector<SpeakingTurn> speaking_turns_for_name;
167     std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
168                  std::back_inserter(speaking_turns_for_name),
169                  [&speaker_name](const SpeakingTurn& st) {
170                    return st.speaker_name == speaker_name;
171                  });
172 
173     // Check for overlap between adjacent elements.
174     // This is a sufficient condition for self cross-talk since the intervals
175     // are sorted by begin timestamp.
176     auto overlap = std::adjacent_find(
177         speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
178         [](const SpeakingTurn& a, const SpeakingTurn& b) {
179           return a.end > b.begin;
180         });
181 
182     if (overlap != speaking_turns_for_name.end()) {
183       RTC_LOG(LS_ERROR) << "Self cross-talk detected";
184       return false;
185     }
186   }
187 
188   return true;
189 }
190 
191 }  // namespace conversational_speech
192 }  // namespace test
193 }  // namespace webrtc
194