1 /*
2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 // This file consists of unit tests for webrtc::test::conversational_speech
12 // members. Part of them focus on accepting or rejecting different
// conversational speech setups. A setup is defined by a set of audio tracks and
// timing information.
15 // The docstring at the beginning of each TEST(ConversationalSpeechTest,
16 // MultiEndCallSetup*) function looks like the drawing below and indicates which
17 // setup is tested.
18 //
19 // Accept:
20 // A 0****.....
21 // B .....1****
22 //
23 // The drawing indicates the following:
24 // - the illustrated setup should be accepted,
25 // - there are two speakers (namely, A and B),
// - A speaks first, B speaks second,
27 // - each character after the speaker's letter indicates a time unit (e.g., 100
28 // ms),
29 // - "*" indicates speaking, "." listening,
30 // - numbers indicate the turn index in std::vector<Turn>.
31 //
32 // Note that the same speaker can appear in multiple lines in order to depict
33 // cases in which there are wrong offsets leading to self cross-talk (which is
34 // rejected).
35
36 // MSVC++ requires this to be set before any other includes to get M_PI.
37 #define _USE_MATH_DEFINES
38
39 #include <stdio.h>
40
41 #include <cmath>
42 #include <map>
43 #include <memory>
44 #include <vector>
45
46 #include "absl/strings/string_view.h"
47 #include "absl/types/optional.h"
48 #include "common_audio/wav_file.h"
49 #include "modules/audio_processing/test/conversational_speech/config.h"
50 #include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
51 #include "modules/audio_processing/test/conversational_speech/multiend_call.h"
52 #include "modules/audio_processing/test/conversational_speech/simulator.h"
53 #include "modules/audio_processing/test/conversational_speech/timing.h"
54 #include "modules/audio_processing/test/conversational_speech/wavreader_factory.h"
55 #include "rtc_base/logging.h"
56 #include "test/gmock.h"
57 #include "test/gtest.h"
58 #include "test/testsupport/file_utils.h"
59
60 namespace webrtc {
61 namespace test {
62 namespace {
63
64 using conversational_speech::LoadTiming;
65 using conversational_speech::MockWavReaderFactory;
66 using conversational_speech::MultiEndCall;
67 using conversational_speech::SaveTiming;
68 using conversational_speech::Turn;
69 using conversational_speech::WavReaderFactory;
70
// Placeholder paths handed to conversational_speech::Config and MultiEndCall
// by the tests below.
constexpr const char* audiotracks_path = "/path/to/audiotracks";
constexpr const char* timing_filepath = "/path/to/timing_file.txt";
constexpr const char* output_path = "/path/to/output_dir";
74
75 const std::vector<Turn> expected_timing = {
76 {"A", "a1", 0, 0}, {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
77 {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
78 };
79 const std::size_t kNumberOfTurns = expected_timing.size();
80
81 // Default arguments for MockWavReaderFactory ctor.
82 // Fake audio track parameters.
83 constexpr int kDefaultSampleRate = 48000;
84 const std::map<std::string, const MockWavReaderFactory::Params>
85 kDefaultMockWavReaderFactoryParamsMap = {
86 {"t300", {kDefaultSampleRate, 1u, 14400u}}, // Mono, 0.3 seconds.
87 {"t500", {kDefaultSampleRate, 1u, 24000u}}, // Mono, 0.5 seconds.
88 {"t1000", {kDefaultSampleRate, 1u, 48000u}}, // Mono, 1.0 seconds.
89 {"sr8000", {8000, 1u, 8000u}}, // 8kHz sample rate, mono, 1 second.
90 {"sr16000", {16000, 1u, 16000u}}, // 16kHz sample rate, mono, 1 second.
91 {"sr16000_stereo", {16000, 2u, 16000u}}, // Like sr16000, but stereo.
92 };
93 const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
94 kDefaultMockWavReaderFactoryParamsMap.at("t500");
95
CreateMockWavReaderFactory()96 std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
97 return std::unique_ptr<MockWavReaderFactory>(
98 new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
99 kDefaultMockWavReaderFactoryParamsMap));
100 }
101
CreateSineWavFile(absl::string_view filepath,const MockWavReaderFactory::Params & params,float frequency=440.0f)102 void CreateSineWavFile(absl::string_view filepath,
103 const MockWavReaderFactory::Params& params,
104 float frequency = 440.0f) {
105 // Create samples.
106 constexpr double two_pi = 2.0 * M_PI;
107 std::vector<int16_t> samples(params.num_samples);
108 for (std::size_t i = 0; i < params.num_samples; ++i) {
109 // TODO(alessiob): the produced tone is not pure, improve.
110 samples[i] = std::lround(
111 32767.0f * std::sin(two_pi * i * frequency / params.sample_rate));
112 }
113
114 // Write samples.
115 WavWriter wav_writer(filepath, params.sample_rate, params.num_channels);
116 wav_writer.WriteSamples(samples.data(), params.num_samples);
117 }
118
119 // Parameters to generate audio tracks with CreateSineWavFile.
120 struct SineAudioTrackParams {
121 MockWavReaderFactory::Params params;
122 float frequency;
123 };
124
125 // Creates a temporary directory in which sine audio tracks are written.
CreateTemporarySineAudioTracks(const std::map<std::string,SineAudioTrackParams> & sine_tracks_params)126 std::string CreateTemporarySineAudioTracks(
127 const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
128 // Create temporary directory.
129 std::string temp_directory =
130 OutputPath() + "TempConversationalSpeechAudioTracks";
131 CreateDir(temp_directory);
132
133 // Create sine tracks.
134 for (const auto& it : sine_tracks_params) {
135 const std::string temp_filepath = JoinFilename(temp_directory, it.first);
136 CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency);
137 }
138
139 return temp_directory;
140 }
141
CheckAudioTrackParams(const WavReaderFactory & wav_reader_factory,absl::string_view filepath,const MockWavReaderFactory::Params & expeted_params)142 void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
143 absl::string_view filepath,
144 const MockWavReaderFactory::Params& expeted_params) {
145 auto wav_reader = wav_reader_factory.Create(filepath);
146 EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
147 EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
148 EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
149 }
150
DeleteFolderAndContents(absl::string_view dir)151 void DeleteFolderAndContents(absl::string_view dir) {
152 if (!DirExists(dir)) {
153 return;
154 }
155 absl::optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
156 EXPECT_TRUE(dir_content);
157 for (const auto& path : *dir_content) {
158 if (DirExists(path)) {
159 DeleteFolderAndContents(path);
160 } else if (FileExists(path)) {
161 // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
162 RemoveFile(path);
163 } else {
164 FAIL();
165 }
166 }
167 // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
168 RemoveDir(dir);
169 }
170
171 } // namespace
172
173 using ::testing::_;
174
TEST(ConversationalSpeechTest, Settings) {
  // Build a configuration from the three placeholder paths.
  const conversational_speech::Config settings(audiotracks_path,
                                               timing_filepath, output_path);

  // Every getter must give back the value passed at construction.
  EXPECT_EQ(audiotracks_path, settings.audiotracks_path());
  EXPECT_EQ(timing_filepath, settings.timing_filepath());
  EXPECT_EQ(output_path, settings.output_path());
}
184
// Checks that a timing saved with SaveTiming is read back unchanged by
// LoadTiming.
TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save test timing.
  const std::string temporary_filepath =
      TempFilename(OutputPath(), "TempTimingTestFile");
  SaveTiming(temporary_filepath, expected_timing);

  // Create a std::vector<Turn> instance by loading from file, then remove the
  // temporary file right away so that it is not left behind on failure.
  std::vector<Turn> actual_timing = LoadTiming(temporary_filepath);
  RemoveFile(temporary_filepath);

  // Check size. This must be fatal: the loop below indexes `actual_timing`
  // with indices valid for `expected_timing`, which would read out of bounds
  // if fewer turns were loaded.
  ASSERT_EQ(expected_timing.size(), actual_timing.size());

  // Check Turn instances.
  for (size_t index = 0; index < expected_timing.size(); ++index) {
    EXPECT_EQ(expected_timing[index], actual_timing[index])
        << "turn #" << index << " not matching";
  }
}
204
// Checks that a MultiEndCall built from `expected_timing` is valid and exposes
// the expected number of speakers, audio track readers and speaking turns.
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are 5 unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);

  // Inject the mock wav reader factory.
  MultiEndCall multiend_call(expected_timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
  // One speaking turn per entry of `expected_timing`; the shared constant is
  // used instead of repeating the literal.
  EXPECT_EQ(kNumberOfTurns, multiend_call.speaking_turns().size());
}
221
// A setup mixing an 8 kHz track and a 16 kHz track must be rejected.
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000", 0, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Two distinct audio tracks, hence two readers are expected to be created.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
236
// A setup using a two-channel track must be rejected.
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  const std::vector<Turn> turns = {
      {"A", "sr16000_stereo", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
251
// A setup mixing an 8 kHz mono track and a 16 kHz two-channel track must be
// rejected.
TEST(ConversationalSpeechTest,
     MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Two distinct audio tracks, hence two readers are expected to be created.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
267
// A setup whose first turn has a negative offset must be rejected.
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  const std::vector<Turn> turns = {
      {"A", "t500", -100, 0},
      {"B", "t500", 0, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
282
TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
  };
  // Two 0.5 s turns one after the other: one second worth of samples.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call exposes.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
307
TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 200, 0},
  };
  // Two 0.5 s turns separated by a 200 ms pause.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call exposes.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
332
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -100, 0},
  };
  // Two 0.5 s turns overlapping for 100 ms.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 0.9;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call exposes.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
357
TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****. The n-th turn cannot start before the (n-1)-th one.
  const std::vector<Turn> turns = {
      {"A", "t500", 200, 0},
      {"B", "t500", -600, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
375
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t1000", -200, 0},
      {"A", "t500", -800, 0},
  };
  // The call ends when the turn of B does, i.e., after 1.3 s.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.3;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Two distinct audio tracks, hence two readers are expected to be created.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call exposes.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
401
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
  // B ......2****
  //      ^  Turn #1 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"A", "t500", -200, 0},
      {"B", "t500", -200, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // All the turns use the same audio track, hence a single reader is expected
  // to be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
422
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
  // C ...2**....
  // A ......3**.
  //         ^  Turn #3 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t300", -1000, 0},
      {"C", "t300", 0, 0},
      {"A", "t300", 0, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Two distinct audio tracks, hence two readers are expected to be created.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
445
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
  // C .......2****
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", 0, 0},
  };
  // The call ends when the turn of C does, i.e., after 1.2 s.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Two distinct audio tracks, hence two readers are expected to be created.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call exposes.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
472
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
  // C ....2****.
  //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //          not permitted).
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", -300, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Two distinct audio tracks, hence two readers are expected to be created.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
494
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept:
  // A 0*********..
  // B .1****......
  // C .......2****
  // (The digits are the indices of the turns in `timing`, which only holds
  // turns 0, 1 and 2.)
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -900, 0},
      {"C", "t500", 100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
521
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -500, 0},
  };
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // Both turns use the same audio track, hence a single reader is expected to
  // be created.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call exposes.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
}
544
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
  // C ......2**.......6**..
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},
      {"C", "t300", -400, 0}, {"A", "t500", 0, 0},
      {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
      {"C", "t300", -200, 0},
  };
  // Use the shared helper, like the other tests, rather than duplicating the
  // construction of the mock factory.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
573
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject:
  // A 0****....3****.6**
  // B .....1****...4**..
  // C ......2**.....5**..
  //                 ^  Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                    speakers not permitted).
  // NOTE(review): in `timing` below turn #5 belongs to A and turn #6 to C,
  // whereas the drawing labels them the other way around - confirm and fix
  // the drawing.
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},
      {"C", "t300", -400, 0}, {"A", "t500", 0, 0},
      {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
      {"C", "t300", -200, 0},
  };
  // Use the shared helper, like the other tests, rather than duplicating the
  // construction of the mock factory.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}
597
// For a set of sample rates, writes a 5 s mono sine wav file and checks that a
// reader created by the real WavReaderFactory reports the parameters used to
// write it.
TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  constexpr int kDurationSeconds = 5;
  const int kSampleRates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};

  for (const int rate : kSampleRates) {
    // Parameters used both to write the file and to check what is read back.
    const std::size_t expected_num_samples = kDurationSeconds * rate;
    const MockWavReaderFactory::Params track_params = {rate, 1u,
                                                       expected_num_samples};

    // Write the wav file.
    const std::string wav_filepath =
        OutputPath() + "TempSineWavFile_" + std::to_string(rate) + ".wav";
    CreateSineWavFile(wav_filepath, track_params);

    // Read it back and compare.
    WavReaderFactory real_factory;
    CheckAudioTrackParams(real_factory, wav_filepath, track_params);

    // Remove the temporary file.
    RemoveFile(wav_filepath);
  }
}
622
// Simulates a two-speaker call on sine audio tracks written to disk and checks
// the parameters of the generated near-end and far-end tracks.
TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  // Local names are chosen so that they do not shadow the file-level
  // `expected_timing`, `audiotracks_path` and `output_path`.
  const std::vector<Turn> timing = {
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", 500, 0},
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", -2500, 0},
  };
  const std::size_t expected_duration_seconds = 18;

  // Create temporary audio track files.
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string tracks_dir =
      CreateTemporarySineAudioTracks(sine_tracks_params);

  // Set up the multi-end call.
  auto wavreader_factory = std::make_unique<WavReaderFactory>();
  MultiEndCall multiend_call(timing, tracks_dir, std::move(wavreader_factory));

  // Simulate the call.
  const std::string simulator_output_dir = JoinFilename(tracks_dir, "output");
  CreateDir(simulator_output_dir);
  RTC_LOG(LS_VERBOSE) << "simulator output path: " << simulator_output_dir;
  auto generated_audiotrak_pairs =
      conversational_speech::Simulate(multiend_call, simulator_output_dir);

  // The result is dereferenced below: bail out (after cleaning up) instead of
  // dereferencing a null pointer if the simulation produced nothing.
  if (generated_audiotrak_pairs == nullptr) {
    DeleteFolderAndContents(tracks_dir);
    FAIL() << "Simulate() returned a null result";
  }
  EXPECT_EQ(2u, generated_audiotrak_pairs->size());

  // Check the output.
  WavReaderFactory wav_reader_factory;
  const MockWavReaderFactory::Params expeted_params = {
      sample_rate, 1u, sample_rate * expected_duration_seconds};
  for (const auto& it : *generated_audiotrak_pairs) {
    RTC_LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
    CheckAudioTrackParams(wav_reader_factory, it.second.near_end,
                          expeted_params);
    CheckAudioTrackParams(wav_reader_factory, it.second.far_end,
                          expeted_params);
  }

  // Clean.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(tracks_dir));
}
673
674 } // namespace test
675 } // namespace webrtc
676