1 /*
2  *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 // This file consists of unit tests for webrtc::test::conversational_speech
12 // members. Part of them focus on accepting or rejecting different
// conversational speech setups. A setup is defined by a set of audio tracks and
// timing information.
15 // The docstring at the beginning of each TEST(ConversationalSpeechTest,
16 // MultiEndCallSetup*) function looks like the drawing below and indicates which
17 // setup is tested.
18 //
19 //    Accept:
20 //    A 0****.....
21 //    B .....1****
22 //
23 // The drawing indicates the following:
24 // - the illustrated setup should be accepted,
25 // - there are two speakers (namely, A and B),
// - A speaks first, B speaks second,
27 // - each character after the speaker's letter indicates a time unit (e.g., 100
28 //   ms),
29 // - "*" indicates speaking, "." listening,
30 // - numbers indicate the turn index in std::vector<Turn>.
31 //
32 // Note that the same speaker can appear in multiple lines in order to depict
33 // cases in which there are wrong offsets leading to self cross-talk (which is
34 // rejected).
35 
36 // MSVC++ requires this to be set before any other includes to get M_PI.
37 #define _USE_MATH_DEFINES
38 
39 #include <stdio.h>
40 
41 #include <cmath>
42 #include <map>
43 #include <memory>
44 #include <vector>
45 
46 #include "absl/strings/string_view.h"
47 #include "absl/types/optional.h"
48 #include "common_audio/wav_file.h"
49 #include "modules/audio_processing/test/conversational_speech/config.h"
50 #include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
51 #include "modules/audio_processing/test/conversational_speech/multiend_call.h"
52 #include "modules/audio_processing/test/conversational_speech/simulator.h"
53 #include "modules/audio_processing/test/conversational_speech/timing.h"
54 #include "modules/audio_processing/test/conversational_speech/wavreader_factory.h"
55 #include "rtc_base/logging.h"
56 #include "test/gmock.h"
57 #include "test/gtest.h"
58 #include "test/testsupport/file_utils.h"
59 
60 namespace webrtc {
61 namespace test {
62 namespace {
63 
64 using conversational_speech::LoadTiming;
65 using conversational_speech::MockWavReaderFactory;
66 using conversational_speech::MultiEndCall;
67 using conversational_speech::SaveTiming;
68 using conversational_speech::Turn;
69 using conversational_speech::WavReaderFactory;
70 
// Placeholder locations shared by the tests below. They are passed to Config
// and (audiotracks_path only) to MultiEndCall instances that are given a mock
// wav reader factory.
const char* const audiotracks_path = "/path/to/audiotracks";
const char* const timing_filepath = "/path/to/timing_file.txt";
const char* const output_path = "/path/to/output_dir";
74 
75 const std::vector<Turn> expected_timing = {
76     {"A", "a1", 0, 0},    {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
77     {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
78 };
79 const std::size_t kNumberOfTurns = expected_timing.size();
80 
81 // Default arguments for MockWavReaderFactory ctor.
82 // Fake audio track parameters.
83 constexpr int kDefaultSampleRate = 48000;
84 const std::map<std::string, const MockWavReaderFactory::Params>
85     kDefaultMockWavReaderFactoryParamsMap = {
86         {"t300", {kDefaultSampleRate, 1u, 14400u}},   // Mono, 0.3 seconds.
87         {"t500", {kDefaultSampleRate, 1u, 24000u}},   // Mono, 0.5 seconds.
88         {"t1000", {kDefaultSampleRate, 1u, 48000u}},  // Mono, 1.0 seconds.
89         {"sr8000", {8000, 1u, 8000u}},     // 8kHz sample rate, mono, 1 second.
90         {"sr16000", {16000, 1u, 16000u}},  // 16kHz sample rate, mono, 1 second.
91         {"sr16000_stereo", {16000, 2u, 16000u}},  // Like sr16000, but stereo.
92 };
93 const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
94     kDefaultMockWavReaderFactoryParamsMap.at("t500");
95 
CreateMockWavReaderFactory()96 std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
97   return std::unique_ptr<MockWavReaderFactory>(
98       new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
99                                kDefaultMockWavReaderFactoryParamsMap));
100 }
101 
CreateSineWavFile(absl::string_view filepath,const MockWavReaderFactory::Params & params,float frequency=440.0f)102 void CreateSineWavFile(absl::string_view filepath,
103                        const MockWavReaderFactory::Params& params,
104                        float frequency = 440.0f) {
105   // Create samples.
106   constexpr double two_pi = 2.0 * M_PI;
107   std::vector<int16_t> samples(params.num_samples);
108   for (std::size_t i = 0; i < params.num_samples; ++i) {
109     // TODO(alessiob): the produced tone is not pure, improve.
110     samples[i] = std::lround(
111         32767.0f * std::sin(two_pi * i * frequency / params.sample_rate));
112   }
113 
114   // Write samples.
115   WavWriter wav_writer(filepath, params.sample_rate, params.num_channels);
116   wav_writer.WriteSamples(samples.data(), params.num_samples);
117 }
118 
119 // Parameters to generate audio tracks with CreateSineWavFile.
120 struct SineAudioTrackParams {
121   MockWavReaderFactory::Params params;
122   float frequency;
123 };
124 
125 // Creates a temporary directory in which sine audio tracks are written.
CreateTemporarySineAudioTracks(const std::map<std::string,SineAudioTrackParams> & sine_tracks_params)126 std::string CreateTemporarySineAudioTracks(
127     const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
128   // Create temporary directory.
129   std::string temp_directory =
130       OutputPath() + "TempConversationalSpeechAudioTracks";
131   CreateDir(temp_directory);
132 
133   // Create sine tracks.
134   for (const auto& it : sine_tracks_params) {
135     const std::string temp_filepath = JoinFilename(temp_directory, it.first);
136     CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency);
137   }
138 
139   return temp_directory;
140 }
141 
CheckAudioTrackParams(const WavReaderFactory & wav_reader_factory,absl::string_view filepath,const MockWavReaderFactory::Params & expeted_params)142 void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
143                            absl::string_view filepath,
144                            const MockWavReaderFactory::Params& expeted_params) {
145   auto wav_reader = wav_reader_factory.Create(filepath);
146   EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
147   EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
148   EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
149 }
150 
DeleteFolderAndContents(absl::string_view dir)151 void DeleteFolderAndContents(absl::string_view dir) {
152   if (!DirExists(dir)) {
153     return;
154   }
155   absl::optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
156   EXPECT_TRUE(dir_content);
157   for (const auto& path : *dir_content) {
158     if (DirExists(path)) {
159       DeleteFolderAndContents(path);
160     } else if (FileExists(path)) {
161       // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
162       RemoveFile(path);
163     } else {
164       FAIL();
165     }
166   }
167   // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
168   RemoveDir(dir);
169 }
170 
171 }  // namespace
172 
173 using ::testing::_;
174 
// Checks that the Config getters return the values passed to the ctor.
TEST(ConversationalSpeechTest, Settings) {
  const conversational_speech::Config settings(audiotracks_path,
                                               timing_filepath, output_path);

  EXPECT_EQ(audiotracks_path, settings.audiotracks_path());
  EXPECT_EQ(timing_filepath, settings.timing_filepath());
  EXPECT_EQ(output_path, settings.output_path());
}
184 
// Checks that a timing saved with SaveTiming() is read back unchanged by
// LoadTiming().
TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save the reference timing to a temporary file.
  const std::string temporary_filepath =
      TempFilename(OutputPath(), "TempTimingTestFile");
  SaveTiming(temporary_filepath, expected_timing);

  // Load it back and remove the temporary file before any assertion, so that
  // the file is deleted even if a check below fails.
  const std::vector<Turn> actual_timing = LoadTiming(temporary_filepath);
  RemoveFile(temporary_filepath);

  // The size check is fatal: the loop below indexes `actual_timing` with the
  // indices of `expected_timing`, which would read out of bounds if fewer
  // turns were loaded.
  ASSERT_EQ(expected_timing.size(), actual_timing.size());

  // Compare the turns one by one.
  for (size_t index = 0; index < expected_timing.size(); ++index) {
    EXPECT_EQ(expected_timing[index], actual_timing[index])
        << "turn #" << index << " not matching";
  }
}
204 
// Builds a MultiEndCall from the reference timing using a mock wav reader
// factory and checks the number of speakers, readers and turns.
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  auto factory = CreateMockWavReaderFactory();

  // `expected_timing` references 5 unique audio tracks.
  EXPECT_CALL(*factory, Create(_)).Times(5);

  // The call takes ownership of the mock factory.
  MultiEndCall call(expected_timing, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(5u, call.audiotrack_readers().size());
  EXPECT_EQ(6u, call.speaking_turns().size());
}
221 
// Reject: the two audio tracks have different sample rates (8 kHz, 16 kHz).
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000", 0, 0},
  };

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
236 
// Reject: the audio track used by both speakers is stereo.
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  const std::vector<Turn> turns = {
      {"A", "sr16000_stereo", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
251 
// Reject: the audio tracks differ in sample rate (8 kHz vs 16 kHz) and one of
// them is stereo.
TEST(ConversationalSpeechTest,
     MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
267 
// Reject: the first turn has a negative offset (-100).
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  const std::vector<Turn> turns = {
      {"A", "t500", -100, 0},
      {"B", "t500", 0, 0},
  };

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
282 
TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
  };
  // Expected total duration in samples (1 second at the default rate).
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate;

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure and its duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
307 
TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 200, 0},
  };
  // Expected total duration in samples (1.2 seconds at the default rate).
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure and its duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
332 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -100, 0},
  };
  // Expected total duration in samples (0.9 seconds at the default rate).
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 0.9;

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure and its duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
357 
TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
  const std::vector<Turn> turns = {
      {"A", "t500", 200, 0},
      {"B", "t500", -600, 0},
  };

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
375 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t1000", -200, 0},
      {"A", "t500", -800, 0},
  };
  // Expected total duration in samples (1.3 seconds at the default rate).
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.3;

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure and its duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
401 
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
  // B ......2****
  //      ^  Turn #1 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"A", "t500", -200, 0},
      {"B", "t500", -200, 0},
  };

  // All the turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
422 
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
  // C ...2**....
  // A ......3**.
  //         ^  Turn #3 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t300", -1000, 0},
      {"C", "t300", 0, 0},
      {"A", "t300", 0, 0},
  };

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
445 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
  // C .......2****
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", 0, 0},
  };
  // Expected total duration in samples (1.2 seconds at the default rate).
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure and its duration.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
472 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
  // C ....2****.
  //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //          not permitted).
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", -300, 0},
  };

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
494 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept:
  // A 0*********..
  // B .1****......
  // C .......2****
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -900, 0},
      {"C", "t500", 100, 0},
  };
  // Expected total duration in samples (1.2 seconds at the default rate).
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;

  // Two unique audio tracks, hence two readers are requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure and its duration.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
521 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -500, 0},
  };

  // Both turns use the same audio track, hence one reader is requested.
  auto factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check the call structure.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
}
544 
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
  // C ......2**.......6**
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
      {"C", "t300", -200, 0},
  };
  // Same helper as the other tests instead of duplicating the factory setup.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Check the call structure and its duration (in samples).
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
573 
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject:
  // A 0****....3****5**.
  // B .....1****...4**..
  // C ......2**......6**
  //                ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                  speakers not permitted).
  // NOTE(review): the drawing assumes, as in the other tests of this file,
  // that each offset is relative to the end of the previous turn.
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
      {"C", "t300", -200, 0},
  };
  // Same helper as the other tests instead of duplicating the factory setup.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}
597 
// For a set of sample rates, writes a mono sine wav file and checks that the
// reader created by WavReaderFactory reports the parameters used to write it.
TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  constexpr int kDurationSeconds = 5;
  const int kSampleRates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};

  for (int rate : kSampleRates) {
    const std::string wav_filepath =
        OutputPath() + "TempSineWavFile_" + std::to_string(rate) + ".wav";

    // Generate the file.
    const std::size_t num_samples = kDurationSeconds * rate;
    const MockWavReaderFactory::Params written_params = {rate, 1u,
                                                         num_samples};
    CreateSineWavFile(wav_filepath, written_params);

    // Read it back; the same parameters are expected.
    WavReaderFactory wav_reader_factory;
    CheckAudioTrackParams(wav_reader_factory, wav_filepath, written_params);

    // Remove the temporary file.
    RemoveFile(wav_filepath);
  }
}
622 
// End-to-end check of the simulator (disabled): generates sine audio tracks,
// simulates a two-speaker call and verifies sample rate, channel count and
// length of the near-end/far-end files produced for each speaker.
TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  // Local names differ from the file-level `expected_timing`,
  // `audiotracks_path` and `output_path` to avoid shadowing them.
  const std::vector<Turn> timing = {
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", 500, 0},
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", -2500, 0},
  };
  const std::size_t expected_duration_seconds = 18;

  // Create temporary audio track files (5 seconds each).
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string tracks_dir =
      CreateTemporarySineAudioTracks(sine_tracks_params);

  // Set up the multi-end call with a real wav reader factory.
  MultiEndCall multiend_call(timing, tracks_dir,
                             std::make_unique<WavReaderFactory>());

  // Simulate the call.
  const std::string simulator_output_dir = JoinFilename(tracks_dir, "output");
  CreateDir(simulator_output_dir);
  RTC_LOG(LS_VERBOSE) << "simulator output path: " << simulator_output_dir;
  auto generated_audiotrack_pairs =
      conversational_speech::Simulate(multiend_call, simulator_output_dir);

  // Check the output. The result is only dereferenced when it is set; a
  // missing result is reported as a non-fatal failure so that the temporary
  // files below are still cleaned up.
  if (generated_audiotrack_pairs) {
    EXPECT_EQ(2u, generated_audiotrack_pairs->size());

    WavReaderFactory wav_reader_factory;
    const MockWavReaderFactory::Params expected_params = {
        sample_rate, 1u, sample_rate * expected_duration_seconds};
    for (const auto& it : *generated_audiotrack_pairs) {
      RTC_LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
      CheckAudioTrackParams(wav_reader_factory, it.second.near_end,
                            expected_params);
      CheckAudioTrackParams(wav_reader_factory, it.second.far_end,
                            expected_params);
    }
  } else {
    ADD_FAILURE() << "Simulate() did not return any audio track pair";
  }

  // Clean.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(tracks_dir));
}
673 
674 }  // namespace test
675 }  // namespace webrtc
676