xref: /aosp_15_r20/external/grpc-grpc/test/cpp/end2end/flaky_network_test.cc (revision cc02d7e222339f7a4f6ba5f422e6413f4bd931f2)
1 //
2 //
3 // Copyright 2019 gRPC authors.
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License");
6 // you may not use this file except in compliance with the License.
7 // You may obtain a copy of the License at
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 //
18 
19 #include <algorithm>
20 #include <condition_variable>
21 #include <memory>
22 #include <mutex>
23 #include <random>
24 #include <thread>
25 
26 #include <gtest/gtest.h>
27 
28 #include "absl/memory/memory.h"
29 
30 #include <grpc/grpc.h>
31 #include <grpc/support/alloc.h>
32 #include <grpc/support/atm.h>
33 #include <grpc/support/log.h>
34 #include <grpc/support/port_platform.h>
35 #include <grpc/support/string_util.h>
36 #include <grpc/support/time.h>
37 #include <grpcpp/channel.h>
38 #include <grpcpp/client_context.h>
39 #include <grpcpp/create_channel.h>
40 #include <grpcpp/health_check_service_interface.h>
41 #include <grpcpp/server.h>
42 #include <grpcpp/server_builder.h>
43 
44 #include "src/core/lib/backoff/backoff.h"
45 #include "src/core/lib/gprpp/crash.h"
46 #include "src/core/lib/gprpp/env.h"
47 #include "src/proto/grpc/testing/echo.grpc.pb.h"
48 #include "test/core/util/port.h"
49 #include "test/core/util/test_config.h"
50 #include "test/cpp/end2end/test_service_impl.h"
51 #include "test/cpp/util/test_credentials_provider.h"
52 
53 #ifdef GPR_LINUX
54 
55 namespace grpc {
56 namespace testing {
57 namespace {
58 
// One parameterization of the test suite: the credentials type used by both
// client and server, and the message payload carried by each echo RPC.
struct TestScenario {
  // Both fields are immutable once the scenario has been constructed.
  const std::string credentials_type;
  const std::string message_content;

  TestScenario(const std::string& creds_type, const std::string& content)
      : credentials_type(creds_type), message_content(content) {}
};
65 
66 class FlakyNetworkTest : public ::testing::TestWithParam<TestScenario> {
67  protected:
FlakyNetworkTest()68   FlakyNetworkTest()
69       : server_host_("grpctest"),
70         interface_("lo:1"),
71         ipv4_address_("10.0.0.1"),
72         netmask_("/32") {}
73 
InterfaceUp()74   void InterfaceUp() {
75     std::ostringstream cmd;
76     // create interface_ with address ipv4_address_
77     cmd << "ip addr add " << ipv4_address_ << netmask_ << " dev " << interface_;
78     std::system(cmd.str().c_str());
79   }
80 
InterfaceDown()81   void InterfaceDown() {
82     std::ostringstream cmd;
83     // remove interface_
84     cmd << "ip addr del " << ipv4_address_ << netmask_ << " dev " << interface_;
85     std::system(cmd.str().c_str());
86   }
87 
DNSUp()88   void DNSUp() {
89     std::ostringstream cmd;
90     // Add DNS entry for server_host_ in /etc/hosts
91     cmd << "echo '" << ipv4_address_ << "      " << server_host_
92         << "' >> /etc/hosts";
93     std::system(cmd.str().c_str());
94   }
95 
DNSDown()96   void DNSDown() {
97     std::ostringstream cmd;
98     // Remove DNS entry for server_host_ from /etc/hosts
99     // NOTE: we can't do this in one step with sed -i because when we are
100     // running under docker, the file is mounted by docker so we can't change
101     // its inode from within the container (sed -i creates a new file and
102     // replaces the old file, which changes the inode)
103     cmd << "sed  '/" << server_host_ << "/d' /etc/hosts > /etc/hosts.orig";
104     std::system(cmd.str().c_str());
105 
106     // clear the stream
107     cmd.str("");
108 
109     cmd << "cat /etc/hosts.orig > /etc/hosts";
110     std::system(cmd.str().c_str());
111   }
112 
DropPackets()113   void DropPackets() {
114     std::ostringstream cmd;
115     // drop packets with src IP = ipv4_address_
116     cmd << "iptables -A INPUT -s " << ipv4_address_ << " -j DROP";
117 
118     std::system(cmd.str().c_str());
119     // clear the stream
120     cmd.str("");
121 
122     // drop packets with dst IP = ipv4_address_
123     cmd << "iptables -A INPUT -d " << ipv4_address_ << " -j DROP";
124   }
125 
RestoreNetwork()126   void RestoreNetwork() {
127     std::ostringstream cmd;
128     // remove iptables rule to drop packets with src IP = ipv4_address_
129     cmd << "iptables -D INPUT -s " << ipv4_address_ << " -j DROP";
130     std::system(cmd.str().c_str());
131     // clear the stream
132     cmd.str("");
133     // remove iptables rule to drop packets with dest IP = ipv4_address_
134     cmd << "iptables -D INPUT -d " << ipv4_address_ << " -j DROP";
135   }
136 
FlakeNetwork()137   void FlakeNetwork() {
138     std::ostringstream cmd;
139     // Emulate a flaky network connection over interface_. Add a delay of 100ms
140     // +/- 20ms, 0.1% packet loss, 1% duplicates and 0.01% corrupt packets.
141     cmd << "tc qdisc replace dev " << interface_
142         << " root netem delay 100ms 20ms distribution normal loss 0.1% "
143            "duplicate "
144            "0.1% corrupt 0.01% ";
145     std::system(cmd.str().c_str());
146   }
147 
UnflakeNetwork()148   void UnflakeNetwork() {
149     // Remove simulated network flake on interface_
150     std::ostringstream cmd;
151     cmd << "tc qdisc del dev " << interface_ << " root netem";
152     std::system(cmd.str().c_str());
153   }
154 
NetworkUp()155   void NetworkUp() {
156     InterfaceUp();
157     DNSUp();
158   }
159 
NetworkDown()160   void NetworkDown() {
161     InterfaceDown();
162     DNSDown();
163   }
164 
SetUp()165   void SetUp() override {
166     NetworkUp();
167     grpc_init();
168     StartServer();
169   }
170 
TearDown()171   void TearDown() override {
172     NetworkDown();
173     StopServer();
174     grpc_shutdown();
175   }
176 
StartServer()177   void StartServer() {
178     // TODO (pjaikumar): Ideally, we should allocate the port dynamically using
179     // grpc_pick_unused_port_or_die(). That doesn't work inside some docker
180     // containers because port_server listens on localhost which maps to
181     // ip6-looopback, but ipv6 support is not enabled by default in docker.
182     port_ = SERVER_PORT;
183 
184     server_ = std::make_unique<ServerData>(port_, GetParam().credentials_type);
185     server_->Start(server_host_);
186   }
StopServer()187   void StopServer() { server_->Shutdown(); }
188 
BuildStub(const std::shared_ptr<Channel> & channel)189   std::unique_ptr<grpc::testing::EchoTestService::Stub> BuildStub(
190       const std::shared_ptr<Channel>& channel) {
191     return grpc::testing::EchoTestService::NewStub(channel);
192   }
193 
BuildChannel(const std::string & lb_policy_name,ChannelArguments args=ChannelArguments ())194   std::shared_ptr<Channel> BuildChannel(
195       const std::string& lb_policy_name,
196       ChannelArguments args = ChannelArguments()) {
197     if (!lb_policy_name.empty()) {
198       args.SetLoadBalancingPolicyName(lb_policy_name);
199     }  // else, default to pick first
200     auto channel_creds = GetCredentialsProvider()->GetChannelCredentials(
201         GetParam().credentials_type, &args);
202     std::ostringstream server_address;
203     server_address << server_host_ << ":" << port_;
204     return CreateCustomChannel(server_address.str(), channel_creds, args);
205   }
206 
SendRpc(const std::unique_ptr<grpc::testing::EchoTestService::Stub> & stub,int timeout_ms=0,bool wait_for_ready=false)207   bool SendRpc(
208       const std::unique_ptr<grpc::testing::EchoTestService::Stub>& stub,
209       int timeout_ms = 0, bool wait_for_ready = false) {
210     auto response = std::make_unique<EchoResponse>();
211     EchoRequest request;
212     auto& msg = GetParam().message_content;
213     request.set_message(msg);
214     ClientContext context;
215     if (timeout_ms > 0) {
216       context.set_deadline(grpc_timeout_milliseconds_to_deadline(timeout_ms));
217       // Allow an RPC to be canceled (for deadline exceeded) after it has
218       // reached the server.
219       request.mutable_param()->set_skip_cancelled_check(true);
220     }
221     // See https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md for
222     // details of wait-for-ready semantics
223     if (wait_for_ready) {
224       context.set_wait_for_ready(true);
225     }
226     Status status = stub->Echo(&context, request, response.get());
227     auto ok = status.ok();
228     if (ok) {
229       gpr_log(GPR_DEBUG, "RPC succeeded");
230     } else {
231       gpr_log(GPR_DEBUG, "RPC failed: %s", status.error_message().c_str());
232     }
233     return ok;
234   }
235 
236   struct ServerData {
237     int port_;
238     const std::string creds_;
239     std::unique_ptr<Server> server_;
240     TestServiceImpl service_;
241     std::unique_ptr<std::thread> thread_;
242     bool server_ready_ = false;
243 
ServerDatagrpc::testing::__anon8a0e79be0111::FlakyNetworkTest::ServerData244     ServerData(int port, const std::string& creds)
245         : port_(port), creds_(creds) {}
246 
Startgrpc::testing::__anon8a0e79be0111::FlakyNetworkTest::ServerData247     void Start(const std::string& server_host) {
248       gpr_log(GPR_INFO, "starting server on port %d", port_);
249       std::mutex mu;
250       std::unique_lock<std::mutex> lock(mu);
251       std::condition_variable cond;
252       thread_ = std::make_unique<std::thread>(
253           std::bind(&ServerData::Serve, this, server_host, &mu, &cond));
254       cond.wait(lock, [this] { return server_ready_; });
255       server_ready_ = false;
256       gpr_log(GPR_INFO, "server startup complete");
257     }
258 
Servegrpc::testing::__anon8a0e79be0111::FlakyNetworkTest::ServerData259     void Serve(const std::string& server_host, std::mutex* mu,
260                std::condition_variable* cond) {
261       std::ostringstream server_address;
262       server_address << server_host << ":" << port_;
263       ServerBuilder builder;
264       auto server_creds =
265           GetCredentialsProvider()->GetServerCredentials(creds_);
266       builder.AddListeningPort(server_address.str(), server_creds);
267       builder.RegisterService(&service_);
268       server_ = builder.BuildAndStart();
269       std::lock_guard<std::mutex> lock(*mu);
270       server_ready_ = true;
271       cond->notify_one();
272     }
273 
Shutdowngrpc::testing::__anon8a0e79be0111::FlakyNetworkTest::ServerData274     void Shutdown() {
275       server_->Shutdown(grpc_timeout_milliseconds_to_deadline(0));
276       thread_->join();
277     }
278   };
279 
WaitForChannelNotReady(Channel * channel,int timeout_seconds=5)280   bool WaitForChannelNotReady(Channel* channel, int timeout_seconds = 5) {
281     const gpr_timespec deadline =
282         grpc_timeout_seconds_to_deadline(timeout_seconds);
283     grpc_connectivity_state state;
284     while ((state = channel->GetState(false /* try_to_connect */)) ==
285            GRPC_CHANNEL_READY) {
286       if (!channel->WaitForStateChange(state, deadline)) return false;
287     }
288     return true;
289   }
290 
WaitForChannelReady(Channel * channel,int timeout_seconds=5)291   bool WaitForChannelReady(Channel* channel, int timeout_seconds = 5) {
292     const gpr_timespec deadline =
293         grpc_timeout_seconds_to_deadline(timeout_seconds);
294     grpc_connectivity_state state;
295     while ((state = channel->GetState(true /* try_to_connect */)) !=
296            GRPC_CHANNEL_READY) {
297       if (!channel->WaitForStateChange(state, deadline)) return false;
298     }
299     return true;
300   }
301 
302  private:
303   const std::string server_host_;
304   const std::string interface_;
305   const std::string ipv4_address_;
306   const std::string netmask_;
307   std::unique_ptr<grpc::testing::EchoTestService::Stub> stub_;
308   std::unique_ptr<ServerData> server_;
309   const int SERVER_PORT = 32750;
310   int port_;
311 };
312 
CreateTestScenarios()313 std::vector<TestScenario> CreateTestScenarios() {
314   std::vector<TestScenario> scenarios;
315   std::vector<std::string> credentials_types;
316   std::vector<std::string> messages;
317 
318   credentials_types.push_back(kInsecureCredentialsType);
319   auto sec_list = GetCredentialsProvider()->GetSecureCredentialsTypeList();
320   for (auto sec = sec_list.begin(); sec != sec_list.end(); sec++) {
321     credentials_types.push_back(*sec);
322   }
323 
324   messages.push_back("��");
325   for (size_t k = 1; k < GRPC_DEFAULT_MAX_RECV_MESSAGE_LENGTH / 1024; k *= 32) {
326     std::string big_msg;
327     for (size_t i = 0; i < k * 1024; ++i) {
328       char c = 'a' + (i % 26);
329       big_msg += c;
330     }
331     messages.push_back(big_msg);
332   }
333   for (auto cred = credentials_types.begin(); cred != credentials_types.end();
334        ++cred) {
335     for (auto msg = messages.begin(); msg != messages.end(); msg++) {
336       scenarios.emplace_back(*cred, *msg);
337     }
338   }
339 
340   return scenarios;
341 }
342 
343 INSTANTIATE_TEST_SUITE_P(FlakyNetworkTest, FlakyNetworkTest,
344                          ::testing::ValuesIn(CreateTestScenarios()));
345 
346 // Network interface connected to server flaps
TEST_P(FlakyNetworkTest,NetworkTransition)347 TEST_P(FlakyNetworkTest, NetworkTransition) {
348   const int kKeepAliveTimeMs = 1000;
349   const int kKeepAliveTimeoutMs = 1000;
350   ChannelArguments args;
351   args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
352   args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
353   args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
354   args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
355 
356   auto channel = BuildChannel("pick_first", args);
357   auto stub = BuildStub(channel);
358   // Channel should be in READY state after we send an RPC
359   EXPECT_TRUE(SendRpc(stub));
360   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
361 
362   std::atomic_bool shutdown{false};
363   std::thread sender = std::thread([this, &stub, &shutdown]() {
364     while (true) {
365       if (shutdown.load()) {
366         return;
367       }
368       SendRpc(stub);
369       std::this_thread::sleep_for(std::chrono::milliseconds(1000));
370     }
371   });
372 
373   // bring down network
374   NetworkDown();
375   EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
376   // bring network interface back up
377   InterfaceUp();
378   std::this_thread::sleep_for(std::chrono::milliseconds(1000));
379   // Restore DNS entry for server
380   DNSUp();
381   EXPECT_TRUE(WaitForChannelReady(channel.get()));
382   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
383   shutdown.store(true);
384   sender.join();
385 }
386 
387 // Traffic to server server is blackholed temporarily with keepalives enabled
TEST_P(FlakyNetworkTest,ServerUnreachableWithKeepalive)388 TEST_P(FlakyNetworkTest, ServerUnreachableWithKeepalive) {
389   const int kKeepAliveTimeMs = 1000;
390   const int kKeepAliveTimeoutMs = 1000;
391   const int kReconnectBackoffMs = 1000;
392   ChannelArguments args;
393   args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
394   args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
395   args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
396   args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
397   // max time for a connection attempt
398   args.SetInt(GRPC_ARG_MIN_RECONNECT_BACKOFF_MS, kReconnectBackoffMs);
399   // max time between reconnect attempts
400   args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, kReconnectBackoffMs);
401 
402   gpr_log(GPR_DEBUG, "FlakyNetworkTest.ServerUnreachableWithKeepalive start");
403   auto channel = BuildChannel("pick_first", args);
404   auto stub = BuildStub(channel);
405   // Channel should be in READY state after we send an RPC
406   EXPECT_TRUE(SendRpc(stub));
407   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
408 
409   std::atomic_bool shutdown{false};
410   std::thread sender = std::thread([this, &stub, &shutdown]() {
411     while (true) {
412       if (shutdown.load()) {
413         return;
414       }
415       SendRpc(stub);
416       std::this_thread::sleep_for(std::chrono::milliseconds(1000));
417     }
418   });
419 
420   // break network connectivity
421   gpr_log(GPR_DEBUG, "Adding iptables rule to drop packets");
422   DropPackets();
423   std::this_thread::sleep_for(std::chrono::milliseconds(10000));
424   EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
425   // bring network interface back up
426   RestoreNetwork();
427   gpr_log(GPR_DEBUG, "Removed iptables rule to drop packets");
428   EXPECT_TRUE(WaitForChannelReady(channel.get()));
429   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
430   shutdown.store(true);
431   sender.join();
432   gpr_log(GPR_DEBUG, "FlakyNetworkTest.ServerUnreachableWithKeepalive end");
433 }
434 
435 //
436 // Traffic to server server is blackholed temporarily with keepalives disabled
TEST_P(FlakyNetworkTest,ServerUnreachableNoKeepalive)437 TEST_P(FlakyNetworkTest, ServerUnreachableNoKeepalive) {
438   auto channel = BuildChannel("pick_first", ChannelArguments());
439   auto stub = BuildStub(channel);
440   // Channel should be in READY state after we send an RPC
441   EXPECT_TRUE(SendRpc(stub));
442   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
443 
444   // break network connectivity
445   DropPackets();
446 
447   std::thread sender = std::thread([this, &stub]() {
448     // RPC with deadline should timeout
449     EXPECT_FALSE(SendRpc(stub, /*timeout_ms=*/500, /*wait_for_ready=*/true));
450     // RPC without deadline forever until call finishes
451     EXPECT_TRUE(SendRpc(stub, /*timeout_ms=*/0, /*wait_for_ready=*/true));
452   });
453 
454   std::this_thread::sleep_for(std::chrono::milliseconds(2000));
455   // bring network interface back up
456   RestoreNetwork();
457 
458   // wait for RPC to finish
459   sender.join();
460 }
461 
462 // Send RPCs over a flaky network connection
TEST_P(FlakyNetworkTest,FlakyNetwork)463 TEST_P(FlakyNetworkTest, FlakyNetwork) {
464   const int kKeepAliveTimeMs = 1000;
465   const int kKeepAliveTimeoutMs = 1000;
466   const int kMessageCount = 100;
467   ChannelArguments args;
468   args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
469   args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
470   args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
471   args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
472 
473   auto channel = BuildChannel("pick_first", args);
474   auto stub = BuildStub(channel);
475   // Channel should be in READY state after we send an RPC
476   EXPECT_TRUE(SendRpc(stub));
477   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
478 
479   // simulate flaky network (packet loss, corruption and delays)
480   FlakeNetwork();
481   for (int i = 0; i < kMessageCount; ++i) {
482     SendRpc(stub);
483   }
484   // remove network flakiness
485   UnflakeNetwork();
486   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
487 }
488 
489 // Server is shutdown gracefully and restarted. Client keepalives are enabled
TEST_P(FlakyNetworkTest,ServerRestartKeepaliveEnabled)490 TEST_P(FlakyNetworkTest, ServerRestartKeepaliveEnabled) {
491   const int kKeepAliveTimeMs = 1000;
492   const int kKeepAliveTimeoutMs = 1000;
493   ChannelArguments args;
494   args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
495   args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
496   args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
497   args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
498 
499   auto channel = BuildChannel("pick_first", args);
500   auto stub = BuildStub(channel);
501   // Channel should be in READY state after we send an RPC
502   EXPECT_TRUE(SendRpc(stub));
503   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
504 
505   // server goes down, client should detect server going down and calls should
506   // fail
507   StopServer();
508   EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
509   EXPECT_FALSE(SendRpc(stub));
510 
511   std::this_thread::sleep_for(std::chrono::milliseconds(1000));
512 
513   // server restarts, calls succeed
514   StartServer();
515   EXPECT_TRUE(WaitForChannelReady(channel.get()));
516   // EXPECT_TRUE(SendRpc(stub));
517 }
518 
519 // Server is shutdown gracefully and restarted. Client keepalives are enabled
TEST_P(FlakyNetworkTest,ServerRestartKeepaliveDisabled)520 TEST_P(FlakyNetworkTest, ServerRestartKeepaliveDisabled) {
521   auto channel = BuildChannel("pick_first", ChannelArguments());
522   auto stub = BuildStub(channel);
523   // Channel should be in READY state after we send an RPC
524   EXPECT_TRUE(SendRpc(stub));
525   EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
526 
527   // server sends GOAWAY when it's shutdown, so client attempts to reconnect
528   StopServer();
529   std::this_thread::sleep_for(std::chrono::milliseconds(1000));
530 
531   EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
532 
533   std::this_thread::sleep_for(std::chrono::milliseconds(1000));
534 
535   // server restarts, calls succeed
536   StartServer();
537   EXPECT_TRUE(WaitForChannelReady(channel.get()));
538 }
539 
540 }  // namespace
541 }  // namespace testing
542 }  // namespace grpc
543 #endif  // GPR_LINUX
544 
main(int argc,char ** argv)545 int main(int argc, char** argv) {
546   ::testing::InitGoogleTest(&argc, argv);
547   grpc::testing::TestEnvironment env(&argc, argv);
548   auto result = RUN_ALL_TESTS();
549   return result;
550 }
551