From 4d21903d3eceb6b8db3de368cc9ed51914ce553b Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Fri, 19 Aug 2022 15:03:08 +0200 Subject: [PATCH] Fix flakyness in failure_setup (#6205) In CI sometimes failure_setup will fail with the following error: ```diff SELECT master_add_node('localhost', :worker_2_proxy_port); -- an mitmproxy which forwards to the second worker - master_add_node ---------------------------------------------------------------------- - 2 -(1 row) - +ERROR: connection to the remote node localhost:9060 failed with the following error: could not connect to server: Connection refused + Is the server running on host "localhost" (127.0.0.1) and accepting + TCP/IP connections on port 9060? +could not connect to server: Connection refused + Is the server running on host "localhost" (127.0.0.1) and accepting + TCP/IP connections on port 9060? +could not connect to server: Cannot assign requested address + Is the server running on host "localhost" (::1) and accepting + TCP/IP connections on port 9060? diff -dU10 -w /home/circleci/project/src/test/regress/expected/failure_online_move_shard_placement.out /home/circleci/project/src/test/regress/results/failure_online_move_shard_placement.out ``` This then breaks all the tests run after it as well, because we're missing one worker node. Locally I was able to reproduce this error by sleeping for 10 seconds in the forked process sleep before actually starting mitmproxy. So I'm expecting what's happening in CI is that due to limited resources, mitmproxy is not up yet when we try to add its port as a workernode. This PR fixes this by waiting until mitmproxy is listening on its socket before actually starting to run our tests. This fixed it locally for me when I made the forked process sleep for 10 seconds before starting mitmproxy. In passing it also improves the detection and errors that we already had for the case where something was already listening on the mitmproxy port. Because both @gledis69 and me were changing things in our CI images at the same time this also includes a bump of the style checker tools. Closes #6200 (cherry picked from commit 25e5cf2e505fc93875f0bfb0aaade873433bffdf) --- .circleci/config.yml | 4 ++-- src/test/regress/pg_regress_multi.pl | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9b5b09b70..9de970791 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ orbs: parameters: image_suffix: type: string - default: '-v0aadde0' + default: '-vb4dd087' pg13_version: type: string default: '13.4' @@ -18,7 +18,7 @@ parameters: default: '13.4-14.0' style_checker_tools_version: type: string - default: '0.7.9' + default: '0.8.18' jobs: build: description: Build the citus extension diff --git a/src/test/regress/pg_regress_multi.pl b/src/test/regress/pg_regress_multi.pl index aec7e71d5..3a2adde77 100755 --- a/src/test/regress/pg_regress_multi.pl +++ b/src/test/regress/pg_regress_multi.pl @@ -769,18 +769,17 @@ if ($useMitmproxy) die "a file already exists at $mitmFifoPath, delete it before trying again"; } - system("lsof -i :$mitmPort"); - if (! $?) { - die "cannot start mitmproxy because a process already exists on port $mitmPort"; - } - if ($Config{osname} eq "linux") { - system("netstat --tcp -n | grep $mitmPort"); + system("netstat --tcp -n | grep :$mitmPort"); } else { - system("netstat -p tcp -n | grep $mitmPort"); + system("netstat -p tcp -n | grep :$mitmPort"); + } + + if (system("lsof -i :$mitmPort") == 0) { + die "cannot start mitmproxy because a process already exists on port $mitmPort"; } my $childPid = fork(); @@ -994,6 +993,19 @@ my $startTime = time(); my $exitcode = 0; + +if ($useMitmproxy) { + my $tries = 0; + until(system("lsof -i :$mitmPort") == 0) { + if ($tries > 60) { + die("waited for 60 seconds to start the mitmproxy, but it failed") + } + print("waiting: mitmproxy was not started yet\n"); + sleep(1); + $tries++; + } +} + # Finally run the tests if ($vanillatest) {