From 1e283bee4cddf60b47148626ec0e9bfd6f7660b0 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 11:46:01 +0100
Subject: [PATCH] Fix flaky multi_cluster_management (#7295)

One of our most flaky and most annoying tests is multi_cluster_management.
It usually fails like this:
```diff
 SELECT citus_disable_node('localhost', :worker_2_port);
  citus_disable_node
 --------------------

 (1 row)

 SELECT public.wait_until_metadata_sync(60000);
+WARNING: waiting for metadata sync timed out
  wait_until_metadata_sync
 --------------------------

 (1 row)
```

This tries to address that by hardening wait_until_metadata_sync. I
believe the reason for this warning is a race condition in
wait_until_metadata_sync: it's possible for the pre-check to fail, then
have the maintenance daemon send a notification, and only then have the
backend start to listen. I tried to fix it in two ways (sketched below):
1. First run LISTEN, and only then do the pre-check.
2. If we time out, check again just to make sure that we did not miss
   the notification somehow, and don't show a warning if all metadata is
   synced after the timeout.
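To make the ordering concrete, here is a minimal standalone libpq sketch
of the listen-first pattern. It is not the patched Citus code (that is in
the diff below); the conninfo string, the literal channel name (standing
in for METADATA_SYNC_CHANNEL), and the pg_dist_node pre-check query are
illustrative placeholders.
```c
/*
 * Illustration only, not the patched Citus code: a libpq client that
 * subscribes before it checks, so a NOTIFY sent between the pre-check and
 * the wait cannot be missed. The conninfo, the channel name (standing in
 * for METADATA_SYNC_CHANNEL), and the pre-check query are placeholders.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/select.h>
#include <sys/time.h>
#include <libpq-fe.h>

int
main(void)
{
    PGconn *conn = PQconnectdb("dbname=postgres");  /* placeholder conninfo */
    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        return 1;
    }

    /* 1. Start listening first, so a later notification cannot slip past us. */
    PQclear(PQexec(conn, "LISTEN metadata_sync"));

    /* 2. Pre-check: if every metadata node is already synced, don't wait. */
    PGresult *res = PQexec(conn,
                           "SELECT count(*) FROM pg_dist_node "
                           "WHERE hasmetadata AND NOT metadatasynced");
    bool synced = (PQresultStatus(res) == PGRES_TUPLES_OK &&
                   strcmp(PQgetvalue(res, 0, 0), "0") == 0);
    PQclear(res);

    if (!synced)
    {
        /* 3. Wait on the connection's socket for the NOTIFY, with a timeout. */
        fd_set readMask;
        struct timeval timeout = { .tv_sec = 20, .tv_usec = 0 };
        FD_ZERO(&readMask);
        FD_SET(PQsocket(conn), &readMask);

        if (select(PQsocket(conn) + 1, &readMask, NULL, NULL, &timeout) > 0)
        {
            PQconsumeInput(conn);
            PGnotify *notify;
            while ((notify = PQnotifies(conn)) != NULL)
            {
                printf("notified on channel %s\n", notify->relname);
                PQfreemem(notify);
            }
        }
        /* 4. On timeout, re-check the condition before reporting a warning. */
    }

    PQfinish(conn);
    return 0;
}
```
The point is only the ordering: because LISTEN is issued before the
pre-check, a notification sent between the check and the wait is still
queued on this connection instead of being lost.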
It's hard to know for sure that this fixes it because the test is not
repeatable and I could not reproduce it locally. Let's just hope for the
best.

---------

Co-authored-by: Onur Tirtir
---
 src/backend/distributed/test/metadata_sync.c | 48 +++++++++++--------
 .../expected/multi_cluster_management.out    |  4 +-
 .../regress/sql/multi_cluster_management.sql |  4 +-
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/backend/distributed/test/metadata_sync.c b/src/backend/distributed/test/metadata_sync.c
index 46d2303d6..8ad4b15f2 100644
--- a/src/backend/distributed/test/metadata_sync.c
+++ b/src/backend/distributed/test/metadata_sync.c
@@ -90,6 +90,28 @@ activate_node_snapshot(PG_FUNCTION_ARGS)
 }
 
 
+/*
+ * IsMetadataSynced checks the workers to see if all workers with metadata are
+ * synced.
+ */
+static bool
+IsMetadataSynced(void)
+{
+    List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock);
+
+    WorkerNode *workerNode = NULL;
+    foreach_ptr(workerNode, workerList)
+    {
+        if (workerNode->hasMetadata && !workerNode->metadataSynced)
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
 /*
  * wait_until_metadata_sync waits until the maintenance daemon does a metadata
  * sync, or times out.
@@ -99,19 +121,10 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS)
 {
     uint32 timeout = PG_GETARG_UINT32(0);
 
-    List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock);
-    bool waitNotifications = false;
-
-    WorkerNode *workerNode = NULL;
-    foreach_ptr(workerNode, workerList)
-    {
-        /* if already has metadata, no need to do it again */
-        if (workerNode->hasMetadata && !workerNode->metadataSynced)
-        {
-            waitNotifications = true;
-            break;
-        }
-    }
+    /* First we start listening. */
+    MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION,
+                                                    LOCAL_HOST_NAME, PostPortNumber);
+    ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL);
 
     /*
      * If all the metadata nodes have already been synced, we should not wait.
@@ -119,15 +132,12 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS)
      * the notification and we'd wait unnecessarily here. Worse, the test outputs
      * might be inconsistent across executions due to the warning.
      */
-    if (!waitNotifications)
+    if (IsMetadataSynced())
     {
+        CloseConnection(connection);
         PG_RETURN_VOID();
     }
 
-    MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION,
-                                                    LOCAL_HOST_NAME, PostPortNumber);
-    ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL);
-
     int waitFlags = WL_SOCKET_READABLE | WL_TIMEOUT | WL_POSTMASTER_DEATH;
     int waitResult = WaitLatchOrSocket(NULL, waitFlags, PQsocket(connection->pgConn),
                                        timeout, 0);
@@ -139,7 +149,7 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS)
     {
         ClearResults(connection, true);
     }
-    else if (waitResult & WL_TIMEOUT)
+    else if (waitResult & WL_TIMEOUT && !IsMetadataSynced())
     {
         elog(WARNING, "waiting for metadata sync timed out");
     }
diff --git a/src/test/regress/expected/multi_cluster_management.out b/src/test/regress/expected/multi_cluster_management.out
index b92d8d136..3eb549ab5 100644
--- a/src/test/regress/expected/multi_cluster_management.out
+++ b/src/test/regress/expected/multi_cluster_management.out
@@ -90,7 +90,7 @@ SELECT citus_disable_node('localhost', :worker_2_port);
 (1 row)
 
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
  wait_until_metadata_sync
 ---------------------------------------------------------------------
 
 (1 row)
@@ -812,7 +812,7 @@ SELECT citus_disable_node('localhost', 9999);
 (1 row)
 
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
  wait_until_metadata_sync
 ---------------------------------------------------------------------
 
 (1 row)
diff --git a/src/test/regress/sql/multi_cluster_management.sql b/src/test/regress/sql/multi_cluster_management.sql
index ab268939f..86fbd15b6 100644
--- a/src/test/regress/sql/multi_cluster_management.sql
+++ b/src/test/regress/sql/multi_cluster_management.sql
@@ -39,7 +39,7 @@ SELECT master_get_active_worker_nodes();
 SELECT 1 FROM master_add_node('localhost', :worker_2_port);
 
 SELECT citus_disable_node('localhost', :worker_2_port);
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
 SELECT master_get_active_worker_nodes();
 
 -- add some shard placements to the cluster
@@ -328,7 +328,7 @@ SELECT 1 FROM master_add_inactive_node('localhost', 9996, groupid => :worker_2_g
 SELECT master_add_inactive_node('localhost', 9999, groupid => :worker_2_group, nodecluster => 'olap', noderole => 'secondary');
 SELECT master_activate_node('localhost', 9999);
 SELECT citus_disable_node('localhost', 9999);
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
 SELECT master_remove_node('localhost', 9999);
 
 -- check that you can't manually add two primaries to a group