From 0d83ab57de8c266764fc47810afd115758da5034 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 11:46:01 +0100
Subject: [PATCH 1/4] Fix flaky multi_cluster_management (#7295)

One of our most flaky and most annoying tests is multi_cluster_management.
It usually fails like this:

```diff
 SELECT citus_disable_node('localhost', :worker_2_port);
 citus_disable_node
--------------------

(1 row)

 SELECT public.wait_until_metadata_sync(60000);
+WARNING: waiting for metadata sync timed out
 wait_until_metadata_sync
--------------------------

(1 row)
```

This tries to address that by hardening wait_until_metadata_sync. I believe
the reason for this warning is a race condition in wait_until_metadata_sync:
it's possible for the pre-check to fail, for the maintenance daemon to then
send a notification, and only then for the backend to start listening.

I tried to fix it in two ways:
1. First run LISTEN, and only then do the pre-check.
2. If we time out, check again just to make sure that we did not somehow
   miss the notification, and don't show a warning if all metadata is
   synced after the timeout.

It's hard to know for sure that this fixes it, because the test is not
repeatable and I could not reproduce it locally. Let's just hope for the
best.

---------

Co-authored-by: Onur Tirtir
---
 src/backend/distributed/test/metadata_sync.c | 48 +++++++++++--------
 .../expected/multi_cluster_management.out    |  4 +-
 .../regress/sql/multi_cluster_management.sql |  4 +-
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/src/backend/distributed/test/metadata_sync.c b/src/backend/distributed/test/metadata_sync.c
index 46d2303d6..8ad4b15f2 100644
--- a/src/backend/distributed/test/metadata_sync.c
+++ b/src/backend/distributed/test/metadata_sync.c
@@ -90,6 +90,28 @@ activate_node_snapshot(PG_FUNCTION_ARGS)
 }
 
 
+/*
+ * IsMetadataSynced checks the workers to see if all workers with metadata are
+ * synced.
+ */
+static bool
+IsMetadataSynced(void)
+{
+	List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock);
+
+	WorkerNode *workerNode = NULL;
+	foreach_ptr(workerNode, workerList)
+	{
+		if (workerNode->hasMetadata && !workerNode->metadataSynced)
+		{
+			return false;
+		}
+	}
+
+	return true;
+}
+
+
 /*
  * wait_until_metadata_sync waits until the maintenance daemon does a metadata
  * sync, or times out.
@@ -99,19 +121,10 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS)
 {
 	uint32 timeout = PG_GETARG_UINT32(0);
 
-	List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock);
-	bool waitNotifications = false;
-
-	WorkerNode *workerNode = NULL;
-	foreach_ptr(workerNode, workerList)
-	{
-		/* if already has metadata, no need to do it again */
-		if (workerNode->hasMetadata && !workerNode->metadataSynced)
-		{
-			waitNotifications = true;
-			break;
-		}
-	}
+	/* First we start listening. */
+	MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION,
+													LOCAL_HOST_NAME, PostPortNumber);
+	ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL);
 
 	/*
 	 * If all the metadata nodes have already been synced, we should not wait.
 	 * That's primarily because the maintenance daemon might have already sent
 	 * the notification and we'd wait unnecessarily here. Worse, the test outputs
 	 * might be inconsistent across executions due to the warning.
 	 */
-	if (!waitNotifications)
+	if (IsMetadataSynced())
 	{
+		CloseConnection(connection);
 		PG_RETURN_VOID();
 	}
 
-	MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION,
-													LOCAL_HOST_NAME, PostPortNumber);
-	ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL);
-
 	int waitFlags = WL_SOCKET_READABLE | WL_TIMEOUT | WL_POSTMASTER_DEATH;
 	int waitResult = WaitLatchOrSocket(NULL, waitFlags, PQsocket(connection->pgConn), timeout, 0);
@@ -139,7 +149,7 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS)
 	{
 		ClearResults(connection, true);
 	}
-	else if (waitResult & WL_TIMEOUT)
+	else if (waitResult & WL_TIMEOUT && !IsMetadataSynced())
 	{
 		elog(WARNING, "waiting for metadata sync timed out");
 	}

diff --git a/src/test/regress/expected/multi_cluster_management.out b/src/test/regress/expected/multi_cluster_management.out
index b92d8d136..3eb549ab5 100644
--- a/src/test/regress/expected/multi_cluster_management.out
+++ b/src/test/regress/expected/multi_cluster_management.out
@@ -90,7 +90,7 @@ SELECT citus_disable_node('localhost', :worker_2_port);
 
 (1 row)
 
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
  wait_until_metadata_sync
 ---------------------------------------------------------------------
 
 (1 row)
 
@@ -812,7 +812,7 @@ SELECT citus_disable_node('localhost', 9999);
 
 (1 row)
 
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
  wait_until_metadata_sync
 ---------------------------------------------------------------------
 
 (1 row)
 
diff --git a/src/test/regress/sql/multi_cluster_management.sql b/src/test/regress/sql/multi_cluster_management.sql
index ab268939f..86fbd15b6 100644
--- a/src/test/regress/sql/multi_cluster_management.sql
+++ b/src/test/regress/sql/multi_cluster_management.sql
@@ -39,7 +39,7 @@ SELECT master_get_active_worker_nodes();
 SELECT 1 FROM master_add_node('localhost', :worker_2_port);
 
 SELECT citus_disable_node('localhost', :worker_2_port);
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
 SELECT master_get_active_worker_nodes();
 
 -- add some shard placements to the cluster
@@ -328,7 +328,7 @@ SELECT 1 FROM master_add_inactive_node('localhost', 9996, groupid => :worker_2_g
 SELECT master_add_inactive_node('localhost', 9999, groupid => :worker_2_group, nodecluster => 'olap', noderole => 'secondary');
 SELECT master_activate_node('localhost', 9999);
 SELECT citus_disable_node('localhost', 9999);
-SELECT public.wait_until_metadata_sync(60000);
+SELECT public.wait_until_metadata_sync(20000);
 SELECT master_remove_node('localhost', 9999);
 
 -- check that you can't manually add two primaries to a group

From 2bccb5815770b67cf646a7dc5bb9539d5a29c010 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 13:12:20 +0100
Subject: [PATCH 2/4] Run GitHub Actions on main (#7292)

We want the nice-looking green checkmark on our main branch too. This PR
also enables runs on pushes to release branches, but that won't come into
effect until we have release branches that contain this workflow file.
---
 .github/workflows/build_and_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d285e4f50..d900fe867 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -10,6 +10,10 @@ on:
       required: false
       default: false
       type: boolean
+  push:
+    branches:
+      - "main"
+      - "release-*"
   pull_request:
     types: [opened, reopened,synchronize]
 jobs:

From c83c5567028d2035651c39f737ac5a944a70db16 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 14:44:45 +0100
Subject: [PATCH 3/4] Fix flaky isolation_master_update_node (#7303)

Sometimes in CI isolation_master_update_node fails like this:

```diff
 ------------------
 (1 row)
 
 step s2-abort: ABORT;
 step s1-abort: ABORT;
 FATAL: terminating connection due to administrator command
 FATAL: terminating connection due to administrator command
 SSL connection has been closed unexpectedly
+server closed the connection unexpectedly
 master_remove_node
 ------------------
```

This just seems like a random extra error line. The only reasonable way to
fix this is by adding an alternative expected output file, so that's what
this PR does.
---
 .../isolation_master_update_node_1.out        | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 src/test/regress/expected/isolation_master_update_node_1.out

diff --git a/src/test/regress/expected/isolation_master_update_node_1.out b/src/test/regress/expected/isolation_master_update_node_1.out
new file mode 100644
index 000000000..474956629
--- /dev/null
+++ b/src/test/regress/expected/isolation_master_update_node_1.out
@@ -0,0 +1,68 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1-begin s1-insert s2-begin s2-update-node-1 s1-abort s2-abort
+create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+step s1-begin: BEGIN;
+step s1-insert: INSERT INTO t1 SELECT generate_series(1, 100);
+step s2-begin: BEGIN;
+step s2-update-node-1:
+    -- update a specific node by address
+    SELECT master_update_node(nodeid, 'localhost', nodeport + 10)
+    FROM pg_dist_node
+    WHERE nodename = 'localhost'
+          AND nodeport = 57637;
+
+step s1-abort: ABORT;
+step s2-update-node-1: <... completed>
+master_update_node
+---------------------------------------------------------------------
+
+(1 row)
+
+step s2-abort: ABORT;
+master_remove_node
+---------------------------------------------------------------------
+
+
+(2 rows)
+
+
+starting permutation: s1-begin s1-insert s2-begin s2-update-node-1-force s2-abort s1-abort
+create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+step s1-begin: BEGIN;
+step s1-insert: INSERT INTO t1 SELECT generate_series(1, 100);
+step s2-begin: BEGIN;
+step s2-update-node-1-force:
+    -- update a specific node by address (force)
+    SELECT master_update_node(nodeid, 'localhost', nodeport + 10, force => true, lock_cooldown => 100)
+    FROM pg_dist_node
+    WHERE nodename = 'localhost'
+          AND nodeport = 57637;
+
+step s2-update-node-1-force: <... completed>
+master_update_node
+---------------------------------------------------------------------
+
+(1 row)
+
+step s2-abort: ABORT;
+step s1-abort: ABORT;
+FATAL: terminating connection due to administrator command
+FATAL: terminating connection due to administrator command
+SSL connection has been closed unexpectedly
+server closed the connection unexpectedly
+
+master_remove_node
+---------------------------------------------------------------------
+
+
+(2 rows)
+

From c9f2fc892d4ce01a4bc23beb508e2ff03f08a774 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 15:08:51 +0100
Subject: [PATCH 4/4] Fix flaky failure_split_cleanup (#7299)

Sometimes failure_split_cleanup failed in CI like this:

```diff
 ERROR: server closed the connection unexpectedly
 CONTEXT: while executing command on localhost:9060
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
 FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
  operation_id | object_type | object_name | node_group_id | policy_type
 --------------+-------------+-----------------------------------------------------------+---------------+-------------
  777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0
- 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1
  777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 2 | 0
+ 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1
  777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981003 | 2 | 1
  777 | 4 | citus_shard_split_publication_1_10_777 | 2 | 0
 (5 rows)

 -- we need to allow connection so that we can connect to proxy
```

Source: https://github.com/citusdata/citus/actions/runs/6717642291/attempts/1#summary-18256014949

It's the common problem of a missing column in the ORDER BY clause: rows
that tie on object_name can legitimately come back in either order. This
fixes that by adding node_group_id to the ORDER BY of the query in
question.
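
To make the flakiness concrete, here is a minimal sketch (the cleanup_demo
table and its rows are hypothetical stand-ins for pg_dist_cleanup, not part
of this patch):

```sql
-- Hypothetical mini-reproduction: two rows tie on object_name.
CREATE TABLE cleanup_demo (object_name text, node_group_id int);
INSERT INTO cleanup_demo VALUES
    ('citus_failure_split_cleanup_schema.table_to_split_8981002', 2),
    ('citus_failure_split_cleanup_schema.table_to_split_8981002', 1);

-- Flaky: the sort key does not distinguish the tied rows, so PostgreSQL
-- may return them in either order and the regression output can flip.
SELECT * FROM cleanup_demo ORDER BY object_name;

-- Deterministic: node_group_id breaks the tie.
SELECT * FROM cleanup_demo ORDER BY object_name, node_group_id;
```

Any column set that makes the sort key unique would work as a tiebreaker;
node_group_id is the natural choice here because it is already in the
select list.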
--- .../regress/expected/failure_split_cleanup.out | 18 +++++++++--------- src/test/regress/sql/failure_split_cleanup.sql | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/test/regress/expected/failure_split_cleanup.out b/src/test/regress/expected/failure_split_cleanup.out index fe646587c..d81335325 100644 --- a/src/test/regress/expected/failure_split_cleanup.out +++ b/src/test/regress/expected/failure_split_cleanup.out @@ -277,12 +277,12 @@ CONTEXT: while executing command on localhost:xxxxx ERROR: connection not open CONTEXT: while executing command on localhost:xxxxx SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0 - 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 2 | 0 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1 + 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 2 | 0 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981003 | 2 | 1 777 | 4 | citus_shard_split_publication_xxxxxxx_xxxxxxx_xxxxxxx | 2 | 0 777 | 4 | citus_shard_split_publication_xxxxxxx_xxxxxxx_xxxxxxx | 2 | 0 @@ -336,7 +336,7 @@ CONTEXT: while executing command on localhost:xxxxx (1 row) SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- (0 rows) @@ -388,7 +388,7 @@ CONTEXT: while executing command on localhost:xxxxx ERROR: connection not open CONTEXT: while executing command on localhost:xxxxx SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0 @@ -455,7 +455,7 @@ CONTEXT: while executing command on localhost:xxxxx (1 row) SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- (0 rows) @@ -507,7 +507,7 @@ CONTEXT: while executing command on localhost:xxxxx ERROR: connection not open CONTEXT: while executing command on localhost:xxxxx SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- 777 
| 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0 @@ -574,7 +574,7 @@ CONTEXT: while executing command on localhost:xxxxx (1 row) SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- (0 rows) @@ -634,7 +634,7 @@ WARNING: connection to the remote node localhost:xxxxx failed with the followin ERROR: connection not open CONTEXT: while executing command on localhost:xxxxx SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1 @@ -701,7 +701,7 @@ CONTEXT: while executing command on localhost:xxxxx (1 row) SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; operation_id | object_type | object_name | node_group_id | policy_type --------------------------------------------------------------------- (0 rows) diff --git a/src/test/regress/sql/failure_split_cleanup.sql b/src/test/regress/sql/failure_split_cleanup.sql index 1b85d3d17..9dfbb245e 100644 --- a/src/test/regress/sql/failure_split_cleanup.sql +++ b/src/test/regress/sql/failure_split_cleanup.sql @@ -136,7 +136,7 @@ SELECT create_distributed_table('table_to_split', 'id'); ARRAY[:worker_1_node, :worker_2_node], 'force_logical'); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; -- we need to allow connection so that we can connect to proxy SELECT citus.mitmproxy('conn.allow()'); @@ -155,7 +155,7 @@ SELECT create_distributed_table('table_to_split', 'id'); \c - postgres - :master_port SELECT public.wait_for_resource_cleanup(); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; \c - - - :worker_2_proxy_port SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog; @@ -182,7 +182,7 @@ SELECT create_distributed_table('table_to_split', 'id'); ARRAY[:worker_1_node, :worker_2_node], 'force_logical'); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; -- we need to allow connection so that we can connect to proxy SELECT citus.mitmproxy('conn.allow()'); @@ -201,7 +201,7 @@ SELECT create_distributed_table('table_to_split', 'id'); \c - postgres - :master_port SELECT public.wait_for_resource_cleanup(); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 
ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; \c - - - :worker_2_proxy_port SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog; @@ -228,7 +228,7 @@ SELECT create_distributed_table('table_to_split', 'id'); ARRAY[:worker_1_node, :worker_2_node], 'force_logical'); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; -- we need to allow connection so that we can connect to proxy SELECT citus.mitmproxy('conn.allow()'); @@ -247,7 +247,7 @@ SELECT create_distributed_table('table_to_split', 'id'); \c - postgres - :master_port SELECT public.wait_for_resource_cleanup(); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; \c - - - :worker_2_proxy_port SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog; @@ -275,7 +275,7 @@ SELECT create_distributed_table('table_to_split', 'id'); 'force_logical'); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; SELECT relname FROM pg_class where relname LIKE '%table_to_split_%' AND relkind = 'r' order by relname; -- we need to allow connection so that we can connect to proxy SELECT citus.mitmproxy('conn.allow()'); @@ -295,7 +295,7 @@ SELECT create_distributed_table('table_to_split', 'id'); \c - postgres - :master_port SELECT public.wait_for_resource_cleanup(); SELECT operation_id, object_type, object_name, node_group_id, policy_type - FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name; + FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id; \c - - - :worker_2_proxy_port SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog;