From e3c93c303dec623f6e196ca8f2ca1d1a20c51e6c Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 17:21:12 +0100
Subject: [PATCH 1/2] Fix flaky citus_non_blocking_split_shard_cleanup (#7311)

Sometimes in CI citus_non_blocking_split_shard_cleanup failed like this:

```diff
--- /__w/citus/citus/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out.modified	2023-11-01 15:07:14.280551207 +0000
+++ /__w/citus/citus/src/test/regress/results/citus_non_blocking_split_shard_cleanup.out.modified	2023-11-01 15:07:14.292551358 +0000
@@ -106,21 +106,22 @@
 -----------------------------------
 (1 row)
 
 \c - - - :worker_2_port
 SET search_path TO "citus_split_test_schema";
 -- Replication slots should be cleaned up
 SELECT slot_name FROM pg_replication_slots;
  slot_name
 ---------------------------------
-(0 rows)
+ citus_shard_split_slot_19_10_17
+(1 row)
 
 -- Publications should be cleanedup
 SELECT count(*) FROM pg_publication;
  count
```

It's expected that the replication slot sometimes isn't cleaned up yet
if we don't wait until resource cleanup completes. This PR makes the
test wait for resource cleanup to finish before checking.
---
 .../expected/citus_non_blocking_split_shard_cleanup.out    | 6 ++++++
 .../regress/sql/citus_non_blocking_split_shard_cleanup.sql | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out b/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out
index e2685c2d7..a559ec442 100644
--- a/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out
+++ b/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out
@@ -107,6 +107,12 @@ SELECT pg_catalog.citus_split_shard_by_split_points(
 
 (1 row)
 
+SELECT public.wait_for_resource_cleanup();
+ wait_for_resource_cleanup
+---------------------------------------------------------------------
+
+(1 row)
+
 \c - - - :worker_2_port
 SET search_path TO "citus_split_test_schema";
 -- Replication slots should be cleaned up
diff --git a/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql b/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql
index ba3f95215..480d81b88 100644
--- a/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql
+++ b/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql
@@ -79,6 +79,8 @@ SELECT pg_catalog.citus_split_shard_by_split_points(
     ARRAY[:worker_2_node, :worker_2_node, :worker_2_node],
     'force_logical');
 
+SELECT public.wait_for_resource_cleanup();
+
 \c - - - :worker_2_port
 SET search_path TO "citus_split_test_schema";
 -- Replication slots should be cleaned up
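For context, `wait_for_resource_cleanup()` is a helper from the Citus
regression-test suite. A minimal sketch of how such a helper can be written,
assuming it simply polls the `pg_dist_cleanup` catalog (where Citus records
deferred cleanup work such as leftover replication slots and publications)
until the background cleanup has drained it; the suite's actual definition
may differ:

```sql
-- Sketch only: poll pg_dist_cleanup until no cleanup records remain,
-- i.e. the maintenance daemon has dropped all leftover resources.
CREATE OR REPLACE FUNCTION public.wait_for_resource_cleanup()
RETURNS void AS $$
DECLARE
    pending int;
BEGIN
    LOOP
        SELECT count(*) INTO pending FROM pg_dist_cleanup;
        EXIT WHEN pending = 0;
        PERFORM pg_sleep(0.1); -- brief back-off before re-checking
    END LOOP;
END;
$$ LANGUAGE plpgsql;
```
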
From 2cf4c0402319a9616e4d0feb4d9273757b3c1eaf Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Thu, 2 Nov 2023 01:59:41 +0300
Subject: [PATCH 2/2] Fix flaky global_cancel.sql test (#7316)

---
 src/test/regress/expected/global_cancel.out | 10 ++++++++--
 src/test/regress/sql/global_cancel.sql      |  6 ++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/test/regress/expected/global_cancel.out b/src/test/regress/expected/global_cancel.out
index 5adeef3c8..e5ce4fbc6 100644
--- a/src/test/regress/expected/global_cancel.out
+++ b/src/test/regress/expected/global_cancel.out
@@ -9,9 +9,14 @@ SELECT 1 FROM master_add_node('localhost', :master_port, groupid => 0);
 RESET client_min_messages;
 -- Kill maintenance daemon so it gets restarted and gets a gpid containing our
 -- nodeid
-SELECT pg_terminate_backend(pid)
+SELECT COUNT(pg_terminate_backend(pid)) >= 0
 FROM pg_stat_activity
-WHERE application_name = 'Citus Maintenance Daemon' \gset
+WHERE application_name = 'Citus Maintenance Daemon';
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
 -- reconnect to make sure we get a session with the gpid containing our nodeid
 \c - - - -
 CREATE SCHEMA global_cancel;
@@ -77,6 +82,7 @@ ERROR: must be a superuser to terminate superuser process
 SELECT pg_cancel_backend(citus_backend_gpid());
 ERROR: canceling statement due to user request
 \c - postgres - :master_port
+DROP USER global_cancel_user;
 SET client_min_messages TO DEBUG;
 -- 10000000000 is the node id multiplier for global pid
 SELECT pg_cancel_backend(10000000000 * citus_coordinator_nodeid() + 0);
diff --git a/src/test/regress/sql/global_cancel.sql b/src/test/regress/sql/global_cancel.sql
index 848c3b01a..12330baf2 100644
--- a/src/test/regress/sql/global_cancel.sql
+++ b/src/test/regress/sql/global_cancel.sql
@@ -5,9 +5,9 @@ RESET client_min_messages;
 
 -- Kill maintenance daemon so it gets restarted and gets a gpid containing our
 -- nodeid
-SELECT pg_terminate_backend(pid)
+SELECT COUNT(pg_terminate_backend(pid)) >= 0
 FROM pg_stat_activity
-WHERE application_name = 'Citus Maintenance Daemon' \gset
+WHERE application_name = 'Citus Maintenance Daemon';
 
 -- reconnect to make sure we get a session with the gpid containing our nodeid
 \c - - - -
@@ -58,6 +58,8 @@ SELECT pg_cancel_backend(citus_backend_gpid());
 
 \c - postgres - :master_port
 
+DROP USER global_cancel_user;
+
 SET client_min_messages TO DEBUG;
 -- 10000000000 is the node id multiplier for global pid
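The reworked query in this patch is a common regression-test idiom: psql's
`\gset` requires the query to return exactly one row, so the old form emitted
extra warning output whenever zero or several maintenance-daemon backends
matched, making the recorded output flaky. Wrapping the side-effecting call in
an aggregate always yields exactly one deterministic row (`t`), and the added
`DROP USER global_cancel_user;` keeps the role from leaking into later tests.
A minimal sketch of the idiom (the `application_name` filter and the alias are
just illustrative):

```sql
-- One row of stable output whether 0, 1, or many backends match.
-- count(...) still calls pg_terminate_backend() once per matching row,
-- but the result no longer depends on how many rows that is.
SELECT count(pg_terminate_backend(pid)) >= 0 AS terminated
FROM pg_stat_activity
WHERE application_name = 'some_application';  -- hypothetical filter
```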