From 5a48a1602e6a42b1f6261c50c726d02339f153f0 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Thu, 2 Nov 2023 13:15:02 +0100 Subject: [PATCH] Debug flaky logical_replication test (#7309) Sometimes in CI our logical_replication test fails like this: ```diff +++ /__w/citus/citus/src/test/regress/results/logical_replication.out.modified 2023-11-01 14:15:08.562758546 +0000 @@ -40,21 +40,21 @@ SELECT count(*) from pg_publication; count ------- 0 (1 row) SELECT count(*) from pg_replication_slots; count ------- - 0 + 1 (1 row) SELECT count(*) FROM dist; count ------- ``` It's hard to understand what is going on here, just based on the wrong number. So this PR changes the test to show the name of the subscription, publication and replication slot to make finding the cause easier. In passing this also fixes another flaky test in the same file that our flaky test detection picked up. This is done by waiting for resource cleanup after the shard move. --- .../regress/expected/logical_replication.out | 113 +++++++++--------- src/test/regress/sql/logical_replication.sql | 32 ++--- 2 files changed, 72 insertions(+), 73 deletions(-) diff --git a/src/test/regress/expected/logical_replication.out b/src/test/regress/expected/logical_replication.out index 8a3e96da9..b5a36125a 100644 --- a/src/test/regress/expected/logical_replication.out +++ b/src/test/regress/expected/logical_replication.out @@ -32,23 +32,21 @@ CREATE SUBSCRIPTION citus_shard_move_subscription_:postgres_oid PUBLICATION citus_shard_move_publication_:postgres_oid WITH (enabled=false, slot_name=citus_shard_move_slot_:postgres_oid); NOTICE: created replication slot "citus_shard_move_slot_10" on publisher -SELECT count(*) from pg_subscription; - count +SELECT subname from pg_subscription; + subname --------------------------------------------------------------------- - 1 + citus_shard_move_subscription_10 (1 row) -SELECT count(*) from pg_publication; - count +SELECT pubname from pg_publication; + pubname --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) -SELECT count(*) from pg_replication_slots; - count +SELECT slot_name from pg_replication_slots; + slot_name --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) SELECT count(*) FROM dist; count @@ -58,22 +56,21 @@ SELECT count(*) FROM dist; \c - - - :worker_1_port SET search_path TO logical_replication; -SELECT count(*) from pg_subscription; - count +SELECT subname from pg_subscription; + subname --------------------------------------------------------------------- - 0 +(0 rows) + +SELECT pubname from pg_publication; + pubname +--------------------------------------------------------------------- + citus_shard_move_publication_10 (1 row) -SELECT count(*) from pg_publication; - count +SELECT slot_name from pg_replication_slots; + slot_name --------------------------------------------------------------------- - 1 -(1 row) - -SELECT count(*) from pg_replication_slots; - count ---------------------------------------------------------------------- - 1 + citus_shard_move_slot_10 (1 row) SELECT count(*) FROM dist; @@ -90,25 +87,29 @@ select citus_move_shard_placement(6830002, 'localhost', :worker_1_port, 'localho (1 row) +SELECT public.wait_for_resource_cleanup(); + wait_for_resource_cleanup +--------------------------------------------------------------------- + +(1 row) + -- the subscription is still there, as there is no cleanup record for it -- we have created it manually -SELECT count(*) from pg_subscription; - count +SELECT subname from pg_subscription; + subname --------------------------------------------------------------------- - 1 + citus_shard_move_subscription_10 (1 row) -SELECT count(*) from pg_publication; - count +SELECT pubname from pg_publication; + pubname --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) -SELECT count(*) from pg_replication_slots; - count +SELECT slot_name from pg_replication_slots; + slot_name --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) SELECT count(*) from dist; count @@ -120,22 +121,21 @@ SELECT count(*) from dist; SET search_path TO logical_replication; -- the publication and repslot are still there, as there are no cleanup records for them -- we have created them manually -SELECT count(*) from pg_subscription; - count +SELECT subname from pg_subscription; + subname --------------------------------------------------------------------- - 0 +(0 rows) + +SELECT pubname from pg_publication; + pubname +--------------------------------------------------------------------- + citus_shard_move_publication_10 (1 row) -SELECT count(*) from pg_publication; - count +SELECT slot_name from pg_replication_slots; + slot_name --------------------------------------------------------------------- - 1 -(1 row) - -SELECT count(*) from pg_replication_slots; - count ---------------------------------------------------------------------- - 1 + citus_shard_move_slot_10 (1 row) SELECT count(*) from dist; @@ -153,23 +153,20 @@ SELECT pg_drop_replication_slot('citus_shard_move_slot_' || :postgres_oid); \c - - - :worker_2_port SET search_path TO logical_replication; -SELECT count(*) from pg_subscription; - count +SELECT subname from pg_subscription; + subname --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) -SELECT count(*) from pg_publication; - count +SELECT pubname from pg_publication; + pubname --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) -SELECT count(*) from pg_replication_slots; - count +SELECT slot_name from pg_replication_slots; + slot_name --------------------------------------------------------------------- - 0 -(1 row) +(0 rows) SELECT count(*) from dist; count diff --git a/src/test/regress/sql/logical_replication.sql b/src/test/regress/sql/logical_replication.sql index 3f8e048ca..a85c70b08 100644 --- a/src/test/regress/sql/logical_replication.sql +++ b/src/test/regress/sql/logical_replication.sql @@ -35,17 +35,17 @@ CREATE SUBSCRIPTION citus_shard_move_subscription_:postgres_oid WITH (enabled=false, slot_name=citus_shard_move_slot_:postgres_oid); -SELECT count(*) from pg_subscription; -SELECT count(*) from pg_publication; -SELECT count(*) from pg_replication_slots; +SELECT subname from pg_subscription; +SELECT pubname from pg_publication; +SELECT slot_name from pg_replication_slots; SELECT count(*) FROM dist; \c - - - :worker_1_port SET search_path TO logical_replication; -SELECT count(*) from pg_subscription; -SELECT count(*) from pg_publication; -SELECT count(*) from pg_replication_slots; +SELECT subname from pg_subscription; +SELECT pubname from pg_publication; +SELECT slot_name from pg_replication_slots; SELECT count(*) FROM dist; \c - - - :master_port @@ -53,11 +53,13 @@ SET search_path TO logical_replication; select citus_move_shard_placement(6830002, 'localhost', :worker_1_port, 'localhost', :worker_2_port, 'force_logical'); +SELECT public.wait_for_resource_cleanup(); + -- the subscription is still there, as there is no cleanup record for it -- we have created it manually -SELECT count(*) from pg_subscription; -SELECT count(*) from pg_publication; -SELECT count(*) from pg_replication_slots; +SELECT subname from pg_subscription; +SELECT pubname from pg_publication; +SELECT slot_name from pg_replication_slots; SELECT count(*) from dist; \c - - - :worker_1_port @@ -65,9 +67,9 @@ SET search_path TO logical_replication; -- the publication and repslot are still there, as there are no cleanup records for them -- we have created them manually -SELECT count(*) from pg_subscription; -SELECT count(*) from pg_publication; -SELECT count(*) from pg_replication_slots; +SELECT subname from pg_subscription; +SELECT pubname from pg_publication; +SELECT slot_name from pg_replication_slots; SELECT count(*) from dist; DROP PUBLICATION citus_shard_move_publication_:postgres_oid; @@ -76,9 +78,9 @@ SELECT pg_drop_replication_slot('citus_shard_move_slot_' || :postgres_oid); \c - - - :worker_2_port SET search_path TO logical_replication; -SELECT count(*) from pg_subscription; -SELECT count(*) from pg_publication; -SELECT count(*) from pg_replication_slots; +SELECT subname from pg_subscription; +SELECT pubname from pg_publication; +SELECT slot_name from pg_replication_slots; SELECT count(*) from dist; \c - - - :master_port