Add regression test for zombie replication slot cleanup during job cancellation

m3hm3t/issue_7896
Mehmet Yilmaz 2025-04-08 10:58:33 +00:00
parent 5b6b7b847e
commit 23a4671a68
3 changed files with 247 additions and 60 deletions
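For context, this is the failure mode the test added here guards against: a canceled citus_rebalance_wait() could leave an active logical replication slot behind on a worker, and that slot then blocks DROP DATABASE ... WITH (FORCE). A minimal sketch of how one might observe and manually clear such a slot on the affected worker; the slot name below is hypothetical, while pg_replication_slots, pg_drop_replication_slot(), and DROP DATABASE ... WITH (FORCE) are standard PostgreSQL:

-- Run on the affected worker node.
-- 1) Look for leftover logical slots that are still marked active.
SELECT slot_name, slot_type, active, active_pid
FROM pg_replication_slots
WHERE slot_type = 'logical';
-- 2) Before the fix, DROP DATABASE ... WITH (FORCE) fails while such a slot exists.
-- DROP DATABASE mydb WITH (FORCE);
-- 3) Manual cleanup once the owning process is gone (hypothetical slot name):
-- SELECT pg_drop_replication_slot('citus_shard_move_slot_example');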


@@ -1,46 +1,161 @@
---------------------------------------------------------------------
-- Regression Test: Simulate zombie replication slot when
-- citus_rebalance_wait() is canceled.
--
-- In the buggy behavior, canceling citus_rebalance_wait()
-- (via a short statement_timeout or Ctrl+C) leaves behind an active logical
-- replication slot on a worker. This, in turn, prevents DROP DATABASE
-- (with FORCE) from succeeding.
--
-- With the fix applied, the underlying rebalance job is canceled,
-- no zombie slot remains, and DROP DATABASE succeeds.
---------------------------------------------------------------------
---------------------------------------------------------------------
-- 1) Create an isolated schema for this test.
---------------------------------------------------------------------
CREATE SCHEMA issue_7896;
SET search_path TO issue_7896;
---------------------------------------------------------------------
-- 2) Set cluster parameters and initialize environment.
---------------------------------------------------------------------
-- We assume a coordinator with at least two workers.
-- Set replication factor to 2 and enable repartition joins.
SET citus.shard_replication_factor TO 2;
SET citus.enable_repartition_joins TO ON;
-- For faster background task processing, set a short background task queue interval.
ALTER SYSTEM SET citus.background_task_queue_interval TO '1s';
SELECT pg_reload_conf();
 pg_reload_conf
---------------------------------------------------------------------
 t
(1 row)

---------------------------------------------------------------------
-- 3) Create a distributed table.
---------------------------------------------------------------------
DROP TABLE IF EXISTS t1;
NOTICE:  table "t1" does not exist, skipping
CREATE TABLE t1 (a int PRIMARY KEY);
SELECT create_distributed_table('t1', 'a', shard_count => 4, colocate_with => 'none');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

---------------------------------------------------------------------
-- 4) Insert enough data so that a rebalance has measurable work.
---------------------------------------------------------------------
INSERT INTO t1
SELECT generate_series(1, 1000000);
---------------------------------------------------------------------
-- 5) Verify that a rebalance on a balanced cluster is a no-op.
---------------------------------------------------------------------
SELECT 1 FROM citus_rebalance_start();
NOTICE:  No moves available for rebalancing
 ?column?
---------------------------------------------------------------------
        1
(1 row)

-- Expected: NOTICE "No moves available for rebalancing".
SELECT citus_rebalance_wait();
WARNING:  no ongoing rebalance that can be waited on
 citus_rebalance_wait
---------------------------------------------------------------------

(1 row)

-- Expected: WARNING "no ongoing rebalance that can be waited on".
---------------------------------------------------------------------
-- 6) Force a shard movement so that a rebalance job is scheduled.
--    Remove and re-add a worker using a parameter placeholder.
---------------------------------------------------------------------
SELECT citus_remove_node('localhost', :worker_2_port);
 citus_remove_node
---------------------------------------------------------------------

(1 row)

SELECT citus_add_node('localhost', :worker_2_port);
 citus_add_node
---------------------------------------------------------------------
             30
(1 row)

---------------------------------------------------------------------
-- 7) Start a rebalance job that will do actual work.
---------------------------------------------------------------------
SELECT citus_rebalance_start(
    rebalance_strategy := 'by_disk_size',
    shard_transfer_mode := 'force_logical'
);
NOTICE:  Scheduled 2 moves as job xxx
DETAIL:  Rebalance scheduled as background job
HINT:  To monitor progress, run: SELECT * FROM citus_rebalance_status();
 citus_rebalance_start
---------------------------------------------------------------------
                     1
(1 row)

-- Expected: NOTICE that moves are scheduled as a background job.
-- (You may verify with: SELECT * FROM citus_rebalance_status();)
---------------------------------------------------------------------
-- 8) Attempt to wait on the rebalance with a short timeout so that the wait
--    is canceled. The PG_CATCH block in citus_job_wait_internal should then
--    cancel the underlying job (cleaning up temporary replication slots).
---------------------------------------------------------------------
SET statement_timeout = '2s';
DO $$
BEGIN
  BEGIN
    RAISE NOTICE 'Waiting on rebalance with a 2-second timeout...';
    -- Public function citus_rebalance_wait() takes no arguments.
    PERFORM citus_rebalance_wait();
  EXCEPTION
    WHEN query_canceled THEN
      RAISE NOTICE 'Rebalance wait canceled as expected';
      -- The fix should cancel the underlying rebalance job.
  END;
END;
$$ LANGUAGE plpgsql;
NOTICE:  Waiting on rebalance with a 2-second timeout...
CONTEXT:  PL/pgSQL function inline_code_block line XX at RAISE
NOTICE:  Rebalance wait canceled as expected
CONTEXT:  PL/pgSQL function inline_code_block line XX at RAISE
SET statement_timeout = '0';
---------------------------------------------------------------------
-- 9) Cleanup orphaned background resources (if any).
---------------------------------------------------------------------
CALL citus_cleanup_orphaned_resources();
NOTICE:  cleaned up 5 orphaned resources
---------------------------------------------------------------------
-- 10) Traverse nodes and check for active replication slots.
--
--     Connect to the coordinator and worker nodes, then query for replication slots.
--     Expected outcome (with the fix applied): no active replication slots.
---------------------------------------------------------------------
\c - - - :master_port
SELECT * FROM pg_replication_slots;
 slot_name | plugin | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn | wal_status | safe_wal_size | two_phase | inactive_since | conflicting | invalidation_reason | failover | synced
---------------------------------------------------------------------
(0 rows)

\c - - - :worker_1_port
SELECT * FROM pg_replication_slots;
 slot_name | plugin | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn | wal_status | safe_wal_size | two_phase | inactive_since | conflicting | invalidation_reason | failover | synced
---------------------------------------------------------------------
(0 rows)

\c - - - :worker_2_port
SELECT * FROM pg_replication_slots;
 slot_name | plugin | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn | wal_status | safe_wal_size | two_phase | inactive_since | conflicting | invalidation_reason | failover | synced
---------------------------------------------------------------------
(0 rows)

---------------------------------------------------------------------
-- 11) Cleanup: Drop the test schema.
---------------------------------------------------------------------
\c - - - :master_port
SET search_path TO issue_7896;
DROP SCHEMA IF EXISTS issue_7896 CASCADE;
NOTICE:  drop cascades to table t1
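A possible follow-up assertion, not part of this commit: after the timed-out wait, confirm that the scheduled rebalance job really ended up cancelled rather than still running. A sketch using citus_rebalance_status() (which the HINT above references) and the pg_dist_background_job catalog; the catalog column names are assumptions, not verified here:

-- Sketch only: check the most recent rebalance job after the canceled wait.
SELECT * FROM citus_rebalance_status();
-- With the fix applied, the job should report a cancelled/finished state,
-- e.g. (column name assumed):
-- SELECT job_id, state FROM pg_dist_background_job ORDER BY job_id DESC LIMIT 1;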


@@ -104,7 +104,7 @@ test: multi_dropped_column_aliases foreign_key_restriction_enforcement
test: binary_protocol
test: alter_table_set_access_method
test: alter_distributed_table
test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758 issue_7477 issue_7891 issue_7896
test: object_propagation_debug
test: undistribute_table
test: run_command_on_all_nodes


@@ -1,45 +1,117 @@
---------------------------------------------------------------------
-- Regression Test: Simulate zombie replication slot when
-- citus_rebalance_wait() is canceled.
--
-- In the buggy behavior, canceling citus_rebalance_wait()
-- (via a short statement_timeout or Ctrl+C) leaves behind an active logical
-- replication slot on a worker. This, in turn, prevents DROP DATABASE
-- (with FORCE) from succeeding.
--
-- With the fix applied, the underlying rebalance job is canceled,
-- no zombie slot remains, and DROP DATABASE succeeds.
---------------------------------------------------------------------
---------------------------------------------------------------------
-- 1) Create an isolated schema for this test.
---------------------------------------------------------------------
CREATE SCHEMA issue_7896;
SET search_path TO issue_7896;
---------------------------------------------------------------------
-- 2) Set cluster parameters and initialize environment.
---------------------------------------------------------------------
-- We assume a coordinator with at least two workers.
-- Set replication factor to 2 and enable repartition joins.
SET citus.shard_replication_factor TO 2;
SET citus.enable_repartition_joins TO ON;
-- For faster background task processing, set a short background task queue interval.
ALTER SYSTEM SET citus.background_task_queue_interval TO '1s';
SELECT pg_reload_conf();
---------------------------------------------------------------------
-- 3) Create a distributed table.
---------------------------------------------------------------------
DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (a int PRIMARY KEY);
SELECT create_distributed_table('t1', 'a', shard_count => 4, colocate_with => 'none');
---------------------------------------------------------------------
-- 4) Insert enough data so that a rebalance has measurable work.
---------------------------------------------------------------------
INSERT INTO t1
SELECT generate_series(1, 1000000);
---------------------------------------------------------------------
-- 5) Verify that a rebalance on a balanced cluster is a no-op.
---------------------------------------------------------------------
SELECT 1 FROM citus_rebalance_start();
-- Expected: NOTICE "No moves available for rebalancing".
SELECT citus_rebalance_wait();
-- Expected: WARNING "no ongoing rebalance that can be waited on".
---------------------------------------------------------------------
-- 6) Force a shard movement so that a rebalance job is scheduled.
--    Remove and re-add a worker using a parameter placeholder.
---------------------------------------------------------------------
SELECT citus_remove_node('localhost', :worker_2_port);
SELECT citus_add_node('localhost', :worker_2_port);
---------------------------------------------------------------------
-- 7) Start a rebalance job that will do actual work.
---------------------------------------------------------------------
SELECT citus_rebalance_start(
    rebalance_strategy := 'by_disk_size',
    shard_transfer_mode := 'force_logical'
);
-- Expected: NOTICE that moves are scheduled as a background job.
-- (You may verify with: SELECT * FROM citus_rebalance_status();)
---------------------------------------------------------------------
-- 8) Attempt to wait on the rebalance with a short timeout so that the wait
--    is canceled. The PG_CATCH block in citus_job_wait_internal should then
--    cancel the underlying job (cleaning up temporary replication slots).
---------------------------------------------------------------------
SET statement_timeout = '2s';
DO $$
BEGIN
  BEGIN
    RAISE NOTICE 'Waiting on rebalance with a 2-second timeout...';
    -- Public function citus_rebalance_wait() takes no arguments.
    PERFORM citus_rebalance_wait();
  EXCEPTION
    WHEN query_canceled THEN
      RAISE NOTICE 'Rebalance wait canceled as expected';
      -- The fix should cancel the underlying rebalance job.
  END;
END;
$$ LANGUAGE plpgsql;
SET statement_timeout = '0';
---------------------------------------------------------------------
-- 9) Cleanup orphaned background resources (if any).
---------------------------------------------------------------------
CALL citus_cleanup_orphaned_resources();
---------------------------------------------------------------------
-- 10) Traverse nodes and check for active replication slots.
--
--     Connect to the coordinator and worker nodes, then query for replication slots.
--     Expected outcome (with the fix applied): no active replication slots.
---------------------------------------------------------------------
\c - - - :master_port
SELECT * FROM pg_replication_slots;
\c - - - :worker_1_port
SELECT * FROM pg_replication_slots;
\c - - - :worker_2_port
SELECT * FROM pg_replication_slots;
---------------------------------------------------------------------
-- 11) Cleanup: Drop the test schema.
---------------------------------------------------------------------
\c - - - :master_port
SET search_path TO issue_7896;
DROP SCHEMA IF EXISTS issue_7896 CASCADE;
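As a design note, steps 10 and 11 reconnect to each node with \c to inspect pg_replication_slots. An alternative sketch, assuming Citus's run_command_on_all_nodes() helper (the schedule above also exercises a test of that name), would check every node from a single coordinator session; exact result columns are not asserted here:

-- Sketch only: count active replication slots on every node without reconnecting.
-- Expected with the fix applied: 0 on every node.
SELECT * FROM run_command_on_all_nodes(
  $$ SELECT count(*) FROM pg_replication_slots WHERE active $$
);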