Fix flakiness in citus_split_shard_by_split_points_deferred_drop (#6819)

In CI we would sometimes get this failure:
```diff
 -- The original shard is marked for deferred drop with policy_type = 2.
 -- The previous shard should be dropped at the beginning of the second split call
 SELECT * from pg_dist_cleanup;
  record_id | operation_id | object_type |                               object_name                                | node_group_id | policy_type
 -----------+--------------+-------------+--------------------------------------------------------------------------+---------------+-------------
+        60 |          778 |           3 | citus_shard_split_slot_18_21216_778                                      |            16 |           0
        512 |          778 |           1 | citus_split_shard_by_split_points_deferred_schema.table_to_split_8981001 |            16 |           2
-(1 row)
+(2 rows)
```

Replication slots sometimes cannot be deleted right away, which is hard
to resolve. Luckily, we can easily exclude these cleanup records from the
test output by filtering on policy_type.

While debugging this issue I learnt that we did not use
`GetNextCleanupRecordId` in all places where we created cleanup
records. This caused test failures when running tests multiple times,
when they set `citus.next_cleanup_record_id`. I tried fixing that by
calling `GetNextCleanupRecordId` in all places, but that caused many
other tests to fail due to deadlocks. Instead, this commit addresses
that issue by using `ALTER SEQUENCE ... RESTART` rather than
`citus.next_cleanup_record_id`. In a follow-up PR we should
probably get rid of `citus.next_cleanup_record_id`, since it's
only used in one other file.
naisila/remove_misleading_constant
Jelte Fennema 2023-04-04 09:45:48 +02:00 committed by GitHub
parent 7c0589abb8
commit dcee370270
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 8 deletions

View File

@ -28,7 +28,7 @@ SET citus.next_placement_id TO 8610000;
SET citus.shard_count TO 2;
SET citus.shard_replication_factor TO 1;
SET citus.next_operation_id TO 777;
SET citus.next_cleanup_record_id TO 511;
ALTER SEQUENCE pg_catalog.pg_dist_cleanup_recordid_seq RESTART 511;
SET ROLE test_split_role;
SET search_path TO "citus_split_shard_by_split_points_deferred_schema";
CREATE TABLE table_to_split(id int PRIMARY KEY, int_data int, data text);
@ -64,10 +64,10 @@ SELECT pg_catalog.citus_split_shard_by_split_points(
-- The original shard is marked for deferred drop with policy_type = 2.
-- The previous shard should be dropped at the beginning of the second split call
SELECT * from pg_dist_cleanup;
SELECT * FROM pg_dist_cleanup WHERE policy_type = 2;
record_id | operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
512 | 778 | 1 | citus_split_shard_by_split_points_deferred_schema.table_to_split_8981001 | 16 | 2
526 | 778 | 1 | citus_split_shard_by_split_points_deferred_schema.table_to_split_8981001 | 16 | 2
(1 row)
-- One of the physical shards should not be deleted, the other one should.
@ -90,8 +90,12 @@ SELECT relname FROM pg_class where relname LIKE '%table_to_split_%' AND relkind
-- Perform deferred drop cleanup.
\c - postgres - :master_port
CALL citus_cleanup_orphaned_resources();
NOTICE: cleaned up 1 orphaned resources
SELECT public.wait_for_resource_cleanup();
wait_for_resource_cleanup
---------------------------------------------------------------------
(1 row)
-- Clenaup has been done.
SELECT * from pg_dist_cleanup;
record_id | operation_id | object_type | object_name | node_group_id | policy_type

View File

@ -24,7 +24,7 @@ SET citus.next_placement_id TO 8610000;
SET citus.shard_count TO 2;
SET citus.shard_replication_factor TO 1;
SET citus.next_operation_id TO 777;
SET citus.next_cleanup_record_id TO 511;
ALTER SEQUENCE pg_catalog.pg_dist_cleanup_recordid_seq RESTART 511;
SET ROLE test_split_role;
SET search_path TO "citus_split_shard_by_split_points_deferred_schema";
@ -51,7 +51,7 @@ SELECT pg_catalog.citus_split_shard_by_split_points(
-- The original shard is marked for deferred drop with policy_type = 2.
-- The previous shard should be dropped at the beginning of the second split call
SELECT * from pg_dist_cleanup;
SELECT * FROM pg_dist_cleanup WHERE policy_type = 2;
-- One of the physical shards should not be deleted, the other one should.
\c - - - :worker_1_port
@ -62,7 +62,7 @@ SELECT relname FROM pg_class where relname LIKE '%table_to_split_%' AND relkind
-- Perform deferred drop cleanup.
\c - postgres - :master_port
CALL citus_cleanup_orphaned_resources();
SELECT public.wait_for_resource_cleanup();
-- Clenaup has been done.
SELECT * from pg_dist_cleanup;