Do not rebalance if the shard replication factor is greater than the number of nodes allowed to hold shards (should_have_shards=true)

2023-07-20 13:08:46 +03:00 · 2023-07-20 13:08:46 +03:00 · c968dc9c27
parent c2f46f0f3f
commit c968dc9c27
5 changed files with 22 additions and 0 deletions
--- a/src/backend/distributed/operations/shard_rebalancer.c
+++ b/src/backend/distributed/operations/shard_rebalancer.c
@ -526,6 +526,13 @@ GetRebalanceSteps(RebalanceOptions *options)
 		}
 	}

+	if (shardAllowedNodeCount < ShardReplicationFactor)
+	{
+		ereport(ERROR, (errmsg("Shard replication factor (%d) cannot be greater than "
+							   "number of nodes with should_have_shards=true (%d).",
+							   ShardReplicationFactor, shardAllowedNodeCount)));
+	}
+
 	List *activeShardPlacementListList = NIL;
 	List *unbalancedShards = NIL;

--- a/src/test/regress/expected/shard_rebalancer.out
+++ b/src/test/regress/expected/shard_rebalancer.out
@ -2553,12 +2553,18 @@ SELECT public.wait_until_metadata_sync(30000);

 (1 row)

+-- errors out because shard replication factor > shard allowed node count
+SELECT rebalance_table_shards('test_rebalance_with_disabled_worker');
+ERROR:  Shard replication factor (2) cannot be greater than number of nodes with should_have_shards=true (1).
+-- set replication factor to one, and try again
+SET citus.shard_replication_factor TO 1;
 SELECT rebalance_table_shards('test_rebalance_with_disabled_worker');
 rebalance_table_shards
 ---------------------------------------------------------------------

 (1 row)

+SET citus.shard_replication_factor TO 2;
 SELECT 1 FROM citus_activate_node('localhost', :worker_2_port);
 ?column?
 ---------------------------------------------------------------------
--- a/src/test/regress/expected/single_node_enterprise.out
+++ b/src/test/regress/expected/single_node_enterprise.out
@ -411,6 +411,7 @@ NOTICE:  executing the command locally: SELECT count(*) AS count FROM single_nod

 ROLLBACK;
 NOTICE:  issuing ROLLBACK
+SET citus.shard_replication_factor TO 1;
 -- now, lets move all the shards of distributed tables out of the coordinator
 -- block writes is much faster for the sake of the test timings we prefer it
 SELECT master_drain_node('localhost', :master_port, shard_transfer_mode:='block_writes');
--- a/src/test/regress/sql/shard_rebalancer.sql
+++ b/src/test/regress/sql/shard_rebalancer.sql
@ -1427,8 +1427,14 @@ SELECT create_distributed_table('test_rebalance_with_disabled_worker', 'a', colo
 SELECT citus_disable_node('localhost', :worker_2_port);
 SELECT public.wait_until_metadata_sync(30000);

+-- errors out because shard replication factor > shard allowed node count
 SELECT rebalance_table_shards('test_rebalance_with_disabled_worker');

+-- set replication factor to one, and try again
+SET citus.shard_replication_factor TO 1;
+SELECT rebalance_table_shards('test_rebalance_with_disabled_worker');
+SET citus.shard_replication_factor TO 2;
+
 SELECT 1 FROM citus_activate_node('localhost', :worker_2_port);

 DROP TABLE test_rebalance_with_disabled_worker;
--- a/src/test/regress/sql/single_node_enterprise.sql
+++ b/src/test/regress/sql/single_node_enterprise.sql
@ -272,6 +272,8 @@ BEGIN;
 	SELECT count(*) FROM test;
 ROLLBACK;

+SET citus.shard_replication_factor TO 1;
+
 -- now, lets move all the shards of distributed tables out of the coordinator
 -- block writes is much faster for the sake of the test timings we prefer it
 SELECT master_drain_node('localhost', :master_port, shard_transfer_mode:='block_writes');