From c968dc9c274592c0f7efa60744e7eda4df248150 Mon Sep 17 00:00:00 2001 From: ahmet gedemenli Date: Thu, 20 Jul 2023 13:08:46 +0300 Subject: [PATCH] Do not rebalance if replication factor is greater than the node count --- src/backend/distributed/operations/shard_rebalancer.c | 7 +++++++ src/test/regress/expected/shard_rebalancer.out | 6 ++++++ src/test/regress/expected/single_node_enterprise.out | 1 + src/test/regress/sql/shard_rebalancer.sql | 6 ++++++ src/test/regress/sql/single_node_enterprise.sql | 2 ++ 5 files changed, 22 insertions(+) diff --git a/src/backend/distributed/operations/shard_rebalancer.c b/src/backend/distributed/operations/shard_rebalancer.c index a8cb3df5c..61a4ee9b0 100644 --- a/src/backend/distributed/operations/shard_rebalancer.c +++ b/src/backend/distributed/operations/shard_rebalancer.c @@ -526,6 +526,13 @@ GetRebalanceSteps(RebalanceOptions *options) } } + if (shardAllowedNodeCount < ShardReplicationFactor) + { + ereport(ERROR, (errmsg("shard replication factor (%d) cannot be greater than " + "number of nodes with should_have_shards=true (%d)", + ShardReplicationFactor, shardAllowedNodeCount))); + } + List *activeShardPlacementListList = NIL; List *unbalancedShards = NIL; diff --git a/src/test/regress/expected/shard_rebalancer.out b/src/test/regress/expected/shard_rebalancer.out index b8f4010b1..6d608d1f9 100644 --- a/src/test/regress/expected/shard_rebalancer.out +++ b/src/test/regress/expected/shard_rebalancer.out @@ -2553,12 +2553,18 @@ SELECT public.wait_until_metadata_sync(30000); (1 row) +-- errors out because shard replication factor > shard allowed node count +SELECT rebalance_table_shards('test_rebalance_with_disabled_worker'); +ERROR: shard replication factor (2) cannot be greater than number of nodes with should_have_shards=true (1) 
+-- set replication factor to one, and try again +SET citus.shard_replication_factor TO 1; SELECT rebalance_table_shards('test_rebalance_with_disabled_worker'); rebalance_table_shards --------------------------------------------------------------------- (1 row) +SET citus.shard_replication_factor TO 2; SELECT 1 FROM citus_activate_node('localhost', :worker_2_port); ?column? --------------------------------------------------------------------- diff --git a/src/test/regress/expected/single_node_enterprise.out b/src/test/regress/expected/single_node_enterprise.out index 305a02b8e..79f231864 100644 --- a/src/test/regress/expected/single_node_enterprise.out +++ b/src/test/regress/expected/single_node_enterprise.out @@ -411,6 +411,7 @@ NOTICE: executing the command locally: SELECT count(*) AS count FROM single_nod ROLLBACK; NOTICE: issuing ROLLBACK +SET citus.shard_replication_factor TO 1; -- now, lets move all the shards of distributed tables out of the coordinator -- block writes is much faster for the sake of the test timings we prefer it SELECT master_drain_node('localhost', :master_port, shard_transfer_mode:='block_writes'); diff --git a/src/test/regress/sql/shard_rebalancer.sql b/src/test/regress/sql/shard_rebalancer.sql index d64fb6826..a53ec8752 100644 --- a/src/test/regress/sql/shard_rebalancer.sql +++ b/src/test/regress/sql/shard_rebalancer.sql @@ -1427,8 +1427,14 @@ SELECT create_distributed_table('test_rebalance_with_disabled_worker', 'a', colo SELECT citus_disable_node('localhost', :worker_2_port); SELECT public.wait_until_metadata_sync(30000); +-- errors out because shard replication factor > shard allowed node count SELECT rebalance_table_shards('test_rebalance_with_disabled_worker'); +-- set replication factor to one, and try again +SET citus.shard_replication_factor TO 1; +SELECT rebalance_table_shards('test_rebalance_with_disabled_worker'); +SET citus.shard_replication_factor TO 2; + SELECT 1 FROM citus_activate_node('localhost', :worker_2_port); DROP 
TABLE test_rebalance_with_disabled_worker; diff --git a/src/test/regress/sql/single_node_enterprise.sql b/src/test/regress/sql/single_node_enterprise.sql index fb6e47b9a..19393ba24 100644 --- a/src/test/regress/sql/single_node_enterprise.sql +++ b/src/test/regress/sql/single_node_enterprise.sql @@ -272,6 +272,8 @@ BEGIN; SELECT count(*) FROM test; ROLLBACK; +SET citus.shard_replication_factor TO 1; + -- now, lets move all the shards of distributed tables out of the coordinator -- block writes is much faster for the sake of the test timings we prefer it SELECT master_drain_node('localhost', :master_port, shard_transfer_mode:='block_writes');