From ac24e1198682ed02caea5ec6b36b01984350b134 Mon Sep 17 00:00:00 2001
From: Jelte Fennema <jelte.fennema@microsoft.com>
Date: Mon, 3 Jul 2023 11:08:24 +0200
Subject: [PATCH] Change default rebalance strategy to by_disk_size (#7033)

DESCRIPTION: Change default rebalance strategy to by_disk_size

When introducing rebalancing by disk size we didn't make it the default
initially. The main reason was, because we expected some problems with
it. We have indeed had some problems/bugs with it over the years, and
have fixed all of them. By now we're quite confident in its stability,
and that it pretty much always gives better results than by_shard_count.

So this PR makes by_disk_size the new default. We don't change the
default when some other strategy than by_shard_count is the current
default. This is in case someone defined their own rebalance strategy
and marked this as the default themselves.

Note: It explicitly does nothing during a downgrade, because there's no
way of knowing if the rebalance strategy before the upgrade was
by_disk_size or by_shard_count. And even in previous versions
by_disk_size is considered superior for quite some time.
---
 .../distributed/sql/citus--11.3-1--12.0-1.sql      |  7 +++++++
 .../sql/downgrades/citus--12.0-1--11.3-1.sql       |  5 +++++
 src/test/regress/expected/shard_rebalancer.out     | 14 ++++++++++++++
 .../regress/expected/single_shard_table_udfs.out   |  2 +-
 src/test/regress/sql/shard_rebalancer.sql          |  4 ++++
 src/test/regress/sql/single_shard_table_udfs.sql   |  2 +-
 6 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/backend/distributed/sql/citus--11.3-1--12.0-1.sql b/src/backend/distributed/sql/citus--11.3-1--12.0-1.sql
index 998ffc2be..a35d772b7 100644
--- a/src/backend/distributed/sql/citus--11.3-1--12.0-1.sql
+++ b/src/backend/distributed/sql/citus--11.3-1--12.0-1.sql
@@ -42,3 +42,10 @@ DROP FUNCTION citus_shard_sizes;
 
 #include "udfs/drop_old_time_partitions/12.0-1.sql"
 #include "udfs/get_missing_time_partition_ranges/12.0-1.sql"
+
+-- Update the default rebalance strategy to 'by_disk_size', but only if the
+-- default is currently 'by_shard_count'
+SELECT citus_set_default_rebalance_strategy(name)
+FROM pg_dist_rebalance_strategy
+WHERE name = 'by_disk_size'
+    AND (SELECT default_strategy FROM pg_dist_rebalance_strategy WHERE name = 'by_shard_count');
diff --git a/src/backend/distributed/sql/downgrades/citus--12.0-1--11.3-1.sql b/src/backend/distributed/sql/downgrades/citus--12.0-1--11.3-1.sql
index 1adb4cb72..4a25cfda0 100644
--- a/src/backend/distributed/sql/downgrades/citus--12.0-1--11.3-1.sql
+++ b/src/backend/distributed/sql/downgrades/citus--12.0-1--11.3-1.sql
@@ -76,3 +76,8 @@ DROP FUNCTION pg_catalog.citus_stat_tenants_local_internal(
 
 #include "../udfs/drop_old_time_partitions/10.2-1.sql"
 #include "../udfs/get_missing_time_partition_ranges/10.2-1.sql"
+
+-- This explicitly does not reset the rebalance strategy to by_shard_count,
+-- because there's no way of knowing if the rebalance strategy before the
+-- upgrade was by_disk_size or by_shard_count. And even in previous versions
+-- by_disk_size is considered superior for quite some time.
diff --git a/src/test/regress/expected/shard_rebalancer.out b/src/test/regress/expected/shard_rebalancer.out
index 62ae17487..b8f4010b1 100644
--- a/src/test/regress/expected/shard_rebalancer.out
+++ b/src/test/regress/expected/shard_rebalancer.out
@@ -3,6 +3,14 @@
 --
 SET citus.next_shard_id TO 433000;
 SET citus.propagate_session_settings_for_loopback_connection TO ON;
+-- Because of historic reasons this test was written in a way that assumes that
+-- by_shard_count is the default strategy.
+SELECT citus_set_default_rebalance_strategy('by_shard_count');
+ citus_set_default_rebalance_strategy
+---------------------------------------------------------------------
+
+(1 row)
+
 -- Lower the minimum disk size that a shard group is considered as. Otherwise
 -- we need to create shards of more than 100MB.
 ALTER SYSTEM SET citus.rebalancer_by_disk_size_base_cost = 0;
@@ -2863,6 +2871,12 @@ select 1 from citus_add_node('localhost', :worker_2_port);
 select rebalance_table_shards();
 ERROR:  cannot use logical replication to transfer shards of the relation table_without_primary_key since it doesn't have a REPLICA IDENTITY or PRIMARY KEY
 DROP TABLE table_with_primary_key, table_without_primary_key;
+SELECT citus_set_default_rebalance_strategy('by_disk_size');
+ citus_set_default_rebalance_strategy
+---------------------------------------------------------------------
+
+(1 row)
+
 ALTER SYSTEM RESET citus.rebalancer_by_disk_size_base_cost;
 SELECT pg_reload_conf();
  pg_reload_conf
diff --git a/src/test/regress/expected/single_shard_table_udfs.out b/src/test/regress/expected/single_shard_table_udfs.out
index 3b73473b0..d49027b60 100644
--- a/src/test/regress/expected/single_shard_table_udfs.out
+++ b/src/test/regress/expected/single_shard_table_udfs.out
@@ -334,7 +334,7 @@ ERROR:  Table 'single_shard_table_col2_1' is streaming replicated. Shards of str
 SELECT citus_copy_shard_placement(1820005, :worker_1_node, :worker_2_node);
 ERROR:  Table 'single_shard_table_col2_1' is streaming replicated. Shards of streaming replicated tables cannot be copied
 -- no changes because it's already balanced
-SELECT rebalance_table_shards();
+SELECT rebalance_table_shards(rebalance_strategy := 'by_shard_count');
  rebalance_table_shards
 ---------------------------------------------------------------------
 
diff --git a/src/test/regress/sql/shard_rebalancer.sql b/src/test/regress/sql/shard_rebalancer.sql
index ba22a8abd..d64fb6826 100644
--- a/src/test/regress/sql/shard_rebalancer.sql
+++ b/src/test/regress/sql/shard_rebalancer.sql
@@ -5,6 +5,9 @@
 SET citus.next_shard_id TO 433000;
 SET citus.propagate_session_settings_for_loopback_connection TO ON;
 
+-- Because of historic reasons this test was written in a way that assumes that
+-- by_shard_count is the default strategy.
+SELECT citus_set_default_rebalance_strategy('by_shard_count');
 -- Lower the minimum disk size that a shard group is considered as. Otherwise
 -- we need to create shards of more than 100MB.
 ALTER SYSTEM SET citus.rebalancer_by_disk_size_base_cost = 0;
@@ -1574,6 +1577,7 @@ select 1 from citus_add_node('localhost', :worker_2_port);
 select rebalance_table_shards();
 
 DROP TABLE table_with_primary_key, table_without_primary_key;
+SELECT citus_set_default_rebalance_strategy('by_disk_size');
 ALTER SYSTEM RESET citus.rebalancer_by_disk_size_base_cost;
 SELECT pg_reload_conf();
 \c - - - :worker_1_port
diff --git a/src/test/regress/sql/single_shard_table_udfs.sql b/src/test/regress/sql/single_shard_table_udfs.sql
index 615e566db..7566f53e3 100644
--- a/src/test/regress/sql/single_shard_table_udfs.sql
+++ b/src/test/regress/sql/single_shard_table_udfs.sql
@@ -160,7 +160,7 @@ SELECT master_copy_shard_placement(1820005, 'localhost', :worker_1_port, 'localh
 SELECT citus_copy_shard_placement(1820005, :worker_1_node, :worker_2_node);
 
 -- no changes because it's already balanced
-SELECT rebalance_table_shards();
+SELECT rebalance_table_shards(rebalance_strategy := 'by_shard_count');
 
 -- same placements
 SELECT shardid, nodeport FROM pg_dist_shard_placement WHERE shardid > 1820000 ORDER BY shardid;