mirror of https://github.com/citusdata/citus.git
Ignore nodes not allowed for shards, when planning rebalance steps (#6887)
Colocation groups whose shard group count is less than the worker node count are handled with a method different from the usual rebalancer (see #6739). When deciding whether to use that method, we should have ignored nodes that are marked `shouldhaveshards = false`. This PR excludes those nodes from the decision.

Adds a test for the following scenario:

coordinator: []
worker 1: [1_1, 1_2]
worker 2: [2_1, 2_2]

(rebalance)

coordinator: []
worker 1: [1_1, 2_1]
worker 2: [1_2, 2_2]

If we took the coordinator into account, the rebalancer would consider the first state balanced and do nothing (because shard_count < worker_count). With this PR, the coordinator is ignored because it has shouldhaveshards = false, so the rebalancer distributes each colocation group across both workers.

Also fixes an unrelated flaky test in the same file.
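For readers skimming the commit, the decision in isolation: a colocation group's placement count should be compared against the number of nodes that may actually hold shards, not against every active node. The sketch below is a minimal standalone illustration, not Citus code; WorkerNodeSketch, ShardAllowedNodeCount, and UseRegularPerGroupRebalance are made-up stand-ins for the real WorkerNode list, foreach_ptr loop, and GetRebalanceSteps check shown in the diff.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* simplified stand-in for a node entry; only the field used here */
typedef struct WorkerNodeSketch
{
	bool shouldHaveShards;
} WorkerNodeSketch;

/*
 * Count only the nodes allowed to hold shards; nodes added with
 * shouldhaveshards = false (e.g. a coordinator that only holds
 * reference tables) must not inflate the threshold.
 */
static int
ShardAllowedNodeCount(const WorkerNodeSketch *nodes, size_t nodeCount)
{
	int shardAllowedNodeCount = 0;
	for (size_t i = 0; i < nodeCount; i++)
	{
		if (nodes[i].shouldHaveShards)
		{
			shardAllowedNodeCount++;
		}
	}
	return shardAllowedNodeCount;
}

/*
 * A colocation group is "big enough" for the regular per-group rebalance
 * only when it has at least one placement per shard-allowed node; smaller
 * groups fall through to the combined handling introduced in #6739.
 */
static bool
UseRegularPerGroupRebalance(int activePlacementCount,
							const WorkerNodeSketch *nodes, size_t nodeCount)
{
	return activePlacementCount >= ShardAllowedNodeCount(nodes, nodeCount);
}

int
main(void)
{
	/* coordinator (no shards) plus two workers, as in the new regression test */
	WorkerNodeSketch nodes[] = { { false }, { true }, { true } };
	int placementsInGroup = 2; /* shard_count => 2, replication factor 1 */

	/* before the fix, the comparison used all 3 nodes and this returned "no" */
	printf("regular per-group rebalance: %s\n",
		   UseRegularPerGroupRebalance(placementsInGroup, nodes, 3) ? "yes" : "no");
	return 0;
}

With the coordinator registered but marked shouldhaveshards = false, a two-shard group now clears the threshold (2 >= 2) and goes through the regular per-group rebalance, which is the scenario exercised by the new regression test below.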
parent 8cb69cfd13
commit 59ccf364df
@@ -515,6 +515,16 @@ GetRebalanceSteps(RebalanceOptions *options)
 	/* sort the lists to make the function more deterministic */
 	List *activeWorkerList = SortedActiveWorkers();
+	int shardAllowedNodeCount = 0;
+	WorkerNode *workerNode = NULL;
+	foreach_ptr(workerNode, activeWorkerList)
+	{
+		if (workerNode->shouldHaveShards)
+		{
+			shardAllowedNodeCount++;
+		}
+	}
+
 	List *activeShardPlacementListList = NIL;
 	List *unbalancedShards = NIL;

@@ -532,8 +542,7 @@ GetRebalanceSteps(RebalanceOptions *options)
 					shardPlacementList, options->workerNode);
 		}

-		if (list_length(activeShardPlacementListForRelation) >= list_length(
-				activeWorkerList))
+		if (list_length(activeShardPlacementListForRelation) >= shardAllowedNodeCount)
 		{
 			activeShardPlacementListList = lappend(activeShardPlacementListList,
 												   activeShardPlacementListForRelation);
@@ -20,13 +20,14 @@ SELECT create_distributed_table('dist_table_test', 'a');
 CREATE TABLE postgres_table_test(a int primary key);
 -- make sure that all rebalance operations works fine when
 -- reference tables are replicated to the coordinator
+SET client_min_messages TO ERROR;
 SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0);
-NOTICE:  localhost:xxxxx is the coordinator and already contains metadata, skipping syncing the metadata
  ?column?
 ---------------------------------------------------------------------
         1
 (1 row)

+RESET client_min_messages;
 -- should just be noops even if we add the coordinator to the pg_dist_node
 SELECT rebalance_table_shards('dist_table_test');
  rebalance_table_shards
@@ -2713,6 +2714,113 @@ SELECT sh.logicalrelid, pl.nodeport
 (5 rows)

 DROP TABLE single_shard_colocation_1a, single_shard_colocation_1b, single_shard_colocation_1c, single_shard_colocation_2a, single_shard_colocation_2b CASCADE;
+-- test the same with coordinator shouldhaveshards = false and shard_count = 2
+-- so that the shard allowed node count would be 2 when rebalancing
+-- for such cases, we only count the nodes that are allowed for shard placements
+UPDATE pg_dist_node SET shouldhaveshards=false WHERE nodeport = :master_port;
+create table two_shard_colocation_1a (a int primary key);
+create table two_shard_colocation_1b (a int primary key);
+SET citus.shard_replication_factor = 1;
+select create_distributed_table('two_shard_colocation_1a','a', colocate_with => 'none', shard_count => 2);
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+select create_distributed_table('two_shard_colocation_1b','a',colocate_with=>'two_shard_colocation_1a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+create table two_shard_colocation_2a (a int primary key);
+create table two_shard_colocation_2b (a int primary key);
+select create_distributed_table('two_shard_colocation_2a','a', colocate_with => 'none', shard_count => 2);
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+select create_distributed_table('two_shard_colocation_2b','a',colocate_with=>'two_shard_colocation_2a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+-- move shards of colocation group 1 to worker1
+SELECT citus_move_shard_placement(sh.shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port)
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid = 'two_shard_colocation_1a'::regclass
+    AND pl.nodeport = :worker_2_port
+  LIMIT 1;
+ citus_move_shard_placement
+---------------------------------------------------------------------
+
+(1 row)
+
+-- move shards of colocation group 2 to worker2
+SELECT citus_move_shard_placement(sh.shardid, 'localhost', :worker_1_port, 'localhost', :worker_2_port)
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid = 'two_shard_colocation_2a'::regclass
+    AND pl.nodeport = :worker_1_port
+  LIMIT 1;
+ citus_move_shard_placement
+---------------------------------------------------------------------
+
+(1 row)
+
+-- current state:
+-- coordinator: []
+-- worker 1: [1_1, 1_2]
+-- worker 2: [2_1, 2_2]
+SELECT sh.logicalrelid, pl.nodeport
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid::text IN ('two_shard_colocation_1a', 'two_shard_colocation_1b', 'two_shard_colocation_2a', 'two_shard_colocation_2b')
+  ORDER BY sh.logicalrelid, pl.nodeport;
+      logicalrelid       | nodeport
+---------------------------------------------------------------------
+ two_shard_colocation_1a |    57637
+ two_shard_colocation_1a |    57637
+ two_shard_colocation_1b |    57637
+ two_shard_colocation_1b |    57637
+ two_shard_colocation_2a |    57638
+ two_shard_colocation_2a |    57638
+ two_shard_colocation_2b |    57638
+ two_shard_colocation_2b |    57638
+(8 rows)
+
+-- If we take the coordinator into account, the rebalancer considers this as balanced and does nothing (shard_count < worker_count)
+-- but because the coordinator is not allowed for shards, rebalancer will distribute each colocation group to both workers
+select rebalance_table_shards(shard_transfer_mode:='block_writes');
+NOTICE:  Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ...
+NOTICE:  Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ...
+ rebalance_table_shards
+---------------------------------------------------------------------
+
+(1 row)
+
+-- final state:
+-- coordinator: []
+-- worker 1: [1_1, 2_1]
+-- worker 2: [1_2, 2_2]
+SELECT sh.logicalrelid, pl.nodeport
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid::text IN ('two_shard_colocation_1a', 'two_shard_colocation_1b', 'two_shard_colocation_2a', 'two_shard_colocation_2b')
+  ORDER BY sh.logicalrelid, pl.nodeport;
+      logicalrelid       | nodeport
+---------------------------------------------------------------------
+ two_shard_colocation_1a |    57637
+ two_shard_colocation_1a |    57638
+ two_shard_colocation_1b |    57637
+ two_shard_colocation_1b |    57638
+ two_shard_colocation_2a |    57637
+ two_shard_colocation_2a |    57638
+ two_shard_colocation_2b |    57637
+ two_shard_colocation_2b |    57638
+(8 rows)
+
+-- cleanup
+DROP TABLE two_shard_colocation_1a, two_shard_colocation_1b, two_shard_colocation_2a, two_shard_colocation_2b CASCADE;
 -- verify we detect if one of the tables do not have a replica identity or primary key
 -- and error out in case of shard transfer mode = auto
 SELECT 1 FROM citus_remove_node('localhost', :worker_2_port);
@@ -13,7 +13,9 @@ CREATE TABLE postgres_table_test(a int primary key);

 -- make sure that all rebalance operations works fine when
 -- reference tables are replicated to the coordinator
+SET client_min_messages TO ERROR;
 SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0);
+RESET client_min_messages;

 -- should just be noops even if we add the coordinator to the pg_dist_node
 SELECT rebalance_table_shards('dist_table_test');
@@ -1497,6 +1499,61 @@ SELECT sh.logicalrelid, pl.nodeport

 DROP TABLE single_shard_colocation_1a, single_shard_colocation_1b, single_shard_colocation_1c, single_shard_colocation_2a, single_shard_colocation_2b CASCADE;

+-- test the same with coordinator shouldhaveshards = false and shard_count = 2
+-- so that the shard allowed node count would be 2 when rebalancing
+-- for such cases, we only count the nodes that are allowed for shard placements
+UPDATE pg_dist_node SET shouldhaveshards=false WHERE nodeport = :master_port;
+
+create table two_shard_colocation_1a (a int primary key);
+create table two_shard_colocation_1b (a int primary key);
+SET citus.shard_replication_factor = 1;
+
+select create_distributed_table('two_shard_colocation_1a','a', colocate_with => 'none', shard_count => 2);
+select create_distributed_table('two_shard_colocation_1b','a',colocate_with=>'two_shard_colocation_1a');
+
+create table two_shard_colocation_2a (a int primary key);
+create table two_shard_colocation_2b (a int primary key);
+select create_distributed_table('two_shard_colocation_2a','a', colocate_with => 'none', shard_count => 2);
+select create_distributed_table('two_shard_colocation_2b','a',colocate_with=>'two_shard_colocation_2a');
+
+-- move shards of colocation group 1 to worker1
+SELECT citus_move_shard_placement(sh.shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port)
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid = 'two_shard_colocation_1a'::regclass
+    AND pl.nodeport = :worker_2_port
+  LIMIT 1;
+-- move shards of colocation group 2 to worker2
+SELECT citus_move_shard_placement(sh.shardid, 'localhost', :worker_1_port, 'localhost', :worker_2_port)
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid = 'two_shard_colocation_2a'::regclass
+    AND pl.nodeport = :worker_1_port
+  LIMIT 1;
+
+-- current state:
+-- coordinator: []
+-- worker 1: [1_1, 1_2]
+-- worker 2: [2_1, 2_2]
+SELECT sh.logicalrelid, pl.nodeport
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid::text IN ('two_shard_colocation_1a', 'two_shard_colocation_1b', 'two_shard_colocation_2a', 'two_shard_colocation_2b')
+  ORDER BY sh.logicalrelid, pl.nodeport;
+
+-- If we take the coordinator into account, the rebalancer considers this as balanced and does nothing (shard_count < worker_count)
+-- but because the coordinator is not allowed for shards, rebalancer will distribute each colocation group to both workers
+select rebalance_table_shards(shard_transfer_mode:='block_writes');
+
+-- final state:
+-- coordinator: []
+-- worker 1: [1_1, 2_1]
+-- worker 2: [1_2, 2_2]
+SELECT sh.logicalrelid, pl.nodeport
+  FROM pg_dist_shard sh JOIN pg_dist_shard_placement pl ON sh.shardid = pl.shardid
+  WHERE sh.logicalrelid::text IN ('two_shard_colocation_1a', 'two_shard_colocation_1b', 'two_shard_colocation_2a', 'two_shard_colocation_2b')
+  ORDER BY sh.logicalrelid, pl.nodeport;
+
+-- cleanup
+DROP TABLE two_shard_colocation_1a, two_shard_colocation_1b, two_shard_colocation_2a, two_shard_colocation_2b CASCADE;
+
 -- verify we detect if one of the tables do not have a replica identity or primary key
 -- and error out in case of shard transfer mode = auto
 SELECT 1 FROM citus_remove_node('localhost', :worker_2_port);