/*-------------------------------------------------------------------------
 *
 * rebalancer_placement_separation.c
 *   Routines to determine which worker node should be used to separate
 *   a colocated set of shard placements that need separate nodes.
 *
 * Copyright (c) Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "nodes/pg_list.h"
#include "utils/hsearch.h"
#include "utils/lsyscache.h"
#include "distributed/colocation_utils.h"
#include "distributed/hash_helpers.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_utility.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/rebalancer_placement_separation.h"
#include "distributed/shard_rebalancer.h"
struct RebalancerPlacementSeparationContext
{
/*
 * Hash table where each entry is a NodeToPlacementGroupHashEntry, i.e., it
 * maps the node with nodeGroupId to the NodeToPlacementGroupHashEntry that
 * records the separation decisions made for that node.
 */
HTAB *nodePlacementGroupHash;
};

/*
 * Entry of the hash table that maps each primary worker node to the shard
 * placement group (if any) that is determined to be separated from the other
 * shards in the cluster via that node.
 */
typedef struct NodeToPlacementGroupHashEntry
{
/* hash key -- group id of the node */
int32 nodeGroupId;

/*
 * Whether the given node is allowed to have any shards.
 *
 * Inherited from WorkerNode->shouldHaveShards.
 */
bool shouldHaveShards;

/*
 * Whether the given node has some shard placements that cannot be moved
 * away.
 *
 * For the nodes that this rebalancer run is not allowed to move placements
 * away from, InitRebalancerPlacementSeparationContext() sets this to true
 * if the node already has some shard placements. And if the node already
 * stores a shardgroup placement that needs a separate node, it also sets
 * assignedPlacementGroup.
 *
 * We do so to prevent TryAssignPlacementGroupsToNodeGroups() from making
 * incorrect assignments later on.
 *
 * See InitRebalancerPlacementSeparationContext() for more details.
 */
bool hasPlacementsThatCannotBeMovedAway;

/*
 * Shardgroup placement that is assigned to this node to be separated
 * from others in the cluster.
 *
 * NULL if no shardgroup placement is assigned yet.
 */
ShardgroupPlacement *assignedPlacementGroup;
} NodeToPlacementGroupHashEntry;

/*
 * Routines to prepare RebalancerPlacementSeparationContext.
 */
static void InitRebalancerPlacementSeparationContext(
RebalancerPlacementSeparationContext *context,
List *activeWorkerNodeList,
List *rebalancePlacementList);
static void TryAssignPlacementGroupsToNodeGroups(
RebalancerPlacementSeparationContext *context,
List *activeWorkerNodeList,
List *rebalancePlacementList,
FmgrInfo *shardAllowedOnNodeUDF);
static bool TryAssignPlacementGroupToNodeGroup(
RebalancerPlacementSeparationContext *context,
int32 candidateNodeGroupId,
ShardPlacement *shardPlacement,
FmgrInfo *shardAllowedOnNodeUDF);

/* other helpers */
static List * PlacementListGetUniqueNodeGroupIds(List *placementList);

/*
 * PrepareRebalancerPlacementSeparationContext creates a
 * RebalancerPlacementSeparationContext that keeps track of which worker nodes
 * are used to separate which of the shardgroup placements that need separate
 * nodes.
 */
RebalancerPlacementSeparationContext *
PrepareRebalancerPlacementSeparationContext(List *activeWorkerNodeList,
List *rebalancePlacementList,
FmgrInfo *shardAllowedOnNodeUDF)
{
HTAB *nodePlacementGroupHash =
CreateSimpleHashWithNameAndSize(uint32, NodeToPlacementGroupHashEntry,
"NodeToPlacementGroupHash",
list_length(activeWorkerNodeList));
RebalancerPlacementSeparationContext *context =
palloc(sizeof(RebalancerPlacementSeparationContext));
context->nodePlacementGroupHash = nodePlacementGroupHash;
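/*
 * Sort both lists so that the node / placement traversal order below, and
 * hence the assignments this context ends up with, don't depend on the
 * order of the input lists.
 */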
activeWorkerNodeList = SortList(activeWorkerNodeList, CompareWorkerNodes);
rebalancePlacementList = SortList(rebalancePlacementList, CompareShardPlacements);
InitRebalancerPlacementSeparationContext(context, activeWorkerNodeList,
rebalancePlacementList);
TryAssignPlacementGroupsToNodeGroups(context,
activeWorkerNodeList,
rebalancePlacementList,
shardAllowedOnNodeUDF);
return context;
}
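
/*
 * Illustrative usage sketch (not called from this file; the function and
 * variable names below are made up for the example): a rebalancer-side
 * caller could first build the context and then consult it for every
 * (placement, worker node) pair it considers, e.g. to collect the nodes
 * that may store a given placement:
 *
 *   static List *
 *   ExampleAllowedWorkersForPlacement(List *activeWorkerNodeList,
 *                                     List *rebalancePlacementList,
 *                                     FmgrInfo *shardAllowedOnNodeUDF,
 *                                     ShardPlacement *placement)
 *   {
 *       RebalancerPlacementSeparationContext *context =
 *           PrepareRebalancerPlacementSeparationContext(activeWorkerNodeList,
 *                                                       rebalancePlacementList,
 *                                                       shardAllowedOnNodeUDF);
 *
 *       List *allowedWorkerList = NIL;
 *
 *       WorkerNode *workerNode = NULL;
 *       foreach_ptr(workerNode, activeWorkerNodeList)
 *       {
 *           if (RebalancerPlacementSeparationContextPlacementIsAllowedOnWorker(
 *                   context, placement->shardId, placement->placementId,
 *                   workerNode))
 *           {
 *               allowedWorkerList = lappend(allowedWorkerList, workerNode);
 *           }
 *       }
 *
 *       return allowedWorkerList;
 *   }
 */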

/*
 * InitRebalancerPlacementSeparationContext initializes the given
 * RebalancerPlacementSeparationContext by using the given list of active
 * worker nodes and the given list of shard placements that are to be
 * rebalanced.
 */
static void
InitRebalancerPlacementSeparationContext(RebalancerPlacementSeparationContext *context,
List *activeWorkerNodeList,
List *rebalancePlacementList)
{
HTAB *nodePlacementGroupHash = context->nodePlacementGroupHash;
List *placementListUniqueNodeGroupIds =
PlacementListGetUniqueNodeGroupIds(rebalancePlacementList);
WorkerNode *workerNode = NULL;
foreach_ptr(workerNode, activeWorkerNodeList)
{
NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
hash_search(nodePlacementGroupHash, &workerNode->groupId, HASH_ENTER,
NULL);
nodePlacementGroupHashEntry->shouldHaveShards =
workerNode->shouldHaveShards;
nodePlacementGroupHashEntry->hasPlacementsThatCannotBeMovedAway = false;
nodePlacementGroupHashEntry->assignedPlacementGroup = NULL;
/*
 * Let's call the set of nodes that store the placements in
 * rebalancePlacementList D, and the rest of the nodes S. In other words,
 * D is the set of nodes that we're allowed to move placements "from" or
 * "to (*)" (* = unless we're draining the node), and S is the set of nodes
 * that we're only allowed to move placements "to", not "from".
 *
 * This means that, for a node of type S, whether the node is used to
 * separate a placement group cannot change at runtime.
 *
 * For this reason, below we look up the placement groups already assigned
 * to nodes of type S, because we want to avoid moving placements (if any)
 * from a node of type D to a node within S that is already used to separate
 * a placement group. We also set hasPlacementsThatCannotBeMovedAway to true
 * for the nodes within S that already have some shard placements, because
 * we want to avoid moving placements that need a separate node (if any)
 * from a node of type D to a node of type S.
 *
 * We skip the code below for nodes of type D, not for optimization purposes
 * but because it would be incorrect to assume that the current placement
 * distribution of a node of type D stays the same after the rebalancer
 * plans the moves.
 */
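/*
 * As a hypothetical example: if node group 7 stores none of the placements
 * in rebalancePlacementList (type S) but already hosts a separated
 * shardgroup placement, we record that assignment here so that
 * TryAssignPlacementGroupsToNodeGroups() never picks node group 7 for
 * another placement group, and so that regular placements keep being
 * rejected for it by
 * RebalancerPlacementSeparationContextPlacementIsAllowedOnWorker().
 */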
if (!workerNode->shouldHaveShards)
{
/* we can't assign any shardgroup placements to the node anyway */
continue;
}
if (list_member_int(placementListUniqueNodeGroupIds, workerNode->groupId))
{
/* node is of type D */
continue;
}
/* node is of type S */
nodePlacementGroupHashEntry->hasPlacementsThatCannotBeMovedAway =
NodeGroupHasDistributedTableShardPlacements(
nodePlacementGroupHashEntry->nodeGroupId);
nodePlacementGroupHashEntry->assignedPlacementGroup =
NodeGroupGetSeparatedShardgroupPlacement(
nodePlacementGroupHashEntry->nodeGroupId);
}
}

/*
 * TryAssignPlacementGroupsToNodeGroups tries to assign the placements within
 * the given placement list that need separate nodes to individual worker
 * nodes.
 */
static void
TryAssignPlacementGroupsToNodeGroups(RebalancerPlacementSeparationContext *context,
List *activeWorkerNodeList,
List *rebalancePlacementList,
FmgrInfo *shardAllowedOnNodeUDF)
{
List *unassignedPlacementList = NIL;
/*
 * First, assign as many shardgroup placements as possible to the worker
 * nodes that already store them.
 */
ShardPlacement *shardPlacement = NULL;
foreach_ptr(shardPlacement, rebalancePlacementList)
{
ShardInterval *shardInterval = LoadShardInterval(shardPlacement->shardId);
if (!shardInterval->needsSeparateNode)
{
continue;
}
int32 currentNodeGroupId = shardPlacement->groupId;
if (!TryAssignPlacementGroupToNodeGroup(context,
currentNodeGroupId,
shardPlacement,
shardAllowedOnNodeUDF))
{
unassignedPlacementList =
lappend(unassignedPlacementList, shardPlacement);
}
}
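/*
 * The current node of a placement can be rejected above e.g. when two
 * shardgroup placements that both need a separate node currently live on
 * the same node, when the node is not allowed to have shards anymore, or
 * when shardAllowedOnNodeUDF disallows the shard on that node.
 */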
bool emitWarning = false;
/*
 * Then, for the shardgroup placements that could not be assigned to their
 * current node, try to assign them to any other worker node.
 */
ShardPlacement *unassignedShardPlacement = NULL;
foreach_ptr(unassignedShardPlacement, unassignedPlacementList)
{
bool separated = false;
WorkerNode *activeWorkerNode = NULL;
foreach_ptr(activeWorkerNode, activeWorkerNodeList)
{
if (TryAssignPlacementGroupToNodeGroup(context,
activeWorkerNode->groupId,
unassignedShardPlacement,
shardAllowedOnNodeUDF))
{
separated = true;
break;
}
}
if (!separated)
{
emitWarning = true;
}
}
if (emitWarning)
{
ereport(WARNING, (errmsg("could not separate all shard placements "
"that need a separate node")));
}
}

/*
 * TryAssignPlacementGroupToNodeGroup is a helper for
 * TryAssignPlacementGroupsToNodeGroups that tries to assign the given shard
 * placement's shardgroup placement to the given node group and returns true
 * if it succeeds.
 */
static bool
TryAssignPlacementGroupToNodeGroup(RebalancerPlacementSeparationContext *context,
int32 candidateNodeGroupId,
ShardPlacement *shardPlacement,
FmgrInfo *shardAllowedOnNodeUDF)
{
HTAB *nodePlacementGroupHash = context->nodePlacementGroupHash;
bool found = false;
NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
hash_search(nodePlacementGroupHash, &candidateNodeGroupId, HASH_FIND, &found);
if (!found)
{
ereport(ERROR, (errmsg("no such node found")));
}
if (nodePlacementGroupHashEntry->assignedPlacementGroup)
{
/*
 * Right now the callers of this function call it once for each distinct
 * shardgroup placement, hence we assume, without checking, that the
 * shardgroup placement that the given shard placement belongs to and
 * nodePlacementGroupHashEntry->assignedPlacementGroup cannot be the same.
 */
return false;
}
if (nodePlacementGroupHashEntry->hasPlacementsThatCannotBeMovedAway)
{
return false;
}
if (!nodePlacementGroupHashEntry->shouldHaveShards)
{
return false;
}
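/*
 * Finally, check whether shardAllowedOnNodeUDF allows storing this shard on
 * the candidate node.
 */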
WorkerNode *workerNode = PrimaryNodeForGroup(candidateNodeGroupId, NULL);
Datum allowed = FunctionCall2(shardAllowedOnNodeUDF, shardPlacement->shardId,
workerNode->nodeId);
if (!DatumGetBool(allowed))
{
return false;
}
nodePlacementGroupHashEntry->assignedPlacementGroup =
GetShardgroupPlacementForPlacement(shardPlacement->shardId,
shardPlacement->placementId);
return true;
}

/*
 * RebalancerPlacementSeparationContextPlacementIsAllowedOnWorker returns true
 * if the shard placement with the given shardId & placementId is allowed to be
 * stored on the given worker node.
 */
bool
RebalancerPlacementSeparationContextPlacementIsAllowedOnWorker(
RebalancerPlacementSeparationContext *context,
uint64 shardId,
uint64 placementId,
WorkerNode *workerNode)
{
HTAB *nodePlacementGroupHash = context->nodePlacementGroupHash;
bool found = false;
NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
hash_search(nodePlacementGroupHash, &(workerNode->groupId), HASH_FIND, &found);
if (!found)
{
ereport(ERROR, (errmsg("no such node found")));
}
ShardInterval *shardInterval = LoadShardInterval(shardId);
if (!shardInterval->needsSeparateNode)
{
/*
 * The placement doesn't need a separate node. It can only be stored on
 * this node if the node is allowed to have shards and is not already used
 * to separate a shardgroup placement.
 */
return nodePlacementGroupHashEntry->shouldHaveShards &&
nodePlacementGroupHashEntry->assignedPlacementGroup == NULL;
}
/*
 * The given shard placement needs a separate node. Check whether the given
 * worker node is the one that is assigned to separate it.
 */
if (nodePlacementGroupHashEntry->assignedPlacementGroup == NULL)
{
/* the node is not supposed to separate a placement group */
return false;
}
ShardgroupPlacement *placementGroup =
GetShardgroupPlacementForPlacement(shardId, placementId);
return ShardgroupPlacementsSame(nodePlacementGroupHashEntry->assignedPlacementGroup,
placementGroup);
}
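
/*
 * Summarizing the decision above with a hypothetical scenario: if a node
 * group is assigned shardgroup placement G, only the placements that belong
 * to G are allowed on it; a placement that doesn't need a separate node is
 * allowed on a node group only if that node should have shards and separates
 * no placement group; and a placement that does need a separate node is
 * allowed only on the node group whose assigned placement group matches its
 * own.
 */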

/*
 * PlacementListGetUniqueNodeGroupIds returns a list of the unique node group
 * ids that are used by the given list of shard placements.
 */
static List *
PlacementListGetUniqueNodeGroupIds(List *placementList)
{
List *placementListUniqueNodeGroupIds = NIL;
ShardPlacement *shardPlacement = NULL;
foreach_ptr(shardPlacement, placementList)
{
placementListUniqueNodeGroupIds =
list_append_unique_int(placementListUniqueNodeGroupIds,
shardPlacement->groupId);
}
return placementListUniqueNodeGroupIds;
}