/*-------------------------------------------------------------------------
 *
 * rebalancer_placement_isolation.c
 *    Routines to determine which worker node should be used to separate
 *    a colocated set of shard placements that need separate nodes.
 *
 * Copyright (c) Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "nodes/pg_list.h"
#include "utils/hsearch.h"
#include "utils/lsyscache.h"

#include "distributed/colocation_utils.h"
#include "distributed/hash_helpers.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_utility.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/rebalancer_placement_isolation.h"
#include "distributed/shard_rebalancer.h"

struct RebalancerPlacementIsolationContext
{
    HTAB *nodePlacementGroupHash;
};


/*
 * Entry of the hash table that maps each primary worker node to a shard
 * placement group that is determined to be separated from other shards in
 * the cluster via that node.
 */
typedef struct
{
    /* hash key -- group id of the node */
    int32 nodeGroupId;

    /*
     * Whether the given node is allowed to have any shards.
     *
     * This is not just WorkerNode->shouldHaveShards but also takes into
     * account whether the node is being drained.
     */
    bool shouldHaveShards;

    /*
     * Whether the given node is allowed to separate any shard placement groups.
     *
     * This is set only if we're draining a single node because otherwise we
     * are free to separate shard placement groups on any node.
     *
     * However, if we're draining a single node, we cannot separate shard
     * placement groups on a node that already has some placements because,
     * when draining a single node, we cannot move the existing placements
     * from a node that we're not draining to another node.
     */
    bool allowedToSeparateAnyPlacementGroup;

    /*
     * Shard placement group that is assigned to this node to be separated
     * from others in the cluster.
     *
     * NULL if no shard placement group is assigned yet.
     */
    ShardPlacementGroup *assignedPlacementGroup;
} NodeToPlacementGroupHashEntry;

/*
 * Routines to prepare a hash table where each entry is of type
 * NodeToPlacementGroupHashEntry.
 */
static void NodeToPlacementGroupHashInit(HTAB *nodePlacementGroupHash,
                                         List *activeWorkerNodeList,
                                         List *rebalancePlacementList,
                                         WorkerNode *drainWorkerNode);
static void NodeToPlacementGroupHashAssignNodes(HTAB *nodePlacementGroupHash,
                                                List *activeWorkerNodeList,
                                                List *rebalancePlacementList,
                                                FmgrInfo *shardAllowedOnNodeUDF);
static bool NodeToPlacementGroupHashAssignNode(HTAB *nodePlacementGroupHash,
                                               int32 nodeGroupId,
                                               ShardPlacement *shardPlacement,
                                               FmgrInfo *shardAllowedOnNodeUDF);
static NodeToPlacementGroupHashEntry * NodeToPlacementGroupHashGetNodeWithGroupId(
    HTAB *nodePlacementGroupHash,
    int32 nodeGroupId);


/* other helpers */
static List * PlacementListGetUniqueNodeGroupIds(List *placementList);
static int WorkerNodeListGetNodeWithGroupId(List *workerNodeList, int32 nodeGroupId);

/*
 * PrepareRebalancerPlacementIsolationContext creates a
 * RebalancerPlacementIsolationContext that keeps track of which worker nodes
 * are used to separate which of the shard placement groups that need
 * separate nodes.
 */
RebalancerPlacementIsolationContext *
PrepareRebalancerPlacementIsolationContext(List *activeWorkerNodeList,
                                           List *rebalancePlacementList,
                                           WorkerNode *drainWorkerNode,
                                           FmgrInfo *shardAllowedOnNodeUDF)
{
    HTAB *nodePlacementGroupHash =
        CreateSimpleHashWithNameAndSize(uint32, NodeToPlacementGroupHashEntry,
                                        "NodeToPlacementGroupHash",
                                        list_length(activeWorkerNodeList));

    activeWorkerNodeList = SortList(activeWorkerNodeList, CompareWorkerNodes);
    rebalancePlacementList = SortList(rebalancePlacementList, CompareShardPlacements);

    NodeToPlacementGroupHashInit(nodePlacementGroupHash, activeWorkerNodeList,
                                 rebalancePlacementList, drainWorkerNode);

    NodeToPlacementGroupHashAssignNodes(nodePlacementGroupHash,
                                        activeWorkerNodeList,
                                        rebalancePlacementList,
                                        shardAllowedOnNodeUDF);

    RebalancerPlacementIsolationContext *context =
        palloc(sizeof(RebalancerPlacementIsolationContext));
    context->nodePlacementGroupHash = nodePlacementGroupHash;

    return context;
}
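

/*
 * Usage sketch (illustrative only, based on the two public entry points in
 * this file rather than on any particular caller): the rebalancer is expected
 * to build the context once and then consult it for every candidate
 * placement/node pair, e.g.:
 *
 *   RebalancerPlacementIsolationContext *context =
 *       PrepareRebalancerPlacementIsolationContext(activeWorkerNodeList,
 *                                                  rebalancePlacementList,
 *                                                  drainWorkerNode,
 *                                                  shardAllowedOnNodeUDF);
 *
 *   if (RebalancerPlacementIsolationContextPlacementIsAllowedOnWorker(
 *           context, shardId, placementId, workerNode))
 *   {
 *       ... consider workerNode as a target for this placement ...
 *   }
 */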


/*
 * NodeToPlacementGroupHashInit initializes the given hash table, where each
 * entry is of type NodeToPlacementGroupHashEntry, by using the given list of
 * worker nodes, the list of shard placements being rebalanced, and the worker
 * node that is being drained, if specified.
 */
static void
NodeToPlacementGroupHashInit(HTAB *nodePlacementGroupHash, List *activeWorkerNodeList,
                             List *rebalancePlacementList, WorkerNode *drainWorkerNode)
{
    List *placementListUniqueNodeGroupIds =
        PlacementListGetUniqueNodeGroupIds(rebalancePlacementList);

    WorkerNode *workerNode = NULL;
    foreach_ptr(workerNode, activeWorkerNodeList)
    {
        NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
            hash_search(nodePlacementGroupHash, &workerNode->groupId, HASH_ENTER,
                        NULL);

        nodePlacementGroupHashEntry->nodeGroupId = workerNode->groupId;

        bool shouldHaveShards = workerNode->shouldHaveShards;
        if (drainWorkerNode && drainWorkerNode->groupId == workerNode->groupId)
        {
            shouldHaveShards = false;
        }

        nodePlacementGroupHashEntry->shouldHaveShards = shouldHaveShards;
        nodePlacementGroupHashEntry->allowedToSeparateAnyPlacementGroup =
            shouldHaveShards;
        nodePlacementGroupHashEntry->assignedPlacementGroup = NULL;

        /*
         * Let's call the set of nodes that the placements in
         * rebalancePlacementList are stored on D and the other nodes S. In
         * other words, D is the set of nodes that we're allowed to move
         * placements "from" or "to (*)" (* = unless we're draining that node)
         * and S is the set of nodes that we're only allowed to move
         * placements "to" but not "from".
         *
         * This means that, for a node of type S, whether the node is used to
         * separate a placement group cannot change at runtime.
         *
         * For this reason, below we determine the assigned placement groups
         * for nodes of type S because we want to avoid moving placements
         * (if any) from a node of type D to a node within S that is used to
         * separate a placement group. We also set
         * allowedToSeparateAnyPlacementGroup to false for the nodes within S
         * that already have some shard placements because we want to avoid
         * moving placements that need a separate node (if any) from a node of
         * type D to a node of type S.
         *
         * We skip the code below for nodes of type D not for optimization
         * purposes but because it would be incorrect to assume that the
         * current placement distribution of a node of type D stays the same
         * after the rebalancer plans the moves.
         */
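
        /*
         * Illustrative example (an assumed scenario, not taken from the
         * original comments): suppose the cluster has nodes N1, N2 and N3,
         * and rebalancePlacementList only contains placements stored on N1.
         * Then D = {N1} and S = {N2, N3}. If N2 already separates a placement
         * group, we record that assignment here so that nothing else gets
         * moved onto N2; if N3 already holds ordinary placements, we mark it
         * as not allowed to separate any placement group, since we cannot
         * move those placements away while rebalancing only N1's placements.
         */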

        if (!shouldHaveShards)
        {
            /* we can't assign any shard placement groups to this node anyway */
            continue;
        }

        if (list_length(placementListUniqueNodeGroupIds) ==
            list_length(activeWorkerNodeList))
        {
            /*
             * The list_member_oid() check below would then return true for
             * all worker nodes, i.e., all the nodes are of type D.
             */
            Assert(list_member_oid(placementListUniqueNodeGroupIds,
                                   workerNode->groupId));
            continue;
        }

        if (list_member_oid(placementListUniqueNodeGroupIds, workerNode->groupId))
        {
            /* node is of type D */
            continue;
        }

        ShardPlacementGroup *separatedShardPlacementGroup =
            NodeGroupGetSeparatedShardPlacementGroup(
                nodePlacementGroupHashEntry->nodeGroupId);
        if (separatedShardPlacementGroup)
        {
            nodePlacementGroupHashEntry->assignedPlacementGroup =
                separatedShardPlacementGroup;
        }
        else
        {
            nodePlacementGroupHashEntry->allowedToSeparateAnyPlacementGroup =
                !NodeGroupHasShardPlacements(nodePlacementGroupHashEntry->nodeGroupId);
        }
    }
}


/*
 * NodeToPlacementGroupHashAssignNodes assigns all active shard placements in
 * the cluster that need separate nodes to individual worker nodes.
 */
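/*
 * Illustrative example (an assumed scenario, not taken from the original
 * comments): if placement groups G1 and G2 both need separate nodes, G1 is
 * currently stored on worker A and A can keep it, then the first pass below
 * assigns G1 to A and removes A from availableWorkerList. If G2's current
 * node cannot keep it (e.g., it is being drained), the second pass tries the
 * remaining available workers one by one; if none of them can take G2, a
 * warning is emitted instead of failing the rebalance.
 */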
static void
NodeToPlacementGroupHashAssignNodes(HTAB *nodePlacementGroupHash,
                                    List *activeWorkerNodeList,
                                    List *rebalancePlacementList,
                                    FmgrInfo *shardAllowedOnNodeUDF)
{
    List *availableWorkerList = list_copy(activeWorkerNodeList);
    List *unassignedPlacementList = NIL;

    /*
     * Assign as many shard placement groups as possible to the worker nodes
     * where they are already stored.
     */
    ShardPlacement *shardPlacement = NULL;
    foreach_ptr(shardPlacement, rebalancePlacementList)
    {
        ShardInterval *shardInterval = LoadShardInterval(shardPlacement->shardId);
        if (!shardInterval->needsSeparateNode)
        {
            continue;
        }

        int32 shardPlacementGroupId = shardPlacement->groupId;
        if (NodeToPlacementGroupHashAssignNode(nodePlacementGroupHash,
                                               shardPlacementGroupId,
                                               shardPlacement,
                                               shardAllowedOnNodeUDF))
        {
            /*
             * NodeToPlacementGroupHashAssignNode() succeeds at most once for
             * each worker node, hence we cannot have removed this worker node
             * from the list yet, and WorkerNodeListGetNodeWithGroupId()
             * already ensures that.
             */
            int currentPlacementNodeIdx =
                WorkerNodeListGetNodeWithGroupId(availableWorkerList,
                                                 shardPlacementGroupId);
            availableWorkerList = list_delete_nth_cell(availableWorkerList,
                                                       currentPlacementNodeIdx);
        }
        else
        {
            unassignedPlacementList =
                lappend(unassignedPlacementList, shardPlacement);
        }
    }

    bool emitWarning = false;

    /*
     * For the shard placement groups that could not be assigned to their
     * current node, assign them to any other node that is available.
     */
    ShardPlacement *unassignedShardPlacement = NULL;
    foreach_ptr(unassignedShardPlacement, unassignedPlacementList)
    {
        bool separated = false;

        WorkerNode *availableWorkerNode = NULL;
        foreach_ptr(availableWorkerNode, availableWorkerList)
        {
            if (NodeToPlacementGroupHashAssignNode(nodePlacementGroupHash,
                                                   availableWorkerNode->groupId,
                                                   unassignedShardPlacement,
                                                   shardAllowedOnNodeUDF))
            {
                separated = true;
                break;
            }
        }

        if (!separated)
        {
            emitWarning = true;
        }
    }

    if (emitWarning)
    {
        ereport(WARNING, (errmsg("could not separate all shard placements "
                                 "that need a separate node")));
    }
}


/*
 * NodeToPlacementGroupHashAssignNode is a helper for
 * NodeToPlacementGroupHashAssignNodes that tries to assign the given
 * shard placement to the given node and returns true if it succeeds.
 */
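/*
 * The assignment fails (returns false) when the node is already assigned
 * another placement group, when it is not allowed to separate any placement
 * group, when it should not have shards at all, or when shardAllowedOnNodeUDF
 * rejects the shard/node pair; these cases mirror the checks performed below.
 */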
static bool
NodeToPlacementGroupHashAssignNode(HTAB *nodePlacementGroupHash,
                                   int32 nodeGroupId,
                                   ShardPlacement *shardPlacement,
                                   FmgrInfo *shardAllowedOnNodeUDF)
{
    NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
        NodeToPlacementGroupHashGetNodeWithGroupId(nodePlacementGroupHash, nodeGroupId);

    if (nodePlacementGroupHashEntry->assignedPlacementGroup)
    {
        /*
         * Right now callers of this function call it once for each distinct
         * shard placement group, hence we assume -- without checking -- that
         * the shard placement group that the given shard placement belongs to
         * and nodePlacementGroupHashEntry->assignedPlacementGroup cannot be
         * the same.
         */
        return false;
    }

    if (!nodePlacementGroupHashEntry->allowedToSeparateAnyPlacementGroup)
    {
        return false;
    }

    if (!nodePlacementGroupHashEntry->shouldHaveShards)
    {
        return false;
    }

    WorkerNode *workerNode = PrimaryNodeForGroup(nodeGroupId, NULL);
    Datum allowed = FunctionCall2(shardAllowedOnNodeUDF, shardPlacement->shardId,
                                  workerNode->nodeId);
    if (!DatumGetBool(allowed))
    {
        return false;
    }

    nodePlacementGroupHashEntry->assignedPlacementGroup =
        GetShardPlacementGroupForPlacement(shardPlacement->shardId,
                                           shardPlacement->placementId);

    return true;
}


/*
 * RebalancerPlacementIsolationContextPlacementIsAllowedOnWorker returns true
 * if the shard placement with the given shardId & placementId is allowed to
 * be stored on the given worker node.
 */
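/*
 * Summary of the decision below (derived from the checks in this function):
 *
 *   placement needs a | node has an assigned | placement allowed on
 *   separate node?    | placement group?     | this node?
 *   ------------------+----------------------+---------------------------------
 *   no                | no                   | only if node should have shards
 *   no                | yes                  | no
 *   yes               | no                   | no
 *   yes               | yes                  | only if it is the same group
 */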
bool
RebalancerPlacementIsolationContextPlacementIsAllowedOnWorker(
    RebalancerPlacementIsolationContext *context,
    uint64 shardId,
    uint64 placementId,
    WorkerNode *workerNode)
{
    HTAB *nodePlacementGroupHash = context->nodePlacementGroupHash;
    NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
        NodeToPlacementGroupHashGetNodeWithGroupId(nodePlacementGroupHash,
                                                   workerNode->groupId);

    ShardInterval *shardInterval = LoadShardInterval(shardId);
    if (!shardInterval->needsSeparateNode)
    {
        /*
         * It doesn't need a separate node, but is the node used to separate
         * a shard placement group? If so, we cannot store it on this node.
         */
        return nodePlacementGroupHashEntry->shouldHaveShards &&
               nodePlacementGroupHashEntry->assignedPlacementGroup == NULL;
    }

    /*
     * The given shard placement needs a separate node.
     * Check whether the given worker node is the one assigned to separate it.
     */
    if (nodePlacementGroupHashEntry->assignedPlacementGroup == NULL)
    {
        /* the node is not supposed to separate a placement group */
        return false;
    }

    ShardPlacementGroup *placementGroup =
        GetShardPlacementGroupForPlacement(shardId, placementId);
    return ShardPlacementGroupsSame(nodePlacementGroupHashEntry->assignedPlacementGroup,
                                    placementGroup);
}


/*
 * NodeToPlacementGroupHashGetNodeWithGroupId searches the given hash table
 * for a NodeToPlacementGroupHashEntry with the given node group id and
 * returns it.
 *
 * Throws an error if no such entry is found.
 */
static NodeToPlacementGroupHashEntry *
NodeToPlacementGroupHashGetNodeWithGroupId(HTAB *nodePlacementGroupHash,
                                           int32 nodeGroupId)
{
    NodeToPlacementGroupHashEntry *nodePlacementGroupHashEntry =
        hash_search(nodePlacementGroupHash, &nodeGroupId, HASH_FIND, NULL);

    if (nodePlacementGroupHashEntry == NULL)
    {
        ereport(ERROR, (errmsg("no such node is found")));
    }

    return nodePlacementGroupHashEntry;
}


/*
 * PlacementListGetUniqueNodeGroupIds returns a list of the unique node group
 * ids that are used by the given list of shard placements.
 */
static List *
PlacementListGetUniqueNodeGroupIds(List *placementList)
{
    List *placementListUniqueNodeGroupIds = NIL;

    ShardPlacement *shardPlacement = NULL;
    foreach_ptr(shardPlacement, placementList)
    {
        placementListUniqueNodeGroupIds =
            list_append_unique_oid(placementListUniqueNodeGroupIds,
                                   shardPlacement->groupId);
    }

    return placementListUniqueNodeGroupIds;
}


/*
 * WorkerNodeListGetNodeWithGroupId returns the index of the worker node with
 * the given group id in the given worker node list.
 *
 * Throws an error if no such node is found.
 */
static int
WorkerNodeListGetNodeWithGroupId(List *workerNodeList, int32 nodeGroupId)
{
    int workerNodeIndex = 0;
    WorkerNode *workerNode = NULL;
    foreach_ptr(workerNode, workerNodeList)
    {
        if (workerNode->groupId == nodeGroupId)
        {
            return workerNodeIndex;
        }

        workerNodeIndex++;
    }

    ereport(ERROR, (errmsg("no such node is found")));
}