/*-------------------------------------------------------------------------
 *
 * master_create_shards.c
 *
 * This file contains functions to distribute a table by creating shards for it
 * across a set of worker nodes.
 *
 * Copyright (c) 2014-2016, Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"
|
|
#include "c.h"
|
|
#include "fmgr.h"
|
|
#include "libpq-fe.h"
|
|
#include "miscadmin.h"
|
|
#include "port.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdint.h>
|
|
|
|
#include "catalog/namespace.h"
|
|
#include "catalog/pg_class.h"
|
|
#include "distributed/listutils.h"
|
|
#include "distributed/master_metadata_utility.h"
|
|
#include "distributed/master_protocol.h"
|
|
#include "distributed/metadata_cache.h"
|
|
#include "distributed/multi_join_order.h"
|
|
#include "distributed/multi_executor.h"
|
|
#include "distributed/multi_partitioning_utils.h"
|
|
#include "distributed/pg_dist_partition.h"
|
|
#include "distributed/pg_dist_shard.h"
|
|
#include "distributed/reference_table_utils.h"
|
|
#include "distributed/resource_lock.h"
|
|
#include "distributed/shardinterval_utils.h"
|
|
#include "distributed/transaction_management.h"
|
|
#include "distributed/worker_manager.h"
|
|
#include "lib/stringinfo.h"
|
|
#include "nodes/pg_list.h"
|
|
#include "nodes/primnodes.h"
|
|
#include "postmaster/postmaster.h"
|
|
#include "storage/fd.h"
|
|
#include "storage/lmgr.h"
|
|
#include "storage/lock.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/elog.h"
|
|
#include "utils/errcodes.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/palloc.h"
|
|
|
|
|
|
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(master_create_worker_shards);

/*
 * master_create_worker_shards is a user-facing function to create worker shards
 * for the given relation in round robin order.
 */
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
	text *tableNameText = PG_GETARG_TEXT_P(0);
	int32 shardCount = PG_GETARG_INT32(1);
	int32 replicationFactor = PG_GETARG_INT32(2);
	ObjectAddress tableAddress = { 0 };

	Oid distributedTableId = ResolveRelationId(tableNameText, false);

	/* do not add any data */
	bool useExclusiveConnections = false;

	EnsureCoordinator();
	CheckCitusVersion(ERROR);

	/*
	 * Distributed tables might have dependencies on different objects. Since we
	 * create shards for a distributed table via multiple sessions, these objects
	 * will be created via their own connection and committed immediately so that
	 * they become visible to all sessions creating shards.
	 */
	ObjectAddressSet(tableAddress, RelationRelationId, distributedTableId);
	EnsureDependenciesExistsOnAllNodes(&tableAddress);

	CreateShardsWithRoundRobinPolicy(distributedTableId, shardCount, replicationFactor,
									 useExclusiveConnections);

	PG_RETURN_VOID();
}

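/*
 * Illustrative usage (a sketch, not part of the original file): assuming a
 * hash-distributed table already exists on the coordinator, the UDF above
 * could be invoked from SQL as
 *
 *   SELECT master_create_worker_shards('github_events', 16, 2);
 *
 * which would create 16 empty shards for the hypothetical table
 * "github_events" with two placements each, spread over the worker nodes in
 * round robin order. The table name, shard count, and replication factor
 * here are example values only.
 */
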
/*
 * CreateShardsWithRoundRobinPolicy creates empty shards for the given table
 * based on the specified number of initial shards. The function first updates
 * metadata on the coordinator node to make this shard (and its placements)
 * visible. Note that the function assumes the table is hash partitioned and
 * calculates the min/max hash token ranges for each shard, giving them an equal
 * split of the hash space. Finally, the function creates empty shard placements
 * on worker nodes.
 */
void
CreateShardsWithRoundRobinPolicy(Oid distributedTableId, int32 shardCount,
								 int32 replicationFactor, bool useExclusiveConnections)
{
	char shardStorageType = 0;
	List *workerNodeList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint64 hashTokenIncrement = 0;
	List *existingShardList = NIL;
	int64 shardIndex = 0;
	DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(distributedTableId);
	bool colocatedShard = false;
	List *insertedShardPlacements = NIL;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/*
	 * In contrast to append/range partitioned tables it makes more sense to
	 * require ownership privileges - shards for hash-partitioned tables are
	 * only created once, not continually during ingest as for the other
	 * partitioning types.
	 */
	EnsureTableOwner(distributedTableId);

	/* we plan to add shards: get an exclusive lock on relation oid */
	LockRelationOid(distributedTableId, ExclusiveLock);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(distributedTableId);
	if (existingShardList != NIL)
	{
		char *tableName = get_rel_name(distributedTableId);
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shard_count must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor must be positive")));
	}

	/* make sure the replication factor is 1 if the table is streaming replicated */
	if (cacheEntry->replicationModel == REPLICATION_MODEL_STREAMING &&
		replicationFactor > 1)
	{
		char *relationName = get_rel_name(cacheEntry->relationId);
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("using replication factor %d with the streaming "
							   "replication model is not supported",
							   replicationFactor),
						errdetail("The table %s is marked as streaming replicated and "
								  "the shard replication factor of streaming replicated "
								  "tables must be 1.", relationName),
						errhint("Use replication factor 1.")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;

	/* don't allow concurrent node list changes that require an exclusive lock */
	LockRelationOid(DistNodeRelationId(), RowShareLock);

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = DistributedTablePlacementNodeList(NoLock);
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor (%d) exceeds number of worker nodes "
							   "(%d)", replicationFactor, workerNodeCount),
						errhint("Add more worker nodes or try again with a lower "
								"replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	shardStorageType = ShardStorageType(distributedTableId);

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
		uint64 shardId = GetNextShardId();
		List *currentInsertedShardPlacements = NIL;

		/* if we are at the last shard, make sure the max token value is INT_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT32_MAX;
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);

		/*
		 * Grabbing the shard metadata lock isn't technically necessary since
		 * we already hold an exclusive lock on the partition table, but we'll
		 * acquire it for the sake of completeness. As we're adding new active
		 * placements, the mode must be exclusive.
		 */
		LockShardDistributionMetadata(shardId, ExclusiveLock);

		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);

		currentInsertedShardPlacements = InsertShardPlacementRows(distributedTableId,
																  shardId,
																  workerNodeList,
																  roundRobinNodeIndex,
																  replicationFactor);
		insertedShardPlacements = list_concat(insertedShardPlacements,
											  currentInsertedShardPlacements);
	}

	CreateShardsOnWorkers(distributedTableId, insertedShardPlacements,
						  useExclusiveConnections, colocatedShard);
}

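/*
 * Worked example of the hash space split above (a sketch, assuming
 * HASH_TOKEN_COUNT covers the full 32-bit hash range, i.e. 2^32 tokens):
 * with shardCount = 4 the increment is 2^32 / 4 = 1073741824, so the shards
 * receive the ranges
 *
 *   shard 0: [-2147483648, -1073741825]
 *   shard 1: [-1073741824,          -1]
 *   shard 2: [          0,  1073741823]
 *   shard 3: [ 1073741824,  2147483647]
 *
 * where the last shard's max is forced to INT32_MAX to absorb any remainder.
 */
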
/*
 * CreateColocatedShards creates shards for the target relation colocated with
 * the source relation.
 */
void
CreateColocatedShards(Oid targetRelationId, Oid sourceRelationId,
					  bool useExclusiveConnections)
{
	char targetShardStorageType = 0;
	List *existingShardList = NIL;
	List *sourceShardIntervalList = NIL;
	ListCell *sourceShardCell = NULL;
	bool colocatedShard = true;
	List *insertedShardPlacements = NIL;

	/* make sure that tables are hash partitioned */
	CheckHashPartitionedTable(targetRelationId);
	CheckHashPartitionedTable(sourceRelationId);

	/*
	 * In contrast to append/range partitioned tables it makes more sense to
	 * require ownership privileges - shards for hash-partitioned tables are
	 * only created once, not continually during ingest as for the other
	 * partitioning types.
	 */
	EnsureTableOwner(targetRelationId);

	/* we plan to add shards: get an exclusive lock on target relation oid */
	LockRelationOid(targetRelationId, ExclusiveLock);

	/* we don't want the source table to get dropped before we colocate with it */
	LockRelationOid(sourceRelationId, AccessShareLock);

	/* prevent placement changes of the source relation until we colocate with it */
	sourceShardIntervalList = LoadShardIntervalList(sourceRelationId);
	LockShardListMetadata(sourceShardIntervalList, ShareLock);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(targetRelationId);
	if (existingShardList != NIL)
	{
		char *targetRelationName = get_rel_name(targetRelationId);
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   targetRelationName)));
	}

	targetShardStorageType = ShardStorageType(targetRelationId);

	foreach(sourceShardCell, sourceShardIntervalList)
	{
		ShardInterval *sourceShardInterval = (ShardInterval *) lfirst(sourceShardCell);
		uint64 sourceShardId = sourceShardInterval->shardId;
		uint64 newShardId = GetNextShardId();
		ListCell *sourceShardPlacementCell = NULL;

		int32 shardMinValue = DatumGetInt32(sourceShardInterval->minValue);
		int32 shardMaxValue = DatumGetInt32(sourceShardInterval->maxValue);
		text *shardMinValueText = IntegerToText(shardMinValue);
		text *shardMaxValueText = IntegerToText(shardMaxValue);
		List *sourceShardPlacementList = ShardPlacementList(sourceShardId);

		InsertShardRow(targetRelationId, newShardId, targetShardStorageType,
					   shardMinValueText, shardMaxValueText);

		foreach(sourceShardPlacementCell, sourceShardPlacementList)
		{
			ShardPlacement *sourcePlacement =
				(ShardPlacement *) lfirst(sourceShardPlacementCell);
			int32 groupId = sourcePlacement->groupId;
			const RelayFileState shardState = FILE_FINALIZED;
			const uint64 shardSize = 0;
			uint64 shardPlacementId = 0;
			ShardPlacement *shardPlacement = NULL;

			/*
			 * Optimistically add the shard placement row to pg_dist_shard_placement;
			 * if any error occurs, it will be rolled back.
			 */
			shardPlacementId = InsertShardPlacementRow(newShardId, INVALID_PLACEMENT_ID,
													   shardState, shardSize, groupId);

			shardPlacement = LoadShardPlacement(newShardId, shardPlacementId);
			insertedShardPlacements = lappend(insertedShardPlacements, shardPlacement);
		}
	}

	CreateShardsOnWorkers(targetRelationId, insertedShardPlacements,
						  useExclusiveConnections, colocatedShard);
}

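/*
 * Illustrative effect of the colocation logic above (example values, not
 * from the original file): if the source relation has four shards covering
 * the hash ranges [-2147483648, -1073741825], [-1073741824, -1],
 * [0, 1073741823], and [1073741824, 2147483647], the target relation
 * receives four new shards with exactly the same min/max ranges, and each
 * new placement is created in the same node group as the corresponding
 * source placement, so matching hash ranges end up on the same workers.
 */
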
/*
 * CreateReferenceTableShard creates a single shard for the given
 * distributedTableId. The created shard does not have min/max values.
 * Also, the shard is replicated to all of the active nodes in the cluster.
 */
void
CreateReferenceTableShard(Oid distributedTableId)
{
	char shardStorageType = 0;
	List *nodeList = NIL;
	List *existingShardList = NIL;
	uint64 shardId = INVALID_SHARD_ID;
	int workerStartIndex = 0;
	int replicationFactor = 0;
	text *shardMinValue = NULL;
	text *shardMaxValue = NULL;
	bool useExclusiveConnection = false;
	bool colocatedShard = false;
	List *insertedShardPlacements = NIL;

	/*
	 * In contrast to append/range partitioned tables it makes more sense to
	 * require ownership privileges - shards for reference tables are
	 * only created once, not continually during ingest as for the other
	 * partitioning types such as append and range.
	 */
	EnsureTableOwner(distributedTableId);

	/* we plan to add shards: get an exclusive lock on relation oid */
	LockRelationOid(distributedTableId, ExclusiveLock);

	/* set shard storage type according to relation type */
	shardStorageType = ShardStorageType(distributedTableId);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(distributedTableId);
	if (existingShardList != NIL)
	{
		char *tableName = get_rel_name(distributedTableId);
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/*
	 * Load and sort the worker node list for deterministic placements;
	 * create_reference_table has already acquired the pg_dist_node lock.
	 */
	nodeList = ReferenceTablePlacementNodeList(ShareLock);
	nodeList = SortList(nodeList, CompareWorkerNodes);

	replicationFactor = ReferenceTableReplicationFactor();

	/* get the next shard id */
	shardId = GetNextShardId();

	/*
	 * Grabbing the shard metadata lock isn't technically necessary since
	 * we already hold an exclusive lock on the partition table, but we'll
	 * acquire it for the sake of completeness. As we're adding new active
	 * placements, the mode must be exclusive.
	 */
	LockShardDistributionMetadata(shardId, ExclusiveLock);

	InsertShardRow(distributedTableId, shardId, shardStorageType, shardMinValue,
				   shardMaxValue);

	insertedShardPlacements = InsertShardPlacementRows(distributedTableId, shardId,
													   nodeList, workerStartIndex,
													   replicationFactor);

	CreateShardsOnWorkers(distributedTableId, insertedShardPlacements,
						  useExclusiveConnection, colocatedShard);
}

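/*
 * Illustrative usage (a sketch, not part of the original file): this function
 * is reached via the create_reference_table UDF mentioned above, e.g.
 *
 *   SELECT create_reference_table('nation');
 *
 * for a hypothetical table "nation". The single shard created here has NULL
 * min/max values and placements on the nodes returned by
 * ReferenceTablePlacementNodeList().
 */
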
/*
 * CheckHashPartitionedTable looks up the partition information for the given
 * tableId and checks if the table is hash partitioned. If not, the function
 * throws an error.
 */
void
CheckHashPartitionedTable(Oid distributedTableId)
{
	char partitionType = PartitionMethod(distributedTableId);
	if (partitionType != DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("unsupported table partition type: %c", partitionType)));
	}
}

/* Helper function to convert an integer value to a text type */
text *
IntegerToText(int32 value)
{
	text *valueText = NULL;
	StringInfo valueString = makeStringInfo();
	appendStringInfo(valueString, "%d", value);

	valueText = cstring_to_text(valueString->data);

	return valueText;
}