mirror of https://github.com/citusdata/citus.git
461 lines
14 KiB
C
461 lines
14 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* shard_split_replication.c
|
|
* This file contains functions to setup information about list of shards
|
|
* that are being split.
|
|
*
|
|
* Copyright (c) Citus Data, Inc.
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
#include "common/hashfn.h"
|
|
#include "distributed/shardinterval_utils.h"
|
|
#include "distributed/shard_utils.h"
|
|
#include "distributed/shardsplit_shared_memory.h"
|
|
#include "distributed/citus_safe_lib.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/lsyscache.h"
|
|
|
|
/* declarations for dynamic loading */
|
|
PG_FUNCTION_INFO_V1(split_shard_replication_setup);
|
|
|
|
static HTAB *ShardInfoHashMap = NULL;
|
|
|
|
/* Entry for hash map */
|
|
typedef struct NodeShardMappingEntry
|
|
{
|
|
uint64_t key;
|
|
List *shardSplitInfoList;
|
|
} NodeShardMappingEntry;
|
|
|
|
/* Function declarations */
|
|
static void ParseShardSplitInfo(ArrayType *shardInfoArrayObject,
|
|
int shardSplitInfoIndex,
|
|
uint64 *sourceShardId,
|
|
uint64 *desShardId,
|
|
int32 *minValue,
|
|
int32 *maxValue,
|
|
int32 *nodeId);
|
|
static ShardSplitInfo * CreateShardSplitInfo(uint64 sourceShardIdToSplit,
|
|
uint64 desSplitChildShardId,
|
|
int32 minValue,
|
|
int32 maxValue,
|
|
int32 nodeId);
|
|
static void AddShardSplitInfoEntryForNodeInMap(ShardSplitInfo *shardSplitInfo);
|
|
static void * PopulateShardSplitInfoInSM(ShardSplitInfo *shardSplitInfoArray,
|
|
HTAB *shardInfoHashMap,
|
|
dsm_handle dsmHandle,
|
|
int shardSplitInfoCount);
|
|
static void SetupHashMapForShardInfo();
|
|
|
|
/*
|
|
* split_shard_replication_setup UDF creates in-memory data structures
|
|
* to store the meta information about the shard undergoing split and new split
|
|
* children along with their placements required during the catch up phase
|
|
* of logical replication.
|
|
* This meta information is stored in a shared memory segment and accessed
|
|
* by logical decoding plugin.
|
|
*
|
|
* Split information is given by user as an Array of source shards undergoing splits
|
|
* in the below format.
|
|
* Array[Array[sourceShardId, childShardId, minValue, maxValue, Destination NodeId]]
|
|
*
|
|
* sourceShardId - id of the shard that is undergoing a split
|
|
* childShardId - id of shard that stores a specific range of values
|
|
* belonging to sourceShardId(parent)
|
|
* minValue - lower bound of hash value which childShard stores
|
|
*
|
|
* maxValue - upper bound of hash value which childShard stores
|
|
*
|
|
* NodeId - Node where the childShardId is located
|
|
*
|
|
* The function parses the data and builds routing map per destination node id.
|
|
* Multiple shards can be placed on the same destiation node. Source and
|
|
* destinations nodes can be same too.
|
|
*
|
|
* Usage Semantics:
|
|
* This UDF returns a shared memory handle where the information is stored. This shared memory
|
|
* handle is used by caller to encode replication slot name as "NodeId_MemoryHandle" for every
|
|
* distinct target node. The same encoded slot name is stored in one of the fields of the
|
|
* in-memory data structure(ShardSplitInfo).
|
|
*
|
|
* There is a 1-1 mapping between a target node and a replication slot as one replication
|
|
* slot takes care of replicating changes for one node.
|
|
*
|
|
* During the replication phase, 'decoding_plugin_for_shard_split' called for a change on a particular
|
|
* replication slot, will decode the shared memory handle from its slot name and will attach to the
|
|
* shared memory. The plugin consumes the information from shared memory. It routes the tuple
|
|
* from the source shard to the appropriate destination shard for which the respective slot is
|
|
* responsible.
|
|
*/
|
|
Datum
|
|
split_shard_replication_setup(PG_FUNCTION_ARGS)
|
|
{
|
|
ArrayType *shardInfoArrayObject = PG_GETARG_ARRAYTYPE_P(0);
|
|
int shardInfoArrayLength = ARR_DIMS(shardInfoArrayObject)[0];
|
|
|
|
/* SetupMap */
|
|
SetupHashMapForShardInfo();
|
|
|
|
int shardSplitInfoCount = 0;
|
|
for (int index = 0; index < shardInfoArrayLength; index++)
|
|
{
|
|
uint64 sourceShardId = 0;
|
|
uint64 desShardId = 0;
|
|
int32 minValue = 0;
|
|
int32 maxValue = 0;
|
|
int32 nodeId = 0;
|
|
|
|
ParseShardSplitInfo(
|
|
shardInfoArrayObject,
|
|
index,
|
|
&sourceShardId,
|
|
&desShardId,
|
|
&minValue,
|
|
&maxValue,
|
|
&nodeId);
|
|
|
|
ShardSplitInfo *shardSplitInfo = CreateShardSplitInfo(
|
|
sourceShardId,
|
|
desShardId,
|
|
minValue,
|
|
maxValue,
|
|
nodeId);
|
|
|
|
AddShardSplitInfoEntryForNodeInMap(shardSplitInfo);
|
|
shardSplitInfoCount++;
|
|
}
|
|
|
|
dsm_handle dsmHandle;
|
|
ShardSplitInfo *splitShardInfoSMArray =
|
|
CreateSharedMemoryForShardSplitInfo(shardSplitInfoCount, &dsmHandle);
|
|
|
|
PopulateShardSplitInfoInSM(splitShardInfoSMArray,
|
|
ShardInfoHashMap,
|
|
dsmHandle,
|
|
shardSplitInfoCount);
|
|
|
|
return dsmHandle;
|
|
}
|
|
|
|
|
|
/*
|
|
* SetupHashMapForShardInfo initializes a hash map to store shard split
|
|
* information by grouping them node id wise. The key of the hash table
|
|
* is 'nodeId' and value is a list of ShardSplitInfo that are placed on
|
|
* this particular node.
|
|
*/
|
|
static void
|
|
SetupHashMapForShardInfo()
|
|
{
|
|
HASHCTL info;
|
|
memset(&info, 0, sizeof(info));
|
|
info.keysize = sizeof(uint64_t);
|
|
info.entrysize = sizeof(NodeShardMappingEntry);
|
|
info.hash = uint32_hash;
|
|
info.hcxt = CurrentMemoryContext;
|
|
int hashFlags = (HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION);
|
|
|
|
ShardInfoHashMap = hash_create("ShardInfoMap", 128, &info, hashFlags);
|
|
}
|
|
|
|
|
|
static void
|
|
ParseShardSplitInfo(ArrayType *shardInfoArrayObject,
|
|
int shardSplitInfoIndex,
|
|
uint64 *sourceShardId,
|
|
uint64 *desShardId,
|
|
int32 *minValue,
|
|
int32 *maxValue,
|
|
int32 *nodeId)
|
|
{
|
|
Oid elemtypeId = ARR_ELEMTYPE(shardInfoArrayObject);
|
|
int elemtypeLength = 0;
|
|
bool elemtypeByValue = false;
|
|
char elemtypeAlignment = 0;
|
|
get_typlenbyvalalign(elemtypeId, &elemtypeLength, &elemtypeByValue,
|
|
&elemtypeAlignment);
|
|
|
|
int elementIndex = 0;
|
|
int indexes[] = { shardSplitInfoIndex + 1, elementIndex + 1 };
|
|
bool isNull = false;
|
|
|
|
/* Get source shard Id */
|
|
Datum sourceShardIdDat = array_ref(
|
|
shardInfoArrayObject,
|
|
2,
|
|
indexes,
|
|
-1, /* (> 0 is for fixed-length arrays -- these are assumed to be 1-d, 0-based) */
|
|
elemtypeLength,
|
|
elemtypeByValue,
|
|
elemtypeAlignment,
|
|
&isNull);
|
|
|
|
if (isNull)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("null entry found for source shardId")));
|
|
}
|
|
|
|
*sourceShardId = DatumGetUInt64(sourceShardIdDat);
|
|
|
|
/* Get destination shard Id */
|
|
elementIndex++;
|
|
isNull = false;
|
|
indexes[0] = shardSplitInfoIndex + 1;
|
|
indexes[1] = elementIndex + 1;
|
|
Datum destinationShardIdDat = array_ref(
|
|
shardInfoArrayObject,
|
|
2,
|
|
indexes,
|
|
-1, /* (> 0 is for fixed-length arrays -- these are assumed to be 1-d, 0-based) */
|
|
elemtypeLength,
|
|
elemtypeByValue,
|
|
elemtypeAlignment,
|
|
&isNull);
|
|
|
|
if (isNull)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("null entry found for destination shardId")));
|
|
}
|
|
|
|
*desShardId = DatumGetUInt64(destinationShardIdDat);
|
|
|
|
/* Get minValue for destination shard */
|
|
elementIndex++;
|
|
isNull = false;
|
|
indexes[0] = shardSplitInfoIndex + 1;
|
|
indexes[1] = elementIndex + 1;
|
|
Datum minValueDat = array_ref(
|
|
shardInfoArrayObject,
|
|
2,
|
|
indexes,
|
|
-1, /* (> 0 is for fixed-length arrays -- these are assumed to be 1-d, 0-based) */
|
|
elemtypeLength,
|
|
elemtypeByValue,
|
|
elemtypeAlignment,
|
|
&isNull);
|
|
|
|
if (isNull)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("null entry found for min value")));
|
|
}
|
|
|
|
*minValue = DatumGetInt32(minValueDat);
|
|
|
|
/* Get maxValue for destination shard */
|
|
elementIndex++;
|
|
isNull = false;
|
|
indexes[0] = shardSplitInfoIndex + 1;
|
|
indexes[1] = elementIndex + 1;
|
|
Datum maxValueDat = array_ref(
|
|
shardInfoArrayObject,
|
|
2,
|
|
indexes,
|
|
-1, /* (> 0 is for fixed-length arrays -- these are assumed to be 1-d, 0-based) */
|
|
elemtypeLength,
|
|
elemtypeByValue,
|
|
elemtypeAlignment,
|
|
&isNull);
|
|
|
|
if (isNull)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("null entry found for max value")));
|
|
}
|
|
|
|
*maxValue = DatumGetInt32(maxValueDat);
|
|
|
|
/* Get nodeId for shard placement*/
|
|
elementIndex++;
|
|
isNull = false;
|
|
indexes[0] = shardSplitInfoIndex + 1;
|
|
indexes[1] = elementIndex + 1;
|
|
Datum nodeIdDat = array_ref(
|
|
shardInfoArrayObject,
|
|
2,
|
|
indexes,
|
|
-1, /* (> 0 is for fixed-length arrays -- these are assumed to be 1-d, 0-based) */
|
|
elemtypeLength,
|
|
elemtypeByValue,
|
|
elemtypeAlignment,
|
|
&isNull);
|
|
|
|
if (isNull)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("null entry found for max value")));
|
|
}
|
|
|
|
*nodeId = DatumGetInt32(nodeIdDat);
|
|
|
|
PG_RETURN_VOID();
|
|
}
|
|
|
|
|
|
/*
|
|
* CreateShardSplitInfo function constructs ShardSplitInfo data structure
|
|
* with appropriate OIs' for source and destination relation.
|
|
*
|
|
* sourceShardIdToSplit - Existing shardId which has a valid entry in cache and catalogue
|
|
* desSplitChildShardId - New split child shard which doesn't have an entry in metacache yet.
|
|
* minValue - Minimum hash value for desSplitChildShardId
|
|
* maxValue - Maximum hash value for desSplitChildShardId
|
|
* nodeId - NodeId where
|
|
* However we can use shard ID and construct qualified shardName.
|
|
*/
|
|
ShardSplitInfo *
|
|
CreateShardSplitInfo(uint64 sourceShardIdToSplit,
|
|
uint64 desSplitChildShardId,
|
|
int32 minValue,
|
|
int32 maxValue,
|
|
int32 nodeId)
|
|
{
|
|
ShardInterval *shardIntervalToSplit = LoadShardInterval(sourceShardIdToSplit);
|
|
CitusTableCacheEntry *cachedTableEntry = GetCitusTableCacheEntry(
|
|
shardIntervalToSplit->relationId);
|
|
|
|
/*Todo(sameer): Also check if non-distributed table */
|
|
if (!IsCitusTableTypeCacheEntry(cachedTableEntry, HASH_DISTRIBUTED))
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("Cannot Support the feature")));
|
|
}
|
|
|
|
Assert(shardIntervalToSplit->minValueExists);
|
|
Assert(shardIntervalToSplit->maxValueExists);
|
|
|
|
/* Oid of distributed table */
|
|
Oid citusTableOid = InvalidOid;
|
|
citusTableOid = shardIntervalToSplit->relationId;
|
|
Oid sourceShardToSplitOid = InvalidOid;
|
|
sourceShardToSplitOid = GetTableLocalShardOid(citusTableOid,
|
|
sourceShardIdToSplit);
|
|
|
|
/* Oid of dummy table at the source */
|
|
Oid desSplitChildShardOid = InvalidOid;
|
|
desSplitChildShardOid = GetTableLocalShardOid(citusTableOid,
|
|
desSplitChildShardId);
|
|
|
|
if (citusTableOid == InvalidOid ||
|
|
sourceShardToSplitOid == InvalidOid ||
|
|
desSplitChildShardOid == InvalidOid)
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("Invalid citusTableOid:%u "
|
|
"sourceShardToSplitOid: %u,"
|
|
"desSplitChildShardOid :%u ",
|
|
citusTableOid,
|
|
sourceShardToSplitOid,
|
|
desSplitChildShardOid)));
|
|
}
|
|
|
|
/* determine the partition column in the tuple descriptor */
|
|
Var *partitionColumn = cachedTableEntry->partitionColumn;
|
|
if (partitionColumn == NULL)
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("Invalid Partition Column")));
|
|
}
|
|
int partitionColumnIndex = -1;
|
|
partitionColumnIndex = partitionColumn->varattno - 1;
|
|
|
|
ShardSplitInfo *shardSplitInfo = palloc0(sizeof(ShardSplitInfo));
|
|
shardSplitInfo->distributedTableOid = citusTableOid;
|
|
shardSplitInfo->partitionColumnIndex = partitionColumnIndex;
|
|
shardSplitInfo->sourceShardOid = sourceShardToSplitOid;
|
|
shardSplitInfo->splitChildShardOid = desSplitChildShardOid;
|
|
shardSplitInfo->shardMinValue = minValue;
|
|
shardSplitInfo->shardMaxValue = maxValue;
|
|
shardSplitInfo->nodeId = nodeId;
|
|
|
|
return shardSplitInfo;
|
|
}
|
|
|
|
|
|
/*
|
|
* AddShardSplitInfoEntryForNodeInMap function add's ShardSplitInfo entry
|
|
* to the hash map. The key is nodeId on which the new shard is to be placed.
|
|
*/
|
|
void
|
|
AddShardSplitInfoEntryForNodeInMap(ShardSplitInfo *shardSplitInfo)
|
|
{
|
|
uint64_t keyNodeId = shardSplitInfo->nodeId;
|
|
bool found = false;
|
|
NodeShardMappingEntry *nodeMappingEntry =
|
|
(NodeShardMappingEntry *) hash_search(ShardInfoHashMap, &keyNodeId, HASH_ENTER,
|
|
&found);
|
|
|
|
if (!found)
|
|
{
|
|
nodeMappingEntry->shardSplitInfoList = NULL;
|
|
nodeMappingEntry->key = keyNodeId;
|
|
}
|
|
|
|
nodeMappingEntry->shardSplitInfoList =
|
|
lappend(nodeMappingEntry->shardSplitInfoList, (ShardSplitInfo *) shardSplitInfo);
|
|
|
|
PG_RETURN_VOID();
|
|
}
|
|
|
|
|
|
/*
|
|
* PopulateShardSplitInfoInSM function copies information from the hash map
|
|
* into shared memory segment. This information is consumed by the WAL sender
|
|
* process during logical replication.
|
|
*
|
|
* shardSplitInfoArray - Shared memory pointer where information has to
|
|
* be copied
|
|
*
|
|
* shardInfoHashMap - Hashmap containing parsed split information
|
|
* per nodeId wise
|
|
*
|
|
* dsmHandle - Shared memory segment handle
|
|
*/
|
|
void *
|
|
PopulateShardSplitInfoInSM(ShardSplitInfo *shardSplitInfoArray,
|
|
HTAB *shardInfoHashMap,
|
|
dsm_handle dsmHandle,
|
|
int shardSplitInfoCount)
|
|
{
|
|
HASH_SEQ_STATUS status;
|
|
hash_seq_init(&status, shardInfoHashMap);
|
|
|
|
NodeShardMappingEntry *entry = NULL;
|
|
int index = 0;
|
|
while ((entry = (NodeShardMappingEntry *) hash_seq_search(&status)) != NULL)
|
|
{
|
|
uint64_t nodeId = entry->key;
|
|
char *derivedSlotName =
|
|
encode_replication_slot(nodeId, dsmHandle);
|
|
|
|
List *shardSplitInfoList = entry->shardSplitInfoList;
|
|
ListCell *listCell = NULL;
|
|
foreach(listCell, shardSplitInfoList)
|
|
{
|
|
ShardSplitInfo *splitShardInfo = (ShardSplitInfo *) lfirst(listCell);
|
|
ShardSplitInfo *shardInfoInSM = &shardSplitInfoArray[index];
|
|
|
|
shardInfoInSM->distributedTableOid = splitShardInfo->distributedTableOid;
|
|
shardInfoInSM->partitionColumnIndex = splitShardInfo->partitionColumnIndex;
|
|
shardInfoInSM->sourceShardOid = splitShardInfo->sourceShardOid;
|
|
shardInfoInSM->splitChildShardOid = splitShardInfo->splitChildShardOid;
|
|
shardInfoInSM->shardMinValue = splitShardInfo->shardMinValue;
|
|
shardInfoInSM->shardMaxValue = splitShardInfo->shardMaxValue;
|
|
shardInfoInSM->nodeId = splitShardInfo->nodeId;
|
|
strcpy_s(shardInfoInSM->slotName, NAMEDATALEN, derivedSlotName);
|
|
index++;
|
|
}
|
|
}
|
|
|
|
PG_RETURN_VOID();
|
|
}
|