mirror of https://github.com/citusdata/citus.git
Defer reference table replication to shard creation time
parent
76a8a3c7c9
commit
bf6b12e351
|
@ -346,6 +346,12 @@ CreateDistributedTable(Oid relationId, Var *distributionColumn, char distributio
|
|||
EnsureRelationCanBeDistributed(relationId, distributionColumn, distributionMethod,
|
||||
colocationId, replicationModel, viaDeprecatedAPI);
|
||||
|
||||
/*
|
||||
* Make sure that existing reference tables have been replicated to all the nodes
|
||||
* such that we can create foreign keys and joins work immediately after creation.
|
||||
*/
|
||||
EnsureReferenceTablesExistOnAllNodes();
|
||||
|
||||
/* we need to calculate these variables before creating distributed metadata */
|
||||
bool localTableEmpty = LocalTableEmpty(relationId);
|
||||
Oid colocatedTableId = ColocatedTableId(colocationId);
|
||||
|
|
|
@ -86,6 +86,8 @@ master_create_worker_shards(PG_FUNCTION_ARGS)
|
|||
ObjectAddressSet(tableAddress, RelationRelationId, distributedTableId);
|
||||
EnsureDependenciesExistOnAllNodes(&tableAddress);
|
||||
|
||||
EnsureReferenceTablesExistOnAllNodes();
|
||||
|
||||
CreateShardsWithRoundRobinPolicy(distributedTableId, shardCount, replicationFactor,
|
||||
useExclusiveConnections);
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "distributed/metadata_sync.h"
|
||||
#include "distributed/multi_join_order.h"
|
||||
#include "distributed/multi_partitioning_utils.h"
|
||||
#include "distributed/reference_table_utils.h"
|
||||
#include "distributed/resource_lock.h"
|
||||
#include "distributed/worker_manager.h"
|
||||
#include "distributed/worker_protocol.h"
|
||||
|
@ -449,6 +450,19 @@ ReplicateColocatedShardPlacement(int64 shardId, char *sourceNodeName,
|
|||
targetNodeName, targetNodePort);
|
||||
}
|
||||
|
||||
if (!IsReferenceTable(distributedTableId))
|
||||
{
|
||||
/*
|
||||
* When copying a shard to a new node, we should first ensure that reference
|
||||
* tables are present such that joins work immediately after copying the shard.
|
||||
* When copying a reference table, we are probably trying to achieve just that.
|
||||
*
|
||||
* Since this is a long-running operation we do this after the error checks, but
|
||||
* before taking metadata locks.
|
||||
*/
|
||||
EnsureReferenceTablesExistOnAllNodes();
|
||||
}
|
||||
|
||||
/*
|
||||
* CopyColocatedShardPlacement function copies given shard with its co-located
|
||||
* shards.
|
||||
|
|
|
@ -371,8 +371,6 @@ SetUpDistributedTableDependencies(WorkerNode *newWorkerNode)
|
|||
EnsureNoModificationsHaveBeenDone();
|
||||
ReplicateAllDependenciesToNode(newWorkerNode->workerName,
|
||||
newWorkerNode->workerPort);
|
||||
ReplicateAllReferenceTablesToNode(newWorkerNode->workerName,
|
||||
newWorkerNode->workerPort);
|
||||
|
||||
/*
|
||||
* Let the maintenance daemon do the hard work of syncing the metadata.
|
||||
|
|
|
@ -4,3 +4,4 @@
|
|||
|
||||
#include "udfs/citus_extradata_container/9.3-2.sql"
|
||||
#include "udfs/update_distributed_table_colocation/9.3-2.sql"
|
||||
#include "udfs/replicate_reference_tables/9.3-1.sql"
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
-- Declares pg_catalog.replicate_reference_tables(): a zero-argument function
-- returning VOID whose implementation is the C symbol
-- replicate_reference_tables in MODULE_PATHNAME. The statements below also
-- attach a description to the function and revoke all privileges on it
-- from PUBLIC.
CREATE FUNCTION pg_catalog.replicate_reference_tables()
|
||||
RETURNS VOID
|
||||
LANGUAGE C STRICT
|
||||
AS 'MODULE_PATHNAME', $$replicate_reference_tables$$;
|
||||
COMMENT ON FUNCTION pg_catalog.replicate_reference_tables()
|
||||
IS 'replicate reference tables to all nodes';
|
||||
REVOKE ALL ON FUNCTION pg_catalog.replicate_reference_tables() FROM PUBLIC;
|
|
@ -0,0 +1,7 @@
|
|||
-- Declares pg_catalog.replicate_reference_tables(): a zero-argument function
-- returning VOID whose implementation is the C symbol
-- replicate_reference_tables in MODULE_PATHNAME. The statements below also
-- attach a description to the function and revoke all privileges on it
-- from PUBLIC.
CREATE FUNCTION pg_catalog.replicate_reference_tables()
|
||||
RETURNS VOID
|
||||
LANGUAGE C STRICT
|
||||
AS 'MODULE_PATHNAME', $$replicate_reference_tables$$;
|
||||
COMMENT ON FUNCTION pg_catalog.replicate_reference_tables()
|
||||
IS 'replicate reference tables to all nodes';
|
||||
REVOKE ALL ON FUNCTION pg_catalog.replicate_reference_tables() FROM PUBLIC;
|
|
@ -24,18 +24,24 @@
|
|||
#include "distributed/metadata_sync.h"
|
||||
#include "distributed/multi_logical_planner.h"
|
||||
#include "distributed/reference_table_utils.h"
|
||||
#include "distributed/remote_commands.h"
|
||||
#include "distributed/resource_lock.h"
|
||||
#include "distributed/shardinterval_utils.h"
|
||||
#include "distributed/transaction_management.h"
|
||||
#include "distributed/worker_manager.h"
|
||||
#include "distributed/worker_transaction.h"
|
||||
#include "postmaster/postmaster.h"
|
||||
#include "storage/lmgr.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/fmgroids.h"
|
||||
#include "utils/lsyscache.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
|
||||
/* local function forward declarations */
|
||||
static List * WorkersWithoutReferenceTablePlacement(uint64 shardId);
|
||||
static void CopyShardPlacementToNewWorkerNode(ShardPlacement *sourceShardPlacement,
|
||||
WorkerNode *newWorkerNode);
|
||||
static void ReplicateSingleShardTableToAllNodes(Oid relationId);
|
||||
static void ReplicateShardToAllNodes(ShardInterval *shardInterval);
|
||||
static void ReplicateShardToNode(ShardInterval *shardInterval, char *nodeName,
|
||||
|
@ -44,6 +50,173 @@ static void ConvertToReferenceTableMetadata(Oid relationId, uint64 shardId);
|
|||
|
||||
/* exports for SQL callable functions */
|
||||
PG_FUNCTION_INFO_V1(upgrade_to_reference_table);
|
||||
PG_FUNCTION_INFO_V1(replicate_reference_tables);
|
||||
|
||||
|
||||
/*
|
||||
* IsReferenceTable returns whether the given relation ID identifies a reference
|
||||
* table.
|
||||
*/
|
||||
bool
|
||||
IsReferenceTable(Oid relationId)
|
||||
{
|
||||
CitusTableCacheEntry *tableEntry = GetCitusTableCacheEntry(relationId);
|
||||
|
||||
if (!tableEntry->isCitusTable)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tableEntry->partitionMethod != DISTRIBUTE_BY_NONE)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* replicate_reference_tables is a UDF to ensure that allreference tables are
|
||||
* replicated to all nodes.
|
||||
*/
|
||||
Datum
|
||||
replicate_reference_tables(PG_FUNCTION_ARGS)
|
||||
{
|
||||
EnsureReferenceTablesExistOnAllNodes();
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* EnsureReferenceTablesExistOnAllNodes ensures that a shard placement for every
|
||||
* reference table exists on all nodes. If a node does not have a set of shard
|
||||
* placements, then master_copy_shard_placement is called in a subtransaction
|
||||
* to pull the data to the new node.
|
||||
*/
|
||||
void
|
||||
EnsureReferenceTablesExistOnAllNodes(void)
|
||||
{
|
||||
List *referenceTableIdList = ReferenceTableOidList();
|
||||
if (list_length(referenceTableIdList) == 0)
|
||||
{
|
||||
/* no reference tables exist */
|
||||
return;
|
||||
}
|
||||
|
||||
Oid referenceTableId = linitial_oid(referenceTableIdList);
|
||||
List *shardIntervalList = LoadShardIntervalList(referenceTableId);
|
||||
if (list_length(shardIntervalList) == 0)
|
||||
{
|
||||
/* check for corrupt metadata */
|
||||
ereport(ERROR, (errmsg("reference table \"%s\" does not have a shard",
|
||||
get_rel_name(referenceTableId))));
|
||||
}
|
||||
|
||||
ShardInterval *shardInterval = (ShardInterval *) linitial(shardIntervalList);
|
||||
uint64 shardId = shardInterval->shardId;
|
||||
|
||||
/* prevent this funcion from running concurrently with itself */
|
||||
int colocationId = TableColocationId(referenceTableId);
|
||||
LockColocationId(colocationId, ExclusiveLock);
|
||||
|
||||
List *newWorkersList = WorkersWithoutReferenceTablePlacement(shardId);
|
||||
if (list_length(newWorkersList) == 0)
|
||||
{
|
||||
/* nothing to do, no need for lock */
|
||||
UnlockColocationId(colocationId, ExclusiveLock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* TODO: ensure reference tables have not been modified in this transaction */
|
||||
|
||||
bool missingOk = false;
|
||||
ShardPlacement *sourceShardPlacement = ActiveShardPlacement(shardId, missingOk);
|
||||
if (sourceShardPlacement == NULL)
|
||||
{
|
||||
/* check for corrupt metadata */
|
||||
ereport(ERROR, (errmsg("reference table shard " UINT64_FORMAT " does not "
|
||||
"have an active shard placement",
|
||||
shardId)));
|
||||
}
|
||||
|
||||
WorkerNode *newWorkerNode = NULL;
|
||||
foreach_ptr(newWorkerNode, newWorkersList)
|
||||
{
|
||||
CopyShardPlacementToNewWorkerNode(sourceShardPlacement, newWorkerNode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unblock other backends, they will probably observe that there are no
|
||||
* more worker nodes without placements, unless nodes were added concurrently
|
||||
*/
|
||||
UnlockColocationId(colocationId, ExclusiveLock);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* WorkersWithoutReferenceTablePlacement returns a list of workers (WorkerNode) that
|
||||
* do not yet have a placement for the given reference table shard ID, but are
|
||||
* supposed to.
|
||||
*/
|
||||
static List *
|
||||
WorkersWithoutReferenceTablePlacement(uint64 shardId)
|
||||
{
|
||||
List *workersWithoutPlacements = NIL;
|
||||
|
||||
List *shardPlacementList = ActiveShardPlacementList(shardId);
|
||||
|
||||
/* we only take an access share lock, otherwise we'll hold up master_add_node */
|
||||
List *workerNodeList = ReferenceTablePlacementNodeList(AccessShareLock);
|
||||
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
|
||||
|
||||
WorkerNode *workerNode = NULL;
|
||||
foreach_ptr(workerNode, workerNodeList)
|
||||
{
|
||||
char *nodeName = workerNode->workerName;
|
||||
uint32 nodePort = workerNode->workerPort;
|
||||
bool missingWorkerOk = true;
|
||||
ShardPlacement *targetPlacement = SearchShardPlacementInList(shardPlacementList,
|
||||
nodeName, nodePort,
|
||||
missingWorkerOk);
|
||||
if (targetPlacement == NULL)
|
||||
{
|
||||
workersWithoutPlacements = lappend(workersWithoutPlacements, workerNode);
|
||||
}
|
||||
}
|
||||
|
||||
return workersWithoutPlacements;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CopyShardPlacementToNewWorkerNode runs master_copy_shard_placement in a
|
||||
* subtransaction by connecting to localhost.
|
||||
*/
|
||||
static void
|
||||
CopyShardPlacementToNewWorkerNode(ShardPlacement *sourceShardPlacement,
|
||||
WorkerNode *newWorkerNode)
|
||||
{
|
||||
int connectionFlags = OUTSIDE_TRANSACTION;
|
||||
StringInfo queryString = makeStringInfo();
|
||||
const char *userName = CitusExtensionOwnerName();
|
||||
|
||||
MultiConnection *connection = GetNodeUserDatabaseConnection(
|
||||
connectionFlags, "localhost", PostPortNumber,
|
||||
userName, NULL);
|
||||
|
||||
appendStringInfo(queryString,
|
||||
"SELECT master_copy_shard_placement("
|
||||
UINT64_FORMAT ", %s, %d, %s, %d, do_repair := false)",
|
||||
sourceShardPlacement->shardId,
|
||||
quote_literal_cstr(sourceShardPlacement->nodeName),
|
||||
sourceShardPlacement->nodePort,
|
||||
quote_literal_cstr(newWorkerNode->workerName),
|
||||
newWorkerNode->workerPort);
|
||||
|
||||
ExecuteCriticalRemoteCommand(connection, queryString->data);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
|
@ -110,66 +283,6 @@ upgrade_to_reference_table(PG_FUNCTION_ARGS)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* ReplicateAllReferenceTablesToNode function finds all reference tables and
|
||||
* replicates them to the given worker node. It also modifies pg_dist_colocation
|
||||
* table to update the replication factor column when necessary. This function
|
||||
* skips reference tables if that node already has healthy placement of that
|
||||
* reference table to prevent unnecessary data transfer.
|
||||
*/
|
||||
void
|
||||
ReplicateAllReferenceTablesToNode(char *nodeName, int nodePort)
|
||||
{
|
||||
List *referenceTableList = ReferenceTableOidList();
|
||||
|
||||
/* if there is no reference table, we do not need to replicate anything */
|
||||
if (list_length(referenceTableList) > 0)
|
||||
{
|
||||
List *referenceShardIntervalList = NIL;
|
||||
|
||||
/*
|
||||
* We sort the reference table list to prevent deadlocks in concurrent
|
||||
* ReplicateAllReferenceTablesToAllNodes calls.
|
||||
*/
|
||||
referenceTableList = SortList(referenceTableList, CompareOids);
|
||||
Oid referenceTableId = InvalidOid;
|
||||
foreach_oid(referenceTableId, referenceTableList)
|
||||
{
|
||||
List *shardIntervalList = LoadShardIntervalList(referenceTableId);
|
||||
ShardInterval *shardInterval = (ShardInterval *) linitial(shardIntervalList);
|
||||
|
||||
referenceShardIntervalList = lappend(referenceShardIntervalList,
|
||||
shardInterval);
|
||||
}
|
||||
|
||||
if (ClusterHasKnownMetadataWorkers())
|
||||
{
|
||||
BlockWritesToShardList(referenceShardIntervalList);
|
||||
}
|
||||
|
||||
ShardInterval *shardInterval = NULL;
|
||||
foreach_ptr(shardInterval, referenceShardIntervalList)
|
||||
{
|
||||
uint64 shardId = shardInterval->shardId;
|
||||
|
||||
LockShardDistributionMetadata(shardId, ExclusiveLock);
|
||||
|
||||
ReplicateShardToNode(shardInterval, nodeName, nodePort);
|
||||
}
|
||||
|
||||
/* create foreign constraints between reference tables */
|
||||
foreach_ptr(shardInterval, referenceShardIntervalList)
|
||||
{
|
||||
char *tableOwner = TableOwner(shardInterval->relationId);
|
||||
List *commandList = CopyShardForeignConstraintCommandList(shardInterval);
|
||||
|
||||
SendCommandListToWorkerInSingleTransaction(nodeName, nodePort, tableOwner,
|
||||
commandList);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ReplicateSingleShardTableToAllNodes accepts a broadcast table and replicates
|
||||
* it to all worker nodes, and the coordinator if it has been added by the user
|
||||
|
|
|
@ -319,6 +319,36 @@ IntToLockMode(int mode)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* LockColocationId returns after acquiring a co-location ID lock, typically used
|
||||
* for rebalancing and replication.
|
||||
*/
|
||||
void
|
||||
LockColocationId(int colocationId, LOCKMODE lockMode)
|
||||
{
|
||||
LOCKTAG tag;
|
||||
const bool sessionLock = false;
|
||||
const bool dontWait = false;
|
||||
|
||||
SET_LOCKTAG_REBALANCE_COLOCATION(tag, (int64) colocationId);
|
||||
(void) LockAcquire(&tag, lockMode, sessionLock, dontWait);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* UnlockColocationId releases a co-location ID lock.
|
||||
*/
|
||||
void
|
||||
UnlockColocationId(int colocationId, LOCKMODE lockMode)
|
||||
{
|
||||
LOCKTAG tag;
|
||||
const bool sessionLock = false;
|
||||
|
||||
SET_LOCKTAG_REBALANCE_COLOCATION(tag, (int64) colocationId);
|
||||
LockRelease(&tag, lockMode, sessionLock);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* LockShardDistributionMetadata returns after grabbing a lock for distribution
|
||||
* metadata related to the specified shard, blocking if required. Any locks
|
||||
|
|
|
@ -16,8 +16,9 @@
|
|||
|
||||
#include "listutils.h"
|
||||
|
||||
extern bool IsReferenceTable(Oid relationId);
|
||||
extern void EnsureReferenceTablesExistOnAllNodes(void);
|
||||
extern uint32 CreateReferenceTableColocationId(void);
|
||||
extern void ReplicateAllReferenceTablesToNode(char *nodeName, int nodePort);
|
||||
extern void DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId);
|
||||
extern List * ReferenceTableOidList(void);
|
||||
extern int CompareOids(const void *leftElement, const void *rightElement);
|
||||
|
|
|
@ -102,6 +102,10 @@ extern void UnlockShardResource(uint64 shardId, LOCKMODE lockmode);
|
|||
extern void LockJobResource(uint64 jobId, LOCKMODE lockmode);
|
||||
extern void UnlockJobResource(uint64 jobId, LOCKMODE lockmode);
|
||||
|
||||
/* Lock a co-location group */
|
||||
extern void LockColocationId(int colocationId, LOCKMODE lockMode);
|
||||
extern void UnlockColocationId(int colocationId, LOCKMODE lockMode);
|
||||
|
||||
/* Lock multiple shards for safe modification */
|
||||
extern void LockShardListMetadata(List *shardIntervalList, LOCKMODE lockMode);
|
||||
extern void LockShardsInPlacementListMetadata(List *shardPlacementList,
|
||||
|
|
|
@ -247,7 +247,6 @@ SELECT run_command_on_workers($$SELECT extversion FROM pg_extension WHERE extnam
|
|||
|
||||
-- and add the other node
|
||||
SELECT 1 from master_add_node('localhost', :worker_2_port);
|
||||
NOTICE: Replicating reference table "ref_table_2" to the node localhost:xxxxx
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
1
|
||||
|
@ -443,7 +442,6 @@ BEGIN;
|
|||
COMMIT;
|
||||
-- add the node back
|
||||
SELECT 1 from master_add_node('localhost', :worker_2_port);
|
||||
NOTICE: Replicating reference table "t3" to the node localhost:xxxxx
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
1
|
||||
|
|
Loading…
Reference in New Issue