1) Revert the original order of the workflow

2) Introduce GetNextShardIdForSplitChild method
users/saawasek/non_blocking_split_integrated
Sameer Awasekar 2022-08-04 16:26:15 +05:30
parent b92b00e3cf
commit 00c3830bee
3 changed files with 124 additions and 105 deletions

View File

@ -39,6 +39,7 @@
#include "distributed/shardsplit_logical_replication.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/shard_rebalancer.h"
#include "postmaster/postmaster.h"
/*
* Entry for map that tracks ShardInterval -> Placement Node
@ -127,6 +128,7 @@ static void AddDummyShardEntryInMap(HTAB *mapOfDummyShards, uint32 targetNodeId,
ShardInterval *shardInterval);
static void DropDummyShards(HTAB *mapOfDummyShardToPlacement);
static void DropDummyShard(MultiConnection *connection, ShardInterval *shardInterval);
static uint64 GetNextShardIdForSplitChild(void);
/* Customize error message strings based on operation type */
@ -963,7 +965,7 @@ CreateSplitIntervalsForShard(ShardInterval *sourceShard,
{
ShardInterval *splitChildShardInterval = CopyShardInterval(sourceShard);
splitChildShardInterval->shardIndex = -1;
splitChildShardInterval->shardId = GetNextShardId();
splitChildShardInterval->shardId = GetNextShardIdForSplitChild();
splitChildShardInterval->minValueExists = true;
splitChildShardInterval->minValue = currentSplitChildMinValue;
@ -1302,30 +1304,13 @@ NonBlockingShardSplit(SplitOperation splitOperation,
/* Non-Blocking shard split workflow starts here */
PG_TRY();
{
/*
* 1) Create empty publications. Tables will be added after
* template replication slot and split shards are created.
*/
CreateShardSplitEmptyPublications(sourceConnection,
shardSplitHashMapForPublication);
/*
* 2) Create template replication Slot. It returns a snapshot. The snapshot remains
* valid till the lifetime of the session that creates it. The connection is closed
* at the end of the workflow.
*/
MultiConnection *templateSlotConnection = NULL;
char *snapShotName = CreateTemplateReplicationSlotAndReturnSnapshot(
shardIntervalToSplit, sourceShardToCopyNode, &templateSlotConnection);
/* 3) Physically create split children. */
/* 1) Physically create split children. */
CreateSplitShardsForShardGroup(mapOfShardToPlacementCreatedByWorkflow,
shardGroupSplitIntervalListList,
workersForPlacementList);
/*
* 4) Create dummy shards due to PG logical replication constraints.
* 2) Create dummy shards due to PG logical replication constraints.
* Refer to the comment section of 'CreateDummyShardsForShardGroup' for indepth
* information.
*/
@ -1339,17 +1324,24 @@ NonBlockingShardSplit(SplitOperation splitOperation,
CreateReplicaIdentities(mapOfDummyShardToPlacement,
shardGroupSplitIntervalListList, workersForPlacementList);
/* 3) Create Publications. */
CreateShardSplitPublications(sourceConnection, shardSplitHashMapForPublication);
/* 5) Alter Publications and add split shards for logical replication */
AlterShardSplitPublications(sourceConnection, shardSplitHashMapForPublication);
/*
* 4) Create template replication Slot. It returns a snapshot. The snapshot remains
* valid till the lifetime of the session that creates it. The connection is closed
* at the end of the workflow.
*/
MultiConnection *templateSlotConnection = NULL;
char *snapShotName = CreateTemplateReplicationSlotAndReturnSnapshot(
shardIntervalToSplit, sourceShardToCopyNode, &templateSlotConnection);
/* 6) Do snapshotted Copy */
/* 5) Do snapshotted Copy */
DoSplitCopy(sourceShardToCopyNode, sourceColocatedShardIntervalList,
shardGroupSplitIntervalListList, workersForPlacementList,
snapShotName);
/* 7) Execute 'worker_split_shard_replication_setup' UDF */
/* 6) Execute 'worker_split_shard_replication_setup' UDF */
List *replicationSlotInfoList = ExecuteSplitShardReplicationSetupUDF(
sourceShardToCopyNode,
sourceColocatedShardIntervalList,
@ -1364,19 +1356,19 @@ NonBlockingShardSplit(SplitOperation splitOperation,
PopulateShardSplitSubscriptionsMetadataList(
shardSplitHashMapForPublication, replicationSlotInfoList);
/* Create connections to the target nodes. TODO: can be refactored */
/* Create connections to the target nodes */
List *targetNodeConnectionList = CreateTargetNodeConnectionsForShardSplit(
shardSplitSubscribersMetadataList,
connectionFlags,
superUser, databaseName);
/* 8) Create copies of template replication slot */
/* 7) Create copies of template replication slot */
char *templateSlotName = ShardSplitTemplateReplicationSlotName(
shardIntervalToSplit->shardId);
CreateReplicationSlots(sourceConnection, templateSlotName,
shardSplitSubscribersMetadataList);
/* 9) Create subscriptions on target nodes */
/* 8) Create subscriptions on target nodes */
CreateShardSplitSubscriptions(targetNodeConnectionList,
shardSplitSubscribersMetadataList,
sourceShardToCopyNode,
@ -1386,40 +1378,40 @@ NonBlockingShardSplit(SplitOperation splitOperation,
/* Used for testing */
ConflictOnlyWithIsolationTesting();
/* 10) Wait for subscriptions to be ready */
/* 9) Wait for subscriptions to be ready */
WaitForShardSplitRelationSubscriptionsBecomeReady(
shardSplitSubscribersMetadataList);
/* 11) Wait for subscribers to catchup till source LSN */
/* 10) Wait for subscribers to catchup till source LSN */
XLogRecPtr sourcePosition = GetRemoteLogPosition(sourceConnection);
WaitForShardSplitRelationSubscriptionsToBeCaughtUp(sourcePosition,
shardSplitSubscribersMetadataList);
/* 12) Create Auxiliary structures */
/* 11) Create Auxiliary structures */
CreateAuxiliaryStructuresForShardGroup(shardGroupSplitIntervalListList,
workersForPlacementList,
false /* includeReplicaIdentity*/);
/* 13) Wait for subscribers to catchup till source LSN */
/* 12) Wait for subscribers to catchup till source LSN */
sourcePosition = GetRemoteLogPosition(sourceConnection);
WaitForShardSplitRelationSubscriptionsToBeCaughtUp(sourcePosition,
shardSplitSubscribersMetadataList);
/* 14) Block writes on source shards */
/* 13) Block writes on source shards */
BlockWritesToShardList(sourceColocatedShardIntervalList);
/* 15) Wait for subscribers to catchup till source LSN */
/* 14) Wait for subscribers to catchup till source LSN */
sourcePosition = GetRemoteLogPosition(sourceConnection);
WaitForShardSplitRelationSubscriptionsToBeCaughtUp(sourcePosition,
shardSplitSubscribersMetadataList);
/* 16) Drop Subscribers */
/* 15) Drop Subscribers */
DropShardSplitSubsriptions(shardSplitSubscribersMetadataList);
/* 17) Drop Publications */
/* 16) Drop Publications */
DropShardSplitPublications(sourceConnection, shardSplitHashMapForPublication);
/* 18) Drop replication slots
/* 17) Drop replication slots
* Drop template and subscriber replication slots
*/
DropShardReplicationSlot(sourceConnection, ShardSplitTemplateReplicationSlotName(
@ -1427,13 +1419,13 @@ NonBlockingShardSplit(SplitOperation splitOperation,
DropShardSplitReplicationSlots(sourceConnection, replicationSlotInfoList);
/*
* 19) Drop old shards and delete related metadata. Have to do that before
* 18) Drop old shards and delete related metadata. Have to do that before
* creating the new shard metadata, because there's cross-checks
* preventing inconsistent metadata (like overlapping shards).
*/
DropShardList(sourceColocatedShardIntervalList);
/* 20) Insert new shard and placement metadata */
/* 19) Insert new shard and placement metadata */
InsertSplitChildrenShardMetadata(shardGroupSplitIntervalListList,
workersForPlacementList);
@ -1441,7 +1433,7 @@ NonBlockingShardSplit(SplitOperation splitOperation,
workersForPlacementList);
/*
* 21) Create foreign keys if exists after the metadata changes happening in
* 20) Create foreign keys if exists after the metadata changes happening in
* DropShardList() and InsertSplitChildrenShardMetadata() because the foreign
* key creation depends on the new metadata.
*/
@ -1449,17 +1441,17 @@ NonBlockingShardSplit(SplitOperation splitOperation,
workersForPlacementList);
/*
* 22) Drop dummy shards.
* 21) Drop dummy shards.
*/
DropDummyShards(mapOfDummyShardToPlacement);
/* 23) Close source connection */
/* 22) Close source connection */
CloseConnection(sourceConnection);
/* 24) Close all subscriber connections */
/* 23) Close all subscriber connections */
CloseShardSplitSubscriberConnections(shardSplitSubscribersMetadataList);
/* 25) Close connection of template replication slot */
/* 24) Close connection of template replication slot */
CloseConnection(templateSlotConnection);
}
PG_CATCH();
@ -1969,3 +1961,63 @@ CreateReplicaIdentities(HTAB *mapOfDummyShardToPlacement,
shardToBeDroppedNode->workerPort);
}
}
/*
 * GetNextShardIdForSplitChild returns the shard id to be used for a split child.
 *
 * The function connects to the local node through a *new* connection and advances
 * pg_dist_shardid_seq there. Using a separate session prevents a self deadlock
 * when 'CREATE_REPLICATION_SLOT' is executed as a part of the non-blocking split
 * workflow.
 *
 * Errors out (ERRCODE_CONNECTION_FAILURE) if the sequence value cannot be
 * fetched; releases the result and connection on both success and error paths.
 */
static uint64
GetNextShardIdForSplitChild()
{
	uint64 shardId = 0;

	/*
	 * In regression tests, we would like to generate shard IDs consistently
	 * even if the tests run in parallel. Instead of the sequence, we can use
	 * the next_shard_id GUC to specify which shard ID the current session should
	 * generate next. The GUC is automatically increased by 1 every time a new
	 * shard ID is generated.
	 */
	if (NextShardId > 0)
	{
		shardId = NextShardId;
		NextShardId += 1;

		return shardId;
	}

	StringInfo nextValueCommand = makeStringInfo();
	appendStringInfo(nextValueCommand, "SELECT nextval(%s);", quote_literal_cstr(
						 "pg_catalog.pg_dist_shardid_seq"));

	/* force a fresh connection so the nextval() runs in its own session */
	int connectionFlag = FORCE_NEW_CONNECTION;
	MultiConnection *connection = GetNodeUserDatabaseConnection(connectionFlag,
																LocalHostName,
																PostPortNumber,
																CitusExtensionOwnerName(),
																get_database_name(
																	MyDatabaseId));

	PGresult *result = NULL;
	int queryResult = ExecuteOptionalRemoteCommand(connection, nextValueCommand->data,
												   &result);

	/* expect exactly one row with one column: the next sequence value */
	if (queryResult != RESPONSE_OKAY || !IsResponseOK(result) || PQntuples(result) != 1 ||
		PQnfields(result) != 1)
	{
		PQclear(result);
		ForgetResults(connection);
		CloseConnection(connection);

		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg(
							"Could not generate next shard id while executing shard splits.")));
	}

	shardId = SafeStringToUint64(PQgetvalue(result, 0, 0 /* nextval column */));

	/*
	 * Release the result and drain the connection before closing it; the
	 * original success path leaked the PGresult.
	 */
	PQclear(result);
	ForgetResults(connection);
	CloseConnection(connection);

	return shardId;
}

View File

@ -33,14 +33,14 @@ static HTAB *ShardInfoHashMapForPublications = NULL;
static void AddPublishableShardEntryInMap(uint32 targetNodeId,
ShardInterval *shardInterval, bool
isChildShardInterval);
static void AlterShardSplitPublicationForNode(MultiConnection *connection,
List *shardList,
uint32_t publicationForTargetNodeId, Oid
ownerId);
ShardSplitSubscriberMetadata * CreateShardSplitSubscriberMetadata(Oid tableOwnerId, uint32
nodeId,
List *
replicationSlotInfoList);
static void CreateShardSplitPublicationForNode(MultiConnection *connection,
List *shardList,
uint32_t publicationForTargetNodeId, Oid
tableOwner);
static char * ShardSplitPublicationName(uint32_t nodeId, Oid ownerId);
static void DropAllShardSplitSubscriptions(MultiConnection *cleanupConnection);
static void DropAllShardSplitPublications(MultiConnection *cleanupConnection);
@ -174,44 +174,7 @@ AddPublishableShardEntryInMap(uint32 targetNodeId, ShardInterval *shardInterval,
/*
* CreateShardSplitEmptyPublications creates empty publications on the source node.
* Due to a sporadic bug in PG, we have to create publications before we create replication slot.
* After the template replication slot is created, these empty publications are altered
* with actual tables to be replicated.
* More details about the bug can be found in the below mailing link.
* (https://www.postgresql.org/message-id/20191010115752.2d0f27af%40firost).
*
* We follow the 'SHARD_SPLIT_X_PREFIX' naming scheme for creating publications
* related to split operations.
*/
/*
 * CreateShardSplitEmptyPublications creates one empty publication on the source
 * node per entry in shardInfoHashMapForPublication (keyed by target node id and
 * table owner id). Tables are added to these publications later via ALTER
 * PUBLICATION, after the template replication slot is created.
 */
void
CreateShardSplitEmptyPublications(MultiConnection *sourceConnection,
HTAB *shardInfoHashMapForPublication)
{
HASH_SEQ_STATUS status;
hash_seq_init(&status, shardInfoHashMapForPublication);
NodeShardMappingEntry *entry = NULL;
/* one publication per (target node, table owner) pair */
while ((entry = (NodeShardMappingEntry *) hash_seq_search(&status)) != NULL)
{
uint32 nodeId = entry->key.nodeId;
uint32 tableOwnerId = entry->key.tableOwnerId;
StringInfo createEmptyPublicationCommand = makeStringInfo();
/* no FOR TABLE clause: intentionally empty at creation time */
appendStringInfo(createEmptyPublicationCommand, "CREATE PUBLICATION %s",
ShardSplitPublicationName(nodeId, tableOwnerId));
ExecuteCriticalRemoteCommand(sourceConnection,
createEmptyPublicationCommand->data);
pfree(createEmptyPublicationCommand->data);
pfree(createEmptyPublicationCommand);
}
}
/*
* AlterShardSplitPublications alters publications on the source node.
* It adds split shards for logical replication.
* CreateShardSplitPublications creates publications on the source node.
*
* sourceConnection - Connection of source node.
*
@ -220,8 +183,8 @@ CreateShardSplitEmptyPublications(MultiConnection *sourceConnection,
* ShardIntervals mapped by key.
*/
void
AlterShardSplitPublications(MultiConnection *sourceConnection,
HTAB *shardInfoHashMapForPublication)
CreateShardSplitPublications(MultiConnection *sourceConnection,
HTAB *shardInfoHashMapForPublication)
{
HASH_SEQ_STATUS status;
hash_seq_init(&status, shardInfoHashMapForPublication);
@ -232,26 +195,30 @@ AlterShardSplitPublications(MultiConnection *sourceConnection,
uint32 nodeId = entry->key.nodeId;
uint32 tableOwnerId = entry->key.tableOwnerId;
List *shardListForPublication = entry->shardSplitInfoList;
AlterShardSplitPublicationForNode(sourceConnection,
shardListForPublication,
nodeId,
tableOwnerId);
/* Create publication on shard list */
CreateShardSplitPublicationForNode(sourceConnection,
shardListForPublication,
nodeId,
tableOwnerId);
}
}
/*
* AlterShardSplitPublicationForNode adds shards that have to be replicated
* for a given publication.
* CreateShardSplitPublicationForNode creates a publication on source node
* for given shard list.
* We follow the 'SHARD_SPLIT_X_PREFIX' naming scheme for creating publications
* related to split operations.
*/
static void
AlterShardSplitPublicationForNode(MultiConnection *connection, List *shardList,
uint32_t publicationForTargetNodeId, Oid ownerId)
CreateShardSplitPublicationForNode(MultiConnection *connection, List *shardList,
uint32_t publicationForTargetNodeId, Oid ownerId)
{
StringInfo alterPublicationCommand = makeStringInfo();
StringInfo createPublicationCommand = makeStringInfo();
bool prefixWithComma = false;
appendStringInfo(alterPublicationCommand, "ALTER PUBLICATION %s ADD TABLE ",
appendStringInfo(createPublicationCommand, "CREATE PUBLICATION %s FOR TABLE ",
ShardSplitPublicationName(publicationForTargetNodeId, ownerId));
ShardInterval *shard = NULL;
@ -261,16 +228,16 @@ AlterShardSplitPublicationForNode(MultiConnection *connection, List *shardList,
if (prefixWithComma)
{
appendStringInfoString(alterPublicationCommand, ",");
appendStringInfoString(createPublicationCommand, ",");
}
appendStringInfoString(alterPublicationCommand, shardName);
appendStringInfoString(createPublicationCommand, shardName);
prefixWithComma = true;
}
ExecuteCriticalRemoteCommand(connection, alterPublicationCommand->data);
pfree(alterPublicationCommand->data);
pfree(alterPublicationCommand);
ExecuteCriticalRemoteCommand(connection, createPublicationCommand->data);
pfree(createPublicationCommand->data);
pfree(createPublicationCommand);
}

View File

@ -87,8 +87,8 @@ extern List * CreateTargetNodeConnectionsForShardSplit(
char *databaseName);
/* Functions to drop publisher-subscriber resources */
extern void CreateShardSplitEmptyPublications(MultiConnection *sourceConnection,
HTAB *shardInfoHashMapForPublication);
extern void CreateShardSplitPublications(MultiConnection *sourceConnection,
HTAB *shardInfoHashMapForPublication);
extern char * CreateTemplateReplicationSlot(ShardInterval *shardIntervalToSplit,
MultiConnection *
sourceConnection);