Using replication origin remote commands for shard moves/splits.

pull/6453/head
Rajesh Kumar 2022-11-15 22:54:01 +05:30
parent 773dbe147d
commit 827393ab41
2 changed files with 121 additions and 72 deletions

View File

@ -32,6 +32,17 @@
* argument to the copy callback. * argument to the copy callback.
*/ */
static StringInfo LocalCopyBuffer; static StringInfo LocalCopyBuffer;
#define CDC_REPLICATION_ORIGIN_CREATE_IF_NOT_EXISTS_CMD \
"SELECT pg_catalog.pg_replication_origin_create('citus_cdc') \
where (select pg_catalog.pg_replication_origin_oid('citus_cdc')) IS NULL;"
#define CDC_REPLICATION_ORIGIN_SESION_SETUP_CMD \
"SELECT pg_catalog.pg_replication_origin_session_setup('citus_cdc') \
where pg_catalog.pg_replication_origin_session_is_setup()='f';"
#define CDC_REPLICATION_ORIGIN_SESION_RESET_CMD \
"SELECT pg_catalog.pg_replication_origin_session_reset() \
where pg_catalog.pg_replication_origin_session_is_setup()='t';"
typedef struct ShardCopyDestReceiver typedef struct ShardCopyDestReceiver
{ {
@ -80,7 +91,10 @@ static int ReadFromLocalBufferCallback(void *outBuf, int minRead, int maxRead);
static void LocalCopyToShard(ShardCopyDestReceiver *copyDest, CopyOutState static void LocalCopyToShard(ShardCopyDestReceiver *copyDest, CopyOutState
localCopyOutState); localCopyOutState);
static void ConnectToRemoteAndStartCopy(ShardCopyDestReceiver *copyDest); static void ConnectToRemoteAndStartCopy(ShardCopyDestReceiver *copyDest);
static void SetupReplicationOrigin(RepOriginId nodeId);
static void CreateReplicationOriginIfNotExists(ShardCopyDestReceiver *dest);
static void SetupReplicationOriginSessionIfNotSetupAlready(MultiConnection *connection);
static void ResetReplicationOriginSessionIfSetupAlready(MultiConnection *connection);
static bool static bool
CanUseLocalCopy(uint32_t destinationNodeId) CanUseLocalCopy(uint32_t destinationNodeId)
@ -89,6 +103,30 @@ CanUseLocalCopy(uint32_t destinationNodeId)
return GetLocalNodeId() == (int32) destinationNodeId; return GetLocalNodeId() == (int32) destinationNodeId;
} }
/*
* SetupReplicationOriginSessionIfNotSetupAlready sets up the replication origin session
* if it is not already setup.
*/
static void SetupReplicationOriginSessionIfNotSetupAlready(MultiConnection *connection) {
/* Setup replication Origin if not setup already */
if (!SendRemoteCommand(connection, CDC_REPLICATION_ORIGIN_SESION_SETUP_CMD))
{
ReportConnectionError(connection, ERROR);
}
ForgetResults(connection);
}
/*
* ResetReplicationOriginSessionIfSetupAlready resets the replication origin session
* if it has been setup currently.
*/
static void ResetReplicationOriginSessionIfSetupAlready(MultiConnection *connection) {
if (!SendRemoteCommand(connection, CDC_REPLICATION_ORIGIN_SESION_RESET_CMD))
{
ReportConnectionError(connection, ERROR);
}
ForgetResults(connection);
}
/* Connect to node with source shard and trigger copy start. */ /* Connect to node with source shard and trigger copy start. */
static void static void
@ -105,10 +143,13 @@ ConnectToRemoteAndStartCopy(ShardCopyDestReceiver *copyDest)
NULL /* database (current) */); NULL /* database (current) */);
ClaimConnectionExclusively(copyDest->connection); ClaimConnectionExclusively(copyDest->connection);
/* Setup Replication Origin Session if not setup already for
avoiding publication of events more than once. */
SetupReplicationOriginSessionIfNotSetupAlready(copyDest->connection);
StringInfo copyStatement = ConstructShardCopyStatement( StringInfo copyStatement = ConstructShardCopyStatement(
copyDest->destinationShardFullyQualifiedName, copyDest->destinationShardFullyQualifiedName,
copyDest->copyOutState->binary); copyDest->copyOutState->binary);
if (!SendRemoteCommand(copyDest->connection, copyStatement->data)) if (!SendRemoteCommand(copyDest->connection, copyStatement->data))
{ {
ReportConnectionError(copyDest->connection, ERROR); ReportConnectionError(copyDest->connection, ERROR);
@ -154,7 +195,9 @@ CreateShardCopyDestReceiver(EState *executorState,
return (DestReceiver *) copyDest; return (DestReceiver *) copyDest;
} }
#define InvalidRepOriginId 0 #define InvalidRepOriginId 0
/* /*
* ShardCopyDestReceiverReceive implements the receiveSlot function of * ShardCopyDestReceiverReceive implements the receiveSlot function of
* ShardCopyDestReceiver. It takes a TupleTableSlot and sends the contents to * ShardCopyDestReceiver. It takes a TupleTableSlot and sends the contents to
@ -234,27 +277,6 @@ ShardCopyDestReceiverReceive(TupleTableSlot *slot, DestReceiver *dest)
return true; return true;
} }
static void
SetupReplicationOrigin(RepOriginId nodeId) {
RepOriginId originid = InvalidRepOriginId;
XLogRecPtr origin_startpos = InvalidXLogRecPtr;
//Check if there is a replication origin session already active.
if (replorigin_session_origin == InvalidRepOriginId) {
//Lookup the replication origin and create it if it does not exist.
char originname[NAMEDATALEN];
snprintf(originname, sizeof(originname), "pg_%u",nodeId);
originid = replorigin_by_name(originname, true);
if (originid == InvalidRepOriginId) {
originid = replorigin_create(originname);
}
//Setup the replication origin session.
replorigin_session_setup(originid);
replorigin_session_origin = originid;
origin_startpos = replorigin_session_get_progress(false);
//elog(LOG, "!!!! Citus: ShardCopyDestReceiverReceive replorigin_session_origin %d", replorigin_session_origin);
}
}
/* /*
* ShardCopyDestReceiverStartup implements the rStartup interface of ShardCopyDestReceiver. * ShardCopyDestReceiverStartup implements the rStartup interface of ShardCopyDestReceiver.
@ -282,7 +304,26 @@ ShardCopyDestReceiverStartup(DestReceiver *dest, int operation, TupleDesc
copyDest->columnOutputFunctions = ColumnOutputFunctions(inputTupleDescriptor, copyDest->columnOutputFunctions = ColumnOutputFunctions(inputTupleDescriptor,
copyOutState->binary); copyOutState->binary);
copyDest->copyOutState = copyOutState; copyDest->copyOutState = copyOutState;
SetupReplicationOrigin(copyDest->destinationNodeId); CreateReplicationOriginIfNotExists(copyDest);
}
static void
CreateReplicationOriginIfNotExists(ShardCopyDestReceiver *dest)
{
int connectionFlags = OUTSIDE_TRANSACTION;
char *currentUser = CurrentUserName();
WorkerNode *workerNode = FindNodeWithNodeId(dest->destinationNodeId,
false /* missingOk */);
MultiConnection *connection = GetNodeUserDatabaseConnection(connectionFlags,
workerNode->workerName,
workerNode->workerPort,
currentUser,
NULL /* database (current) */);
ClaimConnectionExclusively(connection);
ExecuteCriticalRemoteCommand(connection,
CDC_REPLICATION_ORIGIN_CREATE_IF_NOT_EXISTS_CMD);
CloseConnection(connection);
} }
@ -341,6 +382,11 @@ ShardCopyDestReceiverShutdown(DestReceiver *dest)
PQclear(result); PQclear(result);
ForgetResults(copyDest->connection); ForgetResults(copyDest->connection);
/*Reset Replication Origin. */
ExecuteCriticalRemoteCommand(copyDest->connection,
CDC_REPLICATION_ORIGIN_SESION_RESET_CMD);
CloseConnection(copyDest->connection); CloseConnection(copyDest->connection);
} }
} }

View File

@ -13,6 +13,7 @@
#include "distributed/worker_shard_visibility.h" #include "distributed/worker_shard_visibility.h"
#include "distributed/worker_protocol.h" #include "distributed/worker_protocol.h"
#include "distributed/listutils.h" #include "distributed/listutils.h"
#include "distributed/metadata/distobject.h"
#include "replication/logical.h" #include "replication/logical.h"
#include "utils/typcache.h" #include "utils/typcache.h"
#include "utils/lsyscache.h" #include "utils/lsyscache.h"
@ -39,15 +40,15 @@ static Oid FindTargetRelationOid(Relation sourceShardRelation,
static HeapTuple GetTupleForTargetSchema(HeapTuple sourceRelationTuple, static HeapTuple GetTupleForTargetSchema(HeapTuple sourceRelationTuple,
TupleDesc sourceTupleDesc, TupleDesc sourceTupleDesc,
TupleDesc targetTupleDesc); TupleDesc targetTupleDesc);
static bool static bool replication_origin_filter_cb(LogicalDecodingContext *ctx, RepOriginId origin_id);
cdc_origin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id);
static bool static bool PublishChangesIfCdcSlot(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
is_cdc_replication_slot(LogicalDecodingContext *ctx);
bool
handle_cdc_changes(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change); Relation relation, ReorderBufferChange *change);
/* used in the replication_origin_filter_cb function. */
#define InvalidRepOriginId 0
#define CITUS_CDC_SLOT_NAME "citus_cdc_slot"
/* /*
* Postgres uses 'pgoutput' as default plugin for logical replication. * Postgres uses 'pgoutput' as default plugin for logical replication.
* We want to reuse Postgres pgoutput's functionality as much as possible. * We want to reuse Postgres pgoutput's functionality as much as possible.
@ -69,70 +70,71 @@ _PG_output_plugin_init(OutputPluginCallbacks *cb)
/* ask the output plugin to fill the callback struct */ /* ask the output plugin to fill the callback struct */
plugin_init(cb); plugin_init(cb);
/* actual pgoutput callback will be called with the appropriate destination shard */ /* actual pgoutput callback will be called with the appropriate destination shard */
pgoutputChangeCB = cb->change_cb; pgoutputChangeCB = cb->change_cb;
cb->change_cb = split_change_cb; cb->change_cb = split_change_cb;
cb->filter_by_origin_cb = cdc_origin_filter; cb->filter_by_origin_cb = replication_origin_filter_cb;
} }
#define InvalidRepOriginId 0
/* /*
* cdc_origin_filter is called for each change. If the change is not * replication_origin_filter_cb call back function filters out publication of changes
* originated from the CDC replication slot, we return false to skip * originated from any other node other than the current node. This is
* the change. * identified by the "origin_id" of the changes. The origin_id is set to
* a non-zero value in the origin node as part of WAL replication.
*/ */
static bool static bool
cdc_origin_filter(LogicalDecodingContext *ctx, RepOriginId origin_id) replication_origin_filter_cb(LogicalDecodingContext *ctx, RepOriginId origin_id)
{ {
if (origin_id != InvalidRepOriginId) if (origin_id != InvalidRepOriginId)
{ {
elog(LOG,"!!!! cdc_origin_filter: filtering because origin_id: %d \n",origin_id);
return true; return true;
} }
//elog(LOG,"!!!! cdc_origin_filter: NOT filtering because origin_id: %d \n",origin_id);
return false; return false;
} }
/*
* PublishChangesIfCdcSlot checks if the current slot is a CDC slot. If so, it publishes
* the changes as the change for the distributed table instead of shard.
* If not, it returns false. It also skips the Citus metadata tables.
*/
static bool static bool
is_cdc_replication_slot(LogicalDecodingContext *ctx) { PublishChangesIfCdcSlot(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change)
{
char *replicationSlotName = ctx->slot->data.name.data; char *replicationSlotName = ctx->slot->data.name.data;
//elog(LOG,"is_cdc_replication_slot: replicationSlotName %s", replicationSlotName); /* Check if the replication slot is CITUS_CDC_SLOT*/
return (replicationSlotName != NULL && strcmp(replicationSlotName,"cdc")== 0); if (replicationSlotName != NULL && strcmp(replicationSlotName, CITUS_CDC_SLOT_NAME) == 0)
{
/* Skip publishing changes for Citus metadata tables*/
ObjectAddress objectAdress = {RelationRelationId, relation->rd_id, 0};
if (IsObjectAddressOwnedByCitus(&objectAdress))
{
return true;
} }
/* Check if this change is for a shard in distributed table. */
bool if (RelationIsAKnownShard(relation->rd_id))
handle_cdc_changes(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, {
Relation relation, ReorderBufferChange *change) {
bool is_cdc_translated_to_distribution_table = false;
if (is_cdc_replication_slot(ctx)) {
// Check if it is a change in a Shard table
if (RelationIsAKnownShard(relation->rd_id)) {
//Oid shardRelationId = relation->rd_id;
char *shardRelationName = RelationGetRelationName(relation); char *shardRelationName = RelationGetRelationName(relation);
uint64 shardId = ExtractShardIdFromTableName(shardRelationName, true); uint64 shardId = ExtractShardIdFromTableName(shardRelationName, true);
if (shardId != INVALID_SHARD_ID) if (shardId != INVALID_SHARD_ID)
{ {
// try to get the distributed relation id for the shard /* try to get the distributed relation id for the shard */
Oid distributedRelationId = RelationIdForShard(shardId); Oid distributedRelationId = RelationIdForShard(shardId);
if (OidIsValid(distributedRelationId)) if (OidIsValid(distributedRelationId))
{ {
char* relationName = get_rel_name(distributedRelationId); relation = RelationIdGetRelation(distributedRelationId);
//elog(LOG,"changing to distributed relation name:%s id:%d ", relationName, distributedRelationId);
Relation distributedRelation = RelationIdGetRelation(distributedRelationId);
pgoutputChangeCB(ctx, txn, distributedRelation, change);
is_cdc_translated_to_distribution_table = true;
} }
} }
} }
if (!is_cdc_translated_to_distribution_table) {
pgoutputChangeCB(ctx, txn, relation, change); pgoutputChangeCB(ctx, txn, relation, change);
}
return true; return true;
} }
return false; return false;
} }
/* /*
* split_change function emits the incoming tuple change * split_change function emits the incoming tuple change
* to the appropriate destination shard. * to the appropriate destination shard.
@ -141,11 +143,11 @@ static void
split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change) Relation relation, ReorderBufferChange *change)
{ {
//elog(LOG,"Citus split_change_cb called"); if (!is_publishable_relation(relation))
if (handle_cdc_changes(ctx,txn,relation,change)) { {
return; return;
} }
if (!is_publishable_relation(relation)) if (PublishChangesIfCdcSlot(ctx, txn, relation, change))
{ {
return; return;
} }
@ -274,6 +276,7 @@ split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
RelationClose(targetRelation); RelationClose(targetRelation);
} }
/* /*
* FindTargetRelationOid returns the destination relation Oid for the incoming * FindTargetRelationOid returns the destination relation Oid for the incoming
* tuple. * tuple.