/*-------------------------------------------------------------------------
*
* multi_logical_replication.c
*
* This file contains functions to use logical replication on the distributed
* tables for moving/replicating shards.
*
* Copyright (c) 2017, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "access/genam.h"
#include "access/htup_details.h"
#include "access/sysattr.h"
#include "access/xact.h"
#include "catalog/namespace.h"
#include "catalog/pg_constraint.h"
#include "catalog/pg_subscription_rel.h"
#include "commands/dbcommands.h"
#include "common/hashfn.h"
#include "nodes/bitmapset.h"
#include "parser/scansup.h"
#include "postmaster/interrupt.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/fmgrprotos.h"
#include "utils/formatting.h"
#include "utils/guc.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/pg_lsn.h"
#include "utils/rel.h"
#include "utils/ruleutils.h"
#include "utils/syscache.h"
#include "pg_version_constants.h"
#include "distributed/adaptive_executor.h"
#include "distributed/citus_safe_lib.h"
#include "distributed/colocation_utils.h"
#include "distributed/connection_management.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/distributed_planner.h"
#include "distributed/hash_helpers.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_sync.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_replication.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/priority.h"
#include "distributed/remote_commands.h"
#include "distributed/resource_lock.h"
#include "distributed/shard_cleaner.h"
#include "distributed/shard_rebalancer.h"
#include "distributed/shard_transfer.h"
#include "distributed/version_compat.h"
#define CURRENT_LOG_POSITION_COMMAND "SELECT pg_current_wal_lsn()"
/* decimal representation of Adler-16 hash value of citus_shard_move_publication */
#define SHARD_MOVE_ADVISORY_LOCK_FIRST_KEY 44000
/* decimal representation of Adler-16 hash value of citus_shard_move_subscription */
#define SHARD_MOVE_ADVISORY_LOCK_SECOND_KEY 55152
static const char *publicationPrefix[] = {
[SHARD_MOVE] = "citus_shard_move_publication_",
[SHARD_SPLIT] = "citus_shard_split_publication_",
};
static const char *replicationSlotPrefix[] = {
[SHARD_MOVE] = "citus_shard_move_slot_",
[SHARD_SPLIT] = "citus_shard_split_slot_",
};
/*
* IMPORTANT: All the subscription names should start with "citus_". Otherwise
* our utility hook does not defend against non-superusers altering or dropping
* them, which is important for security purposes.
*
* We should also keep these in sync with IsCitusShardTransferBackend().
*/
static const char *subscriptionPrefix[] = {
[SHARD_MOVE] = "citus_shard_move_subscription_",
[SHARD_SPLIT] = "citus_shard_split_subscription_",
};
static const char *subscriptionRolePrefix[] = {
[SHARD_MOVE] = "citus_shard_move_subscription_role_",
[SHARD_SPLIT] = "citus_shard_split_subscription_role_",
};
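/*
 * For illustration only (ids are made up): with target node id 2, table owner
 * oid 10 and operation id 1234, the generated names look like
 *   publication:       citus_shard_move_publication_2_10_1234
 *   replication slot:  citus_shard_move_slot_2_10_1234
 *   subscription:      citus_shard_move_subscription_10_1234
 *   subscription role: citus_shard_move_subscription_role_10_1234
 * See PublicationName(), ReplicationSlotNameForNodeAndOwnerForOperation(),
 * SubscriptionName() and SubscriptionRoleName() below for the exact formats.
 */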
/* GUC variable, defaults to 2 hours */
int LogicalReplicationTimeout = 2 * 60 * 60 * 1000;
/* see the comment in master_move_shard_placement */
bool PlacementMovedUsingLogicalReplicationInTX = false;
/* report every 10 seconds */
static int logicalReplicationProgressReportTimeout = 10 * 1000;
static List * PrepareReplicationSubscriptionList(List *shardList);
static List * GetReplicaIdentityCommandListForShard(Oid relationId, uint64 shardId);
static List * GetIndexCommandListForShardBackingReplicaIdentity(Oid relationId,
uint64 shardId);
static void CreatePostLogicalReplicationDataLoadObjects(List *logicalRepTargetList,
LogicalRepType type);
static void ExecuteCreateIndexCommands(List *logicalRepTargetList);
static void ExecuteCreateConstraintsBackedByIndexCommands(List *logicalRepTargetList);
static List * ConvertNonExistingPlacementDDLCommandsToTasks(List *shardCommandList,
char *targetNodeName,
int targetNodePort);
static void ExecuteClusterOnCommands(List *logicalRepTargetList);
static void ExecuteCreateIndexStatisticsCommands(List *logicalRepTargetList);
static void ExecuteRemainingPostLoadTableCommands(List *logicalRepTargetList);
static char * escape_param_str(const char *str);
static XLogRecPtr GetRemoteLSN(MultiConnection *connection, char *command);
static void WaitForMiliseconds(long timeout);
static XLogRecPtr GetSubscriptionPosition(
GroupedLogicalRepTargets *groupedLogicalRepTargets);
static void AcquireLogicalReplicationLock(void);
static HTAB * CreateShardMovePublicationInfoHash(WorkerNode *targetNode,
List *shardIntervals);
static List * CreateShardMoveLogicalRepTargetList(HTAB *publicationInfoHash,
List *shardList);
static void WaitForGroupedLogicalRepTargetsToCatchUp(XLogRecPtr sourcePosition,
GroupedLogicalRepTargets *
groupedLogicalRepTargets);
/*
* LogicallyReplicateShards replicates a list of shards from one node to another
* using logical replication. Once replication is reasonably caught up, writes
* are blocked and then the publication and subscription are dropped.
*
* The caller of the function should ensure that logical replication is applicable
* for the given shards, source and target nodes. Also, the caller is responsible
* for ensuring that the input shard list consists of co-located distributed tables
* or a single shard.
*/
void
LogicallyReplicateShards(List *shardList, char *sourceNodeName, int sourceNodePort,
char *targetNodeName, int targetNodePort)
{
AcquireLogicalReplicationLock();
char *superUser = CitusExtensionOwnerName();
char *databaseName = get_database_name(MyDatabaseId);
int connectionFlags = FORCE_NEW_CONNECTION;
List *replicationSubscriptionList = PrepareReplicationSubscriptionList(shardList);
/* no shards to move */
if (list_length(replicationSubscriptionList) == 0)
{
return;
}
MultiConnection *sourceConnection =
GetNodeUserDatabaseConnection(connectionFlags, sourceNodeName, sourceNodePort,
superUser, databaseName);
/*
* Operations on publications and replication slots cannot run in a
* transaction block. We claim the connections exclusively to ensure they
* do not get used for metadata syncing, which does open a transaction
* block.
*/
ClaimConnectionExclusively(sourceConnection);
WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort);
WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort);
HTAB *publicationInfoHash = CreateShardMovePublicationInfoHash(
targetNode, replicationSubscriptionList);
List *logicalRepTargetList = CreateShardMoveLogicalRepTargetList(publicationInfoHash,
shardList);
HTAB *groupedLogicalRepTargetsHash = CreateGroupedLogicalRepTargetsHash(
logicalRepTargetList);
CreateGroupedLogicalRepTargetsConnections(groupedLogicalRepTargetsHash, superUser,
databaseName);
MultiConnection *sourceReplicationConnection =
GetReplicationConnection(sourceConnection->hostname, sourceConnection->port);
/* set up the publication on the source and subscription on the target */
CreatePublications(sourceConnection, publicationInfoHash);
char *snapshot = CreateReplicationSlots(
sourceConnection,
sourceReplicationConnection,
logicalRepTargetList,
"pgoutput");
CreateSubscriptions(
sourceConnection,
sourceConnection->database,
logicalRepTargetList);
/* only useful for isolation testing, see the function comment for the details */
ConflictWithIsolationTestingBeforeCopy();
/*
* We have to create the primary key (or any other replica identity)
* before the queued update/delete operations are replicated, because
* if the replica identity does not exist on the target, replicating
* those operations would fail.
*
* So the latest possible moment we could do this is right after the
* initial data COPY, but before enabling the subscriptions. It might
* seem like a good idea to do it after the initial data COPY, since
* it's generally the rule that it's cheaper to build an index at once
* than to create it incrementally. This general rule is why we create
* all the regular indexes as late during the move as possible.
*
* But as it turns out, in practice it's not as clear cut, and we saw a
* speed degradation in the time it takes to move shards when doing the
* replica identity creation after the initial COPY. So, instead we
* keep it before the COPY.
*/
CreateReplicaIdentities(logicalRepTargetList);
UpdatePlacementUpdateStatusForShardIntervalList(
shardList,
sourceNodeName,
sourceNodePort,
PLACEMENT_UPDATE_STATUS_COPYING_DATA);
CopyShardsToNode(sourceNode, targetNode, shardList, snapshot);
/*
* We can close this connection now, because we're done copying the
* data and thus don't need access to the snapshot anymore. The
* replication slot will still be at the same LSN, because the
* subscriptions have not been enabled yet.
*/
CloseConnection(sourceReplicationConnection);
/*
* Enable the subscriptions to apply the changes that happened during the
* data copy, and complete the rest of the transfer.
*/
CompleteNonBlockingShardTransfer(shardList,
sourceConnection,
publicationInfoHash,
logicalRepTargetList,
groupedLogicalRepTargetsHash,
SHARD_MOVE);
/*
* We use these connections exclusively for subscription management,
* because otherwise subsequent metadata changes may inadvertently use
* these connections instead of the connections that were used to
* grab locks in BlockWritesToShardList.
*/
CloseGroupedLogicalRepTargetsConnections(groupedLogicalRepTargetsHash);
CloseConnection(sourceConnection);
}
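/*
* A rough sketch of the flow implemented by LogicallyReplicateShards() above
* and CompleteNonBlockingShardTransfer() below:
*   1. create publications on the source and (disabled) subscriptions on the
*      target, plus the replication slots that connect them
*   2. create replica identities on the target and COPY the existing data
*      using the exported snapshot of the first replication slot
*   3. enable the subscriptions and wait for them to catch up
*   4. create the post-load objects (indexes, constraints, etc.) and wait for
*      the replication to catch up again
*   5. block writes to the shards, wait for the final catch-up and create the
*      unchecked foreign key constraints
*/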
/*
* CreateGroupedLogicalRepTargetsHash creates a hashmap that groups the subscriptions
* in logicalRepTargetList by node. This is useful for cases where we want to
* iterate the subscriptions by node, so we can batch certain operations, such
* as checking subscription readiness.
*/
HTAB *
CreateGroupedLogicalRepTargetsHash(List *logicalRepTargetList)
{
HTAB *logicalRepTargetsHash = CreateSimpleHash(uint32, GroupedLogicalRepTargets);
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
bool found = false;
GroupedLogicalRepTargets *groupedLogicalRepTargets =
(GroupedLogicalRepTargets *) hash_search(
logicalRepTargetsHash,
&target->replicationSlot->targetNodeId,
HASH_ENTER,
&found);
if (!found)
{
groupedLogicalRepTargets->logicalRepTargetList = NIL;
groupedLogicalRepTargets->superuserConnection = NULL;
}
groupedLogicalRepTargets->logicalRepTargetList =
lappend(groupedLogicalRepTargets->logicalRepTargetList, target);
}
return logicalRepTargetsHash;
}
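/*
* As an illustrative example: when a shard group containing tables owned by
* two different roles is moved to a single target node, the hash above ends
* up with one entry (keyed by that node's id) whose logicalRepTargetList
* contains two LogicalRepTargets, one per table owner, all sharing the same
* superuser connection to the target node.
*/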
/*
* CompleteNonBlockingShardTransfer uses logical replication to apply the changes
* made on the source to the target. It also runs all DDL on the target shards
* that need to be run after the data copy.
*
* For shard splits it skips the partition hierarchy and foreign key creation
* though, since those need to happen after the metadata is updated.
*/
void
CompleteNonBlockingShardTransfer(List *shardList,
MultiConnection *sourceConnection,
HTAB *publicationInfoHash,
List *logicalRepTargetList,
HTAB *groupedLogicalRepTargetsHash,
LogicalRepType type)
{
/* Start applying the changes from the replication slots to catch up. */
EnableSubscriptions(logicalRepTargetList);
UpdatePlacementUpdateStatusForShardIntervalList(
shardList,
sourceConnection->hostname,
sourceConnection->port,
PLACEMENT_UPDATE_STATUS_CATCHING_UP);
/*
* Wait until all the subscriptions are caught up to changes that
* happened after the initial COPY on the shards.
*/
WaitForAllSubscriptionsToCatchUp(sourceConnection, groupedLogicalRepTargetsHash);
UpdatePlacementUpdateStatusForShardIntervalList(
shardList,
sourceConnection->hostname,
sourceConnection->port,
PLACEMENT_UPDATE_STATUS_CREATING_CONSTRAINTS);
/*
* Now let's create the post-load objects, such as the indexes, constraints
* and partitioning hierarchy. Once they are done, we wait until the
* replication catches up again, so that we don't block writes for too long.
*/
CreatePostLogicalReplicationDataLoadObjects(logicalRepTargetList, type);
UpdatePlacementUpdateStatusForShardIntervalList(
shardList,
sourceConnection->hostname,
sourceConnection->port,
PLACEMENT_UPDATE_STATUS_FINAL_CATCH_UP);
WaitForAllSubscriptionsToCatchUp(sourceConnection, groupedLogicalRepTargetsHash);
/* only useful for isolation testing, see the function comment for the details */
ConflictWithIsolationTestingAfterCopy();
/*
* We're almost done: we'll block the writes to the shards that we're
* replicating and expect all the subscriptions to catch up quickly
* afterwards.
*
* Notice that although shards of partitioned relations are excluded from
* logical replication, they are still locked against modification, and
* foreign constraints are created on them too.
*/
BlockWritesToShardList(shardList);
WaitForAllSubscriptionsToCatchUp(sourceConnection, groupedLogicalRepTargetsHash);
if (type != SHARD_SPLIT)
{
UpdatePlacementUpdateStatusForShardIntervalList(
shardList,
sourceConnection->hostname,
sourceConnection->port,
PLACEMENT_UPDATE_STATUS_CREATING_FOREIGN_KEYS);
/*
* We're creating the foreign constraints to reference tables after the
* data is already replicated and all the necessary locks are acquired.
*
* We prefer to do it here because the placements of reference tables
* are always valid, and any modification during the shard move would
* cascade to the hash distributed tables' shards if we had created
* the constraints earlier. The same is true for foreign keys between
* tables owned by different users.
*/
CreateUncheckedForeignKeyConstraints(logicalRepTargetList);
}
UpdatePlacementUpdateStatusForShardIntervalList(
shardList,
sourceConnection->hostname,
sourceConnection->port,
PLACEMENT_UPDATE_STATUS_COMPLETING);
}
/*
* CreateShardMovePublicationInfoHash creates a hashmap of PublicationInfos for a
* shard move. Even though we only support moving a shard to a single target
* node, the resulting hashmap can have multiple PublicationInfos in it.
* The reason is that we need a separate publication for each user that owns
* a distributed table in the shard group.
*/
static HTAB *
CreateShardMovePublicationInfoHash(WorkerNode *targetNode, List *shardIntervals)
{
HTAB *publicationInfoHash = CreateSimpleHash(NodeAndOwner, PublicationInfo);
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, shardIntervals)
{
NodeAndOwner key;
key.nodeId = targetNode->nodeId;
key.tableOwnerId = TableOwnerOid(shardInterval->relationId);
bool found = false;
PublicationInfo *publicationInfo =
(PublicationInfo *) hash_search(publicationInfoHash, &key,
HASH_ENTER,
&found);
if (!found)
{
publicationInfo->name = PublicationName(SHARD_MOVE, key.nodeId,
key.tableOwnerId);
publicationInfo->shardIntervals = NIL;
}
publicationInfo->shardIntervals =
lappend(publicationInfo->shardIntervals, shardInterval);
}
return publicationInfoHash;
}
/*
* CreateShardMoveLogicalRepTargetList creates the list containing all the
* subscriptions that should be connected to the publications in the given
* publicationInfoHash.
*/
static List *
CreateShardMoveLogicalRepTargetList(HTAB *publicationInfoHash, List *shardList)
{
List *logicalRepTargetList = NIL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, publicationInfoHash);
Oid nodeId = InvalidOid;
PublicationInfo *publication = NULL;
while ((publication = (PublicationInfo *) hash_seq_search(&status)) != NULL)
{
Oid ownerId = publication->key.tableOwnerId;
nodeId = publication->key.nodeId;
LogicalRepTarget *target = palloc0(sizeof(LogicalRepTarget));
target->subscriptionName = SubscriptionName(SHARD_MOVE, ownerId);
target->tableOwnerId = ownerId;
target->publication = publication;
publication->target = target;
target->newShards = NIL;
target->subscriptionOwnerName = SubscriptionRoleName(SHARD_MOVE, ownerId);
target->replicationSlot = palloc0(sizeof(ReplicationSlotInfo));
target->replicationSlot->name =
ReplicationSlotNameForNodeAndOwnerForOperation(SHARD_MOVE,
nodeId,
ownerId,
CurrentOperationId);
target->replicationSlot->targetNodeId = nodeId;
target->replicationSlot->tableOwnerId = ownerId;
logicalRepTargetList = lappend(logicalRepTargetList, target);
}
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, shardList)
{
NodeAndOwner key;
key.nodeId = nodeId;
key.tableOwnerId = TableOwnerOid(shardInterval->relationId);
bool found = false;
publication = (PublicationInfo *) hash_search(
publicationInfoHash,
&key,
HASH_FIND,
&found);
if (!found)
{
ereport(ERROR, errmsg("Could not find a publication matching the shard move target"));
}
publication->target->newShards = lappend(
publication->target->newShards, shardInterval);
}
return logicalRepTargetList;
}
/*
* AcquireLogicalReplicationLock tries to acquire a lock for logical
* replication. We need this lock, because at the start of logical replication
* we clean up old subscriptions and publications. Because of this cleanup it's
* not safe to run multiple logical replication based shard moves at the same
* time. If multiple logical replication moves would run at the same time, the
* second move might clean up subscriptions and publications that are in use by
* another move.
*/
static void
AcquireLogicalReplicationLock(void)
{
LOCKTAG tag;
SET_LOCKTAG_LOGICAL_REPLICATION(tag);
LockAcquire(&tag, ExclusiveLock, false, false);
}
/*
* PrepareReplicationSubscriptionList returns the list of shards to be logically
* replicated from the given shard list. This is needed because Postgres does not
* allow logical replication on partitioned tables, therefore shards belonging
* to partitioned tables should be excluded from the logical replication
* subscription list.
*/
static List *
PrepareReplicationSubscriptionList(List *shardList)
{
List *replicationSubscriptionList = NIL;
ListCell *shardCell = NULL;
foreach(shardCell, shardList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardCell);
if (!PartitionedTable(shardInterval->relationId))
{
/* only add regular and child tables to subscription */
replicationSubscriptionList = lappend(replicationSubscriptionList,
shardInterval);
}
}
return replicationSubscriptionList;
}
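/*
* As an illustrative example: if the shard group contains a partitioned table
* with two leaf partitions, the shards of the partitioned (parent) table are
* filtered out here and only the shards of the leaf partitions end up in the
* returned subscription list.
*/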
/*
* CreateReplicaIdentities creates replica identities for all the shards that
* are part of the given subscriptions.
*/
void
CreateReplicaIdentities(List *logicalRepTargetList)
{
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
MultiConnection *superuserConnection = target->superuserConnection;
CreateReplicaIdentitiesOnNode(
target->newShards,
superuserConnection->hostname,
superuserConnection->port);
}
}
/*
* CreateReplicaIdentitiesOnNode gets a shardList and creates all the replica
* identities on the shards in the given node.
*/
void
CreateReplicaIdentitiesOnNode(List *shardList, char *nodeName, int32 nodePort)
{
MemoryContext localContext = AllocSetContextCreate(CurrentMemoryContext,
"CreateReplicaIdentitiesOnNode",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
ShardInterval *shardInterval;
foreach_ptr(shardInterval, shardList)
{
uint64 shardId = shardInterval->shardId;
Oid relationId = shardInterval->relationId;
List *backingIndexCommandList =
GetIndexCommandListForShardBackingReplicaIdentity(relationId, shardId);
List *replicaIdentityShardCommandList =
GetReplicaIdentityCommandListForShard(relationId, shardId);
List *commandList =
list_concat(backingIndexCommandList, replicaIdentityShardCommandList);
if (commandList != NIL)
{
ereport(DEBUG1, (errmsg("Creating replica identity for shard %ld on "
"target node %s:%d", shardId, nodeName, nodePort)));
SendCommandListToWorkerOutsideTransaction(nodeName, nodePort,
TableOwner(relationId),
commandList);
}
MemoryContextReset(localContext);
}
MemoryContextSwitchTo(oldContext);
}
/*
* GetIndexCommandListForShardBackingReplicaIdentity returns all the create index
* commands that are needed to back the replica identity. If the table doesn't have
* a replica identity, the function returns NIL.
*/
static List *
GetIndexCommandListForShardBackingReplicaIdentity(Oid relationId, uint64 shardId)
{
List *commandList = NIL;
Relation relation = table_open(relationId, AccessShareLock);
Oid replicaIdentityIndex = GetRelationIdentityOrPK(relation);
table_close(relation, NoLock);
if (OidIsValid(replicaIdentityIndex))
{
/*
* The replica identity is backed by an index or primary key,
* so get the index/pkey definition first.
*/
HeapTuple indexTuple =
SearchSysCache1(INDEXRELID, ObjectIdGetDatum(replicaIdentityIndex));
if (!HeapTupleIsValid(indexTuple))
{
/* should not happen */
elog(ERROR, "cache lookup failed for index %u", replicaIdentityIndex);
}
Form_pg_index indexForm = ((Form_pg_index) GETSTRUCT(indexTuple));
List *indexCommandTableDDLList = NIL;
int indexFlags = INCLUDE_INDEX_ALL_STATEMENTS;
GatherIndexAndConstraintDefinitionList(indexForm, &indexCommandTableDDLList,
indexFlags);
List *indexCommandShardDDLList =
WorkerApplyShardDDLCommandList(indexCommandTableDDLList, shardId);
commandList = list_concat(commandList, indexCommandShardDDLList);
ReleaseSysCache(indexTuple);
}
return commandList;
}
/*
* GetReplicaIdentityCommandListForShard returns the commands that are needed
* to set up the replica identity on the given shard. If the table doesn't have
* a replica identity, the function returns NIL.
*/
static List *
GetReplicaIdentityCommandListForShard(Oid relationId, uint64 shardId)
{
List *replicaIdentityTableDDLCommand =
GetTableReplicaIdentityCommand(relationId);
List *replicaIdentityShardCommandList =
WorkerApplyShardDDLCommandList(replicaIdentityTableDDLCommand, shardId);
return replicaIdentityShardCommandList;
}
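/*
* As a rough illustration: for a shard of a table whose replica identity is
* backed by its primary key, the two helpers above produce the shard-level
* command(s) that create the backing index/constraint on the shard, plus any
* ALTER TABLE ... REPLICA IDENTITY command the table needs, all rewritten for
* the shard via WorkerApplyShardDDLCommandList().
*/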
/*
* CreatePostLogicalReplicationDataLoadObjects gets a logicalRepTargetList and creates
* all the objects that can be created after the data is moved with logical replication.
*/
static void
CreatePostLogicalReplicationDataLoadObjects(List *logicalRepTargetList,
LogicalRepType type)
{
/*
* We create indexes in 4 steps.
* - CREATE INDEX statements
* - CREATE CONSTRAINT statements that are backed by
* indexes (unique and exclude constraints)
* - ALTER TABLE %s CLUSTER ON %s
* - ALTER INDEX %s ALTER COLUMN %d SET STATISTICS %d
*
* In each step, we can execute commands in parallel. For example,
* multiple indexes on the shard table or indexes for the colocated shards
* can be created in parallel. However, the latter two steps, clustering the
* table and setting the statistics of indexes, depend on the indexes being
* created. That's why the execution is divided into four distinct stages.
*/
ExecuteCreateIndexCommands(logicalRepTargetList);
ExecuteCreateConstraintsBackedByIndexCommands(logicalRepTargetList);
ExecuteClusterOnCommands(logicalRepTargetList);
ExecuteCreateIndexStatisticsCommands(logicalRepTargetList);
/*
* Once the indexes are created, there are a few more objects, such as triggers
* and table statistics, that should be created after the data move.
*/
ExecuteRemainingPostLoadTableCommands(logicalRepTargetList);
/*
* Creating the partitioning hierarchy errors out in shard splits, because
* for splits it needs to happen after the metadata is updated (see the
* comment on CompleteNonBlockingShardTransfer).
*/
if (type != SHARD_SPLIT)
{
/* create partitioning hierarchy, if any */
CreatePartitioningHierarchy(logicalRepTargetList);
}
}
/*
* ExecuteCreateIndexCommands gets a logicalRepTargetList and creates all the
* indexes for the shards in it on the target node(s).
*
* The execution is done in parallel, and throws an error if any of the
* commands fail.
*/
static void
ExecuteCreateIndexCommands(List *logicalRepTargetList)
{
List *taskList = NIL;
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, target->newShards)
{
Oid relationId = shardInterval->relationId;
List *tableCreateIndexCommandList =
GetTableIndexAndConstraintCommandsExcludingReplicaIdentity(relationId,
INCLUDE_CREATE_INDEX_STATEMENTS);
List *shardCreateIndexCommandList =
WorkerApplyShardDDLCommandList(tableCreateIndexCommandList,
shardInterval->shardId);
List *taskListForShard =
ConvertNonExistingPlacementDDLCommandsToTasks(
shardCreateIndexCommandList,
target->superuserConnection->hostname,
target->superuserConnection->port);
taskList = list_concat(taskList, taskListForShard);
}
}
/*
* We are going to create indexes and constraints using the current user. That is
* alright because an index/constraint always belongs to the owner of the table,
* and Citus already ensures that the current user owns all the tables that are
* moved.
*
* CREATE INDEX commands acquire ShareLock on a relation. So, it is
* allowed to run multiple CREATE INDEX commands concurrently on a table
* and across different tables (e.g., shards).
*/
ereport(DEBUG1, (errmsg("Creating post logical replication objects "
"(indexes)")));
ExecuteTaskListOutsideTransaction(ROW_MODIFY_NONE, taskList,
MaxAdaptiveExecutorPoolSize,
NIL);
}
/*
* ExecuteCreateConstraintsBackedByIndexCommands gets a logicalRepTargetList and creates
* all the constraints that are backed by indexes for the shards in it on the target node(s).
*
* The execution is done in sequential mode, and throws an error if any of the
* commands fail.
*/
static void
ExecuteCreateConstraintsBackedByIndexCommands(List *logicalRepTargetList)
{
ereport(DEBUG1, (errmsg("Creating post logical replication objects "
"(constraints backed by indexes)")));
MemoryContext localContext = AllocSetContextCreate(CurrentMemoryContext,
"CreateConstraintsBackedByIndexContext",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, target->newShards)
{
Oid relationId = shardInterval->relationId;
List *tableCreateConstraintCommandList =
GetTableIndexAndConstraintCommandsExcludingReplicaIdentity(relationId,
INCLUDE_CREATE_CONSTRAINT_STATEMENTS);
if (tableCreateConstraintCommandList == NIL)
{
/* no constraints backed by indexes, skip */
MemoryContextReset(localContext);
continue;
}
List *shardCreateConstraintCommandList =
WorkerApplyShardDDLCommandList(tableCreateConstraintCommandList,
shardInterval->shardId);
char *tableOwner = TableOwner(shardInterval->relationId);
SendCommandListToWorkerOutsideTransaction(
target->superuserConnection->hostname,
target->superuserConnection->port,
tableOwner,
shardCreateConstraintCommandList);
MemoryContextReset(localContext);
}
}
MemoryContextSwitchTo(oldContext);
}
/*
* ConvertNonExistingPlacementDDLCommandsToTasks generates one task per input
* element in shardCommandList.
*
* The generated tasks' placements do not exist (yet). We are generating
* fake placements for the tasks.
*/
static List *
ConvertNonExistingPlacementDDLCommandsToTasks(List *shardCommandList,
char *targetNodeName,
int targetNodePort)
{
WorkerNode *workerNode = FindWorkerNodeOrError(targetNodeName, targetNodePort);
List *taskList = NIL;
uint64 jobId = INVALID_JOB_ID;
ListCell *commandCell = NULL;
int taskId = 1;
foreach(commandCell, shardCommandList)
{
char *command = (char *) lfirst(commandCell);
Task *task = CreateBasicTask(jobId, taskId, DDL_TASK, command);
/* this placement currently does not exist */
ShardPlacement *taskPlacement = CitusMakeNode(ShardPlacement);
SetPlacementNodeMetadata(taskPlacement, workerNode);
task->taskPlacementList = list_make1(taskPlacement);
taskList = lappend(taskList, task);
taskId++;
}
return taskList;
}
/*
* ExecuteClusterOnCommands gets a logicalRepTargetList and creates all the CLUSTER ON
* commands for the shards in it on the target node(s).
*
* The execution is done in parallel, and in case of any failure, the transaction
* is aborted.
*/
static void
ExecuteClusterOnCommands(List *logicalRepTargetList)
{
List *taskList = NIL;
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, target->newShards)
{
Oid relationId = shardInterval->relationId;
List *tableAlterTableClusterOnCommandList =
GetTableIndexAndConstraintCommandsExcludingReplicaIdentity(relationId,
INCLUDE_INDEX_CLUSTERED_STATEMENTS);
List *shardAlterTableClusterOnCommandList =
WorkerApplyShardDDLCommandList(tableAlterTableClusterOnCommandList,
shardInterval->shardId);
List *taskListForShard =
ConvertNonExistingPlacementDDLCommandsToTasks(
shardAlterTableClusterOnCommandList,
target->superuserConnection->hostname,
target->superuserConnection->port);
taskList = list_concat(taskList, taskListForShard);
}
}
ereport(DEBUG1, (errmsg("Creating post logical replication objects "
"(CLUSTER ON)")));
ExecuteTaskListOutsideTransaction(ROW_MODIFY_NONE, taskList,
MaxAdaptiveExecutorPoolSize,
NIL);
}
/*
* ExecuteCreateIndexStatisticsCommands gets a logicalRepTargetList and creates
* all the statistics objects for the indexes on the target node(s).
*
* The execution is done sequentially, and in case of any failure, the transaction
* is aborted.
*/
static void
ExecuteCreateIndexStatisticsCommands(List *logicalRepTargetList)
{
ereport(DEBUG1, (errmsg("Creating post logical replication objects "
"(index statistics)")));
MemoryContext localContext = AllocSetContextCreate(CurrentMemoryContext,
"CreateIndexStatisticsContext",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, target->newShards)
{
Oid relationId = shardInterval->relationId;
List *tableAlterIndexSetStatisticsCommandList =
GetTableIndexAndConstraintCommandsExcludingReplicaIdentity(relationId,
INCLUDE_INDEX_STATISTICS_STATEMENTTS);
List *shardAlterIndexSetStatisticsCommandList =
WorkerApplyShardDDLCommandList(tableAlterIndexSetStatisticsCommandList,
shardInterval->shardId);
if (shardAlterIndexSetStatisticsCommandList == NIL)
{
/* no index statistics exist, skip */
MemoryContextReset(localContext);
continue;
}
/*
* These remaining operations do not require significant resources, so no
* need to create them in parallel.
*/
char *tableOwner = TableOwner(shardInterval->relationId);
SendCommandListToWorkerOutsideTransaction(
target->superuserConnection->hostname,
target->superuserConnection->port,
tableOwner,
shardAlterIndexSetStatisticsCommandList);
MemoryContextReset(localContext);
}
}
MemoryContextSwitchTo(oldContext);
}
/*
* ExecuteRemainingPostLoadTableCommands gets a logicalRepTargetList and creates
* all the remaining post-load objects other than the indexes
* on the target node(s).
*/
static void
ExecuteRemainingPostLoadTableCommands(List *logicalRepTargetList)
{
ereport(DEBUG1, (errmsg("Creating post logical replication objects "
"(triggers and table statistics)"
)));
MemoryContext localContext = AllocSetContextCreate(CurrentMemoryContext,
"CreateTableStatisticsContext",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, target->newShards)
{
Oid relationId = shardInterval->relationId;
bool includeIndexes = false;
bool includeReplicaIdentity = false;
List *tablePostLoadTableCommandList =
GetPostLoadTableCreationCommands(relationId, includeIndexes,
includeReplicaIdentity);
List *shardPostLoadTableCommandList =
WorkerApplyShardDDLCommandList(tablePostLoadTableCommandList,
shardInterval->shardId);
if (shardPostLoadTableCommandList == NIL)
{
/* no post-load commands for this shard, skip */
continue;
}
/*
* These remaining operations do not require significant resources, so no
* need to create them in parallel.
*/
char *tableOwner = TableOwner(shardInterval->relationId);
SendCommandListToWorkerOutsideTransaction(
target->superuserConnection->hostname,
target->superuserConnection->port,
tableOwner,
shardPostLoadTableCommandList);
MemoryContextReset(localContext);
}
}
MemoryContextSwitchTo(oldContext);
}
/*
* CreatePartitioningHierarchy gets a logicalRepTargetList and creates the
* partitioning hierarchy among its shards, if any.
*/
void
CreatePartitioningHierarchy(List *logicalRepTargetList)
{
ereport(DEBUG1, (errmsg("Creating post logical replication objects "
"(partitioning hierarchy)")));
MemoryContext localContext = AllocSetContextCreate(CurrentMemoryContext,
"CreatePartitioningHierarchy",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
foreach_ptr(shardInterval, target->newShards)
{
if (PartitionTable(shardInterval->relationId))
{
char *attachPartitionCommand =
GenerateAttachShardPartitionCommand(shardInterval);
char *tableOwner = TableOwner(shardInterval->relationId);
/*
* Attaching partitions may acquire conflicting locks when done in
* parallel, so we attach them sequentially. Attaching a partition is
* also a quick operation, so executing it sequentially is fine.
*/
MultiConnection *connection =
GetNodeUserDatabaseConnection(OUTSIDE_TRANSACTION,
target->superuserConnection->hostname,
target->superuserConnection->port,
tableOwner, NULL);
ExecuteCriticalRemoteCommand(connection, attachPartitionCommand);
MemoryContextReset(localContext);
}
}
}
MemoryContextSwitchTo(oldContext);
}
/*
* CreateUncheckedForeignKeyConstraints is used to create the foreign
* constraints on the logical replication target without checking that they are
* actually valid.
*
* We skip the validation phase of the foreign keys after a shard
* move/copy/split, because the validation is pretty costly and, given that
* the source placements are already valid, validating again on the target
* nodes is useless.
*/
void
CreateUncheckedForeignKeyConstraints(List *logicalRepTargetList)
{
MemoryContext localContext =
AllocSetContextCreate(CurrentMemoryContext,
"CreateKeyForeignConstraints",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
/*
* Iterate over all the shards in the shard group.
*/
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ShardInterval *shardInterval = NULL;
/*
* Iterate over the new shards of this target and create their constraints.
*/
foreach_ptr(shardInterval, target->newShards)
{
List *commandList = CopyShardForeignConstraintCommandList(
shardInterval);
commandList = list_concat(
list_make1("SET LOCAL citus.skip_constraint_validation TO ON;"),
commandList);
SendCommandListToWorkerOutsideTransactionWithConnection(
target->superuserConnection,
commandList);
MemoryContextReset(localContext);
}
}
MemoryContextSwitchTo(oldContext);
}
/*
* ConflictWithIsolationTestingBeforeCopy is only useful to test
* get_rebalance_progress by pausing right before doing the actual copy. This
* way we can see the state of the tables at that point. This should not be
* called by any code path except for the ones that move and split shards.
*
* Note that since the cost of calling this function is pretty low, we prefer
* to use it in non-assert builds as well, so as not to diverge in behaviour.
*/
extern void
ConflictWithIsolationTestingBeforeCopy(void)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
if (RunningUnderCitusTestSuite)
{
SET_LOCKTAG_ADVISORY(tag, MyDatabaseId,
SHARD_MOVE_ADVISORY_LOCK_SECOND_KEY,
SHARD_MOVE_ADVISORY_LOCK_FIRST_KEY, 2);
/* uses ShareLock so concurrent moves don't conflict with each other */
(void) LockAcquire(&tag, ShareLock, sessionLock, dontWait);
}
}
/*
* ConflictWithIsolationTestingAfterCopy is only useful for two types of tests.
* 1. Testing the output of get_rebalance_progress after the copy is completed,
* but before the move is completely finished. Because finishing the move
* will clear the contents of get_rebalance_progress.
* 2. To test that our non-blocking shard moves/splits actually don't block
* writes. Since logically replicating shards does eventually block
* modifications, it becomes tricky to use isolation tester to show
* concurrent behaviour of online shard rebalancing and modification
* queries. So, during logical replication we call this function at
* the end of the catchup, right before blocking writes.
*
* Note that since the cost of calling this function is pretty low, we prefer
* to use it in non-assert builds as well, so as not to diverge in behaviour.
*/
extern void
ConflictWithIsolationTestingAfterCopy(void)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
if (RunningUnderCitusTestSuite)
{
SET_LOCKTAG_ADVISORY(tag, MyDatabaseId,
SHARD_MOVE_ADVISORY_LOCK_FIRST_KEY,
SHARD_MOVE_ADVISORY_LOCK_SECOND_KEY, 2);
/* uses ShareLock so concurrent moves don't conflict with each other */
(void) LockAcquire(&tag, ShareLock, sessionLock, dontWait);
}
}
/*
* PublicationName returns the name of the publication for the given node and
* table owner.
*/
char *
PublicationName(LogicalRepType type, uint32_t nodeId, Oid ownerId)
{
return psprintf("%s%u_%u_%lu", publicationPrefix[type],
nodeId, ownerId, CurrentOperationId);
}
/*
* ReplicationSlotNameForNodeAndOwnerForOperation returns the name of the
* replication slot for the given node, table owner and operation id.
*
* Note that PG15 introduced a new ReplicationSlotName function that caused name
* conflicts, which is why we renamed this function.
*/
char *
ReplicationSlotNameForNodeAndOwnerForOperation(LogicalRepType type, uint32_t nodeId,
Oid ownerId, OperationId operationId)
{
StringInfo slotName = makeStringInfo();
appendStringInfo(slotName, "%s%u_%u_%lu", replicationSlotPrefix[type], nodeId,
ownerId, operationId);
if (slotName->len > NAMEDATALEN)
{
ereport(ERROR,
(errmsg(
"replication slot name \"%s\" with length %d exceeds the maximum allowed length %d",
slotName->data, slotName->len, NAMEDATALEN)));
}
return slotName->data;
}
/*
* SubscriptionName returns the name of the subscription for the given owner.
*/
char *
SubscriptionName(LogicalRepType type, Oid ownerId)
{
return psprintf("%s%u_%lu", subscriptionPrefix[type],
ownerId, CurrentOperationId);
}
/*
* SubscriptionRoleName returns the name of the role used by the
* subscription that subscribes to the tables of the given owner.
*/
char *
SubscriptionRoleName(LogicalRepType type, Oid ownerId)
{
return psprintf("%s%u_%lu", subscriptionRolePrefix[type], ownerId,
CurrentOperationId);
}
/*
* GetQueryResultStringList expects a query that returns a single column of
* strings. This query is executed on the connection and the function then
* returns the results of the query in a List.
*/
List *
GetQueryResultStringList(MultiConnection *connection, char *query)
{
bool raiseInterrupts = true;
int querySent = SendRemoteCommand(connection, query);
if (querySent == 0)
{
ReportConnectionError(connection, ERROR);
}
PGresult *result = GetRemoteCommandResult(connection, raiseInterrupts);
if (!IsResponseOK(result))
{
ReportResultError(connection, result, ERROR);
}
int rowCount = PQntuples(result);
int columnCount = PQnfields(result);
if (columnCount != 1)
{
ereport(ERROR, (errmsg("unexpected number of columns returned while reading query result")));
}
List *resultList = NIL;
for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
int columnIndex = 0;
StringInfo resultStringInfo = makeStringInfo();
char *resultString = PQgetvalue(result, rowIndex, columnIndex);
/* we're using the stringinfo to copy the data into the current memory context */
appendStringInfoString(resultStringInfo, resultString);
resultList = lappend(resultList, resultStringInfo->data);
}
PQclear(result);
ForgetResults(connection);
return resultList;
}
/*
* CreatePublications creates the publications defined in the
* publicationInfoHash over the given connection.
*/
void
CreatePublications(MultiConnection *connection,
HTAB *publicationInfoHash)
{
HASH_SEQ_STATUS status;
hash_seq_init(&status, publicationInfoHash);
PublicationInfo *entry = NULL;
while ((entry = (PublicationInfo *) hash_seq_search(&status)) != NULL)
{
StringInfo createPublicationCommand = makeStringInfo();
bool prefixWithComma = false;
appendStringInfo(createPublicationCommand, "CREATE PUBLICATION %s FOR TABLE ",
quote_identifier(entry->name));
ShardInterval *shard = NULL;
foreach_ptr(shard, entry->shardIntervals)
{
char *shardName = ConstructQualifiedShardName(shard);
if (prefixWithComma)
{
appendStringInfoString(createPublicationCommand, ",");
}
appendStringInfoString(createPublicationCommand, shardName);
prefixWithComma = true;
}
WorkerNode *worker = FindWorkerNode(connection->hostname,
connection->port);
InsertCleanupRecordInSubtransaction(CLEANUP_OBJECT_PUBLICATION,
entry->name,
worker->groupId,
CLEANUP_ALWAYS);
ExecuteCriticalRemoteCommand(connection, DISABLE_DDL_PROPAGATION);
ExecuteCriticalRemoteCommand(connection, createPublicationCommand->data);
ExecuteCriticalRemoteCommand(connection, ENABLE_DDL_PROPAGATION);
pfree(createPublicationCommand->data);
pfree(createPublicationCommand);
}
}
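/*
* For illustration only (names are made up), the command built above looks
* like:
*   CREATE PUBLICATION citus_shard_move_publication_2_10_1234
*       FOR TABLE public.items_102008,public.orders_102012;
* and it is executed between the DISABLE_DDL_PROPAGATION and
* ENABLE_DDL_PROPAGATION commands so that it is not propagated further as
* distributed DDL.
*/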
/*
* GetReplicationConnection opens a new replication connection to this node.
* This connection can be used to send replication commands, such as
* CREATE_REPLICATION_SLOT.
*/
MultiConnection *
GetReplicationConnection(char *nodeName, int nodePort)
{
int connectionFlags = FORCE_NEW_CONNECTION;
connectionFlags |= REQUIRE_REPLICATION_CONNECTION_PARAM;
MultiConnection *connection = GetNodeUserDatabaseConnection(
connectionFlags,
nodeName,
nodePort,
CitusExtensionOwnerName(),
get_database_name(MyDatabaseId));
/*
* Replication connections are special and don't support all of SQL, so we
* don't want the connection to be used for purposes other than what we
* created it for.
*/
ClaimConnectionExclusively(connection);
return connection;
}
/*
* CreateReplicationSlot creates a replication slot with the given slot name
* over the given connection. The given connection should be a replication
* connection. This function returns the name of the snapshot that is used for
* this replication slot. When using this snapshot name for other transactions
* you need to keep the given replication connection open until you have used
* the snapshot name.
*/
static char *
CreateReplicationSlot(MultiConnection *connection, char *slotname, char *outputPlugin)
{
StringInfo createReplicationSlotCommand = makeStringInfo();
appendStringInfo(createReplicationSlotCommand,
"CREATE_REPLICATION_SLOT %s LOGICAL %s EXPORT_SNAPSHOT;",
quote_identifier(slotname), quote_identifier(outputPlugin));
PGresult *result = NULL;
int response = ExecuteOptionalRemoteCommand(connection,
createReplicationSlotCommand->data,
&result);
if (response != RESPONSE_OKAY || !IsResponseOK(result) || PQntuples(result) != 1)
{
ReportResultError(connection, result, ERROR);
}
/*
* 'snapshot_name' is the third column of the result (column index 2, counting
* from zero). We use pstrdup to copy the value into the current memory context.
*/
char *snapShotName = pstrdup(PQgetvalue(result, 0, 2 /* columnIndex */));
PQclear(result);
ForgetResults(connection);
return snapShotName;
}
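/*
* For illustration only (names are made up), the replication command sent
* above looks like:
*   CREATE_REPLICATION_SLOT citus_shard_move_slot_2_10_1234 LOGICAL pgoutput EXPORT_SNAPSHOT;
* and the exported snapshot name is read from the single result row at column
* index 2, as done above.
*/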
/*
* CreateReplicationSlots creates the replication slots that the subscriptions
* in the logicalRepTargetList can use.
*
* This function returns the snapshot name of the replication slots that are
* used by the subscription. When using this snapshot name for other
* transactions you need to keep the given replication connection open until
* you are finished using the snapshot.
*/
char *
CreateReplicationSlots(MultiConnection *sourceConnection,
MultiConnection *sourceReplicationConnection,
List *logicalRepTargetList,
char *outputPlugin)
{
ReplicationSlotInfo *firstReplicationSlot = NULL;
char *snapshot = NULL;
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ReplicationSlotInfo *replicationSlot = target->replicationSlot;
WorkerNode *worker = FindWorkerNode(sourceConnection->hostname,
sourceConnection->port);
InsertCleanupRecordInSubtransaction(CLEANUP_OBJECT_REPLICATION_SLOT,
replicationSlot->name,
worker->groupId,
CLEANUP_ALWAYS);
if (!firstReplicationSlot)
{
firstReplicationSlot = replicationSlot;
snapshot = CreateReplicationSlot(
sourceReplicationConnection,
replicationSlot->name,
outputPlugin
);
}
else
{
ExecuteCriticalRemoteCommand(
sourceConnection,
psprintf("SELECT pg_catalog.pg_copy_logical_replication_slot(%s, %s)",
quote_literal_cstr(firstReplicationSlot->name),
quote_literal_cstr(replicationSlot->name)));
}
}
return snapshot;
}
/*
* CreateSubscriptions creates the subscriptions according to their definition
* in the logicalRepTargetList. The remote node(s) needs to have appropriate
* pg_dist_authinfo rows for the superuser such that the apply process can
* connect. Because the generated CREATE SUBSCRIPTION statements use the host
* and port names directly (rather than looking up any relevant
* pg_dist_poolinfo rows), all such connections remain direct and will not
* route through any configured poolers.
*
* The subscriptions created by this function are created in the disabled
* state. This is done so a data copy can be done manually afterwards. To
* enable the subscriptions you can use EnableSubscriptions().
*/
void
CreateSubscriptions(MultiConnection *sourceConnection,
char *databaseName,
List *logicalRepTargetList)
{
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
int ownerId = target->tableOwnerId;
WorkerNode *worker = FindWorkerNode(target->superuserConnection->hostname,
target->superuserConnection->port);
/*
* The CREATE USER command should not propagate, so we temporarily
* disable DDL propagation.
*
* Subscription workers have SUPERUSER permissions. Hence we temporarily
* create a user with SUPERUSER permissions and then alter it to NOSUPERUSER.
* This prevents permission escalations.
*/
SendCommandListToWorkerOutsideTransactionWithConnection(
target->superuserConnection,
list_make2(
"SET LOCAL citus.enable_ddl_propagation TO OFF;",
psprintf(
"CREATE USER %s SUPERUSER IN ROLE %s;",
quote_identifier(target->subscriptionOwnerName),
quote_identifier(GetUserNameFromId(ownerId, false))
)));
InsertCleanupRecordInSubtransaction(CLEANUP_OBJECT_USER,
target->subscriptionOwnerName,
worker->groupId,
CLEANUP_ALWAYS);
StringInfo conninfo = makeStringInfo();
appendStringInfo(conninfo, "host='%s' port=%d user='%s' dbname='%s' "
"connect_timeout=20",
escape_param_str(sourceConnection->hostname),
sourceConnection->port,
escape_param_str(sourceConnection->user), escape_param_str(
databaseName));
if (CpuPriorityLogicalRepSender != CPU_PRIORITY_INHERIT &&
list_length(logicalRepTargetList) <= MaxHighPriorityBackgroundProcesess)
{
appendStringInfo(conninfo,
" options='-c citus.cpu_priority=%d'",
CpuPriorityLogicalRepSender);
}
StringInfo createSubscriptionCommand = makeStringInfo();
appendStringInfo(createSubscriptionCommand,
"CREATE SUBSCRIPTION %s CONNECTION %s PUBLICATION %s "
"WITH (citus_use_authinfo=true, create_slot=false, "
#if PG_VERSION_NUM >= PG_VERSION_16
/*
* password_required specifies whether connections to the publisher
* made as a result of this subscription must use password authentication.
* However, this setting is ignored when the subscription is owned
* by a superuser.
* Given that this command is executed below with superuser
* ExecuteCriticalRemoteCommand(target->superuserConnection,
* createSubscriptionCommand->data);
* We are safe to pass password_required as false because
* it will be ignored anyway
*/
"copy_data=false, enabled=false, slot_name=%s, password_required=false",
#else
"copy_data=false, enabled=false, slot_name=%s",
#endif
quote_identifier(target->subscriptionName),
quote_literal_cstr(conninfo->data),
quote_identifier(target->publication->name),
quote_identifier(target->replicationSlot->name));
if (EnableBinaryProtocol)
{
appendStringInfoString(createSubscriptionCommand, ", binary=true)");
}
else
{
appendStringInfoString(createSubscriptionCommand, ")");
}
ExecuteCriticalRemoteCommand(target->superuserConnection,
createSubscriptionCommand->data);
pfree(createSubscriptionCommand->data);
pfree(createSubscriptionCommand);
InsertCleanupRecordInSubtransaction(CLEANUP_OBJECT_SUBSCRIPTION,
target->subscriptionName,
worker->groupId,
CLEANUP_ALWAYS);
ExecuteCriticalRemoteCommand(target->superuserConnection, psprintf(
"ALTER SUBSCRIPTION %s OWNER TO %s",
quote_identifier(target->subscriptionName),
quote_identifier(target->subscriptionOwnerName)
));
/*
* The ALTER ROLE command should not propagate, so we temporarily
* disable DDL propagation.
*/
SendCommandListToWorkerOutsideTransactionWithConnection(
target->superuserConnection,
list_make2(
"SET LOCAL citus.enable_ddl_propagation TO OFF;",
psprintf(
"ALTER ROLE %s NOSUPERUSER;",
quote_identifier(target->subscriptionOwnerName)
)));
}
}
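/*
* For illustration only (hosts, names and ids are made up), the command built
* above looks roughly like:
*   CREATE SUBSCRIPTION citus_shard_move_subscription_10_1234
*       CONNECTION 'host=''10.0.0.1'' port=5432 user=''postgres'' dbname=''mydb'' connect_timeout=20'
*       PUBLICATION citus_shard_move_publication_2_10_1234
*       WITH (citus_use_authinfo=true, create_slot=false, copy_data=false,
*             enabled=false, slot_name=citus_shard_move_slot_2_10_1234, binary=true);
* after which ownership is transferred to the subscription role, which is then
* demoted to NOSUPERUSER.
*/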
/*
* EnableSubscriptions enables all the subscriptions in the
* logicalRepTargetList. This means the replication slot will start to be read
* and the catchup phase begins.
*/
void
EnableSubscriptions(List *logicalRepTargetList)
{
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
ExecuteCriticalRemoteCommand(target->superuserConnection, psprintf(
"ALTER SUBSCRIPTION %s ENABLE",
target->subscriptionName
));
}
}
/* *INDENT-OFF* */
/*
* Escaping libpq connect parameter strings.
*
* Replaces "'" with "\'" and "\" with "\\".
*
* Copied from dblink.c to escape libpq params
*/
static char *
escape_param_str(const char *str)
{
StringInfoData buf;
initStringInfo(&buf);
for (const char *cp = str; *cp; cp++)
{
if (*cp == '\\' || *cp == '\'')
appendStringInfoChar(&buf, '\\');
appendStringInfoChar(&buf, *cp);
}
return buf.data;
}
/* *INDENT-ON* */
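/* For example, a connection parameter value like  o'brien\db  is escaped to  o\'brien\\db . */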
/*
* GetRemoteLogPosition gets the current WAL log position over the given connection.
*/
XLogRecPtr
GetRemoteLogPosition(MultiConnection *connection)
{
return GetRemoteLSN(connection, CURRENT_LOG_POSITION_COMMAND);
}
/*
* GetRemoteLSN executes a command that returns a single LSN over the given connection
* and returns it as an XLogRecPtr (uint64).
*/
static XLogRecPtr
GetRemoteLSN(MultiConnection *connection, char *command)
{
bool raiseInterrupts = false;
XLogRecPtr remoteLogPosition = InvalidXLogRecPtr;
int querySent = SendRemoteCommand(connection, command);
if (querySent == 0)
{
ReportConnectionError(connection, ERROR);
}
PGresult *result = GetRemoteCommandResult(connection, raiseInterrupts);
if (!IsResponseOK(result))
{
ReportResultError(connection, result, ERROR);
}
int rowCount = PQntuples(result);
if (rowCount != 1)
{
PQclear(result);
ForgetResults(connection);
return InvalidXLogRecPtr;
}
int colCount = PQnfields(result);
if (colCount != 1)
{
ereport(ERROR, (errmsg("unexpected number of columns returned by: %s",
command)));
}
if (!PQgetisnull(result, 0, 0))
{
char *resultString = PQgetvalue(result, 0, 0);
Datum remoteLogPositionDatum = DirectFunctionCall1Coll(pg_lsn_in, InvalidOid,
CStringGetDatum(
resultString));
remoteLogPosition = DatumGetLSN(remoteLogPositionDatum);
}
PQclear(result);
ForgetResults(connection);
return remoteLogPosition;
}
/*
* CreateGroupedLogicalRepTargetsConnections creates connections for all of the nodes
* in the groupedLogicalRepTargetsHash.
*/
void
CreateGroupedLogicalRepTargetsConnections(HTAB *groupedLogicalRepTargetsHash,
char *user,
char *databaseName)
{
int connectionFlags = FORCE_NEW_CONNECTION;
HASH_SEQ_STATUS status;
GroupedLogicalRepTargets *groupedLogicalRepTargets = NULL;
foreach_htab(groupedLogicalRepTargets, &status, groupedLogicalRepTargetsHash)
{
WorkerNode *targetWorkerNode = FindNodeWithNodeId(
groupedLogicalRepTargets->nodeId,
false);
MultiConnection *superuserConnection =
GetNodeUserDatabaseConnection(connectionFlags, targetWorkerNode->workerName,
targetWorkerNode->workerPort,
user,
databaseName);
/*
* Operations on subscriptions cannot run in a transaction block. We
* claim the connections exclusively to ensure they do not get used for
* metadata syncing, which does open a transaction block.
*/
ClaimConnectionExclusively(superuserConnection);
groupedLogicalRepTargets->superuserConnection = superuserConnection;
LogicalRepTarget *target = NULL;
foreach_ptr(target, groupedLogicalRepTargets->logicalRepTargetList)
{
target->superuserConnection = superuserConnection;
}
}
}
/*
* CloseGroupedLogicalRepTargetsConnections closes the connections for all of the
* nodes in the groupedLogicalRepTargetsHash.
*/
void
CloseGroupedLogicalRepTargetsConnections(HTAB *groupedLogicalRepTargetsHash)
{
HASH_SEQ_STATUS status;
GroupedLogicalRepTargets *groupedLogicalRepTargets = NULL;
foreach_htab(groupedLogicalRepTargets, &status, groupedLogicalRepTargetsHash)
{
CloseConnection(groupedLogicalRepTargets->superuserConnection);
}
}
/*
* SubscriptionNamesValueList returns a SQL value list containing the
* subscription names from the logicalRepTargetList. This value list can
* be used in a query by using the IN operator.
*/
static char *
SubscriptionNamesValueList(List *logicalRepTargetList)
{
StringInfo subscriptionValueList = makeStringInfo();
appendStringInfoString(subscriptionValueList, "(");
bool first = true;
LogicalRepTarget *target = NULL;
foreach_ptr(target, logicalRepTargetList)
{
if (!first)
{
appendStringInfoString(subscriptionValueList, ",");
}
else
{
first = false;
}
appendStringInfoString(subscriptionValueList, quote_literal_cstr(
target->subscriptionName));
}
appendStringInfoString(subscriptionValueList, ")");
return subscriptionValueList->data;
}
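/*
* For illustration only (names are made up), for two subscriptions the
* function above produces a string like:
*   ('citus_shard_move_subscription_10_1234','citus_shard_move_subscription_20_1234')
* which is then used in the "subname IN %s" filter in GetSubscriptionPosition().
*/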
/*
* WaitForAllSubscriptionsToCatchUp waits until the last LSN reported by each
* subscription has caught up with the current LSN of the source.
*
* The function errors out if the target LSN doesn't increase within
* LogicalReplicationTimeout. The function also reports its progress every
* logicalReplicationProgressReportTimeout milliseconds.
*/
void
WaitForAllSubscriptionsToCatchUp(MultiConnection *sourceConnection,
HTAB *groupedLogicalRepTargetsHash)
{
XLogRecPtr sourcePosition = GetRemoteLogPosition(sourceConnection);
HASH_SEQ_STATUS status;
GroupedLogicalRepTargets *groupedLogicalRepTargets = NULL;
foreach_htab(groupedLogicalRepTargets, &status, groupedLogicalRepTargetsHash)
{
WaitForGroupedLogicalRepTargetsToCatchUp(sourcePosition,
groupedLogicalRepTargets);
}
}
/*
* WaitForGroupedLogicalRepTargetsToCatchUp waits until the last LSN reported
* by the subscriptions on a single target node has caught up with the given
* source LSN.
*
* The function errors out if the target LSN doesn't increase within
* LogicalReplicationTimeout. The function also reports its progress every
* logicalReplicationProgressReportTimeout milliseconds.
*/
static void
WaitForGroupedLogicalRepTargetsToCatchUp(XLogRecPtr sourcePosition,
GroupedLogicalRepTargets *
groupedLogicalRepTargets)
{
XLogRecPtr previousTargetPosition = 0;
TimestampTz previousLSNIncrementTime = GetCurrentTimestamp();
/* report in the first iteration as well */
TimestampTz previousReportTime = 0;
MultiConnection *superuserConnection = groupedLogicalRepTargets->superuserConnection;
/*
* We might be in the loop for a while. Since we don't need to preserve
* any memory beyond this function, we can simply switch to a child context
* and reset it on every iteration to make sure we don't slowly build up
* a lot of memory.
*/
MemoryContext loopContext = AllocSetContextCreateInternal(CurrentMemoryContext,
"WaitForShardSubscriptionToCatchUp",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
MemoryContext oldContext = MemoryContextSwitchTo(loopContext);
while (true)
{
XLogRecPtr targetPosition = GetSubscriptionPosition(groupedLogicalRepTargets);
if (targetPosition >= sourcePosition)
{
ereport(LOG, (errmsg(
"The LSN of the target subscriptions on node %s:%d has "
"caught up with the source LSN",
superuserConnection->hostname,
superuserConnection->port)));
break;
}
/*
* The following logic ensures that the subscription's LSN keeps advancing
* within the LogicalReplicationTimeout duration. Otherwise, we error out
* since we suspect that there is a problem on the target. It also handles
* the progress reporting.
*/
if (targetPosition > previousTargetPosition)
{
/* variable is only used for the log message */
uint64 previousTargetBeforeThisLoop = previousTargetPosition;
previousTargetPosition = targetPosition;
previousLSNIncrementTime = GetCurrentTimestamp();
if (TimestampDifferenceExceeds(previousReportTime,
GetCurrentTimestamp(),
logicalReplicationProgressReportTimeout))
{
ereport(LOG, (errmsg(
"The LSN of the target subscriptions on node %s:%d has "
"increased from %ld to %ld at %s where the source LSN is %ld",
superuserConnection->hostname,
superuserConnection->port, previousTargetBeforeThisLoop,
targetPosition,
timestamptz_to_str(previousLSNIncrementTime),
sourcePosition)));
previousReportTime = GetCurrentTimestamp();
}
}
else
{
if (TimestampDifferenceExceeds(previousLSNIncrementTime,
GetCurrentTimestamp(),
LogicalReplicationTimeout))
{
ereport(ERROR, (errmsg("The logical replication waiting timeout "
"of %d msec is exceeded",
LogicalReplicationTimeout),
errdetail("The LSN of the subscription on the target "
"node %s:%d hasn't caught up with the "
"source LSN",
superuserConnection->hostname,
superuserConnection->port),
errhint(
"There might have been problems on the target "
"node. If not, consider using a higher value for "
"citus.logical_replication_timeout")));
}
}
/* sleep for 1 second (1000 milliseconds) and try again */
WaitForMiliseconds(1000);
MemoryContextReset(loopContext);
}
MemoryContextSwitchTo(oldContext);
}
/*
* WaitForMiliseconds waits for the given timeout (in milliseconds) and then
* checks for interrupts.
*/
static void
WaitForMiliseconds(long timeout)
{
int latchFlags = WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH;
/* wait until timeout, or until somebody wakes us up */
int rc = WaitLatch(MyLatch, latchFlags, timeout, PG_WAIT_EXTENSION);
/* emergency bailout if postmaster has died */
if (rc & WL_POSTMASTER_DEATH)
{
proc_exit(1);
}
if (rc & WL_LATCH_SET)
{
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
}
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
}
}
/*
* GetSubscriptionPosition gets the minimum WAL log position across the given
* subscriptions: that is, the WAL log position on the source node up to which
* all of the subscriptions have completed replication.
*/
static XLogRecPtr
GetSubscriptionPosition(GroupedLogicalRepTargets *groupedLogicalRepTargets)
{
char *subscriptionValueList = SubscriptionNamesValueList(
groupedLogicalRepTargets->logicalRepTargetList);
return GetRemoteLSN(groupedLogicalRepTargets->superuserConnection, psprintf(
"SELECT min(latest_end_lsn) FROM pg_stat_subscription "
"WHERE subname IN %s", subscriptionValueList));
}