citus/src/backend/distributed/metadata/node_metadata.c

/*
 * node_metadata.c
 *	  Functions that operate on pg_dist_node
 *
 * Copyright (c) Citus Data, Inc.
 */
#include "postgres.h"

#include "funcapi.h"
#include "miscadmin.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/htup_details.h"
#include "access/skey.h"
#include "access/tupmacs.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "commands/sequence.h"
#include "executor/spi.h"
#include "lib/stringinfo.h"
#include "postmaster/postmaster.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/plancache.h"
#include "utils/rel.h"
#include "utils/relcache.h"

#include "distributed/citus_acquire_lock.h"
#include "distributed/citus_safe_lib.h"
#include "distributed/colocation_utils.h"
#include "distributed/commands.h"
#include "distributed/commands/utility_hook.h"
#include "distributed/connection_management.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/maintenanced.h"
#include "distributed/metadata/distobject.h"
#include "distributed/metadata/pg_dist_object.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_sync.h"
#include "distributed/metadata_utility.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/multi_router_planner.h"
#include "distributed/pg_dist_node.h"
#include "distributed/pg_dist_node_metadata.h"
#include "distributed/reference_table_utils.h"
#include "distributed/remote_commands.h"
#include "distributed/resource_lock.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/shared_connection_stats.h"
#include "distributed/string_utils.h"
#include "distributed/transaction_recovery.h"
#include "distributed/version_compat.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_transaction.h"

#define INVALID_GROUP_ID -1

/* default group size */
int GroupSize = 1;

/* config variable managed via guc.c */
char *CurrentCluster = "default";

/* did current transaction modify pg_dist_node? */
bool TransactionModifiedNodeMetadata = false;

bool EnableMetadataSync = true;

typedef struct NodeMetadata
{
	int32 groupId;
	char *nodeRack;
	bool hasMetadata;
	bool metadataSynced;
	bool isActive;
	Oid nodeRole;
	bool shouldHaveShards;
	char *nodeCluster;
} NodeMetadata;

/* local function forward declarations */
static void RemoveNodeFromCluster(char *nodeName, int32 nodePort);
static void ErrorIfNodeContainsNonRemovablePlacements(WorkerNode *workerNode);
static bool PlacementHasActivePlacementOnAnotherGroup(GroupShardPlacement
													  *sourcePlacement);
static int AddNodeMetadata(char *nodeName, int32 nodePort, NodeMetadata *nodeMetadata,
						   bool *nodeAlreadyExists, bool localOnly);
static int AddNodeMetadataViaMetadataContext(char *nodeName, int32 nodePort,
											 NodeMetadata *nodeMetadata,
											 bool *nodeAlreadyExists);
static HeapTuple GetNodeTuple(const char *nodeName, int32 nodePort);
static HeapTuple GetNodeByNodeId(int32 nodeId);
static int32 GetNextGroupId(void);
static int GetNextNodeId(void);
static void InsertPlaceholderCoordinatorRecord(void);
static void InsertNodeRow(int nodeid, char *nodename, int32 nodeport,
						  NodeMetadata *nodeMetadata);
static void DeleteNodeRow(char *nodename, int32 nodeport);
static void BlockDistributedQueriesOnMetadataNodes(void);
static WorkerNode * TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple);
static bool NodeIsLocal(WorkerNode *worker);
static void SetLockTimeoutLocally(int32 lock_cooldown);
static void UpdateNodeLocation(int32 nodeId, char *newNodeName, int32 newNodePort,
							   bool localOnly);
static bool UnsetMetadataSyncedForAllWorkers(void);
static char * GetMetadataSyncCommandToSetNodeColumn(WorkerNode *workerNode,
													int columnIndex,
													Datum value);
static char * NodeHasmetadataUpdateCommand(uint32 nodeId, bool hasMetadata);
static char * NodeMetadataSyncedUpdateCommand(uint32 nodeId, bool metadataSynced);
static void ErrorIfCoordinatorMetadataSetFalse(WorkerNode *workerNode, Datum value,
											   char *field);
static WorkerNode * SetShouldHaveShards(WorkerNode *workerNode, bool shouldHaveShards);
static WorkerNode * FindNodeAnyClusterByNodeId(uint32 nodeId);
static void ErrorIfAnyNodeNotExist(List *nodeList);
static void UpdateLocalGroupIdsViaMetadataContext(MetadataSyncContext *context);
static void SendDeletionCommandsForReplicatedTablePlacements(
	MetadataSyncContext *context);
static void SyncNodeMetadata(MetadataSyncContext *context);
static void SetNodeStateViaMetadataContext(MetadataSyncContext *context,
										   WorkerNode *workerNode,
										   Datum value);
static void MarkNodesNotSyncedInLoopBackConnection(MetadataSyncContext *context,
												   pid_t parentSessionPid);
static void EnsureParentSessionHasExclusiveLockOnPgDistNode(pid_t parentSessionPid);
static void SetNodeMetadata(MetadataSyncContext *context, bool localOnly);
static void EnsureTransactionalMetadataSyncMode(void);
static void LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE
											lockMode);
static BackgroundWorkerHandle * CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown);
static BackgroundWorkerHandle * LockPlacementsWithBackgroundWorkersInPrimaryNode(
	WorkerNode *workerNode, bool force, int32 lock_cooldown);

/* Function definitions go here */

/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(citus_set_coordinator_host);
PG_FUNCTION_INFO_V1(citus_add_node);
PG_FUNCTION_INFO_V1(master_add_node);
PG_FUNCTION_INFO_V1(citus_add_inactive_node);
PG_FUNCTION_INFO_V1(master_add_inactive_node);
PG_FUNCTION_INFO_V1(citus_add_secondary_node);
PG_FUNCTION_INFO_V1(master_add_secondary_node);
PG_FUNCTION_INFO_V1(citus_set_node_property);
PG_FUNCTION_INFO_V1(master_set_node_property);
PG_FUNCTION_INFO_V1(citus_remove_node);
PG_FUNCTION_INFO_V1(master_remove_node);
PG_FUNCTION_INFO_V1(citus_disable_node);
PG_FUNCTION_INFO_V1(master_disable_node);
PG_FUNCTION_INFO_V1(citus_activate_node);
PG_FUNCTION_INFO_V1(master_activate_node);
PG_FUNCTION_INFO_V1(citus_update_node);
PG_FUNCTION_INFO_V1(citus_pause_node_within_txn);
PG_FUNCTION_INFO_V1(master_update_node);
PG_FUNCTION_INFO_V1(get_shard_id_for_distribution_column);
PG_FUNCTION_INFO_V1(citus_nodename_for_nodeid);
PG_FUNCTION_INFO_V1(citus_nodeport_for_nodeid);
PG_FUNCTION_INFO_V1(citus_coordinator_nodeid);
PG_FUNCTION_INFO_V1(citus_is_coordinator);
PG_FUNCTION_INFO_V1(citus_internal_mark_node_not_synced);
PG_FUNCTION_INFO_V1(citus_is_primary_node);

/*
 * DefaultNodeMetadata creates a NodeMetadata struct with the fields set to
 * sane defaults, e.g. nodeRack = WORKER_DEFAULT_RACK.
 */
static NodeMetadata
DefaultNodeMetadata()
{
	NodeMetadata nodeMetadata;

	/* ensure uninitialized padding doesn't escape the function */
	memset_struct_0(nodeMetadata);
	nodeMetadata.nodeRack = WORKER_DEFAULT_RACK;
	nodeMetadata.shouldHaveShards = true;
	nodeMetadata.groupId = INVALID_GROUP_ID;

	return nodeMetadata;
}


/*
 * citus_set_coordinator_host configures the hostname and port through which worker
 * nodes can connect to the coordinator.
 */
Datum
citus_set_coordinator_host(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	text *nodeName = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);
	char *nodeNameString = text_to_cstring(nodeName);

	NodeMetadata nodeMetadata = DefaultNodeMetadata();
	nodeMetadata.groupId = 0;
	nodeMetadata.shouldHaveShards = false;
	nodeMetadata.nodeRole = PG_GETARG_OID(2);

	Name nodeClusterName = PG_GETARG_NAME(3);
	nodeMetadata.nodeCluster = NameStr(*nodeClusterName);

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (nodeMetadata.nodeRole == SecondaryNodeRoleId())
	{
		EnsureTransactionalMetadataSyncMode();
	}

	/* prevent concurrent modification */
	LockRelationOid(DistNodeRelationId(), RowExclusiveLock);

	bool isCoordinatorInMetadata = false;
	WorkerNode *coordinatorNode = PrimaryNodeForGroup(COORDINATOR_GROUP_ID,
													  &isCoordinatorInMetadata);
	if (!isCoordinatorInMetadata)
	{
		bool nodeAlreadyExists = false;
		bool localOnly = false;

		/* add the coordinator to pg_dist_node if it was not already added */
		AddNodeMetadata(nodeNameString, nodePort, &nodeMetadata,
						&nodeAlreadyExists, localOnly);

		/* we just checked */
		Assert(!nodeAlreadyExists);
	}
	else
	{
		/*
		 * since AddNodeMetadata takes an exclusive lock on pg_dist_node, we
		 * do not need to worry about concurrent changes (e.g. deletion) and
		 * can proceed to update immediately.
		 */
		bool localOnly = false;
		UpdateNodeLocation(coordinatorNode->nodeId, nodeNameString, nodePort, localOnly);

		/* clear cached plans that have the old host/port */
		ResetPlanCache();
	}

	TransactionModifiedNodeMetadata = true;

	PG_RETURN_VOID();
}


/*
 * EnsureTransactionalMetadataSyncMode ensures metadata sync mode is transactional.
 */
static void
EnsureTransactionalMetadataSyncMode(void)
{
	if (MetadataSyncTransMode == METADATA_SYNC_NON_TRANSACTIONAL)
	{
		ereport(ERROR, (errmsg("this operation cannot be completed in nontransactional "
							   "metadata sync mode"),
						errhint("SET citus.metadata_sync_mode to 'transactional'")));
	}
}


/*
 * citus_add_node function adds a new node to the cluster and returns its id. It also
 * replicates all reference tables to the new node.
 */
Datum
citus_add_node(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	EnsureSuperUser();
	EnsureCoordinator();

	text *nodeName = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);
	char *nodeNameString = text_to_cstring(nodeName);

	NodeMetadata nodeMetadata = DefaultNodeMetadata();
	bool nodeAlreadyExists = false;
	nodeMetadata.groupId = PG_GETARG_INT32(2);

	/*
	 * During tests this function is called before nodeRole and nodeCluster have been
	 * created.
	 */
	if (PG_NARGS() == 3)
	{
		nodeMetadata.nodeRole = InvalidOid;
		nodeMetadata.nodeCluster = "default";
	}
	else
	{
		Name nodeClusterName = PG_GETARG_NAME(4);
		nodeMetadata.nodeCluster = NameStr(*nodeClusterName);

		nodeMetadata.nodeRole = PG_GETARG_OID(3);
	}

	if (nodeMetadata.groupId == COORDINATOR_GROUP_ID)
	{
		/* by default, we add the coordinator without shards */
		nodeMetadata.shouldHaveShards = false;
	}

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (nodeMetadata.nodeRole == SecondaryNodeRoleId())
	{
		EnsureTransactionalMetadataSyncMode();
	}

	if (MetadataSyncTransMode == METADATA_SYNC_NON_TRANSACTIONAL &&
		IsMultiStatementTransaction())
	{
		/*
		 * prevent inside transaction block as we use bare connections which can
		 * lead deadlock
		 */
		ereport(ERROR, (errmsg("do not add node in transaction block "
							   "when the sync mode is nontransactional"),
						errhint("add the node after SET citus.metadata_sync_mode "
								"TO 'transactional'")));
	}

	int nodeId = AddNodeMetadataViaMetadataContext(nodeNameString, nodePort,
												   &nodeMetadata,
												   &nodeAlreadyExists);
	TransactionModifiedNodeMetadata = true;

	PG_RETURN_INT32(nodeId);
}


/*
 * master_add_node is a wrapper function for old UDF name.
 */
Datum
master_add_node(PG_FUNCTION_ARGS)
{
	return citus_add_node(fcinfo);
}


/*
 * citus_add_inactive_node function adds a new node to the cluster as inactive node
 * and returns id of the newly added node. It does not replicate reference
 * tables to the new node, it only adds new node to the pg_dist_node table.
 */
Datum
citus_add_inactive_node(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	text *nodeName = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);
	char *nodeNameString = text_to_cstring(nodeName);
	Name nodeClusterName = PG_GETARG_NAME(4);

	NodeMetadata nodeMetadata = DefaultNodeMetadata();
	bool nodeAlreadyExists = false;
	nodeMetadata.groupId = PG_GETARG_INT32(2);
	nodeMetadata.nodeRole = PG_GETARG_OID(3);
	nodeMetadata.nodeCluster = NameStr(*nodeClusterName);

	if (nodeMetadata.groupId == COORDINATOR_GROUP_ID)
	{
		ereport(ERROR, (errmsg("coordinator node cannot be added as inactive node")));
	}

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (nodeMetadata.nodeRole == SecondaryNodeRoleId())
	{
		EnsureTransactionalMetadataSyncMode();
	}

	bool localOnly = false;
	int nodeId = AddNodeMetadata(nodeNameString, nodePort, &nodeMetadata,
								 &nodeAlreadyExists, localOnly);
	TransactionModifiedNodeMetadata = true;

	PG_RETURN_INT32(nodeId);
}


/*
 * master_add_inactive_node is a wrapper function for old UDF name.
 */
Datum
master_add_inactive_node(PG_FUNCTION_ARGS)
{
	return citus_add_inactive_node(fcinfo);
}


/*
 * citus_add_secondary_node adds a new secondary node to the cluster. It accepts as
 * arguments the primary node it should share a group with.
 */
Datum
citus_add_secondary_node(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	text *nodeName = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);
	char *nodeNameString = text_to_cstring(nodeName);

	text *primaryName = PG_GETARG_TEXT_P(2);
	int32 primaryPort = PG_GETARG_INT32(3);
	char *primaryNameString = text_to_cstring(primaryName);

	Name nodeClusterName = PG_GETARG_NAME(4);
	NodeMetadata nodeMetadata = DefaultNodeMetadata();
	bool nodeAlreadyExists = false;

	nodeMetadata.groupId = GroupForNode(primaryNameString, primaryPort);
	nodeMetadata.nodeCluster = NameStr(*nodeClusterName);
	nodeMetadata.nodeRole = SecondaryNodeRoleId();
	nodeMetadata.isActive = true;

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	EnsureTransactionalMetadataSyncMode();

	bool localOnly = false;
	int nodeId = AddNodeMetadata(nodeNameString, nodePort, &nodeMetadata,
								 &nodeAlreadyExists, localOnly);
	TransactionModifiedNodeMetadata = true;

	PG_RETURN_INT32(nodeId);
}


/*
 * master_add_secondary_node is a wrapper function for old UDF name.
 */
Datum
master_add_secondary_node(PG_FUNCTION_ARGS)
{
	return citus_add_secondary_node(fcinfo);
}


/*
 * citus_remove_node function removes the provided node from the pg_dist_node table of
 * the master node and all nodes with metadata.
 * The call to the citus_remove_node should be done by the super user and the specified
 * node should not have any active placements.
 * This function also deletes all reference table placements belong to the given node from
 * pg_dist_placement, but it does not drop actual placement at the node. In the case of
 * re-adding the node, citus_add_node first drops and re-creates the reference tables.
 */
Datum
citus_remove_node(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	text *nodeNameText = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);

	RemoveNodeFromCluster(text_to_cstring(nodeNameText), nodePort);
	TransactionModifiedNodeMetadata = true;

	PG_RETURN_VOID();
}


/*
 * master_remove_node is a wrapper function for old UDF name.
 */
Datum
master_remove_node(PG_FUNCTION_ARGS)
{
	return citus_remove_node(fcinfo);
}


/*
 * citus_disable_node function sets isactive value of the provided node as inactive at
 * coordinator and all nodes with metadata regardless of the node having an active shard
 * placement.
 *
 * The call to the citus_disable_node must be done by the super user.
 *
 * This function also deletes all reference table placements belong to the given node
 * from pg_dist_placement, but it does not drop actual placement at the node. In the case
 * of re-activating the node, citus_add_node first drops and re-creates the reference
 * tables.
 */
Datum
citus_disable_node(PG_FUNCTION_ARGS)
{
	text *nodeNameText = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);

	bool synchronousDisableNode = 1;
	Assert(PG_NARGS() == 2 || PG_NARGS() == 3);
	if (PG_NARGS() == 3)
	{
		synchronousDisableNode = PG_GETARG_BOOL(2);
	}

	char *nodeName = text_to_cstring(nodeNameText);
	WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);

	/* there is no concept of invalid coordinator */
	bool isActive = false;
	ErrorIfCoordinatorMetadataSetFalse(workerNode, BoolGetDatum(isActive),
									   "isactive");

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (NodeIsSecondary(workerNode))
	{
		EnsureTransactionalMetadataSyncMode();
	}

	WorkerNode *firstWorkerNode = GetFirstPrimaryWorkerNode();
	bool disablingFirstNode =
		(firstWorkerNode && firstWorkerNode->nodeId == workerNode->nodeId);

	if (disablingFirstNode && !synchronousDisableNode)
	{
		/*
		 * We sync metadata async and optionally in the background worker,
		 * it would mean that some nodes might get the updates while other
		 * not. And, if the node metadata that is changing is the first
		 * worker node, the problem gets nasty. We serialize modifications
		 * to replicated tables by acquiring locks on the first worker node.
		 *
		 * If some nodes get the metadata changes and some do not, they'd be
		 * acquiring the locks on different nodes. Hence, having the
		 * possibility of diverged shard placements for the same shard.
		 *
		 * To prevent that, we currently do not allow disabling the first
		 * worker node unless it is explicitly opted synchronous.
		 */
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("disabling the first worker node in the "
							   "metadata is not allowed"),
						errhint("You can force disabling node, SELECT "
								"citus_disable_node('%s', %d, "
								"synchronous:=true);",
								workerNode->workerName,
								nodePort),
						errdetail("Citus uses the first worker node in the "
								  "metadata for certain internal operations when "
								  "replicated tables are modified. Synchronous mode "
								  "ensures that all nodes have the same view of the "
								  "first worker node, which is used for certain "
								  "locking operations.")));
	}

	/*
	 * First, locally mark the node as inactive. We'll later trigger background
	 * worker to sync the metadata changes to the relevant nodes.
	 */
	workerNode =
		SetWorkerColumnLocalOnly(workerNode,
								 Anum_pg_dist_node_isactive,
								 BoolGetDatum(isActive));
	if (NodeIsPrimary(workerNode))
	{
		/*
		 * We do not allow disabling nodes if it contains any
		 * primary placement that is the "only" active placement
		 * for any given shard.
		 */
		ErrorIfNodeContainsNonRemovablePlacements(workerNode);
	}

	TransactionModifiedNodeMetadata = true;

	if (synchronousDisableNode)
	{
		/*
		 * The user might pick between sync vs async options.
		 *      - Pros for the sync option:
		 *          (a) the changes become visible on the cluster immediately
		 *          (b) even if the first worker node is disabled, there is no
		 *              risk of divergence of the placements of replicated shards
		 *      - Cons for the sync options:
		 *          (a) Does not work within 2PC transaction (e.g., BEGIN;
		 *              citus_disable_node(); PREPARE TRANSACTION ...);
		 *          (b) If there are multiple node failures (e.g., one another node
		 *              than the current node being disabled), the sync option would
		 *              fail because it'd try to sync the metadata changes to a node
		 *              that is not up and running.
		 */
		if (firstWorkerNode && firstWorkerNode->nodeId == workerNode->nodeId)
		{
			/*
			 * We cannot let any modification query on a replicated table to run
			 * concurrently with citus_disable_node() on the first worker node. If
			 * we let that, some worker nodes might calculate FirstWorkerNode()
			 * different than others. See LockShardListResourcesOnFirstWorker()
			 * for the details.
			 */
			BlockDistributedQueriesOnMetadataNodes();
		}

		SyncNodeMetadataToNodes();
	}
	else if (UnsetMetadataSyncedForAllWorkers())
	{
		/*
		 * We have not propagated the node metadata changes yet, make sure that all the
		 * active nodes get the metadata updates. We defer this operation to the
		 * background worker to make it possible disabling nodes when multiple nodes
		 * are down.
		 *
		 * Note that the active placements reside on the active nodes. Hence, when
		 * Citus finds active placements, it filters out the placements that are on
		 * the disabled nodes. That's why, we don't have to change/sync placement
		 * metadata at this point. Instead, we defer that to citus_activate_node()
		 * where we expect all nodes up and running.
		 */

		TriggerNodeMetadataSyncOnCommit();
	}

	PG_RETURN_VOID();
}


/*
 * BlockDistributedQueriesOnMetadataNodes blocks all the modification queries on
 * all nodes. Hence, should be used with caution.
 */
static void
BlockDistributedQueriesOnMetadataNodes(void)
{
	/* first, block on the coordinator */
	LockRelationOid(DistNodeRelationId(), ExclusiveLock);

	/*
	 * Note that we might re-design this lock to be more granular than
	 * pg_dist_node, scoping only for modifications on the replicated
	 * tables. However, we currently do not have any such mechanism and
	 * given that citus_disable_node() runs instantly, it seems acceptable
	 * to block reads (or modifications on non-replicated tables) for
	 * a while.
	 */

	/* only superuser can disable node */
	Assert(superuser());

	SendCommandToWorkersWithMetadata(
		"LOCK TABLE pg_catalog.pg_dist_node IN EXCLUSIVE MODE;");
}


/*
 * master_disable_node is a wrapper function for old UDF name.
 */
Datum
master_disable_node(PG_FUNCTION_ARGS)
{
	return citus_disable_node(fcinfo);
}


/*
 * citus_set_node_property sets a property of the node
 */
Datum
citus_set_node_property(PG_FUNCTION_ARGS)
{
	text *nodeNameText = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);
	text *propertyText = PG_GETARG_TEXT_P(2);
	bool value = PG_GETARG_BOOL(3);

	WorkerNode *workerNode = ModifiableWorkerNode(text_to_cstring(nodeNameText),
												  nodePort);

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (NodeIsSecondary(workerNode))
	{
		EnsureTransactionalMetadataSyncMode();
	}

	if (strcmp(text_to_cstring(propertyText), "shouldhaveshards") == 0)
	{
		SetShouldHaveShards(workerNode, value);
	}
	else
	{
		ereport(ERROR, (errmsg(
							"only the 'shouldhaveshards' property can be set using this function")));
	}

	TransactionModifiedNodeMetadata = true;

	PG_RETURN_VOID();
}


/*
 * master_set_node_property is a wrapper function for old UDF name.
 */
Datum
master_set_node_property(PG_FUNCTION_ARGS)
{
	return citus_set_node_property(fcinfo);
}


/*
 * ModifiableWorkerNode gets the requested WorkerNode and also gets locks
 * required for modifying it. This fails if the node does not exist.
 */
WorkerNode *
ModifiableWorkerNode(const char *nodeName, int32 nodePort)
{
	CheckCitusVersion(ERROR);
	EnsureCoordinator();

	/* take an exclusive lock on pg_dist_node to serialize pg_dist_node changes */
	LockRelationOid(DistNodeRelationId(), ExclusiveLock);

	WorkerNode *workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
	if (workerNode == NULL)
	{
		ereport(ERROR, (errmsg("node at \"%s:%u\" does not exist", nodeName, nodePort)));
	}

	return workerNode;
}


/*
 * citus_activate_node UDF activates the given node. It sets the node's isactive
 * value to active and replicates all reference tables to that node.
 */
Datum
citus_activate_node(PG_FUNCTION_ARGS)
{
	text *nodeNameText = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);

	char *nodeNameString = text_to_cstring(nodeNameText);
	WorkerNode *workerNode = ModifiableWorkerNode(nodeNameString, nodePort);

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (NodeIsSecondary(workerNode))
	{
		EnsureTransactionalMetadataSyncMode();
	}

	/*
	 * Create MetadataSyncContext which is used throughout nodes' activation.
	 * It contains activated nodes, bare connections if the mode is nontransactional,
	 * and a memory context for allocation.
	 */
	bool collectCommands = false;
	bool nodesAddedInSameTransaction = false;
	MetadataSyncContext *context = CreateMetadataSyncContext(list_make1(workerNode),
															 collectCommands,
															 nodesAddedInSameTransaction);

	ActivateNodeList(context);
	TransactionModifiedNodeMetadata = true;

	PG_RETURN_INT32(workerNode->nodeId);
}


/*
 * master_activate_node is a wrapper function for old UDF name.
 */
Datum
master_activate_node(PG_FUNCTION_ARGS)
{
	return citus_activate_node(fcinfo);
}


/*
 * GroupForNode returns the group which a given node belongs to.
 *
 * It only works if the requested node is a part of CurrentCluster.
 */
uint32
GroupForNode(char *nodeName, int nodePort)
{
	WorkerNode *workerNode = FindWorkerNode(nodeName, nodePort);

	if (workerNode == NULL)
	{
		ereport(ERROR, (errmsg("node at \"%s:%u\" does not exist", nodeName, nodePort)));
	}

	return workerNode->groupId;
}


/*
 * NodeIsPrimaryAndRemote returns whether the argument represents the remote
 * primary node.
 */
bool
NodeIsPrimaryAndRemote(WorkerNode *worker)
{
	return NodeIsPrimary(worker) && !NodeIsLocal(worker);
}


/*
 * NodeIsPrimary returns whether the argument represents a primary node.
 */
bool
NodeIsPrimary(WorkerNode *worker)
{
	Oid primaryRole = PrimaryNodeRoleId();

	/* if nodeRole does not yet exist, all nodes are primary nodes */
	if (primaryRole == InvalidOid)
	{
		return true;
	}

	return worker->nodeRole == primaryRole;
}


/*
 * NodeIsLocal returns whether the argument represents the local node.
 */
static bool
NodeIsLocal(WorkerNode *worker)
{
	return worker->groupId == GetLocalGroupId();
}


/*
 * NodeIsSecondary returns whether the argument represents a secondary node.
 */
bool
NodeIsSecondary(WorkerNode *worker)
{
	Oid secondaryRole = SecondaryNodeRoleId();

	/* if nodeRole does not yet exist, all nodes are primary nodes */
	if (secondaryRole == InvalidOid)
	{
		return false;
	}

	return worker->nodeRole == secondaryRole;
}


/*
 * NodeIsReadable returns whether we're allowed to send SELECT queries to this
 * node.
 */
bool
NodeIsReadable(WorkerNode *workerNode)
{
	if (ReadFromSecondaries == USE_SECONDARY_NODES_NEVER &&
		NodeIsPrimary(workerNode))
	{
		return true;
	}

	if (ReadFromSecondaries == USE_SECONDARY_NODES_ALWAYS &&
		NodeIsSecondary(workerNode))
	{
		return true;
	}

	return false;
}


/*
 * PrimaryNodeForGroup returns the (unique) primary in the specified group.
 *
 * If there are any nodes in the requested group and groupContainsNodes is not NULL
 * it will set the bool groupContainsNodes references to true.
 */
WorkerNode *
PrimaryNodeForGroup(int32 groupId, bool *groupContainsNodes)
{
	WorkerNode *workerNode = NULL;
	HASH_SEQ_STATUS status;
	HTAB *workerNodeHash = GetWorkerNodeHash();

	hash_seq_init(&status, workerNodeHash);

	while ((workerNode = hash_seq_search(&status)) != NULL)
	{
		int32 workerNodeGroupId = workerNode->groupId;
		if (workerNodeGroupId != groupId)
		{
			continue;
		}

		if (groupContainsNodes != NULL)
		{
			*groupContainsNodes = true;
		}

		if (NodeIsPrimary(workerNode))
		{
			hash_seq_term(&status);
			return workerNode;
		}
	}

	return NULL;
}


/*
 * MarkNodesNotSyncedInLoopBackConnection unsets metadatasynced flag in separate
 * connection to localhost by calling the udf `citus_internal_mark_node_not_synced`.
 */
static void
MarkNodesNotSyncedInLoopBackConnection(MetadataSyncContext *context,
									   pid_t parentSessionPid)
{
	Assert(context->transactionMode == METADATA_SYNC_NON_TRANSACTIONAL);
	Assert(!MetadataSyncCollectsCommands(context));

	/*
	 * Set metadatasynced to false for all activated nodes to mark the nodes as not synced
	 * in case nontransactional metadata sync fails before we activate the nodes inside
	 * metadataSyncContext.
	 * We set metadatasynced to false at coordinator to mark the nodes as not synced. But we
	 * do not set isactive and hasmetadata flags to false as we still want to route queries
	 * to the nodes if their isactive flag is true and propagate DDL to the nodes if possible.
	 *
	 * NOTES:
	 * 1) We use separate connection to localhost as we would rollback the local
	 * transaction in case of failure.
	 * 2) Operator should handle problems at workers if any. Wworkers probably fail
	 * due to improper metadata when a query hits. Or DDL might fail due to desynced
	 * nodes. (when hasmetadata = true, metadatasynced = false)
	 * In those cases, proper metadata sync for the workers should be done.)
	 */

	/*
	 * Because we try to unset metadatasynced flag with a separate transaction,
	 * we could not find the new node if the node is added in the current local
	 * transaction. But, hopefully, we do not need to unset metadatasynced for
	 * the new node as local transaction would rollback in case of a failure.
	 */
	if (context->nodesAddedInSameTransaction)
	{
		return;
	}

	if (context->activatedWorkerNodeList == NIL)
	{
		return;
	}

	int connectionFlag = FORCE_NEW_CONNECTION;
	MultiConnection *connection = GetNodeConnection(connectionFlag, LocalHostName,
													PostPortNumber);

	List *commandList = NIL;
	WorkerNode *workerNode = NULL;
	foreach_declared_ptr(workerNode, context->activatedWorkerNodeList)
	{
		/*
		 * We need to prevent self deadlock when we access pg_dist_node using separate
		 * connection to localhost. To achieve this, we check if the caller session's
		 * pid holds the Exclusive lock on pg_dist_node. After ensuring that (we are
		 * called from parent session which holds the Exclusive lock), we can safely
		 * update node metadata by acquiring the relaxed lock.
		 */
		StringInfo metadatasyncCommand = makeStringInfo();
		appendStringInfo(metadatasyncCommand, CITUS_INTERNAL_MARK_NODE_NOT_SYNCED,
						 parentSessionPid, workerNode->nodeId);
		commandList = lappend(commandList, metadatasyncCommand->data);
	}

	SendCommandListToWorkerOutsideTransactionWithConnection(connection, commandList);
	CloseConnection(connection);
}


/*
 * SetNodeMetadata sets isactive, metadatasynced and hasmetadata flags locally
 * and, if required, remotely.
 */
static void
SetNodeMetadata(MetadataSyncContext *context, bool localOnly)
{
	/* do not execute local transaction if we collect commands */
	if (!MetadataSyncCollectsCommands(context))
	{
		List *updatedActivatedNodeList = NIL;

		WorkerNode *node = NULL;
		foreach_declared_ptr(node, context->activatedWorkerNodeList)
		{
			node = SetWorkerColumnLocalOnly(node, Anum_pg_dist_node_isactive,
											BoolGetDatum(true));
			node = SetWorkerColumnLocalOnly(node, Anum_pg_dist_node_metadatasynced,
											BoolGetDatum(true));
			node = SetWorkerColumnLocalOnly(node, Anum_pg_dist_node_hasmetadata,
											BoolGetDatum(true));

			updatedActivatedNodeList = lappend(updatedActivatedNodeList, node);
		}

		/* reset activated nodes inside metadataSyncContext afer local update */
		SetMetadataSyncNodesFromNodeList(context, updatedActivatedNodeList);
	}

	if (!localOnly && EnableMetadataSync)
	{
		WorkerNode *node = NULL;
		foreach_declared_ptr(node, context->activatedWorkerNodeList)
		{
			SetNodeStateViaMetadataContext(context, node, BoolGetDatum(true));
		}
	}
}


/*
 * ActivateNodeList does some sanity checks and acquire Exclusive lock on pg_dist_node,
 * and then activates the nodes inside given metadataSyncContext.
 *
 * The function operates in 3 different modes according to transactionMode inside
 * metadataSyncContext.
 *
 * 1. MetadataSyncCollectsCommands(context):
 *      Only collect commands instead of sending them to workers,
 * 2. context.transactionMode == METADATA_SYNC_TRANSACTIONAL:
 *      Send all commands using coordinated transaction,
 * 3. context.transactionMode == METADATA_SYNC_NON_TRANSACTIONAL:
 *      Send all commands using bare (no transaction block) connections.
 */
void
ActivateNodeList(MetadataSyncContext *context)
{
	if (context->transactionMode == METADATA_SYNC_NON_TRANSACTIONAL &&
		IsMultiStatementTransaction())
	{
		/*
		 * prevent inside transaction block as we use bare connections which can
		 * lead deadlock
		 */
		ereport(ERROR, (errmsg("do not sync metadata in transaction block "
							   "when the sync mode is nontransactional"),
						errhint("resync after SET citus.metadata_sync_mode "
								"TO 'transactional'")));
	}

	/*
	 * We currently require the object propagation to happen via superuser,
	 * see #5139. While activating a node, we sync both metadata and object
	 * propagation.
	 *
	 * In order to have a fully transactional semantics with add/activate
	 * node operations, we require superuser. Note that for creating
	 * non-owned objects, we already require a superuser connection.
	 * By ensuring the current user to be a superuser, we can guarantee
	 * to send all commands within the same remote transaction.
	 */
	EnsureSuperUser();

	/*
	 * Take an exclusive lock on pg_dist_node to serialize pg_dist_node
	 * changes.
	 */
	LockRelationOid(DistNodeRelationId(), ExclusiveLock);

	/*
	 * Error if there is concurrent change to node table before acquiring
	 * the lock
	 */
	ErrorIfAnyNodeNotExist(context->activatedWorkerNodeList);

	/*
	 * we need to unset metadatasynced flag to false at coordinator in separate
	 * transaction only at nontransactional sync mode and if we do not collect
	 * commands.
	 *
	 * We make sure we set the flag to false at the start of nontransactional
	 * metadata sync to mark those nodes are not synced in case of a failure in
	 * the middle of the sync.
	 */
	if (context->transactionMode == METADATA_SYNC_NON_TRANSACTIONAL &&
		!MetadataSyncCollectsCommands(context))
	{
		MarkNodesNotSyncedInLoopBackConnection(context, MyProcPid);
	}

	/*
	 * Delete existing reference and replicated table placements on the
	 * given groupId if the group has been disabled earlier (e.g., isActive
	 * set to false).
	 */
	SendDeletionCommandsForReplicatedTablePlacements(context);

	/*
	 * SetNodeMetadata sets isactive, metadatasynced and hasmetadata flags
	 * locally for following reasons:
	 *
	 * 1) Set isactive to true locally so that we can find activated nodes amongst
	 *    active workers,
	 * 2) Do not fail just because the current metadata is not synced. (see
	 *    ErrorIfAnyMetadataNodeOutOfSync),
	 * 3) To propagate activated nodes nodemetadata correctly.
	 *
	 * We are going to sync the metadata anyway in this transaction, set
	 * isactive, metadatasynced, and hasmetadata to true locally.
	 * The changes would rollback in case of failure.
	 */
	bool localOnly = true;
	SetNodeMetadata(context, localOnly);

	/*
	 * Update local group ids so that upcoming transactions can see its effect.
	 * Object dependency logic requires to have updated local group id.
	 */
	UpdateLocalGroupIdsViaMetadataContext(context);

	/*
	 * Sync node metadata so that placement insertion does not fail due to
	 * EnsureShardPlacementMetadataIsSane.
	 */
	SyncNodeMetadata(context);

	/*
	 * Sync all dependencies and distributed objects with their pg_dist_xx tables to
	 * metadata nodes inside metadataSyncContext. Depends on node metadata.
	 */
	SyncDistributedObjects(context);

	/*
	 * Let all nodes to be active and synced after all operations succeeded.
	 * we make sure that the metadata sync is idempotent and safe overall with multiple
	 * other transactions, if nontransactional mode is used.
	 *
	 * We already took Exclusive lock on node metadata, which prevents modification
	 * on node metadata on coordinator. The step will rollback, in case of a failure,
	 * to the state where metadatasynced=false.
	 */
	localOnly = false;
	SetNodeMetadata(context, localOnly);
}


/*
 * Acquires shard metadata locks on all shards residing in the given worker node
 *
 * TODO: This function is not compatible with query from any node feature.
 * To ensure proper behavior, it is essential to acquire locks on placements across all nodes
 * rather than limiting it to just the coordinator (or the specific node from which this function is called)
 */
void
LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE lockMode)
{
	List *placementList = AllShardPlacementsOnNodeGroup(workerNode->groupId);
	LockShardsInPlacementListMetadata(placementList, lockMode);
}


/*
 * This function is used to start a background worker to kill backends holding conflicting
 * locks with this backend. It returns NULL if the background worker could not be started.
 */
BackgroundWorkerHandle *
CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown)
{
	BackgroundWorkerHandle *handle = StartLockAcquireHelperBackgroundWorker(MyProcPid,
																			lock_cooldown);
	if (!handle)
	{
		/*
		 * We failed to start a background worker, which probably means that we exceeded
		 * max_worker_processes, and this is unlikely to be resolved by retrying. We do not want
		 * to repeatedly throw an error because if citus_update_node is called to complete a
		 * failover then finishing is the only way to bring the cluster back up. Therefore we
		 * give up on killing other backends and simply wait for the lock. We do set
		 * lock_timeout to lock_cooldown, because we don't want to wait forever to get a lock.
		 */
		SetLockTimeoutLocally(lock_cooldown);
		ereport(WARNING, (errmsg(
							  "could not start background worker to kill backends with conflicting"
							  " locks to force the update. Degrading to acquiring locks "
							  "with a lock time out."),
						  errhint(
							  "Increasing max_worker_processes might help.")));
	}
	return handle;
}


/*
 * This function is used to lock shards in a primary node.
 * If force is true, we start a background worker to kill backends holding
 * conflicting locks with this backend.
 *
 * If the node is a primary node we block reads and writes.
 *
 * This lock has two purposes:
 *
 * - Ensure buggy code in Citus doesn't cause failures when the
 *   nodename/nodeport of a node changes mid-query
 *
 * - Provide fencing during failover, after this function returns all
 *   connections will use the new node location.
 *
 * Drawback:
 *
 * - This function blocks until all previous queries have finished. This
 *   means that long-running queries will prevent failover.
 *
 *   In case of node failure said long-running queries will fail in the end
 *   anyway as they will be unable to commit successfully on the failed
 *   machine. To cause quick failure of these queries use force => true
 *   during the invocation of citus_update_node to terminate conflicting
 *   backends proactively.
 *
 * It might be worth blocking reads to a secondary for the same reasons,
 * though we currently only query secondaries on follower clusters
 * where these locks will have no effect.
 */
BackgroundWorkerHandle *
LockPlacementsWithBackgroundWorkersInPrimaryNode(WorkerNode *workerNode, bool force, int32
												 lock_cooldown)
{
	BackgroundWorkerHandle *handle = NULL;

	if (NodeIsPrimary(workerNode))
	{
		if (force)
		{
			handle = CheckBackgroundWorkerToObtainLocks(lock_cooldown);
		}
		LockShardsInWorkerPlacementList(workerNode, AccessExclusiveLock);
	}
	return handle;
}


/*
 * citus_update_node moves the requested node to a different nodename and nodeport. It
 * locks to ensure no queries are running concurrently; and is intended for customers who
 * are running their own failover solution.
 */
Datum
citus_update_node(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	int32 nodeId = PG_GETARG_INT32(0);

	text *newNodeName = PG_GETARG_TEXT_P(1);
	int32 newNodePort = PG_GETARG_INT32(2);

	/*
	 * force is used when an update needs to happen regardless of conflicting locks. This
	 * feature is important to force the update during a failover due to failure, eg. by
	 * a high-availability system such as pg_auto_failover. The strategy is to start a
	 * background worker that actively cancels backends holding conflicting locks with
	 * this backend.
	 *
	 * Defaults to false
	 */
	bool force = PG_GETARG_BOOL(3);
	int32 lock_cooldown = PG_GETARG_INT32(4);

	char *newNodeNameString = text_to_cstring(newNodeName);

	WorkerNode *workerNodeWithSameAddress = FindWorkerNodeAnyCluster(newNodeNameString,
																	 newNodePort);
	if (workerNodeWithSameAddress != NULL)
	{
		/* a node with the given hostname and port already exists in the metadata */

		if (workerNodeWithSameAddress->nodeId == nodeId)
		{
			/* it's the node itself, meaning this is a noop update */
			PG_RETURN_VOID();
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("there is already another node with the specified "
								   "hostname and port")));
		}
	}

	WorkerNode *workerNode = FindNodeAnyClusterByNodeId(nodeId);
	if (workerNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_NO_DATA_FOUND),
						errmsg("node %u not found", nodeId)));
	}

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (NodeIsSecondary(workerNode))
	{
		EnsureTransactionalMetadataSyncMode();
	}

	BackgroundWorkerHandle *handle = LockPlacementsWithBackgroundWorkersInPrimaryNode(
		workerNode, force,
		lock_cooldown);

	/*
	 * if we have planned statements such as prepared statements, we should clear the cache so that
	 * the planned cache doesn't return the old nodename/nodepost.
	 */
	ResetPlanCache();

	bool localOnly = true;
	UpdateNodeLocation(nodeId, newNodeNameString, newNodePort, localOnly);

	/* we should be able to find the new node from the metadata */
	workerNode = FindWorkerNodeAnyCluster(newNodeNameString, newNodePort);
	Assert(workerNode->nodeId == nodeId);

	/*
	 * Propagate the updated pg_dist_node entry to all metadata workers.
	 * citus-ha uses citus_update_node() in a prepared transaction, and
	 * we don't support coordinated prepared transactions, so we cannot
	 * propagate the changes to the worker nodes here. Instead we mark
	 * all metadata nodes as not-synced and ask maintenanced to do the
	 * propagation.
	 *
	 * It is possible that maintenance daemon does the first resync too
	 * early, but that's fine, since this will start a retry loop with
	 * 5 second intervals until sync is complete.
	 */
	if (UnsetMetadataSyncedForAllWorkers())
	{
		TriggerNodeMetadataSyncOnCommit();
	}

	if (handle != NULL)
	{
		/*
		 * this will be called on memory context cleanup as well, if the worker has been
		 * terminated already this will be a noop
		 */
		TerminateBackgroundWorker(handle);
	}

	TransactionModifiedNodeMetadata = true;

	PG_RETURN_VOID();
}


/*
 * This function is designed to obtain locks for all the shards in a worker placement list.
 * Once the transaction is committed, the acquired locks will be automatically released.
 * Therefore, it is essential to invoke this function within a transaction.
 * This function proves beneficial when there is a need to temporarily disable writes to a specific node within a transaction.
 */
Datum
citus_pause_node_within_txn(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	int32 nodeId = PG_GETARG_INT32(0);
	bool force = PG_GETARG_BOOL(1);
	int32 lock_cooldown = PG_GETARG_INT32(2);

	WorkerNode *workerNode = FindNodeAnyClusterByNodeId(nodeId);
	if (workerNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_NO_DATA_FOUND),
						errmsg("node %u not found", nodeId)));
	}

	LockPlacementsWithBackgroundWorkersInPrimaryNode(workerNode, force, lock_cooldown);

	PG_RETURN_VOID();
}


/*
 * master_update_node is a wrapper function for old UDF name.
 */
Datum
master_update_node(PG_FUNCTION_ARGS)
{
	return citus_update_node(fcinfo);
}


/*
 * SetLockTimeoutLocally sets the lock_timeout to the given value.
 * This setting is local.
 */
static void
SetLockTimeoutLocally(int32 lockCooldown)
{
	set_config_option("lock_timeout", ConvertIntToString(lockCooldown),
					  (superuser() ? PGC_SUSET : PGC_USERSET), PGC_S_SESSION,
					  GUC_ACTION_LOCAL, true, 0, false);
}


static void
UpdateNodeLocation(int32 nodeId, char *newNodeName, int32 newNodePort, bool localOnly)
{
	const bool indexOK = true;

	ScanKeyData scanKey[1];
	Datum values[Natts_pg_dist_node];
	bool isnull[Natts_pg_dist_node];
	bool replace[Natts_pg_dist_node];

	Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);
	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNode);

	ScanKeyInit(&scanKey[0], Anum_pg_dist_node_nodeid,
				BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(nodeId));

	SysScanDesc scanDescriptor = systable_beginscan(pgDistNode, DistNodeNodeIdIndexId(),
													indexOK,
													NULL, 1, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (!HeapTupleIsValid(heapTuple))
	{
		ereport(ERROR, (errmsg("could not find valid entry for node \"%s:%d\"",
							   newNodeName, newNodePort)));
	}

	memset(replace, 0, sizeof(replace));

	values[Anum_pg_dist_node_nodeport - 1] = Int32GetDatum(newNodePort);
	isnull[Anum_pg_dist_node_nodeport - 1] = false;
	replace[Anum_pg_dist_node_nodeport - 1] = true;

	values[Anum_pg_dist_node_nodename - 1] = CStringGetTextDatum(newNodeName);
	isnull[Anum_pg_dist_node_nodename - 1] = false;
	replace[Anum_pg_dist_node_nodename - 1] = true;

	heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull, replace);

	CatalogTupleUpdate(pgDistNode, &heapTuple->t_self, heapTuple);

	CitusInvalidateRelcacheByRelid(DistNodeRelationId());

	CommandCounterIncrement();

	if (!localOnly && EnableMetadataSync)
	{
		WorkerNode *updatedNode = FindWorkerNodeAnyCluster(newNodeName, newNodePort);
		Assert(updatedNode->nodeId == nodeId);

		/* send the delete command to all primary nodes with metadata */
		char *nodeDeleteCommand = NodeDeleteCommand(updatedNode->nodeId);
		SendCommandToWorkersWithMetadata(nodeDeleteCommand);

		/* send the insert command to all primary nodes with metadata */
		char *nodeInsertCommand = NodeListInsertCommand(list_make1(updatedNode));
		SendCommandToWorkersWithMetadata(nodeInsertCommand);
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistNode, NoLock);
}


/*
 * get_shard_id_for_distribution_column function takes a distributed table name and a
 * distribution value then returns shard id of the shard which belongs to given table and
 * contains given value. This function only works for hash distributed tables.
 */
Datum
get_shard_id_for_distribution_column(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	ShardInterval *shardInterval = NULL;

	/*
	 * To have optional parameter as NULL, we defined this UDF as not strict, therefore
	 * we need to check all parameters for NULL values.
	 */
	if (PG_ARGISNULL(0))
	{
		ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
						errmsg("relation cannot be NULL")));
	}

	Oid relationId = PG_GETARG_OID(0);
	EnsureTablePermissions(relationId, ACL_SELECT);

	if (!IsCitusTable(relationId))
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
						errmsg("relation is not distributed")));
	}

	if (!HasDistributionKey(relationId))
	{
		List *shardIntervalList = LoadShardIntervalList(relationId);
		if (shardIntervalList == NIL)
		{
			PG_RETURN_INT64(0);
		}

		shardInterval = (ShardInterval *) linitial(shardIntervalList);
	}
	else if (IsCitusTableType(relationId, HASH_DISTRIBUTED) ||
			 IsCitusTableType(relationId, RANGE_DISTRIBUTED))
	{
		CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);

		/* if given table is not reference table, distributionValue cannot be NULL */
		if (PG_ARGISNULL(1))
		{
			ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
							errmsg("distribution value cannot be NULL for tables other "
								   "than reference tables.")));
		}

		Datum inputDatum = PG_GETARG_DATUM(1);
		Oid inputDataType = get_fn_expr_argtype(fcinfo->flinfo, 1);
		char *distributionValueString = DatumToString(inputDatum, inputDataType);

		Var *distributionColumn = DistPartitionKeyOrError(relationId);
		Oid distributionDataType = distributionColumn->vartype;

		Datum distributionValueDatum = StringToDatum(distributionValueString,
													 distributionDataType);

		shardInterval = FindShardInterval(distributionValueDatum, cacheEntry);
	}
	else
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("finding shard id of given distribution value is only "
							   "supported for hash partitioned tables, range partitioned "
							   "tables and reference tables.")));
	}

	if (shardInterval != NULL)
	{
		PG_RETURN_INT64(shardInterval->shardId);
	}

	PG_RETURN_INT64(0);
}


/*
 * citus_nodename_for_nodeid returns the node name for the node with given node id
 */
Datum
citus_nodename_for_nodeid(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	int nodeId = PG_GETARG_INT32(0);

	WorkerNode *node = FindNodeAnyClusterByNodeId(nodeId);

	if (node == NULL)
	{
		PG_RETURN_NULL();
	}

	PG_RETURN_TEXT_P(cstring_to_text(node->workerName));
}


/*
 * citus_nodeport_for_nodeid returns the node port for the node with given node id
 */
Datum
citus_nodeport_for_nodeid(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	int nodeId = PG_GETARG_INT32(0);

	WorkerNode *node = FindNodeAnyClusterByNodeId(nodeId);

	if (node == NULL)
	{
		PG_RETURN_NULL();
	}

	PG_RETURN_INT32(node->workerPort);
}


/*
 * citus_coordinator_nodeid returns the node id of the coordinator node
 */
Datum
citus_coordinator_nodeid(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	int coordinatorNodeId = FindCoordinatorNodeId();

	if (coordinatorNodeId == -1)
	{
		PG_RETURN_INT32(0);
	}

	PG_RETURN_INT32(coordinatorNodeId);
}


/*
 * citus_is_coordinator returns whether the current node is a coordinator.
 * We consider the node a coordinator if its group ID is 0 and it has
 * pg_dist_node entries (only group ID 0 could indicate a worker without
 * metadata).
 */
Datum
citus_is_coordinator(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	bool isCoordinator = false;

	if (GetLocalGroupId() == COORDINATOR_GROUP_ID &&
		ActiveReadableNodeCount() > 0)
	{
		isCoordinator = true;
	}

	PG_RETURN_BOOL(isCoordinator);
}


/*
 * citus_is_primary_node returns whether the current node is a primary for
 * a given group_id. We consider the node a primary if it has
 * pg_dist_node entries marked as primary
 */
Datum
citus_is_primary_node(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	int32 groupId = GetLocalGroupId();
	WorkerNode *workerNode = PrimaryNodeForGroup(groupId, NULL);
	if (workerNode == NULL)
	{
		ereport(WARNING, (errmsg("could not find the current node in pg_dist_node"),
						  errdetail("If this is the coordinator node, consider adding it "
									"into the metadata by using citus_set_coordinator_host() "
									"UDF. Otherwise, if you're going to use this node as a "
									"worker node for a new cluster, make sure to add this "
									"node into the metadata from the coordinator by using "
									"citus_add_node() UDF.")));
		PG_RETURN_NULL();
	}

	bool isPrimary = workerNode->nodeId == GetLocalNodeId();

	PG_RETURN_BOOL(isPrimary);
}


/*
 * EnsureParentSessionHasExclusiveLockOnPgDistNode ensures given session id
 * holds Exclusive lock on pg_dist_node.
 */
static void
EnsureParentSessionHasExclusiveLockOnPgDistNode(pid_t parentSessionPid)
{
	StringInfo checkIfParentLockCommandStr = makeStringInfo();

	int spiConnectionResult = SPI_connect();
	if (spiConnectionResult != SPI_OK_CONNECT)
	{
		ereport(ERROR, (errmsg("could not connect to SPI manager")));
	}

	char *checkIfParentLockCommand = "SELECT pid FROM pg_locks WHERE "
									 "pid = %d AND database = %d AND relation = %d AND "
									 "mode = 'ExclusiveLock' AND granted = TRUE";
	appendStringInfo(checkIfParentLockCommandStr, checkIfParentLockCommand,
					 parentSessionPid, MyDatabaseId, DistNodeRelationId());

	bool readOnly = true;
	int spiQueryResult = SPI_execute(checkIfParentLockCommandStr->data, readOnly, 0);
	if (spiQueryResult != SPI_OK_SELECT)
	{
		ereport(ERROR, (errmsg("execution was not successful \"%s\"",
							   checkIfParentLockCommandStr->data)));
	}

	bool parentHasExclusiveLock = SPI_processed > 0;

	SPI_finish();

	if (!parentHasExclusiveLock)
	{
		ereport(ERROR, (errmsg("lock is not held by the caller. Unexpected caller "
							   "for citus_internal.mark_node_not_synced")));
	}
}


/*
 * citus_internal_mark_node_not_synced unsets metadatasynced flag in separate connection
 * to localhost. Should only be called by `MarkNodesNotSyncedInLoopBackConnection`.
 * See it for details.
 */
Datum
citus_internal_mark_node_not_synced(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	/* only called by superuser */
	EnsureSuperUser();

	pid_t parentSessionPid = PG_GETARG_INT32(0);

	/* fetch node by id */
	int nodeId = PG_GETARG_INT32(1);
	HeapTuple heapTuple = GetNodeByNodeId(nodeId);

	/* ensure that parent session holds Exclusive lock to pg_dist_node */
	EnsureParentSessionHasExclusiveLockOnPgDistNode(parentSessionPid);

	/*
	 * We made sure parent session holds the ExclusiveLock, so we can unset
	 * metadatasynced for the node safely with the relaxed lock here.
	 */
	Relation pgDistNode = table_open(DistNodeRelationId(), AccessShareLock);
	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNode);

	Datum values[Natts_pg_dist_node];
	bool isnull[Natts_pg_dist_node];
	bool replace[Natts_pg_dist_node];

	memset(replace, 0, sizeof(replace));
	values[Anum_pg_dist_node_metadatasynced - 1] = DatumGetBool(false);
	isnull[Anum_pg_dist_node_metadatasynced - 1] = false;
	replace[Anum_pg_dist_node_metadatasynced - 1] = true;

	heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull, replace);

	CatalogTupleUpdate(pgDistNode, &heapTuple->t_self, heapTuple);

	CitusInvalidateRelcacheByRelid(DistNodeRelationId());
	CommandCounterIncrement();

	table_close(pgDistNode, NoLock);

	PG_RETURN_VOID();
}


/*
 * FindWorkerNode searches over the worker nodes and returns the workerNode
 * if it already exists. Else, the function returns NULL.
 *
 * NOTE: A special case that this handles is when nodeName and nodePort are set
 * to LocalHostName and PostPortNumber. In that case we return the primary node
 * for the local group.
 */
WorkerNode *
FindWorkerNode(const char *nodeName, int32 nodePort)
{
	HTAB *workerNodeHash = GetWorkerNodeHash();
	bool handleFound = false;

	WorkerNode *searchedNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
	strlcpy(searchedNode->workerName, nodeName, WORKER_LENGTH);
	searchedNode->workerPort = nodePort;

	void *hashKey = (void *) searchedNode;
	WorkerNode *cachedWorkerNode = (WorkerNode *) hash_search(workerNodeHash, hashKey,
															  HASH_FIND,
															  &handleFound);
	if (handleFound)
	{
		WorkerNode *workerNode = (WorkerNode *) palloc(sizeof(WorkerNode));
		*workerNode = *cachedWorkerNode;
		return workerNode;
	}

	if (strcmp(LocalHostName, nodeName) == 0 && nodePort == PostPortNumber)
	{
		return PrimaryNodeForGroup(GetLocalGroupId(), NULL);
	}

	return NULL;
}


/*
 * FindWorkerNode searches over the worker nodes and returns the workerNode
 * if it exists otherwise it errors out.
 */
WorkerNode *
FindWorkerNodeOrError(const char *nodeName, int32 nodePort)
{
	WorkerNode *node = FindWorkerNode(nodeName, nodePort);
	if (node == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_NO_DATA_FOUND),
						errmsg("node %s:%d not found", nodeName, nodePort)));
	}
	return node;
}


/*
 * FindWorkerNodeAnyCluster returns the workerNode no matter which cluster it is a part
 * of. FindWorkerNodes, like almost every other function, acts as if nodes in other
 * clusters do not exist.
 */
WorkerNode *
FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort)
{
	WorkerNode *workerNode = NULL;

	Relation pgDistNode = table_open(DistNodeRelationId(), AccessShareLock);
	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNode);

	HeapTuple heapTuple = GetNodeTuple(nodeName, nodePort);
	if (heapTuple != NULL)
	{
		workerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
	}

	table_close(pgDistNode, NoLock);
	return workerNode;
}


/*
 * FindNodeAnyClusterByNodeId searches pg_dist_node and returns the node with
 * the nodeId. If the node can't be found returns NULL.
 */
static WorkerNode *
FindNodeAnyClusterByNodeId(uint32 nodeId)
{
	bool includeNodesFromOtherClusters = true;
	List *nodeList = ReadDistNode(includeNodesFromOtherClusters);
	WorkerNode *node = NULL;

	foreach_declared_ptr(node, nodeList)
	{
		if (node->nodeId == nodeId)
		{
			return node;
		}
	}

	return NULL;
}


/*
 * FindNodeWithNodeId searches pg_dist_node and returns the node with the nodeId.
 * If the node cannot be found this functions errors.
 */
WorkerNode *
FindNodeWithNodeId(int nodeId, bool missingOk)
{
	List *nodeList = ActiveReadableNodeList();
	WorkerNode *node = NULL;

	foreach_declared_ptr(node, nodeList)
	{
		if (node->nodeId == nodeId)
		{
			return node;
		}
	}

	/* there isn't any node with nodeId in pg_dist_node */
	if (!missingOk)
	{
		elog(ERROR, "node with node id %d could not be found", nodeId);
	}

	return NULL;
}


/*
 * FindCoordinatorNodeId returns the node id of the coordinator node
 */
int
FindCoordinatorNodeId()
{
	bool includeNodesFromOtherClusters = false;
	List *nodeList = ReadDistNode(includeNodesFromOtherClusters);
	WorkerNode *node = NULL;

	foreach_declared_ptr(node, nodeList)
	{
		if (NodeIsCoordinator(node))
		{
			return node->nodeId;
		}
	}

	return -1;
}


/*
 * ReadDistNode iterates over pg_dist_node table, converts each row
 * into its memory representation (i.e., WorkerNode) and adds them into
 * a list. Lastly, the list is returned to the caller.
 *
 * It skips nodes which are not in the current clusters unless requested to do otherwise
 * by includeNodesFromOtherClusters.
 */
List *
ReadDistNode(bool includeNodesFromOtherClusters)
{
	ScanKeyData scanKey[1];
	int scanKeyCount = 0;
	List *workerNodeList = NIL;

	Relation pgDistNode = table_open(DistNodeRelationId(), AccessShareLock);

	SysScanDesc scanDescriptor = systable_beginscan(pgDistNode,
													InvalidOid, false,
													NULL, scanKeyCount, scanKey);

	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNode);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	while (HeapTupleIsValid(heapTuple))
	{
		WorkerNode *workerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);

		if (includeNodesFromOtherClusters ||
			strncmp(workerNode->nodeCluster, CurrentCluster, WORKER_LENGTH) == 0)
		{
			/* the coordinator acts as if it never sees nodes not in its cluster */
			workerNodeList = lappend(workerNodeList, workerNode);
		}

		heapTuple = systable_getnext(scanDescriptor);
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistNode, NoLock);

	return workerNodeList;
}


/*
 * RemoveNodeFromCluster removes the provided node from the pg_dist_node table of
 * the master node and all nodes with metadata.
 * The call to the master_remove_node should be done by the super user. If there are
 * active shard placements on the node; the function errors out.
 * This function also deletes all reference table placements belong to the given node from
 * pg_dist_placement, but it does not drop actual placement at the node. It also
 * modifies replication factor of the colocation group of reference tables, so that
 * replication factor will be equal to worker count.
 */
static void
RemoveNodeFromCluster(char *nodeName, int32 nodePort)
{
	WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);

	/*
	 * We do not allow metadata operations on secondary nodes in nontransactional
	 * sync mode.
	 */
	if (NodeIsSecondary(workerNode))
	{
		EnsureTransactionalMetadataSyncMode();
	}

	if (NodeIsPrimary(workerNode))
	{
		ErrorIfNodeContainsNonRemovablePlacements(workerNode);

		/*
		 * Delete reference table placements so they are not taken into account
		 * for the check if there are placements after this.
		 */
		bool localOnly = false;
		DeleteAllReplicatedTablePlacementsFromNodeGroup(workerNode->groupId,
														localOnly);

		/*
		 * Secondary nodes are read-only, never 2PC is used.
		 * Hence, no items can be inserted to pg_dist_transaction
		 * for secondary nodes.
		 */
		DeleteWorkerTransactions(workerNode);
	}

	DeleteNodeRow(workerNode->workerName, nodePort);

	/* make sure we don't have any lingering session lifespan connections */
	CloseNodeConnectionsAfterTransaction(workerNode->workerName, nodePort);

	if (EnableMetadataSync)
	{
		char *nodeDeleteCommand = NodeDeleteCommand(workerNode->nodeId);

		SendCommandToWorkersWithMetadata(nodeDeleteCommand);
	}
}


/*
 * ErrorIfNodeContainsNonRemovablePlacements throws an error if the input node
 * contains at least one placement on the node that is the last active
 * placement.
 */
static void
ErrorIfNodeContainsNonRemovablePlacements(WorkerNode *workerNode)
{
	int32 groupId = workerNode->groupId;
	List *shardPlacements = AllShardPlacementsOnNodeGroup(groupId);

	/* sort the list to prevent regression tests getting flaky */
	shardPlacements = SortList(shardPlacements, CompareGroupShardPlacements);

	GroupShardPlacement *placement = NULL;
	foreach_declared_ptr(placement, shardPlacements)
	{
		if (!PlacementHasActivePlacementOnAnotherGroup(placement))
		{
			Oid relationId = RelationIdForShard(placement->shardId);
			char *qualifiedRelationName = generate_qualified_relation_name(relationId);

			ereport(ERROR, (errmsg("cannot remove or disable the node "
								   "%s:%d because because it contains "
								   "the only shard placement for "
								   "shard " UINT64_FORMAT,
								   workerNode->workerName,
								   workerNode->workerPort, placement->shardId),
							errdetail("One of the table(s) that prevents the operation "
									  "complete successfully is %s",
									  qualifiedRelationName),
							errhint("To proceed, either drop the tables or use "
									"undistribute_table() function to convert "
									"them to local tables")));
		}
	}
}


/*
 * PlacementHasActivePlacementOnAnotherGroup returns true if there is at least
 * one more active placement of the input sourcePlacement on another group.
 */
static bool
PlacementHasActivePlacementOnAnotherGroup(GroupShardPlacement *sourcePlacement)
{
	uint64 shardId = sourcePlacement->shardId;
	List *activePlacementList = ActiveShardPlacementList(shardId);

	bool foundActivePlacementOnAnotherGroup = false;
	ShardPlacement *activePlacement = NULL;
	foreach_declared_ptr(activePlacement, activePlacementList)
	{
		if (activePlacement->groupId != sourcePlacement->groupId)
		{
			foundActivePlacementOnAnotherGroup = true;
			break;
		}
	}

	return foundActivePlacementOnAnotherGroup;
}


/* CountPrimariesWithMetadata returns the number of primary nodes which have metadata. */
uint32
CountPrimariesWithMetadata(void)
{
	uint32 primariesWithMetadata = 0;
	WorkerNode *workerNode = NULL;

	HASH_SEQ_STATUS status;
	HTAB *workerNodeHash = GetWorkerNodeHash();

	hash_seq_init(&status, workerNodeHash);

	while ((workerNode = hash_seq_search(&status)) != NULL)
	{
		if (workerNode->hasMetadata && NodeIsPrimary(workerNode))
		{
			primariesWithMetadata++;
		}
	}

	return primariesWithMetadata;
}


/*
 * AddNodeMetadata checks the given node information and adds the specified node to the
 * pg_dist_node table of the master and workers with metadata.
 * If the node already exists, the function returns the id of the node.
 * If not, the following procedure is followed while adding a node: If the groupId is not
 * explicitly given by the user, the function picks the group that the new node should
 * be in with respect to GroupSize. Then, the new node is inserted into the local
 * pg_dist_node as well as the nodes with hasmetadata=true if localOnly is false.
 */
static int
AddNodeMetadata(char *nodeName, int32 nodePort, NodeMetadata *nodeMetadata,
				bool *nodeAlreadyExists, bool localOnly)
{
	EnsureCoordinator();

	*nodeAlreadyExists = false;

	WorkerNode *workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
	if (workerNode != NULL)
	{
		/* return early without holding locks when the node already exists */
		*nodeAlreadyExists = true;

		return workerNode->nodeId;
	}

	/*
	 * We are going to change pg_dist_node, prevent any concurrent reads that
	 * are not tolerant to concurrent node addition by taking an exclusive
	 * lock (conflicts with all but AccessShareLock).
	 *
	 * We may want to relax or have more fine-grained locking in the future
	 * to allow users to add multiple nodes concurrently.
	 */
	LockRelationOid(DistNodeRelationId(), ExclusiveLock);

	/* recheck in case 2 node additions pass the first check concurrently */
	workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
	if (workerNode != NULL)
	{
		*nodeAlreadyExists = true;

		return workerNode->nodeId;
	}

	if (nodeMetadata->groupId != COORDINATOR_GROUP_ID &&
		strcmp(nodeName, "localhost") != 0)
	{
		/*
		 * User tries to add a worker with a non-localhost address. If the coordinator
		 * is added with "localhost" as well, the worker won't be able to connect.
		 */

		bool isCoordinatorInMetadata = false;
		WorkerNode *coordinatorNode = PrimaryNodeForGroup(COORDINATOR_GROUP_ID,
														  &isCoordinatorInMetadata);
		if (isCoordinatorInMetadata &&
			strcmp(coordinatorNode->workerName, "localhost") == 0)
		{
			ereport(ERROR, (errmsg("cannot add a worker node when the coordinator "
								   "hostname is set to localhost"),
							errdetail("Worker nodes need to be able to connect to the "
									  "coordinator to transfer data."),
							errhint("Use SELECT citus_set_coordinator_host('<hostname>') "
									"to configure the coordinator hostname")));
		}
	}

	/*
	 * When adding the first worker when the coordinator has shard placements,
	 * print a notice on how to drain the coordinator.
	 */
	if (nodeMetadata->groupId != COORDINATOR_GROUP_ID && CoordinatorAddedAsWorkerNode() &&
		ActivePrimaryNonCoordinatorNodeCount() == 0 &&
		NodeGroupHasShardPlacements(COORDINATOR_GROUP_ID))
	{
		WorkerNode *coordinator = CoordinatorNodeIfAddedAsWorkerOrError();

		ereport(NOTICE, (errmsg("shards are still on the coordinator after adding the "
								"new node"),
						 errhint("Use SELECT rebalance_table_shards(); to balance "
								 "shards data between workers and coordinator or "
								 "SELECT citus_drain_node(%s,%d); to permanently "
								 "move shards away from the coordinator.",
								 quote_literal_cstr(coordinator->workerName),
								 coordinator->workerPort)));
	}

	/* user lets Citus to decide on the group that the newly added node should be in */
	if (nodeMetadata->groupId == INVALID_GROUP_ID)
	{
		nodeMetadata->groupId = GetNextGroupId();
	}

	if (nodeMetadata->groupId == COORDINATOR_GROUP_ID)
	{
		/*
		 * Coordinator has always the authoritative metadata, reflect this
		 * fact in the pg_dist_node.
		 */
		nodeMetadata->hasMetadata = true;
		nodeMetadata->metadataSynced = true;

		/*
		 * There is no concept of "inactive" coordinator, so hard code it.
		 */
		nodeMetadata->isActive = true;
	}

	/* if nodeRole hasn't been added yet there's a constraint for one-node-per-group */
	if (nodeMetadata->nodeRole != InvalidOid && nodeMetadata->nodeRole ==
		PrimaryNodeRoleId())
	{
		WorkerNode *existingPrimaryNode = PrimaryNodeForGroup(nodeMetadata->groupId,
															  NULL);

		if (existingPrimaryNode != NULL)
		{
			ereport(ERROR, (errmsg("group %d already has a primary node",
								   nodeMetadata->groupId)));
		}
	}

	if (nodeMetadata->nodeRole == PrimaryNodeRoleId())
	{
		if (strncmp(nodeMetadata->nodeCluster,
					WORKER_DEFAULT_CLUSTER,
					WORKER_LENGTH) != 0)
		{
			ereport(ERROR, (errmsg("primaries must be added to the default cluster")));
		}
	}

	/* generate the new node id from the sequence */
	int nextNodeIdInt = GetNextNodeId();

	InsertNodeRow(nextNodeIdInt, nodeName, nodePort, nodeMetadata);

	workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);

	if (EnableMetadataSync && !localOnly)
	{
		/* send the delete command to all primary nodes with metadata */
		char *nodeDeleteCommand = NodeDeleteCommand(workerNode->nodeId);
		SendCommandToWorkersWithMetadata(nodeDeleteCommand);

		/* finally prepare the insert command and send it to all primary nodes */
		uint32 primariesWithMetadata = CountPrimariesWithMetadata();
		if (primariesWithMetadata != 0)
		{
			List *workerNodeList = list_make1(workerNode);
			char *nodeInsertCommand = NodeListInsertCommand(workerNodeList);

			SendCommandToWorkersWithMetadata(nodeInsertCommand);
		}
	}

	return workerNode->nodeId;
}


/*
 * AddNodeMetadataViaMetadataContext does the same thing as AddNodeMetadata but
 * make use of metadata sync context to send commands to workers to support both
 * transactional and nontransactional sync modes.
 */
static int
AddNodeMetadataViaMetadataContext(char *nodeName, int32 nodePort,
								  NodeMetadata *nodeMetadata, bool *nodeAlreadyExists)
{
	bool localOnly = true;
	int nodeId = AddNodeMetadata(nodeName, nodePort, nodeMetadata, nodeAlreadyExists,
								 localOnly);

	/* do nothing as the node already exists */
	if (*nodeAlreadyExists)
	{
		return nodeId;
	}

	/*
	 * Create metadata sync context that is used throughout node addition
	 * and activation if necessary.
	 */
	WorkerNode *node = ModifiableWorkerNode(nodeName, nodePort);

	/* we should always set active flag to true if we call citus_add_node */
	node = SetWorkerColumnLocalOnly(node, Anum_pg_dist_node_isactive, DatumGetBool(true));

	/*
	 * After adding new node, if the node did not already exist, we will activate
	 * the node.
	 * If the worker is not marked as a coordinator, check that
	 * the node is not trying to add itself
	 */
	if (node != NULL &&
		node->groupId != COORDINATOR_GROUP_ID &&
		node->nodeRole != SecondaryNodeRoleId() &&
		IsWorkerTheCurrentNode(node))
	{
		ereport(ERROR, (errmsg("Node cannot add itself as a worker."),
						errhint(
							"Add the node as a coordinator by using: "
							"SELECT citus_set_coordinator_host('%s', %d);",
							node->workerName, node->workerPort)));
	}

	List *nodeList = list_make1(node);
	bool collectCommands = false;
	bool nodesAddedInSameTransaction = true;
	MetadataSyncContext *context = CreateMetadataSyncContext(nodeList, collectCommands,
															 nodesAddedInSameTransaction);

	if (EnableMetadataSync)
	{
		/* send the delete command to all primary nodes with metadata */
		char *nodeDeleteCommand = NodeDeleteCommand(node->nodeId);
		SendOrCollectCommandListToMetadataNodes(context, list_make1(nodeDeleteCommand));

		/* finally prepare the insert command and send it to all primary nodes */
		uint32 primariesWithMetadata = CountPrimariesWithMetadata();
		if (primariesWithMetadata != 0)
		{
			char *nodeInsertCommand = NULL;
			if (context->transactionMode == METADATA_SYNC_TRANSACTIONAL)
			{
				nodeInsertCommand = NodeListInsertCommand(nodeList);
			}
			else if (context->transactionMode == METADATA_SYNC_NON_TRANSACTIONAL)
			{
				/*
				 * We need to ensure node insertion is idempotent in nontransactional
				 * sync mode.
				 */
				nodeInsertCommand = NodeListIdempotentInsertCommand(nodeList);
			}
			Assert(nodeInsertCommand != NULL);
			SendOrCollectCommandListToMetadataNodes(context,
													list_make1(nodeInsertCommand));
		}
	}

	ActivateNodeList(context);

	return nodeId;
}


/*
 * SetWorkerColumn function sets the column with the specified index
 * on the worker in pg_dist_node, by calling SetWorkerColumnLocalOnly.
 * It also sends the same command for node update to other metadata nodes.
 * If anything fails during the transaction, we rollback it.
 * Returns the new worker node after the modification.
 */
WorkerNode *
SetWorkerColumn(WorkerNode *workerNode, int columnIndex, Datum value)
{
	workerNode = SetWorkerColumnLocalOnly(workerNode, columnIndex, value);

	if (EnableMetadataSync)
	{
		char *metadataSyncCommand =
			GetMetadataSyncCommandToSetNodeColumn(workerNode, columnIndex, value);

		SendCommandToWorkersWithMetadata(metadataSyncCommand);
	}

	return workerNode;
}


/*
 * SetNodeStateViaMetadataContext sets or unsets isactive, metadatasynced, and hasmetadata
 * flags via metadataSyncContext.
 */
static void
SetNodeStateViaMetadataContext(MetadataSyncContext *context, WorkerNode *workerNode,
							   Datum value)
{
	char *isActiveCommand =
		GetMetadataSyncCommandToSetNodeColumn(workerNode, Anum_pg_dist_node_isactive,
											  value);
	char *metadatasyncedCommand =
		GetMetadataSyncCommandToSetNodeColumn(workerNode,
											  Anum_pg_dist_node_metadatasynced, value);
	char *hasmetadataCommand =
		GetMetadataSyncCommandToSetNodeColumn(workerNode, Anum_pg_dist_node_hasmetadata,
											  value);
	List *commandList = list_make3(isActiveCommand, metadatasyncedCommand,
								   hasmetadataCommand);

	SendOrCollectCommandListToMetadataNodes(context, commandList);
}


/*
 * SetWorkerColumnOptional function sets the column with the specified index
 * on the worker in pg_dist_node, by calling SetWorkerColumnLocalOnly.
 * It also sends the same command optionally for node update to other metadata nodes,
 * meaning that failures are ignored. Returns the new worker node after the modification.
 */
WorkerNode *
SetWorkerColumnOptional(WorkerNode *workerNode, int columnIndex, Datum value)
{
	char *metadataSyncCommand = GetMetadataSyncCommandToSetNodeColumn(workerNode,
																	  columnIndex,
																	  value);

	List *workerNodeList = TargetWorkerSetNodeList(NON_COORDINATOR_METADATA_NODES,
												   ShareLock);

	/* open connections in parallel */
	WorkerNode *worker = NULL;
	foreach_declared_ptr(worker, workerNodeList)
	{
		bool success = SendOptionalMetadataCommandListToWorkerInCoordinatedTransaction(
			worker->workerName, worker->workerPort,
			CurrentUserName(),
			list_make1(metadataSyncCommand));

		if (!success)
		{
			/* metadata out of sync, mark the worker as not synced */
			ereport(WARNING, (errmsg("Updating the metadata of the node %s:%d "
									 "is failed on node %s:%d. "
									 "Metadata on %s:%d is marked as out of sync.",
									 workerNode->workerName, workerNode->workerPort,
									 worker->workerName, worker->workerPort,
									 worker->workerName, worker->workerPort)));

			SetWorkerColumnLocalOnly(worker, Anum_pg_dist_node_metadatasynced,
									 BoolGetDatum(false));
		}
		else if (workerNode->nodeId == worker->nodeId)
		{
			/*
			 * If this is the node we want to update and it is updated succesfully,
			 * then we can safely update the flag on the coordinator as well.
			 */
			SetWorkerColumnLocalOnly(workerNode, columnIndex, value);
		}
	}

	return FindWorkerNode(workerNode->workerName, workerNode->workerPort);
}


/*
 * SetWorkerColumnLocalOnly function sets the column with the specified index
 * (see pg_dist_node.h) on the worker in pg_dist_node.
 * It returns the new worker node after the modification.
 */
WorkerNode *
SetWorkerColumnLocalOnly(WorkerNode *workerNode, int columnIndex, Datum value)
{
	Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);
	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNode);
	HeapTuple heapTuple = GetNodeTuple(workerNode->workerName, workerNode->workerPort);

	Datum values[Natts_pg_dist_node];
	bool isnull[Natts_pg_dist_node];
	bool replace[Natts_pg_dist_node];

	if (heapTuple == NULL)
	{
		ereport(ERROR, (errmsg("could not find valid entry for node \"%s:%d\"",
							   workerNode->workerName, workerNode->workerPort)));
	}

	memset(replace, 0, sizeof(replace));
	values[columnIndex - 1] = value;
	isnull[columnIndex - 1] = false;
	replace[columnIndex - 1] = true;

	heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull, replace);

	CatalogTupleUpdate(pgDistNode, &heapTuple->t_self, heapTuple);

	CitusInvalidateRelcacheByRelid(DistNodeRelationId());
	CommandCounterIncrement();

	WorkerNode *newWorkerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);

	table_close(pgDistNode, NoLock);

	return newWorkerNode;
}


/*
 * GetMetadataSyncCommandToSetNodeColumn checks if the given workerNode and value is
 * valid or not. Then it returns the necessary metadata sync command as a string.
 */
static char *
GetMetadataSyncCommandToSetNodeColumn(WorkerNode *workerNode, int columnIndex, Datum
									  value)
{
	char *metadataSyncCommand = NULL;

	switch (columnIndex)
	{
		case Anum_pg_dist_node_hasmetadata:
		{
			ErrorIfCoordinatorMetadataSetFalse(workerNode, value, "hasmetadata");
			metadataSyncCommand = NodeHasmetadataUpdateCommand(workerNode->nodeId,
															   DatumGetBool(value));
			break;
		}

		case Anum_pg_dist_node_isactive:
		{
			ErrorIfCoordinatorMetadataSetFalse(workerNode, value, "isactive");

			metadataSyncCommand = NodeStateUpdateCommand(workerNode->nodeId,
														 DatumGetBool(value));
			break;
		}

		case Anum_pg_dist_node_shouldhaveshards:
		{
			metadataSyncCommand = ShouldHaveShardsUpdateCommand(workerNode->nodeId,
																DatumGetBool(value));
			break;
		}

		case Anum_pg_dist_node_metadatasynced:
		{
			ErrorIfCoordinatorMetadataSetFalse(workerNode, value, "metadatasynced");
			metadataSyncCommand = NodeMetadataSyncedUpdateCommand(workerNode->nodeId,
																  DatumGetBool(value));
			break;
		}

		default:
		{
			ereport(ERROR, (errmsg("could not find valid entry for node \"%s:%d\"",
								   workerNode->workerName, workerNode->workerPort)));
		}
	}

	return metadataSyncCommand;
}


/*
 * NodeHasmetadataUpdateCommand generates and returns a SQL UPDATE command
 * that updates the hasmetada column of pg_dist_node, for the given nodeid.
 */
static char *
NodeHasmetadataUpdateCommand(uint32 nodeId, bool hasMetadata)
{
	StringInfo updateCommand = makeStringInfo();
	char *hasMetadataString = hasMetadata ? "TRUE" : "FALSE";
	appendStringInfo(updateCommand,
					 "UPDATE pg_dist_node SET hasmetadata = %s "
					 "WHERE nodeid = %u",
					 hasMetadataString, nodeId);
	return updateCommand->data;
}


/*
 * NodeMetadataSyncedUpdateCommand generates and returns a SQL UPDATE command
 * that updates the metadataSynced column of pg_dist_node, for the given nodeid.
 */
static char *
NodeMetadataSyncedUpdateCommand(uint32 nodeId, bool metadataSynced)
{
	StringInfo updateCommand = makeStringInfo();
	char *hasMetadataString = metadataSynced ? "TRUE" : "FALSE";
	appendStringInfo(updateCommand,
					 "UPDATE pg_dist_node SET metadatasynced = %s "
					 "WHERE nodeid = %u",
					 hasMetadataString, nodeId);
	return updateCommand->data;
}


/*
 * ErrorIfCoordinatorMetadataSetFalse throws an error if the input node
 * is the coordinator and the value is false.
 */
static void
ErrorIfCoordinatorMetadataSetFalse(WorkerNode *workerNode, Datum value, char *field)
{
	bool valueBool = DatumGetBool(value);
	if (!valueBool && workerNode->groupId == COORDINATOR_GROUP_ID)
	{
		ereport(ERROR, (errmsg("cannot change \"%s\" field of the "
							   "coordinator node",
							   field)));
	}
}


/*
 * SetShouldHaveShards function sets the shouldhaveshards column of the
 * specified worker in pg_dist_node. also propagates this to other metadata nodes.
 * It returns the new worker node after the modification.
 */
static WorkerNode *
SetShouldHaveShards(WorkerNode *workerNode, bool shouldHaveShards)
{
	return SetWorkerColumn(workerNode, Anum_pg_dist_node_shouldhaveshards, BoolGetDatum(
							   shouldHaveShards));
}


/*
 * GetNodeTuple function returns the heap tuple of given nodeName and nodePort. If the
 * node is not found this function returns NULL.
 *
 * This function may return worker nodes from other clusters.
 */
static HeapTuple
GetNodeTuple(const char *nodeName, int32 nodePort)
{
	Relation pgDistNode = table_open(DistNodeRelationId(), AccessShareLock);
	const int scanKeyCount = 2;
	const bool indexOK = false;

	ScanKeyData scanKey[2];
	HeapTuple nodeTuple = NULL;

	ScanKeyInit(&scanKey[0], Anum_pg_dist_node_nodename,
				BTEqualStrategyNumber, F_TEXTEQ, CStringGetTextDatum(nodeName));
	ScanKeyInit(&scanKey[1], Anum_pg_dist_node_nodeport,
				BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(nodePort));
	SysScanDesc scanDescriptor = systable_beginscan(pgDistNode, InvalidOid, indexOK,
													NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(heapTuple))
	{
		nodeTuple = heap_copytuple(heapTuple);
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistNode, NoLock);

	return nodeTuple;
}


/*
 * GetNodeByNodeId returns the heap tuple for given node id by looking up catalog.
 */
static HeapTuple
GetNodeByNodeId(int32 nodeId)
{
	Relation pgDistNode = table_open(DistNodeRelationId(), AccessShareLock);
	const int scanKeyCount = 1;
	const bool indexOK = false;

	ScanKeyData scanKey[1];
	HeapTuple nodeTuple = NULL;

	ScanKeyInit(&scanKey[0], Anum_pg_dist_node_nodeid,
				BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(nodeId));
	SysScanDesc scanDescriptor = systable_beginscan(pgDistNode, InvalidOid, indexOK,
													NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(heapTuple))
	{
		nodeTuple = heap_copytuple(heapTuple);
	}
	else
	{
		ereport(ERROR, (errmsg("could not find valid entry for node id %d", nodeId)));
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistNode, NoLock);

	return nodeTuple;
}


/*
 * GetNextGroupId allocates and returns a unique groupId for the group
 * to be created. This allocation occurs both in shared memory and in write
 * ahead logs; writing to logs avoids the risk of having groupId collisions.
 *
 * Please note that the caller is still responsible for finalizing node data
 * and the groupId with the master node. Further note that this function relies
 * on an internal sequence created in initdb to generate unique identifiers.
 */
int32
GetNextGroupId()
{
	text *sequenceName = cstring_to_text(GROUPID_SEQUENCE_NAME);
	Oid sequenceId = ResolveRelationId(sequenceName, false);
	Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
	Oid savedUserId = InvalidOid;
	int savedSecurityContext = 0;

	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
	SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);

	/* generate new and unique shardId from sequence */
	Datum groupIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);

	SetUserIdAndSecContext(savedUserId, savedSecurityContext);

	int32 groupId = DatumGetInt32(groupIdDatum);

	return groupId;
}


/*
 * GetNextNodeId allocates and returns a unique nodeId for the node
 * to be added. This allocation occurs both in shared memory and in write
 * ahead logs; writing to logs avoids the risk of having nodeId collisions.
 *
 * Please note that the caller is still responsible for finalizing node data
 * and the nodeId with the master node. Further note that this function relies
 * on an internal sequence created in initdb to generate unique identifiers.
 */
int
GetNextNodeId()
{
	text *sequenceName = cstring_to_text(NODEID_SEQUENCE_NAME);
	Oid sequenceId = ResolveRelationId(sequenceName, false);
	Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
	Oid savedUserId = InvalidOid;
	int savedSecurityContext = 0;

	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
	SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);

	/* generate new and unique shardId from sequence */
	Datum nextNodeIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);

	SetUserIdAndSecContext(savedUserId, savedSecurityContext);

	int nextNodeId = DatumGetUInt32(nextNodeIdDatum);

	return nextNodeId;
}


/*
 * EnsureCoordinator checks if the current node is the coordinator. If it does not,
 * the function errors out.
 */
void
EnsureCoordinator(void)
{
	int32 localGroupId = GetLocalGroupId();

	if (localGroupId != 0)
	{
		ereport(ERROR, (errmsg("operation is not allowed on this node"),
						errhint("Connect to the coordinator and run it again.")));
	}
}


/*
 * EnsurePropagationToCoordinator checks whether the coordinator is added to the
 * metadata if we're not on the coordinator.
 *
 * Given that metadata syncing skips syncing metadata to the coordinator, we need
 * too make sure that the coordinator is added to the metadata before propagating
 * a command from a worker. For this reason, today we use this only for the commands
 * that we support propagating from workers.
 */
void
EnsurePropagationToCoordinator(void)
{
	if (!IsCoordinator())
	{
		EnsureCoordinatorIsInMetadata();
	}
}


/*
 * EnsureCoordinatorIsInMetadata checks whether the coordinator is added to the
 * metadata, which is required for many operations.
 */
void
EnsureCoordinatorIsInMetadata(void)
{
	bool isCoordinatorInMetadata = false;
	PrimaryNodeForGroup(COORDINATOR_GROUP_ID, &isCoordinatorInMetadata);
	if (isCoordinatorInMetadata)
	{
		return;
	}

	/* be more descriptive when we're not on coordinator */
	if (IsCoordinator())
	{
		ereport(ERROR, (errmsg("coordinator is not added to the metadata"),
						errhint("Use SELECT citus_set_coordinator_host('<hostname>') "
								"to configure the coordinator hostname")));
	}
	else
	{
		ereport(ERROR, (errmsg("coordinator is not added to the metadata"),
						errhint("Use SELECT citus_set_coordinator_host('<hostname>') "
								"on coordinator to configure the coordinator hostname")));
	}
}


/*
 * InsertCoordinatorIfClusterEmpty can be used to ensure Citus tables can be
 * created even on a node that has just performed CREATE EXTENSION citus;
 */
void
InsertCoordinatorIfClusterEmpty(void)
{
	/* prevent concurrent node additions */
	Relation pgDistNode = table_open(DistNodeRelationId(), RowShareLock);

	if (!HasAnyNodes())
	{
		/*
		 * create_distributed_table being called for the first time and there are
		 * no pg_dist_node records. Add a record for the coordinator.
		 */
		InsertPlaceholderCoordinatorRecord();
	}

	/*
	 * We release the lock, if InsertPlaceholderCoordinatorRecord was called
	 * we already have a strong (RowExclusive) lock.
	 */
	table_close(pgDistNode, RowShareLock);
}


/*
 * InsertPlaceholderCoordinatorRecord inserts a placeholder record for the coordinator
 * to be able to create distributed tables on a single node.
 */
static void
InsertPlaceholderCoordinatorRecord(void)
{
	NodeMetadata nodeMetadata = DefaultNodeMetadata();
	nodeMetadata.groupId = 0;
	nodeMetadata.shouldHaveShards = true;
	nodeMetadata.nodeRole = PrimaryNodeRoleId();
	nodeMetadata.nodeCluster = "default";

	bool nodeAlreadyExists = false;
	bool localOnly = false;

	/* as long as there is a single node, localhost should be ok */
	AddNodeMetadata(LocalHostName, PostPortNumber, &nodeMetadata, &nodeAlreadyExists,
					localOnly);
}


/*
 * InsertNodeRow opens the node system catalog, and inserts a new row with the
 * given values into that system catalog.
 *
 * NOTE: If you call this function you probably need to have taken a
 * ShareRowExclusiveLock then checked that you're not adding a second primary to
 * an existing group. If you don't it's possible for the metadata to become inconsistent.
 */
static void
InsertNodeRow(int nodeid, char *nodeName, int32 nodePort, NodeMetadata *nodeMetadata)
{
	Datum values[Natts_pg_dist_node];
	bool isNulls[Natts_pg_dist_node];

	Datum nodeClusterStringDatum = CStringGetDatum(nodeMetadata->nodeCluster);
	Datum nodeClusterNameDatum = DirectFunctionCall1(namein, nodeClusterStringDatum);

	/* form new shard tuple */
	memset(values, 0, sizeof(values));
	memset(isNulls, false, sizeof(isNulls));

	values[Anum_pg_dist_node_nodeid - 1] = UInt32GetDatum(nodeid);
	values[Anum_pg_dist_node_groupid - 1] = Int32GetDatum(nodeMetadata->groupId);
	values[Anum_pg_dist_node_nodename - 1] = CStringGetTextDatum(nodeName);
	values[Anum_pg_dist_node_nodeport - 1] = UInt32GetDatum(nodePort);
	values[Anum_pg_dist_node_noderack - 1] = CStringGetTextDatum(nodeMetadata->nodeRack);
	values[Anum_pg_dist_node_hasmetadata - 1] = BoolGetDatum(nodeMetadata->hasMetadata);
	values[Anum_pg_dist_node_metadatasynced - 1] = BoolGetDatum(
		nodeMetadata->metadataSynced);
	values[Anum_pg_dist_node_isactive - 1] = BoolGetDatum(nodeMetadata->isActive);
	values[Anum_pg_dist_node_noderole - 1] = ObjectIdGetDatum(nodeMetadata->nodeRole);
	values[Anum_pg_dist_node_nodecluster - 1] = nodeClusterNameDatum;
	values[Anum_pg_dist_node_shouldhaveshards - 1] = BoolGetDatum(
		nodeMetadata->shouldHaveShards);

	Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);

	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNode);
	HeapTuple heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);

	CatalogTupleInsert(pgDistNode, heapTuple);

	CitusInvalidateRelcacheByRelid(DistNodeRelationId());

	/* increment the counter so that next command can see the row */
	CommandCounterIncrement();

	/* close relation */
	table_close(pgDistNode, NoLock);
}


/*
 * DeleteNodeRow removes the requested row from pg_dist_node table if it exists.
 */
static void
DeleteNodeRow(char *nodeName, int32 nodePort)
{
	const int scanKeyCount = 2;
	bool indexOK = false;

	ScanKeyData scanKey[2];
	Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);

	/*
	 * simple_heap_delete() expects that the caller has at least an
	 * AccessShareLock on primary key index.
	 *
	 * XXX: This does not seem required, do we really need to acquire this lock?
	 * Postgres doesn't acquire such locks on indexes before deleting catalog tuples.
	 * Linking here the reasons we added this lock acquirement:
	 * https://github.com/citusdata/citus/pull/2851#discussion_r306569462
	 * https://github.com/citusdata/citus/pull/2855#discussion_r313628554
	 * https://github.com/citusdata/citus/issues/1890
	 */
	Relation replicaIndex = index_open(RelationGetPrimaryKeyIndex(pgDistNode),
									   AccessShareLock);

	ScanKeyInit(&scanKey[0], Anum_pg_dist_node_nodename,
				BTEqualStrategyNumber, F_TEXTEQ, CStringGetTextDatum(nodeName));
	ScanKeyInit(&scanKey[1], Anum_pg_dist_node_nodeport,
				BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(nodePort));

	SysScanDesc heapScan = systable_beginscan(pgDistNode, InvalidOid, indexOK,
											  NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(heapScan);

	if (!HeapTupleIsValid(heapTuple))
	{
		ereport(ERROR, (errmsg("could not find valid entry for node \"%s:%d\"",
							   nodeName, nodePort)));
	}

	simple_heap_delete(pgDistNode, &(heapTuple->t_self));

	systable_endscan(heapScan);

	/* ensure future commands don't use the node we just removed */
	CitusInvalidateRelcacheByRelid(DistNodeRelationId());

	/* increment the counter so that next command won't see the row */
	CommandCounterIncrement();

	table_close(replicaIndex, AccessShareLock);
	table_close(pgDistNode, NoLock);
}


/*
 * TupleToWorkerNode takes in a heap tuple from pg_dist_node, and
 * converts this tuple to an equivalent struct in memory. The function assumes
 * the caller already has locks on the tuple, and doesn't perform any locking.
 */
static WorkerNode *
TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple)
{
	Datum datumArray[Natts_pg_dist_node];
	bool isNullArray[Natts_pg_dist_node];

	Assert(!HeapTupleHasNulls(heapTuple));

	/*
	 * This function can be called before "ALTER TABLE ... ADD COLUMN nodecluster ...",
	 * therefore heap_deform_tuple() won't set the isNullArray for this column. We
	 * initialize it true to be safe in that case.
	 */
	memset(isNullArray, true, sizeof(isNullArray));

	/*
	 * We use heap_deform_tuple() instead of heap_getattr() to expand tuple
	 * to contain missing values when ALTER TABLE ADD COLUMN happens.
	 */
	heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);

	char *nodeName = TextDatumGetCString(datumArray[Anum_pg_dist_node_nodename - 1]);
	char *nodeRack = TextDatumGetCString(datumArray[Anum_pg_dist_node_noderack - 1]);

	WorkerNode *workerNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
	workerNode->nodeId = DatumGetUInt32(datumArray[Anum_pg_dist_node_nodeid - 1]);
	workerNode->workerPort = DatumGetUInt32(datumArray[Anum_pg_dist_node_nodeport - 1]);
	workerNode->groupId = DatumGetInt32(datumArray[Anum_pg_dist_node_groupid - 1]);
	strlcpy(workerNode->workerName, nodeName, WORKER_LENGTH);
	strlcpy(workerNode->workerRack, nodeRack, WORKER_LENGTH);
	workerNode->hasMetadata = DatumGetBool(datumArray[Anum_pg_dist_node_hasmetadata - 1]);
	workerNode->metadataSynced =
		DatumGetBool(datumArray[Anum_pg_dist_node_metadatasynced - 1]);
	workerNode->isActive = DatumGetBool(datumArray[Anum_pg_dist_node_isactive - 1]);
	workerNode->nodeRole = DatumGetObjectId(datumArray[Anum_pg_dist_node_noderole - 1]);
	workerNode->shouldHaveShards = DatumGetBool(
		datumArray[Anum_pg_dist_node_shouldhaveshards -
				   1]);

	/*
	 * nodecluster column can be missing. In the case of extension creation/upgrade,
	 * master_initialize_node_metadata function is called before the nodecluster
	 * column is added to pg_dist_node table.
	 */
	if (!isNullArray[Anum_pg_dist_node_nodecluster - 1])
	{
		Name nodeClusterName =
			DatumGetName(datumArray[Anum_pg_dist_node_nodecluster - 1]);
		char *nodeClusterString = NameStr(*nodeClusterName);
		strlcpy(workerNode->nodeCluster, nodeClusterString, NAMEDATALEN);
	}

	return workerNode;
}


/*
 * StringToDatum transforms a string representation into a Datum.
 */
Datum
StringToDatum(char *inputString, Oid dataType)
{
	Oid typIoFunc = InvalidOid;
	Oid typIoParam = InvalidOid;
	int32 typeModifier = -1;

	getTypeInputInfo(dataType, &typIoFunc, &typIoParam);
	getBaseTypeAndTypmod(dataType, &typeModifier);

	Datum datum = OidInputFunctionCall(typIoFunc, inputString, typIoParam, typeModifier);

	return datum;
}


/*
 * DatumToString returns the string representation of the given datum.
 */
char *
DatumToString(Datum datum, Oid dataType)
{
	Oid typIoFunc = InvalidOid;
	bool typIsVarlena = false;

	getTypeOutputInfo(dataType, &typIoFunc, &typIsVarlena);
	char *outputString = OidOutputFunctionCall(typIoFunc, datum);

	return outputString;
}


/*
 * UnsetMetadataSyncedForAllWorkers sets the metadatasynced column of all metadata
 * worker nodes to false. It returns true if it updated at least a node.
 */
static bool
UnsetMetadataSyncedForAllWorkers(void)
{
	bool updatedAtLeastOne = false;
	ScanKeyData scanKey[3];
	int scanKeyCount = 3;
	bool indexOK = false;

	/*
	 * Concurrent citus_update_node() calls might iterate and try to update
	 * pg_dist_node in different orders. To protect against deadlock, we
	 * get an exclusive lock here.
	 */
	Relation relation = table_open(DistNodeRelationId(), ExclusiveLock);
	TupleDesc tupleDescriptor = RelationGetDescr(relation);
	ScanKeyInit(&scanKey[0], Anum_pg_dist_node_hasmetadata,
				BTEqualStrategyNumber, F_BOOLEQ, BoolGetDatum(true));
	ScanKeyInit(&scanKey[1], Anum_pg_dist_node_metadatasynced,
				BTEqualStrategyNumber, F_BOOLEQ, BoolGetDatum(true));

	/* coordinator always has the up to date metadata */
	ScanKeyInit(&scanKey[2], Anum_pg_dist_node_groupid,
				BTGreaterStrategyNumber, F_INT4GT,
				Int32GetDatum(COORDINATOR_GROUP_ID));

	CatalogIndexState indstate = CatalogOpenIndexes(relation);

	SysScanDesc scanDescriptor = systable_beginscan(relation,
													InvalidOid, indexOK,
													NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(heapTuple))
	{
		updatedAtLeastOne = true;
	}

	while (HeapTupleIsValid(heapTuple))
	{
		Datum values[Natts_pg_dist_node];
		bool isnull[Natts_pg_dist_node];
		bool replace[Natts_pg_dist_node];

		memset(replace, false, sizeof(replace));
		memset(isnull, false, sizeof(isnull));
		memset(values, 0, sizeof(values));

		values[Anum_pg_dist_node_metadatasynced - 1] = BoolGetDatum(false);
		replace[Anum_pg_dist_node_metadatasynced - 1] = true;

		HeapTuple newHeapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values,
												   isnull,
												   replace);

		CatalogTupleUpdateWithInfo(relation, &newHeapTuple->t_self, newHeapTuple,
								   indstate);

		CommandCounterIncrement();

		heap_freetuple(newHeapTuple);

		heapTuple = systable_getnext(scanDescriptor);
	}

	systable_endscan(scanDescriptor);
	CatalogCloseIndexes(indstate);
	table_close(relation, NoLock);

	return updatedAtLeastOne;
}


/*
 * ErrorIfAnyNodeNotExist errors if any node in given list not found.
 */
static void
ErrorIfAnyNodeNotExist(List *nodeList)
{
	WorkerNode *node = NULL;
	foreach_declared_ptr(node, nodeList)
	{
		/*
		 * First, locally mark the node is active, if everything goes well,
		 * we are going to sync this information to all the metadata nodes.
		 */
		WorkerNode *workerNode =
			FindWorkerNodeAnyCluster(node->workerName, node->workerPort);
		if (workerNode == NULL)
		{
			ereport(ERROR, (errmsg("node at \"%s:%u\" does not exist", node->workerName,
								   node->workerPort)));
		}
	}
}


/*
 * UpdateLocalGroupIdsViaMetadataContext updates local group ids for given list
 * of nodes with transactional or nontransactional mode according to transactionMode
 * inside metadataSyncContext.
 */
static void
UpdateLocalGroupIdsViaMetadataContext(MetadataSyncContext *context)
{
	int activatedPrimaryCount = list_length(context->activatedWorkerNodeList);
	int nodeIdx = 0;
	for (nodeIdx = 0; nodeIdx < activatedPrimaryCount; nodeIdx++)
	{
		WorkerNode *node = list_nth(context->activatedWorkerNodeList, nodeIdx);
		List *commandList = list_make1(LocalGroupIdUpdateCommand(node->groupId));

		/* send commands to new workers, the current user should be a superuser */
		Assert(superuser());

		SendOrCollectCommandListToSingleNode(context, commandList, nodeIdx);
	}
}


/*
 * SendDeletionCommandsForReplicatedTablePlacements sends commands to delete replicated
 * placement for the metadata nodes with transactional or nontransactional mode according
 * to transactionMode inside metadataSyncContext.
 */
static void
SendDeletionCommandsForReplicatedTablePlacements(MetadataSyncContext *context)
{
	WorkerNode *node = NULL;
	foreach_declared_ptr(node, context->activatedWorkerNodeList)
	{
		if (!node->isActive)
		{
			bool localOnly = false;
			int32 groupId = node->groupId;
			DeleteAllReplicatedTablePlacementsFromNodeGroupViaMetadataContext(context,
																			  groupId,
																			  localOnly);
		}
	}
}


/*
 * SyncNodeMetadata syncs node metadata with transactional or nontransactional
 * mode according to transactionMode inside metadataSyncContext.
 */
static void
SyncNodeMetadata(MetadataSyncContext *context)
{
	CheckCitusVersion(ERROR);

	if (!EnableMetadataSync)
	{
		return;
	}

	/*
	 * Do not fail when we call this method from activate_node_snapshot
	 * from workers.
	 */
	if (!MetadataSyncCollectsCommands(context))
	{
		EnsureCoordinator();
	}

	EnsureModificationsCanRun();
	EnsureSequentialModeMetadataOperations();

	LockRelationOid(DistNodeRelationId(), ExclusiveLock);

	/* generate the queries which drop the node metadata */
	List *dropMetadataCommandList = NodeMetadataDropCommands();

	/* generate the queries which create the node metadata from scratch */
	List *createMetadataCommandList = NodeMetadataCreateCommands();

	List *recreateNodeSnapshotCommandList = dropMetadataCommandList;
	recreateNodeSnapshotCommandList = list_concat(recreateNodeSnapshotCommandList,
												  createMetadataCommandList);

	/*
	 * We should have already added node metadata to metadata workers. Sync node
	 * metadata just for activated workers.
	 */
	SendOrCollectCommandListToActivatedNodes(context, recreateNodeSnapshotCommandList);
}