/*-------------------------------------------------------------------------
 *
 * multi_copy.c
 *     This file contains the implementation of the COPY utility for
 *     distributed tables.
 *
 * The CitusCopyFrom function should be called from the utility hook to process
 * COPY ... FROM commands on distributed tables. CitusCopyFrom parses the input
 * from stdin, a program, or a file, and decides to copy new rows to existing
 * shards or new shards based on the partition method of the distributed table.
 *
 * If the copy is run on a worker node, CitusCopyFrom calls CopyFromWorkerNode,
 * which parses the master node copy options and handles communication with the
 * master node.
 *
 * It opens a new connection for every shard placement and uses the PQputCopyData
 * function to copy the data. Because PQputCopyData transmits data asynchronously,
 * the workers will ingest data at least partially in parallel.
 *
 * For hash-partitioned tables, if it fails to connect to a worker, the master
 * marks the placement for which it was trying to open a connection as inactive,
 * similar to the way DML statements are handled. If a failure occurs after
 * connecting, the transaction is rolled back on all the workers. Note that,
 * in the case of append-partitioned tables, if a failure occurs, metadata
 * changes are immediately rolled back on the master node, but shard placements
 * are left on the worker nodes.
 *
 * By default, COPY uses normal transactions on the workers. In the case of
 * hash- or range-partitioned tables, this can cause a problem when some of the
 * transactions fail to commit while others have succeeded. To ensure no data
 * is lost, COPY can use two-phase commit, by increasing max_prepared_transactions
 * on the workers and setting citus.multi_shard_commit_protocol to '2pc'. The
 * default is '1pc'. This is not a problem for append-partitioned tables because
 * new shards are created and, in the case of failure, metadata changes are
 * rolled back on the master node.
 *
 * Parsing options are processed and enforced on the node where the copy command
 * is run, while constraints are enforced on the worker. In either case,
 * failure causes the whole COPY to roll back.
 *
 * Copyright (c) 2016, Citus Data, Inc.
 *
 * With contributions from Postgres Professional.
 *
 *-------------------------------------------------------------------------
 */

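/*
 * Minimal illustration of the two-phase commit setup described above (the
 * table and file names are hypothetical placeholders, not part of this file):
 * after raising max_prepared_transactions on every worker, a client session
 * would run roughly:
 *
 *   SET citus.multi_shard_commit_protocol TO '2pc';
 *   COPY customers FROM '/path/to/customers.csv' WITH (FORMAT csv);
 */
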
#include "postgres.h"
|
|
#include "libpq-fe.h"
|
|
#include "libpq/libpq.h"
|
|
#include "libpq/pqformat.h"
|
|
#include "miscadmin.h"
|
|
|
|
#include <arpa/inet.h> /* for htons */
|
|
#include <netinet/in.h> /* for htons */
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/epoll.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/types.h>
|
|
#include <unistd.h>
|
|
#include <zmq.h>
|
|
|
|
#include "access/htup_details.h"
|
|
#include "access/htup.h"
|
|
#include "access/nbtree.h"
|
|
#include "access/sdir.h"
|
|
#include "catalog/namespace.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "commands/copy.h"
|
|
#include "commands/defrem.h"
|
|
#include "distributed/bload.h"
|
|
#include "distributed/colocation_utils.h"
|
|
#include "distributed/master_protocol.h"
|
|
#include "distributed/metadata_cache.h"
|
|
#include "distributed/multi_copy.h"
|
|
#include "distributed/multi_physical_planner.h"
|
|
#include "distributed/multi_shard_transaction.h"
|
|
#include "distributed/placement_connection.h"
|
|
#include "distributed/pg_dist_shard.h"
|
|
#include "distributed/remote_commands.h"
|
|
#include "distributed/resource_lock.h"
|
|
#include "distributed/worker_protocol.h"
|
|
#include "executor/executor.h"
|
|
#include "tsearch/ts_locale.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/typcache.h"
|
|
|
|
|
|
/* constant used in binary protocol */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";

/* use a global connection to the master node in order to skip passing it around */
static MultiConnection *masterConnection = NULL;

static int MaxEvents = 64;          /* up to MaxEvents returned by epoll_wait() */
static int EpollTimeout = 100;      /* wait for a maximum time of EpollTimeout ms */
static int ZeromqPortCount = 100;   /* number of available zeromq ports */
static int ZeromqStartPort = 10240; /* start port of zeromq */

/* Local functions forward declarations */
static void CopyFromWorkerNode(CopyStmt *copyStatement, char *completionTag);
static void CopyToExistingShards(CopyStmt *copyStatement, char *completionTag, Oid relationId);
static void CopyToNewShards(CopyStmt *copyStatement, char *completionTag, Oid relationId);
static char MasterPartitionMethod(RangeVar *relation);
static void RemoveMasterOptions(CopyStmt *copyStatement);
static void OpenCopyConnections(CopyStmt *copyStatement,
								ShardConnections *shardConnections, bool stopOnFailure,
								bool useBinaryCopyFormat);

static bool CanUseBinaryCopyFormat(TupleDesc tupleDescription,
								   CopyOutState rowOutputState);
static List * MasterShardPlacementList(uint64 shardId);
static List * RemoteFinalizedShardPlacementList(uint64 shardId);

static void SendCopyBinaryHeaders(CopyOutState copyOutState, int64 shardId,
								  List *connectionList);
static void SendCopyBinaryFooters(CopyOutState copyOutState, int64 shardId,
								  List *connectionList);

static StringInfo ConstructCopyStatement(CopyStmt *copyStatement, int64 shardId,
										 bool useBinaryCopyFormat, bool useFreeze);
static void SendCopyDataToAll(StringInfo dataBuffer, int64 shardId, List *connectionList);
static void SendCopyDataToPlacement(StringInfo dataBuffer, int64 shardId,
									MultiConnection *connection);
static void EndRemoteCopy(int64 shardId, List *connectionList, bool stopOnFailure);
static void ReportCopyError(MultiConnection *connection, PGresult *result);
static uint32 AvailableColumnCount(TupleDesc tupleDescriptor);
static int64 StartCopyToNewShard(ShardConnections *shardConnections,
								 CopyStmt *copyStatement, bool useBinaryCopyFormat);
static int64 MasterCreateEmptyShard(char *relationName);
static int64 CreateEmptyShard(char *relationName);
static int64 RemoteCreateEmptyShard(char *relationName);
static void MasterUpdateShardStatistics(uint64 shardId);
static void RemoteUpdateShardStatistics(uint64 shardId);

/* Private functions copied and adapted from copy.c in PostgreSQL */
static void CopySendData(CopyOutState outputState, const void *databuf, int datasize);
static void CopySendString(CopyOutState outputState, const char *str);
static void CopySendChar(CopyOutState outputState, char c);
static void CopySendInt32(CopyOutState outputState, int32 val);
static void CopySendInt16(CopyOutState outputState, int16 val);
static void CopyAttributeOutText(CopyOutState outputState, char *string);
static inline void CopyFlushOutput(CopyOutState outputState, char *start, char *pointer);

/* Functions for bulkload copy */
static void RemoveBulkloadOptions(CopyStmt *copyStatement);

static StringInfo ConstructBulkloadCopyStmt(CopyStmt *copyStatement,
											NodeAddress *masterNodeAddress,
											char *nodeName, uint32 nodePort);
static void RebuildBulkloadCopyStatement(CopyStmt *copyStatement,
										 NodeAddress *bulkloadServer);
static StringInfo DeparseCopyStatementOptions(List *options);

static NodeAddress * LocalAddress(void);
static NodeAddress * BulkloadServerAddress(CopyStmt *copyStatement);

static void BulkloadCopyToNewShards(CopyStmt *copyStatement, char *completionTag,
									Oid relationId);
static void BulkloadCopyToExistingShards(CopyStmt *copyStatement, char *completionTag,
										 Oid relationId);
static void BulkloadCopyServer(CopyStmt *copyStatement, char *completionTag,
							   NodeAddress *masterNodeAddress, Oid relationId);

static List * MasterWorkerNodeList(void);
static List * RemoteWorkerNodeList(void);
static DistTableCacheEntry * MasterDistributedTableCacheEntry(RangeVar *relation);

static void StartZeroMQServer(ZeroMQServer *zeromqServer, bool is_program, bool binary,
							  int natts);
static void SendMessage(ZeroMQServer *zeromqServer, char *buf, size_t len, bool kill);
static void StopZeroMQServer(ZeroMQServer *zeromqServer);

static int CopyGetAttnums(Oid relationId, List *attnamelist);
static PGconn * GetConnectionBySock(List *connList, int sock, int *connIdx);


/*
 * CitusCopyFrom implements the COPY table_name FROM. It dispatches the copy
 * statement to related subfunctions based on where the copy command is run
 * and the partition method of the distributed table.
 */
void
CitusCopyFrom(CopyStmt *copyStatement, char *completionTag)
{
	bool isCopyFromWorker = false;
	bool isBulkloadCopy = false;

	BeginOrContinueCoordinatedTransaction();
	if (MultiShardCommitProtocol == COMMIT_PROTOCOL_2PC)
	{
		CoordinatedTransactionUse2PC();
	}

	/* disallow COPY to/from file or program except for superusers */
	if (copyStatement->filename != NULL && !superuser())
	{
		if (copyStatement->is_program)
		{
			ereport(ERROR,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("must be superuser to COPY to or from an external program"),
					 errhint("Anyone can COPY to stdout or from stdin. "
							 "psql's \\copy command also works for anyone.")));
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("must be superuser to COPY to or from a file"),
					 errhint("Anyone can COPY to stdout or from stdin. "
							 "psql's \\copy command also works for anyone.")));
		}
	}

	masterConnection = NULL; /* reset, might still be set after error */
	isBulkloadCopy = IsBulkloadCopy(copyStatement);
	isCopyFromWorker = IsCopyFromWorker(copyStatement);
	if (isBulkloadCopy)
	{
		CitusBulkloadCopy(copyStatement, completionTag);
	}
	else if (isCopyFromWorker)
	{
		CopyFromWorkerNode(copyStatement, completionTag);
	}
	else
	{
		Oid relationId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
		char partitionMethod = PartitionMethod(relationId);

		if (partitionMethod == DISTRIBUTE_BY_HASH || partitionMethod ==
			DISTRIBUTE_BY_RANGE || partitionMethod == DISTRIBUTE_BY_NONE)
		{
			CopyToExistingShards(copyStatement, completionTag, InvalidOid);
		}
		else if (partitionMethod == DISTRIBUTE_BY_APPEND)
		{
			CopyToNewShards(copyStatement, completionTag, relationId);
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("unsupported partition method")));
		}
	}

	XactModificationLevel = XACT_MODIFICATION_DATA;
}


/*
 * IsCopyFromWorker checks if the given copy statement has the master host option.
 */
bool
IsCopyFromWorker(CopyStmt *copyStatement)
{
	ListCell *optionCell = NULL;
	foreach(optionCell, copyStatement->options)
	{
		DefElem *defel = (DefElem *) lfirst(optionCell);
		if (strncmp(defel->defname, "master_host", NAMEDATALEN) == 0)
		{
			return true;
		}
	}

	return false;
}


/*
 * CopyFromWorkerNode implements the COPY table_name FROM ... from worker nodes
 * for append-partitioned tables.
 */
static void
CopyFromWorkerNode(CopyStmt *copyStatement, char *completionTag)
{
	NodeAddress *masterNodeAddress = MasterNodeAddress(copyStatement);
	char *nodeName = masterNodeAddress->nodeName;
	int32 nodePort = masterNodeAddress->nodePort;
	Oid relationId = InvalidOid;
	char partitionMethod = 0;
	char *schemaName = NULL;
	uint32 connectionFlags = FOR_DML;

	masterConnection = GetNodeConnection(connectionFlags, nodeName, nodePort);
	ClaimConnectionExclusively(masterConnection);

	RemoteTransactionBeginIfNecessary(masterConnection);

	/* strip schema name for local reference */
	schemaName = copyStatement->relation->schemaname;
	copyStatement->relation->schemaname = NULL;

	relationId = RangeVarGetRelid(copyStatement->relation, NoLock, false);

	/* put schema name back */
	copyStatement->relation->schemaname = schemaName;
	partitionMethod = MasterPartitionMethod(copyStatement->relation);
	if (partitionMethod != DISTRIBUTE_BY_APPEND)
	{
		ereport(ERROR, (errmsg("copy from worker nodes is only supported "
							   "for append-partitioned tables")));
	}

	/*
	 * Remove master node options from the copy statement because they are not
	 * recognized by PostgreSQL machinery.
	 */
	RemoveMasterOptions(copyStatement);

	CopyToNewShards(copyStatement, completionTag, relationId);

	UnclaimConnection(masterConnection);
	masterConnection = NULL;
}


/*
 * CopyToExistingShards implements the COPY table_name FROM ... for hash- or
 * range-partitioned tables where there are already shards into which to copy
 * rows.
 */
static void
CopyToExistingShards(CopyStmt *copyStatement, char *completionTag, Oid relationId)
{
	Oid tableId = InvalidOid;
	if (relationId != InvalidOid)
	{
		tableId = relationId;
	}
	else
	{
		tableId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
	}
	char *relationName = get_rel_name(tableId);
	Relation distributedRelation = NULL;
	TupleDesc tupleDescriptor = NULL;
	uint32 columnCount = 0;
	Datum *columnValues = NULL;
	bool *columnNulls = NULL;
	FmgrInfo *hashFunction = NULL;
	FmgrInfo *compareFunction = NULL;
	bool hasUniformHashDistribution = false;
	DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(tableId);
	const char *delimiterCharacter = "\t";
	const char *nullPrintCharacter = "\\N";

	int shardCount = 0;
	List *shardIntervalList = NULL;
	ShardInterval **shardIntervalCache = NULL;
	bool useBinarySearch = false;

	HTAB *shardConnectionHash = NULL;
	ShardConnections *shardConnections = NULL;
	List *shardConnectionsList = NIL;
	ListCell *shardConnectionsCell = NULL;

	EState *executorState = NULL;
	MemoryContext executorTupleContext = NULL;
	ExprContext *executorExpressionContext = NULL;

	CopyState copyState = NULL;
	CopyOutState copyOutState = NULL;
	FmgrInfo *columnOutputFunctions = NULL;
	uint64 processedRowCount = 0;

	Var *partitionColumn = PartitionColumn(tableId, 0);
	char partitionMethod = PartitionMethod(tableId);

	ErrorContextCallback errorCallback;

	/* get hash function for partition column */
	hashFunction = cacheEntry->hashFunction;

	/* get compare function for shard intervals */
	compareFunction = cacheEntry->shardIntervalCompareFunction;

	/* allocate column values and nulls arrays */
	distributedRelation = heap_open(tableId, RowExclusiveLock);
	tupleDescriptor = RelationGetDescr(distributedRelation);
	columnCount = tupleDescriptor->natts;
	columnValues = palloc0(columnCount * sizeof(Datum));
	columnNulls = palloc0(columnCount * sizeof(bool));

	/* we don't support copy to reference tables from workers */
	if (partitionMethod == DISTRIBUTE_BY_NONE)
	{
		EnsureCoordinator();
	}

	/* load the list of shards and verify that we have shards to copy into */
	shardIntervalList = LoadShardIntervalList(tableId);
	if (shardIntervalList == NIL)
	{
		if (partitionMethod == DISTRIBUTE_BY_HASH)
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find any shards into which to copy"),
							errdetail("No shards exist for distributed table \"%s\".",
									  relationName),
							errhint("Run master_create_worker_shards to create shards "
									"and try again.")));
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find any shards into which to copy"),
							errdetail("No shards exist for distributed table \"%s\".",
									  relationName)));
		}
	}

	/* error if any shard missing min/max values for non reference tables */
	if (partitionMethod != DISTRIBUTE_BY_NONE &&
		cacheEntry->hasUninitializedShardInterval)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("could not start copy"),
						errdetail("Distributed relation \"%s\" has shards "
								  "with missing shardminvalue/shardmaxvalue.",
								  relationName)));
	}

	/* prevent concurrent placement changes and non-commutative DML statements */
	LockShardListMetadata(shardIntervalList, ShareLock);
	LockShardListResources(shardIntervalList, ShareLock);

	/* initialize the shard interval cache */
	shardCount = cacheEntry->shardIntervalArrayLength;
	shardIntervalCache = cacheEntry->sortedShardIntervalArray;
	hasUniformHashDistribution = cacheEntry->hasUniformHashDistribution;

	/* determine whether to use binary search */
	if (partitionMethod != DISTRIBUTE_BY_HASH || !hasUniformHashDistribution)
	{
		useBinarySearch = true;
	}

	if (cacheEntry->replicationModel == REPLICATION_MODEL_2PC)
	{
		CoordinatedTransactionUse2PC();
	}

	/* initialize copy state to read from COPY data source */
	copyState = BeginCopyFrom(distributedRelation,
							  copyStatement->filename,
							  copyStatement->is_program,
							  copyStatement->attlist,
							  copyStatement->options);

	executorState = CreateExecutorState();
	executorTupleContext = GetPerTupleMemoryContext(executorState);
	executorExpressionContext = GetPerTupleExprContext(executorState);

	copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData));
	copyOutState->delim = (char *) delimiterCharacter;
	copyOutState->null_print = (char *) nullPrintCharacter;
	copyOutState->null_print_client = (char *) nullPrintCharacter;
	copyOutState->binary = CanUseBinaryCopyFormat(tupleDescriptor, copyOutState);
	copyOutState->fe_msgbuf = makeStringInfo();
	copyOutState->rowcontext = executorTupleContext;

	columnOutputFunctions = ColumnOutputFunctions(tupleDescriptor, copyOutState->binary);

	/* create a mapping of shard id to a connection for each of its placements */
	shardConnectionHash = CreateShardConnectionHash(TopTransactionContext);

	/* set up callback to identify error line number */
	errorCallback.callback = CopyFromErrorCallback;
	errorCallback.arg = (void *) copyState;
	errorCallback.previous = error_context_stack;
	error_context_stack = &errorCallback;

	while (true)
	{
		bool nextRowFound = false;
		Datum partitionColumnValue = 0;
		ShardInterval *shardInterval = NULL;
		int64 shardId = 0;
		bool shardConnectionsFound = false;
		MemoryContext oldContext = NULL;

		ResetPerTupleExprContext(executorState);

		oldContext = MemoryContextSwitchTo(executorTupleContext);

		/* parse a row from the input */
		nextRowFound = NextCopyFrom(copyState, executorExpressionContext,
									columnValues, columnNulls, NULL);

		if (!nextRowFound)
		{
			MemoryContextSwitchTo(oldContext);
			break;
		}

		CHECK_FOR_INTERRUPTS();

		/*
		 * Find the partition column value and corresponding shard interval
		 * for non-reference tables.
		 * Get the existing (and only) shard interval for reference tables.
		 * Note that reference tables have NULL partition column values, so
		 * skip the check for them.
		 */
		if (partitionColumn != NULL)
		{
			if (columnNulls[partitionColumn->varattno - 1])
			{
				ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
								errmsg("cannot copy row with NULL value "
									   "in partition column")));
			}

			partitionColumnValue = columnValues[partitionColumn->varattno - 1];
		}

		/*
		 * Find the shard interval and id for the partition column value for
		 * non-reference tables.
		 * For a reference table, this function blindly returns the table's
		 * single shard.
		 */
		shardInterval = FindShardInterval(partitionColumnValue,
										  shardIntervalCache,
										  shardCount, partitionMethod,
										  compareFunction, hashFunction,
										  useBinarySearch);

		if (shardInterval == NULL)
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find shard for partition column "
								   "value")));
		}

		shardId = shardInterval->shardId;

		MemoryContextSwitchTo(oldContext);

		/* get existing connections to the shard placements, if any */
		shardConnections = GetShardHashConnections(shardConnectionHash, shardId,
												   &shardConnectionsFound);
		if (!shardConnectionsFound)
		{
			bool stopOnFailure = false;

			if (cacheEntry->partitionMethod == DISTRIBUTE_BY_NONE)
			{
				stopOnFailure = true;
			}

			/* open connections and initiate COPY on shard placements */
			OpenCopyConnections(copyStatement, shardConnections, stopOnFailure,
								copyOutState->binary);

			/* send copy binary headers to shard placements */
			if (copyOutState->binary)
			{
				SendCopyBinaryHeaders(copyOutState, shardId,
									  shardConnections->connectionList);
			}
		}

		/* replicate row to shard placements */
		resetStringInfo(copyOutState->fe_msgbuf);
		AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
						  copyOutState, columnOutputFunctions);
		SendCopyDataToAll(copyOutState->fe_msgbuf, shardId,
						  shardConnections->connectionList);

		processedRowCount += 1;
	}

	/* all lines have been copied, stop showing line number in errors */
	error_context_stack = errorCallback.previous;

	shardConnectionsList = ShardConnectionList(shardConnectionHash);
	foreach(shardConnectionsCell, shardConnectionsList)
	{
		ShardConnections *shardConnections = (ShardConnections *) lfirst(
			shardConnectionsCell);

		/* send copy binary footers to all shard placements */
		if (copyOutState->binary)
		{
			SendCopyBinaryFooters(copyOutState, shardConnections->shardId,
								  shardConnections->connectionList);
		}

		/* close the COPY input on all shard placements */
		EndRemoteCopy(shardConnections->shardId, shardConnections->connectionList, true);
	}

	EndCopyFrom(copyState);
	heap_close(distributedRelation, NoLock);

	/* mark failed placements as inactive */
	MarkFailedShardPlacements();

	CHECK_FOR_INTERRUPTS();

	if (completionTag != NULL)
	{
		snprintf(completionTag, COMPLETION_TAG_BUFSIZE,
				 "COPY " UINT64_FORMAT, processedRowCount);
	}
}


/*
 * CopyToNewShards implements the COPY table_name FROM ... for append-partitioned
 * tables where we create new shards into which to copy rows.
 */
static void
CopyToNewShards(CopyStmt *copyStatement, char *completionTag, Oid relationId)
{
	FmgrInfo *columnOutputFunctions = NULL;

	/* allocate column values and nulls arrays */
	Relation distributedRelation = heap_open(relationId, RowExclusiveLock);
	TupleDesc tupleDescriptor = RelationGetDescr(distributedRelation);
	uint32 columnCount = tupleDescriptor->natts;
	Datum *columnValues = palloc0(columnCount * sizeof(Datum));
	bool *columnNulls = palloc0(columnCount * sizeof(bool));

	EState *executorState = CreateExecutorState();
	MemoryContext executorTupleContext = GetPerTupleMemoryContext(executorState);
	ExprContext *executorExpressionContext = GetPerTupleExprContext(executorState);

	const char *delimiterCharacter = "\t";
	const char *nullPrintCharacter = "\\N";

	ErrorContextCallback errorCallback;

	int64 currentShardId = INVALID_SHARD_ID;
	uint64 shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;
	uint64 copiedDataSizeInBytes = 0;
	uint64 processedRowCount = 0;

	ShardConnections *shardConnections =
		(ShardConnections *) palloc0(sizeof(ShardConnections));

	/* initialize copy state to read from COPY data source */
	CopyState copyState = BeginCopyFrom(distributedRelation,
										copyStatement->filename,
										copyStatement->is_program,
										copyStatement->attlist,
										copyStatement->options);

	CopyOutState copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData));
	copyOutState->delim = (char *) delimiterCharacter;
	copyOutState->null_print = (char *) nullPrintCharacter;
	copyOutState->null_print_client = (char *) nullPrintCharacter;
	copyOutState->binary = CanUseBinaryCopyFormat(tupleDescriptor, copyOutState);
	copyOutState->fe_msgbuf = makeStringInfo();
	copyOutState->rowcontext = executorTupleContext;

	columnOutputFunctions = ColumnOutputFunctions(tupleDescriptor, copyOutState->binary);

	/* set up callback to identify error line number */
	errorCallback.callback = CopyFromErrorCallback;
	errorCallback.arg = (void *) copyState;
	errorCallback.previous = error_context_stack;

	while (true)
	{
		bool nextRowFound = false;
		MemoryContext oldContext = NULL;
		uint64 messageBufferSize = 0;

		ResetPerTupleExprContext(executorState);

		/* switch to tuple memory context and start showing line number in errors */
		error_context_stack = &errorCallback;
		oldContext = MemoryContextSwitchTo(executorTupleContext);

		/* parse a row from the input */
		nextRowFound = NextCopyFrom(copyState, executorExpressionContext,
									columnValues, columnNulls, NULL);

		if (!nextRowFound)
		{
			/* switch to regular memory context and stop showing line number in errors */
			MemoryContextSwitchTo(oldContext);
			error_context_stack = errorCallback.previous;
			break;
		}

		CHECK_FOR_INTERRUPTS();

		/* switch to regular memory context and stop showing line number in errors */
		MemoryContextSwitchTo(oldContext);
		error_context_stack = errorCallback.previous;

		/*
		 * If copied data size is zero, this means either this is the first
		 * line in the copy or we just filled the previous shard up to its
		 * capacity. Either way, we need to create a new shard and
		 * start copying new rows into it.
		 */
		if (copiedDataSizeInBytes == 0)
		{
			/* create shard and open connections to shard placements */
			currentShardId = StartCopyToNewShard(shardConnections, copyStatement,
												 copyOutState->binary);

			/* send copy binary headers to shard placements */
			if (copyOutState->binary)
			{
				SendCopyBinaryHeaders(copyOutState, currentShardId,
									  shardConnections->connectionList);
			}
		}

		/* replicate row to shard placements */
		resetStringInfo(copyOutState->fe_msgbuf);
		AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
						  copyOutState, columnOutputFunctions);
		SendCopyDataToAll(copyOutState->fe_msgbuf, currentShardId,
						  shardConnections->connectionList);

		messageBufferSize = copyOutState->fe_msgbuf->len;
		copiedDataSizeInBytes = copiedDataSizeInBytes + messageBufferSize;

		/*
		 * If we filled up this shard to its capacity, send copy binary footers
		 * to shard placements, and update shard statistics.
		 */
		if (copiedDataSizeInBytes > shardMaxSizeInBytes)
		{
			Assert(currentShardId != INVALID_SHARD_ID);

			if (copyOutState->binary)
			{
				SendCopyBinaryFooters(copyOutState, currentShardId,
									  shardConnections->connectionList);
			}

			EndRemoteCopy(currentShardId, shardConnections->connectionList, true);
			MasterUpdateShardStatistics(shardConnections->shardId);

			copiedDataSizeInBytes = 0;
			currentShardId = INVALID_SHARD_ID;
		}

		processedRowCount += 1;
	}

	/*
	 * For the last shard, send copy binary footers to shard placements,
	 * and update shard statistics. If no row was sent, there is no shard
	 * for which to finalize the copy command.
	 */
	if (copiedDataSizeInBytes > 0)
	{
		Assert(currentShardId != INVALID_SHARD_ID);

		if (copyOutState->binary)
		{
			SendCopyBinaryFooters(copyOutState, currentShardId,
								  shardConnections->connectionList);
		}
		EndRemoteCopy(currentShardId, shardConnections->connectionList, true);
		MasterUpdateShardStatistics(shardConnections->shardId);
	}

	EndCopyFrom(copyState);
	heap_close(distributedRelation, NoLock);

	/* check for cancellation one last time before returning */
	CHECK_FOR_INTERRUPTS();

	if (completionTag != NULL)
	{
		snprintf(completionTag, COMPLETION_TAG_BUFSIZE,
				 "COPY " UINT64_FORMAT, processedRowCount);
	}
}


/*
 * MasterNodeAddress gets the master node address from copy options and returns
 * it. Note that if the master_port is not provided, we use 5432 as the default
 * port.
 */
NodeAddress *
MasterNodeAddress(CopyStmt *copyStatement)
{
	NodeAddress *masterNodeAddress = (NodeAddress *) palloc0(sizeof(NodeAddress));
	char *nodeName = NULL;

	/* set default port to 5432 */
	int32 nodePort = 5432;

	ListCell *optionCell = NULL;
	foreach(optionCell, copyStatement->options)
	{
		DefElem *defel = (DefElem *) lfirst(optionCell);
		if (strncmp(defel->defname, "master_host", NAMEDATALEN) == 0)
		{
			nodeName = defGetString(defel);
		}
		else if (strncmp(defel->defname, "master_port", NAMEDATALEN) == 0)
		{
			nodePort = defGetInt32(defel);
		}
	}

	masterNodeAddress->nodeName = nodeName;
	masterNodeAddress->nodePort = nodePort;

	return masterNodeAddress;
}


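/*
 * For illustration (hypothetical table, file, and host names): a COPY statement
 * issued on a worker node carries the options parsed above, roughly as:
 *
 *   COPY orders FROM '/path/to/orders.csv'
 *       WITH (master_host 'coordinator-host', master_port 5432);
 *
 * When master_port is omitted, the default of 5432 is used.
 */
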
/*
 * MasterPartitionMethod gets the partition method of the given relation from
 * the master node and returns it.
 */
static char
MasterPartitionMethod(RangeVar *relation)
{
	char partitionMethod = '\0';
	PGresult *queryResult = NULL;

	char *relationName = relation->relname;
	char *schemaName = relation->schemaname;
	char *qualifiedName = quote_qualified_identifier(schemaName, relationName);

	StringInfo partitionMethodCommand = makeStringInfo();
	appendStringInfo(partitionMethodCommand, PARTITION_METHOD_QUERY, qualifiedName);

	queryResult = PQexec(masterConnection->pgConn, partitionMethodCommand->data);
	if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
	{
		char *partitionMethodString = PQgetvalue((PGresult *) queryResult, 0, 0);
		if (partitionMethodString == NULL || (*partitionMethodString) == '\0')
		{
			ereport(ERROR, (errmsg("could not find a partition method for the "
								   "table %s", relationName)));
		}

		partitionMethod = partitionMethodString[0];
	}
	else
	{
		ReportResultError(masterConnection, queryResult, WARNING);
		ereport(ERROR, (errmsg("could not get the partition method of the "
							   "distributed table")));
	}

	PQclear(queryResult);

	return partitionMethod;
}


/*
 * RemoveMasterOptions removes master node related copy options from the option
 * list of the copy statement.
 */
static void
RemoveMasterOptions(CopyStmt *copyStatement)
{
	List *newOptionList = NIL;
	ListCell *optionCell = NULL;

	/* walk over the list of all options */
	foreach(optionCell, copyStatement->options)
	{
		DefElem *option = (DefElem *) lfirst(optionCell);

		/* skip master related options */
		if ((strncmp(option->defname, "master_host", NAMEDATALEN) == 0) ||
			(strncmp(option->defname, "master_port", NAMEDATALEN) == 0))
		{
			continue;
		}

		newOptionList = lappend(newOptionList, option);
	}

	copyStatement->options = newOptionList;
}


/*
 * OpenCopyConnections opens a connection for each placement of a shard and
 * starts a COPY transaction if necessary. If a connection cannot be opened,
 * then the shard placement is marked as inactive and the COPY continues with
 * the remaining shard placements.
 */
static void
OpenCopyConnections(CopyStmt *copyStatement, ShardConnections *shardConnections,
					bool stopOnFailure, bool useBinaryCopyFormat)
{
	List *finalizedPlacementList = NIL;
	int failedPlacementCount = 0;
	ListCell *placementCell = NULL;
	List *connectionList = NULL;
	int64 shardId = shardConnections->shardId;

	MemoryContext localContext = AllocSetContextCreate(CurrentMemoryContext,
														"OpenCopyConnections",
														ALLOCSET_DEFAULT_MINSIZE,
														ALLOCSET_DEFAULT_INITSIZE,
														ALLOCSET_DEFAULT_MAXSIZE);

	/* release finalized placement list at the end of this function */
	MemoryContext oldContext = MemoryContextSwitchTo(localContext);

	finalizedPlacementList = MasterShardPlacementList(shardId);

	MemoryContextSwitchTo(oldContext);

	if (XactModificationLevel > XACT_MODIFICATION_DATA)
	{
		ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
						errmsg("distributed copy operations must not appear in "
							   "transaction blocks containing other distributed "
							   "modifications")));
	}

	foreach(placementCell, finalizedPlacementList)
	{
		ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
		char *nodeUser = CurrentUserName();
		MultiConnection *connection = NULL;
		uint32 connectionFlags = FOR_DML;
		StringInfo copyCommand = NULL;
		PGresult *result = NULL;

		connection = GetPlacementConnection(connectionFlags, placement, nodeUser);

		if (PQstatus(connection->pgConn) != CONNECTION_OK)
		{
			if (stopOnFailure)
			{
				ReportConnectionError(connection, ERROR);
			}
			else
			{
				ReportConnectionError(connection, WARNING);
				MarkRemoteTransactionFailed(connection, true);

				failedPlacementCount++;
				continue;
			}
		}

		/*
		 * Errors are supposed to cause immediate aborts (i.e. we don't
		 * want to/can't invalidate placements), mark the connection as
		 * critical so later errors cause failures.
		 */
		MarkRemoteTransactionCritical(connection);
		ClaimConnectionExclusively(connection);
		RemoteTransactionBeginIfNecessary(connection);
		copyCommand = ConstructCopyStatement(copyStatement, shardConnections->shardId,
											 useBinaryCopyFormat, false);
		result = PQexec(connection->pgConn, copyCommand->data);

		if (PQresultStatus(result) != PGRES_COPY_IN)
		{
			ReportResultError(connection, result, ERROR);
		}

		PQclear(result);
		connectionList = lappend(connectionList, connection);
	}

	/* if all placements failed, error out */
	if (failedPlacementCount == list_length(finalizedPlacementList))
	{
		ereport(ERROR, (errmsg("could not connect to any active placements")));
	}

	/*
	 * If stopOnFailure is true, we just error out and code execution should
	 * never reach to this point. This is the case for reference tables and
	 * copy from worker nodes.
	 */
	Assert(!stopOnFailure || failedPlacementCount == 0);

	shardConnections->connectionList = connectionList;

	MemoryContextReset(localContext);
}


/*
 * CanUseBinaryCopyFormat iterates over the columns of the relation given in
 * rowOutputState and looks for a column whose type is an array of a user-defined
 * type or a composite type. If it finds such a column, that means we cannot use
 * the binary format for COPY, because the binary format sends the Oids of the
 * types, which are generally not the same on the master and worker nodes for
 * user-defined types.
 */
static bool
CanUseBinaryCopyFormat(TupleDesc tupleDescription, CopyOutState rowOutputState)
{
	bool useBinaryCopyFormat = true;
	int totalColumnCount = tupleDescription->natts;
	int columnIndex = 0;

	for (columnIndex = 0; columnIndex < totalColumnCount; columnIndex++)
	{
		Form_pg_attribute currentColumn = tupleDescription->attrs[columnIndex];
		Oid typeId = InvalidOid;
		char typeCategory = '\0';
		bool typePreferred = false;

		if (currentColumn->attisdropped)
		{
			continue;
		}

		typeId = currentColumn->atttypid;
		if (typeId >= FirstNormalObjectId)
		{
			get_type_category_preferred(typeId, &typeCategory, &typePreferred);
			if (typeCategory == TYPCATEGORY_ARRAY ||
				typeCategory == TYPCATEGORY_COMPOSITE)
			{
				useBinaryCopyFormat = false;
				break;
			}
		}
	}

	return useBinaryCopyFormat;
}


/*
 * MasterShardPlacementList dispatches the finalized shard placements call
 * to the local or remote master node according to the master connection state.
 */
static List *
MasterShardPlacementList(uint64 shardId)
{
	List *finalizedPlacementList = NIL;
	if (masterConnection == NULL)
	{
		finalizedPlacementList = FinalizedShardPlacementList(shardId);
	}
	else
	{
		finalizedPlacementList = RemoteFinalizedShardPlacementList(shardId);
	}

	return finalizedPlacementList;
}


/*
 * RemoteFinalizedShardPlacementList gets the finalized shard placement list
 * for the given shard id from the remote master node.
 */
static List *
RemoteFinalizedShardPlacementList(uint64 shardId)
{
	List *finalizedPlacementList = NIL;
	PGresult *queryResult = NULL;

	StringInfo shardPlacementsCommand = makeStringInfo();
	appendStringInfo(shardPlacementsCommand, FINALIZED_SHARD_PLACEMENTS_QUERY, shardId);

	queryResult = PQexec(masterConnection->pgConn, shardPlacementsCommand->data);
	if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
	{
		int rowCount = PQntuples(queryResult);
		int rowIndex = 0;

		for (rowIndex = 0; rowIndex < rowCount; rowIndex++)
		{
			char *placementIdString = PQgetvalue(queryResult, rowIndex, 0);
			char *nodeName = PQgetvalue(queryResult, rowIndex, 1);
			char *nodePortString = PQgetvalue(queryResult, rowIndex, 2);
			uint32 nodePort = atoi(nodePortString);
			uint64 placementId = atoll(placementIdString);

			ShardPlacement *shardPlacement =
				(ShardPlacement *) palloc0(sizeof(ShardPlacement));

			shardPlacement->placementId = placementId;
			shardPlacement->nodeName = (char *) palloc0(strlen(nodeName) + 1);
			strcpy(shardPlacement->nodeName, nodeName);
			shardPlacement->nodePort = nodePort;

			finalizedPlacementList = lappend(finalizedPlacementList, shardPlacement);
		}
	}
	else
	{
		ereport(ERROR, (errmsg("could not get shard placements from the master node")));
	}

	return finalizedPlacementList;
}


/* Send copy binary headers to given connections */
static void
SendCopyBinaryHeaders(CopyOutState copyOutState, int64 shardId, List *connectionList)
{
	resetStringInfo(copyOutState->fe_msgbuf);
	AppendCopyBinaryHeaders(copyOutState);
	SendCopyDataToAll(copyOutState->fe_msgbuf, shardId, connectionList);
}


/* Send copy binary footers to given connections */
static void
SendCopyBinaryFooters(CopyOutState copyOutState, int64 shardId, List *connectionList)
{
	resetStringInfo(copyOutState->fe_msgbuf);
	AppendCopyBinaryFooters(copyOutState);
	SendCopyDataToAll(copyOutState->fe_msgbuf, shardId, connectionList);
}


/*
 * ConstructCopyStatement constructs the text of a COPY statement for a particular
 * shard.
 */
static StringInfo
ConstructCopyStatement(CopyStmt *copyStatement, int64 shardId, bool useBinaryCopyFormat,
					   bool useFreeze)
{
	StringInfo command = makeStringInfo();

	char *schemaName = copyStatement->relation->schemaname;
	char *relationName = copyStatement->relation->relname;

	char *shardName = pstrdup(relationName);
	char *shardQualifiedName = NULL;
	const char *copyFormat = NULL;
	const char *freeze = NULL;

	AppendShardIdToName(&shardName, shardId);

	shardQualifiedName = quote_qualified_identifier(schemaName, shardName);

	if (useBinaryCopyFormat)
	{
		copyFormat = "BINARY";
	}
	else
	{
		copyFormat = "TEXT";
	}

	if (useFreeze)
	{
		freeze = "TRUE";
	}
	else
	{
		freeze = "FALSE";
	}

	appendStringInfo(command, "COPY %s FROM STDIN WITH (FORMAT %s, FREEZE %s)",
					 shardQualifiedName, copyFormat, freeze);

	return command;
}


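/*
 * For illustration (hypothetical shard id and table name): for shard 102008 of
 * table public.customers, with binary format and no freeze, the statement built
 * above would read:
 *
 *   COPY public.customers_102008 FROM STDIN WITH (FORMAT BINARY, FREEZE FALSE)
 */
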
/*
 * SendCopyDataToAll sends copy data to all connections in a list.
 */
static void
SendCopyDataToAll(StringInfo dataBuffer, int64 shardId, List *connectionList)
{
	ListCell *connectionCell = NULL;
	foreach(connectionCell, connectionList)
	{
		MultiConnection *connection = (MultiConnection *) lfirst(connectionCell);
		SendCopyDataToPlacement(dataBuffer, shardId, connection);
	}
}


/*
 * SendCopyDataToPlacement sends serialized COPY data to a specific shard placement
 * over the given connection.
 */
static void
SendCopyDataToPlacement(StringInfo dataBuffer, int64 shardId, MultiConnection *connection)
{
	int copyResult = PQputCopyData(connection->pgConn, dataBuffer->data, dataBuffer->len);
	if (copyResult != 1)
	{
		ereport(ERROR, (errcode(ERRCODE_IO_ERROR),
						errmsg("failed to COPY to shard %ld on %s:%d",
							   shardId, connection->hostname, connection->port),
						errdetail("failed to send %d bytes %s", dataBuffer->len,
								  dataBuffer->data)));
	}
}


/*
 * EndRemoteCopy ends the COPY input on all connections, and unclaims connections.
 * If stopOnFailure is true, then EndRemoteCopy reports an error on failure,
 * otherwise it reports a warning or continues.
 */
static void
EndRemoteCopy(int64 shardId, List *connectionList, bool stopOnFailure)
{
	ListCell *connectionCell = NULL;

	foreach(connectionCell, connectionList)
	{
		MultiConnection *connection = (MultiConnection *) lfirst(connectionCell);
		int copyEndResult = 0;
		PGresult *result = NULL;

		/* end the COPY input */
		copyEndResult = PQputCopyEnd(connection->pgConn, NULL);

		if (copyEndResult != 1)
		{
			if (stopOnFailure)
			{
				ereport(ERROR, (errcode(ERRCODE_IO_ERROR),
								errmsg("failed to COPY to shard %ld on %s:%d",
									   shardId, connection->hostname,
									   connection->port)));
			}

			continue;
		}

		/* check whether there were any COPY errors */
		result = PQgetResult(connection->pgConn);
		if (PQresultStatus(result) != PGRES_COMMAND_OK && stopOnFailure)
		{
			ReportCopyError(connection, result);
		}

		PQclear(result);
		ForgetResults(connection);
		UnclaimConnection(connection);
	}
}


/*
 * ReportCopyError tries to report a useful error message for the user from
 * the remote COPY error messages.
 */
static void
ReportCopyError(MultiConnection *connection, PGresult *result)
{
	char *remoteMessage = PQresultErrorField(result, PG_DIAG_MESSAGE_PRIMARY);

	if (remoteMessage != NULL)
	{
		/* probably a constraint violation, show remote message and detail */
		char *remoteDetail = PQresultErrorField(result, PG_DIAG_MESSAGE_DETAIL);

		ereport(ERROR, (errmsg("%s", remoteMessage),
						errdetail("%s", remoteDetail)));
	}
	else
	{
		/* probably a connection problem, get the message from the connection */
		char *lastNewlineIndex = NULL;

		remoteMessage = PQerrorMessage(connection->pgConn);
		lastNewlineIndex = strrchr(remoteMessage, '\n');

		/* trim trailing newline, if any */
		if (lastNewlineIndex != NULL)
		{
			*lastNewlineIndex = '\0';
		}

		ereport(ERROR, (errcode(ERRCODE_IO_ERROR),
						errmsg("failed to complete COPY on %s:%d", connection->hostname,
							   connection->port),
						errdetail("%s", remoteMessage)));
	}
}


/*
 * ColumnOutputFunctions walks over a table's columns, and finds each column's
 * type information. The function then resolves each type's output function,
 * and stores and returns these output functions in an array.
 */
FmgrInfo *
ColumnOutputFunctions(TupleDesc rowDescriptor, bool binaryFormat)
{
	uint32 columnCount = (uint32) rowDescriptor->natts;
	FmgrInfo *columnOutputFunctions = palloc0(columnCount * sizeof(FmgrInfo));

	uint32 columnIndex = 0;
	for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
	{
		FmgrInfo *currentOutputFunction = &columnOutputFunctions[columnIndex];
		Form_pg_attribute currentColumn = rowDescriptor->attrs[columnIndex];
		Oid columnTypeId = currentColumn->atttypid;
		Oid outputFunctionId = InvalidOid;
		bool typeVariableLength = false;

		if (currentColumn->attisdropped)
		{
			/* dropped column, leave the output function NULL */
			continue;
		}
		else if (binaryFormat)
		{
			getTypeBinaryOutputInfo(columnTypeId, &outputFunctionId, &typeVariableLength);
		}
		else
		{
			getTypeOutputInfo(columnTypeId, &outputFunctionId, &typeVariableLength);
		}

		fmgr_info(outputFunctionId, currentOutputFunction);
	}

	return columnOutputFunctions;
}


/*
 * AppendCopyRowData serializes one row using the column output functions,
 * and appends the data to the row output state object's message buffer.
 * This function is modeled after the CopyOneRowTo() function in
 * commands/copy.c, but only implements a subset of that functionality.
 * Note that the caller of this function should reset the row memory context
 * to not bloat memory usage.
 */
void
AppendCopyRowData(Datum *valueArray, bool *isNullArray, TupleDesc rowDescriptor,
				  CopyOutState rowOutputState, FmgrInfo *columnOutputFunctions)
{
	uint32 totalColumnCount = (uint32) rowDescriptor->natts;
	uint32 availableColumnCount = AvailableColumnCount(rowDescriptor);
	uint32 appendedColumnCount = 0;
	uint32 columnIndex = 0;

	MemoryContext oldContext = MemoryContextSwitchTo(rowOutputState->rowcontext);

	if (rowOutputState->binary)
	{
		CopySendInt16(rowOutputState, availableColumnCount);
	}

	for (columnIndex = 0; columnIndex < totalColumnCount; columnIndex++)
	{
		Form_pg_attribute currentColumn = rowDescriptor->attrs[columnIndex];
		Datum value = valueArray[columnIndex];
		bool isNull = isNullArray[columnIndex];
		bool lastColumn = false;

		if (currentColumn->attisdropped)
		{
			continue;
		}
		else if (rowOutputState->binary)
		{
			if (!isNull)
			{
				FmgrInfo *outputFunctionPointer = &columnOutputFunctions[columnIndex];
				bytea *outputBytes = SendFunctionCall(outputFunctionPointer, value);

				CopySendInt32(rowOutputState, VARSIZE(outputBytes) - VARHDRSZ);
				CopySendData(rowOutputState, VARDATA(outputBytes),
							 VARSIZE(outputBytes) - VARHDRSZ);
			}
			else
			{
				CopySendInt32(rowOutputState, -1);
			}
		}
		else
		{
			if (!isNull)
			{
				FmgrInfo *outputFunctionPointer = &columnOutputFunctions[columnIndex];
				char *columnText = OutputFunctionCall(outputFunctionPointer, value);

				CopyAttributeOutText(rowOutputState, columnText);
			}
			else
			{
				CopySendString(rowOutputState, rowOutputState->null_print_client);
			}

			lastColumn = ((appendedColumnCount + 1) == availableColumnCount);
			if (!lastColumn)
			{
				CopySendChar(rowOutputState, rowOutputState->delim[0]);
			}
		}

		appendedColumnCount++;
	}

	if (!rowOutputState->binary)
	{
		/* append default line termination string depending on the platform */
#ifndef WIN32
		CopySendChar(rowOutputState, '\n');
#else
		CopySendString(rowOutputState, "\r\n");
#endif
	}

	MemoryContextSwitchTo(oldContext);
}


/*
 * AvailableColumnCount returns the number of columns in a tuple descriptor, excluding
 * columns that were dropped.
 */
static uint32
AvailableColumnCount(TupleDesc tupleDescriptor)
{
	uint32 columnCount = 0;
	uint32 columnIndex = 0;

	for (columnIndex = 0; columnIndex < tupleDescriptor->natts; columnIndex++)
	{
		Form_pg_attribute currentColumn = tupleDescriptor->attrs[columnIndex];

		if (!currentColumn->attisdropped)
		{
			columnCount++;
		}
	}

	return columnCount;
}


/*
 * AppendCopyBinaryHeaders appends binary headers to the copy buffer in
 * headerOutputState.
 */
void
AppendCopyBinaryHeaders(CopyOutState headerOutputState)
{
	const int32 zero = 0;
	MemoryContext oldContext = MemoryContextSwitchTo(headerOutputState->rowcontext);

	/* Signature */
	CopySendData(headerOutputState, BinarySignature, 11);

	/* Flags field (no OIDs) */
	CopySendInt32(headerOutputState, zero);

	/* No header extension */
	CopySendInt32(headerOutputState, zero);

	MemoryContextSwitchTo(oldContext);
}


/*
 * AppendCopyBinaryFooters appends binary footers to the copy buffer in
 * footerOutputState.
 */
void
AppendCopyBinaryFooters(CopyOutState footerOutputState)
{
	int16 negative = -1;
	MemoryContext oldContext = MemoryContextSwitchTo(footerOutputState->rowcontext);

	CopySendInt16(footerOutputState, negative);

	MemoryContextSwitchTo(oldContext);
}


/*
 * StartCopyToNewShard creates a new shard and related shard placements and
 * opens connections to shard placements.
 */
static int64
StartCopyToNewShard(ShardConnections *shardConnections, CopyStmt *copyStatement,
					bool useBinaryCopyFormat)
{
	char *relationName = copyStatement->relation->relname;
	char *schemaName = copyStatement->relation->schemaname;
	char *qualifiedName = quote_qualified_identifier(schemaName, relationName);
	int64 shardId = MasterCreateEmptyShard(qualifiedName);
	bool stopOnFailure = true;

	shardConnections->shardId = shardId;

	shardConnections->connectionList = NIL;

	/* connect to shard placements and start transactions */
	OpenCopyConnections(copyStatement, shardConnections, stopOnFailure,
						useBinaryCopyFormat);

	return shardId;
}


/*
 * MasterCreateEmptyShard dispatches the create empty shard call between local or
 * remote master node according to the master connection state.
 */
static int64
MasterCreateEmptyShard(char *relationName)
{
	int64 shardId = 0;
	if (masterConnection == NULL)
	{
		shardId = CreateEmptyShard(relationName);
	}
	else
	{
		shardId = RemoteCreateEmptyShard(relationName);
	}

	return shardId;
}


/*
 * CreateEmptyShard creates a new shard and related shard placements from the
 * local master node.
 */
static int64
CreateEmptyShard(char *relationName)
{
	int64 shardId = 0;

	text *relationNameText = cstring_to_text(relationName);
	Datum relationNameDatum = PointerGetDatum(relationNameText);
	Datum shardIdDatum = DirectFunctionCall1(master_create_empty_shard,
											 relationNameDatum);
	shardId = DatumGetInt64(shardIdDatum);

	return shardId;
}


/*
 * RemoteCreateEmptyShard creates a new shard and related shard placements from
 * the remote master node.
 */
static int64
RemoteCreateEmptyShard(char *relationName)
{
	int64 shardId = 0;
	PGresult *queryResult = NULL;

	StringInfo createEmptyShardCommand = makeStringInfo();
	appendStringInfo(createEmptyShardCommand, CREATE_EMPTY_SHARD_QUERY, relationName);

	queryResult = PQexec(masterConnection->pgConn, createEmptyShardCommand->data);
	if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
	{
		char *shardIdString = PQgetvalue((PGresult *) queryResult, 0, 0);
		char *shardIdStringEnd = NULL;
		shardId = strtoul(shardIdString, &shardIdStringEnd, 0);
	}
	else
	{
		ReportResultError(masterConnection, queryResult, WARNING);
		ereport(ERROR, (errmsg("could not create a new empty shard on the remote node")));
	}

	PQclear(queryResult);

	return shardId;
}


/*
 * MasterUpdateShardStatistics dispatches the update shard statistics call
 * between local or remote master node according to the master connection state.
 */
static void
MasterUpdateShardStatistics(uint64 shardId)
{
	if (masterConnection == NULL)
	{
		UpdateShardStatistics(shardId);
	}
	else
	{
		RemoteUpdateShardStatistics(shardId);
	}
}


/*
 * RemoteUpdateShardStatistics updates shard statistics on the remote master node.
 */
static void
RemoteUpdateShardStatistics(uint64 shardId)
{
	PGresult *queryResult = NULL;

	StringInfo updateShardStatisticsCommand = makeStringInfo();
	appendStringInfo(updateShardStatisticsCommand, UPDATE_SHARD_STATISTICS_QUERY,
					 shardId);

	queryResult = PQexec(masterConnection->pgConn, updateShardStatisticsCommand->data);
	if (PQresultStatus(queryResult) != PGRES_TUPLES_OK)
	{
		ereport(ERROR, (errmsg("could not update shard statistics")));
	}

	PQclear(queryResult);
}


/* *INDENT-OFF* */
/* Append data to the copy buffer in outputState */
static void
CopySendData(CopyOutState outputState, const void *databuf, int datasize)
{
	appendBinaryStringInfo(outputState->fe_msgbuf, databuf, datasize);
}


/* Append a string to the copy buffer in outputState. */
static void
CopySendString(CopyOutState outputState, const char *str)
{
	appendBinaryStringInfo(outputState->fe_msgbuf, str, strlen(str));
}


/* Append a char to the copy buffer in outputState. */
static void
CopySendChar(CopyOutState outputState, char c)
{
	appendStringInfoCharMacro(outputState->fe_msgbuf, c);
}


/* Append an int32 to the copy buffer in outputState. */
static void
CopySendInt32(CopyOutState outputState, int32 val)
{
	uint32 buf = htonl((uint32) val);
	CopySendData(outputState, &buf, sizeof(buf));
}


/* Append an int16 to the copy buffer in outputState. */
static void
CopySendInt16(CopyOutState outputState, int16 val)
{
	uint16 buf = htons((uint16) val);
	CopySendData(outputState, &buf, sizeof(buf));
}


/*
 * Send text representation of one column, with conversion and escaping.
 *
 * NB: This function is based on commands/copy.c and doesn't fully conform to
 * our coding style. The function should be kept in sync with copy.c.
 */
static void
CopyAttributeOutText(CopyOutState cstate, char *string)
{
	char *pointer = NULL;
	char *start = NULL;
	char c = '\0';
	char delimc = cstate->delim[0];

	if (cstate->need_transcoding)
	{
		pointer = pg_server_to_any(string, strlen(string), cstate->file_encoding);
	}
	else
	{
		pointer = string;
	}

	/*
	 * We have to grovel through the string searching for control characters
	 * and instances of the delimiter character. In most cases, though, these
	 * are infrequent. To avoid overhead from calling CopySendData once per
	 * character, we dump out all characters between escaped characters in a
	 * single call. The loop invariant is that the data from "start" to "pointer"
	 * can be sent literally, but hasn't yet been.
	 *
	 * As all encodings here are safe, i.e. backend supported ones, we can
	 * skip doing pg_encoding_mblen(), because in valid backend encodings,
	 * extra bytes of a multibyte character never look like ASCII.
	 */
	start = pointer;
	while ((c = *pointer) != '\0')
	{
		if ((unsigned char) c < (unsigned char) 0x20)
		{
			/*
			 * \r and \n must be escaped, the others are traditional. We
			 * prefer to dump these using the C-like notation, rather than
			 * a backslash and the literal character, because it makes the
			 * dump file a bit more proof against Microsoftish data
			 * mangling.
			 */
			switch (c)
			{
				case '\b':
					c = 'b';
					break;
				case '\f':
					c = 'f';
					break;
				case '\n':
					c = 'n';
					break;
				case '\r':
					c = 'r';
					break;
				case '\t':
					c = 't';
					break;
				case '\v':
					c = 'v';
					break;
				default:
					/* If it's the delimiter, must backslash it */
					if (c == delimc)
						break;
					/* All ASCII control chars are length 1 */
					pointer++;
					continue; /* fall to end of loop */
			}
			/* if we get here, we need to convert the control char */
			CopyFlushOutput(cstate, start, pointer);
			CopySendChar(cstate, '\\');
			CopySendChar(cstate, c);
			start = ++pointer; /* do not include char in next run */
		}
		else if (c == '\\' || c == delimc)
		{
			CopyFlushOutput(cstate, start, pointer);
			CopySendChar(cstate, '\\');
			start = pointer++; /* we include char in next run */
		}
		else
		{
			pointer++;
		}
	}

	CopyFlushOutput(cstate, start, pointer);
}


/* *INDENT-ON* */
|
|
/* Helper function to send pending copy output */
|
|
static inline void
|
|
CopyFlushOutput(CopyOutState cstate, char *start, char *pointer)
|
|
{
|
|
if (pointer > start)
|
|
{
|
|
CopySendData(cstate, start, pointer - start);
|
|
}
|
|
}
|
|
|
|
/*
 * CitusBulkloadCopy implements COPY table_name FROM ... WITH (method 'bulkload').
 * The bulkload server dispatches the copy statement and the records from the FROM
 * clause to all workers and waits for them to finish. Bulkload clients pull records
 * from the server and copy them into shards. A bulkload client handles append- and
 * hash-distributed tables differently.
 *
 * For append-distributed tables, there are two copy policies:
 * 1. The bulkload client creates a shard for each tablespace and inserts records
 *    into these shards in a round-robin fashion; whenever a shard reaches
 *    ShardMaxSize, it creates a new shard in that tablespace, and so on. In this
 *    policy, since the DDL commands that create shards and the DML commands that
 *    copy data run in one transaction, we can use COPY FREEZE and lazy indexing to
 *    improve ingestion performance.
 * 2. Each bulkload client acts just like CopyToNewShards(): it asks the master to
 *    create a new shard and inserts records into it; when that shard reaches
 *    ShardMaxSize, it asks the master to create another shard, and so on.
 *
 * For hash-distributed tables, clients get the table metadata from the master node
 * and send each record to the shard determined by its hash value.
 */
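/*
 * Illustrative usage (table and file names here are hypothetical): a statement such
 * as
 *
 *   COPY lineitem FROM '/data/lineitem.csv' WITH (format csv, method 'bulkload');
 *
 * takes this code path; the 'method' option is what selects bulkload copy (see
 * IsBulkloadCopy()).
 */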
|
|
void
|
|
CitusBulkloadCopy(CopyStmt *copyStatement, char *completionTag)
|
|
{
|
|
bool isCopyFromWorker = false;
|
|
bool isBulkloadClient = false;
|
|
NodeAddress *masterNodeAddress = NULL;
|
|
Oid relationId = InvalidOid;
|
|
char partitionMethod = 0;
|
|
char *nodeName = NULL;
|
|
uint32 nodePort = 0;
|
|
char *nodeUser = NULL;
|
|
char *schemaName = NULL;
|
|
|
|
/*
 * From postgres/src/bin/psql/copy.c:handleCopyIn(), we know that a pq_message
 * contains exactly one record for csv and text formats, but not for binary.
 * Since it is hard for StartZeroMQServer() to handle a pq_message that may contain
 * incomplete records, we currently don't support COPY FROM STDIN with binary
 * format for bulkload copy.
 */
|
|
if (copyStatement->filename == NULL && IsBinaryCopy(copyStatement))
|
|
{
|
|
elog(ERROR, "bulkload doesn't support copy from stdin with binary format");
|
|
}
|
|
|
|
isCopyFromWorker = IsCopyFromWorker(copyStatement);
|
|
if (isCopyFromWorker)
|
|
{
|
|
masterNodeAddress = MasterNodeAddress(copyStatement);
|
|
nodeName = masterNodeAddress->nodeName;
|
|
nodePort = masterNodeAddress->nodePort;
|
|
nodeUser = CurrentUserName();
|
|
masterConnection = GetNodeConnection(FORCE_NEW_CONNECTION, nodeName, nodePort);
|
|
if (masterConnection == NULL)
|
|
{
|
|
elog(ERROR, "Can't connect to master server %s:%d as user %s",
|
|
nodeName, nodePort, nodeUser);
|
|
}
|
|
RemoveMasterOptions(copyStatement);
|
|
|
|
/* strip schema name for local reference */
|
|
schemaName = copyStatement->relation->schemaname;
|
|
copyStatement->relation->schemaname = NULL;
|
|
relationId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
|
|
/* put schema name back */
|
|
copyStatement->relation->schemaname = schemaName;
|
|
partitionMethod = MasterPartitionMethod(copyStatement->relation);
|
|
}
|
|
else
|
|
{
|
|
masterNodeAddress = LocalAddress();
|
|
relationId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
|
|
partitionMethod = PartitionMethod(relationId);
|
|
}
|
|
|
|
isBulkloadClient = IsBulkloadClient(copyStatement);
|
|
PG_TRY();
|
|
{
|
|
if (isBulkloadClient)
|
|
{
|
|
if (partitionMethod == DISTRIBUTE_BY_APPEND
|
|
|| partitionMethod == DISTRIBUTE_BY_RANGE)
|
|
{
|
|
PGresult *queryResult = NULL;
|
|
Assert(masterConnection != NULL);
|
|
/* run all metadata commands in a transaction */
|
|
queryResult = PQexec(masterConnection->pgConn, "BEGIN");
|
|
if (PQresultStatus(queryResult) != PGRES_COMMAND_OK)
|
|
{
|
|
elog(ERROR, "could not start to update master node metadata");
|
|
}
|
|
PQclear(queryResult);
|
|
|
|
/* there are two policies for copying into new shard */
|
|
// BulkloadCopyToNewShardsV1(copyStatement, completionTag, masterNodeAddress,
|
|
// relationId);
|
|
BulkloadCopyToNewShards(copyStatement, completionTag, relationId);
|
|
|
|
/* commit metadata transactions */
|
|
queryResult = PQexec(masterConnection->pgConn, "COMMIT");
|
|
if (PQresultStatus(queryResult) != PGRES_COMMAND_OK)
|
|
{
|
|
elog(ERROR, "could not commit master node metadata changes");
|
|
}
|
|
PQclear(queryResult);
|
|
}
|
|
else if (partitionMethod == DISTRIBUTE_BY_HASH)
|
|
{
|
|
BulkloadCopyToExistingShards(copyStatement, completionTag, relationId);
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "Unknown partition method: %d", partitionMethod);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
BulkloadCopyServer(copyStatement, completionTag, masterNodeAddress, relationId);
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
PG_RE_THROW();
|
|
}
|
|
PG_END_TRY();
|
|
}
|
|
|
|
/*
 * CopyGetAttnums returns the number of columns referenced by the copy statement:
 * the length of the explicit column list if one is given, otherwise the number of
 * non-dropped columns of the relation. Adapted from
 * postgresql/src/backend/commands/copy.c.
 */
|
|
static int
|
|
CopyGetAttnums(Oid relationId, List *attnamelist)
|
|
{
|
|
int attnums = list_length(attnamelist);
|
|
if (attnums != 0)
|
|
{
|
|
return attnums;
|
|
}
|
|
else
|
|
{
|
|
Relation rel = heap_open(relationId, AccessShareLock);
|
|
TupleDesc tupDesc = RelationGetDescr(rel);
|
|
Form_pg_attribute *attr = tupDesc->attrs;
|
|
int attr_count = tupDesc->natts;
|
|
int i;
|
|
for (i = 0; i < attr_count; i++)
|
|
{
|
|
if (attr[i]->attisdropped)
|
|
continue;
|
|
attnums++;
|
|
}
|
|
heap_close(rel, NoLock);
|
|
return attnums;
|
|
}
|
|
}
|
|
|
|
/*
 * BulkloadCopyServer rebuilds the COPY statement with 'bulkload_host' and
 * 'bulkload_port' options and dispatches it to all worker nodes for asynchronous
 * execution. It also starts a zeromq server that streams the records from the FROM
 * clause to all worker nodes.
 */
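/*
 * Overview of the flow below (a summary of the code that follows, not additional
 * behavior): the server rewrites the COPY statement via ConstructBulkloadCopyStmt(),
 * sends it asynchronously to every worker over libpq, watches the worker sockets
 * with epoll, streams the input through the zeromq sockets, and finally broadcasts
 * a 'KILL' message on the controller socket until all workers have reported back.
 */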
|
|
static void
|
|
BulkloadCopyServer(CopyStmt *copyStatement, char *completionTag,
|
|
NodeAddress *masterNodeAddress, Oid relationId)
|
|
{
|
|
List *workerNodeList = NULL;
|
|
ListCell *workerCell = NULL;
|
|
List *workerConnectionList = NIL;
|
|
NodeAddress *serverAddress = NULL;
|
|
StringInfo clientCopyCommand = NULL;
|
|
struct ZeroMQServer *zeromqServer = NULL;
|
|
uint64 processedRowCount = 0;
|
|
int loopIndex;
|
|
char *nodeName = NULL;
|
|
uint32 nodePort = 0;
|
|
WorkerNode *workerNode = NULL;
|
|
MultiConnection *multiConn = NULL;
|
|
PGconn *conn = NULL;
|
|
PGresult *res = NULL;
|
|
int workerConnectionCount = 0;
|
|
int finishCount = 0;
|
|
int failCount = 0;
|
|
int *finish = NULL;
|
|
int rc;
|
|
int efd;
|
|
int nevents;
|
|
int sock;
|
|
int connIdx;
|
|
struct epoll_event *event = NULL;
|
|
struct epoll_event *events = NULL;
|
|
|
|
workerNodeList = MasterWorkerNodeList();
|
|
serverAddress = LocalAddress();
|
|
|
|
zeromqServer = (struct ZeroMQServer *) palloc0(sizeof(ZeroMQServer));
|
|
strcpy(zeromqServer->host, serverAddress->nodeName);
|
|
/*
|
|
* use port number between ZeromqStartPort and ZeromqStartPort+ZeromqPortCount
|
|
* as zeromq server port
|
|
*/
|
|
zeromqServer->port = random() % ZeromqPortCount + ZeromqStartPort;
|
|
|
|
if (copyStatement->filename != NULL)
|
|
{
|
|
strcpy(zeromqServer->file, copyStatement->filename);
|
|
}
|
|
|
|
clientCopyCommand = ConstructBulkloadCopyStmt(copyStatement, masterNodeAddress,
|
|
serverAddress->nodeName, zeromqServer->port);
|
|
|
|
events = (struct epoll_event *) palloc0(MaxEvents * sizeof(struct epoll_event));
|
|
efd = epoll_create1(0);
|
|
if (efd == -1)
|
|
{
|
|
elog(ERROR, "epoll_create failed");
|
|
}
|
|
|
|
foreach(workerCell, workerNodeList)
|
|
{
|
|
workerNode = (WorkerNode *) lfirst(workerCell);
|
|
nodeName = workerNode->workerName;
|
|
nodePort = workerNode->workerPort;
|
|
multiConn = GetNodeConnection(FOR_DML, nodeName, nodePort);
|
|
conn = multiConn->pgConn;
|
|
if (conn == NULL)
|
|
{
|
|
elog(WARNING, "connect to %s:%d failed", nodeName, nodePort);
|
|
}
|
|
else
|
|
{
|
|
int querySent = PQsendQuery(conn, clientCopyCommand->data);
|
|
if (querySent == 0)
|
|
{
|
|
elog(WARNING, "send bulkload copy to %s:%d failed: %s", nodeName, nodePort,
|
|
PQerrorMessage(conn));
|
|
}
|
|
else
|
|
{
|
|
if (PQsetnonblocking(conn, 1) == -1)
|
|
{
|
|
/*
 * Not fatal: we can continue even if the connection stays in blocking mode,
 * so only emit a warning below.
 */
|
|
elog(WARNING, "%s:%d set non-blocking failed", nodeName, nodePort);
|
|
}
|
|
sock = PQsocket(conn);
|
|
if (sock < 0)
|
|
{
|
|
elog(WARNING, "%s:%d get socket failed", nodeName, nodePort);
|
|
}
|
|
else
|
|
{
|
|
event = (struct epoll_event *) palloc0(sizeof(struct epoll_event));
|
|
event->events = EPOLLIN | EPOLLERR | EPOLLET;
|
|
event->data.fd = sock;
|
|
if (epoll_ctl(efd, EPOLL_CTL_ADD, sock, event) != 0)
|
|
{
|
|
elog(WARNING, "epoll_ctl add socket of %s:%d failed", nodeName, nodePort);
|
|
}
|
|
else
|
|
{
|
|
/*
 * Finally, append the connection for which we successfully sent the query and
 * registered its socket with epoll to the worker connection list.
 */
|
|
workerConnectionList = lappend(workerConnectionList, conn);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
workerConnectionCount = list_length(workerConnectionList);
|
|
if (workerConnectionCount == 0)
|
|
{
|
|
elog(ERROR, "Can't send bulkload copy to any worker");
|
|
}
|
|
|
|
/*
 * array representing the status of each worker connection:
 * -1: worker failed
 *  0: worker still running
 *  1: worker succeeded
 */
|
|
finish = (int *) palloc0(workerConnectionCount * sizeof(int));
|
|
|
|
PG_TRY();
|
|
{
|
|
int natts = CopyGetAttnums(relationId, copyStatement->attlist);
|
|
/*
 * Check the status of the workers before starting the zeromq server, in case
 * the bulkload copy command itself is invalid.
 * TODO@luoyuanhao: if an error occurs after EpollTimeout, there is still a
 * possibility of deadlock; this code should be refactored.
 */
|
|
do
|
|
{
|
|
nevents = epoll_wait(efd, events, MaxEvents, EpollTimeout * 10);
|
|
if (nevents == -1)
|
|
{
|
|
elog(ERROR, "epoll_wait error(%d): %s", errno, strerror(errno));
|
|
}
|
|
for (loopIndex = 0; loopIndex < nevents; loopIndex++)
|
|
{
|
|
conn = GetConnectionBySock(workerConnectionList, events[loopIndex].data.fd,
|
|
&connIdx);
|
|
Assert(conn != NULL);
|
|
if (finish[connIdx] != 0) continue;
|
|
/*
 * If the bulkload copy command is correct, there should be neither output nor an
 * error message on the socket; any event at this point means the bulkload copy
 * command is wrong.
 */
elog(WARNING, "bulkload copy in %s:%s failed, read the log file to get the error message",
PQhost(conn), PQport(conn));
|
|
finish[connIdx] = -1;
|
|
finishCount++;
|
|
}
|
|
} while(nevents != 0);
|
|
if (finishCount == workerConnectionCount)
|
|
{
|
|
elog(ERROR, "bulkload copy commands fail in all workers");
|
|
}
|
|
|
|
StartZeroMQServer(zeromqServer, copyStatement->is_program,
|
|
IsBinaryCopy(copyStatement), natts);
|
|
|
|
while (finishCount < workerConnectionCount)
|
|
{
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
/* send EOF message */
|
|
SendMessage(zeromqServer, "KILL", 4, true);
|
|
|
|
/*
 * Waiting indefinitely may cause a deadlock: we send a 'KILL' signal, but bload
 * may miss it and wait indefinitely, so the COPY command would never finish and
 * therefore there would be no responses (events) on the pg connection.
 */
|
|
nevents = epoll_wait(efd, events, MaxEvents, EpollTimeout);
|
|
if (nevents == -1)
|
|
{
|
|
elog(ERROR, "epoll_wait error(%d): %s", errno, strerror(errno));
|
|
}
|
|
for (loopIndex = 0; loopIndex < nevents; loopIndex++)
|
|
{
|
|
conn = GetConnectionBySock(workerConnectionList, events[loopIndex].data.fd,
|
|
&connIdx);
|
|
Assert(conn != NULL);
|
|
if (finish[connIdx] != 0) continue;
|
|
if (events[loopIndex].events & EPOLLERR)
|
|
{
|
|
elog(WARNING, "socket of %s:%s error", PQhost(conn), PQport(conn));
|
|
finish[connIdx] = -1;
|
|
finishCount++;
|
|
continue;
|
|
}
|
|
if (events[loopIndex].events & EPOLLIN)
|
|
{
|
|
rc = PQconsumeInput(conn);
|
|
if (rc == 0)
|
|
{
|
|
elog(WARNING, "%s:%s error:%s", PQhost(conn), PQport(conn),
|
|
PQerrorMessage(conn));
|
|
finish[connIdx] = -1;
|
|
finishCount++;
|
|
}
|
|
else
|
|
{
|
|
if (!PQisBusy(conn))
|
|
{
|
|
res = PQgetResult(conn);
|
|
if (res == NULL)
|
|
{
|
|
finish[connIdx] = 1;
|
|
}
|
|
else
|
|
{
|
|
if (PQresultStatus(res) != PGRES_COMMAND_OK &&
|
|
PQresultStatus(res) != PGRES_TUPLES_OK)
|
|
{
|
|
elog(WARNING, "%s:%s error:%s", PQhost(conn), PQport(conn),
|
|
PQresultErrorMessage(res));
|
|
finish[connIdx] = -1;
|
|
}
|
|
else
|
|
{
|
|
processedRowCount += atol(PQcmdTuples(res));
|
|
finish[connIdx] = 1;
|
|
}
|
|
PQclear(res);
|
|
}
|
|
finishCount++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
char *errbuf = (char *) palloc0(NAMEDATALEN);
|
|
foreach(workerCell, workerConnectionList)
|
|
{
|
|
PGconn *conn = (PGconn *) lfirst(workerCell);
|
|
PGcancel *cancel = PQgetCancel(conn);
|
|
if (cancel != NULL && PQcancel(cancel, errbuf, NAMEDATALEN) != 1)
|
|
{
|
|
elog(WARNING, "%s", errbuf);
|
|
}
|
|
PQfreeCancel(cancel);
|
|
}
|
|
StopZeroMQServer(zeromqServer);
|
|
|
|
PG_RE_THROW();
|
|
}
|
|
PG_END_TRY();
|
|
|
|
for (loopIndex = 0; loopIndex < workerConnectionCount; loopIndex++)
|
|
{
|
|
if (finish[loopIndex] == -1)
|
|
{
|
|
failCount++;
|
|
}
|
|
}
|
|
/*
|
|
* TODO@luoyuanhao: two phase commit, if failCount > 0, rollback.
|
|
*/
|
|
StopZeroMQServer(zeromqServer);
|
|
if (completionTag != NULL)
|
|
{
|
|
snprintf(completionTag, COMPLETION_TAG_BUFSIZE,
|
|
"COPY " UINT64_FORMAT, processedRowCount);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* IsBulkloadCopy checks if the given copy statement has the 'method' option
|
|
* and the value is 'bulkload'.
|
|
*/
|
|
bool
|
|
IsBulkloadCopy(CopyStmt *copyStatement)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
DefElem *defel = NULL;
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
defel = (DefElem *) lfirst(optionCell);
|
|
if (strcasecmp(defel->defname, "method") == 0)
|
|
{
|
|
char *method = defGetString(defel);
|
|
if (strcasecmp(method, "bulkload") != 0)
|
|
{
|
|
elog(ERROR, "Unsupported method: %s. Valid values('bulkload')", method);
|
|
}
|
|
else
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* IsBinaryCopy checks if the given copy statement has the 'format' option
|
|
* and the value is 'binary'.
|
|
*/
|
|
bool
|
|
IsBinaryCopy(CopyStmt *copyStatement)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
DefElem *defel = NULL;
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
defel = (DefElem *) lfirst(optionCell);
|
|
if (strcasecmp(defel->defname, "format") == 0)
|
|
{
|
|
char *format = defGetString(defel);
if (strcasecmp(format, "binary") == 0)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* IsBulkloadClient checks if the given copy statement has the 'bulkload_host' option.
|
|
*/
|
|
bool
|
|
IsBulkloadClient(CopyStmt *copyStatement)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
DefElem *defel = NULL;
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
defel = (DefElem *) lfirst(optionCell);
|
|
if (strcasecmp(defel->defname, "bulkload_host") == 0)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* RemoveBulkloadOptions removes bulkload related copy options from the option
|
|
* list of the copy statement.
|
|
*/
|
|
static void
|
|
RemoveBulkloadOptions(CopyStmt *copyStatement)
|
|
{
|
|
List *newOptionList = NIL;
|
|
ListCell *optionCell = NULL;
|
|
|
|
/* walk over the list of all options */
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
DefElem *option = (DefElem *) lfirst(optionCell);
|
|
|
|
/* skip bulkload related options */
|
|
if ((strcmp(option->defname, "bulkload_host") == 0) ||
|
|
(strcmp(option->defname, "bulkload_port") == 0) ||
|
|
(strcmp(option->defname, "method") == 0))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
newOptionList = lappend(newOptionList, option);
|
|
}
|
|
|
|
copyStatement->options = newOptionList;
|
|
}
|
|
|
|
/*
|
|
* BulkloadServerAddress gets the bulkload zeromq server address from copy options
|
|
* and returns it. Note that if the bulkload_port is not provided, we use 5557 as
|
|
* the default port.
|
|
*/
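/*
 * For example (the host name here is purely illustrative), a client-side statement
 * may carry options such as WITH (..., bulkload_host 'server-node', bulkload_port
 * 5560); without bulkload_port, 5557 is assumed.
 */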
|
|
static NodeAddress *
|
|
BulkloadServerAddress(CopyStmt *copyStatement)
|
|
{
|
|
NodeAddress *bulkloadServer = (NodeAddress *) palloc0(sizeof(NodeAddress));
|
|
char *nodeName = NULL;
|
|
|
|
/* set default port to 5557 */
|
|
uint32 nodePort = 5557;
|
|
|
|
ListCell *optionCell = NULL;
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
DefElem *defel = (DefElem *) lfirst(optionCell);
|
|
if (strncmp(defel->defname, "bulkload_host", NAMEDATALEN) == 0)
|
|
{
|
|
nodeName = defGetString(defel);
|
|
}
|
|
else if (strncmp(defel->defname, "bulkload_port", NAMEDATALEN) == 0)
|
|
{
|
|
nodePort = defGetInt32(defel);
|
|
}
|
|
}
|
|
|
|
bulkloadServer->nodeName = nodeName;
|
|
bulkloadServer->nodePort = nodePort;
|
|
return bulkloadServer;
|
|
}
|
|
|
|
/*
 * ConstructBulkloadCopyStmt constructs the text of a bulkload COPY statement for
 * execution on the bulkload copy clients (the worker nodes).
 */
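/*
 * A sketch of the command this produces (host names and paths are illustrative):
 *
 *   COPY public.lineitem FROM PROGRAM '/path_of_pg_home/bin/bload'
 *   WITH(master_host 'master-node', master_port 5432, method 'bulkload',
 *        bulkload_host 'server-node', bulkload_port 5560, format csv)
 *
 * The options after bulkload_port come from DeparseCopyStatementOptions().
 */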
|
|
static StringInfo
|
|
ConstructBulkloadCopyStmt(CopyStmt *copyStatement, NodeAddress *masterNodeAddress,
|
|
char *nodeName, uint32 nodePort)
|
|
{
|
|
char *schemaName = copyStatement->relation->schemaname;
|
|
char *relationName = copyStatement->relation->relname;
|
|
char *qualifiedName = quote_qualified_identifier(schemaName, relationName);
|
|
List *attlist = copyStatement->attlist;
|
|
ListCell *lc = NULL;
|
|
char *binaryPath = NULL;
|
|
StringInfo optionsString = NULL;
|
|
StringInfo command = NULL;
|
|
int res;
|
|
bool isfirst = true;
|
|
|
|
RemoveBulkloadOptions(copyStatement);
|
|
|
|
binaryPath = (char *) palloc0(NAMEDATALEN);
|
|
res = readlink("/proc/self/exe", binaryPath, NAMEDATALEN);
|
|
if (res == -1)
|
|
{
|
|
elog(ERROR, "%s", "Can't get absolute path of PG_HOME");
|
|
}
|
|
else
|
|
{
|
|
/*
 * The resolved path is "/path_of_pg_home/bin/postgres"; truncating the trailing
 * "postgres" (8 characters) leaves "/path_of_pg_home/bin/".
 */
|
|
binaryPath[res - 8] = '\0';
|
|
/* append 'bload' */
|
|
strcat(binaryPath, "bload");
|
|
}
|
|
|
|
optionsString = DeparseCopyStatementOptions(copyStatement->options);
|
|
command = makeStringInfo();
|
|
appendStringInfo(command, "COPY %s", qualifiedName);
|
|
if (list_length(attlist) != 0)
|
|
{
|
|
appendStringInfoChar(command, '(');
|
|
foreach(lc, attlist)
|
|
{
|
|
if (isfirst)
|
|
{
|
|
isfirst = false;
|
|
}
|
|
else
|
|
{
|
|
appendStringInfoString(command, ", ");
|
|
}
|
|
appendStringInfoString(command, strVal(lfirst(lc)));
|
|
}
|
|
appendStringInfoChar(command, ')');
|
|
}
|
|
appendStringInfo(command, " FROM PROGRAM '%s' WITH(master_host '%s', "
|
|
"master_port %d, method 'bulkload', bulkload_host '%s', bulkload_port %d",
|
|
binaryPath,
|
|
masterNodeAddress->nodeName,
|
|
masterNodeAddress->nodePort,
|
|
nodeName,
|
|
nodePort);
|
|
if (strlen(optionsString->data) != 0)
|
|
{
|
|
appendStringInfo(command, ", %s)", optionsString->data);
|
|
}
|
|
else
|
|
{
|
|
appendStringInfoChar(command, ')');
|
|
}
|
|
return command;
|
|
}
|
|
|
|
/*
 * DeparseCopyStatementOptions constructs the option text for the WITH clause of the
 * COPY statement.
 */
|
|
static StringInfo
|
|
DeparseCopyStatementOptions(List *options)
|
|
{
|
|
StringInfo optionsStr = makeStringInfo();
|
|
ListCell *option;
|
|
bool isfirst = true;
|
|
DefElem *defel = NULL;
|
|
foreach(option, options)
|
|
{
|
|
if (isfirst) isfirst = false;
|
|
else appendStringInfoString(optionsStr, ", ");
|
|
|
|
defel = (DefElem *) lfirst(option);
|
|
|
|
if (strcmp(defel->defname, "format") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "format %s", defGetString(defel));
|
|
}
|
|
else if (strcmp(defel->defname, "oids") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "oids %s", defGetBoolean(defel) ? "true" : "false");
|
|
}
|
|
else if (strcmp(defel->defname, "freeze") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "freeze %s", defGetBoolean(defel) ? "true" : "false");
|
|
}
|
|
else if (strcmp(defel->defname, "delimiter") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "delimiter '%s'", defGetString(defel));
|
|
}
|
|
else if (strcmp(defel->defname, "null") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "null '%s'", defGetString(defel));
|
|
}
|
|
else if (strcmp(defel->defname, "header") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "header %s", defGetBoolean(defel) ? "true" : "false");
|
|
}
|
|
else if (strcmp(defel->defname, "quote") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "quote '%s'", defGetString(defel));
|
|
}
|
|
else if (strcmp(defel->defname, "escape") == 0)
|
|
{
|
|
if (strcmp(defGetString(defel), "\\") == 0)
|
|
{
|
|
appendStringInfo(optionsStr, "quote '\\%s'", defGetString(defel));
|
|
}
|
|
else
|
|
{
|
|
appendStringInfo(optionsStr, "quote '%s'", defGetString(defel));
|
|
}
|
|
}
|
|
/* unhandled: force_quote, force_not_null, force_null, convert_selectively and encoding */
|
|
//else if (strcmp(defel->defname, "force_quote") == 0)
|
|
//{
|
|
// if (cstate->force_quote || cstate->force_quote_all)
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_SYNTAX_ERROR),
|
|
// errmsg("conflicting or redundant options")));
|
|
// if (defel->arg && IsA(defel->arg, A_Star))
|
|
// cstate->force_quote_all = true;
|
|
// else if (defel->arg && IsA(defel->arg, List))
|
|
// cstate->force_quote = (List *) defel->arg;
|
|
// else
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
// errmsg("argument to option \"%s\" must be a list of column names",
|
|
// defel->defname)));
|
|
//}
|
|
//else if (strcmp(defel->defname, "force_not_null") == 0)
|
|
//{
|
|
// if (cstate->force_notnull)
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_SYNTAX_ERROR),
|
|
// errmsg("conflicting or redundant options")));
|
|
// if (defel->arg && IsA(defel->arg, List))
|
|
// cstate->force_notnull = (List *) defel->arg;
|
|
// else
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
// errmsg("argument to option \"%s\" must be a list of column names",
|
|
// defel->defname)));
|
|
//}
|
|
//else if (strcmp(defel->defname, "force_null") == 0)
|
|
//{
|
|
// if (cstate->force_null)
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_SYNTAX_ERROR),
|
|
// errmsg("conflicting or redundant options")));
|
|
// if (defel->arg && IsA(defel->arg, List))
|
|
// cstate->force_null = (List *) defel->arg;
|
|
// else
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
// errmsg("argument to option \"%s\" must be a list of column names",
|
|
// defel->defname)));
|
|
//}
|
|
//else if (strcmp(defel->defname, "convert_selectively") == 0)
|
|
//{
|
|
// /*
|
|
// * Undocumented, not-accessible-from-SQL option: convert only the
|
|
// * named columns to binary form, storing the rest as NULLs. It's
|
|
// * allowed for the column list to be NIL.
|
|
// */
|
|
// if (cstate->convert_selectively)
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_SYNTAX_ERROR),
|
|
// errmsg("conflicting or redundant options")));
|
|
// cstate->convert_selectively = true;
|
|
// if (defel->arg == NULL || IsA(defel->arg, List))
|
|
// cstate->convert_select = (List *) defel->arg;
|
|
// else
|
|
// ereport(ERROR,
|
|
// (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
// errmsg("argument to option \"%s\" must be a list of column names",
|
|
// defel->defname)));
|
|
//}
|
|
//else if (strcmp(defel->defname, "encoding") == 0)
|
|
//{
|
|
// appendStringInfo(optionsStr, "encoding '%s'", defGetString(defel));
|
|
//}
|
|
else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("option \"%s\" not recognized",
|
|
defel->defname)));
|
|
}
|
|
return optionsStr;
|
|
}
|
|
/*
 * BulkloadCopyToNewShards executes the bulkload COPY command sent by the bulkload
 * server for an append-distributed table.
 * It acts just like CopyToNewShards(), except that records are received from the
 * zeromq server.
 */
|
|
static void
|
|
BulkloadCopyToNewShards(CopyStmt *copyStatement, char *completionTag, Oid relationId)
|
|
{
|
|
NodeAddress *bulkloadServer = BulkloadServerAddress(copyStatement);
|
|
RemoveBulkloadOptions(copyStatement);
|
|
RebuildBulkloadCopyStatement(copyStatement, bulkloadServer);
|
|
CopyToNewShards(copyStatement, completionTag, relationId);
|
|
}
|
|
/*
 * LocalAddress gets the host and port of the currently running postgres.
 */
|
|
static NodeAddress *
|
|
LocalAddress(void)
|
|
{
|
|
NodeAddress *node = (NodeAddress *) palloc0(sizeof(NodeAddress));
|
|
char *host = (char *) palloc0(32);
|
|
const char *portStr = GetConfigOption("port", true, false);
|
|
int rc = gethostname(host, 32);
|
|
if (rc != 0)
|
|
{
|
|
strcpy(host, "localhost");
|
|
elog(WARNING, "gethostname fail: %s, use 'localhost'", strerror(errno));
|
|
}
|
|
node->nodeName = host;
|
|
if (portStr == NULL)
|
|
{
|
|
node->nodePort = 5432;
|
|
}
|
|
else
|
|
{
|
|
node->nodePort = atoi(portStr);
|
|
}
|
|
return node;
|
|
}
|
|
|
|
/*
 * MasterWorkerNodeList fetches the active worker node list, either from the local
 * metadata or from the remote master node, depending on the master connection state.
 */
|
|
static List *
|
|
MasterWorkerNodeList(void)
|
|
{
|
|
List *workerNodeList = NIL;
|
|
if (masterConnection == NULL)
|
|
{
|
|
workerNodeList = WorkerNodeList();
|
|
}
|
|
else
|
|
{
|
|
workerNodeList = RemoteWorkerNodeList();
|
|
}
|
|
|
|
return workerNodeList;
|
|
}
|
|
|
|
/*
|
|
* RemoteWorkerNodeList gets the active worker node list from the remote master node.
|
|
*/
|
|
static List *
|
|
RemoteWorkerNodeList(void)
|
|
{
|
|
List *workerNodeList = NIL;
|
|
PGresult *queryResult = NULL;
|
|
|
|
StringInfo workerNodeCommand = makeStringInfo();
|
|
appendStringInfoString(workerNodeCommand, ACTIVE_WORKER_NODE_QUERY);
|
|
|
|
queryResult = PQexec(masterConnection->pgConn, workerNodeCommand->data);
|
|
if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
|
|
{
|
|
int rowCount = PQntuples(queryResult);
|
|
int rowIndex = 0;
|
|
|
|
for (rowIndex = 0; rowIndex < rowCount; rowIndex++)
|
|
{
|
|
WorkerNode *workerNode =
|
|
(WorkerNode *) palloc0(sizeof(WorkerNode));
|
|
|
|
char *host = PQgetvalue(queryResult, rowIndex, 0);
|
|
char *port = PQgetvalue(queryResult, rowIndex, 1);
|
|
strcpy(workerNode->workerName, host);
|
|
workerNode->workerPort = atoi(port);
|
|
|
|
workerNodeList = lappend(workerNodeList, workerNode);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "could not get active worker node list from the master node: %s",
|
|
PQresultErrorMessage(queryResult));
|
|
}
|
|
PQclear(queryResult);
|
|
|
|
return workerNodeList;
|
|
}
|
|
|
|
/*
 * StartZeroMQServer starts the zeromq sockets, reads data from a file, the remote
 * frontend, or the output of a program, and sends it to the zeromq clients.
 * TODO@luoyuanhao: Currently we don't support bulkload copy from stdin with binary
 * format.
 */
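/*
 * Endpoint layout used below: records are pushed on a PUSH socket bound to
 * tcp://*:port, while the shutdown ('KILL') message is published on a PUB socket
 * bound to tcp://*:(port + 1) that the bload clients subscribe to. Input comes from
 * the frontend ('d' CopyData messages), a file, or a program pipe.
 */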
|
|
static void
|
|
StartZeroMQServer(ZeroMQServer *zeromqServer, bool is_program, bool binary, int natts)
|
|
{
|
|
uint64_t start = 0, read = 0;
|
|
FILE *fp = NULL;
|
|
char *buf = NULL;
|
|
char zeroaddr[32];
|
|
void *context = NULL;
|
|
void *sender = NULL;
|
|
void *controller = NULL;
|
|
char *file = zeromqServer->file;
|
|
bool pipe = (strlen(file) == 0);
|
|
StringInfoData msgbuf;
|
|
int16 format = binary ? 1 : 0;
|
|
int loopIdx;
|
|
bool copyDone = false;
|
|
|
|
context = zmq_ctx_new ();
|
|
Assert(context != NULL);
|
|
/* socket to send messages on */
|
|
sender = zmq_socket(context, ZMQ_PUSH);
|
|
if (sender == NULL)
|
|
{
|
|
elog(ERROR, "zmq_socket() error(%d): %s", errno, zmq_strerror(errno));
|
|
}
|
|
/* socket for the control signal */
|
|
controller = zmq_socket(context, ZMQ_PUB);
|
|
if (controller == NULL)
|
|
{
|
|
elog(ERROR, "zmq_socket() error(%d): %s", errno, zmq_strerror(errno));
|
|
}
|
|
|
|
zeromqServer->context = context;
|
|
zeromqServer->sender = sender;
|
|
zeromqServer->controller = controller;
|
|
|
|
sprintf(zeroaddr, "tcp://*:%d", zeromqServer->port);
|
|
if (zmq_bind (sender, zeroaddr) != 0)
|
|
{
|
|
elog(ERROR, "zmq_bind() error(%d): %s", errno, zmq_strerror(errno));
|
|
}
|
|
sprintf(zeroaddr, "tcp://*:%d", zeromqServer->port + 1);
|
|
if (zmq_bind (controller, zeroaddr) != 0)
|
|
{
|
|
elog(ERROR, "zmq_bind() error(%d): %s", errno, zmq_strerror(errno));
|
|
}
|
|
|
|
if (pipe)
|
|
{
|
|
/*
|
|
* inspired by ReceivedCopyBegin()
|
|
*/
|
|
Assert(!binary);
|
|
pq_beginmessage(&msgbuf, 'G');
|
|
pq_sendbyte(&msgbuf, format); /* overall format */
|
|
pq_sendint(&msgbuf, natts, 2);
|
|
for (loopIdx = 0; loopIdx < natts; loopIdx++)
|
|
pq_sendint(&msgbuf, format, 2); /* per-column formats */
|
|
pq_endmessage(&msgbuf);
|
|
pq_flush();
|
|
|
|
initStringInfo(&msgbuf);
|
|
/* get records from fe */
|
|
while (!copyDone)
|
|
{
|
|
int mtype;
|
|
CHECK_FOR_INTERRUPTS();
|
|
HOLD_CANCEL_INTERRUPTS();
|
|
/*
|
|
* inspired by CopyGetData()
|
|
*/
|
|
pq_startmsgread();
|
|
mtype = pq_getbyte();
|
|
if (mtype == EOF)
|
|
elog(ERROR, "unexpected EOF on client connection with an open transaction");
|
|
if (pq_getmessage(&msgbuf, 0))
|
|
elog(ERROR, "unexpected EOF on client connection with an open transaction");
|
|
RESUME_CANCEL_INTERRUPTS();
|
|
switch (mtype)
|
|
{
|
|
case 'd': /* CopyData */
|
|
SendMessage(zeromqServer, msgbuf.data, msgbuf.len, false);
|
|
break;
|
|
case 'c': /* CopyDone */
|
|
/* COPY IN correctly terminated by frontend */
|
|
copyDone = true;
|
|
break;
|
|
case 'f': /* CopyFail */
|
|
elog(ERROR, "COPY from stdin failed: %s", pq_getmsgstring(&msgbuf));
|
|
break;
|
|
case 'H': /* Flush */
|
|
case 'S': /* Sync */
|
|
break;
|
|
default:
|
|
elog(ERROR, "unexpected message type 0x%02X during COPY from stdin", mtype);
|
|
break;
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
Assert(!pipe);
|
|
if (is_program)
|
|
{
|
|
fp = popen(file, PG_BINARY_R);
|
|
if (fp == NULL)
|
|
{
|
|
elog(ERROR, "could not execute command \"%s\"", file);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
struct stat st;
|
|
fp = fopen(file, PG_BINARY_R);
|
|
if (fp == NULL)
|
|
{
|
|
elog(ERROR, "could not open file \"%s\": %s", file, strerror(errno));
|
|
}
|
|
if (fstat(fileno(fp), &st))
|
|
{
|
|
elog(ERROR, "could not stat file \"%s\"", file);
|
|
}
|
|
if (S_ISDIR(st.st_mode))
|
|
{
|
|
elog(ERROR, "\"%s\" is a directory", file);
|
|
}
|
|
}
|
|
|
|
buf = (char *) palloc0(BatchSize + MaxRecordSize + 1);
|
|
Assert(buf != NULL);
|
|
|
|
if (!binary)
|
|
{
|
|
while (true)
|
|
{
|
|
uint64_t i;
|
|
CHECK_FOR_INTERRUPTS();
|
|
start = 0;
|
|
read = fread(buf + start, 1, BatchSize, fp);
|
|
start += read;
|
|
if (read < BatchSize) break;
|
|
for (i = 0; i < MaxRecordSize; i++)
|
|
{
|
|
read = fread(buf + start, 1, 1, fp);
|
|
if (read == 0) break;
|
|
Assert(read == 1);
|
|
start += read;
|
|
if (buf[start - 1] == '\n') break;
|
|
}
|
|
if (i == MaxRecordSize)
|
|
{
|
|
char *tmp = (char*) palloc0(MaxRecordSize + 1);
|
|
strncpy(tmp, buf + start - MaxRecordSize, MaxRecordSize);
|
|
tmp[MaxRecordSize] = '\0';
|
|
elog(ERROR, "Too large record: %s", tmp);
|
|
}
|
|
else
|
|
{
|
|
SendMessage(zeromqServer, buf, start, false);
|
|
}
|
|
}
|
|
if (start > 0)
|
|
{
|
|
SendMessage(zeromqServer, buf, start, false);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int32 flag, elen;
|
|
int16 fld_count;
|
|
|
|
/* Signature */
|
|
read = fread(buf, 1, 11, fp);
|
|
if (read != 11 || strncmp(buf, BinarySignature, 11) != 0)
|
|
{
|
|
elog(ERROR, "COPY file signature not recognized");
|
|
}
|
|
/* Flags field */
|
|
read = fread(buf, 1, 4, fp);
|
|
if (read != 4)
|
|
{
|
|
elog(ERROR, "invalid COPY file header (missing flags)");
|
|
}
|
|
flag = (int32) ntohl(*(uint32 *)buf);
|
|
if ((flag & (1 << 16)) != 0)
|
|
{
|
|
elog(ERROR, "bulkload COPY can't set OID flag");
|
|
}
|
|
flag &= ~(1 << 16);
|
|
if ((flag >> 16) != 0)
|
|
{
|
|
elog(ERROR, "unrecognized critical flags in COPY file header");
|
|
}
|
|
/* Header extension length */
|
|
read = fread(buf, 1, 4, fp);
|
|
if (read != 4)
|
|
{
|
|
elog(ERROR, "invalid COPY file header (missing length)");
|
|
}
|
|
elen = (int32) ntohl(*(uint32 *)buf);
|
|
/* Skip extension header, if present */
|
|
read = fread(buf, 1, elen, fp);
|
|
if (read != elen)
|
|
{
|
|
elog(ERROR, "invalid COPY file header (wrong length)");
|
|
}
|
|
|
|
/* handle tuples one by one */
|
|
while (true)
|
|
{
|
|
int16 fld_index;
|
|
int32 fld_size;
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
start = 0;
|
|
read = fread(buf + start, 1, 2, fp);
|
|
if (read != 2)
|
|
{
|
|
/* EOF detected (end of file, or protocol-level EOR) */
|
|
break;
|
|
}
|
|
fld_count = (int16) ntohs(*(uint16 *)buf);
|
|
if (fld_count == -1)
|
|
{
|
|
read = fread(buf + start, 1, 1, fp);
|
|
if (read == 1)
|
|
{
|
|
elog(ERROR, "received copy data after EOF marker");
|
|
}
|
|
/* Received EOF marker */
|
|
break;
|
|
}
|
|
start += 2;
|
|
for (fld_index = 0; fld_index < fld_count; fld_index++)
|
|
{
|
|
read = fread(buf + start, 1, 4, fp);
|
|
if (read != 4)
|
|
{
|
|
elog(ERROR, "unexpected EOF in COPY data");
|
|
}
|
|
fld_size = (int32) ntohl(*(uint32 *)(buf + start));
|
|
if (fld_size == -1)
|
|
{
|
|
/* null value */
|
|
start += 4;
|
|
}
|
|
else if (fld_size < 0)
|
|
{
|
|
elog(ERROR, "invalid field size %d", fld_size);
|
|
}
|
|
else
|
|
{
|
|
start += 4;
|
|
read = fread(buf + start, 1, fld_size, fp);
|
|
if (read != fld_size)
|
|
{
|
|
elog(ERROR, "unexpected EOF in COPY data");
|
|
}
|
|
else
|
|
{
|
|
/* skip field value */
|
|
start += fld_size;
|
|
}
|
|
}
|
|
|
|
if (start >= MaxRecordSize + BatchSize)
|
|
{
|
|
elog(ERROR, "Too large binary record: %s", buf);
|
|
}
|
|
}
|
|
SendMessage(zeromqServer, buf, start, false);
|
|
}
|
|
}
|
|
if (is_program)
|
|
{
|
|
int rc = pclose(fp);
|
|
if (rc == -1)
|
|
{
|
|
elog(WARNING, "could not close pipe to external command \"%s\"", file);
|
|
}
|
|
else if (rc != 0)
|
|
{
|
|
elog(WARNING, "program \"%s\" failed", file);
|
|
}
|
|
}
|
|
else if (fclose(fp) != 0)
|
|
{
|
|
elog(WARNING, "close file error: %s", strerror(errno));
|
|
}
|
|
}
|
|
|
|
/*
 * SendMessage sends a message on a zeromq socket.
 * If kill is true, the message is published on the controller socket as a
 * termination signal; otherwise it is pushed on the data (sender) socket.
 */
|
|
static void
|
|
SendMessage(ZeroMQServer *zeromqServer, char *buf, size_t len, bool kill)
|
|
{
|
|
int rc;
|
|
if (kill)
|
|
{
|
|
rc = zmq_send(zeromqServer->controller, buf, len, 0);
|
|
}
|
|
else
|
|
{
|
|
rc = zmq_send(zeromqServer->sender, buf, len, 0);
|
|
}
|
|
if (rc != len)
|
|
{
|
|
elog(LOG, "zmq_send() error(%d): %s", errno, zmq_strerror(errno));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* StopZeroMQServer stops zeromq server and releases related resources.
|
|
*/
|
|
static void
|
|
StopZeroMQServer(ZeroMQServer *zeromqServer)
|
|
{
|
|
if (zeromqServer->sender)
|
|
{
|
|
zmq_close(zeromqServer->sender);
|
|
zeromqServer->sender = NULL;
|
|
}
|
|
if (zeromqServer->controller)
|
|
{
|
|
zmq_close(zeromqServer->controller);
|
|
zeromqServer->controller = NULL;
|
|
}
|
|
if (zeromqServer->context)
|
|
{
|
|
zmq_ctx_destroy(zeromqServer->context);
|
|
zeromqServer->context = NULL;
|
|
}
|
|
}
|
|
|
|
/*
 * RebuildBulkloadCopyStatement appends the bulkload server address (and the binary
 * flag, if applicable) to the PROGRAM command line as arguments.
 */
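/*
 * For example (paths and host names are illustrative), a statement whose filename
 * is '/path_of_pg_home/bin/bload' and whose bulkload server is server-node:5560,
 * with binary format, ends up with the filename
 * '/path_of_pg_home/bin/bload server-node 5560 binary'.
 */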
|
|
static void
|
|
RebuildBulkloadCopyStatement(CopyStmt *copyStatement, NodeAddress *bulkloadServer)
|
|
{
|
|
StringInfo tmp = makeStringInfo();
|
|
appendStringInfo(tmp, "%s %s %d", copyStatement->filename, bulkloadServer->nodeName,
|
|
bulkloadServer->nodePort);
|
|
if (IsBinaryCopy(copyStatement))
|
|
{
|
|
appendStringInfoString(tmp, " binary");
|
|
}
|
|
copyStatement->filename = tmp->data;
|
|
}
|
|
/*
 * MasterRelationId gets the relationId of the relation from the master node.
 */
|
|
static Oid
|
|
MasterRelationId(char *qualifiedName)
|
|
{
|
|
Oid relationId = 0;
|
|
PGresult *queryResult = NULL;
|
|
|
|
StringInfo relationIdCommand = makeStringInfo();
|
|
appendStringInfo(relationIdCommand, RELATIONID_QUERY, qualifiedName);
|
|
|
|
queryResult = PQexec(masterConnection->pgConn, relationIdCommand->data);
|
|
if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
|
|
{
|
|
char *relationIdString = PQgetvalue(queryResult, 0, 0);
|
|
if (relationIdString == NULL || (*relationIdString) == '\0')
|
|
{
|
|
elog(ERROR, "could not find relationId for the table %s", qualifiedName);
|
|
}
|
|
|
|
relationId = (Oid) atoi(relationIdString);
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "could not get the relationId of the distributed table %s: %s",
|
|
qualifiedName, PQresultErrorMessage(queryResult));
|
|
}
|
|
PQclear(queryResult);
|
|
return relationId;
|
|
}
|
|
|
|
/*
 * MasterDistributedTableCacheEntry gets metadata from the master node and
 * builds a DistTableCacheEntry for the relation.
 */
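/*
 * Note on the implementation below: the entry is built from pg_dist_partition and
 * pg_dist_shard rows fetched over the master connection, and shard min/max values
 * are parsed with the int4 input function (intervalTypeId is hard-coded to INT4OID),
 * which matches hash-partitioned tables.
 */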
|
|
static DistTableCacheEntry *
|
|
MasterDistributedTableCacheEntry(RangeVar *relation)
|
|
{
|
|
DistTableCacheEntry *cacheEntry = NULL;
|
|
Oid relationId = 0;
|
|
|
|
/* temporary value */
|
|
char *partmethod = NULL;
|
|
char *colocationid = NULL;
|
|
char *repmodel = NULL;
|
|
char *shardidString = NULL;
|
|
char *storageString = NULL;
|
|
char *minValueString = NULL;
|
|
char *maxValueString = NULL;
|
|
char *shardidStringEnd = NULL;
|
|
|
|
/* members of pg_dist_partition and DistTableCacheEntry */
|
|
char *partitionKeyString = NULL;
|
|
char partitionMethod = 0;
|
|
uint32 colocationId = INVALID_COLOCATION_ID;
|
|
char replicationModel = 0;
|
|
|
|
/* members of pg_dist_shard */
|
|
int64 shardId;
|
|
char storageType;
|
|
Datum minValue = 0;
|
|
Datum maxValue = 0;
|
|
bool minValueExists = true;
|
|
bool maxValueExists = true;
|
|
|
|
/* members of DistTableCacheEntry */
|
|
int shardIntervalArrayLength = 0;
|
|
ShardInterval **shardIntervalArray = NULL;
|
|
ShardInterval **sortedShardIntervalArray = NULL;
|
|
FmgrInfo *shardIntervalCompareFunction = NULL;
|
|
FmgrInfo *hashFunction = NULL;
|
|
bool hasUninitializedShardInterval = false;
|
|
bool hasUniformHashDistribution = false;
|
|
|
|
ShardInterval *shardInterval = NULL;
|
|
Oid intervalTypeId = INT4OID;
|
|
int32 intervalTypeMod = -1;
|
|
int16 intervalTypeLen = 0;
|
|
bool intervalByVal = false;
|
|
char intervalAlign = '0';
|
|
char intervalDelim = '0';
|
|
Oid typeIoParam = InvalidOid;
|
|
Oid inputFunctionId = InvalidOid;
|
|
PGresult *queryResult = NULL;
|
|
StringInfo partitionKeyStringInfo = makeStringInfo();
|
|
StringInfo queryString = makeStringInfo();
|
|
|
|
char *relationName = relation->relname;
|
|
char *schemaName = relation->schemaname;
|
|
char *qualifiedName = quote_qualified_identifier(schemaName, relationName);
|
|
relationId = MasterRelationId(qualifiedName);
|
|
|
|
Assert(masterConnection != NULL);
|
|
|
|
appendStringInfo(queryString, "SELECT * FROM pg_dist_partition WHERE logicalrelid=%d",
|
|
relationId);
|
|
queryResult = PQexec(masterConnection->pgConn, queryString->data);
|
|
if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
|
|
{
|
|
int rowCount = PQntuples(queryResult);
|
|
Assert(rowCount == 1);
|
|
|
|
partmethod = PQgetvalue(queryResult, 0, Anum_pg_dist_partition_partmethod - 1);
|
|
partitionKeyString = PQgetvalue(queryResult, 0, Anum_pg_dist_partition_partkey - 1);
|
|
colocationid = PQgetvalue(queryResult, 0, Anum_pg_dist_partition_colocationid - 1);
|
|
repmodel = PQgetvalue(queryResult, 0, Anum_pg_dist_partition_repmodel - 1);
|
|
|
|
partitionMethod = partmethod[0];
|
|
appendStringInfoString(partitionKeyStringInfo, partitionKeyString);
|
|
partitionKeyString = partitionKeyStringInfo->data;
|
|
colocationId = (uint32) atoi(colocationid);
|
|
replicationModel = repmodel[0];
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "could not get metadata of table %s: %s",
|
|
qualifiedName, PQresultErrorMessage(queryResult));
|
|
}
|
|
PQclear(queryResult);
|
|
|
|
get_type_io_data(intervalTypeId, IOFunc_input, &intervalTypeLen, &intervalByVal,
|
|
&intervalAlign, &intervalDelim, &typeIoParam, &inputFunctionId);
|
|
|
|
resetStringInfo(queryString);
|
|
appendStringInfo(queryString, "SELECT * FROM pg_dist_shard WHERE logicalrelid=%d",
|
|
relationId);
|
|
queryResult = PQexec(masterConnection->pgConn, queryString->data);
|
|
if (PQresultStatus(queryResult) == PGRES_TUPLES_OK)
|
|
{
|
|
int arrayIndex = 0;
|
|
|
|
shardIntervalArrayLength = PQntuples(queryResult);
|
|
shardIntervalArray = (ShardInterval **) palloc0(
|
|
shardIntervalArrayLength * sizeof(ShardInterval *));
|
|
|
|
for (arrayIndex = 0; arrayIndex < shardIntervalArrayLength; arrayIndex++)
|
|
{
|
|
shardidString =
|
|
PQgetvalue(queryResult, arrayIndex, Anum_pg_dist_shard_shardid - 1);
|
|
storageString =
|
|
PQgetvalue(queryResult, arrayIndex, Anum_pg_dist_shard_shardstorage - 1);
|
|
minValueString =
|
|
PQgetvalue(queryResult, arrayIndex, Anum_pg_dist_shard_shardminvalue - 2);
|
|
maxValueString =
|
|
PQgetvalue(queryResult, arrayIndex, Anum_pg_dist_shard_shardmaxvalue - 2);
|
|
|
|
shardId = strtoul(shardidString, &shardidStringEnd, 0);
|
|
storageType = storageString[0];
|
|
/* finally convert min/max values to their actual types */
|
|
minValue = OidInputFunctionCall(inputFunctionId, minValueString,
|
|
typeIoParam, intervalTypeMod);
|
|
maxValue = OidInputFunctionCall(inputFunctionId, maxValueString,
|
|
typeIoParam, intervalTypeMod);
|
|
|
|
shardInterval = CitusMakeNode(ShardInterval);
|
|
shardInterval->relationId = relationId;
|
|
shardInterval->storageType = storageType;
|
|
shardInterval->valueTypeId = intervalTypeId;
|
|
shardInterval->valueTypeLen = intervalTypeLen;
|
|
shardInterval->valueByVal = intervalByVal;
|
|
shardInterval->minValueExists = minValueExists;
|
|
shardInterval->maxValueExists = maxValueExists;
|
|
shardInterval->minValue = minValue;
|
|
shardInterval->maxValue = maxValue;
|
|
shardInterval->shardId = shardId;
|
|
|
|
shardIntervalArray[arrayIndex] = shardInterval;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "could not get metadata of table %s: %s",
|
|
qualifiedName, PQresultErrorMessage(queryResult));
|
|
}
|
|
PQclear(queryResult);
|
|
|
|
/* decide and allocate interval comparison function */
|
|
if (shardIntervalArrayLength > 0)
|
|
{
|
|
shardIntervalCompareFunction = GetFunctionInfo(INT4OID, BTREE_AM_OID,
|
|
BTORDER_PROC);
|
|
}
|
|
|
|
/* sort the interval array */
|
|
sortedShardIntervalArray = SortShardIntervalArray(shardIntervalArray,
|
|
shardIntervalArrayLength,
|
|
shardIntervalCompareFunction);
|
|
|
|
/* check if there exists any shard intervals with no min/max values */
|
|
hasUninitializedShardInterval =
|
|
HasUninitializedShardInterval(sortedShardIntervalArray, shardIntervalArrayLength);
|
|
|
|
/* we only need hash functions for hash distributed tables */
|
|
if (partitionMethod == DISTRIBUTE_BY_HASH)
|
|
{
|
|
TypeCacheEntry *typeEntry = NULL;
|
|
Node *partitionNode = stringToNode(partitionKeyString);
|
|
Var *partitionColumn = (Var *) partitionNode;
|
|
Assert(IsA(partitionNode, Var));
|
|
typeEntry = lookup_type_cache(partitionColumn->vartype,
|
|
TYPECACHE_HASH_PROC_FINFO);
|
|
|
|
hashFunction = (FmgrInfo *) palloc0(sizeof(FmgrInfo));
|
|
|
|
fmgr_info_copy(hashFunction, &(typeEntry->hash_proc_finfo), CurrentMemoryContext);
|
|
|
|
/* check the shard distribution for hash partitioned tables */
|
|
hasUniformHashDistribution =
|
|
HasUniformHashDistribution(sortedShardIntervalArray, shardIntervalArrayLength);
|
|
}
|
|
|
|
cacheEntry = (DistTableCacheEntry *) palloc0(sizeof(DistTableCacheEntry));
|
|
cacheEntry->relationId = relationId;
|
|
cacheEntry->isValid = true;
|
|
cacheEntry->isDistributedTable = true;
|
|
cacheEntry->partitionKeyString = partitionKeyString;
|
|
cacheEntry->partitionMethod = partitionMethod;
|
|
cacheEntry->colocationId = colocationId;
|
|
cacheEntry->replicationModel = replicationModel;
|
|
cacheEntry->shardIntervalArrayLength = shardIntervalArrayLength;
|
|
cacheEntry->sortedShardIntervalArray = sortedShardIntervalArray;
|
|
cacheEntry->shardIntervalCompareFunction = shardIntervalCompareFunction;
|
|
cacheEntry->hashFunction = hashFunction;
|
|
cacheEntry->hasUninitializedShardInterval = hasUninitializedShardInterval;
|
|
cacheEntry->hasUniformHashDistribution = hasUniformHashDistribution;
|
|
|
|
return cacheEntry;
|
|
}
|
|
|
|
/*
 * BulkloadCopyToExistingShards implements COPY table_name FROM ... for hash-
 * distributed tables, where the shards to copy into already exist. It works just
 * like CopyToExistingShards, except that the latter runs only on the master node
 * while the former runs on every worker node, so BulkloadCopyToExistingShards can
 * be roughly #workers times faster than CopyToExistingShards for hash-distributed
 * tables.
 */
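/*
 * Sketch of how this works: the cache entry built from the master's metadata is
 * injected into the local metadata cache via InsertDistTableCacheEntry(), so that
 * CopyToExistingShards() running on this worker can route rows to shards without
 * asking the master for each tuple.
 */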
|
|
static void
|
|
BulkloadCopyToExistingShards(CopyStmt *copyStatement, char *completionTag,
|
|
Oid relationId)
|
|
{
|
|
DistTableCacheEntry *cacheEntry = NULL;
|
|
NodeAddress *bulkloadServer = NULL;
|
|
|
|
Assert(masterConnection != NULL);
|
|
cacheEntry = MasterDistributedTableCacheEntry(copyStatement->relation);
|
|
InsertDistTableCacheEntry(relationId, cacheEntry);
|
|
|
|
bulkloadServer = BulkloadServerAddress(copyStatement);
|
|
RemoveBulkloadOptions(copyStatement);
|
|
RebuildBulkloadCopyStatement(copyStatement, bulkloadServer);
|
|
|
|
CopyToExistingShards(copyStatement, completionTag, relationId);
|
|
}
|
|
|
|
/*
 * GetConnectionBySock returns the PGconn* with the given socket descriptor from the
 * connection list and sets *connIdx to its index, or returns NULL if none matches.
 */
|
|
static PGconn *
|
|
GetConnectionBySock(List *connList, int sock, int *connIdx)
|
|
{
|
|
PGconn *conn = NULL;
|
|
int idx;
|
|
int n = list_length(connList);
|
|
for (idx = 0; idx < n; idx++)
|
|
{
|
|
conn = (PGconn *) list_nth(connList, idx);
|
|
if (PQsocket(conn) == sock)
|
|
{
|
|
*connIdx = idx;
|
|
return conn;
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|