mirror of https://github.com/citusdata/citus.git
3997 lines
115 KiB
C
3997 lines
115 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* multi_copy.c
|
|
* This file contains implementation of COPY utility for distributed
|
|
* tables.
|
|
*
|
|
* The CitusCopyFrom function should be called from the utility hook to process
|
|
* COPY ... FROM commands on distributed tables. CitusCopyFrom parses the input
|
|
* from stdin, a program, or a file, and decides to copy new rows to existing
|
|
* shards or new shards based on the partition method of the distributed table.
|
|
*
|
|
* If this is the first command in the transaction, we open a new connection for
|
|
* every shard placement. Otherwise we open as many connections as we can to
|
|
* not conflict with previous commands in transactions, in which case some shards
|
|
* may share connections. See the comments of CopyConnectionState for how we
|
|
* operate in that case.
|
|
*
|
|
* We use the PQputCopyData function to copy the data. Because PQputCopyData
|
|
* transmits data asynchronously, the workers will ingest data at least partially
|
|
* in parallel.
|
|
*
|
|
* For hash-partitioned tables, if it fails to connect to a worker, the master
|
|
* rollbacks the distributed transaction, similar to the way DML statements
|
|
* are handled. If a failure occurs after connecting, the transaction
|
|
* is rolled back on all the workers. Note that,
|
|
* in the case of append-partitioned tables, if a fail occurs, immediately
|
|
* metadata changes are rolled back on the master node, but shard placements
|
|
* are left on the worker nodes.
|
|
*
|
|
* By default, COPY uses normal transactions on the workers. In the case of
|
|
* hash or range-partitioned tables, this can cause a problem when some of the
|
|
* transactions fail to commit while others have succeeded. To ensure no data
|
|
* is lost, COPY uses two-phase commit.
|
|
*
|
|
* Parsing options are processed and enforced on the node where copy command
|
|
* is run, while constraints are enforced on the worker. In either case,
|
|
* failure causes the whole COPY to roll back.
|
|
*
|
|
* Copyright (c) Citus Data, Inc.
|
|
*
|
|
* With contributions from Postgres Professional.
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
#include "libpq-fe.h"
|
|
#include "miscadmin.h"
|
|
#include "pgstat.h"
|
|
|
|
#include <arpa/inet.h> /* for htons */
|
|
#include <netinet/in.h> /* for htons */
|
|
#include <string.h>
|
|
|
|
#include "distributed/pg_version_constants.h"
|
|
|
|
#include "access/htup_details.h"
|
|
#include "access/htup.h"
|
|
#include "access/sdir.h"
|
|
#include "access/sysattr.h"
|
|
#include "access/xact.h"
|
|
#include "catalog/namespace.h"
|
|
#include "catalog/pg_attribute.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "commands/copy.h"
|
|
#include "commands/defrem.h"
|
|
#include "commands/progress.h"
|
|
#include "distributed/citus_safe_lib.h"
|
|
#include "distributed/commands/multi_copy.h"
|
|
#include "distributed/commands/utility_hook.h"
|
|
#include "distributed/intermediate_results.h"
|
|
#include "distributed/listutils.h"
|
|
#include "distributed/local_executor.h"
|
|
#include "distributed/log_utils.h"
|
|
#include "distributed/coordinator_protocol.h"
|
|
#include "distributed/metadata_cache.h"
|
|
#include "distributed/multi_executor.h"
|
|
#include "distributed/multi_partitioning_utils.h"
|
|
#include "distributed/multi_physical_planner.h"
|
|
#include "distributed/multi_router_planner.h"
|
|
#include "distributed/multi_executor.h"
|
|
#include "distributed/listutils.h"
|
|
#include "distributed/locally_reserved_shared_connections.h"
|
|
#include "distributed/placement_connection.h"
|
|
#include "distributed/relation_access_tracking.h"
|
|
#include "distributed/remote_commands.h"
|
|
#include "distributed/remote_transaction.h"
|
|
#include "distributed/replication_origin_session_utils.h"
|
|
#include "distributed/resource_lock.h"
|
|
#include "distributed/shard_pruning.h"
|
|
#include "distributed/shared_connection_stats.h"
|
|
#include "distributed/version_compat.h"
|
|
#include "distributed/worker_protocol.h"
|
|
#include "distributed/local_multi_copy.h"
|
|
#include "distributed/hash_helpers.h"
|
|
#include "distributed/transmit.h"
|
|
#include "executor/executor.h"
|
|
#include "foreign/foreign.h"
|
|
|
|
#include "libpq/libpq.h"
|
|
#include "libpq/pqformat.h"
|
|
#include "nodes/makefuncs.h"
|
|
#include "nodes/nodeFuncs.h"
|
|
#include "parser/parse_func.h"
|
|
#include "parser/parse_type.h"
|
|
#include "tcop/cmdtag.h"
|
|
#include "tsearch/ts_locale.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/syscache.h"
|
|
#include "utils/memutils.h"
|
|
|
|
|
|
/* constant used in binary protocol */
|
|
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
|
|
|
|
/* if true, skip validation of JSONB columns during COPY */
|
|
bool SkipJsonbValidationInCopy = true;
|
|
|
|
/* custom Citus option for appending to a shard */
|
|
#define APPEND_TO_SHARD_OPTION "append_to_shard"
|
|
|
|
/*
|
|
* Data size threshold to switch over the active placement for a connection.
|
|
* If this is too low, overhead of starting COPY commands will hurt the
|
|
* performance. If this is too high, buffered data will use lots of memory.
|
|
* 4MB is a good balance between memory usage and performance. Note that this
|
|
* is irrelevant in the common case where we open one connection per placement.
|
|
*/
|
|
int CopySwitchOverThresholdBytes = 4 * 1024 * 1024;
|
|
|
|
#define FILE_IS_OPEN(x) (x > -1)
|
|
|
|
typedef struct CopyShardState CopyShardState;
|
|
typedef struct CopyPlacementState CopyPlacementState;
|
|
|
|
/*
|
|
* Multiple shard placements can share one connection. Each connection has one
|
|
* of those placements as the activePlacementState, and others in the
|
|
* bufferedPlacementList. When we want to send a tuple to a CopyPlacementState,
|
|
* we check if it is the active one in its connectionState, and in this case we
|
|
* directly put data on wire. Otherwise, we buffer it so we can put it on wire
|
|
* later, when copy ends or a switch-over happens. See CitusSendTupleToPlacements()
|
|
* for more details.
|
|
*
|
|
* This is done so we are compatible with adaptive_executor. If a previous command
|
|
* in the current transaction has been executed using adaptive_executor.c, then
|
|
* CopyGetPlacementConnection() might return the same connection for multiple
|
|
* placements. We support that case by the buffering mechanism described above.
|
|
*
|
|
* If no previous command in the current transaction has used adaptive_executor.c,
|
|
* then CopyGetPlacementConnection() returns one connection per placement and no
|
|
* buffering happens and we put the copy data directly on connection.
|
|
*/
|
|
typedef struct CopyConnectionState
|
|
{
|
|
/* Used as hash key. Equal to PQsocket(connection->pgConn). */
|
|
int socket;
|
|
|
|
MultiConnection *connection;
|
|
|
|
/*
|
|
* Placement for which we have an active COPY going on over connection.
|
|
* Can be NULL.
|
|
*/
|
|
CopyPlacementState *activePlacementState;
|
|
|
|
/*
|
|
* Other placements that we are buffering data for. Later when a switch-over
|
|
* happens, we remove an item from this list and set it to activePlacementState.
|
|
* In this case, old activePlacementState isn't NULL, is added to this list.
|
|
*/
|
|
dlist_head bufferedPlacementList;
|
|
|
|
/* length of bufferedPlacementList, to avoid iterations over the list when needed */
|
|
int bufferedPlacementCount;
|
|
} CopyConnectionState;
|
|
|
|
|
|
struct CopyPlacementState
|
|
{
|
|
/* Connection state to which the placemement is assigned to. */
|
|
CopyConnectionState *connectionState;
|
|
|
|
/* State of shard to which the placement belongs to. */
|
|
CopyShardState *shardState;
|
|
|
|
/* node group ID of the placement */
|
|
int32 groupId;
|
|
|
|
/*
|
|
* Buffered COPY data. When the placement is activePlacementState of
|
|
* some connection, this is empty. Because in that case we directly
|
|
* send the data over connection.
|
|
*/
|
|
StringInfo data;
|
|
|
|
/* List node for CopyConnectionState->bufferedPlacementList. */
|
|
dlist_node bufferedPlacementNode;
|
|
};
|
|
|
|
struct CopyShardState
|
|
{
|
|
/* Used as hash key. */
|
|
uint64 shardId;
|
|
|
|
/* used for doing local copy, either for a shard or a co-located file */
|
|
CopyOutState copyOutState;
|
|
|
|
/* used when copy is targeting co-located file */
|
|
FileCompat fileDest;
|
|
|
|
/* containsLocalPlacement is true if we have a local placement for the shard id of this state */
|
|
bool containsLocalPlacement;
|
|
|
|
/* List of CopyPlacementStates for all active placements of the shard. */
|
|
List *placementStateList;
|
|
};
|
|
|
|
|
|
/*
|
|
* Represents the state for allowing copy via local
|
|
* execution.
|
|
*/
|
|
typedef enum LocalCopyStatus
|
|
{
|
|
LOCAL_COPY_REQUIRED,
|
|
LOCAL_COPY_OPTIONAL,
|
|
LOCAL_COPY_DISABLED
|
|
} LocalCopyStatus;
|
|
|
|
|
|
/* Local functions forward declarations */
|
|
static void CopyToExistingShards(CopyStmt *copyStatement,
|
|
QueryCompletion *completionTag);
|
|
static bool IsCopyInBinaryFormat(CopyStmt *copyStatement);
|
|
static List * FindJsonbInputColumns(TupleDesc tupleDescriptor,
|
|
List *inputColumnNameList);
|
|
static List * RemoveOptionFromList(List *optionList, char *optionName);
|
|
static bool BinaryOutputFunctionDefined(Oid typeId);
|
|
static bool BinaryInputFunctionDefined(Oid typeId);
|
|
static void SendCopyBinaryHeaders(CopyOutState copyOutState, int64 shardId,
|
|
List *connectionList);
|
|
static void SendCopyBinaryFooters(CopyOutState copyOutState, int64 shardId,
|
|
List *connectionList);
|
|
static StringInfo ConstructCopyStatement(CopyStmt *copyStatement, int64 shardId);
|
|
static void SendCopyDataToAll(StringInfo dataBuffer, int64 shardId, List *connectionList);
|
|
static void SendCopyDataToPlacement(StringInfo dataBuffer, int64 shardId,
|
|
MultiConnection *connection);
|
|
static uint32 AvailableColumnCount(TupleDesc tupleDescriptor);
|
|
|
|
static Oid TypeForColumnName(Oid relationId, TupleDesc tupleDescriptor, char *columnName);
|
|
static Oid * TypeArrayFromTupleDescriptor(TupleDesc tupleDescriptor);
|
|
static CopyCoercionData * ColumnCoercionPaths(TupleDesc destTupleDescriptor,
|
|
TupleDesc inputTupleDescriptor,
|
|
Oid destRelId, List *columnNameList,
|
|
Oid *finalColumnTypeArray);
|
|
static FmgrInfo * TypeOutputFunctions(uint32 columnCount, Oid *typeIdArray,
|
|
bool binaryFormat);
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
static List * CopyGetAttnums(TupleDesc tupDesc, Relation rel, List *attnamelist);
|
|
#endif
|
|
static bool CopyStatementHasFormat(CopyStmt *copyStatement, char *formatName);
|
|
static void CitusCopyFrom(CopyStmt *copyStatement, QueryCompletion *completionTag);
|
|
static void EnsureCopyCanRunOnRelation(Oid relationId);
|
|
static HTAB * CreateConnectionStateHash(MemoryContext memoryContext);
|
|
static HTAB * CreateShardStateHash(MemoryContext memoryContext);
|
|
static CopyConnectionState * GetConnectionState(HTAB *connectionStateHash,
|
|
MultiConnection *connection);
|
|
static CopyShardState * GetShardState(uint64 shardId, HTAB *shardStateHash,
|
|
HTAB *connectionStateHash,
|
|
bool *found, bool shouldUseLocalCopy, CopyOutState
|
|
copyOutState, bool isColocatedIntermediateResult,
|
|
bool isPublishable);
|
|
static MultiConnection * CopyGetPlacementConnection(HTAB *connectionStateHash,
|
|
ShardPlacement *placement,
|
|
bool colocatedIntermediateResult);
|
|
static bool HasReachedAdaptiveExecutorPoolSize(List *connectionStateHash);
|
|
static MultiConnection * GetLeastUtilisedCopyConnection(List *connectionStateList,
|
|
char *nodeName, int nodePort);
|
|
static List * ConnectionStateList(HTAB *connectionStateHash);
|
|
static List * ConnectionStateListToNode(HTAB *connectionStateHash,
|
|
const char *hostname, int32 port);
|
|
static void InitializeCopyShardState(CopyShardState *shardState,
|
|
HTAB *connectionStateHash,
|
|
uint64 shardId,
|
|
bool canUseLocalCopy,
|
|
CopyOutState copyOutState,
|
|
bool colocatedIntermediateResult, bool
|
|
isPublishable);
|
|
static void StartPlacementStateCopyCommand(CopyPlacementState *placementState,
|
|
CopyStmt *copyStatement,
|
|
CopyOutState copyOutState);
|
|
static void EndPlacementStateCopyCommand(CopyPlacementState *placementState,
|
|
CopyOutState copyOutState);
|
|
static void UnclaimCopyConnections(List *connectionStateList);
|
|
static void ShutdownCopyConnectionState(CopyConnectionState *connectionState,
|
|
CitusCopyDestReceiver *copyDest);
|
|
static SelectStmt * CitusCopySelect(CopyStmt *copyStatement);
|
|
static void CitusCopyTo(CopyStmt *copyStatement, QueryCompletion *completionTag);
|
|
static int64 ForwardCopyDataFromConnection(CopyOutState copyOutState,
|
|
MultiConnection *connection);
|
|
|
|
/* Private functions copied and adapted from copy.c in PostgreSQL */
|
|
static void SendCopyBegin(CopyOutState cstate);
|
|
static void SendCopyEnd(CopyOutState cstate);
|
|
static void CopySendData(CopyOutState outputState, const void *databuf, int datasize);
|
|
static void CopySendString(CopyOutState outputState, const char *str);
|
|
static void CopySendChar(CopyOutState outputState, char c);
|
|
static void CopySendInt32(CopyOutState outputState, int32 val);
|
|
static void CopySendInt16(CopyOutState outputState, int16 val);
|
|
static void CopySendEndOfRow(CopyOutState cstate, bool includeEndOfLine);
|
|
static void CopyAttributeOutText(CopyOutState outputState, char *string);
|
|
static inline void CopyFlushOutput(CopyOutState outputState, char *start, char *pointer);
|
|
static bool CitusSendTupleToPlacements(TupleTableSlot *slot,
|
|
CitusCopyDestReceiver *copyDest);
|
|
static void AddPlacementStateToCopyConnectionStateBuffer(CopyConnectionState *
|
|
connectionState,
|
|
CopyPlacementState *
|
|
placementState);
|
|
static void RemovePlacementStateFromCopyConnectionStateBuffer(CopyConnectionState *
|
|
connectionState,
|
|
CopyPlacementState *
|
|
placementState);
|
|
static uint64 ProcessAppendToShardOption(Oid relationId, CopyStmt *copyStatement);
|
|
static uint64 ShardIdForTuple(CitusCopyDestReceiver *copyDest, Datum *columnValues,
|
|
bool *columnNulls);
|
|
|
|
/* CitusCopyDestReceiver functions */
|
|
static void CitusCopyDestReceiverStartup(DestReceiver *copyDest, int operation,
|
|
TupleDesc inputTupleDesc);
|
|
static bool CitusCopyDestReceiverReceive(TupleTableSlot *slot,
|
|
DestReceiver *copyDest);
|
|
static void CitusCopyDestReceiverShutdown(DestReceiver *destReceiver);
|
|
static void CitusCopyDestReceiverDestroy(DestReceiver *destReceiver);
|
|
static bool ContainsLocalPlacement(int64 shardId);
|
|
static void CompleteCopyQueryTagCompat(QueryCompletion *completionTag, uint64
|
|
processedRowCount);
|
|
static void FinishLocalCopy(CitusCopyDestReceiver *copyDest);
|
|
static void CreateLocalColocatedIntermediateFile(CitusCopyDestReceiver *copyDest,
|
|
CopyShardState *shardState);
|
|
static void FinishLocalColocatedIntermediateFiles(CitusCopyDestReceiver *copyDest);
|
|
static void CloneCopyOutStateForLocalCopy(CopyOutState from, CopyOutState to);
|
|
static LocalCopyStatus GetLocalCopyStatus(void);
|
|
static bool ShardIntervalListHasLocalPlacements(List *shardIntervalList);
|
|
static void LogLocalCopyToRelationExecution(uint64 shardId);
|
|
static void LogLocalCopyToFileExecution(uint64 shardId);
|
|
|
|
|
|
/* exports for SQL callable functions */
|
|
PG_FUNCTION_INFO_V1(citus_text_send_as_jsonb);
|
|
|
|
|
|
/*
|
|
* CitusCopyFrom implements the COPY table_name FROM. It dispacthes the copy
|
|
* statement to related subfunctions based on where the copy command is run
|
|
* and the partition method of the distributed table.
|
|
*/
|
|
static void
|
|
CitusCopyFrom(CopyStmt *copyStatement, QueryCompletion *completionTag)
|
|
{
|
|
UseCoordinatedTransaction();
|
|
|
|
/* disallow COPY to/from file or program except for superusers */
|
|
if (copyStatement->filename != NULL && !superuser())
|
|
{
|
|
if (copyStatement->is_program)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
|
errmsg("must be superuser to COPY to or from an external program"),
|
|
errhint("Anyone can COPY to stdout or from stdin. "
|
|
"psql's \\copy command also works for anyone.")));
|
|
}
|
|
else
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
|
errmsg("must be superuser to COPY to or from a file"),
|
|
errhint("Anyone can COPY to stdout or from stdin. "
|
|
"psql's \\copy command also works for anyone.")));
|
|
}
|
|
}
|
|
|
|
|
|
Oid relationId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
|
|
|
|
EnsureCopyCanRunOnRelation(relationId);
|
|
|
|
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);
|
|
|
|
/* disallow modifications to a partition table which have rep. factor > 1 */
|
|
EnsurePartitionTableNotReplicated(relationId);
|
|
|
|
if (IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) ||
|
|
IsCitusTableTypeCacheEntry(cacheEntry, RANGE_DISTRIBUTED) ||
|
|
IsCitusTableTypeCacheEntry(cacheEntry, APPEND_DISTRIBUTED) ||
|
|
!HasDistributionKeyCacheEntry(cacheEntry))
|
|
{
|
|
CopyToExistingShards(copyStatement, completionTag);
|
|
}
|
|
else
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("unsupported partition method")));
|
|
}
|
|
|
|
XactModificationLevel = XACT_MODIFICATION_DATA;
|
|
}
|
|
|
|
|
|
/*
|
|
* EnsureCopyCanRunOnRelation throws error is the database in read-only mode.
|
|
*/
|
|
static void
|
|
EnsureCopyCanRunOnRelation(Oid relationId)
|
|
{
|
|
/* first, do the regular check and give consistent errors with regular queries */
|
|
EnsureModificationsCanRunOnRelation(relationId);
|
|
|
|
/*
|
|
* We use 2PC for all COPY commands. It means that we cannot allow any COPY
|
|
* on replicas even if the user allows via WritableStandbyCoordinator GUC.
|
|
*/
|
|
if (RecoveryInProgress() && WritableStandbyCoordinator)
|
|
{
|
|
ereport(ERROR, (errmsg("COPY command to Citus tables is not allowed in "
|
|
"read-only mode"),
|
|
errhint("All COPY commands to citus tables happen via 2PC, "
|
|
"and 2PC requires the database to be in a writable state."),
|
|
errdetail("the database is read-only")));
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* CopyToExistingShards implements the COPY table_name FROM ... for hash or
|
|
* range-partitioned tables where there are already shards into which to copy
|
|
* rows.
|
|
*/
|
|
static void
|
|
CopyToExistingShards(CopyStmt *copyStatement, QueryCompletion *completionTag)
|
|
{
|
|
Oid tableId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
|
|
|
|
|
|
List *columnNameList = NIL;
|
|
int partitionColumnIndex = INVALID_PARTITION_COLUMN_INDEX;
|
|
|
|
bool isInputFormatBinary = IsCopyInBinaryFormat(copyStatement);
|
|
uint64 processedRowCount = 0;
|
|
|
|
ErrorContextCallback errorCallback;
|
|
|
|
/* allocate column values and nulls arrays */
|
|
Relation distributedRelation = table_open(tableId, RowExclusiveLock);
|
|
TupleDesc tupleDescriptor = RelationGetDescr(distributedRelation);
|
|
uint32 columnCount = tupleDescriptor->natts;
|
|
Datum *columnValues = palloc0(columnCount * sizeof(Datum));
|
|
bool *columnNulls = palloc0(columnCount * sizeof(bool));
|
|
|
|
/* set up a virtual tuple table slot */
|
|
TupleTableSlot *tupleTableSlot = MakeSingleTupleTableSlot(tupleDescriptor,
|
|
&TTSOpsVirtual);
|
|
tupleTableSlot->tts_nvalid = columnCount;
|
|
tupleTableSlot->tts_values = columnValues;
|
|
tupleTableSlot->tts_isnull = columnNulls;
|
|
|
|
/* determine the partition column index in the tuple descriptor */
|
|
Var *partitionColumn = PartitionColumn(tableId, 0);
|
|
if (partitionColumn != NULL)
|
|
{
|
|
partitionColumnIndex = partitionColumn->varattno - 1;
|
|
}
|
|
|
|
/* build the list of column names for remote COPY statements */
|
|
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
|
|
{
|
|
Form_pg_attribute currentColumn = TupleDescAttr(tupleDescriptor, columnIndex);
|
|
char *columnName = NameStr(currentColumn->attname);
|
|
|
|
if (currentColumn->attisdropped ||
|
|
currentColumn->attgenerated == ATTRIBUTE_GENERATED_STORED
|
|
)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
columnNameList = lappend(columnNameList, columnName);
|
|
}
|
|
|
|
EState *executorState = CreateExecutorState();
|
|
MemoryContext executorTupleContext = GetPerTupleMemoryContext(executorState);
|
|
ExprContext *executorExpressionContext = GetPerTupleExprContext(executorState);
|
|
|
|
/* set up the destination for the COPY */
|
|
const bool publishableData = true;
|
|
CitusCopyDestReceiver *copyDest = CreateCitusCopyDestReceiver(tableId, columnNameList,
|
|
partitionColumnIndex,
|
|
executorState, NULL,
|
|
publishableData);
|
|
|
|
/* if the user specified an explicit append-to_shard option, write to it */
|
|
uint64 appendShardId = ProcessAppendToShardOption(tableId, copyStatement);
|
|
if (appendShardId != INVALID_SHARD_ID)
|
|
{
|
|
copyDest->appendShardId = appendShardId;
|
|
}
|
|
|
|
DestReceiver *dest = (DestReceiver *) copyDest;
|
|
dest->rStartup(dest, 0, tupleDescriptor);
|
|
|
|
/*
|
|
* Below, we change a few fields in the Relation to control the behaviour
|
|
* of BeginCopyFrom. However, we obviously should not do this in relcache
|
|
* and therefore make a copy of the Relation.
|
|
*/
|
|
Relation copiedDistributedRelation = (Relation) palloc(sizeof(RelationData));
|
|
Form_pg_class copiedDistributedRelationTuple =
|
|
(Form_pg_class) palloc(CLASS_TUPLE_SIZE);
|
|
|
|
/*
|
|
* There is no need to deep copy everything. We will just deep copy of the fields
|
|
* we will change.
|
|
*/
|
|
*copiedDistributedRelation = *distributedRelation;
|
|
*copiedDistributedRelationTuple = *distributedRelation->rd_rel;
|
|
|
|
copiedDistributedRelation->rd_rel = copiedDistributedRelationTuple;
|
|
copiedDistributedRelation->rd_att = CreateTupleDescCopyConstr(tupleDescriptor);
|
|
|
|
/*
|
|
* BeginCopyFrom opens all partitions of given partitioned table with relation_open
|
|
* and it expects its caller to close those relations. We do not have direct access
|
|
* to opened relations, thus we are changing relkind of partitioned tables so that
|
|
* Postgres will treat those tables as regular relations and will not open its
|
|
* partitions.
|
|
*/
|
|
if (PartitionedTable(tableId))
|
|
{
|
|
copiedDistributedRelationTuple->relkind = RELKIND_RELATION;
|
|
}
|
|
|
|
/*
|
|
* We make an optimisation to skip JSON parsing for JSONB columns, because many
|
|
* Citus users have large objects in this column and parsing it on the coordinator
|
|
* causes significant CPU overhead. We do this by forcing BeginCopyFrom and
|
|
* NextCopyFrom to parse the column as text and then encoding it as JSON again
|
|
* by using citus_text_send_as_jsonb as the binary output function.
|
|
*
|
|
* The main downside of enabling this optimisation is that it defers validation
|
|
* until the object is parsed by the worker, which is unable to give an accurate
|
|
* line number.
|
|
*/
|
|
if (SkipJsonbValidationInCopy && !isInputFormatBinary)
|
|
{
|
|
CopyOutState copyOutState = copyDest->copyOutState;
|
|
ListCell *jsonbColumnIndexCell = NULL;
|
|
|
|
/* get the column indices for all JSONB columns that appear in the input */
|
|
List *jsonbColumnIndexList = FindJsonbInputColumns(
|
|
copiedDistributedRelation->rd_att,
|
|
copyStatement->attlist);
|
|
|
|
foreach(jsonbColumnIndexCell, jsonbColumnIndexList)
|
|
{
|
|
int jsonbColumnIndex = lfirst_int(jsonbColumnIndexCell);
|
|
Form_pg_attribute currentColumn =
|
|
TupleDescAttr(copiedDistributedRelation->rd_att, jsonbColumnIndex);
|
|
|
|
if (jsonbColumnIndex == partitionColumnIndex)
|
|
{
|
|
/*
|
|
* In the curious case of using a JSONB column as partition column,
|
|
* we leave it as is because we want to make sure the hashing works
|
|
* correctly.
|
|
*/
|
|
continue;
|
|
}
|
|
|
|
ereport(DEBUG1, (errmsg("parsing JSONB column %s as text",
|
|
NameStr(currentColumn->attname))));
|
|
|
|
/* parse the column as text instead of JSONB */
|
|
currentColumn->atttypid = TEXTOID;
|
|
|
|
if (copyOutState->binary)
|
|
{
|
|
Oid textSendAsJsonbFunctionId = CitusTextSendAsJsonbFunctionId();
|
|
|
|
/*
|
|
* If we're using binary encoding between coordinator and workers
|
|
* then we should honour the format expected by jsonb_recv, which
|
|
* is a version number followed by text. We therefore use an output
|
|
* function which sends the text as if it were jsonb, namely by
|
|
* prepending a version number.
|
|
*/
|
|
fmgr_info(textSendAsJsonbFunctionId,
|
|
©Dest->columnOutputFunctions[jsonbColumnIndex]);
|
|
}
|
|
else
|
|
{
|
|
Oid textoutFunctionId = TextOutFunctionId();
|
|
fmgr_info(textoutFunctionId,
|
|
©Dest->columnOutputFunctions[jsonbColumnIndex]);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* initialize copy state to read from COPY data source */
|
|
CopyFromState copyState = BeginCopyFrom_compat(NULL,
|
|
copiedDistributedRelation,
|
|
NULL,
|
|
copyStatement->filename,
|
|
copyStatement->is_program,
|
|
NULL,
|
|
copyStatement->attlist,
|
|
copyStatement->options);
|
|
|
|
/* set up callback to identify error line number */
|
|
errorCallback.callback = CopyFromErrorCallback;
|
|
errorCallback.arg = (void *) copyState;
|
|
errorCallback.previous = error_context_stack;
|
|
error_context_stack = &errorCallback;
|
|
|
|
while (true)
|
|
{
|
|
ResetPerTupleExprContext(executorState);
|
|
|
|
MemoryContext oldContext = MemoryContextSwitchTo(executorTupleContext);
|
|
|
|
/* parse a row from the input */
|
|
bool nextRowFound = NextCopyFrom(copyState, executorExpressionContext,
|
|
columnValues, columnNulls);
|
|
|
|
if (!nextRowFound)
|
|
{
|
|
MemoryContextSwitchTo(oldContext);
|
|
break;
|
|
}
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
|
|
|
dest->receiveSlot(tupleTableSlot, dest);
|
|
|
|
++processedRowCount;
|
|
|
|
#if PG_VERSION_NUM >= PG_VERSION_14
|
|
pgstat_progress_update_param(PROGRESS_COPY_TUPLES_PROCESSED, processedRowCount);
|
|
#endif
|
|
}
|
|
|
|
EndCopyFrom(copyState);
|
|
|
|
/* all lines have been copied, stop showing line number in errors */
|
|
error_context_stack = errorCallback.previous;
|
|
|
|
/* finish the COPY commands */
|
|
dest->rShutdown(dest);
|
|
dest->rDestroy(dest);
|
|
|
|
ExecDropSingleTupleTableSlot(tupleTableSlot);
|
|
FreeExecutorState(executorState);
|
|
table_close(distributedRelation, NoLock);
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
if (completionTag != NULL)
|
|
{
|
|
CompleteCopyQueryTagCompat(completionTag, processedRowCount);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* IsCopyInBinaryFormat determines whether the given COPY statement has the
|
|
* WITH (format binary) option.
|
|
*/
|
|
static bool
|
|
IsCopyInBinaryFormat(CopyStmt *copyStatement)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
DefElem *defel = lfirst_node(DefElem, optionCell);
|
|
if (strcmp(defel->defname, "format") == 0 &&
|
|
strcmp(defGetString(defel), "binary") == 0)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
/*
|
|
* FindJsonbInputColumns finds columns in the tuple descriptor that have
|
|
* the JSONB type and appear in inputColumnNameList. If the list is empty then
|
|
* all JSONB columns are returned.
|
|
*/
|
|
static List *
|
|
FindJsonbInputColumns(TupleDesc tupleDescriptor, List *inputColumnNameList)
|
|
{
|
|
List *jsonbColumnIndexList = NIL;
|
|
int columnCount = tupleDescriptor->natts;
|
|
|
|
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
|
|
{
|
|
Form_pg_attribute currentColumn = TupleDescAttr(tupleDescriptor, columnIndex);
|
|
if (currentColumn->attisdropped)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (currentColumn->atttypid != JSONBOID)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (inputColumnNameList != NIL)
|
|
{
|
|
ListCell *inputColumnCell = NULL;
|
|
bool isInputColumn = false;
|
|
|
|
foreach(inputColumnCell, inputColumnNameList)
|
|
{
|
|
char *inputColumnName = strVal(lfirst(inputColumnCell));
|
|
|
|
if (namestrcmp(¤tColumn->attname, inputColumnName) == 0)
|
|
{
|
|
isInputColumn = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!isInputColumn)
|
|
{
|
|
continue;
|
|
}
|
|
}
|
|
|
|
jsonbColumnIndexList = lappend_int(jsonbColumnIndexList, columnIndex);
|
|
}
|
|
|
|
return jsonbColumnIndexList;
|
|
}
|
|
|
|
|
|
static void
|
|
CompleteCopyQueryTagCompat(QueryCompletion *completionTag, uint64 processedRowCount)
|
|
{
|
|
SetQueryCompletion(completionTag, CMDTAG_COPY, processedRowCount);
|
|
}
|
|
|
|
|
|
/*
|
|
* RemoveOptionFromList removes an option from a list of options in a
|
|
* COPY .. WITH (..) statement by name and returns the resulting list.
|
|
*/
|
|
static List *
|
|
RemoveOptionFromList(List *optionList, char *optionName)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
foreach(optionCell, optionList)
|
|
{
|
|
DefElem *option = (DefElem *) lfirst(optionCell);
|
|
|
|
if (strncmp(option->defname, optionName, NAMEDATALEN) == 0)
|
|
{
|
|
return list_delete_cell(optionList, optionCell);
|
|
}
|
|
}
|
|
|
|
return optionList;
|
|
}
|
|
|
|
|
|
/*
|
|
* CanUseBinaryCopyFormat iterates over columns of the relation and looks for a
|
|
* column whose type is array of user-defined type or composite type. If it finds
|
|
* such column, that means we cannot use binary format for COPY, because binary
|
|
* format sends Oid of the types, which are generally not same in master and
|
|
* worker nodes for user-defined types. If the function can not detect a binary
|
|
* output function for any of the column, it returns false.
|
|
*/
|
|
bool
|
|
CanUseBinaryCopyFormat(TupleDesc tupleDescription)
|
|
{
|
|
bool useBinaryCopyFormat = true;
|
|
int totalColumnCount = tupleDescription->natts;
|
|
|
|
for (int columnIndex = 0; columnIndex < totalColumnCount; columnIndex++)
|
|
{
|
|
Form_pg_attribute currentColumn = TupleDescAttr(tupleDescription, columnIndex);
|
|
|
|
if (currentColumn->attisdropped ||
|
|
currentColumn->attgenerated == ATTRIBUTE_GENERATED_STORED
|
|
)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
Oid typeId = currentColumn->atttypid;
|
|
if (!CanUseBinaryCopyFormatForType(typeId))
|
|
{
|
|
useBinaryCopyFormat = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return useBinaryCopyFormat;
|
|
}
|
|
|
|
|
|
/*
|
|
* CanUseBinaryCopyFormatForTargetList returns true if we can use binary
|
|
* copy format for all columns of the given target list.
|
|
*/
|
|
bool
|
|
CanUseBinaryCopyFormatForTargetList(List *targetEntryList)
|
|
{
|
|
ListCell *targetEntryCell = NULL;
|
|
foreach(targetEntryCell, targetEntryList)
|
|
{
|
|
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
|
|
Node *targetExpr = (Node *) targetEntry->expr;
|
|
|
|
Oid columnType = exprType(targetExpr);
|
|
if (!CanUseBinaryCopyFormatForType(columnType))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
* CanUseBinaryCopyFormatForType determines whether it is safe to use the
|
|
* binary copy format for the given type. See contents of the function for
|
|
* details of when it's safe to use binary copy.
|
|
*/
|
|
bool
|
|
CanUseBinaryCopyFormatForType(Oid typeId)
|
|
{
|
|
if (!BinaryOutputFunctionDefined(typeId))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!BinaryInputFunctionDefined(typeId))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* A row type can contain any types, possibly types that don't have
|
|
* the binary input and output functions defined.
|
|
*/
|
|
if (type_is_rowtype(typeId))
|
|
{
|
|
/*
|
|
* TODO: Inspect the types inside the record and check if all of them
|
|
* can be binary encoded. If so, it's safe to use binary encoding.
|
|
*
|
|
* IMPORTANT: When implementing this todo keep the following in mind:
|
|
*
|
|
* In PG versions before PG14 the record_recv function would error out
|
|
* more than necessary.
|
|
*
|
|
* It errors out when any of the columns in the row have a type oid
|
|
* that doesn't match with the oid in the received data. This happens
|
|
* pretty much always for non built in types, because their oids differ
|
|
* between postgres intallations. So for those Postgres versions we
|
|
* would need a check like the following for each column:
|
|
*
|
|
* if (columnType >= FirstNormalObjectId) {
|
|
* return false
|
|
* }
|
|
*/
|
|
return false;
|
|
}
|
|
|
|
HeapTuple typeTup = typeidType(typeId);
|
|
Form_pg_type type = (Form_pg_type) GETSTRUCT(typeTup);
|
|
Oid elementType = type->typelem;
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
char typeCategory = type->typcategory;
|
|
#endif
|
|
ReleaseSysCache(typeTup);
|
|
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
|
|
/*
|
|
* In PG versions before PG14 the array_recv function would error out more
|
|
* than necessary.
|
|
*
|
|
* It errors out when the element type its oids don't match with the oid in
|
|
* the received data. This happens pretty much always for non built in
|
|
* types, because their oids differ between postgres intallations. So we
|
|
* skip binary encoding when the element type is a non built in type.
|
|
*/
|
|
if (typeCategory == TYPCATEGORY_ARRAY && elementType >= FirstNormalObjectId)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Any type that is a wrapper around an element type (e.g. arrays and
|
|
* ranges) require the element type to also has support for binary
|
|
* encoding.
|
|
*/
|
|
if (elementType != InvalidOid)
|
|
{
|
|
if (!CanUseBinaryCopyFormatForType(elementType))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* For domains, make sure that the underlying type can be binary copied.
|
|
*/
|
|
Oid baseTypeId = getBaseType(typeId);
|
|
if (typeId != baseTypeId)
|
|
{
|
|
if (!CanUseBinaryCopyFormatForType(baseTypeId))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
* BinaryOutputFunctionDefined checks whether binary output function is defined
|
|
* for the given type.
|
|
*/
|
|
static bool
|
|
BinaryOutputFunctionDefined(Oid typeId)
|
|
{
|
|
Oid typeFunctionId = InvalidOid;
|
|
Oid typeIoParam = InvalidOid;
|
|
int16 typeLength = 0;
|
|
bool typeByVal = false;
|
|
char typeAlign = 0;
|
|
char typeDelim = 0;
|
|
|
|
get_type_io_data(typeId, IOFunc_send, &typeLength, &typeByVal,
|
|
&typeAlign, &typeDelim, &typeIoParam, &typeFunctionId);
|
|
|
|
return OidIsValid(typeFunctionId);
|
|
}
|
|
|
|
|
|
/*
|
|
* BinaryInputFunctionDefined checks whether binary output function is defined
|
|
* for the given type.
|
|
*/
|
|
static bool
|
|
BinaryInputFunctionDefined(Oid typeId)
|
|
{
|
|
Oid typeFunctionId = InvalidOid;
|
|
Oid typeIoParam = InvalidOid;
|
|
int16 typeLength = 0;
|
|
bool typeByVal = false;
|
|
char typeAlign = 0;
|
|
char typeDelim = 0;
|
|
|
|
get_type_io_data(typeId, IOFunc_receive, &typeLength, &typeByVal,
|
|
&typeAlign, &typeDelim, &typeIoParam, &typeFunctionId);
|
|
|
|
return OidIsValid(typeFunctionId);
|
|
}
|
|
|
|
|
|
/* Send copy binary headers to given connections */
|
|
static void
|
|
SendCopyBinaryHeaders(CopyOutState copyOutState, int64 shardId, List *connectionList)
|
|
{
|
|
resetStringInfo(copyOutState->fe_msgbuf);
|
|
AppendCopyBinaryHeaders(copyOutState);
|
|
SendCopyDataToAll(copyOutState->fe_msgbuf, shardId, connectionList);
|
|
}
|
|
|
|
|
|
/* Send copy binary footers to given connections */
|
|
static void
|
|
SendCopyBinaryFooters(CopyOutState copyOutState, int64 shardId, List *connectionList)
|
|
{
|
|
resetStringInfo(copyOutState->fe_msgbuf);
|
|
AppendCopyBinaryFooters(copyOutState);
|
|
SendCopyDataToAll(copyOutState->fe_msgbuf, shardId, connectionList);
|
|
}
|
|
|
|
|
|
/*
|
|
* ConstructCopyStatement constructs the text of a COPY statement for a particular
|
|
* shard.
|
|
*/
|
|
static StringInfo
|
|
ConstructCopyStatement(CopyStmt *copyStatement, int64 shardId)
|
|
{
|
|
StringInfo command = makeStringInfo();
|
|
|
|
char *schemaName = copyStatement->relation->schemaname;
|
|
char *relationName = copyStatement->relation->relname;
|
|
|
|
char *shardName = pstrdup(relationName);
|
|
|
|
AppendShardIdToName(&shardName, shardId);
|
|
|
|
char *shardQualifiedName = quote_qualified_identifier(schemaName, shardName);
|
|
|
|
appendStringInfo(command, "COPY %s ", shardQualifiedName);
|
|
|
|
if (copyStatement->attlist != NIL)
|
|
{
|
|
ListCell *columnNameCell = NULL;
|
|
bool appendedFirstName = false;
|
|
|
|
foreach(columnNameCell, copyStatement->attlist)
|
|
{
|
|
char *columnName = strVal(lfirst(columnNameCell));
|
|
const char *quotedColumnName = quote_identifier(columnName);
|
|
|
|
if (!appendedFirstName)
|
|
{
|
|
appendStringInfo(command, "(%s", quotedColumnName);
|
|
appendedFirstName = true;
|
|
}
|
|
else
|
|
{
|
|
appendStringInfo(command, ", %s", quotedColumnName);
|
|
}
|
|
}
|
|
|
|
appendStringInfoString(command, ") ");
|
|
}
|
|
|
|
if (copyStatement->is_from)
|
|
{
|
|
appendStringInfoString(command, "FROM STDIN");
|
|
}
|
|
else
|
|
{
|
|
appendStringInfoString(command, "TO STDOUT");
|
|
}
|
|
|
|
if (copyStatement->options != NIL)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
|
|
appendStringInfoString(command, " WITH (");
|
|
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
DefElem *defel = (DefElem *) lfirst(optionCell);
|
|
|
|
if (optionCell != list_head(copyStatement->options))
|
|
{
|
|
appendStringInfoString(command, ", ");
|
|
}
|
|
|
|
appendStringInfo(command, "%s", defel->defname);
|
|
|
|
if (defel->arg == NULL)
|
|
{
|
|
/* option without value */
|
|
}
|
|
else if (IsA(defel->arg, String))
|
|
{
|
|
char *value = defGetString(defel);
|
|
|
|
/* make sure strings are quoted (may contain reserved characters) */
|
|
appendStringInfo(command, " %s", quote_literal_cstr(value));
|
|
}
|
|
else if (IsA(defel->arg, List))
|
|
{
|
|
List *nameList = defGetStringList(defel);
|
|
|
|
appendStringInfo(command, " (%s)", NameListToQuotedString(nameList));
|
|
}
|
|
else
|
|
{
|
|
char *value = defGetString(defel);
|
|
|
|
/* numeric options or * should not have quotes */
|
|
appendStringInfo(command, " %s", value);
|
|
}
|
|
}
|
|
|
|
appendStringInfoString(command, ")");
|
|
}
|
|
|
|
return command;
|
|
}
|
|
|
|
|
|
/*
|
|
* SendCopyDataToAll sends copy data to all connections in a list.
|
|
*/
|
|
static void
|
|
SendCopyDataToAll(StringInfo dataBuffer, int64 shardId, List *connectionList)
|
|
{
|
|
ListCell *connectionCell = NULL;
|
|
foreach(connectionCell, connectionList)
|
|
{
|
|
MultiConnection *connection = (MultiConnection *) lfirst(connectionCell);
|
|
SendCopyDataToPlacement(dataBuffer, shardId, connection);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* SendCopyDataToPlacement sends serialized COPY data to a specific shard placement
|
|
* over the given connection.
|
|
*/
|
|
static void
|
|
SendCopyDataToPlacement(StringInfo dataBuffer, int64 shardId, MultiConnection *connection)
|
|
{
|
|
if (!PutRemoteCopyData(connection, dataBuffer->data, dataBuffer->len))
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_IO_ERROR),
|
|
errmsg("failed to COPY to shard " INT64_FORMAT " on %s:%d",
|
|
shardId, connection->hostname, connection->port),
|
|
errdetail("failed to send %d bytes %s", dataBuffer->len,
|
|
dataBuffer->data)));
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* EndRemoteCopy ends the COPY input on all connections, and unclaims connections.
|
|
* This reports an error on failure.
|
|
*/
|
|
void
|
|
EndRemoteCopy(int64 shardId, List *connectionList)
|
|
{
|
|
ListCell *connectionCell = NULL;
|
|
|
|
foreach(connectionCell, connectionList)
|
|
{
|
|
MultiConnection *connection = (MultiConnection *) lfirst(connectionCell);
|
|
bool raiseInterrupts = true;
|
|
|
|
/* end the COPY input */
|
|
if (!PutRemoteCopyEnd(connection, NULL))
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_IO_ERROR),
|
|
errmsg("failed to COPY to shard " INT64_FORMAT " on %s:%d",
|
|
shardId, connection->hostname, connection->port)));
|
|
}
|
|
|
|
/* check whether there were any COPY errors */
|
|
PGresult *result = GetRemoteCommandResult(connection, raiseInterrupts);
|
|
if (PQresultStatus(result) != PGRES_COMMAND_OK)
|
|
{
|
|
ReportCopyError(connection, result);
|
|
}
|
|
|
|
PQclear(result);
|
|
ForgetResults(connection);
|
|
UnclaimConnection(connection);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* ReportCopyError tries to report a useful error message for the user from
|
|
* the remote COPY error messages.
|
|
*/
|
|
void
|
|
ReportCopyError(MultiConnection *connection, PGresult *result)
|
|
{
|
|
char *remoteMessage = PQresultErrorField(result, PG_DIAG_MESSAGE_PRIMARY);
|
|
|
|
if (remoteMessage != NULL)
|
|
{
|
|
/* probably a constraint violation, show remote message and detail */
|
|
char *remoteDetail = PQresultErrorField(result, PG_DIAG_MESSAGE_DETAIL);
|
|
bool haveDetail = remoteDetail != NULL;
|
|
|
|
ereport(ERROR, (errmsg("%s", remoteMessage),
|
|
haveDetail ? errdetail("%s", remoteDetail) :
|
|
0));
|
|
}
|
|
else
|
|
{
|
|
/* trim the trailing characters */
|
|
remoteMessage = pchomp(PQerrorMessage(connection->pgConn));
|
|
|
|
ereport(ERROR, (errcode(ERRCODE_IO_ERROR),
|
|
errmsg("failed to complete COPY on %s:%d", connection->hostname,
|
|
connection->port),
|
|
errdetail("%s", remoteMessage)));
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* ConversionPathForTypes fills *result with all the data necessary for converting
|
|
* Datums of type inputType to Datums of type destType.
|
|
*/
|
|
void
|
|
ConversionPathForTypes(Oid inputType, Oid destType, CopyCoercionData *result)
|
|
{
|
|
Oid coercionFuncId = InvalidOid;
|
|
CoercionPathType coercionType = COERCION_PATH_RELABELTYPE;
|
|
|
|
if (destType == inputType)
|
|
{
|
|
result->coercionType = COERCION_PATH_RELABELTYPE;
|
|
return;
|
|
}
|
|
|
|
coercionType = find_coercion_pathway(destType, inputType,
|
|
COERCION_EXPLICIT,
|
|
&coercionFuncId);
|
|
|
|
switch (coercionType)
|
|
{
|
|
case COERCION_PATH_NONE:
|
|
{
|
|
ereport(ERROR, (errmsg("cannot cast %d to %d", inputType, destType)));
|
|
return;
|
|
}
|
|
|
|
case COERCION_PATH_ARRAYCOERCE:
|
|
{
|
|
Oid inputBaseType = get_base_element_type(inputType);
|
|
Oid destBaseType = get_base_element_type(destType);
|
|
CoercionPathType baseCoercionType = COERCION_PATH_NONE;
|
|
|
|
if (inputBaseType != InvalidOid && destBaseType != InvalidOid)
|
|
{
|
|
baseCoercionType = find_coercion_pathway(inputBaseType, destBaseType,
|
|
COERCION_EXPLICIT,
|
|
&coercionFuncId);
|
|
}
|
|
|
|
if (baseCoercionType != COERCION_PATH_COERCEVIAIO)
|
|
{
|
|
ereport(ERROR, (errmsg("can not run query which uses an implicit coercion"
|
|
" between array types")));
|
|
}
|
|
}
|
|
|
|
/* fallthrough */
|
|
|
|
case COERCION_PATH_COERCEVIAIO:
|
|
{
|
|
result->coercionType = COERCION_PATH_COERCEVIAIO;
|
|
|
|
{
|
|
bool typisvarlena = false; /* ignored */
|
|
Oid iofunc = InvalidOid;
|
|
getTypeOutputInfo(inputType, &iofunc, &typisvarlena);
|
|
fmgr_info(iofunc, &(result->outputFunction));
|
|
}
|
|
|
|
{
|
|
Oid iofunc = InvalidOid;
|
|
getTypeInputInfo(destType, &iofunc, &(result->typioparam));
|
|
fmgr_info(iofunc, &(result->inputFunction));
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
case COERCION_PATH_FUNC:
|
|
{
|
|
result->coercionType = COERCION_PATH_FUNC;
|
|
fmgr_info(coercionFuncId, &(result->coerceFunction));
|
|
return;
|
|
}
|
|
|
|
case COERCION_PATH_RELABELTYPE:
|
|
{
|
|
result->coercionType = COERCION_PATH_RELABELTYPE;
|
|
return; /* the types are binary compatible, no need to call a function */
|
|
}
|
|
|
|
default:
|
|
Assert(false); /* there are no other options for this enum */
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* Returns the type of the provided column of the provided tuple. Throws an error if the
|
|
* column does not exist or is dropped.
|
|
*
|
|
* tupleDescriptor and relationId must refer to the same table.
|
|
*/
|
|
static Oid
|
|
TypeForColumnName(Oid relationId, TupleDesc tupleDescriptor, char *columnName)
|
|
{
|
|
AttrNumber destAttrNumber = get_attnum(relationId, columnName);
|
|
|
|
if (destAttrNumber == InvalidAttrNumber)
|
|
{
|
|
ereport(ERROR, (errmsg("invalid attr? %s", columnName)));
|
|
}
|
|
|
|
Form_pg_attribute attr = TupleDescAttr(tupleDescriptor, destAttrNumber - 1);
|
|
return attr->atttypid;
|
|
}
|
|
|
|
|
|
/*
|
|
* Walks a TupleDesc and returns an array of the types of each attribute.
|
|
* Returns InvalidOid in the place of dropped or generated attributes.
|
|
*/
|
|
static Oid *
|
|
TypeArrayFromTupleDescriptor(TupleDesc tupleDescriptor)
|
|
{
|
|
int columnCount = tupleDescriptor->natts;
|
|
Oid *typeArray = palloc0(columnCount * sizeof(Oid));
|
|
|
|
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
|
|
{
|
|
Form_pg_attribute attr = TupleDescAttr(tupleDescriptor, columnIndex);
|
|
if (attr->attisdropped ||
|
|
attr->attgenerated == ATTRIBUTE_GENERATED_STORED
|
|
)
|
|
{
|
|
typeArray[columnIndex] = InvalidOid;
|
|
}
|
|
else
|
|
{
|
|
typeArray[columnIndex] = attr->atttypid;
|
|
}
|
|
}
|
|
|
|
return typeArray;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnCoercionPaths scans the input and output tuples looking for mismatched types,
|
|
* it then returns an array of coercion functions to use on the input tuples, and an
|
|
* array of types which descript the output tuple
|
|
*/
|
|
static CopyCoercionData *
|
|
ColumnCoercionPaths(TupleDesc destTupleDescriptor, TupleDesc inputTupleDescriptor,
|
|
Oid destRelId, List *columnNameList,
|
|
Oid *finalColumnTypeArray)
|
|
{
|
|
int columnCount = inputTupleDescriptor->natts;
|
|
CopyCoercionData *coercePaths = palloc0(columnCount * sizeof(CopyCoercionData));
|
|
Oid *inputTupleTypes = TypeArrayFromTupleDescriptor(inputTupleDescriptor);
|
|
ListCell *currentColumnName = list_head(columnNameList);
|
|
|
|
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
|
|
{
|
|
Oid inputTupleType = inputTupleTypes[columnIndex];
|
|
char *columnName = lfirst(currentColumnName);
|
|
|
|
if (inputTupleType == InvalidOid)
|
|
{
|
|
/* TypeArrayFromTupleDescriptor decided to skip this column */
|
|
continue;
|
|
}
|
|
|
|
Oid destTupleType = TypeForColumnName(destRelId, destTupleDescriptor, columnName);
|
|
|
|
finalColumnTypeArray[columnIndex] = destTupleType;
|
|
|
|
ConversionPathForTypes(inputTupleType, destTupleType,
|
|
&coercePaths[columnIndex]);
|
|
|
|
currentColumnName = lnext(columnNameList, currentColumnName);
|
|
|
|
if (currentColumnName == NULL)
|
|
{
|
|
/* the rest of inputTupleDescriptor are dropped columns, return early! */
|
|
break;
|
|
}
|
|
}
|
|
|
|
return coercePaths;
|
|
}
|
|
|
|
|
|
/*
|
|
* TypeOutputFunctions takes an array of types and returns an array of output functions
|
|
* for those types.
|
|
*/
|
|
static FmgrInfo *
|
|
TypeOutputFunctions(uint32 columnCount, Oid *typeIdArray, bool binaryFormat)
|
|
{
|
|
FmgrInfo *columnOutputFunctions = palloc0(columnCount * sizeof(FmgrInfo));
|
|
|
|
for (uint32 columnIndex = 0; columnIndex < columnCount; columnIndex++)
|
|
{
|
|
FmgrInfo *currentOutputFunction = &columnOutputFunctions[columnIndex];
|
|
Oid columnTypeId = typeIdArray[columnIndex];
|
|
bool typeVariableLength = false;
|
|
Oid outputFunctionId = InvalidOid;
|
|
|
|
if (columnTypeId == InvalidOid)
|
|
{
|
|
/* TypeArrayFromTupleDescriptor decided to skip this column */
|
|
continue;
|
|
}
|
|
else if (binaryFormat)
|
|
{
|
|
getTypeBinaryOutputInfo(columnTypeId, &outputFunctionId, &typeVariableLength);
|
|
}
|
|
else
|
|
{
|
|
getTypeOutputInfo(columnTypeId, &outputFunctionId, &typeVariableLength);
|
|
}
|
|
|
|
fmgr_info(outputFunctionId, currentOutputFunction);
|
|
}
|
|
|
|
return columnOutputFunctions;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnOutputFunctions is a wrapper around TypeOutputFunctions, it takes a
|
|
* tupleDescriptor and returns an array of output functions, one for each column in
|
|
* the tuple.
|
|
*/
|
|
FmgrInfo *
|
|
ColumnOutputFunctions(TupleDesc rowDescriptor, bool binaryFormat)
|
|
{
|
|
uint32 columnCount = (uint32) rowDescriptor->natts;
|
|
Oid *columnTypes = TypeArrayFromTupleDescriptor(rowDescriptor);
|
|
FmgrInfo *outputFunctions =
|
|
TypeOutputFunctions(columnCount, columnTypes, binaryFormat);
|
|
|
|
return outputFunctions;
|
|
}
|
|
|
|
|
|
/*
|
|
* citus_text_send_as_jsonb sends a text as if it was a JSONB. This should only
|
|
* be used if the text is indeed valid JSON.
|
|
*/
|
|
Datum
|
|
citus_text_send_as_jsonb(PG_FUNCTION_ARGS)
|
|
{
|
|
text *inputText = PG_GETARG_TEXT_PP(0);
|
|
StringInfoData buf;
|
|
int version = 1;
|
|
|
|
pq_begintypsend(&buf);
|
|
pq_sendint(&buf, version, 1);
|
|
pq_sendtext(&buf, VARDATA_ANY(inputText), VARSIZE_ANY_EXHDR(inputText));
|
|
|
|
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
|
|
}
|
|
|
|
|
|
/*
|
|
* AppendCopyRowData serializes one row using the column output functions,
|
|
* and appends the data to the row output state object's message buffer.
|
|
* This function is modeled after the CopyOneRowTo() function in
|
|
* commands/copy.c, but only implements a subset of that functionality.
|
|
* Note that the caller of this function should reset row memory context
|
|
* to not bloat memory usage.
|
|
*/
|
|
void
|
|
AppendCopyRowData(Datum *valueArray, bool *isNullArray, TupleDesc rowDescriptor,
|
|
CopyOutState rowOutputState, FmgrInfo *columnOutputFunctions,
|
|
CopyCoercionData *columnCoercionPaths)
|
|
{
|
|
uint32 totalColumnCount = (uint32) rowDescriptor->natts;
|
|
uint32 availableColumnCount = AvailableColumnCount(rowDescriptor);
|
|
uint32 appendedColumnCount = 0;
|
|
|
|
MemoryContext oldContext = MemoryContextSwitchTo(rowOutputState->rowcontext);
|
|
|
|
if (rowOutputState->binary)
|
|
{
|
|
CopySendInt16(rowOutputState, availableColumnCount);
|
|
}
|
|
for (uint32 columnIndex = 0; columnIndex < totalColumnCount; columnIndex++)
|
|
{
|
|
Form_pg_attribute currentColumn = TupleDescAttr(rowDescriptor, columnIndex);
|
|
Datum value = valueArray[columnIndex];
|
|
bool isNull = isNullArray[columnIndex];
|
|
bool lastColumn = false;
|
|
|
|
if (!isNull && columnCoercionPaths != NULL)
|
|
{
|
|
value = CoerceColumnValue(value, &columnCoercionPaths[columnIndex]);
|
|
}
|
|
|
|
if (currentColumn->attisdropped ||
|
|
currentColumn->attgenerated == ATTRIBUTE_GENERATED_STORED
|
|
)
|
|
{
|
|
continue;
|
|
}
|
|
else if (rowOutputState->binary)
|
|
{
|
|
if (!isNull)
|
|
{
|
|
FmgrInfo *outputFunctionPointer = &columnOutputFunctions[columnIndex];
|
|
bytea *outputBytes = SendFunctionCall(outputFunctionPointer, value);
|
|
|
|
CopySendInt32(rowOutputState, VARSIZE(outputBytes) - VARHDRSZ);
|
|
CopySendData(rowOutputState, VARDATA(outputBytes),
|
|
VARSIZE(outputBytes) - VARHDRSZ);
|
|
}
|
|
else
|
|
{
|
|
CopySendInt32(rowOutputState, -1);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (!isNull)
|
|
{
|
|
FmgrInfo *outputFunctionPointer = &columnOutputFunctions[columnIndex];
|
|
char *columnText = OutputFunctionCall(outputFunctionPointer, value);
|
|
|
|
CopyAttributeOutText(rowOutputState, columnText);
|
|
}
|
|
else
|
|
{
|
|
CopySendString(rowOutputState, rowOutputState->null_print_client);
|
|
}
|
|
|
|
lastColumn = ((appendedColumnCount + 1) == availableColumnCount);
|
|
if (!lastColumn)
|
|
{
|
|
CopySendChar(rowOutputState, rowOutputState->delim[0]);
|
|
}
|
|
}
|
|
|
|
appendedColumnCount++;
|
|
}
|
|
|
|
if (!rowOutputState->binary)
|
|
{
|
|
/* append default line termination string depending on the platform */
|
|
#ifndef WIN32
|
|
CopySendChar(rowOutputState, '\n');
|
|
#else
|
|
CopySendString(rowOutputState, "\r\n");
|
|
#endif
|
|
}
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
|
}
|
|
|
|
|
|
/*
|
|
* CoerceColumnValue follows the instructions in *coercionPath and uses them to convert
|
|
* inputValue into a Datum of the correct type.
|
|
*/
|
|
Datum
|
|
CoerceColumnValue(Datum inputValue, CopyCoercionData *coercionPath)
|
|
{
|
|
switch (coercionPath->coercionType)
|
|
{
|
|
case 0:
|
|
{
|
|
return inputValue; /* this was a dropped column */
|
|
}
|
|
|
|
case COERCION_PATH_RELABELTYPE:
|
|
{
|
|
return inputValue; /* no need to do anything */
|
|
}
|
|
|
|
case COERCION_PATH_FUNC:
|
|
{
|
|
FmgrInfo *coerceFunction = &(coercionPath->coerceFunction);
|
|
Datum outputValue = FunctionCall1(coerceFunction, inputValue);
|
|
return outputValue;
|
|
}
|
|
|
|
case COERCION_PATH_COERCEVIAIO:
|
|
{
|
|
FmgrInfo *outFunction = &(coercionPath->outputFunction);
|
|
Datum textRepr = FunctionCall1(outFunction, inputValue);
|
|
|
|
FmgrInfo *inFunction = &(coercionPath->inputFunction);
|
|
Oid typioparam = coercionPath->typioparam;
|
|
Datum outputValue = FunctionCall3(inFunction, textRepr, typioparam,
|
|
Int32GetDatum(-1));
|
|
|
|
return outputValue;
|
|
}
|
|
|
|
default:
|
|
{
|
|
/* this should never happen */
|
|
ereport(ERROR, (errmsg("unsupported coercion type")));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* AvailableColumnCount returns the number of columns in a tuple descriptor, excluding
|
|
* columns that were dropped.
|
|
*/
|
|
static uint32
|
|
AvailableColumnCount(TupleDesc tupleDescriptor)
|
|
{
|
|
uint32 columnCount = 0;
|
|
|
|
for (uint32 columnIndex = 0; columnIndex < tupleDescriptor->natts; columnIndex++)
|
|
{
|
|
Form_pg_attribute currentColumn = TupleDescAttr(tupleDescriptor, columnIndex);
|
|
|
|
if (!currentColumn->attisdropped &&
|
|
currentColumn->attgenerated != ATTRIBUTE_GENERATED_STORED
|
|
)
|
|
{
|
|
columnCount++;
|
|
}
|
|
}
|
|
|
|
return columnCount;
|
|
}
|
|
|
|
|
|
/*
|
|
* AppendCopyBinaryHeaders appends binary headers to the copy buffer in
|
|
* headerOutputState.
|
|
*/
|
|
void
|
|
AppendCopyBinaryHeaders(CopyOutState headerOutputState)
|
|
{
|
|
const int32 zero = 0;
|
|
MemoryContext oldContext = MemoryContextSwitchTo(headerOutputState->rowcontext);
|
|
|
|
/* Signature */
|
|
CopySendData(headerOutputState, BinarySignature, 11);
|
|
|
|
/* Flags field (no OIDs) */
|
|
CopySendInt32(headerOutputState, zero);
|
|
|
|
/* No header extension */
|
|
CopySendInt32(headerOutputState, zero);
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
|
}
|
|
|
|
|
|
/*
|
|
* AppendCopyBinaryFooters appends binary footers to the copy buffer in
|
|
* footerOutputState.
|
|
*/
|
|
void
|
|
AppendCopyBinaryFooters(CopyOutState footerOutputState)
|
|
{
|
|
int16 negative = -1;
|
|
MemoryContext oldContext = MemoryContextSwitchTo(footerOutputState->rowcontext);
|
|
|
|
CopySendInt16(footerOutputState, negative);
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
|
}
|
|
|
|
|
|
/* *INDENT-OFF* */
|
|
|
|
|
|
/*
|
|
* Send copy start/stop messages for frontend copies. These have changed
|
|
* in past protocol redesigns.
|
|
*/
|
|
static void
|
|
SendCopyBegin(CopyOutState cstate)
|
|
{
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3) {
|
|
/* old way */
|
|
if (cstate->binary)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("COPY BINARY is not supported to stdout or from stdin")));
|
|
pq_putemptymessage('H');
|
|
/* grottiness needed for old COPY OUT protocol */
|
|
pq_startcopyout();
|
|
cstate->copy_dest = COPY_OLD_FE;
|
|
return;
|
|
}
|
|
#endif
|
|
StringInfoData buf;
|
|
int natts = list_length(cstate->attnumlist);
|
|
int16 format = (cstate->binary ? 1 : 0);
|
|
int i;
|
|
|
|
pq_beginmessage(&buf, 'H');
|
|
pq_sendbyte(&buf, format); /* overall format */
|
|
pq_sendint16(&buf, natts);
|
|
for (i = 0; i < natts; i++)
|
|
pq_sendint16(&buf, format); /* per-column formats */
|
|
pq_endmessage(&buf);
|
|
cstate->copy_dest = COPY_FRONTEND;
|
|
}
|
|
|
|
|
|
/* End a copy stream sent to the client */
|
|
static void
|
|
SendCopyEnd(CopyOutState cstate)
|
|
{
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
if (cstate->copy_dest != COPY_NEW_FE)
|
|
{
|
|
CopySendData(cstate, "\\.", 2);
|
|
/* Need to flush out the trailer (this also appends a newline) */
|
|
CopySendEndOfRow(cstate, true);
|
|
pq_endcopyout(false);
|
|
return;
|
|
}
|
|
#endif
|
|
/* Shouldn't have any unsent data */
|
|
Assert(cstate->fe_msgbuf->len == 0);
|
|
/* Send Copy Done message */
|
|
pq_putemptymessage('c');
|
|
}
|
|
|
|
|
|
/* Append data to the copy buffer in outputState */
|
|
static void
|
|
CopySendData(CopyOutState outputState, const void *databuf, int datasize)
|
|
{
|
|
appendBinaryStringInfo(outputState->fe_msgbuf, databuf, datasize);
|
|
}
|
|
|
|
|
|
/* Append a striong to the copy buffer in outputState. */
|
|
static void
|
|
CopySendString(CopyOutState outputState, const char *str)
|
|
{
|
|
appendBinaryStringInfo(outputState->fe_msgbuf, str, strlen(str));
|
|
}
|
|
|
|
|
|
/* Append a char to the copy buffer in outputState. */
|
|
static void
|
|
CopySendChar(CopyOutState outputState, char c)
|
|
{
|
|
appendStringInfoCharMacro(outputState->fe_msgbuf, c);
|
|
}
|
|
|
|
|
|
/* Append an int32 to the copy buffer in outputState. */
|
|
static void
|
|
CopySendInt32(CopyOutState outputState, int32 val)
|
|
{
|
|
uint32 buf = htonl((uint32) val);
|
|
CopySendData(outputState, &buf, sizeof(buf));
|
|
}
|
|
|
|
|
|
/* Append an int16 to the copy buffer in outputState. */
|
|
static void
|
|
CopySendInt16(CopyOutState outputState, int16 val)
|
|
{
|
|
uint16 buf = htons((uint16) val);
|
|
CopySendData(outputState, &buf, sizeof(buf));
|
|
}
|
|
|
|
|
|
/* Send the row to the appropriate destination */
|
|
static void
|
|
CopySendEndOfRow(CopyOutState cstate, bool includeEndOfLine)
|
|
{
|
|
StringInfo fe_msgbuf = cstate->fe_msgbuf;
|
|
|
|
switch (cstate->copy_dest)
|
|
{
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
case COPY_OLD_FE:
|
|
/* The FE/BE protocol uses \n as newline for all platforms */
|
|
if (!cstate->binary && includeEndOfLine)
|
|
CopySendChar(cstate, '\n');
|
|
|
|
if (pq_putbytes(fe_msgbuf->data, fe_msgbuf->len))
|
|
{
|
|
/* no hope of recovering connection sync, so FATAL */
|
|
ereport(FATAL,
|
|
(errcode(ERRCODE_CONNECTION_FAILURE),
|
|
errmsg("connection lost during COPY to stdout")));
|
|
}
|
|
break;
|
|
#endif
|
|
case COPY_FRONTEND:
|
|
/* The FE/BE protocol uses \n as newline for all platforms */
|
|
if (!cstate->binary && includeEndOfLine)
|
|
CopySendChar(cstate, '\n');
|
|
|
|
/* Dump the accumulated row as one CopyData message */
|
|
(void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len);
|
|
break;
|
|
case COPY_FILE:
|
|
case COPY_CALLBACK:
|
|
Assert(false); /* Not yet supported. */
|
|
break;
|
|
}
|
|
|
|
resetStringInfo(fe_msgbuf);
|
|
}
|
|
|
|
|
|
/*
|
|
* Send text representation of one column, with conversion and escaping.
|
|
*
|
|
* NB: This function is based on commands/copy.c and doesn't fully conform to
|
|
* our coding style. The function should be kept in sync with copy.c.
|
|
*/
|
|
static void
|
|
CopyAttributeOutText(CopyOutState cstate, char *string)
|
|
{
|
|
char *pointer = NULL;
|
|
char c = '\0';
|
|
char delimc = cstate->delim[0];
|
|
|
|
if (cstate->need_transcoding)
|
|
{
|
|
pointer = pg_server_to_any(string, strlen(string), cstate->file_encoding);
|
|
}
|
|
else
|
|
{
|
|
pointer = string;
|
|
}
|
|
|
|
/*
|
|
* We have to grovel through the string searching for control characters
|
|
* and instances of the delimiter character. In most cases, though, these
|
|
* are infrequent. To avoid overhead from calling CopySendData once per
|
|
* character, we dump out all characters between escaped characters in a
|
|
* single call. The loop invariant is that the data from "start" to "pointer"
|
|
* can be sent literally, but hasn't yet been.
|
|
*
|
|
* As all encodings here are safe, i.e. backend supported ones, we can
|
|
* skip doing pg_encoding_mblen(), because in valid backend encodings,
|
|
* extra bytes of a multibyte character never look like ASCII.
|
|
*/
|
|
char *start = pointer;
|
|
while ((c = *pointer) != '\0')
|
|
{
|
|
if ((unsigned char) c < (unsigned char) 0x20)
|
|
{
|
|
/*
|
|
* \r and \n must be escaped, the others are traditional. We
|
|
* prefer to dump these using the C-like notation, rather than
|
|
* a backslash and the literal character, because it makes the
|
|
* dump file a bit more proof against Microsoftish data
|
|
* mangling.
|
|
*/
|
|
switch (c)
|
|
{
|
|
case '\b':
|
|
c = 'b';
|
|
break;
|
|
case '\f':
|
|
c = 'f';
|
|
break;
|
|
case '\n':
|
|
c = 'n';
|
|
break;
|
|
case '\r':
|
|
c = 'r';
|
|
break;
|
|
case '\t':
|
|
c = 't';
|
|
break;
|
|
case '\v':
|
|
c = 'v';
|
|
break;
|
|
default:
|
|
/* If it's the delimiter, must backslash it */
|
|
if (c == delimc)
|
|
break;
|
|
/* All ASCII control chars are length 1 */
|
|
pointer++;
|
|
continue; /* fall to end of loop */
|
|
}
|
|
/* if we get here, we need to convert the control char */
|
|
CopyFlushOutput(cstate, start, pointer);
|
|
CopySendChar(cstate, '\\');
|
|
CopySendChar(cstate, c);
|
|
start = ++pointer; /* do not include char in next run */
|
|
}
|
|
else if (c == '\\' || c == delimc)
|
|
{
|
|
CopyFlushOutput(cstate, start, pointer);
|
|
CopySendChar(cstate, '\\');
|
|
start = pointer++; /* we include char in next run */
|
|
}
|
|
else
|
|
{
|
|
pointer++;
|
|
}
|
|
}
|
|
|
|
CopyFlushOutput(cstate, start, pointer);
|
|
}
|
|
|
|
|
|
/* *INDENT-ON* */
|
|
/* Helper function to send pending copy output */
|
|
static inline void
|
|
CopyFlushOutput(CopyOutState cstate, char *start, char *pointer)
|
|
{
|
|
if (pointer > start)
|
|
{
|
|
CopySendData(cstate, start, pointer - start);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* CreateCitusCopyDestReceiver creates a DestReceiver that copies into
|
|
* a distributed table.
|
|
*
|
|
* The caller should provide the list of column names to use in the
|
|
* remote COPY statement, and the partition column index in the tuple
|
|
* descriptor (*not* the column name list).
|
|
*
|
|
* If intermediateResultIdPrefix is not NULL, the COPY will go into a set
|
|
* of intermediate results that are co-located with the actual table.
|
|
* The names of the intermediate results with be of the form:
|
|
* intermediateResultIdPrefix_<shardid>
|
|
*/
|
|
CitusCopyDestReceiver *
|
|
CreateCitusCopyDestReceiver(Oid tableId, List *columnNameList, int partitionColumnIndex,
|
|
EState *executorState,
|
|
char *intermediateResultIdPrefix, bool isPublishable)
|
|
{
|
|
CitusCopyDestReceiver *copyDest = (CitusCopyDestReceiver *) palloc0(
|
|
sizeof(CitusCopyDestReceiver));
|
|
|
|
/* set up the DestReceiver function pointers */
|
|
copyDest->pub.receiveSlot = CitusCopyDestReceiverReceive;
|
|
copyDest->pub.rStartup = CitusCopyDestReceiverStartup;
|
|
copyDest->pub.rShutdown = CitusCopyDestReceiverShutdown;
|
|
copyDest->pub.rDestroy = CitusCopyDestReceiverDestroy;
|
|
copyDest->pub.mydest = DestCopyOut;
|
|
|
|
/* set up output parameters */
|
|
copyDest->distributedRelationId = tableId;
|
|
copyDest->columnNameList = columnNameList;
|
|
copyDest->partitionColumnIndex = partitionColumnIndex;
|
|
copyDest->executorState = executorState;
|
|
copyDest->colocatedIntermediateResultIdPrefix = intermediateResultIdPrefix;
|
|
copyDest->memoryContext = CurrentMemoryContext;
|
|
copyDest->isPublishable = isPublishable;
|
|
|
|
return copyDest;
|
|
}
|
|
|
|
|
|
/*
|
|
* GetLocalCopyStatus returns the status for executing copy locally.
|
|
* If LOCAL_COPY_DISABLED or LOCAL_COPY_REQUIRED, the caller has to
|
|
* follow that. Else, the caller may decide to use local or remote
|
|
* execution depending on other information.
|
|
*/
|
|
static LocalCopyStatus
|
|
GetLocalCopyStatus(void)
|
|
{
|
|
if (!EnableLocalExecution ||
|
|
GetCurrentLocalExecutionStatus() == LOCAL_EXECUTION_DISABLED)
|
|
{
|
|
return LOCAL_COPY_DISABLED;
|
|
}
|
|
else if (GetCurrentLocalExecutionStatus() == LOCAL_EXECUTION_REQUIRED)
|
|
{
|
|
/*
|
|
* For various reasons, including the transaction visibility
|
|
* rules (e.g., read-your-own-writes), we have to use local
|
|
* execution again if it has already happened within this
|
|
* transaction block.
|
|
*
|
|
* We might error out later in the execution if it is not suitable
|
|
* to execute the tasks locally.
|
|
*/
|
|
Assert(IsMultiStatementTransaction() || InCoordinatedTransaction());
|
|
|
|
/*
|
|
* TODO: A future improvement could be to keep track of which placements
|
|
* have been locally executed. At this point, only use local execution for
|
|
* those placements. That'd help to benefit more from parallelism.
|
|
*/
|
|
|
|
return LOCAL_COPY_REQUIRED;
|
|
}
|
|
else if (IsMultiStatementTransaction())
|
|
{
|
|
return LOCAL_COPY_REQUIRED;
|
|
}
|
|
|
|
return LOCAL_COPY_OPTIONAL;
|
|
}
|
|
|
|
|
|
/*
|
|
* ShardIntervalListHasLocalPlacements returns true if any of the input
|
|
* shard placement has a local placement;
|
|
*/
|
|
static bool
|
|
ShardIntervalListHasLocalPlacements(List *shardIntervalList)
|
|
{
|
|
int32 localGroupId = GetLocalGroupId();
|
|
ShardInterval *shardInterval = NULL;
|
|
foreach_ptr(shardInterval, shardIntervalList)
|
|
{
|
|
if (ActiveShardPlacementOnGroup(localGroupId, shardInterval->shardId) != NULL)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusCopyDestReceiverStartup implements the rStartup interface of
|
|
* CitusCopyDestReceiver. It opens the relation, acquires necessary
|
|
* locks, and initializes the state required for doing the copy.
|
|
*/
|
|
static void
|
|
CitusCopyDestReceiverStartup(DestReceiver *dest, int operation,
|
|
TupleDesc inputTupleDescriptor)
|
|
{
|
|
CitusCopyDestReceiver *copyDest = (CitusCopyDestReceiver *) dest;
|
|
|
|
Oid tableId = copyDest->distributedRelationId;
|
|
|
|
char *relationName = get_rel_name(tableId);
|
|
Oid schemaOid = get_rel_namespace(tableId);
|
|
char *schemaName = get_namespace_name(schemaOid);
|
|
|
|
List *columnNameList = copyDest->columnNameList;
|
|
List *attributeList = NIL;
|
|
|
|
ListCell *columnNameCell = NULL;
|
|
|
|
const char *delimiterCharacter = "\t";
|
|
const char *nullPrintCharacter = "\\N";
|
|
|
|
/* look up table properties */
|
|
Relation distributedRelation = table_open(tableId, RowExclusiveLock);
|
|
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(tableId);
|
|
|
|
copyDest->distributedRelation = distributedRelation;
|
|
copyDest->tupleDescriptor = inputTupleDescriptor;
|
|
|
|
/* load the list of shards and verify that we have shards to copy into */
|
|
List *shardIntervalList = LoadShardIntervalList(tableId);
|
|
if (shardIntervalList == NIL)
|
|
{
|
|
if (IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED))
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("could not find any shards into which to copy"),
|
|
errdetail("No shards exist for distributed table \"%s\".",
|
|
relationName),
|
|
errhint("Run master_create_worker_shards to create shards "
|
|
"and try again.")));
|
|
}
|
|
else
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("could not find any shards into which to copy"),
|
|
errdetail("No shards exist for distributed table \"%s\".",
|
|
relationName)));
|
|
}
|
|
}
|
|
|
|
/* error if any shard missing min/max values */
|
|
if (cacheEntry->hasUninitializedShardInterval)
|
|
{
|
|
if (IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) ||
|
|
IsCitusTableTypeCacheEntry(cacheEntry, RANGE_DISTRIBUTED))
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("could not start copy"),
|
|
errdetail("Distributed relation \"%s\" has shards "
|
|
"with missing shardminvalue/shardmaxvalue.",
|
|
relationName)));
|
|
}
|
|
}
|
|
|
|
/* prevent concurrent placement changes and non-commutative DML statements */
|
|
LockShardListMetadata(shardIntervalList, ShareLock);
|
|
|
|
/*
|
|
* Prevent concurrent UPDATE/DELETE on replication factor >1
|
|
* (see AcquireExecutorMultiShardLocks() at multi_router_executor.c)
|
|
*/
|
|
SerializeNonCommutativeWrites(shardIntervalList, RowExclusiveLock);
|
|
|
|
UseCoordinatedTransaction();
|
|
|
|
/* all modifications use 2PC */
|
|
Use2PCForCoordinatedTransaction();
|
|
|
|
/* define how tuples will be serialised */
|
|
CopyOutState copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData));
|
|
copyOutState->delim = (char *) delimiterCharacter;
|
|
copyOutState->null_print = (char *) nullPrintCharacter;
|
|
copyOutState->null_print_client = (char *) nullPrintCharacter;
|
|
copyOutState->binary = CanUseBinaryCopyFormat(inputTupleDescriptor);
|
|
copyOutState->fe_msgbuf = makeStringInfo();
|
|
copyOutState->rowcontext = GetPerTupleMemoryContext(copyDest->executorState);
|
|
copyDest->copyOutState = copyOutState;
|
|
copyDest->multiShardCopy = false;
|
|
|
|
/* prepare functions to call on received tuples */
|
|
{
|
|
TupleDesc destTupleDescriptor = distributedRelation->rd_att;
|
|
int columnCount = inputTupleDescriptor->natts;
|
|
Oid *finalTypeArray = palloc0(columnCount * sizeof(Oid));
|
|
|
|
copyDest->columnCoercionPaths =
|
|
ColumnCoercionPaths(destTupleDescriptor, inputTupleDescriptor,
|
|
tableId, columnNameList, finalTypeArray);
|
|
|
|
copyDest->columnOutputFunctions =
|
|
TypeOutputFunctions(columnCount, finalTypeArray, copyOutState->binary);
|
|
}
|
|
|
|
/* wrap the column names as Values */
|
|
foreach(columnNameCell, columnNameList)
|
|
{
|
|
char *columnName = (char *) lfirst(columnNameCell);
|
|
String *columnNameValue = makeString(columnName);
|
|
|
|
attributeList = lappend(attributeList, columnNameValue);
|
|
}
|
|
|
|
if (IsCitusTableTypeCacheEntry(cacheEntry, DISTRIBUTED_TABLE) &&
|
|
!IsCitusTableTypeCacheEntry(cacheEntry, SINGLE_SHARD_DISTRIBUTED) &&
|
|
copyDest->partitionColumnIndex == INVALID_PARTITION_COLUMN_INDEX)
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
|
|
errmsg("the partition column of table %s should have a value",
|
|
quote_qualified_identifier(schemaName, relationName))));
|
|
}
|
|
|
|
/* define the template for the COPY statement that is sent to workers */
|
|
CopyStmt *copyStatement = makeNode(CopyStmt);
|
|
|
|
bool colocatedIntermediateResults =
|
|
copyDest->colocatedIntermediateResultIdPrefix != NULL;
|
|
if (colocatedIntermediateResults)
|
|
{
|
|
copyStatement->relation = makeRangeVar(NULL,
|
|
copyDest->
|
|
colocatedIntermediateResultIdPrefix,
|
|
-1);
|
|
|
|
DefElem *formatResultOption = makeDefElem("format", (Node *) makeString("result"),
|
|
-1);
|
|
copyStatement->options = list_make1(formatResultOption);
|
|
}
|
|
else
|
|
{
|
|
copyStatement->relation = makeRangeVar(schemaName, relationName, -1);
|
|
copyStatement->options = NIL;
|
|
|
|
if (copyOutState->binary)
|
|
{
|
|
DefElem *binaryFormatOption =
|
|
makeDefElem("format", (Node *) makeString("binary"), -1);
|
|
|
|
copyStatement->options = lappend(copyStatement->options, binaryFormatOption);
|
|
}
|
|
}
|
|
|
|
|
|
copyStatement->query = NULL;
|
|
copyStatement->attlist = attributeList;
|
|
copyStatement->is_from = true;
|
|
copyStatement->is_program = false;
|
|
copyStatement->filename = NULL;
|
|
copyDest->copyStatement = copyStatement;
|
|
|
|
copyDest->shardStateHash = CreateShardStateHash(TopTransactionContext);
|
|
copyDest->connectionStateHash = CreateConnectionStateHash(TopTransactionContext);
|
|
|
|
RecordRelationAccessIfNonDistTable(tableId, PLACEMENT_ACCESS_DML);
|
|
|
|
/*
|
|
* Colocated intermediate results do not honor citus.max_shared_pool_size,
|
|
* so we don't need to reserve any connections. Each result file is sent
|
|
* over a single connection.
|
|
*/
|
|
if (!colocatedIntermediateResults)
|
|
{
|
|
/*
|
|
* For all the primary (e.g., writable) remote nodes, reserve a shared
|
|
* connection. We do this upfront because we cannot know which nodes
|
|
* are going to be accessed. Since the order of the reservation is
|
|
* important, we need to do it right here. For the details on why the
|
|
* order important, see EnsureConnectionPossibilityForNodeList().
|
|
*
|
|
* We don't need to care about local node because we either get a
|
|
* connection or use local connection, so it cannot be part of
|
|
* the starvation. As an edge case, if it cannot get a connection
|
|
* and cannot switch to local execution (e.g., disabled by user),
|
|
* COPY would fail hinting the user to change the relevant settiing.
|
|
*/
|
|
EnsureConnectionPossibilityForRemotePrimaryNodes();
|
|
}
|
|
|
|
LocalCopyStatus localCopyStatus = GetLocalCopyStatus();
|
|
if (localCopyStatus == LOCAL_COPY_DISABLED)
|
|
{
|
|
copyDest->shouldUseLocalCopy = false;
|
|
}
|
|
else if (localCopyStatus == LOCAL_COPY_REQUIRED)
|
|
{
|
|
copyDest->shouldUseLocalCopy = true;
|
|
}
|
|
else if (localCopyStatus == LOCAL_COPY_OPTIONAL)
|
|
{
|
|
/*
|
|
* At this point, there is no requirements for doing the copy locally.
|
|
* However, if there are local placements, we can try to reserve
|
|
* a connection to local node. If we cannot reserve, we can still use
|
|
* local execution.
|
|
*
|
|
* NB: It is not advantageous to use remote execution just with a
|
|
* single remote connection. In other words, a single remote connection
|
|
* would not perform better than local execution. However, we prefer to
|
|
* do this because it is likely that the COPY would get more connections
|
|
* to parallelize the operation. In the future, we might relax this
|
|
* requirement and failover to local execution as on connection attempt
|
|
* failures as the executor does.
|
|
*/
|
|
if (ShardIntervalListHasLocalPlacements(shardIntervalList))
|
|
{
|
|
bool reservedConnection = TryConnectionPossibilityForLocalPrimaryNode();
|
|
copyDest->shouldUseLocalCopy = !reservedConnection;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusCopyDestReceiverReceive implements the receiveSlot function of
|
|
* CitusCopyDestReceiver. It takes a TupleTableSlot and sends the contents to
|
|
* the appropriate shard placement(s).
|
|
*/
|
|
static bool
|
|
CitusCopyDestReceiverReceive(TupleTableSlot *slot, DestReceiver *dest)
|
|
{
|
|
bool result = false;
|
|
CitusCopyDestReceiver *copyDest = (CitusCopyDestReceiver *) dest;
|
|
|
|
PG_TRY();
|
|
{
|
|
result = CitusSendTupleToPlacements(slot, copyDest);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
/*
|
|
* We might be able to recover from errors with ROLLBACK TO SAVEPOINT,
|
|
* so unclaim the connections before throwing errors.
|
|
*/
|
|
List *connectionStateList = ConnectionStateList(copyDest->connectionStateHash);
|
|
UnclaimCopyConnections(connectionStateList);
|
|
|
|
PG_RE_THROW();
|
|
}
|
|
PG_END_TRY();
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusSendTupleToPlacements sends the given TupleTableSlot to the appropriate
|
|
* shard placement(s).
|
|
*/
|
|
static bool
|
|
CitusSendTupleToPlacements(TupleTableSlot *slot, CitusCopyDestReceiver *copyDest)
|
|
{
|
|
TupleDesc tupleDescriptor = copyDest->tupleDescriptor;
|
|
CopyStmt *copyStatement = copyDest->copyStatement;
|
|
|
|
CopyOutState copyOutState = copyDest->copyOutState;
|
|
FmgrInfo *columnOutputFunctions = copyDest->columnOutputFunctions;
|
|
CopyCoercionData *columnCoercionPaths = copyDest->columnCoercionPaths;
|
|
ListCell *placementStateCell = NULL;
|
|
bool cachedShardStateFound = false;
|
|
bool firstTupleInShard = false;
|
|
|
|
|
|
EState *executorState = copyDest->executorState;
|
|
MemoryContext executorTupleContext = GetPerTupleMemoryContext(executorState);
|
|
MemoryContext oldContext = MemoryContextSwitchTo(executorTupleContext);
|
|
|
|
slot_getallattrs(slot);
|
|
|
|
Datum *columnValues = slot->tts_values;
|
|
bool *columnNulls = slot->tts_isnull;
|
|
|
|
int64 shardId = ShardIdForTuple(copyDest, columnValues, columnNulls);
|
|
|
|
/* connections hash is kept in memory context */
|
|
MemoryContextSwitchTo(copyDest->memoryContext);
|
|
bool isColocatedIntermediateResult =
|
|
copyDest->colocatedIntermediateResultIdPrefix != NULL;
|
|
|
|
CopyShardState *shardState = GetShardState(shardId, copyDest->shardStateHash,
|
|
copyDest->connectionStateHash,
|
|
&cachedShardStateFound,
|
|
copyDest->shouldUseLocalCopy,
|
|
copyDest->copyOutState,
|
|
isColocatedIntermediateResult,
|
|
copyDest->isPublishable);
|
|
|
|
if (!cachedShardStateFound)
|
|
{
|
|
firstTupleInShard = true;
|
|
}
|
|
|
|
if (firstTupleInShard && !copyDest->multiShardCopy &&
|
|
hash_get_num_entries(copyDest->shardStateHash) == 2)
|
|
{
|
|
Oid relationId = copyDest->distributedRelationId;
|
|
|
|
/* mark as multi shard to skip doing the same thing over and over */
|
|
copyDest->multiShardCopy = true;
|
|
|
|
if (MultiShardConnectionType != SEQUENTIAL_CONNECTION)
|
|
{
|
|
/* when we see multiple shard connections, we mark COPY as parallel modify */
|
|
RecordParallelModifyAccess(relationId);
|
|
}
|
|
}
|
|
|
|
if (isColocatedIntermediateResult && copyDest->shouldUseLocalCopy &&
|
|
shardState->containsLocalPlacement)
|
|
{
|
|
if (firstTupleInShard)
|
|
{
|
|
CreateLocalColocatedIntermediateFile(copyDest, shardState);
|
|
}
|
|
|
|
WriteTupleToLocalFile(slot, copyDest, shardId,
|
|
shardState->copyOutState, &shardState->fileDest);
|
|
}
|
|
else if (copyDest->shouldUseLocalCopy && shardState->containsLocalPlacement)
|
|
{
|
|
WriteTupleToLocalShard(slot, copyDest, shardId, shardState->copyOutState);
|
|
}
|
|
|
|
foreach(placementStateCell, shardState->placementStateList)
|
|
{
|
|
CopyPlacementState *currentPlacementState = lfirst(placementStateCell);
|
|
CopyConnectionState *connectionState = currentPlacementState->connectionState;
|
|
CopyPlacementState *activePlacementState = connectionState->activePlacementState;
|
|
bool switchToCurrentPlacement = false;
|
|
bool sendTupleOverConnection = false;
|
|
|
|
if (activePlacementState == NULL)
|
|
{
|
|
switchToCurrentPlacement = true;
|
|
}
|
|
else if (currentPlacementState != activePlacementState &&
|
|
currentPlacementState->data->len > CopySwitchOverThresholdBytes)
|
|
{
|
|
switchToCurrentPlacement = true;
|
|
|
|
/* before switching, make sure to finish the copy */
|
|
EndPlacementStateCopyCommand(activePlacementState, copyOutState);
|
|
AddPlacementStateToCopyConnectionStateBuffer(connectionState,
|
|
activePlacementState);
|
|
}
|
|
|
|
if (switchToCurrentPlacement)
|
|
{
|
|
StartPlacementStateCopyCommand(currentPlacementState, copyStatement,
|
|
copyOutState);
|
|
|
|
RemovePlacementStateFromCopyConnectionStateBuffer(connectionState,
|
|
currentPlacementState);
|
|
|
|
connectionState->activePlacementState = currentPlacementState;
|
|
|
|
/* send previously buffered tuples */
|
|
SendCopyDataToPlacement(currentPlacementState->data, shardId,
|
|
connectionState->connection);
|
|
resetStringInfo(currentPlacementState->data);
|
|
|
|
/* additionaly, we need to send the current tuple too */
|
|
sendTupleOverConnection = true;
|
|
}
|
|
else if (currentPlacementState != activePlacementState)
|
|
{
|
|
/* buffer data */
|
|
StringInfo copyBuffer = copyOutState->fe_msgbuf;
|
|
resetStringInfo(copyBuffer);
|
|
AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
|
|
copyOutState, columnOutputFunctions,
|
|
columnCoercionPaths);
|
|
appendBinaryStringInfo(currentPlacementState->data, copyBuffer->data,
|
|
copyBuffer->len);
|
|
}
|
|
else
|
|
{
|
|
Assert(currentPlacementState == activePlacementState);
|
|
sendTupleOverConnection = true;
|
|
}
|
|
|
|
if (sendTupleOverConnection)
|
|
{
|
|
resetStringInfo(copyOutState->fe_msgbuf);
|
|
AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
|
|
copyOutState, columnOutputFunctions, columnCoercionPaths);
|
|
SendCopyDataToPlacement(copyOutState->fe_msgbuf, shardId,
|
|
connectionState->connection);
|
|
}
|
|
}
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
|
|
|
copyDest->tuplesSent++;
|
|
|
|
/*
|
|
* Release per tuple memory allocated in this function. If we're writing
|
|
* the results of an INSERT ... SELECT then the SELECT execution will use
|
|
* its own executor state and reset the per tuple expression context
|
|
* separately.
|
|
*/
|
|
ResetPerTupleExprContext(executorState);
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
* AddPlacementStateToCopyConnectionStateBuffer is a helper function to add a placement
|
|
* state to connection state's placement buffer. In addition to that, keep the counter
|
|
* up to date.
|
|
*/
|
|
static void
|
|
AddPlacementStateToCopyConnectionStateBuffer(CopyConnectionState *connectionState,
|
|
CopyPlacementState *placementState)
|
|
{
|
|
dlist_push_head(&connectionState->bufferedPlacementList,
|
|
&placementState->bufferedPlacementNode);
|
|
connectionState->bufferedPlacementCount++;
|
|
}
|
|
|
|
|
|
/*
|
|
* RemovePlacementStateFromCopyConnectionStateBuffer is a helper function to removes a placement
|
|
* state from connection state's placement buffer. In addition to that, keep the counter
|
|
* up to date.
|
|
*/
|
|
static void
|
|
RemovePlacementStateFromCopyConnectionStateBuffer(CopyConnectionState *connectionState,
|
|
CopyPlacementState *placementState)
|
|
{
|
|
dlist_delete(&placementState->bufferedPlacementNode);
|
|
connectionState->bufferedPlacementCount--;
|
|
}
|
|
|
|
|
|
/*
|
|
* ProcessAppendToShardOption returns the value of append_to_shard if set,
|
|
* and removes the option from the options list.
|
|
*/
|
|
static uint64
|
|
ProcessAppendToShardOption(Oid relationId, CopyStmt *copyStatement)
|
|
{
|
|
uint64 appendShardId = INVALID_SHARD_ID;
|
|
bool appendToShardSet = false;
|
|
|
|
DefElem *defel = NULL;
|
|
foreach_ptr(defel, copyStatement->options)
|
|
{
|
|
if (strncmp(defel->defname, APPEND_TO_SHARD_OPTION, NAMEDATALEN) == 0)
|
|
{
|
|
appendShardId = defGetInt64(defel);
|
|
appendToShardSet = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (appendToShardSet)
|
|
{
|
|
if (!IsCitusTableType(relationId, APPEND_DISTRIBUTED))
|
|
{
|
|
ereport(ERROR, (errmsg(APPEND_TO_SHARD_OPTION " is only valid for "
|
|
"append-distributed tables")));
|
|
}
|
|
|
|
/* throws an error if shard does not exist */
|
|
ShardInterval *shardInterval = LoadShardInterval(appendShardId);
|
|
|
|
/* also check whether shard belongs to table */
|
|
if (shardInterval->relationId != relationId)
|
|
{
|
|
ereport(ERROR, (errmsg("shard " UINT64_FORMAT " does not belong to table %s",
|
|
appendShardId, get_rel_name(relationId))));
|
|
}
|
|
|
|
copyStatement->options =
|
|
RemoveOptionFromList(copyStatement->options, APPEND_TO_SHARD_OPTION);
|
|
}
|
|
else if (IsCitusTableType(relationId, APPEND_DISTRIBUTED))
|
|
{
|
|
ereport(ERROR, (errmsg("COPY into append-distributed table requires using the "
|
|
APPEND_TO_SHARD_OPTION " option")));
|
|
}
|
|
|
|
return appendShardId;
|
|
}
|
|
|
|
|
|
/*
|
|
* ContainsLocalPlacement returns true if the current node has
|
|
* a local placement for the given shard id.
|
|
*/
|
|
static bool
|
|
ContainsLocalPlacement(int64 shardId)
|
|
{
|
|
ListCell *placementCell = NULL;
|
|
List *activePlacementList = ActiveShardPlacementList(shardId);
|
|
int32 localGroupId = GetLocalGroupId();
|
|
|
|
foreach(placementCell, activePlacementList)
|
|
{
|
|
ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
|
|
|
|
if (placement->groupId == localGroupId)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
/*
|
|
* ShardIdForTuple returns id of the shard to which the given tuple belongs to.
|
|
*/
|
|
static uint64
|
|
ShardIdForTuple(CitusCopyDestReceiver *copyDest, Datum *columnValues, bool *columnNulls)
|
|
{
|
|
int partitionColumnIndex = copyDest->partitionColumnIndex;
|
|
Datum partitionColumnValue = 0;
|
|
CopyCoercionData *columnCoercionPaths = copyDest->columnCoercionPaths;
|
|
CitusTableCacheEntry *cacheEntry =
|
|
GetCitusTableCacheEntry(copyDest->distributedRelationId);
|
|
|
|
if (IsCitusTableTypeCacheEntry(cacheEntry, APPEND_DISTRIBUTED))
|
|
{
|
|
return copyDest->appendShardId;
|
|
}
|
|
|
|
/*
|
|
* Find the partition column value and corresponding shard interval
|
|
* for non-reference tables.
|
|
* Get the existing (and only a single) shard interval for the reference
|
|
* tables. Note that, reference tables has NULL partition column values so
|
|
* skip the check.
|
|
*/
|
|
if (partitionColumnIndex != INVALID_PARTITION_COLUMN_INDEX)
|
|
{
|
|
CopyCoercionData *coercePath = &columnCoercionPaths[partitionColumnIndex];
|
|
|
|
if (columnNulls[partitionColumnIndex])
|
|
{
|
|
Oid relationId = copyDest->distributedRelationId;
|
|
char *relationName = get_rel_name(relationId);
|
|
Oid schemaOid = get_rel_namespace(relationId);
|
|
char *schemaName = get_namespace_name(schemaOid);
|
|
char *qualifiedTableName = quote_qualified_identifier(schemaName,
|
|
relationName);
|
|
|
|
ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
|
|
errmsg("the partition column of table %s cannot be NULL",
|
|
qualifiedTableName)));
|
|
}
|
|
|
|
/* find the partition column value */
|
|
partitionColumnValue = columnValues[partitionColumnIndex];
|
|
|
|
/* annoyingly this is evaluated twice, but at least we don't crash! */
|
|
partitionColumnValue = CoerceColumnValue(partitionColumnValue, coercePath);
|
|
}
|
|
|
|
/*
|
|
* Find the shard interval and id for the partition column value for
|
|
* non-reference tables.
|
|
*
|
|
* For reference table, this function blindly returns the tables single
|
|
* shard.
|
|
*/
|
|
ShardInterval *shardInterval = FindShardInterval(partitionColumnValue, cacheEntry);
|
|
if (shardInterval == NULL)
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("could not find shard for partition column "
|
|
"value")));
|
|
}
|
|
|
|
return shardInterval->shardId;
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusCopyDestReceiverShutdown implements the rShutdown interface of
|
|
* CitusCopyDestReceiver. It ends the COPY on all the open connections and closes
|
|
* the relation.
|
|
*/
|
|
static void
|
|
CitusCopyDestReceiverShutdown(DestReceiver *destReceiver)
|
|
{
|
|
CitusCopyDestReceiver *copyDest = (CitusCopyDestReceiver *) destReceiver;
|
|
|
|
HTAB *connectionStateHash = copyDest->connectionStateHash;
|
|
ListCell *connectionStateCell = NULL;
|
|
Relation distributedRelation = copyDest->distributedRelation;
|
|
|
|
List *connectionStateList = ConnectionStateList(connectionStateHash);
|
|
|
|
FinishLocalColocatedIntermediateFiles(copyDest);
|
|
FinishLocalCopy(copyDest);
|
|
|
|
PG_TRY();
|
|
{
|
|
foreach(connectionStateCell, connectionStateList)
|
|
{
|
|
CopyConnectionState *connectionState =
|
|
(CopyConnectionState *) lfirst(connectionStateCell);
|
|
|
|
ShutdownCopyConnectionState(connectionState, copyDest);
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
/*
|
|
* We might be able to recover from errors with ROLLBACK TO SAVEPOINT,
|
|
* so unclaim the connections before throwing errors.
|
|
*/
|
|
UnclaimCopyConnections(connectionStateList);
|
|
|
|
PG_RE_THROW();
|
|
}
|
|
PG_END_TRY();
|
|
|
|
table_close(distributedRelation, NoLock);
|
|
}
|
|
|
|
|
|
/*
|
|
* FinishLocalCopy sends the remaining copies for local placements.
|
|
*/
|
|
static void
|
|
FinishLocalCopy(CitusCopyDestReceiver *copyDest)
|
|
{
|
|
HTAB *shardStateHash = copyDest->shardStateHash;
|
|
HASH_SEQ_STATUS status;
|
|
CopyShardState *copyShardState;
|
|
|
|
foreach_htab(copyShardState, &status, shardStateHash)
|
|
{
|
|
if (copyShardState->copyOutState != NULL &&
|
|
copyShardState->copyOutState->fe_msgbuf->len > 0)
|
|
{
|
|
FinishLocalCopyToShard(copyDest, copyShardState->shardId,
|
|
copyShardState->copyOutState);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* CreateLocalColocatedIntermediateFile creates a co-located file for the given
|
|
* shard, and appends the binary headers if needed. The function also modifies
|
|
* shardState to set the fileDest and copyOutState.
|
|
*/
|
|
static void
|
|
CreateLocalColocatedIntermediateFile(CitusCopyDestReceiver *copyDest,
|
|
CopyShardState *shardState)
|
|
{
|
|
/* make sure the directory exists */
|
|
CreateIntermediateResultsDirectory();
|
|
|
|
const int fileFlags = (O_CREAT | O_RDWR | O_TRUNC);
|
|
const int fileMode = (S_IRUSR | S_IWUSR);
|
|
|
|
StringInfo filePath = makeStringInfo();
|
|
appendStringInfo(filePath, "%s_%ld", copyDest->colocatedIntermediateResultIdPrefix,
|
|
shardState->shardId);
|
|
|
|
const char *fileName = QueryResultFileName(filePath->data);
|
|
shardState->fileDest =
|
|
FileCompatFromFileStart(FileOpenForTransmit(fileName, fileFlags, fileMode));
|
|
|
|
CopyOutState localFileCopyOutState = shardState->copyOutState;
|
|
bool isBinaryCopy = localFileCopyOutState->binary;
|
|
if (isBinaryCopy)
|
|
{
|
|
AppendCopyBinaryHeaders(localFileCopyOutState);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* FinishLocalColocatedIntermediateFiles iterates over all the colocated
|
|
* intermediate files and finishes the COPY on all of them.
|
|
*/
|
|
static void
|
|
FinishLocalColocatedIntermediateFiles(CitusCopyDestReceiver *copyDest)
|
|
{
|
|
HTAB *shardStateHash = copyDest->shardStateHash;
|
|
HASH_SEQ_STATUS status;
|
|
CopyShardState *copyShardState;
|
|
|
|
foreach_htab(copyShardState, &status, shardStateHash)
|
|
{
|
|
if (copyShardState->copyOutState != NULL &&
|
|
FILE_IS_OPEN(copyShardState->fileDest.fd))
|
|
{
|
|
FinishLocalCopyToFile(copyShardState->copyOutState,
|
|
©ShardState->fileDest);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* ShutdownCopyConnectionState ends the copy command for the current active
|
|
* placement on connection, and then sends the rest of the buffers over the
|
|
* connection.
|
|
*/
|
|
static void
|
|
ShutdownCopyConnectionState(CopyConnectionState *connectionState,
|
|
CitusCopyDestReceiver *copyDest)
|
|
{
|
|
CopyOutState copyOutState = copyDest->copyOutState;
|
|
CopyStmt *copyStatement = copyDest->copyStatement;
|
|
dlist_iter iter;
|
|
|
|
CopyPlacementState *activePlacementState = connectionState->activePlacementState;
|
|
if (activePlacementState != NULL)
|
|
{
|
|
EndPlacementStateCopyCommand(activePlacementState, copyOutState);
|
|
if (!copyDest->isPublishable)
|
|
{
|
|
ResetReplicationOriginRemoteSession(
|
|
activePlacementState->connectionState->connection);
|
|
}
|
|
}
|
|
|
|
dlist_foreach(iter, &connectionState->bufferedPlacementList)
|
|
{
|
|
CopyPlacementState *placementState =
|
|
dlist_container(CopyPlacementState, bufferedPlacementNode, iter.cur);
|
|
uint64 shardId = placementState->shardState->shardId;
|
|
|
|
StartPlacementStateCopyCommand(placementState, copyStatement,
|
|
copyOutState);
|
|
SendCopyDataToPlacement(placementState->data, shardId,
|
|
connectionState->connection);
|
|
EndPlacementStateCopyCommand(placementState, copyOutState);
|
|
if (!copyDest->isPublishable)
|
|
{
|
|
ResetReplicationOriginRemoteSession(connectionState->connection);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusCopyDestReceiverDestroy frees the DestReceiver
|
|
*/
|
|
static void
|
|
CitusCopyDestReceiverDestroy(DestReceiver *destReceiver)
|
|
{
|
|
CitusCopyDestReceiver *copyDest = (CitusCopyDestReceiver *) destReceiver;
|
|
|
|
if (copyDest->copyOutState)
|
|
{
|
|
pfree(copyDest->copyOutState);
|
|
}
|
|
|
|
if (copyDest->columnOutputFunctions)
|
|
{
|
|
pfree(copyDest->columnOutputFunctions);
|
|
}
|
|
|
|
if (copyDest->columnCoercionPaths)
|
|
{
|
|
pfree(copyDest->columnCoercionPaths);
|
|
}
|
|
|
|
if (copyDest->shardStateHash)
|
|
{
|
|
hash_destroy(copyDest->shardStateHash);
|
|
}
|
|
|
|
if (copyDest->connectionStateHash)
|
|
{
|
|
hash_destroy(copyDest->connectionStateHash);
|
|
}
|
|
|
|
pfree(copyDest);
|
|
}
|
|
|
|
|
|
/*
|
|
* IsCopyResultStmt determines whether the given copy statement is a
|
|
* COPY "resultkey" FROM STDIN WITH (format result) statement, which is used
|
|
* to copy query results from the coordinator into workers.
|
|
*/
|
|
bool
|
|
IsCopyResultStmt(CopyStmt *copyStatement)
|
|
{
|
|
return CopyStatementHasFormat(copyStatement, "result");
|
|
}
|
|
|
|
|
|
/*
|
|
* CopyStatementHasFormat checks whether the COPY statement has the given
|
|
* format.
|
|
*/
|
|
static bool
|
|
CopyStatementHasFormat(CopyStmt *copyStatement, char *formatName)
|
|
{
|
|
ListCell *optionCell = NULL;
|
|
bool hasFormat = false;
|
|
|
|
/* extract WITH (...) options from the COPY statement */
|
|
foreach(optionCell, copyStatement->options)
|
|
{
|
|
DefElem *defel = (DefElem *) lfirst(optionCell);
|
|
|
|
if (strncmp(defel->defname, "format", NAMEDATALEN) == 0 &&
|
|
strncmp(defGetString(defel), formatName, NAMEDATALEN) == 0)
|
|
{
|
|
hasFormat = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return hasFormat;
|
|
}
|
|
|
|
|
|
/*
|
|
* ProcessCopyStmt handles Citus specific concerns for COPY like supporting
|
|
* COPYing from distributed tables and preventing unsupported actions. The
|
|
* function returns a modified COPY statement to be executed, or NULL if no
|
|
* further processing is needed.
|
|
*/
|
|
Node *
|
|
ProcessCopyStmt(CopyStmt *copyStatement, QueryCompletion *completionTag, const
|
|
char *queryString)
|
|
{
|
|
/*
|
|
* Handle special COPY "resultid" FROM STDIN WITH (format result) commands
|
|
* for sending intermediate results to workers.
|
|
*/
|
|
if (IsCopyResultStmt(copyStatement))
|
|
{
|
|
const char *resultId = copyStatement->relation->relname;
|
|
|
|
if (copyStatement->is_from)
|
|
{
|
|
ReceiveQueryResultViaCopy(resultId);
|
|
}
|
|
else
|
|
{
|
|
SendQueryResultViaCopy(resultId);
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* We check whether a distributed relation is affected. For that, we need to open the
|
|
* relation. To prevent race conditions with later lookups, lock the table, and modify
|
|
* the rangevar to include the schema.
|
|
*/
|
|
if (copyStatement->relation != NULL)
|
|
{
|
|
bool isFrom = copyStatement->is_from;
|
|
|
|
/* consider using RangeVarGetRelidExtended to check perms before locking */
|
|
Relation copiedRelation = table_openrv(copyStatement->relation,
|
|
isFrom ? RowExclusiveLock :
|
|
AccessShareLock);
|
|
|
|
bool isCitusRelation = IsCitusTable(RelationGetRelid(copiedRelation));
|
|
|
|
/* ensure future lookups hit the same relation */
|
|
char *schemaName = get_namespace_name(RelationGetNamespace(copiedRelation));
|
|
|
|
/* ensure we copy string into proper context */
|
|
MemoryContext relationContext = GetMemoryChunkContext(
|
|
copyStatement->relation);
|
|
schemaName = MemoryContextStrdup(relationContext, schemaName);
|
|
copyStatement->relation->schemaname = schemaName;
|
|
|
|
table_close(copiedRelation, NoLock);
|
|
|
|
if (isCitusRelation)
|
|
{
|
|
if (copyStatement->is_from)
|
|
{
|
|
if (copyStatement->whereClause)
|
|
{
|
|
/*
|
|
* Update progress reporting for tuples progressed so that the
|
|
* progress is reflected on pg_stat_progress_copy. Citus currently
|
|
* does not support COPY .. WHERE clause so TUPLES_EXCLUDED is not
|
|
* handled. When we remove this check, we should implement progress
|
|
* reporting as well.
|
|
*/
|
|
ereport(ERROR, (errmsg(
|
|
"Citus does not support COPY FROM with WHERE")));
|
|
}
|
|
|
|
/* check permissions, we're bypassing postgres' normal checks */
|
|
CheckCopyPermissions(copyStatement);
|
|
CitusCopyFrom(copyStatement, completionTag);
|
|
return NULL;
|
|
}
|
|
else if (copyStatement->filename == NULL && !copyStatement->is_program &&
|
|
!CopyStatementHasFormat(copyStatement, "binary"))
|
|
{
|
|
/*
|
|
* COPY table TO STDOUT is handled by specialized logic to
|
|
* avoid buffering the table on the coordinator. This enables
|
|
* pg_dump of large tables.
|
|
*/
|
|
CitusCopyTo(copyStatement, completionTag);
|
|
return NULL;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* COPY table TO PROGRAM / file is handled by wrapping the table
|
|
* in a SELECT and going through the resulting COPY logic.
|
|
*/
|
|
SelectStmt *selectStmt = CitusCopySelect(copyStatement);
|
|
|
|
/* replace original statement */
|
|
copyStatement = copyObject(copyStatement);
|
|
copyStatement->relation = NULL;
|
|
copyStatement->query = (Node *) selectStmt;
|
|
}
|
|
}
|
|
}
|
|
return (Node *) copyStatement;
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusCopySelect generates a SelectStmt such that table may be replaced in
|
|
* "COPY table FROM" for an equivalent result.
|
|
*/
|
|
static SelectStmt *
|
|
CitusCopySelect(CopyStmt *copyStatement)
|
|
{
|
|
SelectStmt *selectStmt = makeNode(SelectStmt);
|
|
selectStmt->fromClause = list_make1(copyObject(copyStatement->relation));
|
|
|
|
Relation distributedRelation = table_openrv(copyStatement->relation, AccessShareLock);
|
|
TupleDesc tupleDescriptor = RelationGetDescr(distributedRelation);
|
|
List *targetList = NIL;
|
|
|
|
for (int i = 0; i < tupleDescriptor->natts; i++)
|
|
{
|
|
Form_pg_attribute attr = &tupleDescriptor->attrs[i];
|
|
|
|
if (attr->attisdropped ||
|
|
attr->attgenerated
|
|
)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
ColumnRef *column = makeNode(ColumnRef);
|
|
column->fields = list_make1(makeString(pstrdup(attr->attname.data)));
|
|
column->location = -1;
|
|
|
|
ResTarget *selectTarget = makeNode(ResTarget);
|
|
selectTarget->name = NULL;
|
|
selectTarget->indirection = NIL;
|
|
selectTarget->val = (Node *) column;
|
|
selectTarget->location = -1;
|
|
|
|
targetList = lappend(targetList, selectTarget);
|
|
}
|
|
|
|
table_close(distributedRelation, NoLock);
|
|
|
|
selectStmt->targetList = targetList;
|
|
return selectStmt;
|
|
}
|
|
|
|
|
|
/*
|
|
* CitusCopyTo runs a COPY .. TO STDOUT command on each shard to do a full
|
|
* table dump.
|
|
*/
|
|
static void
|
|
CitusCopyTo(CopyStmt *copyStatement, QueryCompletion *completionTag)
|
|
{
|
|
ListCell *shardIntervalCell = NULL;
|
|
int64 tuplesSent = 0;
|
|
|
|
Relation distributedRelation = table_openrv(copyStatement->relation, AccessShareLock);
|
|
Oid relationId = RelationGetRelid(distributedRelation);
|
|
TupleDesc tupleDescriptor = RelationGetDescr(distributedRelation);
|
|
|
|
CopyOutState copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData));
|
|
copyOutState->fe_msgbuf = makeStringInfo();
|
|
copyOutState->binary = false;
|
|
copyOutState->attnumlist = CopyGetAttnums(tupleDescriptor, distributedRelation,
|
|
copyStatement->attlist);
|
|
|
|
SendCopyBegin(copyOutState);
|
|
|
|
List *shardIntervalList = LoadShardIntervalList(relationId);
|
|
|
|
foreach(shardIntervalCell, shardIntervalList)
|
|
{
|
|
ShardInterval *shardInterval = lfirst(shardIntervalCell);
|
|
List *shardPlacementList = ActiveShardPlacementList(shardInterval->shardId);
|
|
ListCell *shardPlacementCell = NULL;
|
|
int placementIndex = 0;
|
|
|
|
StringInfo copyCommand = ConstructCopyStatement(copyStatement,
|
|
shardInterval->shardId);
|
|
|
|
foreach(shardPlacementCell, shardPlacementList)
|
|
{
|
|
ShardPlacement *shardPlacement = lfirst(shardPlacementCell);
|
|
int connectionFlags = 0;
|
|
char *userName = NULL;
|
|
const bool raiseErrors = true;
|
|
|
|
MultiConnection *connection = GetPlacementConnection(connectionFlags,
|
|
shardPlacement,
|
|
userName);
|
|
|
|
/*
|
|
* This code-path doesn't support optional connections, so we don't expect
|
|
* NULL connections.
|
|
*/
|
|
Assert(connection != NULL);
|
|
|
|
if (placementIndex == list_length(shardPlacementList) - 1)
|
|
{
|
|
/* last chance for this shard */
|
|
MarkRemoteTransactionCritical(connection);
|
|
}
|
|
|
|
if (PQstatus(connection->pgConn) != CONNECTION_OK)
|
|
{
|
|
ReportConnectionError(connection, ERROR);
|
|
continue;
|
|
}
|
|
|
|
RemoteTransactionBeginIfNecessary(connection);
|
|
|
|
if (!SendRemoteCommand(connection, copyCommand->data))
|
|
{
|
|
ReportConnectionError(connection, ERROR);
|
|
continue;
|
|
}
|
|
|
|
PGresult *result = GetRemoteCommandResult(connection, raiseErrors);
|
|
if (PQresultStatus(result) != PGRES_COPY_OUT)
|
|
{
|
|
ReportResultError(connection, result, ERROR);
|
|
}
|
|
|
|
PQclear(result);
|
|
|
|
tuplesSent += ForwardCopyDataFromConnection(copyOutState, connection);
|
|
|
|
break;
|
|
}
|
|
|
|
if (shardIntervalCell == list_head(shardIntervalList))
|
|
{
|
|
/* remove header after the first shard */
|
|
copyStatement->options =
|
|
RemoveOptionFromList(copyStatement->options, "header");
|
|
}
|
|
}
|
|
|
|
SendCopyEnd(copyOutState);
|
|
|
|
table_close(distributedRelation, AccessShareLock);
|
|
|
|
if (completionTag != NULL)
|
|
{
|
|
CompleteCopyQueryTagCompat(completionTag, tuplesSent);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* ForwardCopyDataFromConnection forwards copy data received over the given connection
|
|
* to the client or file descriptor.
|
|
*/
|
|
static int64
|
|
ForwardCopyDataFromConnection(CopyOutState copyOutState, MultiConnection *connection)
|
|
{
|
|
char *receiveBuffer = NULL;
|
|
const int useAsync = 0;
|
|
bool raiseErrors = true;
|
|
int64 tuplesSent = 0;
|
|
|
|
/* receive copy data message in a synchronous manner */
|
|
int receiveLength = PQgetCopyData(connection->pgConn, &receiveBuffer, useAsync);
|
|
while (receiveLength > 0)
|
|
{
|
|
bool includeEndOfLine = false;
|
|
|
|
CopySendData(copyOutState, receiveBuffer, receiveLength);
|
|
CopySendEndOfRow(copyOutState, includeEndOfLine);
|
|
tuplesSent++;
|
|
|
|
PQfreemem(receiveBuffer);
|
|
|
|
receiveLength = PQgetCopyData(connection->pgConn, &receiveBuffer, useAsync);
|
|
}
|
|
|
|
if (receiveLength != -1)
|
|
{
|
|
ReportConnectionError(connection, ERROR);
|
|
}
|
|
|
|
PGresult *result = GetRemoteCommandResult(connection, raiseErrors);
|
|
if (!IsResponseOK(result))
|
|
{
|
|
ReportResultError(connection, result, ERROR);
|
|
}
|
|
|
|
PQclear(result);
|
|
ClearResults(connection, raiseErrors);
|
|
|
|
return tuplesSent;
|
|
}
|
|
|
|
|
|
/*
|
|
* Check whether the current user has the permission to execute a COPY
|
|
* statement, raise ERROR if not. In some cases we have to do this separately
|
|
* from postgres' copy.c, because we have to execute the copy with elevated
|
|
* privileges.
|
|
*
|
|
* Copied from postgres, where it's part of DoCopy().
|
|
*/
|
|
void
|
|
CheckCopyPermissions(CopyStmt *copyStatement)
|
|
{
|
|
/* *INDENT-OFF* */
|
|
bool is_from = copyStatement->is_from;
|
|
Relation rel;
|
|
List *range_table = NIL;
|
|
TupleDesc tupDesc;
|
|
AclMode required_access = (is_from ? ACL_INSERT : ACL_SELECT);
|
|
List *attnums;
|
|
ListCell *cur;
|
|
|
|
rel = table_openrv(copyStatement->relation,
|
|
is_from ? RowExclusiveLock : AccessShareLock);
|
|
|
|
range_table = CreateRangeTable(rel, required_access);
|
|
RangeTblEntry *rte = (RangeTblEntry*) linitial(range_table);
|
|
tupDesc = RelationGetDescr(rel);
|
|
|
|
attnums = CopyGetAttnums(tupDesc, rel, copyStatement->attlist);
|
|
foreach(cur, attnums)
|
|
{
|
|
int attno = lfirst_int(cur) - FirstLowInvalidHeapAttributeNumber;
|
|
|
|
if (is_from)
|
|
{
|
|
rte->insertedCols = bms_add_member(rte->insertedCols, attno);
|
|
}
|
|
else
|
|
{
|
|
rte->selectedCols = bms_add_member(rte->selectedCols, attno);
|
|
}
|
|
}
|
|
|
|
ExecCheckRTPerms(range_table, true);
|
|
|
|
/* TODO: Perform RLS checks once supported */
|
|
|
|
table_close(rel, NoLock);
|
|
/* *INDENT-ON* */
|
|
}
|
|
|
|
|
|
/*
|
|
* CreateRangeTable creates a range table with the given relation.
|
|
*/
|
|
List *
|
|
CreateRangeTable(Relation rel, AclMode requiredAccess)
|
|
{
|
|
RangeTblEntry *rte = makeNode(RangeTblEntry);
|
|
rte->rtekind = RTE_RELATION;
|
|
rte->relid = rel->rd_id;
|
|
rte->relkind = rel->rd_rel->relkind;
|
|
rte->requiredPerms = requiredAccess;
|
|
return list_make1(rte);
|
|
}
|
|
|
|
|
|
#if PG_VERSION_NUM < PG_VERSION_14
|
|
|
|
/* Helper for CheckCopyPermissions(), copied from postgres */
|
|
static List *
|
|
CopyGetAttnums(TupleDesc tupDesc, Relation rel, List *attnamelist)
|
|
{
|
|
/* *INDENT-OFF* */
|
|
List *attnums = NIL;
|
|
|
|
if (attnamelist == NIL)
|
|
{
|
|
/* Generate default column list */
|
|
int attr_count = tupDesc->natts;
|
|
int i;
|
|
|
|
for (i = 0; i < attr_count; i++)
|
|
{
|
|
if (TupleDescAttr(tupDesc, i)->attisdropped)
|
|
continue;
|
|
if (TupleDescAttr(tupDesc, i)->attgenerated)
|
|
continue;
|
|
attnums = lappend_int(attnums, i + 1);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Validate the user-supplied list and extract attnums */
|
|
ListCell *l;
|
|
|
|
foreach(l, attnamelist)
|
|
{
|
|
char *name = strVal(lfirst(l));
|
|
int attnum;
|
|
int i;
|
|
|
|
/* Lookup column name */
|
|
attnum = InvalidAttrNumber;
|
|
for (i = 0; i < tupDesc->natts; i++)
|
|
{
|
|
Form_pg_attribute att = TupleDescAttr(tupDesc, i);
|
|
|
|
if (att->attisdropped)
|
|
continue;
|
|
if (namestrcmp(&(att->attname), name) == 0)
|
|
{
|
|
if (att->attgenerated)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
|
|
errmsg("column \"%s\" is a generated column",
|
|
name),
|
|
errdetail("Generated columns cannot be used in COPY.")));
|
|
attnum = att->attnum;
|
|
break;
|
|
}
|
|
}
|
|
if (attnum == InvalidAttrNumber)
|
|
{
|
|
if (rel != NULL)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNDEFINED_COLUMN),
|
|
errmsg("column \"%s\" of relation \"%s\" does not exist",
|
|
name, RelationGetRelationName(rel))));
|
|
else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNDEFINED_COLUMN),
|
|
errmsg("column \"%s\" does not exist",
|
|
name)));
|
|
}
|
|
/* Check for duplicates */
|
|
if (list_member_int(attnums, attnum))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DUPLICATE_COLUMN),
|
|
errmsg("column \"%s\" specified more than once",
|
|
name)));
|
|
attnums = lappend_int(attnums, attnum);
|
|
}
|
|
}
|
|
|
|
return attnums;
|
|
/* *INDENT-ON* */
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
/*
|
|
* CreateConnectionStateHash constructs a hash table which maps from socket
|
|
* number to CopyConnectionState, passing the provided MemoryContext to
|
|
* hash_create for hash allocations.
|
|
*/
|
|
static HTAB *
|
|
CreateConnectionStateHash(MemoryContext memoryContext)
|
|
{
|
|
HASHCTL info;
|
|
|
|
memset(&info, 0, sizeof(info));
|
|
info.keysize = sizeof(int);
|
|
info.entrysize = sizeof(CopyConnectionState);
|
|
info.hcxt = memoryContext;
|
|
int hashFlags = (HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
|
|
|
|
HTAB *connectionStateHash = hash_create("Copy Connection State Hash", 128, &info,
|
|
hashFlags);
|
|
|
|
return connectionStateHash;
|
|
}
|
|
|
|
|
|
/*
|
|
* CreateShardStateHash constructs a hash table which maps from shard
|
|
* identifier to CopyShardState, passing the provided MemoryContext to
|
|
* hash_create for hash allocations.
|
|
*/
|
|
static HTAB *
|
|
CreateShardStateHash(MemoryContext memoryContext)
|
|
{
|
|
HASHCTL info;
|
|
|
|
memset(&info, 0, sizeof(info));
|
|
info.keysize = sizeof(uint64);
|
|
info.entrysize = sizeof(CopyShardState);
|
|
info.hcxt = memoryContext;
|
|
int hashFlags = (HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
|
|
|
|
HTAB *shardStateHash = hash_create("Copy Shard State Hash", 128, &info, hashFlags);
|
|
|
|
return shardStateHash;
|
|
}
|
|
|
|
|
|
/*
|
|
* GetConnectionState finds existing CopyConnectionState for a connection in the
|
|
* provided hash. If not found, then a default structure is returned.
|
|
*/
|
|
static CopyConnectionState *
|
|
GetConnectionState(HTAB *connectionStateHash, MultiConnection *connection)
|
|
{
|
|
bool found = false;
|
|
|
|
int sock = PQsocket(connection->pgConn);
|
|
Assert(sock != -1);
|
|
|
|
CopyConnectionState *connectionState = (CopyConnectionState *) hash_search(
|
|
connectionStateHash, &sock,
|
|
HASH_ENTER,
|
|
&found);
|
|
if (!found)
|
|
{
|
|
connectionState->socket = sock;
|
|
connectionState->connection = connection;
|
|
connectionState->activePlacementState = NULL;
|
|
connectionState->bufferedPlacementCount = 0;
|
|
dlist_init(&connectionState->bufferedPlacementList);
|
|
}
|
|
|
|
return connectionState;
|
|
}
|
|
|
|
|
|
/*
|
|
* ConnectionStateList returns all CopyConnectionState structures in
|
|
* the given hash.
|
|
*/
|
|
static List *
|
|
ConnectionStateList(HTAB *connectionStateHash)
|
|
{
|
|
List *connectionStateList = NIL;
|
|
HASH_SEQ_STATUS status;
|
|
|
|
hash_seq_init(&status, connectionStateHash);
|
|
|
|
CopyConnectionState *connectionState = (CopyConnectionState *) hash_seq_search(
|
|
&status);
|
|
while (connectionState != NULL)
|
|
{
|
|
connectionStateList = lappend(connectionStateList, connectionState);
|
|
|
|
connectionState = (CopyConnectionState *) hash_seq_search(&status);
|
|
}
|
|
|
|
return connectionStateList;
|
|
}
|
|
|
|
|
|
/*
|
|
* ConnectionStateListToNode returns all CopyConnectionState structures in
|
|
* the given hash for a given hostname and port values.
|
|
*/
|
|
static List *
|
|
ConnectionStateListToNode(HTAB *connectionStateHash, const char *hostname, int32 port)
|
|
{
|
|
List *connectionStateList = NIL;
|
|
HASH_SEQ_STATUS status;
|
|
|
|
hash_seq_init(&status, connectionStateHash);
|
|
|
|
CopyConnectionState *connectionState =
|
|
(CopyConnectionState *) hash_seq_search(&status);
|
|
while (connectionState != NULL)
|
|
{
|
|
char *connectionHostname = connectionState->connection->hostname;
|
|
if (strncmp(connectionHostname, hostname, MAX_NODE_LENGTH) == 0 &&
|
|
connectionState->connection->port == port)
|
|
{
|
|
connectionStateList = lappend(connectionStateList, connectionState);
|
|
}
|
|
|
|
connectionState = (CopyConnectionState *) hash_seq_search(&status);
|
|
}
|
|
|
|
return connectionStateList;
|
|
}
|
|
|
|
|
|
/*
|
|
* GetShardState finds existing CopyShardState for a shard in the provided
|
|
* hash. If not found, then a new shard state is returned with all related
|
|
* CopyPlacementStates initialized.
|
|
*/
|
|
static CopyShardState *
|
|
GetShardState(uint64 shardId, HTAB *shardStateHash,
|
|
HTAB *connectionStateHash, bool *found, bool
|
|
shouldUseLocalCopy, CopyOutState copyOutState,
|
|
bool isColocatedIntermediateResult, bool isPublishable)
|
|
{
|
|
CopyShardState *shardState = (CopyShardState *) hash_search(shardStateHash, &shardId,
|
|
HASH_ENTER, found);
|
|
if (!*found)
|
|
{
|
|
InitializeCopyShardState(shardState, connectionStateHash,
|
|
shardId, shouldUseLocalCopy,
|
|
copyOutState, isColocatedIntermediateResult,
|
|
isPublishable);
|
|
}
|
|
|
|
return shardState;
|
|
}
|
|
|
|
|
|
/*
|
|
* InitializeCopyShardState initializes the given shardState. It finds all
|
|
* placements for the given shardId, assignes connections to them, and
|
|
* adds them to shardState->placementStateList.
|
|
*/
|
|
static void
|
|
InitializeCopyShardState(CopyShardState *shardState,
|
|
HTAB *connectionStateHash, uint64 shardId,
|
|
bool shouldUseLocalCopy,
|
|
CopyOutState copyOutState,
|
|
bool colocatedIntermediateResult,
|
|
bool isPublishable)
|
|
{
|
|
ListCell *placementCell = NULL;
|
|
int failedPlacementCount = 0;
|
|
bool hasRemoteCopy = false;
|
|
|
|
MemoryContext localContext =
|
|
AllocSetContextCreateInternal(CurrentMemoryContext,
|
|
"InitializeCopyShardState",
|
|
ALLOCSET_DEFAULT_MINSIZE,
|
|
ALLOCSET_DEFAULT_INITSIZE,
|
|
ALLOCSET_DEFAULT_MAXSIZE);
|
|
|
|
|
|
/* release active placement list at the end of this function */
|
|
MemoryContext oldContext = MemoryContextSwitchTo(localContext);
|
|
|
|
List *activePlacementList = ActiveShardPlacementList(shardId);
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
|
|
|
shardState->shardId = shardId;
|
|
shardState->placementStateList = NIL;
|
|
shardState->copyOutState = NULL;
|
|
shardState->containsLocalPlacement = ContainsLocalPlacement(shardId);
|
|
shardState->fileDest.fd = -1;
|
|
|
|
foreach(placementCell, activePlacementList)
|
|
{
|
|
ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
|
|
|
|
if (shouldUseLocalCopy && placement->groupId == GetLocalGroupId())
|
|
{
|
|
shardState->copyOutState = (CopyOutState) palloc0(sizeof(*copyOutState));
|
|
CloneCopyOutStateForLocalCopy(copyOutState, shardState->copyOutState);
|
|
|
|
if (colocatedIntermediateResult)
|
|
{
|
|
LogLocalCopyToFileExecution(shardId);
|
|
}
|
|
else
|
|
{
|
|
LogLocalCopyToRelationExecution(shardId);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
hasRemoteCopy = true;
|
|
|
|
MultiConnection *connection =
|
|
CopyGetPlacementConnection(connectionStateHash, placement,
|
|
colocatedIntermediateResult);
|
|
if (connection == NULL)
|
|
{
|
|
failedPlacementCount++;
|
|
continue;
|
|
}
|
|
|
|
CopyConnectionState *connectionState = GetConnectionState(connectionStateHash,
|
|
connection);
|
|
|
|
/*
|
|
* If this is the first time we are using this connection for copying a
|
|
* shard, send begin if necessary.
|
|
*/
|
|
if (connectionState->activePlacementState == NULL)
|
|
{
|
|
RemoteTransactionBeginIfNecessary(connection);
|
|
}
|
|
|
|
if (!isPublishable)
|
|
{
|
|
SetupReplicationOriginRemoteSession(connection);
|
|
}
|
|
|
|
CopyPlacementState *placementState = palloc0(sizeof(CopyPlacementState));
|
|
placementState->shardState = shardState;
|
|
placementState->data = makeStringInfo();
|
|
placementState->groupId = placement->groupId;
|
|
placementState->connectionState = connectionState;
|
|
|
|
/*
|
|
* We don't set connectionState->activePlacementState here even if it
|
|
* is NULL. Later in CitusSendTupleToPlacements() we set it at the
|
|
* same time as calling StartPlacementStateCopyCommand() so we actually
|
|
* know the COPY operation for the placement is ongoing.
|
|
*/
|
|
AddPlacementStateToCopyConnectionStateBuffer(connectionState, placementState);
|
|
shardState->placementStateList = lappend(shardState->placementStateList,
|
|
placementState);
|
|
}
|
|
|
|
/* if all placements failed, error out */
|
|
if (failedPlacementCount == list_length(activePlacementList))
|
|
{
|
|
ereport(ERROR, (errmsg("could not connect to any active placements")));
|
|
}
|
|
|
|
EnsureTaskExecutionAllowed(hasRemoteCopy);
|
|
|
|
/*
|
|
* We just error out and code execution should never reach to this
|
|
* point. This is the case for all tables.
|
|
*/
|
|
Assert(failedPlacementCount == 0);
|
|
|
|
MemoryContextReset(localContext);
|
|
}
|
|
|
|
|
|
/*
|
|
* CloneCopyOutStateForLocalCopy creates a shallow copy of the CopyOutState with a new
|
|
* fe_msgbuf. We keep a separate CopyOutState for every local shard placement, because
|
|
* in case of local copy we serialize and buffer incoming tuples into fe_msgbuf for each
|
|
* placement and the serialization functions take a CopyOutState as a parameter.
|
|
*/
|
|
static void
|
|
CloneCopyOutStateForLocalCopy(CopyOutState from, CopyOutState to)
|
|
{
|
|
to->attnumlist = from->attnumlist;
|
|
to->binary = from->binary;
|
|
to->copy_dest = from->copy_dest;
|
|
to->delim = from->delim;
|
|
to->file_encoding = from->file_encoding;
|
|
to->need_transcoding = from->need_transcoding;
|
|
to->null_print = from->null_print;
|
|
to->null_print_client = from->null_print_client;
|
|
to->rowcontext = from->rowcontext;
|
|
to->fe_msgbuf = makeStringInfo();
|
|
}
|
|
|
|
|
|
/*
|
|
* LogLocalCopyToRelationExecution logs that the copy will be done
|
|
* locally for the given shard.
|
|
*/
|
|
static void
|
|
LogLocalCopyToRelationExecution(uint64 shardId)
|
|
{
|
|
if (!(LogRemoteCommands || LogLocalCommands))
|
|
{
|
|
return;
|
|
}
|
|
ereport(NOTICE, (errmsg("executing the copy locally for shard %lu", shardId)));
|
|
}
|
|
|
|
|
|
/*
|
|
* LogLocalCopyToFileExecution logs that the copy will be done locally for
|
|
* a file colocated to the given shard.
|
|
*/
|
|
static void
|
|
LogLocalCopyToFileExecution(uint64 shardId)
|
|
{
|
|
if (!(LogRemoteCommands || LogLocalCommands))
|
|
{
|
|
return;
|
|
}
|
|
ereport(NOTICE, (errmsg("executing the copy locally for colocated file with "
|
|
"shard %lu", shardId)));
|
|
}
|
|
|
|
|
|
/*
|
|
* CopyGetPlacementConnection assigns a connection to the given placement. If
|
|
* a connection has already been assigned the placement in the current transaction
|
|
* then it reuses the connection. Otherwise, it requests a connection for placement.
|
|
*/
|
|
static MultiConnection *
|
|
CopyGetPlacementConnection(HTAB *connectionStateHash, ShardPlacement *placement,
|
|
bool colocatedIntermediateResult)
|
|
{
|
|
if (colocatedIntermediateResult)
|
|
{
|
|
/*
|
|
* Colocated intermediate results are just files and not required to use
|
|
* the same connections with their co-located shards. So, we are free to
|
|
* use any connection we can get.
|
|
*
|
|
* Also, the current connection re-use logic does not know how to handle
|
|
* intermediate results as the intermediate results always truncates the
|
|
* existing files. That's why we we use one connection per intermediate
|
|
* result.
|
|
*
|
|
* Also note that we are breaking the guarantees of citus.shared_pool_size
|
|
* as we cannot rely on optional connections.
|
|
*/
|
|
uint32 connectionFlagsForIntermediateResult = 0;
|
|
MultiConnection *connection =
|
|
GetNodeConnection(connectionFlagsForIntermediateResult, placement->nodeName,
|
|
placement->nodePort);
|
|
|
|
/*
|
|
* As noted above, we want each intermediate file to go over
|
|
* a separate connection.
|
|
*/
|
|
ClaimConnectionExclusively(connection);
|
|
|
|
/* and, we cannot afford to handle failures when anything goes wrong */
|
|
MarkRemoteTransactionCritical(connection);
|
|
|
|
return connection;
|
|
}
|
|
|
|
/*
|
|
* Determine whether the task has to be assigned to a particular connection
|
|
* due to a preceding access to the placement in the same transaction.
|
|
*/
|
|
ShardPlacementAccess *placementAccess = CreatePlacementAccess(placement,
|
|
PLACEMENT_ACCESS_DML);
|
|
uint32 connectionFlags = FOR_DML;
|
|
MultiConnection *connection =
|
|
GetConnectionIfPlacementAccessedInXact(connectionFlags,
|
|
list_make1(placementAccess), NULL);
|
|
if (connection != NULL)
|
|
{
|
|
/*
|
|
* Errors are supposed to cause immediate aborts (i.e. we don't
|
|
* want to/can't invalidate placements), mark the connection as
|
|
* critical so later errors cause failures.
|
|
*/
|
|
MarkRemoteTransactionCritical(connection);
|
|
|
|
return connection;
|
|
}
|
|
|
|
/*
|
|
* If we exceeded citus.max_adaptive_executor_pool_size, we should re-use the
|
|
* existing connections to multiplex multiple COPY commands on shards over a
|
|
* single connection.
|
|
*/
|
|
char *nodeName = placement->nodeName;
|
|
int nodePort = placement->nodePort;
|
|
List *copyConnectionStateList =
|
|
ConnectionStateListToNode(connectionStateHash, nodeName, nodePort);
|
|
if (HasReachedAdaptiveExecutorPoolSize(copyConnectionStateList))
|
|
{
|
|
/*
|
|
* If we've already reached the executor pool size, there should be at
|
|
* least one connection to any given node.
|
|
*
|
|
* Note that we don't need to mark the connection as critical, since the
|
|
* connection was already returned by this function before.
|
|
*/
|
|
connection = GetLeastUtilisedCopyConnection(copyConnectionStateList,
|
|
nodeName,
|
|
nodePort);
|
|
|
|
/*
|
|
* Make sure that the connection management remembers that Citus
|
|
* accesses this placement over the connection.
|
|
*/
|
|
AssignPlacementListToConnection(list_make1(placementAccess), connection);
|
|
|
|
return connection;
|
|
}
|
|
|
|
if (IsReservationPossible())
|
|
{
|
|
/*
|
|
* Enforce the requirements for adaptive connection management
|
|
* (a.k.a., throttle connections if citus.max_shared_pool_size
|
|
* reached).
|
|
*
|
|
* Given that we have done reservations per node, we do not ever
|
|
* need to pass WAIT_FOR_CONNECTION, we are sure that there is a
|
|
* connection either reserved for this backend or already established
|
|
* by the previous commands in the same transaction block.
|
|
*/
|
|
int adaptiveConnectionManagementFlag = OPTIONAL_CONNECTION;
|
|
connectionFlags |= adaptiveConnectionManagementFlag;
|
|
}
|
|
|
|
|
|
/*
|
|
* For placements that haven't been assigned a connection by a previous command
|
|
* in the current transaction, we use a separate connection per placement for
|
|
* hash-distributed tables in order to get the maximum performance.
|
|
*/
|
|
if (placement->partitionMethod == DISTRIBUTE_BY_HASH &&
|
|
MultiShardConnectionType != SEQUENTIAL_CONNECTION)
|
|
{
|
|
/*
|
|
* Claiming the connection exclusively (done below) would also have the
|
|
* effect of opening multiple connections, but claiming the connection
|
|
* exclusively prevents GetConnectionIfPlacementAccessedInXact from returning
|
|
* the connection if it is needed for a different shard placement.
|
|
*
|
|
* By setting the REQUIRE_CLEAN_CONNECTION flag we are guaranteed to get
|
|
* connection that will not be returned by GetConnectionIfPlacementAccessedInXact
|
|
* for the remainder of the COPY, hence it safe to claim the connection
|
|
* exclusively. Claiming a connection exclusively prevents it from being
|
|
* used in other distributed queries that happen during the COPY (e.g. if
|
|
* the copy logic calls a function to calculate a default value, and the
|
|
* function does a distributed query).
|
|
*/
|
|
connectionFlags |= REQUIRE_CLEAN_CONNECTION;
|
|
}
|
|
|
|
char *nodeUser = CurrentUserName();
|
|
connection = GetPlacementConnection(connectionFlags, placement, nodeUser);
|
|
if (connection == NULL)
|
|
{
|
|
if (list_length(copyConnectionStateList) > 0)
|
|
{
|
|
/*
|
|
* The connection manager throttled any new connections, so pick an existing
|
|
* connection with least utilization.
|
|
*
|
|
* Note that we don't need to mark the connection as critical, since the
|
|
* connection was already returned by this function before.
|
|
*/
|
|
connection =
|
|
GetLeastUtilisedCopyConnection(copyConnectionStateList, nodeName,
|
|
nodePort);
|
|
|
|
/*
|
|
* Make sure that the connection management remembers that Citus
|
|
* accesses this placement over the connection.
|
|
*/
|
|
AssignPlacementListToConnection(list_make1(placementAccess), connection);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* For this COPY command, we have not established any connections
|
|
* and adaptive connection management throttled the new connection
|
|
* request. This could only happen if this COPY command is the
|
|
* second (or later) COPY command in a transaction block as the
|
|
* first COPY command always gets a connection per node thanks to
|
|
* the connection reservation.
|
|
*
|
|
* As we know that there has been at least one COPY command happened
|
|
* earlier, we need to find the connection to that node, and use it.
|
|
*/
|
|
connection =
|
|
ConnectionAvailableToNode(nodeName, nodePort, CurrentUserName(),
|
|
CurrentDatabaseName());
|
|
|
|
/*
|
|
* We do not expect this to happen, but still instead of an assert,
|
|
* we prefer explicit error message.
|
|
*/
|
|
if (connection == NULL)
|
|
{
|
|
ereport(ERROR, (errmsg("could not find an available connection"),
|
|
errhint("Set citus.max_shared_pool_size TO -1 to let "
|
|
"COPY command finish")));
|
|
}
|
|
}
|
|
|
|
return connection;
|
|
}
|
|
|
|
if (PQstatus(connection->pgConn) != CONNECTION_OK)
|
|
{
|
|
ReportConnectionError(connection, ERROR);
|
|
}
|
|
|
|
/*
|
|
* Errors are supposed to cause immediate aborts (i.e. we don't
|
|
* want to/can't invalidate placements), mark the connection as
|
|
* critical so later errors cause failures.
|
|
*/
|
|
MarkRemoteTransactionCritical(connection);
|
|
|
|
if (MultiShardConnectionType != SEQUENTIAL_CONNECTION)
|
|
{
|
|
ClaimConnectionExclusively(connection);
|
|
}
|
|
|
|
return connection;
|
|
}
|
|
|
|
|
|
/*
|
|
* HasReachedAdaptiveExecutorPoolSize returns true if the number of entries in input
|
|
* connection list has greater than or equal to citus.max_adaptive_executor_pool_size.
|
|
*/
|
|
static bool
|
|
HasReachedAdaptiveExecutorPoolSize(List *connectionStateList)
|
|
{
|
|
if (list_length(connectionStateList) >= MaxAdaptiveExecutorPoolSize)
|
|
{
|
|
/*
|
|
* We've not reached MaxAdaptiveExecutorPoolSize number of
|
|
* connections, so we're allowed to establish a new
|
|
* connection to the given node.
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
/*
|
|
* GetLeastUtilisedCopyConnection returns a MultiConnection to the given node
|
|
* with the least number of placements assigned to it.
|
|
*
|
|
* It is assumed that there exists at least one connection to the node.
|
|
*/
|
|
static MultiConnection *
|
|
GetLeastUtilisedCopyConnection(List *connectionStateList, char *nodeName,
|
|
int nodePort)
|
|
{
|
|
MultiConnection *connection = NULL;
|
|
int minPlacementCount = PG_INT32_MAX;
|
|
ListCell *connectionStateCell = NULL;
|
|
|
|
/*
|
|
* We only pick the least utilised connection when some connection limits are
|
|
* reached such as max_shared_pool_size or max_adaptive_executor_pool_size.
|
|
*
|
|
* Therefore there should be some connections to choose from.
|
|
*/
|
|
Assert(list_length(connectionStateList) > 0);
|
|
|
|
foreach(connectionStateCell, connectionStateList)
|
|
{
|
|
CopyConnectionState *connectionState = lfirst(connectionStateCell);
|
|
int currentConnectionPlacementCount = connectionState->bufferedPlacementCount;
|
|
|
|
if (connectionState->activePlacementState != NULL)
|
|
{
|
|
currentConnectionPlacementCount++;
|
|
}
|
|
|
|
Assert(currentConnectionPlacementCount > 0);
|
|
|
|
if (currentConnectionPlacementCount < minPlacementCount)
|
|
{
|
|
minPlacementCount = currentConnectionPlacementCount;
|
|
connection = connectionState->connection;
|
|
}
|
|
}
|
|
|
|
return connection;
|
|
}
|
|
|
|
|
|
/*
|
|
* StartPlacementStateCopyCommand sends the COPY for the given placement. It also
|
|
* sends binary headers if this is a binary COPY.
|
|
*/
|
|
static void
|
|
StartPlacementStateCopyCommand(CopyPlacementState *placementState,
|
|
CopyStmt *copyStatement, CopyOutState copyOutState)
|
|
{
|
|
MultiConnection *connection = placementState->connectionState->connection;
|
|
uint64 shardId = placementState->shardState->shardId;
|
|
bool raiseInterrupts = true;
|
|
bool binaryCopy = copyOutState->binary;
|
|
|
|
StringInfo copyCommand = ConstructCopyStatement(copyStatement, shardId);
|
|
|
|
if (!SendRemoteCommand(connection, copyCommand->data))
|
|
{
|
|
ReportConnectionError(connection, ERROR);
|
|
}
|
|
|
|
PGresult *result = GetRemoteCommandResult(connection, raiseInterrupts);
|
|
if (PQresultStatus(result) != PGRES_COPY_IN)
|
|
{
|
|
ReportResultError(connection, result, ERROR);
|
|
}
|
|
|
|
PQclear(result);
|
|
|
|
if (binaryCopy)
|
|
{
|
|
SendCopyBinaryHeaders(copyOutState, shardId, list_make1(connection));
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* EndPlacementStateCopyCommand ends the COPY for the given placement. It also
|
|
* sends binary footers if this is a binary COPY.
|
|
*/
|
|
static void
|
|
EndPlacementStateCopyCommand(CopyPlacementState *placementState,
|
|
CopyOutState copyOutState)
|
|
{
|
|
MultiConnection *connection = placementState->connectionState->connection;
|
|
uint64 shardId = placementState->shardState->shardId;
|
|
bool binaryCopy = copyOutState->binary;
|
|
|
|
/* send footers and end copy command */
|
|
if (binaryCopy)
|
|
{
|
|
SendCopyBinaryFooters(copyOutState, shardId, list_make1(connection));
|
|
}
|
|
|
|
EndRemoteCopy(shardId, list_make1(connection));
|
|
}
|
|
|
|
|
|
/*
|
|
* UnclaimCopyConnections unclaims all the connections used for COPY.
|
|
*/
|
|
static void
|
|
UnclaimCopyConnections(List *connectionStateList)
|
|
{
|
|
ListCell *connectionStateCell = NULL;
|
|
|
|
foreach(connectionStateCell, connectionStateList)
|
|
{
|
|
CopyConnectionState *connectionState = lfirst(connectionStateCell);
|
|
UnclaimConnection(connectionState->connection);
|
|
}
|
|
}
|