/*
 * multi_router_executor.c
 *
 * Routines for executing remote tasks as part of a distributed execution plan
 * with synchronous connections. The routines utilize the connection cache.
 * Therefore, only a single connection is opened for each worker. Also, the
 * router executor does not require a master table and a master query. In other
 * words, the results that are fetched from a single worker are sent to the
 * output console directly. Lastly, the router executor can only execute a
 * single task.
 *
 * Copyright (c) 2012-2016, Citus Data, Inc.
 */

#include "postgres.h" /* IWYU pragma: keep */
#include "c.h"
#include "fmgr.h" /* IWYU pragma: keep */
#include "funcapi.h"
#include "libpq-fe.h"
#include "miscadmin.h"

#include <string.h>

#include "access/htup.h"
#include "access/sdir.h"
#include "access/transam.h"
#include "access/tupdesc.h"
#include "access/xact.h"
#include "catalog/pg_type.h"
#include "distributed/citus_clauses.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/commit_protocol.h"
#include "distributed/connection_cache.h"
#include "distributed/connection_management.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/listutils.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/multi_router_planner.h"
#include "distributed/multi_shard_transaction.h"
#include "distributed/relay_utility.h"
#include "distributed/remote_commands.h"
#include "distributed/remote_transaction.h"
#include "distributed/resource_lock.h"
#include "executor/execdesc.h"
#include "executor/executor.h"
#include "executor/instrument.h"
#include "executor/tuptable.h"
#include "lib/stringinfo.h"
#include "nodes/execnodes.h"
#include "nodes/nodes.h"
#include "nodes/params.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "nodes/plannodes.h"
#include "storage/ipc.h"
#include "storage/lock.h"
#include "tcop/dest.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/hsearch.h"
#include "utils/int8.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
#include "utils/tuplestore.h"


/* controls use of locks to enforce safe commutativity */
bool AllModificationsCommutative = false;


/*
 * The following static variables are necessary to track the progression of
 * multi-statement transactions managed by the router executor. After the first
 * modification within a transaction, the executor populates a hash with the
 * transaction's initial participants (nodes hit by that initial modification).
 *
 * To keep track of the reverse mapping (from shards to nodes), we have a list
 * of XactShardConnSets, which map a shard identifier to a set of connection
 * hash entries. This list is walked by MarkRemainingInactivePlacements to
 * ensure we mark placements as failed if they reject a COMMIT.
 */
static HTAB *xactParticipantHash = NULL;
static List *xactShardConnSetList = NIL;

/* functions needed during start phase */
static void InitTransactionStateForTask(Task *task);
static HTAB * CreateXactParticipantHash(void);

/* functions needed during run phase */
static void ReacquireMetadataLocks(List *taskList);
static bool ExecuteSingleTask(QueryDesc *queryDesc, Task *task,
                              bool isModificationQuery, bool expectResults);
static void GetPlacementConnectionsReadyForTwoPhaseCommit(List *taskPlacementList);
static void ExecuteMultipleTasks(QueryDesc *queryDesc, List *taskList,
                                 bool isModificationQuery, bool expectResults);
static int64 ExecuteModifyTasks(List *taskList, bool expectResults,
                                ParamListInfo paramListInfo,
                                MaterialState *routerState,
                                TupleDesc tupleDescriptor);
static List * TaskShardIntervalList(List *taskList);
static void AcquireExecutorShardLock(Task *task, CmdType commandType);
static void AcquireExecutorMultiShardLocks(List *taskList);
static bool RequiresConsistentSnapshot(Task *task);
static uint64 ReturnRowsFromTuplestore(uint64 tupleCount, TupleDesc tupleDescriptor,
                                       DestReceiver *destination,
                                       Tuplestorestate *tupleStore);
static PGconn * GetConnectionForPlacement(ShardPlacement *placement,
                                          bool isModificationQuery);
static void PurgeConnectionForPlacement(PGconn *connection, ShardPlacement *placement);
static void RemoveXactConnection(PGconn *connection);
static void ExtractParametersFromParamListInfo(ParamListInfo paramListInfo,
                                               Oid **parameterTypes,
                                               const char ***parameterValues);
static bool SendQueryInSingleRowMode(PGconn *connection, char *query,
                                     ParamListInfo paramListInfo);
static bool StoreQueryResult(MaterialState *routerState, PGconn *connection,
                             TupleDesc tupleDescriptor, bool failOnError, int64 *rows);
static bool ConsumeQueryResult(PGconn *connection, bool failOnError, int64 *rows);
static void RecordShardIdParticipant(uint64 affectedShardId,
                                     NodeConnectionEntry *participantEntry);

/* to verify the health of shards after a transactional modification command */
static void MarkRemainingInactivePlacements(void);


/*
 * RouterExecutorStart sets up the executor state and queryDesc for router
 * execution.
 */
void
RouterExecutorStart(QueryDesc *queryDesc, int eflags, List *taskList)
{
    EState *executorState = NULL;
    CmdType commandType = queryDesc->operation;

    /*
     * If we are executing a prepared statement, then we may not yet have obtained
     * the metadata locks in this transaction. To prevent a concurrent shard copy,
     * we re-obtain them here or error out if a shard copy has already started.
     *
     * If a shard copy finishes in between fetching a plan from cache and
     * re-acquiring the locks, then we might still run a stale plan, which could
     * cause shard placements to diverge. To minimize this window, we take the
     * locks as early as possible.
     */
    ReacquireMetadataLocks(taskList);

    /* disallow triggers during distributed modify commands */
    if (commandType != CMD_SELECT)
    {
        eflags |= EXEC_FLAG_SKIP_TRIGGERS;
    }

    /* signal that it is a router execution */
    eflags |= EXEC_FLAG_CITUS_ROUTER_EXECUTOR;

    /* build empty executor state to obtain per-query memory context */
    executorState = CreateExecutorState();
    executorState->es_top_eflags = eflags;
    executorState->es_instrument = queryDesc->instrument_options;

    queryDesc->estate = executorState;

    /*
     * As it's similar to what we're doing, use a MaterialState node to store
     * our state. This is used to store our tuplestore, so cursors etc. can
     * work.
     */
    queryDesc->planstate = (PlanState *) makeNode(MaterialState);
}


/*
 * ReacquireMetadataLocks re-acquires the metadata locks that are normally
 * acquired during planning.
 *
 * If we are executing a prepared statement, then planning might have
 * happened in a separate transaction and advisory locks are no longer
 * held. If a shard is currently being repaired/copied/moved, then
 * obtaining the locks will fail and this function throws an error to
 * prevent executing a stale plan.
 *
 * If we are executing a non-prepared statement or planning happened in
 * the same transaction, then we already have the locks and obtain them
 * again here. Since we always release these locks at the end of the
 * transaction, this is effectively a no-op.
 */
static void
ReacquireMetadataLocks(List *taskList)
{
    ListCell *taskCell = NULL;

    /*
     * Note: to avoid the overhead of additional sorting, we assume tasks
     * to be already sorted by shard ID such that deadlocks are avoided.
     * This is true for INSERT/SELECT, which is the only multi-shard
     * command right now.
     */

    foreach(taskCell, taskList)
    {
        Task *task = (Task *) lfirst(taskCell);

        /*
         * Only obtain metadata locks for modifications to allow reads to
         * proceed during shard copy.
         */
        if (task->taskType == MODIFY_TASK &&
            !TryLockShardDistributionMetadata(task->anchorShardId, ShareLock))
        {
            /*
             * We could error out immediately to give quick feedback to the
             * client, but this might complicate flow control and our default
             * behaviour during shard copy is to block.
             *
             * Block until the lock becomes available such that the next command
             * will likely succeed and use the serialization failure error code
             * to signal to the client that it should retry the current command.
             */
            LockShardDistributionMetadata(task->anchorShardId, ShareLock);

            ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                            errmsg("prepared modifications cannot be executed on "
                                   "a shard while it is being copied")));
        }
    }
}


/*
 * InitTransactionStateForTask is called during executor start with the first
 * modifying (INSERT/UPDATE/DELETE) task during a transaction. It creates the
 * transaction participant hash, opens connections to this task's nodes, and
 * populates the hash with those connections after sending BEGIN commands to
 * each. If a node fails to respond, its connection is set to NULL to prevent
 * further interaction with it during the transaction.
 */
static void
InitTransactionStateForTask(Task *task)
{
    ListCell *placementCell = NULL;

    BeginOrContinueCoordinatedTransaction();

    xactParticipantHash = CreateXactParticipantHash();

    foreach(placementCell, task->taskPlacementList)
    {
        ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
        NodeConnectionKey participantKey;
        NodeConnectionEntry *participantEntry = NULL;
        bool entryFound = false;
        int connectionFlags = SESSION_LIFESPAN;
        MultiConnection *connection =
            GetNodeConnection(connectionFlags, placement->nodeName, placement->nodePort);

        MemSet(&participantKey, 0, sizeof(participantKey));
        strlcpy(participantKey.nodeName, placement->nodeName,
                MAX_NODE_LENGTH + 1);
        participantKey.nodePort = placement->nodePort;

        participantEntry = hash_search(xactParticipantHash, &participantKey,
                                       HASH_ENTER, &entryFound);
        Assert(!entryFound);

        /* issue BEGIN if necessary */
        RemoteTransactionBeginIfNecessary(connection);

        participantEntry->connection = connection;
    }

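    /* remember that this transaction performed single-shard DML; mixing it with multi-shard DML later errors out */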
    XactModificationLevel = XACT_MODIFICATION_DATA;
}


/*
 * AcquireExecutorShardLock acquires a lock on the shard for the given task and
 * command type if necessary to avoid divergence between multiple replicas of
 * the same shard. No lock is obtained when there is only one replica.
 *
 * The function determines the appropriate lock mode based on the commutativity
 * rule of the command. In each case, it uses a lock mode that enforces the
 * commutativity rule.
 *
 * The mapping is overridden when all_modifications_commutative is set to true.
 * In that case, all modifications are treated as commutative, which can be used
 * to communicate that the application is only generating commutative
 * UPDATE/DELETE/UPSERT commands and exclusive locks are unnecessary.
 */
static void
AcquireExecutorShardLock(Task *task, CmdType commandType)
{
    LOCKMODE lockMode = NoLock;
    int64 shardId = task->anchorShardId;

    if (commandType == CMD_SELECT || list_length(task->taskPlacementList) == 1)
    {
        /*
         * The executor shard lock is used to maintain consistency between
         * replicas and therefore no lock is required for read-only queries
         * or in general when there is only one replica.
         */

        lockMode = NoLock;
    }
    else if (AllModificationsCommutative)
    {
        /*
         * Bypass commutativity checks when citus.all_modifications_commutative
         * is enabled.
         *
         * A RowExclusiveLock does not conflict with itself and therefore allows
         * multiple commutative commands to proceed concurrently. It does
         * conflict with ExclusiveLock, which may still be obtained by another
         * session that executes an UPDATE/DELETE/UPSERT command with
         * citus.all_modifications_commutative disabled.
         */

        lockMode = RowExclusiveLock;
    }
    else if (task->upsertQuery || commandType == CMD_UPDATE || commandType == CMD_DELETE)
    {
        /*
         * UPDATE/DELETE/UPSERT commands do not commute with other modifications
         * since the rows modified by one command may be affected by the outcome
         * of another command.
         *
         * We need to handle upsert before INSERT, because PostgreSQL models
         * upsert commands as INSERT with an ON CONFLICT section.
         *
         * ExclusiveLock conflicts with all lock types used by modifications
         * and therefore prevents other modifications from running
         * concurrently.
         */

        lockMode = ExclusiveLock;
    }
    else if (commandType == CMD_INSERT)
    {
        /*
         * An INSERT commutes with other INSERT commands, since performing them
         * out-of-order only affects the table order on disk, but not the
         * contents.
         *
         * When a unique constraint exists, INSERTs are not strictly commutative,
         * but whichever INSERT comes last will error out and thus has no effect.
         * INSERT is not commutative with UPDATE/DELETE/UPSERT, since the
         * UPDATE/DELETE/UPSERT may consider the INSERT, depending on execution
         * order.
         *
         * A RowExclusiveLock does not conflict with itself and therefore allows
         * multiple INSERT commands to proceed concurrently. It conflicts with
         * ExclusiveLock obtained by UPDATE/DELETE/UPSERT, ensuring those do
         * not run concurrently with INSERT.
         */

        lockMode = RowExclusiveLock;
    }
    else
    {
        ereport(ERROR, (errmsg("unrecognized operation code: %d", (int) commandType)));
    }

    if (shardId != INVALID_SHARD_ID && lockMode != NoLock)
    {
        LockShardResource(shardId, lockMode);
    }

    /*
     * If the task has a subselect, then we may need to lock the shards from which
     * the query selects as well to prevent the subselects from seeing different
     * results on different replicas. In particular this prevents INSERT..SELECT
     * commands from having a different effect on different placements.
     */
    if (RequiresConsistentSnapshot(task))
    {
        /*
         * ExclusiveLock conflicts with all lock types used by modifications
         * and therefore prevents other modifications from running
         * concurrently.
         */

        LockRelationShardResources(task->relationShardList, ExclusiveLock);
    }
}


/*
 * AcquireExecutorMultiShardLocks acquires shard locks needed for execution
 * of writes on multiple shards. In addition to honouring commutativity
 * rules, we currently only allow a single multi-shard command on a shard at
 * a time. Otherwise, concurrent multi-shard commands may take row-level
 * locks on the shard placements in a different order and create a distributed
 * deadlock. This applies even when writes are commutative and/or there is
 * no replication.
 *
 * 1. If citus.all_modifications_commutative is set to true, then all locks
 * are acquired as ShareUpdateExclusiveLock.
 *
 * 2. If citus.all_modifications_commutative is false, then only the shards
 * with 2 or more replicas are locked with ExclusiveLock. Otherwise, the
 * lock is acquired with ShareUpdateExclusiveLock.
 *
 * ShareUpdateExclusiveLock conflicts with itself such that only one
 * multi-shard modification at a time is allowed on a shard. It also conflicts
 * with ExclusiveLock, which ensures that updates/deletes/upserts are applied
 * in the same order on all placements. It does not conflict with
 * RowExclusiveLock, which is normally obtained by single-shard, commutative
 * writes.
 */
static void
AcquireExecutorMultiShardLocks(List *taskList)
{
    ListCell *taskCell = NULL;

    foreach(taskCell, taskList)
    {
        Task *task = (Task *) lfirst(taskCell);
        LOCKMODE lockMode = NoLock;

        if (AllModificationsCommutative || list_length(task->taskPlacementList) == 1)
        {
            /*
             * When all writes are commutative then we only need to prevent multi-shard
             * commands from running concurrently with each other and with commands
             * that are explicitly non-commutative. When there is no replication then
             * we only need to prevent concurrent multi-shard commands.
             *
             * In either case, ShareUpdateExclusive has the desired effect, since
             * it conflicts with itself and ExclusiveLock (taken by non-commutative
             * writes).
             */

            lockMode = ShareUpdateExclusiveLock;
        }
        else
        {
            /*
             * When there is replication, prevent all concurrent writes to the same
             * shards to ensure the writes are ordered.
             */

            lockMode = ExclusiveLock;
        }

        LockShardResource(task->anchorShardId, lockMode);

        /*
         * If the task has a subselect, then we may need to lock the shards from which
         * the query selects as well to prevent the subselects from seeing different
         * results on different replicas. In particular this prevents INSERT..SELECT
         * commands from having different effects on different placements.
         */

        if (RequiresConsistentSnapshot(task))
        {
            /*
             * ExclusiveLock conflicts with all lock types used by modifications
             * and therefore prevents other modifications from running
             * concurrently.
             */

            LockRelationShardResources(task->relationShardList, ExclusiveLock);
        }
    }
}


/*
 * RequiresConsistentSnapshot returns true if the given task needs to take
 * the necessary locks to ensure that a subquery in the INSERT ... SELECT
 * query returns the same output for all task placements.
 */
static bool
RequiresConsistentSnapshot(Task *task)
{
    bool requiresIsolation = false;

    if (!task->insertSelectQuery)
    {
        /*
         * Only INSERT/SELECT commands currently require SELECT isolation.
         * Other commands do not read from other shards.
         */

        requiresIsolation = false;
    }
    else if (list_length(task->taskPlacementList) == 1)
    {
        /*
         * If there is only one replica then we fully rely on PostgreSQL to
         * provide SELECT isolation. In this case, we do not provide isolation
         * across the shards, but that was never our intention.
         */

        requiresIsolation = false;
    }
    else if (AllModificationsCommutative)
    {
        /*
         * An INSERT/SELECT is commutative with other writes if it excludes
         * any ongoing writes based on the filter conditions. Without knowing
         * whether this is true, we assume the user took this into account
         * when enabling citus.all_modifications_commutative. This option
         * gives users an escape from aggressive locking during INSERT/SELECT.
         */

        requiresIsolation = false;
    }
    else
    {
        /*
         * If this is a non-commutative write, then we need to block ongoing
         * writes to make sure that the subselect returns the same result
         * on all placements.
         */

        requiresIsolation = true;
    }

    return requiresIsolation;
}


/*
 * CreateXactParticipantHash initializes the map used to store the connections
 * needed to process distributed transactions. Unlike the connection cache, we
 * permit NULL connections here to signify that a participant has seen an error
 * and is no longer receiving commands during a transaction. This hash should
 * be walked at transaction end to send final COMMIT or ABORT commands.
 */
static HTAB *
CreateXactParticipantHash(void)
{
    HTAB *xactParticipantHash = NULL;
    HASHCTL info;
    int hashFlags = 0;

    MemSet(&info, 0, sizeof(info));
    info.keysize = sizeof(NodeConnectionKey);
    info.entrysize = sizeof(NodeConnectionEntry);
    info.hcxt = TopTransactionContext;
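    /* HASH_BLOBS: keys are hashed and compared as raw bytes, which is why key structs are zeroed before use */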
    hashFlags = (HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);

    xactParticipantHash = hash_create("citus xact participant hash", 32, &info,
                                      hashFlags);

    return xactParticipantHash;
}


/*
 * RouterExecutorRun actually executes the job's task(s) on the worker node(s).
 */
void
RouterExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
{
    PlannedStmt *planStatement = queryDesc->plannedstmt;
    MultiPlan *multiPlan = GetMultiPlan(planStatement);
    Job *workerJob = multiPlan->workerJob;
    List *taskList = workerJob->taskList;
    EState *estate = queryDesc->estate;
    CmdType operation = queryDesc->operation;
    MemoryContext oldcontext = NULL;
    DestReceiver *destination = queryDesc->dest;
    MaterialState *routerState = (MaterialState *) queryDesc->planstate;
    bool sendTuples = operation == CMD_SELECT || queryDesc->plannedstmt->hasReturning;

    Assert(estate != NULL);
    Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));

    oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);

    if (queryDesc->totaltime != NULL)
    {
        InstrStartNode(queryDesc->totaltime);
    }

    estate->es_processed = 0;

    /* startup the tuple receiver */
    if (sendTuples)
    {
        (*destination->rStartup)(destination, operation, queryDesc->tupDesc);
    }

    /* we only support returning nothing or scanning forward */
    if (ScanDirectionIsNoMovement(direction))
    {
        /* comments in PortalRunSelect() explain the reason for this case */
        goto out;
    }
    else if (!ScanDirectionIsForward(direction))
    {
        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                        errmsg("scan directions other than forward scans "
                               "are unsupported")));
    }

    /*
     * If the query has not yet been executed, do so now. The main reason why the
     * query might already have been executed is cursors.
     */
    if (!routerState->eof_underlying)
    {
        bool isModificationQuery = false;
        bool requiresMasterEvaluation = workerJob->requiresMasterEvaluation;

        if (operation == CMD_INSERT || operation == CMD_UPDATE ||
            operation == CMD_DELETE)
        {
            isModificationQuery = true;
        }
        else if (operation != CMD_SELECT)
        {
            ereport(ERROR, (errmsg("unrecognized operation code: %d",
                                   (int) operation)));
        }

        if (requiresMasterEvaluation)
        {
            Query *jobQuery = workerJob->jobQuery;

            ExecuteMasterEvaluableFunctions(jobQuery);
            RebuildQueryStrings(jobQuery, taskList);
        }

        if (list_length(taskList) == 1)
        {
            Task *task = (Task *) linitial(taskList);
            bool resultsOK = false;

            resultsOK = ExecuteSingleTask(queryDesc, task, isModificationQuery,
                                          sendTuples);
            if (!resultsOK)
            {
                ereport(ERROR, (errmsg("could not receive query results")));
            }
        }
        else
        {
            ExecuteMultipleTasks(queryDesc, taskList, isModificationQuery,
                                 sendTuples);
        }

        /* mark underlying query as having executed */
        routerState->eof_underlying = true;
    }

    /* if the underlying query produced output, return it */
    if (routerState->tuplestorestate != NULL)
    {
        TupleDesc resultTupleDescriptor = queryDesc->tupDesc;
        int64 returnedRows = 0;

        /* return rows from the tuplestore */
        returnedRows = ReturnRowsFromTuplestore(count, resultTupleDescriptor,
                                                destination,
                                                routerState->tuplestorestate);

        /*
         * Count tuples processed, if this is a SELECT. (For modifications
         * it'll already have been increased, as we want the number of
         * modified tuples, not the number of RETURNed tuples.)
         */
        if (operation == CMD_SELECT)
        {
            estate->es_processed += returnedRows;
        }
    }

out:

    /* shutdown tuple receiver, if we started it */
    if (sendTuples)
    {
        (*destination->rShutdown)(destination);
    }

    if (queryDesc->totaltime != NULL)
    {
        InstrStopNode(queryDesc->totaltime, estate->es_processed);
    }

    MemoryContextSwitchTo(oldcontext);
}


/*
 * ExecuteSingleTask executes the task on the remote node, retrieves the
 * results and stores them, if SELECT or RETURNING is used, in a tuple
 * store.
 *
 * If the task fails on one of the placements, the function retries it on
 * other placements (SELECT), reraises the remote error (constraint violation
 * in DML), marks the affected placement as invalid (DML on some placement
 * failed), or errors out (DML failed on all placements).
 */
static bool
ExecuteSingleTask(QueryDesc *queryDesc, Task *task,
                  bool isModificationQuery,
                  bool expectResults)
{
    CmdType operation = queryDesc->operation;
    TupleDesc tupleDescriptor = queryDesc->tupDesc;
    EState *executorState = queryDesc->estate;
    MaterialState *routerState = (MaterialState *) queryDesc->planstate;
    ParamListInfo paramListInfo = queryDesc->params;
    bool resultsOK = false;
    List *taskPlacementList = task->taskPlacementList;
    ListCell *taskPlacementCell = NULL;
    List *failedPlacementList = NIL;
    int64 affectedTupleCount = -1;
    bool gotResults = false;
    char *queryString = task->queryString;
    bool taskRequiresTwoPhaseCommit = (task->replicationModel == REPLICATION_MODEL_2PC);

    if (XactModificationLevel == XACT_MODIFICATION_MULTI_SHARD)
    {
        ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
                        errmsg("single-shard DML commands must not appear in "
                               "transaction blocks which contain multi-shard data "
                               "modifications")));
    }

    /*
     * First, ensure that a distributed transaction is started. Then, force
     * the transaction manager to use 2PC while running the task on the placements.
     */
    if (taskRequiresTwoPhaseCommit)
    {
        BeginOrContinueCoordinatedTransaction();
        CoordinatedTransactionUse2PC();

        /*
         * Mark connections for all placements as critical and establish connections
         * to all placements at once.
         */
        GetPlacementConnectionsReadyForTwoPhaseCommit(taskPlacementList);
    }

    /*
     * We could naturally handle function-based transactions (i.e. those
     * using PL/pgSQL or similar) by checking the type of queryDesc->dest,
     * but some customers already use functions that touch multiple shards
     * from within a function, so we'll ignore functions for now.
     */
    if (operation != CMD_SELECT && xactParticipantHash == NULL && IsTransactionBlock())
    {
        InitTransactionStateForTask(task);
    }

    /* prevent replicas of the same shard from diverging */
    AcquireExecutorShardLock(task, operation);

    /*
     * Try to run the query to completion on one placement. If the query fails,
     * attempt the query on the next placement.
     */
    foreach(taskPlacementCell, taskPlacementList)
    {
        ShardPlacement *taskPlacement = (ShardPlacement *) lfirst(taskPlacementCell);
        bool queryOK = false;
        bool failOnError = false;
        int64 currentAffectedTupleCount = 0;
        PGconn *connection = GetConnectionForPlacement(taskPlacement,
                                                       isModificationQuery);

        if (connection == NULL)
        {
            failedPlacementList = lappend(failedPlacementList, taskPlacement);
            continue;
        }

        queryOK = SendQueryInSingleRowMode(connection, queryString, paramListInfo);
        if (!queryOK)
        {
            PurgeConnectionForPlacement(connection, taskPlacement);
            failedPlacementList = lappend(failedPlacementList, taskPlacement);
            continue;
        }

        /* if we're running a 2PC, the query should fail on error */
        failOnError = taskRequiresTwoPhaseCommit;

        /*
         * If the caller is interested, store query results the first time
         * through. The output of the query's execution on other shards is
         * discarded if we run there (because it's a modification query).
         */
        if (!gotResults && expectResults)
        {
            queryOK = StoreQueryResult(routerState, connection, tupleDescriptor,
                                       failOnError, &currentAffectedTupleCount);
        }
        else
        {
            queryOK = ConsumeQueryResult(connection, failOnError,
                                         &currentAffectedTupleCount);
        }

        if (queryOK)
        {
            if ((affectedTupleCount == -1) ||
                (affectedTupleCount == currentAffectedTupleCount))
            {
                affectedTupleCount = currentAffectedTupleCount;
            }
            else
            {
                ereport(WARNING,
                        (errmsg("modified " INT64_FORMAT " tuples, but expected "
                                "to modify " INT64_FORMAT,
                                currentAffectedTupleCount, affectedTupleCount),
                         errdetail("modified placement on %s:%d",
                                   taskPlacement->nodeName, taskPlacement->nodePort)));
            }

#if (PG_VERSION_NUM < 90600)

            /* before 9.6, PostgreSQL used a uint32 for this field, so check */
            Assert(currentAffectedTupleCount <= 0xFFFFFFFF);
#endif

            resultsOK = true;
            gotResults = true;

            /*
             * Modifications have to be executed on all placements, but for
             * read queries we can stop here.
             */
            if (!isModificationQuery)
            {
                break;
            }
        }
        else
        {
            PurgeConnectionForPlacement(connection, taskPlacement);

            failedPlacementList = lappend(failedPlacementList, taskPlacement);

            continue;
        }
    }

    if (isModificationQuery)
    {
        ListCell *failedPlacementCell = NULL;

        /* if all placements failed, error out */
        if (list_length(failedPlacementList) == list_length(task->taskPlacementList))
        {
            ereport(ERROR, (errmsg("could not modify any active placements")));
        }

        /*
         * Otherwise, mark failed placements as inactive: they're stale. Note that
         * connections for tasks that require 2PC have already failed the whole
         * transaction and there is no way that they're marked stale here.
         */
        foreach(failedPlacementCell, failedPlacementList)
        {
            ShardPlacement *failedPlacement =
                (ShardPlacement *) lfirst(failedPlacementCell);

            Assert(!taskRequiresTwoPhaseCommit);

            UpdateShardPlacementState(failedPlacement->placementId, FILE_INACTIVE);
        }

        executorState->es_processed = affectedTupleCount;
    }

    return resultsOK;
}


/*
 * GetPlacementConnectionsReadyForTwoPhaseCommit iterates over the task placement list,
 * starts the connections to the nodes and marks them critical. In the second iteration,
 * the connection establishments are finished. Finally, BEGIN commands are sent,
 * if necessary.
 */
static void
GetPlacementConnectionsReadyForTwoPhaseCommit(List *taskPlacementList)
{
    ListCell *taskPlacementCell = NULL;
    List *multiConnectionList = NIL;

    /* in the first iteration start the connections */
    foreach(taskPlacementCell, taskPlacementList)
    {
        ShardPlacement *taskPlacement = (ShardPlacement *) lfirst(taskPlacementCell);
        int connectionFlags = SESSION_LIFESPAN;
        MultiConnection *multiConnection = StartNodeConnection(connectionFlags,
                                                               taskPlacement->nodeName,
                                                               taskPlacement->nodePort);

        MarkRemoteTransactionCritical(multiConnection);

        multiConnectionList = lappend(multiConnectionList, multiConnection);
    }

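    /* wait for all of the connection attempts started above to complete */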
    FinishConnectionListEstablishment(multiConnectionList);

    RemoteTransactionsBeginIfNecessary(multiConnectionList);
}


/*
 * ExecuteMultipleTasks executes a list of tasks on remote nodes, retrieves
 * the results and, if RETURNING is used, stores them in a tuple store.
 *
 * If a task fails on one of the placements, the transaction rolls back.
 * Otherwise, the changes are committed using 2PC when the local transaction
 * commits.
 */
static void
ExecuteMultipleTasks(QueryDesc *queryDesc, List *taskList,
                     bool isModificationQuery, bool expectResults)
{
    TupleDesc tupleDescriptor = queryDesc->tupDesc;
    EState *executorState = queryDesc->estate;
    MaterialState *routerState = (MaterialState *) queryDesc->planstate;
    ParamListInfo paramListInfo = queryDesc->params;
    int64 affectedTupleCount = -1;

    /* can only support modifications right now */
    Assert(isModificationQuery);

    affectedTupleCount = ExecuteModifyTasks(taskList, expectResults, paramListInfo,
                                            routerState, tupleDescriptor);

    executorState->es_processed = affectedTupleCount;
}


/*
 * ExecuteModifyTasksWithoutResults provides a wrapper around ExecuteModifyTasks
 * for calls that do not require results. In this case, the expectResults flag
 * is set to false and arguments related to result sets and query parameters are
 * NULL. This function is primarily intended to allow DDL and
 * master_modify_multiple_shards to use the router executor infrastructure.
 */
int64
ExecuteModifyTasksWithoutResults(List *taskList)
{
    return ExecuteModifyTasks(taskList, false, NULL, NULL, NULL);
}


/*
 * ExecuteModifyTasks executes a list of tasks on remote nodes, and
 * optionally retrieves the results and stores them in a tuple store.
 *
 * If a task fails on one of the placements, the transaction rolls back.
 * Otherwise, the changes are committed using 2PC when the local transaction
 * commits.
 */
static int64
ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListInfo,
                   MaterialState *routerState, TupleDesc tupleDescriptor)
{
    int64 totalAffectedTupleCount = 0;
    ListCell *taskCell = NULL;
    char *userName = CurrentUserName();
    List *shardIntervalList = NIL;
    List *affectedTupleCountList = NIL;
    bool tasksPending = true;
    int placementIndex = 0;

    if (XactModificationLevel == XACT_MODIFICATION_DATA)
    {
        ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
                        errmsg("multi-shard data modifications must not appear in "
                               "transaction blocks which contain single-shard DML "
                               "commands")));
    }

    shardIntervalList = TaskShardIntervalList(taskList);

    /* ensure that there are no concurrent modifications on the same shards */
    AcquireExecutorMultiShardLocks(taskList);

    /* open connection to all relevant placements, if not already open */
    OpenTransactionsToAllShardPlacements(shardIntervalList, userName);

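    /* remember that multi-shard DML ran; a later single-shard DML in the same transaction errors out */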
    XactModificationLevel = XACT_MODIFICATION_MULTI_SHARD;

    /* iterate over placements in rounds, to ensure in-order execution */
    while (tasksPending)
    {
        int taskIndex = 0;

        tasksPending = false;

        /* send command to all shard placements with the current index in parallel */
        foreach(taskCell, taskList)
        {
            Task *task = (Task *) lfirst(taskCell);
            int64 shardId = task->anchorShardId;
            char *queryString = task->queryString;
            bool shardConnectionsFound = false;
            ShardConnections *shardConnections = NULL;
            List *connectionList = NIL;
            MultiConnection *multiConnection = NULL;
            PGconn *connection = NULL;
            bool queryOK = false;

            shardConnections = GetShardConnections(shardId, &shardConnectionsFound);
            connectionList = shardConnections->connectionList;

            if (placementIndex >= list_length(connectionList))
            {
                /* no more active placements for this task */
                continue;
            }

            multiConnection =
                (MultiConnection *) list_nth(connectionList, placementIndex);
            connection = multiConnection->pgConn;

            queryOK = SendQueryInSingleRowMode(connection, queryString, paramListInfo);
            if (!queryOK)
            {
                ReraiseRemoteError(connection, NULL);
            }
        }

        /* collect results from all relevant shard placements */
        foreach(taskCell, taskList)
        {
            Task *task = (Task *) lfirst(taskCell);
            int64 shardId = task->anchorShardId;
            bool shardConnectionsFound = false;
            ShardConnections *shardConnections = NULL;
            List *connectionList = NIL;
            MultiConnection *multiConnection = NULL;
            PGconn *connection = NULL;
            int64 currentAffectedTupleCount = 0;
            bool failOnError = true;
            bool queryOK PG_USED_FOR_ASSERTS_ONLY = false;

            /* abort in case of cancellation */
            CHECK_FOR_INTERRUPTS();

            shardConnections = GetShardConnections(shardId, &shardConnectionsFound);
            connectionList = shardConnections->connectionList;

            if (placementIndex >= list_length(connectionList))
            {
                /* no more active placements for this task */
                taskIndex++;
                continue;
            }

            multiConnection =
                (MultiConnection *) list_nth(connectionList, placementIndex);
            connection = multiConnection->pgConn;

            /*
             * If the caller is interested, store query results the first time
             * through. The output of the query's execution on other shards is
             * discarded if we run there (because it's a modification query).
             */
            if (placementIndex == 0 && expectResults)
            {
                Assert(routerState != NULL && tupleDescriptor != NULL);

                queryOK = StoreQueryResult(routerState, connection, tupleDescriptor,
                                           failOnError, &currentAffectedTupleCount);
            }
            else
            {
                queryOK = ConsumeQueryResult(connection, failOnError,
                                             &currentAffectedTupleCount);
            }

            /* should have rolled back on error */
            Assert(queryOK);

            if (placementIndex == 0)
            {
                totalAffectedTupleCount += currentAffectedTupleCount;

                /* keep track of the initial affected tuple count */
                affectedTupleCountList = lappend_int(affectedTupleCountList,
                                                     currentAffectedTupleCount);
            }
            else
            {
                /* warn the user if shard placements have diverged */
                int64 previousAffectedTupleCount = list_nth_int(affectedTupleCountList,
                                                                taskIndex);

                if (currentAffectedTupleCount != previousAffectedTupleCount)
                {
                    char *nodeName = ConnectionGetOptionValue(connection, "host");
                    char *nodePort = ConnectionGetOptionValue(connection, "port");

                    ereport(WARNING,
                            (errmsg("modified " INT64_FORMAT " tuples of shard "
                                    UINT64_FORMAT ", but expected to modify " INT64_FORMAT,
                                    currentAffectedTupleCount, shardId,
                                    previousAffectedTupleCount),
                             errdetail("modified placement on %s:%s", nodeName,
                                       nodePort)));
                }
            }

            if (!tasksPending && placementIndex + 1 < list_length(connectionList))
            {
                /* more placements remain for this task after this one */
                tasksPending = true;
            }

            taskIndex++;
        }

        placementIndex++;
    }

    CHECK_FOR_INTERRUPTS();

    return totalAffectedTupleCount;
}


/*
 * TaskShardIntervalList returns a list of shard intervals for a given list of
 * tasks.
 */
static List *
TaskShardIntervalList(List *taskList)
{
    ListCell *taskCell = NULL;
    List *shardIntervalList = NIL;

    foreach(taskCell, taskList)
    {
        Task *task = (Task *) lfirst(taskCell);
        int64 shardId = task->anchorShardId;
        ShardInterval *shardInterval = LoadShardInterval(shardId);

        shardIntervalList = lappend(shardIntervalList, shardInterval);
    }

    return shardIntervalList;
}


/*
 * ReturnRowsFromTuplestore moves rows from a given tuplestore into a
 * receiver. It performs the necessary limiting to support cursors.
 */
static uint64
ReturnRowsFromTuplestore(uint64 tupleCount, TupleDesc tupleDescriptor,
                         DestReceiver *destination, Tuplestorestate *tupleStore)
{
    TupleTableSlot *tupleTableSlot = NULL;
    uint64 currentTupleCount = 0;

    tupleTableSlot = MakeSingleTupleTableSlot(tupleDescriptor);

    /* iterate over tuples in tuple store, and send them to destination */
    for (;;)
    {
        bool nextTuple = tuplestore_gettupleslot(tupleStore, true, false, tupleTableSlot);
        if (!nextTuple)
        {
            break;
        }

        (*destination->receiveSlot)(tupleTableSlot, destination);

        ExecClearTuple(tupleTableSlot);

        currentTupleCount++;

        /*
         * If tupleCount is zero, fetch all tuples; otherwise stop after
         * tupleCount tuples.
         */
        if (tupleCount > 0 && tupleCount == currentTupleCount)
        {
            break;
        }
    }

    ExecDropSingleTupleTableSlot(tupleTableSlot);

    return currentTupleCount;
}


/*
 * GetConnectionForPlacement is the main entry point for acquiring a connection
 * within the router executor. By using placements (rather than node names and
 * ports) to identify connections, the router executor can keep track of shards
 * used by multi-statement transactions and error out if a transaction tries
 * to reach a new node altogether. In the single-statement command context,
 * GetConnectionForPlacement simply falls through to GetOrEstablishConnection.
 */
static PGconn *
GetConnectionForPlacement(ShardPlacement *placement, bool isModificationQuery)
{
    NodeConnectionKey participantKey;
    NodeConnectionEntry *participantEntry = NULL;
    bool entryFound = false;

    /* if not in a transaction, fall through to connection cache */
    if (xactParticipantHash == NULL)
    {
        PGconn *connection = GetOrEstablishConnection(placement->nodeName,
                                                      placement->nodePort);

        return connection;
    }

    Assert(IsTransactionBlock());

    MemSet(&participantKey, 0, sizeof(participantKey));
    strlcpy(participantKey.nodeName, placement->nodeName, MAX_NODE_LENGTH + 1);
    participantKey.nodePort = placement->nodePort;

    participantEntry = hash_search(xactParticipantHash, &participantKey, HASH_FIND,
                                   &entryFound);

    if (entryFound)
    {
        if (isModificationQuery)
        {
            RecordShardIdParticipant(placement->shardId, participantEntry);
        }

        return participantEntry->connection->pgConn;
    }
    else
    {
        ereport(ERROR, (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST),
                        errmsg("no transaction participant matches %s:%d",
                               placement->nodeName, placement->nodePort),
                        errdetail("Transactions which modify distributed tables may only "
                                  "target nodes affected by the modification command "
                                  "which began the transaction.")));
    }
}


/*
 * PurgeConnectionForPlacement provides a way to purge an invalid connection
 * from all relevant connection hashes using the placement involved in the
 * query at the time of the error. If a transaction is ongoing, this function
 * ensures the right node's connection is set to NULL in the participant map
 * for the transaction in addition to purging the connection cache's entry.
 */
static void
PurgeConnectionForPlacement(PGconn *connection, ShardPlacement *placement)
{
    CloseConnectionByPGconn(connection);

    /*
     * The following is logically identical to RemoveXactConnection, but since
     * we have a ShardPlacement to help build a NodeConnectionKey, we avoid
     * any penalty incurred by calling BuildKeyForConnection, which must
     * extract host, port, and user from the connection options list.
     */
    if (xactParticipantHash != NULL)
    {
        NodeConnectionEntry *participantEntry = NULL;
        bool entryFound = false;
        NodeConnectionKey nodeKey;
        char *currentUser = CurrentUserName();

        MemSet(&nodeKey, 0, sizeof(NodeConnectionKey));
        strlcpy(nodeKey.nodeName, placement->nodeName, MAX_NODE_LENGTH + 1);
        nodeKey.nodePort = placement->nodePort;
        strlcpy(nodeKey.nodeUser, currentUser, NAMEDATALEN);
        Assert(IsTransactionBlock());

        /* the participant hash doesn't use the user field */
        MemSet(&nodeKey.nodeUser, 0, sizeof(nodeKey.nodeUser));
        participantEntry = hash_search(xactParticipantHash, &nodeKey, HASH_FIND,
                                       &entryFound);

        Assert(entryFound);

        participantEntry->connection = NULL;
    }
}


/*
 * RemoveXactConnection removes a given connection from the transaction
 * participant hash, based on the host and port of the provided connection.
 * If the hash is not NULL, it MUST contain the provided connection, or a
 * FATAL error is raised.
 */
static void
RemoveXactConnection(PGconn *connection)
{
    NodeConnectionKey nodeKey;
    NodeConnectionEntry *participantEntry = NULL;
    bool entryFound = false;

    if (xactParticipantHash == NULL)
    {
        return;
    }

    BuildKeyForConnection(connection, &nodeKey);

    /* the participant hash doesn't use the user field */
    MemSet(&nodeKey.nodeUser, 0, sizeof(nodeKey.nodeUser));
    participantEntry = hash_search(xactParticipantHash, &nodeKey, HASH_FIND,
                                   &entryFound);

    if (!entryFound)
    {
        ereport(FATAL, (errmsg("could not find specified transaction connection")));
    }

    participantEntry->connection = NULL;
}


/*
 * SendQueryInSingleRowMode sends the given query on the connection in an
 * asynchronous way. The function also sets the single-row mode on the
 * connection so that we receive results a row at a time.
 */
static bool
SendQueryInSingleRowMode(PGconn *connection, char *query, ParamListInfo paramListInfo)
{
    int querySent = 0;
    int singleRowMode = 0;

    if (paramListInfo != NULL)
    {
        int parameterCount = paramListInfo->numParams;
        Oid *parameterTypes = NULL;
        const char **parameterValues = NULL;

        ExtractParametersFromParamListInfo(paramListInfo, &parameterTypes,
                                           &parameterValues);

        querySent = PQsendQueryParams(connection, query, parameterCount, parameterTypes,
                                      parameterValues, NULL, NULL, 0);
    }
    else
    {
        querySent = PQsendQuery(connection, query);
    }

    if (querySent == 0)
    {
        WarnRemoteError(connection, NULL);
        return false;
    }

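    /* single-row mode must be requested right after sending the query, before consuming any results */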
    singleRowMode = PQsetSingleRowMode(connection);
    if (singleRowMode == 0)
    {
        WarnRemoteError(connection, NULL);
        return false;
    }

    return true;
}


/*
 * ExtractParametersFromParamListInfo extracts parameter types and values from
 * the given ParamListInfo structure, and fills parameter type and value arrays.
 */
static void
ExtractParametersFromParamListInfo(ParamListInfo paramListInfo, Oid **parameterTypes,
                                   const char ***parameterValues)
{
    int parameterIndex = 0;
    int parameterCount = paramListInfo->numParams;

    *parameterTypes = (Oid *) palloc0(parameterCount * sizeof(Oid));
    *parameterValues = (const char **) palloc0(parameterCount * sizeof(char *));

    /* get parameter types and values */
    for (parameterIndex = 0; parameterIndex < parameterCount; parameterIndex++)
    {
        ParamExternData *parameterData = &paramListInfo->params[parameterIndex];
        Oid typeOutputFunctionId = InvalidOid;
        bool variableLengthType = false;

        /*
         * Use 0 for data types where the oid values can be different on
         * the master and worker nodes. Therefore, the worker nodes can
         * infer the correct oid.
         */
        if (parameterData->ptype >= FirstNormalObjectId)
        {
            (*parameterTypes)[parameterIndex] = 0;
        }
        else
        {
            (*parameterTypes)[parameterIndex] = parameterData->ptype;
        }

        /*
         * If the parameter is not referenced / used (ptype == 0) and
         * would otherwise have errored out inside standard_planner(),
         * don't pass a value to the remote side, and pass the text oid to
         * prevent undetermined data type errors on workers.
         */
        if (parameterData->ptype == 0)
        {
            (*parameterValues)[parameterIndex] = NULL;
            (*parameterTypes)[parameterIndex] = TEXTOID;

            continue;
        }

        /*
         * If the parameter is NULL then we preserve its type, but
         * don't need to evaluate its value.
         */
        if (parameterData->isnull)
        {
            (*parameterValues)[parameterIndex] = NULL;

            continue;
        }

        getTypeOutputInfo(parameterData->ptype, &typeOutputFunctionId,
                          &variableLengthType);

        (*parameterValues)[parameterIndex] = OidOutputFunctionCall(typeOutputFunctionId,
                                                                   parameterData->value);
    }
}


/*
 * StoreQueryResult gets the query results from the given connection, builds
 * tuples from the results, and stores them in a newly created tuple store.
 * If the function can't receive query results, it returns false. Note that
 * this function assumes the query has already been sent on the connection.
 */
static bool
StoreQueryResult(MaterialState *routerState, PGconn *connection,
                 TupleDesc tupleDescriptor, bool failOnError, int64 *rows)
{
    AttInMetadata *attributeInputMetadata = TupleDescGetAttInMetadata(tupleDescriptor);
    Tuplestorestate *tupleStore = NULL;
    uint32 expectedColumnCount = tupleDescriptor->natts;
    char **columnArray = (char **) palloc0(expectedColumnCount * sizeof(char *));
    bool commandFailed = false;
    MemoryContext ioContext = AllocSetContextCreate(CurrentMemoryContext,
                                                    "StoreQueryResult",
                                                    ALLOCSET_DEFAULT_MINSIZE,
                                                    ALLOCSET_DEFAULT_INITSIZE,
                                                    ALLOCSET_DEFAULT_MAXSIZE);
    *rows = 0;

    if (routerState->tuplestorestate == NULL)
    {
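        /* forward-only, within-transaction tuplestore; spills to disk beyond work_mem kilobytes */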
        routerState->tuplestorestate = tuplestore_begin_heap(false, false, work_mem);
    }
    else if (!failOnError)
    {
        /* might have failed query execution on another placement before */
        tuplestore_clear(routerState->tuplestorestate);
    }

    tupleStore = routerState->tuplestorestate;

    for (;;)
    {
        uint32 rowIndex = 0;
        uint32 columnIndex = 0;
        uint32 rowCount = 0;
        uint32 columnCount = 0;
        ExecStatusType resultStatus = 0;

        PGresult *result = PQgetResult(connection);
        if (result == NULL)
        {
            break;
        }

        resultStatus = PQresultStatus(result);
        if ((resultStatus != PGRES_SINGLE_TUPLE) && (resultStatus != PGRES_TUPLES_OK))
        {
            char *sqlStateString = PQresultErrorField(result, PG_DIAG_SQLSTATE);
            int category = 0;
            bool isConstraintViolation = false;

            /*
             * If the error code is in constraint violation class, we want to
             * fail fast because we must get the same error from all shard
             * placements.
             */
            category = ERRCODE_TO_CATEGORY(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION);
            isConstraintViolation = SqlStateMatchesCategory(sqlStateString, category);

            if (isConstraintViolation || failOnError)
            {
                RemoveXactConnection(connection);
                ReraiseRemoteError(connection, result);
            }
            else
            {
                WarnRemoteError(connection, result);
            }

            PQclear(result);

            commandFailed = true;

            /* continue, there could be other lingering results due to row mode */
            continue;
        }

        rowCount = PQntuples(result);
        columnCount = PQnfields(result);
        Assert(columnCount == expectedColumnCount);

        for (rowIndex = 0; rowIndex < rowCount; rowIndex++)
        {
            HeapTuple heapTuple = NULL;
            MemoryContext oldContext = NULL;
            memset(columnArray, 0, columnCount * sizeof(char *));

            for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
            {
                if (PQgetisnull(result, rowIndex, columnIndex))
                {
                    columnArray[columnIndex] = NULL;
                }
                else
                {
                    columnArray[columnIndex] = PQgetvalue(result, rowIndex, columnIndex);
                }
            }

            /*
             * Switch to a temporary memory context that we reset after each tuple. This
             * protects us from any memory leaks that might be present in I/O functions
             * called by BuildTupleFromCStrings.
             */
            oldContext = MemoryContextSwitchTo(ioContext);

            heapTuple = BuildTupleFromCStrings(attributeInputMetadata, columnArray);

            MemoryContextSwitchTo(oldContext);

            tuplestore_puttuple(tupleStore, heapTuple);
            MemoryContextReset(ioContext);
            (*rows)++;
        }

        PQclear(result);
    }

    pfree(columnArray);

    return !commandFailed;
}


/*
 * ConsumeQueryResult gets a query result from a connection, counting the rows
 * and checking for errors, but otherwise discarding potentially returned
 * rows. Returns true if a non-error result has been returned, false if there
 * has been an error.
 */
static bool
ConsumeQueryResult(PGconn *connection, bool failOnError, int64 *rows)
{
    bool commandFailed = false;
    bool gotResponse = false;

    *rows = 0;

    /*
     * Due to single-row mode we have to call PQgetResult() multiple times to
     * finish processing of this query, even without RETURNING, and loop until
     * all rows are consumed.
     */
    while (true)
    {
        PGresult *result = PQgetResult(connection);
        ExecStatusType status = PGRES_COMMAND_OK;

        if (result == NULL)
        {
            break;
        }

        status = PQresultStatus(result);

        if (status != PGRES_COMMAND_OK &&
            status != PGRES_SINGLE_TUPLE &&
            status != PGRES_TUPLES_OK)
        {
            char *sqlStateString = PQresultErrorField(result, PG_DIAG_SQLSTATE);
            int category = 0;
            bool isConstraintViolation = false;

            /*
             * If the error code is in constraint violation class, we want to
             * fail fast because we must get the same error from all shard
             * placements.
             */
            category = ERRCODE_TO_CATEGORY(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION);
            isConstraintViolation = SqlStateMatchesCategory(sqlStateString, category);

            if (isConstraintViolation || failOnError)
            {
                RemoveXactConnection(connection);
                ReraiseRemoteError(connection, result);
            }
            else
            {
                WarnRemoteError(connection, result);
            }
            PQclear(result);

            commandFailed = true;

            /* continue, there could be other lingering results due to row mode */
            continue;
        }

        if (status == PGRES_COMMAND_OK)
        {
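            /* PQcmdTuples returns the affected-row count as a string; it is empty for commands without a count */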
            char *currentAffectedTupleString = PQcmdTuples(result);
            int64 currentAffectedTupleCount = 0;

            if (*currentAffectedTupleString != '\0')
            {
                scanint8(currentAffectedTupleString, false, &currentAffectedTupleCount);
                Assert(currentAffectedTupleCount >= 0);
            }

#if (PG_VERSION_NUM < 90600)

            /* before 9.6, PostgreSQL used a uint32 for this field, so check */
            Assert(currentAffectedTupleCount <= 0xFFFFFFFF);
#endif
            *rows += currentAffectedTupleCount;
        }
        else
        {
            *rows += PQntuples(result);
        }

        PQclear(result);
        gotResponse = true;
    }

    return gotResponse && !commandFailed;
}


/*
 * RecordShardIdParticipant registers a connection as being involved with a
 * particular shard during a multi-statement transaction.
 */
static void
RecordShardIdParticipant(uint64 affectedShardId, NodeConnectionEntry *participantEntry)
{
    XactShardConnSet *shardConnSetMatch = NULL;
    ListCell *listCell = NULL;
    MemoryContext oldContext = NULL;
    List *connectionEntryList = NIL;

    /* check whether an entry already exists for this shard */
    foreach(listCell, xactShardConnSetList)
    {
        XactShardConnSet *shardConnSet = (XactShardConnSet *) lfirst(listCell);

        if (shardConnSet->shardId == affectedShardId)
        {
            shardConnSetMatch = shardConnSet;
        }
    }

    /* entries must last through the whole top-level transaction */
    oldContext = MemoryContextSwitchTo(TopTransactionContext);

    /* if no entry found, make one */
    if (shardConnSetMatch == NULL)
    {
        shardConnSetMatch = (XactShardConnSet *) palloc0(sizeof(XactShardConnSet));
        shardConnSetMatch->shardId = affectedShardId;

        xactShardConnSetList = lappend(xactShardConnSetList, shardConnSetMatch);
    }

    /* add connection, avoiding duplicates */
    connectionEntryList = shardConnSetMatch->connectionEntryList;
    shardConnSetMatch->connectionEntryList = list_append_unique_ptr(connectionEntryList,
                                                                    participantEntry);

    MemoryContextSwitchTo(oldContext);
}


/*
 * RouterExecutorFinish cleans up after a distributed execution.
 */
void
RouterExecutorFinish(QueryDesc *queryDesc)
{
    EState *estate = queryDesc->estate;
    Assert(estate != NULL);

    estate->es_finished = true;
}


/*
 * RouterExecutorEnd cleans up the executor state after a distributed
 * execution.
 */
void
RouterExecutorEnd(QueryDesc *queryDesc)
{
    EState *estate = queryDesc->estate;
    MaterialState *routerState = (MaterialState *) queryDesc->planstate;

    if (routerState->tuplestorestate)
    {
        tuplestore_end(routerState->tuplestorestate);
    }

    Assert(estate != NULL);

    FreeExecutorState(estate);
    queryDesc->estate = NULL;
    queryDesc->totaltime = NULL;
}


/*
 * RouterExecutorPreCommitCheck gets called after remote transactions have
 * committed, so it can invalidate failed shards and perform related checks.
 */
void
RouterExecutorPreCommitCheck(void)
{
    /* no transactional router modifications were issued, nothing to do */
    if (xactParticipantHash == NULL)
    {
        return;
    }

    MarkRemainingInactivePlacements();
}


/*
 * RouterExecutorPostCommit is the cleanup callback called after a transaction
 * commits or aborts.
 */
void
RouterExecutorPostCommit(void)
{
    /* reset transaction state */
    xactParticipantHash = NULL;
    xactShardConnSetList = NIL;
}


/*
 * MarkRemainingInactivePlacements takes care of marking placements of a shard
 * inactive after some of the placements rejected the final COMMIT phase of a
 * transaction.
 *
 * Failures are detected by checking the connection & transaction state for
 * each of the entries in the connection set for each shard.
 */
static void
MarkRemainingInactivePlacements(void)
{
    ListCell *shardConnSetCell = NULL;
    int totalSuccesses = 0;

    if (xactParticipantHash == NULL)
    {
        return;
    }

    foreach(shardConnSetCell, xactShardConnSetList)
    {
        XactShardConnSet *shardConnSet = (XactShardConnSet *) lfirst(shardConnSetCell);
        List *participantList = shardConnSet->connectionEntryList;
        ListCell *participantCell = NULL;
        int successes = list_length(participantList); /* assume full success */

        /* determine how many actual successes there were: subtract failures */
        foreach(participantCell, participantList)
        {
            NodeConnectionEntry *participant =
                (NodeConnectionEntry *) lfirst(participantCell);
            MultiConnection *connection = participant->connection;

            /*
             * Fail if the connection has been set to NULL after an error, or
             * if the transaction failed for other reasons (e.g. COMMIT
             * failed).
             */
            if (connection == NULL || connection->remoteTransaction.transactionFailed)
            {
                successes--;
            }
        }

        /* if no nodes succeeded for this shard, don't do anything */
        if (successes == 0)
        {
            continue;
        }

        /* otherwise, ensure failed placements are marked inactive */
        foreach(participantCell, participantList)
        {
            NodeConnectionEntry *participant = NULL;
            participant = (NodeConnectionEntry *) lfirst(participantCell);

            if (participant->connection == NULL ||
                participant->connection->remoteTransaction.transactionFailed)
            {
                uint64 shardId = shardConnSet->shardId;
                NodeConnectionKey *nodeKey = &participant->cacheKey;
                uint64 shardLength = 0;
                uint64 placementId = INVALID_PLACEMENT_ID;

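                /* replace the placement row with one marked inactive, reusing the returned placement ID */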
                placementId = DeleteShardPlacementRow(shardId, nodeKey->nodeName,
                                                      nodeKey->nodePort);
                InsertShardPlacementRow(shardId, placementId, FILE_INACTIVE, shardLength,
                                        nodeKey->nodeName, nodeKey->nodePort);
            }
        }

        totalSuccesses++;
    }

    /* If no shards could be modified at all, error out. */
    if (totalSuccesses == 0)
    {
        ereport(ERROR, (errmsg("could not commit transaction on any active nodes")));
    }
}