mirror of https://github.com/citusdata/citus.git

commit 65f6d7c02a
parent a497e7178c

    Follow consistent execution order in parallel commands
@@ -96,6 +96,8 @@ static void ExecuteMultipleTasks(QueryDesc *queryDesc, List *taskList,
 									   bool isModificationQuery, bool expectResults);
 static List * TaskShardIntervalList(List *taskList);
 static void AcquireExecutorShardLock(Task *task, CmdType commandType);
+static void AcquireExecutorMultiShardLocks(List *shardIntervalList);
+static bool IsReplicated(List *shardIntervalList);
 static uint64 ReturnRowsFromTuplestore(uint64 tupleCount, TupleDesc tupleDescriptor,
 									   DestReceiver *destination,
 									   Tuplestorestate *tupleStore);
@@ -252,14 +254,14 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
 		 * Bypass commutativity checks when citus.all_modifications_commutative
 		 * is enabled.
 		 *
-		 * A ShareLock does not conflict with itself and therefore allows
+		 * A RowExclusiveLock does not conflict with itself and therefore allows
 		 * multiple commutative commands to proceed concurrently. It does
 		 * conflict with ExclusiveLock, which may still be obtained by another
 		 * session that executes an UPDATE/DELETE/UPSERT command with
 		 * citus.all_modifications_commutative disabled.
 		 */

-		lockMode = ShareLock;
+		lockMode = RowExclusiveLock;
 	}
 	else if (task->upsertQuery || commandType == CMD_UPDATE || commandType == CMD_DELETE)
 	{
@@ -291,13 +293,13 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
 		 * UPDATE/DELETE/UPSERT may consider the INSERT, depending on execution
 		 * order.
 		 *
-		 * A ShareLock does not conflict with itself and therefore allows
+		 * A RowExclusiveLock does not conflict with itself and therefore allows
 		 * multiple INSERT commands to proceed concurrently. It conflicts with
 		 * ExclusiveLock obtained by UPDATE/DELETE/UPSERT, ensuring those do
 		 * not run concurrently with INSERT.
 		 */

-		lockMode = ShareLock;
+		lockMode = RowExclusiveLock;
 	}
 	else
 	{
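For context, not part of the commit: the two hunks above only swap lock-mode constants, so the surrounding decision is easy to lose track of. The sketch below is a hypothetical condensation of the single-shard lock choice as these hunks leave it, pieced together from the visible context lines; the helper name SingleShardWriteLockMode, the explicit CMD_INSERT test, and the NoLock fallthrough are illustrative assumptions, not code from this file. The conflict-matrix sketch after the next hunk shows why RowExclusiveLock is used here instead of ShareLock.

#include "postgres.h"
#include "nodes/nodes.h"   /* CmdType, CMD_UPDATE, CMD_DELETE, CMD_INSERT */
#include "storage/lock.h"  /* LOCKMODE, RowExclusiveLock, ExclusiveLock, NoLock */

/* Hypothetical condensation (not the actual function) of the lock-mode choice. */
static LOCKMODE
SingleShardWriteLockMode(bool allModificationsCommutative, bool upsertQuery,
						 CmdType commandType)
{
	if (allModificationsCommutative)
	{
		/* every modification is treated as commutative; RowExclusiveLock is
		 * self-compatible, so these writes run concurrently with each other */
		return RowExclusiveLock;
	}
	else if (upsertQuery || commandType == CMD_UPDATE || commandType == CMD_DELETE)
	{
		/* non-commutative writes serialize, so all placements apply them in
		 * the same order */
		return ExclusiveLock;
	}
	else if (commandType == CMD_INSERT)
	{
		/* INSERTs commute with each other, but conflict with the
		 * ExclusiveLock held by UPDATE/DELETE/UPSERT */
		return RowExclusiveLock;
	}

	/* assumption: other command types acquire no shard lock here */
	return NoLock;
}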
@@ -311,6 +313,81 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
 }
 
 
+/*
+ * AcquireExecutorMultiShardLocks acquires the shard locks needed for the
+ * execution of writes on multiple shards.
+ *
+ * 1. If citus.all_modifications_commutative is set to true, then all locks
+ *    are acquired as ShareUpdateExclusiveLock.
+ * 2. If citus.all_modifications_commutative is false, then the shards are
+ *    locked with ExclusiveLock if any of them has 2 or more replicas.
+ *    Otherwise, the lock is acquired as ShareUpdateExclusiveLock.
+ *
+ * ShareUpdateExclusiveLock conflicts with itself such that only one
+ * multi-shard modification at a time is allowed on a shard. It also conflicts
+ * with ExclusiveLock, which ensures that updates/deletes/upserts are applied
+ * in the same order on all placements. It does not conflict with
+ * RowExclusiveLock, which is normally obtained by single-shard commutative
+ * writes.
+ */
+static void
+AcquireExecutorMultiShardLocks(List *shardIntervalList)
+{
+	LOCKMODE lockMode = NoLock;
+
+	if (AllModificationsCommutative || !IsReplicated(shardIntervalList))
+	{
+		/*
+		 * When all writes are commutative, we only need to prevent multi-shard
+		 * commands from running concurrently with each other and with commands
+		 * that are explicitly non-commutative. When there is no replication,
+		 * we only need to prevent concurrent multi-shard commands.
+		 *
+		 * In either case, ShareUpdateExclusiveLock has the desired effect,
+		 * since it conflicts with itself and with ExclusiveLock (taken by
+		 * non-commutative writes).
+		 */
+
+		lockMode = ShareUpdateExclusiveLock;
+	}
+	else
+	{
+		/*
+		 * When there is replication, prevent all concurrent writes to the same
+		 * shards to ensure the writes are ordered.
+		 */
+		lockMode = ExclusiveLock;
+	}
+
+	LockShardListResources(shardIntervalList, lockMode);
+}
+
+
+/*
+ * IsReplicated checks whether any of the shards in the given list has more
+ * than one replica.
+ */
+static bool
+IsReplicated(List *shardIntervalList)
+{
+	ListCell *shardIntervalCell;
+	bool hasReplication = false;
+
+	foreach(shardIntervalCell, shardIntervalList)
+	{
+		ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
+		uint64 shardId = shardInterval->shardId;
+		List *shardPlacementList = FinalizedShardPlacementList(shardId);
+
+		if (shardPlacementList->length > 1)
+		{
+			hasReplication = true;
+			break;
+		}
+	}
+
+	return hasReplication;
+}
+
+
 /*
  * CreateXactParticipantHash initializes the map used to store the connections
  * needed to process distributed transactions. Unlike the connection cache, we
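For context, not part of the commit: the comment above relies on a few rows of PostgreSQL's standard lock conflict matrix. The standalone sketch below encodes just those rows (mode numbers and conflicts as documented for PostgreSQL table-level locks; the enum names are made up to keep it self-contained) and asserts the properties the comment claims. The final assertion also shows why the earlier hunks moved single-shard commutative writes off ShareLock: ShareUpdateExclusiveLock conflicts with ShareLock, so keeping the old mode would have let multi-shard writes block commutative single-shard writes.

#include <assert.h>
#include <stdbool.h>

/* Lock mode numbers as in PostgreSQL's standard lock method (lockdefs.h). */
enum PgLockMode
{
	PG_ROW_EXCLUSIVE = 3,			/* RowExclusiveLock */
	PG_SHARE_UPDATE_EXCLUSIVE = 4,	/* ShareUpdateExclusiveLock */
	PG_SHARE = 5,					/* ShareLock */
	PG_SHARE_ROW_EXCLUSIVE = 6,		/* ShareRowExclusiveLock */
	PG_EXCLUSIVE = 7,				/* ExclusiveLock */
	PG_ACCESS_EXCLUSIVE = 8			/* AccessExclusiveLock */
};

/* conflictMask[m] = bitmask of modes that conflict with mode m (relevant rows only) */
static const unsigned int conflictMask[9] = {
	[PG_ROW_EXCLUSIVE] = (1u << PG_SHARE) | (1u << PG_SHARE_ROW_EXCLUSIVE) |
						 (1u << PG_EXCLUSIVE) | (1u << PG_ACCESS_EXCLUSIVE),
	[PG_SHARE_UPDATE_EXCLUSIVE] = (1u << PG_SHARE_UPDATE_EXCLUSIVE) | (1u << PG_SHARE) |
								  (1u << PG_SHARE_ROW_EXCLUSIVE) | (1u << PG_EXCLUSIVE) |
								  (1u << PG_ACCESS_EXCLUSIVE),
	[PG_SHARE] = (1u << PG_ROW_EXCLUSIVE) | (1u << PG_SHARE_UPDATE_EXCLUSIVE) |
				 (1u << PG_SHARE_ROW_EXCLUSIVE) | (1u << PG_EXCLUSIVE) |
				 (1u << PG_ACCESS_EXCLUSIVE)
};

static bool
modes_conflict(enum PgLockMode a, enum PgLockMode b)
{
	return (conflictMask[a] & (1u << b)) != 0;
}

int
main(void)
{
	/* commutative single-shard writes (RowExclusiveLock) run concurrently... */
	assert(!modes_conflict(PG_ROW_EXCLUSIVE, PG_ROW_EXCLUSIVE));

	/* ...but are ordered against non-commutative writes (ExclusiveLock) */
	assert(modes_conflict(PG_ROW_EXCLUSIVE, PG_EXCLUSIVE));

	/* only one multi-shard modification at a time per shard */
	assert(modes_conflict(PG_SHARE_UPDATE_EXCLUSIVE, PG_SHARE_UPDATE_EXCLUSIVE));

	/* multi-shard writes are ordered against non-commutative single-shard writes */
	assert(modes_conflict(PG_SHARE_UPDATE_EXCLUSIVE, PG_EXCLUSIVE));

	/* but they do NOT block commutative single-shard writes */
	assert(!modes_conflict(PG_SHARE_UPDATE_EXCLUSIVE, PG_ROW_EXCLUSIVE));

	/* ShareLock, the previous single-shard mode, would have conflicted */
	assert(modes_conflict(PG_SHARE, PG_SHARE_UPDATE_EXCLUSIVE));

	return 0;
}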
@@ -422,8 +499,10 @@ RouterExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
 		deparse_shard_query(query, relationId, task->anchorShardId,
 							newQueryString);
 
-		elog(DEBUG4, "query before master evaluation: %s", task->queryString);
-		elog(DEBUG4, "query after master evaluation: %s", newQueryString->data);
+		ereport(DEBUG4, (errmsg("query before master evaluation: %s",
+								task->queryString)));
+		ereport(DEBUG4, (errmsg("query after master evaluation: %s",
+								newQueryString->data)));
 
 		task->queryString = newQueryString->data;
 	}
@@ -698,6 +777,9 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
 	ListCell *taskCell = NULL;
 	char *userName = CurrentUserName();
 	List *shardIntervalList = NIL;
+	List *affectedTupleCountList = NIL;
+	bool tasksPending = true;
+	int placementIndex = 0;
 
 	if (XactModificationLevel == XACT_MODIFICATION_DATA)
 	{
@@ -712,12 +794,19 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
 	shardIntervalList = TaskShardIntervalList(taskList);
 
 	/* ensure that there are no concurrent modifications on the same shards */
-	LockShardListResources(shardIntervalList, ExclusiveLock);
+	AcquireExecutorMultiShardLocks(shardIntervalList);
 
 	/* open connection to all relevant placements, if not already open */
 	OpenTransactionsToAllShardPlacements(shardIntervalList, userName);
 
-	/* send command to all relevant shard placements */
+	/* iterate over placements in rounds, to ensure in-order execution */
+	while (tasksPending)
+	{
+		int taskIndex = 0;
+
+		tasksPending = false;
+
+		/* send command to all shard placements with the current index in parallel */
 		foreach(taskCell, taskList)
 		{
 			Task *task = (Task *) lfirst(taskCell);
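For context, not part of the commit: this hunk introduces the round-based structure that gives the commit its name. The standalone sketch below (task and placement counts are invented; the real loop sends each query asynchronously with SendQueryInSingleRowMode and collects results in a separate pass) illustrates the order the while (tasksPending) / placementIndex pattern produces: round 0 touches the first placement of every task, round 1 the second placement of every task, and so on, so concurrent multi-shard commands visit placements in the same order.

#include <stdbool.h>
#include <stdio.h>

#define TASK_COUNT 3

/* placementCount[taskIndex] = number of healthy placements for that task (invented) */
static const int placementCount[TASK_COUNT] = { 2, 1, 2 };

int
main(void)
{
	bool tasksPending = true;
	int placementIndex = 0;

	while (tasksPending)
	{
		tasksPending = false;

		/* one round: the current placement of every task, conceptually in parallel */
		for (int taskIndex = 0; taskIndex < TASK_COUNT; taskIndex++)
		{
			if (placementIndex >= placementCount[taskIndex])
			{
				/* no more active placements for this task */
				continue;
			}

			printf("round %d: send task %d to placement %d\n",
				   placementIndex, taskIndex, placementIndex);

			if (placementIndex + 1 < placementCount[taskIndex])
			{
				/* this task still has later placements, so run another round */
				tasksPending = true;
			}
		}

		placementIndex++;
	}

	return 0;
}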
@@ -726,20 +815,22 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
 			bool shardConnectionsFound = false;
 			ShardConnections *shardConnections = NULL;
 			List *connectionList = NIL;
-			ListCell *connectionCell = NULL;
+			TransactionConnection *transactionConnection = NULL;
+			PGconn *connection = NULL;
+			bool queryOK = false;
 
 			shardConnections = GetShardConnections(shardId, &shardConnectionsFound);
-			Assert(shardConnectionsFound);
-
 			connectionList = shardConnections->connectionList;
-			Assert(connectionList != NIL);
 
-			foreach(connectionCell, connectionList)
+			if (placementIndex >= list_length(connectionList))
 			{
-				TransactionConnection *transactionConnection =
-					(TransactionConnection *) lfirst(connectionCell);
-				PGconn *connection = transactionConnection->connection;
-				bool queryOK = false;
+				/* no more active placements for this task */
+				continue;
+			}
+
+			transactionConnection =
+				(TransactionConnection *) list_nth(connectionList, placementIndex);
+			connection = transactionConnection->connection;
 
 			queryOK = SendQueryInSingleRowMode(connection, queryString, paramListInfo);
 			if (!queryOK)
|
||||||
ReraiseRemoteError(connection, NULL);
|
ReraiseRemoteError(connection, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* collects results from all relevant shard placements */
|
/* collects results from all relevant shard placements */
|
||||||
foreach(taskCell, taskList)
|
foreach(taskCell, taskList)
|
||||||
|
@ -757,9 +847,11 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
|
||||||
bool shardConnectionsFound = false;
|
bool shardConnectionsFound = false;
|
||||||
ShardConnections *shardConnections = NULL;
|
ShardConnections *shardConnections = NULL;
|
||||||
List *connectionList = NIL;
|
List *connectionList = NIL;
|
||||||
ListCell *connectionCell = NULL;
|
TransactionConnection *transactionConnection = NULL;
|
||||||
int64 affectedTupleCount = 0;
|
PGconn *connection = NULL;
|
||||||
bool gotResults = false;
|
int64 currentAffectedTupleCount = 0;
|
||||||
|
bool failOnError = true;
|
||||||
|
bool queryOK PG_USED_FOR_ASSERTS_ONLY = false;
|
||||||
|
|
||||||
/* abort in case of cancellation */
|
/* abort in case of cancellation */
|
||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
@@ -767,21 +859,22 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
 			shardConnections = GetShardConnections(shardId, &shardConnectionsFound);
 			connectionList = shardConnections->connectionList;
 
-			foreach(connectionCell, connectionList)
+			if (placementIndex >= list_length(connectionList))
 			{
-				TransactionConnection *transactionConnection =
-					(TransactionConnection *) lfirst(connectionCell);
-				PGconn *connection = transactionConnection->connection;
-				int64 currentAffectedTupleCount = 0;
-				bool failOnError = true;
-				bool queryOK PG_USED_FOR_ASSERTS_ONLY = false;
+				/* no more active placements for this task */
+				continue;
+			}
+
+			transactionConnection =
+				(TransactionConnection *) list_nth(connectionList, placementIndex);
+			connection = transactionConnection->connection;
 
 			/*
 			 * If caller is interested, store query results the first time
 			 * through. The output of the query's execution on other shards is
 			 * discarded if we run there (because it's a modification query).
 			 */
-			if (!gotResults && expectResults)
+			if (placementIndex == 0 && expectResults)
 			{
 				Assert(routerState != NULL && tupleDescriptor != NULL);
|
@ -797,26 +890,45 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
|
||||||
/* should have rolled back on error */
|
/* should have rolled back on error */
|
||||||
Assert(queryOK);
|
Assert(queryOK);
|
||||||
|
|
||||||
if (!gotResults)
|
if (placementIndex == 0)
|
||||||
{
|
{
|
||||||
affectedTupleCount = currentAffectedTupleCount;
|
totalAffectedTupleCount += currentAffectedTupleCount;
|
||||||
totalAffectedTupleCount += affectedTupleCount;
|
|
||||||
|
/* keep track of the initial affected tuple count */
|
||||||
|
affectedTupleCountList = lappend_int(affectedTupleCountList,
|
||||||
|
currentAffectedTupleCount);
|
||||||
}
|
}
|
||||||
else if (currentAffectedTupleCount != affectedTupleCount)
|
else
|
||||||
|
{
|
||||||
|
/* warn the user if shard placements have diverged */
|
||||||
|
int64 previousAffectedTupleCount = list_nth_int(affectedTupleCountList,
|
||||||
|
taskIndex);
|
||||||
|
|
||||||
|
if (currentAffectedTupleCount != previousAffectedTupleCount)
|
||||||
{
|
{
|
||||||
char *nodeName = ConnectionGetOptionValue(connection, "host");
|
char *nodeName = ConnectionGetOptionValue(connection, "host");
|
||||||
char *nodePort = ConnectionGetOptionValue(connection, "port");
|
char *nodePort = ConnectionGetOptionValue(connection, "port");
|
||||||
|
|
||||||
ereport(WARNING,
|
ereport(WARNING,
|
||||||
(errmsg("modified "INT64_FORMAT " tuples, but expected "
|
(errmsg("modified "INT64_FORMAT " tuples of shard "
|
||||||
"to modify "INT64_FORMAT,
|
UINT64_FORMAT ", but expected to modify "INT64_FORMAT,
|
||||||
currentAffectedTupleCount, affectedTupleCount),
|
currentAffectedTupleCount, shardId,
|
||||||
|
previousAffectedTupleCount),
|
||||||
errdetail("modified placement on %s:%s", nodeName,
|
errdetail("modified placement on %s:%s", nodeName,
|
||||||
nodePort)));
|
nodePort)));
|
||||||
}
|
}
|
||||||
|
|
||||||
gotResults = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!tasksPending && placementIndex + 1 < list_length(connectionList))
|
||||||
|
{
|
||||||
|
/* more tasks to be done after thise one */
|
||||||
|
tasksPending = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
taskIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
placementIndex++;
|
||||||
}
|
}
|
||||||
|
|
||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
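For context, not part of the commit: a standalone sketch of the bookkeeping this last hunk adds, using invented counts and plain arrays instead of PostgreSQL's integer List. Placement 0 of each task seeds both the running total and the remembered per-task count; every later placement is only compared against that remembered count and triggers a divergence warning when it differs, so replicas never inflate the reported row count.

#include <stdint.h>
#include <stdio.h>

#define TASK_COUNT 2
#define PLACEMENT_COUNT 2

int
main(void)
{
	/* affectedTuples[task][placement]: rows reported by each placement (invented) */
	static const int64_t affectedTuples[TASK_COUNT][PLACEMENT_COUNT] = {
		{ 3, 3 },	/* task 0: placements agree */
		{ 5, 4 }	/* task 1: second placement diverged */
	};
	int64_t firstPlacementCount[TASK_COUNT] = { 0 };	/* like affectedTupleCountList */
	int64_t totalAffectedTupleCount = 0;

	for (int placementIndex = 0; placementIndex < PLACEMENT_COUNT; placementIndex++)
	{
		for (int taskIndex = 0; taskIndex < TASK_COUNT; taskIndex++)
		{
			int64_t current = affectedTuples[taskIndex][placementIndex];

			if (placementIndex == 0)
			{
				/* first placement: count it and remember it for later comparison */
				totalAffectedTupleCount += current;
				firstPlacementCount[taskIndex] = current;
			}
			else if (current != firstPlacementCount[taskIndex])
			{
				/* later placement disagrees with the first one: warn only */
				printf("WARNING: task %d modified %lld tuples, expected %lld\n",
					   taskIndex, (long long) current,
					   (long long) firstPlacementCount[taskIndex]);
			}
		}
	}

	/* prints 8 (3 + 5), not 15: replica counts are not added to the total */
	printf("total affected tuples: %lld\n", (long long) totalAffectedTupleCount);
	return 0;
}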