Follow consistent execution order in parallel commands

pull/855/head
Marco Slot 2016-10-18 10:21:52 +02:00
parent a497e7178c
commit 65f6d7c02a
1 changed file with 176 additions and 64 deletions


@@ -96,6 +96,8 @@ static void ExecuteMultipleTasks(QueryDesc *queryDesc, List *taskList,
                                  bool isModificationQuery, bool expectResults);
 static List * TaskShardIntervalList(List *taskList);
 static void AcquireExecutorShardLock(Task *task, CmdType commandType);
+static void AcquireExecutorMultiShardLocks(List *shardIntervalList);
+static bool IsReplicated(List *shardIntervalList);
 static uint64 ReturnRowsFromTuplestore(uint64 tupleCount, TupleDesc tupleDescriptor,
                                        DestReceiver *destination,
                                        Tuplestorestate *tupleStore);
@@ -252,14 +254,14 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
         * Bypass commutativity checks when citus.all_modifications_commutative
         * is enabled.
         *
-        * A ShareLock does not conflict with itself and therefore allows
+        * A RowExclusiveLock does not conflict with itself and therefore allows
         * multiple commutative commands to proceed concurrently. It does
         * conflict with ExclusiveLock, which may still be obtained by another
         * session that executes an UPDATE/DELETE/UPSERT command with
         * citus.all_modifications_commutative disabled.
         */
-        lockMode = ShareLock;
+        lockMode = RowExclusiveLock;
    }
    else if (task->upsertQuery || commandType == CMD_UPDATE || commandType == CMD_DELETE)
    {
@@ -291,13 +293,13 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
         * UPDATE/DELETE/UPSERT may consider the INSERT, depending on execution
         * order.
         *
-        * A ShareLock does not conflict with itself and therefore allows
+        * A RowExclusiveLock does not conflict with itself and therefore allows
         * multiple INSERT commands to proceed concurrently. It conflicts with
         * ExclusiveLock obtained by UPDATE/DELETE/UPSERT, ensuring those do
         * not run concurrently with INSERT.
         */
-        lockMode = ShareLock;
+        lockMode = RowExclusiveLock;
    }
    else
    {
@@ -311,6 +313,81 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
 }
 
 
+/*
+ * AcquireExecutorMultiShardLocks acquires the shard locks needed for the
+ * execution of writes on multiple shards.
+ *
+ * 1. If citus.all_modifications_commutative is set to true, then all locks
+ *    are acquired as ShareUpdateExclusiveLock.
+ * 2. If citus.all_modifications_commutative is false, then only the shards
+ *    with 2 or more replicas are locked with ExclusiveLock. Otherwise, the
+ *    lock is acquired with ShareUpdateExclusiveLock.
+ *
+ * ShareUpdateExclusiveLock conflicts with itself such that only one
+ * multi-shard modification at a time is allowed on a shard. It also conflicts
+ * with ExclusiveLock, which ensures that updates/deletes/upserts are applied
+ * in the same order on all placements. It does not conflict with
+ * RowExclusiveLock, which is normally obtained by single-shard commutative
+ * writes.
+ */
+static void
+AcquireExecutorMultiShardLocks(List *shardIntervalList)
+{
+    LOCKMODE lockMode = NoLock;
+
+    if (AllModificationsCommutative || !IsReplicated(shardIntervalList))
+    {
+        /*
+         * When all writes are commutative then we only need to prevent multi-shard
+         * commands from running concurrently with each other and with commands
+         * that are explicitly non-commutative. When there is no replication then
+         * we only need to prevent concurrent multi-shard commands.
+         *
+         * In either case, ShareUpdateExclusive has the desired effect, since
+         * it conflicts with itself and ExclusiveLock (taken by non-commutative
+         * writes).
+         */
+
+        lockMode = ShareUpdateExclusiveLock;
+    }
+    else
+    {
+        /*
+         * When there is replication, prevent all concurrent writes to the same
+         * shards to ensure the writes are ordered.
+         */
+
+        lockMode = ExclusiveLock;
+    }
+
+    LockShardListResources(shardIntervalList, lockMode);
+}
+
+
+/*
+ * IsReplicated checks whether any of the shards in the given list has more
+ * than one replica.
+ */
+static bool
+IsReplicated(List *shardIntervalList)
+{
+    ListCell *shardIntervalCell;
+    bool hasReplication = false;
+
+    foreach(shardIntervalCell, shardIntervalList)
+    {
+        ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
+        uint64 shardId = shardInterval->shardId;
+        List *shardPlacementList = FinalizedShardPlacementList(shardId);
+
+        if (shardPlacementList->length > 1)
+        {
+            hasReplication = true;
+            break;
+        }
+    }
+
+    return hasReplication;
+}
+
+
 /*
  * CreateXactParticipantHash initializes the map used to store the connections
  * needed to process distributed transactions. Unlike the connection cache, we
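The decision encoded by AcquireExecutorMultiShardLocks reduces to a small truth table over two inputs. The sketch below is illustrative only: plain boolean parameters stand in for the citus.all_modifications_commutative setting and for the metadata lookup done by IsReplicated, and the helper name ChooseMultiShardLockMode is made up for this example.

/* multi_shard_lock_choice.c -- illustrative sketch of the lock-mode
 * selection for multi-shard writes introduced by this commit. */
#include <stdbool.h>
#include <stdio.h>

static const char *
ChooseMultiShardLockMode(bool allModificationsCommutative, bool isReplicated)
{
    if (allModificationsCommutative || !isReplicated)
    {
        /* serialize multi-shard writes against each other and against
         * non-commutative (ExclusiveLock) writes, but allow them to run
         * alongside commutative RowExclusiveLock writes */
        return "ShareUpdateExclusiveLock";
    }

    /* replicated shards: order all writes to keep placements consistent */
    return "ExclusiveLock";
}

int
main(void)
{
    bool commutativeValues[] = { false, false, true, true };
    bool replicatedValues[] = { false, true, false, true };

    for (int i = 0; i < 4; i++)
    {
        printf("commutative=%d replicated=%d -> %s\n",
               commutativeValues[i], replicatedValues[i],
               ChooseMultiShardLockMode(commutativeValues[i],
                                        replicatedValues[i]));
    }

    return 0;
}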
@@ -422,8 +499,10 @@ RouterExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
            deparse_shard_query(query, relationId, task->anchorShardId,
                                newQueryString);
 
-           elog(DEBUG4, "query before master evaluation: %s", task->queryString);
-           elog(DEBUG4, "query after master evaluation: %s", newQueryString->data);
+           ereport(DEBUG4, (errmsg("query before master evaluation: %s",
+                                   task->queryString)));
+           ereport(DEBUG4, (errmsg("query after master evaluation: %s",
+                                   newQueryString->data)));
 
            task->queryString = newQueryString->data;
        }
@@ -698,6 +777,9 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
    ListCell *taskCell = NULL;
    char *userName = CurrentUserName();
    List *shardIntervalList = NIL;
+   List *affectedTupleCountList = NIL;
+   bool tasksPending = true;
+   int placementIndex = 0;
 
    if (XactModificationLevel == XACT_MODIFICATION_DATA)
    {
@@ -712,12 +794,19 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
    shardIntervalList = TaskShardIntervalList(taskList);
 
    /* ensure that there are no concurrent modifications on the same shards */
-   LockShardListResources(shardIntervalList, ExclusiveLock);
+   AcquireExecutorMultiShardLocks(shardIntervalList);
 
    /* open connection to all relevant placements, if not already open */
    OpenTransactionsToAllShardPlacements(shardIntervalList, userName);
 
-   /* send command to all relevant shard placements */
+   /* iterate over placements in rounds, to ensure in-order execution */
+   while (tasksPending)
+   {
+       int taskIndex = 0;
+
+       tasksPending = false;
+
+       /* send command to all shard placements with the current index in parallel */
        foreach(taskCell, taskList)
        {
            Task *task = (Task *) lfirst(taskCell);
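This loop is what gives the commit its title: instead of each task writing to all of its placements before the next task starts, every round sends the command for one placement index of every task, so placement 0 of all shards is modified before placement 1 of any shard. The following standalone sketch (not part of the commit; plain arrays stand in for the Citus task and connection lists) reproduces that iteration order.

/* round_order_sketch.c -- illustrative only; models the per-round
 * placement iteration with plain arrays. */
#include <stdbool.h>
#include <stdio.h>

#define TASK_COUNT 3

int
main(void)
{
    /* number of (healthy) placements per task; may differ between tasks */
    int placementCounts[TASK_COUNT] = { 2, 1, 2 };

    bool tasksPending = true;
    int placementIndex = 0;

    while (tasksPending)
    {
        tasksPending = false;

        /* one "round": touch at most one placement of every task */
        for (int taskIndex = 0; taskIndex < TASK_COUNT; taskIndex++)
        {
            if (placementIndex >= placementCounts[taskIndex])
            {
                /* no more active placements for this task */
                continue;
            }

            printf("round %d: task %d -> placement %d\n",
                   placementIndex, taskIndex, placementIndex);

            if (placementIndex + 1 < placementCounts[taskIndex])
            {
                /* at least one more round is needed */
                tasksPending = true;
            }
        }

        placementIndex++;
    }

    return 0;
}

Running the sketch prints every placement-0 write before any placement-1 write, which is exactly the ordering the real executor needs so that concurrent multi-shard commands apply their changes to replicas in a consistent order.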
@@ -726,20 +815,22 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
            bool shardConnectionsFound = false;
            ShardConnections *shardConnections = NULL;
            List *connectionList = NIL;
-           ListCell *connectionCell = NULL;
+           TransactionConnection *transactionConnection = NULL;
+           PGconn *connection = NULL;
+           bool queryOK = false;
 
            shardConnections = GetShardConnections(shardId, &shardConnectionsFound);
-           Assert(shardConnectionsFound);
-
            connectionList = shardConnections->connectionList;
-           Assert(connectionList != NIL);
 
-           foreach(connectionCell, connectionList)
+           if (placementIndex >= list_length(connectionList))
            {
-               TransactionConnection *transactionConnection =
-                   (TransactionConnection *) lfirst(connectionCell);
-               PGconn *connection = transactionConnection->connection;
-               bool queryOK = false;
+               /* no more active placements for this task */
+               continue;
+           }
+
+           transactionConnection =
+               (TransactionConnection *) list_nth(connectionList, placementIndex);
+           connection = transactionConnection->connection;
 
            queryOK = SendQueryInSingleRowMode(connection, queryString, paramListInfo);
            if (!queryOK)
@@ -747,7 +838,6 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
                ReraiseRemoteError(connection, NULL);
            }
        }
-   }
 
        /* collects results from all relevant shard placements */
        foreach(taskCell, taskList)
@@ -757,9 +847,11 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
            bool shardConnectionsFound = false;
            ShardConnections *shardConnections = NULL;
            List *connectionList = NIL;
-           ListCell *connectionCell = NULL;
-           int64 affectedTupleCount = 0;
-           bool gotResults = false;
+           TransactionConnection *transactionConnection = NULL;
+           PGconn *connection = NULL;
+           int64 currentAffectedTupleCount = 0;
+           bool failOnError = true;
+           bool queryOK PG_USED_FOR_ASSERTS_ONLY = false;
 
            /* abort in case of cancellation */
            CHECK_FOR_INTERRUPTS();
@@ -767,21 +859,22 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
            shardConnections = GetShardConnections(shardId, &shardConnectionsFound);
            connectionList = shardConnections->connectionList;
 
-           foreach(connectionCell, connectionList)
+           if (placementIndex >= list_length(connectionList))
            {
-               TransactionConnection *transactionConnection =
-                   (TransactionConnection *) lfirst(connectionCell);
-               PGconn *connection = transactionConnection->connection;
-               int64 currentAffectedTupleCount = 0;
-               bool failOnError = true;
-               bool queryOK PG_USED_FOR_ASSERTS_ONLY = false;
+               /* no more active placements for this task */
+               continue;
+           }
+
+           transactionConnection =
+               (TransactionConnection *) list_nth(connectionList, placementIndex);
+           connection = transactionConnection->connection;
 
            /*
             * If caller is interested, store query results the first time
             * through. The output of the query's execution on other shards is
             * discarded if we run there (because it's a modification query).
             */
-           if (!gotResults && expectResults)
+           if (placementIndex == 0 && expectResults)
            {
                Assert(routerState != NULL && tupleDescriptor != NULL);
@@ -797,26 +890,45 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
            /* should have rolled back on error */
            Assert(queryOK);
 
-           if (!gotResults)
+           if (placementIndex == 0)
            {
-               affectedTupleCount = currentAffectedTupleCount;
-               totalAffectedTupleCount += affectedTupleCount;
+               totalAffectedTupleCount += currentAffectedTupleCount;
+
+               /* keep track of the initial affected tuple count */
+               affectedTupleCountList = lappend_int(affectedTupleCountList,
+                                                    currentAffectedTupleCount);
            }
-           else if (currentAffectedTupleCount != affectedTupleCount)
+           else
+           {
+               /* warn the user if shard placements have diverged */
+               int64 previousAffectedTupleCount = list_nth_int(affectedTupleCountList,
+                                                               taskIndex);
+
+               if (currentAffectedTupleCount != previousAffectedTupleCount)
                {
                    char *nodeName = ConnectionGetOptionValue(connection, "host");
                    char *nodePort = ConnectionGetOptionValue(connection, "port");
 
                    ereport(WARNING,
-                           (errmsg("modified "INT64_FORMAT " tuples, but expected "
-                                   "to modify "INT64_FORMAT,
-                                   currentAffectedTupleCount, affectedTupleCount),
+                           (errmsg("modified "INT64_FORMAT " tuples of shard "
+                                   UINT64_FORMAT ", but expected to modify "INT64_FORMAT,
+                                   currentAffectedTupleCount, shardId,
+                                   previousAffectedTupleCount),
                            errdetail("modified placement on %s:%s", nodeName,
                                      nodePort)));
                }
-
-           gotResults = true;
            }
+
+           if (!tasksPending && placementIndex + 1 < list_length(connectionList))
+           {
+               /* more tasks to be done after this one */
+               tasksPending = true;
+           }
+
+           taskIndex++;
        }
+
+       placementIndex++;
+   }
 
    CHECK_FOR_INTERRUPTS();
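The warning path above relies on affectedTupleCountList recording, per task, the tuple count observed on the first placement (round 0); later rounds compare their own counts against that baseline and warn when placements diverge. The standalone sketch below is illustrative only and uses plain arrays in place of the PostgreSQL integer list and connection machinery.

/* divergence_check_sketch.c -- illustrative only; first-round affected
 * tuple counts serve as the baseline for later placements. */
#include <stdint.h>
#include <stdio.h>

#define TASK_COUNT 2
#define PLACEMENT_COUNT 2

int
main(void)
{
    /* pretend results: affectedCounts[task][placement] */
    int64_t affectedCounts[TASK_COUNT][PLACEMENT_COUNT] = {
        { 10, 10 },   /* task 0: placements agree */
        { 7, 6 }      /* task 1: placements diverged */
    };

    int64_t baselineCounts[TASK_COUNT] = { 0 };

    for (int placementIndex = 0; placementIndex < PLACEMENT_COUNT; placementIndex++)
    {
        for (int taskIndex = 0; taskIndex < TASK_COUNT; taskIndex++)
        {
            int64_t current = affectedCounts[taskIndex][placementIndex];

            if (placementIndex == 0)
            {
                /* remember the count from the first placement */
                baselineCounts[taskIndex] = current;
            }
            else if (current != baselineCounts[taskIndex])
            {
                printf("warning: task %d placement %d modified %lld tuples, "
                       "expected %lld\n", taskIndex, placementIndex,
                       (long long) current, (long long) baselineCounts[taskIndex]);
            }
        }
    }

    return 0;
}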