mirror of https://github.com/citusdata/citus.git
Add read locks for making INSERT/SELECT consistent across placements
When an INSERT/SELECT command runs on a distributed table with replication factor > 1, we need to ensure that it sees the same result on each placement of a shard. After this change, the router executor takes exclusive locks on the shards from which the SELECT in an INSERT/SELECT reads, in order to prevent concurrent changes. This is not an optimal solution, but it is simple and correct. The citus.all_modifications_commutative setting can be used to avoid the aggressive locking: an INSERT/SELECT whose filters are known to exclude any ongoing writes can be marked as commutative.
pull/896/head
parent 2a6875d817
commit 44d1f7b950
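To make the scenario in the commit message concrete, here is an illustrative sketch of the anomaly the new read locks prevent. It is only a sketch: the table and column names are hypothetical and not part of this commit, and the lock modes referred to are the ones shown in the diff below.

-- Assume both tables are distributed with replication factor 2, so every
-- shard has two placements that must stay identical.

-- Session A: the SELECT part of the INSERT/SELECT is evaluated once per
-- placement of the target shard.
INSERT INTO target_counts
SELECT key, value FROM source_events WHERE key = 1;

-- Session B, running concurrently: a write to the same source shard.
UPDATE source_events SET value = value + 1 WHERE key = 1;

-- Without read locks, one placement of the target shard could capture the row
-- before the UPDATE and the other placement the row after it, leaving the two
-- placements permanently divergent. With this change, session A holds an
-- ExclusiveLock on the source shards it reads, so session B's write waits
-- until session A is done.

-- Escape hatch from the commit message: if the INSERT/SELECT's filters are
-- known to exclude any ongoing writes, the aggressive locking can be relaxed.
SET citus.all_modifications_commutative TO on;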
@@ -100,8 +100,8 @@ static int64 ExecuteModifyTasks(List *taskList, bool expectResults,
                                 TupleDesc tupleDescriptor);
 static List * TaskShardIntervalList(List *taskList);
 static void AcquireExecutorShardLock(Task *task, CmdType commandType);
-static void AcquireExecutorMultiShardLocks(List *shardIntervalList);
-static bool IsReplicated(List *shardIntervalList);
+static void AcquireExecutorMultiShardLocks(List *taskList);
+static bool RequiresSelectIsolation(Task *task);
 static uint64 ReturnRowsFromTuplestore(uint64 tupleCount, TupleDesc tupleDescriptor,
                                        DestReceiver *destination,
                                        Tuplestorestate *tupleStore);
@@ -311,15 +311,38 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
     {
         LockShardResource(shardId, lockMode);
     }
+
+    /*
+     * If the task has a subselect, then we may need to lock the shards from which
+     * the query selects as well to prevent the subselects from seeing different
+     * results on different replicas. In particular this prevents INSERT.. SELECT
+     * commands from having a different effect on different placements.
+     */
+    if (RequiresSelectIsolation(task))
+    {
+        /*
+         * ExclusiveLock conflicts with all lock types used by modifications
+         * and therefore prevents other modifications from running
+         * concurrently.
+         */
+
+        LockShardListResources(task->selectShardList, ExclusiveLock);
+    }
 }
 
 
 /*
- * AcquireExecutorMultiShardLocks acquires shard locks need for execution
- * of writes on multiple shards.
+ * AcquireExecutorMultiShardLocks acquires shard locks needed for execution
+ * of writes on multiple shards. In addition to honouring commutativity
+ * rules, we currently only allow a single multi-shard command on a shard at
+ * a time. Otherwise, concurrent multi-shard commands may take row-level
+ * locks on the shard placements in a different order and create a distributed
+ * deadlock. This applies even when writes are commutative and/or there is
+ * no replication.
  *
  * 1. If citus.all_modifications_commutative is set to true, then all locks
  * are acquired as ShareUpdateExclusiveLock.
+ *
  * 2. If citus.all_modifications_commutative is false, then only the shards
  * with 2 or more replicas are locked with ExclusiveLock. Otherwise, the
  * lock is acquired with ShareUpdateExclusiveLock.
@@ -327,20 +350,26 @@ AcquireExecutorShardLock(Task *task, CmdType commandType)
  * ShareUpdateExclusiveLock conflicts with itself such that only one
  * multi-shard modification at a time is allowed on a shard. It also conflicts
  * with ExclusiveLock, which ensures that updates/deletes/upserts are applied
- * in the same order on all placements. It does not conflict with ShareLock,
- * which is normally obtained by single-shard commutative writes.
+ * in the same order on all placements. It does not conflict with
+ * RowExclusiveLock, which is normally obtained by single-shard, commutative
+ * writes.
  */
 static void
-AcquireExecutorMultiShardLocks(List *shardIntervalList)
+AcquireExecutorMultiShardLocks(List *taskList)
 {
+    ListCell *taskCell = NULL;
+
+    foreach(taskCell, taskList)
+    {
+        Task *task = (Task *) lfirst(taskCell);
         LOCKMODE lockMode = NoLock;
 
-    if (AllModificationsCommutative || !IsReplicated(shardIntervalList))
-    {
+        if (AllModificationsCommutative || list_length(task->taskPlacementList) == 1)
+        {
             /*
              * When all writes are commutative then we only need to prevent multi-shard
              * commands from running concurrently with each other and with commands
-             * that are explicitly non-commutative. When there is not replication then
+             * that are explicitly non-commutative. When there is no replication then
              * we only need to prevent concurrent multi-shard commands.
              *
              * In either case, ShareUpdateExclusive has the desired effect, since
@@ -356,36 +385,87 @@ AcquireExecutorMultiShardLocks(List *shardIntervalList)
              * When there is replication, prevent all concurrent writes to the same
              * shards to ensure the writes are ordered.
              */
 
             lockMode = ExclusiveLock;
         }
 
-    LockShardListResources(shardIntervalList, lockMode);
+        LockShardResource(task->anchorShardId, lockMode);
 
+        /*
+         * If the task has a subselect, then we may need to lock the shards from which
+         * the query selects as well to prevent the subselects from seeing different
+         * results on different replicas. In particular this prevents INSERT..SELECT
+         * commands from having different effects on different placements.
+         */
+        if (RequiresSelectIsolation(task))
+        {
+            /*
+             * ExclusiveLock conflicts with all lock types used by modifications
+             * and therefore prevents other modifications from running
+             * concurrently.
+             */
+
+            LockShardListResources(task->selectShardList, ExclusiveLock);
+        }
+    }
 }
 
 
 /*
- * IsReplicated checks whether any of the shards in the given list has more
- * than one replica.
+ * RequiresSelectIsolation returns whether a given task requires SELECT
+ * isolation, meaning we should take the necessary locks to ensure that a
+ * subselect, which may join multiple shards, will return the same output
+ * for all task placements.
  */
 static bool
-IsReplicated(List *shardIntervalList)
+RequiresSelectIsolation(Task *task)
 {
-    ListCell *shardIntervalCell;
-    bool hasReplication = false;
+    bool requiresIsolation = false;
 
-    foreach(shardIntervalCell, shardIntervalList)
+    if (!task->insertSelectQuery)
     {
-        ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
-        uint64 shardId = shardInterval->shardId;
-        List *shardPlacementList = FinalizedShardPlacementList(shardId);
-        if (shardPlacementList->length > 1)
-        {
-            hasReplication = true;
-            break;
-        }
+        /*
+         * Only INSERT/SELECT commands currently require SELECT isolation.
+         * Other commands do not read from other shards.
+         */
+
+        requiresIsolation = false;
     }
+    else if (list_length(task->taskPlacementList) == 1)
+    {
+        /*
+         * If there is only one replica then we fully rely on PostgreSQL to
+         * provide SELECT isolation. In this case, we do not provide isolation
+         * across the shards, but that was never our intention.
+         */
+
+        requiresIsolation = false;
+    }
+    else if (AllModificationsCommutative)
+    {
+        /*
+         * An INSERT/SELECT is commutative with other writes if it excludes
+         * any ongoing writes based on the filter conditions. Without knowing
+         * whether this is true, we assume the user took this into account
+         * when enabling citus.all_modifications_commutative. This option
+         * gives users an escape from aggressive locking during INSERT/SELECT.
+         */
+
+        requiresIsolation = false;
+    }
+    else
+    {
+        /*
+         * If this is a non-commutative write, then we need to block ongoing
+         * writes to make sure that the subselect returns the same result
+         * on all placements.
+         */
+
+        requiresIsolation = true;
+    }
 
-    return hasReplication;
+    return requiresIsolation;
 }
@@ -809,7 +889,7 @@ ExecuteModifyTasks(List *taskList, bool expectResults, ParamListInfo paramListIn
     shardIntervalList = TaskShardIntervalList(taskList);
 
     /* ensure that there are no concurrent modifications on the same shards */
-    AcquireExecutorMultiShardLocks(shardIntervalList);
+    AcquireExecutorMultiShardLocks(taskList);
 
     /* open connection to all relevant placements, if not already open */
     OpenTransactionsToAllShardPlacements(shardIntervalList, userName);
@@ -105,7 +105,8 @@ static Task * RouterSelectTask(Query *originalQuery,
                                List **placementList);
 static bool RouterSelectQuery(Query *originalQuery,
                               RelationRestrictionContext *restrictionContext,
-                              List **placementList, uint64 *anchorShardId);
+                              List **placementList, uint64 *anchorShardId,
+                              List **selectShardList);
 static List * TargetShardIntervalsForSelect(Query *query,
                                             RelationRestrictionContext *restrictionContext);
 static List * WorkersContainingAllShards(List *prunedShardIntervalsList);
@@ -264,6 +265,8 @@ CreateMultiTaskRouterPlan(Query *originalQuery,
     /* add the task if it could be created */
     if (modifyTask != NULL)
     {
+        modifyTask->insertSelectQuery = true;
+
         sqlTaskList = lappend(sqlTaskList, modifyTask);
     }
 
@@ -323,6 +326,7 @@ RouterModifyTaskForShardInterval(Query *originalQuery, ShardInterval *shardInter
     Task *modifyTask = NULL;
     List *selectPlacementList = NIL;
     uint64 selectAnchorShardId = INVALID_SHARD_ID;
+    List *selectShardList = NIL;
     uint64 jobId = INVALID_JOB_ID;
     List *insertShardPlacementList = NULL;
     List *intersectedPlacementList = NULL;
@@ -363,7 +367,8 @@ RouterModifyTaskForShardInterval(Query *originalQuery, ShardInterval *shardInter
      * updated to point to the relevant nodes and selectPlacementList is determined.
      */
     routerPlannable = RouterSelectQuery(copiedSubquery, copiedRestrictionContext,
-                                        &selectPlacementList, &selectAnchorShardId);
+                                        &selectPlacementList, &selectAnchorShardId,
+                                        &selectShardList);
 
     if (!routerPlannable)
     {
@@ -417,6 +422,7 @@ RouterModifyTaskForShardInterval(Query *originalQuery, ShardInterval *shardInter
     modifyTask->anchorShardId = shardId;
     modifyTask->taskPlacementList = insertShardPlacementList;
     modifyTask->upsertQuery = upsertQuery;
+    modifyTask->selectShardList = selectShardList;
 
     return modifyTask;
 }
@@ -1762,9 +1768,10 @@ RouterSelectTask(Query *originalQuery, RelationRestrictionContext *restrictionCo
     StringInfo queryString = makeStringInfo();
     bool upsertQuery = false;
     uint64 shardId = INVALID_SHARD_ID;
+    List *selectShardList = NIL;
 
     queryRoutable = RouterSelectQuery(originalQuery, restrictionContext,
-                                      placementList, &shardId);
+                                      placementList, &shardId, &selectShardList);
 
 
     if (!queryRoutable)
@@ -1798,7 +1805,7 @@ RouterSelectTask(Query *originalQuery, RelationRestrictionContext *restrictionCo
  */
 static bool
 RouterSelectQuery(Query *originalQuery, RelationRestrictionContext *restrictionContext,
-                  List **placementList, uint64 *anchorShardId)
+                  List **placementList, uint64 *anchorShardId, List **selectShardList)
 {
     List *prunedRelationShardList = TargetShardIntervalsForSelect(originalQuery,
                                                                   restrictionContext);
@@ -1833,12 +1840,15 @@ RouterSelectQuery(Query *originalQuery, RelationRestrictionContext *restrictionC
         /* all relations are now pruned down to 0 or 1 shards */
         Assert(list_length(prunedShardList) <= 1);
 
+        shardInterval = (ShardInterval *) linitial(prunedShardList);
+
         /* anchor shard id */
         if (shardId == INVALID_SHARD_ID)
         {
-            shardInterval = (ShardInterval *) linitial(prunedShardList);
             shardId = shardInterval->shardId;
         }
 
+        *selectShardList = lappend(*selectShardList, shardInterval);
     }
 
     /*
@@ -493,6 +493,8 @@ OutTask(OUTFUNC_ARGS)
     WRITE_BOOL_FIELD(assignmentConstrained);
     WRITE_NODE_FIELD(taskExecution);
     WRITE_BOOL_FIELD(upsertQuery);
+    WRITE_BOOL_FIELD(insertSelectQuery);
+    WRITE_NODE_FIELD(selectShardList);
 }
 
 #if (PG_VERSION_NUM < 90600)
@@ -289,6 +289,8 @@ ReadTask(READFUNC_ARGS)
     READ_BOOL_FIELD(assignmentConstrained);
     READ_NODE_FIELD(taskExecution);
     READ_BOOL_FIELD(upsertQuery);
+    READ_BOOL_FIELD(insertSelectQuery);
+    READ_NODE_FIELD(selectShardList);
 
     READ_DONE();
 }
@@ -169,6 +169,9 @@ typedef struct Task
     uint64 shardId;                 /* only applies to shard fetch tasks */
     TaskExecution *taskExecution;   /* used by task tracker executor */
     bool upsertQuery;               /* only applies to modify tasks */
+
+    bool insertSelectQuery;
+    List *selectShardList;          /* only applies INSERT/SELECT tasks */
 } Task;