/* Source: mirror of https://github.com/citusdata/citus.git */
/*-------------------------------------------------------------------------
|
|
*
|
|
* distributed_execution_locks.c
|
|
*
|
|
* Definitions of the functions used in executing distributed
|
|
* execution locking.
|
|
*
|
|
* Copyright (c) Citus Data, Inc.
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "distributed/distributed_execution_locks.h"
|
|
#include "distributed/listutils.h"
|
|
#include "distributed/coordinator_protocol.h"
|
|
#include "distributed/metadata_cache.h"
|
|
#include "distributed/multi_executor.h"
|
|
#include "distributed/multi_partitioning_utils.h"
|
|
#include "distributed/pg_dist_partition.h"
|
|
#include "distributed/resource_lock.h"
|
|
#include "distributed/transaction_management.h"
|
|
|
|
|
|
static bool RequiresConsistentSnapshot(Task *task);
|
|
static void AcquireExecutorShardLockForRowModify(Task *task, RowModifyLevel modLevel);
|
|
static void AcquireExecutorShardLocksForRelationRowLockList(List *relationRowLockList);
|
|
|
|
|
|
/*
|
|
* AcquireExecutorShardLocks acquires locks on shards for the given task if
|
|
* necessary to avoid divergence between multiple replicas of the same shard.
|
|
* No lock is obtained when there is only one replica.
|
|
*
|
|
* The function determines the appropriate lock mode based on the commutativity
|
|
* rule of the command. In each case, it uses a lock mode that enforces the
|
|
* commutativity rule.
|
|
*
|
|
* The mapping is overridden when all_modifications_commutative is set to true.
|
|
* In that case, all modifications are treated as commutative, which can be used
|
|
* to communicate that the application is only generating commutative
|
|
* UPDATE/DELETE/UPSERT commands and exclusive locks are unnecessary.
|
|
*/
|
|
void
|
|
AcquireExecutorShardLocks(Task *task, RowModifyLevel modLevel)
|
|
{
|
|
AcquireExecutorShardLockForRowModify(task, modLevel);
|
|
AcquireExecutorShardLocksForRelationRowLockList(task->relationRowLockList);
|
|
|
|
/*
|
|
* If the task has a subselect, then we may need to lock the shards from which
|
|
* the query selects as well to prevent the subselects from seeing different
|
|
* results on different replicas. In particular this prevents INSERT.. SELECT
|
|
* commands from having a different effect on different placements.
|
|
*/
|
|
if (RequiresConsistentSnapshot(task))
|
|
{
|
|
/*
|
|
* ExclusiveLock conflicts with all lock types used by modifications
|
|
* and therefore prevents other modifications from running
|
|
* concurrently.
|
|
*/
|
|
|
|
LockRelationShardResources(task->relationShardList, ExclusiveLock);
|
|
}
|
|
}
|
|
|
|
|
|
/*
 * AcquireExecutorMultiShardLocks acquires shard locks needed for execution
 * of writes on multiple shards. In addition to honouring commutativity
 * rules, we currently only allow a single multi-shard command on a shard at
 * a time. Otherwise, concurrent multi-shard commands may take row-level
 * locks on the shard placements in a different order and create a distributed
 * deadlock. This applies even when writes are commutative and/or there is
 * no replication.
 *
 * 1. If citus.all_modifications_commutative is set to true, then all locks
 * are acquired as ShareUpdateExclusiveLock.
 *
 * 2. If citus.all_modifications_commutative is false, then only the shards
 * with 2 or more replicas are locked with ExclusiveLock. Otherwise, the
 * lock is acquired with ShareUpdateExclusiveLock.
 *
 * ShareUpdateExclusiveLock conflicts with itself such that only one
 * multi-shard modification at a time is allowed on a shard. It also conflicts
 * with ExclusiveLock, which ensures that updates/deletes/upserts are applied
 * in the same order on all placements. It does not conflict with
 * RowExclusiveLock, which is normally obtained by single-shard, commutative
 * writes.
 */
void
AcquireExecutorMultiShardLocks(List *taskList)
{
	Task *task = NULL;
	foreach_ptr(task, taskList)
	{
		LOCKMODE lockMode = NoLock;

		if (task->anchorShardId == INVALID_SHARD_ID)
		{
			/* no shard locks to take if the task is not anchored to a shard */
			continue;
		}

		if (AllModificationsCommutative || list_length(task->taskPlacementList) == 1)
		{
			/*
			 * When all writes are commutative then we only need to prevent multi-shard
			 * commands from running concurrently with each other and with commands
			 * that are explicitly non-commutative. When there is no replication then
			 * we only need to prevent concurrent multi-shard commands.
			 *
			 * In either case, ShareUpdateExclusive has the desired effect, since
			 * it conflicts with itself and ExclusiveLock (taken by non-commutative
			 * writes).
			 *
			 * However, some users find this too restrictive, so we allow them to
			 * reduce to a RowExclusiveLock when citus.enable_deadlock_prevention
			 * is disabled, which lets multi-shard modifications run in parallel as
			 * long as they all disable the GUC.
			 *
			 * We also skip taking a heavy-weight lock when running a multi-shard
			 * commands from workers, since we cannot prevent concurrency across
			 * workers anyway.
			 */
			if (EnableDeadlockPrevention && IsCoordinator())
			{
				lockMode = ShareUpdateExclusiveLock;
			}
			else
			{
				lockMode = RowExclusiveLock;
			}
		}
		else
		{
			/*
			 * When there is replication, prevent all concurrent writes to the same
			 * shards to ensure the writes are ordered.
			 */
			lockMode = ExclusiveLock;
		}

		/*
		 * If we are dealing with a partition we are also taking locks on parent table
		 * to prevent deadlocks on concurrent operations on a partition and its parent.
		 */
		LockParentShardResourceIfPartition(task->anchorShardId, lockMode);
		LockShardResource(task->anchorShardId, lockMode);

		/*
		 * If the task has a subselect, then we may need to lock the shards from which
		 * the query selects as well to prevent the subselects from seeing different
		 * results on different replicas.
		 */
		if (RequiresConsistentSnapshot(task))
		{
			/*
			 * ExclusiveLock conflicts with all lock types used by modifications
			 * and therefore prevents other modifications from running
			 * concurrently.
			 */
			LockRelationShardResources(task->relationShardList, ExclusiveLock);
		}
	}
}
|
|
|
|
|
|
/*
|
|
* RequiresConsistentSnapshot returns true if the given task need to take
|
|
* the necessary locks to ensure that a subquery in the modify query
|
|
* returns the same output for all task placements.
|
|
*/
|
|
static bool
|
|
RequiresConsistentSnapshot(Task *task)
|
|
{
|
|
bool requiresIsolation = false;
|
|
|
|
if (!task->modifyWithSubquery)
|
|
{
|
|
/*
|
|
* Other commands do not read from other shards.
|
|
*/
|
|
|
|
requiresIsolation = false;
|
|
}
|
|
else if (list_length(task->taskPlacementList) == 1)
|
|
{
|
|
/*
|
|
* If there is only one replica then we fully rely on PostgreSQL to
|
|
* provide SELECT isolation. In this case, we do not provide isolation
|
|
* across the shards, but that was never our intention.
|
|
*/
|
|
|
|
requiresIsolation = false;
|
|
}
|
|
else if (AllModificationsCommutative)
|
|
{
|
|
/*
|
|
* An INSERT/SELECT is commutative with other writes if it excludes
|
|
* any ongoing writes based on the filter conditions. Without knowing
|
|
* whether this is true, we assume the user took this into account
|
|
* when enabling citus.all_modifications_commutative. This option
|
|
* gives users an escape from aggressive locking during INSERT/SELECT.
|
|
*/
|
|
|
|
requiresIsolation = false;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* If this is a non-commutative write, then we need to block ongoing
|
|
* writes to make sure that the subselect returns the same result
|
|
* on all placements.
|
|
*/
|
|
|
|
requiresIsolation = true;
|
|
}
|
|
|
|
return requiresIsolation;
|
|
}
|
|
|
|
|
|
/*
|
|
* AcquireMetadataLocks acquires metadata locks on each of the anchor
|
|
* shards in the task list to prevent a shard being modified while it
|
|
* is being copied.
|
|
*/
|
|
void
|
|
AcquireMetadataLocks(List *taskList)
|
|
{
|
|
/*
|
|
* Note: to avoid the overhead of additional sorting, we assume tasks
|
|
* to be already sorted by shard ID such that deadlocks are avoided.
|
|
* This is true for INSERT/SELECT, which is the only multi-shard
|
|
* command right now.
|
|
*/
|
|
|
|
Task *task = NULL;
|
|
foreach_ptr(task, taskList)
|
|
{
|
|
LockShardDistributionMetadata(task->anchorShardId, ShareLock);
|
|
}
|
|
}
|
|
|
|
|
|
/*
 * AcquireExecutorShardLockForRowModify acquires the shard resource lock on
 * the task's anchor shard that is appropriate for the given row modify
 * level. The lock mode is chosen based on the commutativity of the command
 * and the number of placements, and no lock is taken for read-only tasks or
 * tasks that are not anchored to a shard.
 */
static void
AcquireExecutorShardLockForRowModify(Task *task, RowModifyLevel modLevel)
{
	LOCKMODE lockMode = NoLock;
	int64 shardId = task->anchorShardId;

	if (shardId == INVALID_SHARD_ID)
	{
		/* task is not anchored to a shard, nothing to lock */
		return;
	}

	if (modLevel <= ROW_MODIFY_READONLY)
	{
		/*
		 * The executor shard lock is used to maintain consistency between
		 * replicas and therefore no lock is required for read-only queries
		 * or in general when there is only one replica.
		 */
		lockMode = NoLock;
	}
	else if (list_length(task->taskPlacementList) == 1)
	{
		if (task->replicationModel == REPLICATION_MODEL_2PC)
		{
			/*
			 * While we don't need a lock to ensure writes are applied in
			 * a consistent order when there is a single replica, we also use
			 * shard resource locks as a crude implementation of SELECT..
			 * FOR UPDATE on reference tables, so we should always take
			 * a lock that conflicts with the FOR UPDATE/SHARE locks.
			 */
			lockMode = RowExclusiveLock;
		}
		else
		{
			/*
			 * When there is no replication, the worker itself can decide
			 * on the order in which writes are applied.
			 */
			lockMode = NoLock;
		}
	}
	else if (AllModificationsCommutative)
	{
		/*
		 * Bypass commutativity checks when citus.all_modifications_commutative
		 * is enabled.
		 *
		 * A RowExclusiveLock does not conflict with itself and therefore allows
		 * multiple commutative commands to proceed concurrently. It does
		 * conflict with ExclusiveLock, which may still be obtained by another
		 * session that executes an UPDATE/DELETE/UPSERT command with
		 * citus.all_modifications_commutative disabled.
		 */
		lockMode = RowExclusiveLock;
	}
	else if (modLevel < ROW_MODIFY_NONCOMMUTATIVE)
	{
		/*
		 * An INSERT commutes with other INSERT commands, since performing them
		 * out-of-order only affects the table order on disk, but not the
		 * contents.
		 *
		 * When a unique constraint exists, INSERTs are not strictly commutative,
		 * but whichever INSERT comes last will error out and thus has no effect.
		 * INSERT is not commutative with UPDATE/DELETE/UPSERT, since the
		 * UPDATE/DELETE/UPSERT may consider the INSERT, depending on execution
		 * order.
		 *
		 * A RowExclusiveLock does not conflict with itself and therefore allows
		 * multiple INSERT commands to proceed concurrently. It conflicts with
		 * ExclusiveLock obtained by UPDATE/DELETE/UPSERT, ensuring those do
		 * not run concurrently with INSERT.
		 */
		lockMode = RowExclusiveLock;
	}
	else
	{
		/*
		 * UPDATE/DELETE/UPSERT commands do not commute with other modifications
		 * since the rows modified by one command may be affected by the outcome
		 * of another command.
		 *
		 * We need to handle upsert before INSERT, because PostgreSQL models
		 * upsert commands as INSERT with an ON CONFLICT section.
		 *
		 * ExclusiveLock conflicts with all lock types used by modifications
		 * and therefore prevents other modifications from running
		 * concurrently.
		 */
		lockMode = ExclusiveLock;
	}

	if (lockMode != NoLock)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);

		SerializeNonCommutativeWrites(list_make1(shardInterval), lockMode);
	}
}
|
|
|
|
|
|
static void
|
|
AcquireExecutorShardLocksForRelationRowLockList(List *relationRowLockList)
|
|
{
|
|
LOCKMODE rowLockMode = NoLock;
|
|
|
|
if (relationRowLockList == NIL)
|
|
{
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* If lock clause exists and it affects any reference table, we need to get
|
|
* lock on shard resource. Type of lock is determined by the type of row lock
|
|
* given in the query. If the type of row lock is either FOR NO KEY UPDATE or
|
|
* FOR UPDATE we get ExclusiveLock on shard resource. We get ShareLock if it
|
|
* is FOR KEY SHARE or FOR KEY SHARE.
|
|
*
|
|
* We have selected these lock types according to conflict table given in the
|
|
* Postgres documentation. It is given that FOR UPDATE and FOR NO KEY UPDATE
|
|
* must be conflict with each other modify command. By getting ExlcusiveLock
|
|
* we guarantee that. Note that, getting ExlusiveLock does not mimic the
|
|
* behaviour of Postgres exactly. Getting row lock with FOR NO KEY UPDATE and
|
|
* FOR KEY SHARE do not conflict in Postgres, yet they block each other in
|
|
* our implementation. Since FOR SHARE and FOR KEY SHARE does not conflict
|
|
* with each other but conflicts with modify commands, we get ShareLock for
|
|
* them.
|
|
*/
|
|
RelationRowLock *relationRowLock = NULL;
|
|
foreach_ptr(relationRowLock, relationRowLockList)
|
|
{
|
|
LockClauseStrength rowLockStrength = relationRowLock->rowLockStrength;
|
|
Oid relationId = relationRowLock->relationId;
|
|
|
|
if (IsCitusTableType(relationId, REFERENCE_TABLE))
|
|
{
|
|
List *shardIntervalList = LoadShardIntervalList(relationId);
|
|
|
|
if (rowLockStrength == LCS_FORKEYSHARE || rowLockStrength == LCS_FORSHARE)
|
|
{
|
|
rowLockMode = ShareLock;
|
|
}
|
|
else if (rowLockStrength == LCS_FORNOKEYUPDATE ||
|
|
rowLockStrength == LCS_FORUPDATE)
|
|
{
|
|
rowLockMode = ExclusiveLock;
|
|
}
|
|
|
|
SerializeNonCommutativeWrites(shardIntervalList, rowLockMode);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* LockPartitionsInRelationList iterates over given list and acquires locks on
|
|
* partitions of each partitioned table. It does nothing for non-partitioned tables.
|
|
*/
|
|
void
|
|
LockPartitionsInRelationList(List *relationIdList, LOCKMODE lockmode)
|
|
{
|
|
Oid relationId = InvalidOid;
|
|
foreach_oid(relationId, relationIdList)
|
|
{
|
|
if (PartitionedTable(relationId))
|
|
{
|
|
LockPartitionRelations(relationId, lockmode);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* LockPartitionRelations acquires relation lock on all partitions of given
|
|
* partitioned relation. This function expects that given relation is a
|
|
* partitioned relation.
|
|
*/
|
|
void
|
|
LockPartitionRelations(Oid relationId, LOCKMODE lockMode)
|
|
{
|
|
/*
|
|
* PartitionList function generates partition list in the same order
|
|
* as PostgreSQL. Therefore we do not need to sort it before acquiring
|
|
* locks.
|
|
*/
|
|
List *partitionList = PartitionList(relationId);
|
|
Oid partitionRelationId = InvalidOid;
|
|
foreach_oid(partitionRelationId, partitionList)
|
|
{
|
|
LockRelationOid(partitionRelationId, lockMode);
|
|
}
|
|
}
|