citus/src/backend/distributed/executor/distributed_execution_locks.c

469 lines
16 KiB
C

/*-------------------------------------------------------------------------
*
* distributed_execution_locks.c
*
* Definitions of the functions used in executing distributed
* execution locking.
*
* Copyright (c) Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "distributed/coordinator_protocol.h"
#include "distributed/distributed_execution_locks.h"
#include "distributed/executor_util.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/resource_lock.h"
#include "distributed/transaction_management.h"
/*
* AcquireExecutorShardLocksForExecution acquires advisory lock on shard IDs
* to prevent unsafe concurrent modifications of shards.
*
* We prevent concurrent modifications of shards in two cases:
* 1. Any non-commutative writes to a replicated table
* 2. Multi-shard writes that are executed in parallel
*
* The first case ensures we do not apply updates in different orders on
* different replicas (e.g. of a reference table), which could lead the
* replicas to diverge.
*
* The second case prevents deadlocks due to out-of-order execution.
*
* There are two GUCs that can override the default behaviors.
* 'citus.all_modifications_commutative' relaxes locking
* that's done for the purpose of keeping replicas consistent.
* 'citus.enable_deadlock_prevention' relaxes locking done for
* the purpose of avoiding deadlocks between concurrent
* multi-shard commands.
*
* We do not take executor shard locks for utility commands such as
* TRUNCATE because the table locks already prevent concurrent access.
*/
void
AcquireExecutorShardLocksForExecution(RowModifyLevel modLevel, List *taskList)
{
if (modLevel <= ROW_MODIFY_READONLY &&
!SelectForUpdateOnReferenceTable(taskList))
{
/*
* Executor locks only apply to DML commands and SELECT FOR UPDATE queries
* touching reference tables.
*/
return;
}
bool requiresParallelExecutionLocks =
!(list_length(taskList) == 1 || ShouldRunTasksSequentially(taskList));
bool modifiedTableReplicated = ModifiedTableReplicated(taskList);
if (!modifiedTableReplicated && !requiresParallelExecutionLocks)
{
/*
* When a distributed query on tables with replication
* factor == 1 and command hits only a single shard, we
* rely on Postgres to handle the serialization of the
* concurrent modifications on the workers.
*
* For reference tables, even if their placements are replicated
* ones (e.g., single node), we acquire the distributed execution
* locks to be consistent when new node(s) are added. So, they
* do not return at this point.
*/
return;
}
/*
* We first assume that all the remaining modifications are going to
* be serialized. So, start with an ExclusiveLock and lower the lock level
* as much as possible.
*/
int lockMode = ExclusiveLock;
/*
* In addition to honouring commutativity rules, we currently only
* allow a single multi-shard command on a shard at a time. Otherwise,
* concurrent multi-shard commands may take row-level locks on the
* shard placements in a different order and create a distributed
* deadlock. This applies even when writes are commutative and/or
* there is no replication. This can be relaxed via
* EnableDeadlockPrevention.
*
* 1. If citus.all_modifications_commutative is set to true, then all locks
* are acquired as RowExclusiveLock.
*
* 2. If citus.all_modifications_commutative is false, then only the shards
* with more than one replicas are locked with ExclusiveLock. Otherwise, the
* lock is acquired with ShareUpdateExclusiveLock.
*
* ShareUpdateExclusiveLock conflicts with itself such that only one
* multi-shard modification at a time is allowed on a shard. It also conflicts
* with ExclusiveLock, which ensures that updates/deletes/upserts are applied
* in the same order on all placements. It does not conflict with
* RowExclusiveLock, which is normally obtained by single-shard, commutative
* writes.
*/
if (!modifiedTableReplicated && requiresParallelExecutionLocks)
{
/*
* When there is no replication then we only need to prevent
* concurrent multi-shard commands on the same shards. This is
* because concurrent, parallel commands may modify the same
* set of shards, but in different orders. The order of the
* accesses might trigger distributed deadlocks that are not
* possible to happen on non-distributed systems such
* regular Postgres.
*
* As an example, assume that we have two queries: query-1 and query-2.
* Both queries access shard-1 and shard-2. If query-1 first accesses to
* shard-1 then shard-2, and query-2 accesses shard-2 then shard-1, these
* two commands might block each other in case they modify the same rows
* (e.g., cause distributed deadlocks).
*
* In either case, ShareUpdateExclusive has the desired effect, since
* it conflicts with itself and ExclusiveLock (taken by non-commutative
* writes).
*
* However, some users find this too restrictive, so we allow them to
* reduce to a RowExclusiveLock when citus.enable_deadlock_prevention
* is enabled, which lets multi-shard modifications run in parallel as
* long as they all disable the GUC.
*/
lockMode =
EnableDeadlockPrevention ? ShareUpdateExclusiveLock : RowExclusiveLock;
if (!IsCoordinator())
{
/*
* We also skip taking a heavy-weight lock when running a multi-shard
* commands from workers, since we currently do not prevent concurrency
* across workers anyway.
*/
lockMode = RowExclusiveLock;
}
}
else if (modifiedTableReplicated)
{
/*
* When we are executing distributed queries on replicated tables, our
* default behaviour is to prevent any concurrency. This is valid
* for when parallel execution is happening or not.
*
* The reason is that we cannot control the order of the placement accesses
* of two distributed queries to the same shards. The order of the accesses
* might cause the replicas of the same shard placements diverge. This is
* not possible to happen on non-distributed systems such regular Postgres.
*
* As an example, assume that we have two queries: query-1 and query-2.
* Both queries only access the placements of shard-1, say p-1 and p-2.
*
* And, assume that these queries are non-commutative, such as:
* query-1: UPDATE table SET b = 1 WHERE key = 1;
* query-2: UPDATE table SET b = 2 WHERE key = 1;
*
* If query-1 accesses to p-1 then p-2, and query-2 accesses
* p-2 then p-1, these two commands would leave the p-1 and p-2
* diverged (e.g., the values for the column "b" would be different).
*
* The only exception to this rule is the single shard commutative
* modifications, such as INSERTs. In that case, we can allow
* concurrency among such backends, hence lowering the lock level
* to RowExclusiveLock.
*/
if (!requiresParallelExecutionLocks && modLevel < ROW_MODIFY_NONCOMMUTATIVE)
{
lockMode = RowExclusiveLock;
}
}
if (AllModificationsCommutative)
{
/*
* The mapping is overridden when all_modifications_commutative is set to true.
* In that case, all modifications are treated as commutative, which can be used
* to communicate that the application is only generating commutative
* UPDATE/DELETE/UPSERT commands and exclusive locks are unnecessary. This
* is irrespective of single-shard/multi-shard or replicated tables.
*/
lockMode = RowExclusiveLock;
}
/* now, iterate on the tasks and acquire the executor locks on the shards */
List *anchorShardIntervalList = NIL;
List *relationRowLockList = NIL;
List *requiresConsistentSnapshotRelationShardList = NIL;
Task *task = NULL;
foreach_declared_ptr(task, taskList)
{
ShardInterval *anchorShardInterval = LoadShardInterval(task->anchorShardId);
anchorShardIntervalList = lappend(anchorShardIntervalList, anchorShardInterval);
/* Acquire additional locks for SELECT .. FOR UPDATE on reference tables */
AcquireExecutorShardLocksForRelationRowLockList(task->relationRowLockList);
relationRowLockList =
list_concat(relationRowLockList,
task->relationRowLockList);
/*
* If the task has a subselect, then we may need to lock the shards from which
* the query selects as well to prevent the subselects from seeing different
* results on different replicas.
*/
if (RequiresConsistentSnapshot(task))
{
/*
* ExclusiveLock conflicts with all lock types used by modifications
* and therefore prevents other modifications from running
* concurrently.
*/
requiresConsistentSnapshotRelationShardList =
list_concat(requiresConsistentSnapshotRelationShardList,
task->relationShardList);
}
}
/*
* Acquire the locks in a sorted way to avoid deadlocks due to lock
* ordering across concurrent sessions.
*/
anchorShardIntervalList =
SortList(anchorShardIntervalList, CompareShardIntervalsById);
/*
* If we are dealing with a partition we are also taking locks on parent table
* to prevent deadlocks on concurrent operations on a partition and its parent.
*
* Note that this function currently does not acquire any remote locks as that
* is necessary to control the concurrency across multiple nodes for replicated
* tables. That is because Citus currently does not allow modifications to
* partitions from any node other than the coordinator.
*/
LockParentShardResourceIfPartition(anchorShardIntervalList, lockMode);
/* Acquire distribution execution locks on the affected shards */
SerializeNonCommutativeWrites(anchorShardIntervalList, lockMode);
if (relationRowLockList != NIL)
{
/* Acquire additional locks for SELECT .. FOR UPDATE on reference tables */
AcquireExecutorShardLocksForRelationRowLockList(relationRowLockList);
}
if (requiresConsistentSnapshotRelationShardList != NIL)
{
/*
* If the task has a subselect, then we may need to lock the shards from which
* the query selects as well to prevent the subselects from seeing different
* results on different replicas.
*
* ExclusiveLock conflicts with all lock types used by modifications
* and therefore prevents other modifications from running
* concurrently.
*/
LockRelationShardResources(requiresConsistentSnapshotRelationShardList,
ExclusiveLock);
}
}
/*
* RequiresConsistentSnapshot returns true if the given task need to take
* the necessary locks to ensure that a subquery in the modify query
* returns the same output for all task placements.
*/
bool
RequiresConsistentSnapshot(Task *task)
{
bool requiresIsolation = false;
if (!task->modifyWithSubquery)
{
/*
* Other commands do not read from other shards.
*/
requiresIsolation = false;
}
else if (list_length(task->taskPlacementList) == 1)
{
/*
* If there is only one replica then we fully rely on PostgreSQL to
* provide SELECT isolation. In this case, we do not provide isolation
* across the shards, but that was never our intention.
*/
requiresIsolation = false;
}
else if (AllModificationsCommutative)
{
/*
* An INSERT/SELECT is commutative with other writes if it excludes
* any ongoing writes based on the filter conditions. Without knowing
* whether this is true, we assume the user took this into account
* when enabling citus.all_modifications_commutative. This option
* gives users an escape from aggressive locking during INSERT/SELECT.
*/
requiresIsolation = false;
}
else
{
/*
* If this is a non-commutative write, then we need to block ongoing
* writes to make sure that the subselect returns the same result
* on all placements.
*/
requiresIsolation = true;
}
return requiresIsolation;
}
/*
* AcquireMetadataLocks acquires metadata locks on each of the anchor
* shards in the task list to prevent a shard being modified while it
* is being copied.
*/
void
AcquireMetadataLocks(List *taskList)
{
/*
* Note: to avoid the overhead of additional sorting, we assume tasks
* to be already sorted by shard ID such that deadlocks are avoided.
* This is true for INSERT/SELECT, which is the only multi-shard
* command right now.
*/
Task *task = NULL;
foreach_declared_ptr(task, taskList)
{
LockShardDistributionMetadata(task->anchorShardId, ShareLock);
}
}
void
AcquireExecutorShardLocksForRelationRowLockList(List *relationRowLockList)
{
LOCKMODE rowLockMode = NoLock;
if (relationRowLockList == NIL)
{
return;
}
/*
* If lock clause exists and it affects any reference table, we need to get
* lock on shard resource. Type of lock is determined by the type of row lock
* given in the query. If the type of row lock is either FOR NO KEY UPDATE or
* FOR UPDATE we get ExclusiveLock on shard resource. We get ShareLock if it
* is FOR KEY SHARE or FOR KEY SHARE.
*
* We have selected these lock types according to conflict table given in the
* Postgres documentation. It is given that FOR UPDATE and FOR NO KEY UPDATE
* must be conflict with each other modify command. By getting ExlcusiveLock
* we guarantee that. Note that, getting ExclusiveLock does not mimic the
* behaviour of Postgres exactly. Getting row lock with FOR NO KEY UPDATE and
* FOR KEY SHARE do not conflict in Postgres, yet they block each other in
* our implementation. Since FOR SHARE and FOR KEY SHARE does not conflict
* with each other but conflicts with modify commands, we get ShareLock for
* them.
*/
RelationRowLock *relationRowLock = NULL;
foreach_declared_ptr(relationRowLock, relationRowLockList)
{
LockClauseStrength rowLockStrength = relationRowLock->rowLockStrength;
Oid relationId = relationRowLock->relationId;
if (IsCitusTableType(relationId, REFERENCE_TABLE))
{
List *shardIntervalList = LoadShardIntervalList(relationId);
if (rowLockStrength == LCS_FORKEYSHARE || rowLockStrength == LCS_FORSHARE)
{
rowLockMode = ShareLock;
}
else if (rowLockStrength == LCS_FORNOKEYUPDATE ||
rowLockStrength == LCS_FORUPDATE)
{
rowLockMode = ExclusiveLock;
}
SerializeNonCommutativeWrites(shardIntervalList, rowLockMode);
}
}
}
/*
* LockPartitionsInRelationList iterates over given list and acquires locks on
* partitions of each partitioned table. It does nothing for non-partitioned tables.
*/
void
LockPartitionsInRelationList(List *relationIdList, LOCKMODE lockmode)
{
Oid relationId = InvalidOid;
foreach_declared_oid(relationId, relationIdList)
{
if (PartitionedTable(relationId))
{
LockPartitionRelations(relationId, lockmode);
}
}
}
/*
* LockPartitionRelations acquires relation lock on all partitions of given
* partitioned relation. This function expects that given relation is a
* partitioned relation.
*/
void
LockPartitionRelations(Oid relationId, LOCKMODE lockMode)
{
/*
* PartitionList function generates partition list in the same order
* as PostgreSQL. Therefore we do not need to sort it before acquiring
* locks.
*/
List *partitionList = PartitionList(relationId);
Oid partitionRelationId = InvalidOid;
foreach_declared_oid(partitionRelationId, partitionList)
{
LockRelationOid(partitionRelationId, lockMode);
}
}
/*
* LockPartitionsForDistributedPlan ensures commands take locks on all partitions
* of a distributed table that appears in the query. We do this primarily out of
* consistency with PostgreSQL locking.
*/
void
LockPartitionsForDistributedPlan(DistributedPlan *plan)
{
if (TaskListModifiesDatabase(plan->modLevel, plan->workerJob->taskList))
{
Oid targetRelationId = plan->targetRelationId;
LockPartitionsInRelationList(list_make1_oid(targetRelationId), RowExclusiveLock);
}
/*
* Lock partitions of tables that appear in a SELECT or subquery. In the
* DML case this also includes the target relation, but since we already
* have a stronger lock this doesn't do any harm.
*/
LockPartitionsInRelationList(plan->relationIdList, AccessShareLock);
}