mirror of https://github.com/citusdata/citus.git
Do not obtain AccessShareLock before actual lock (#5965)
Do not obtain AccessShareLock before acquiring the distributed locks. Acquiring an AccessShareLock ensures that the relations which we are trying to get a distributed lock on will not be dropped in the time between when the LOCK command is issued and the LOCK commands are send to the worker. However, this also leads to distributed deadlocks in such scenarios: ```sql -- for dist lock acquiring order coor, w1, w2 -- on w2 LOCK t1 IN ACCESS EXLUSIVE MODE; -- acquire AccessShareLock locally on t1 to ensure it is not dropped while we get ready to distribute the lock -- concurrently on w1 LOCK t1 IN ACCESS EXLUSIVE MODE; -- acquire AccessShareLock locally on t1 to ensure it is not dropped while we get ready to distribute the lock -- acquire dist lock on coor, w1, gets blocked on local AccessShareLock on w2 -- on w2 continuation of the execution above -- starts to acquire dist locks and gets blocked on the coor by the lock acquired by w1 -- distributed deadlock ``` We opt for avoiding such deadlocks with the cost of the possibility of running into errors when the relations on which we are trying to acquire locks on get dropped.pull/5966/head
parent
2a768176c4
commit
27ddb4fc8e
|
@ -1281,8 +1281,8 @@ ActivateNode(char *nodeName, int nodePort)
|
|||
{
|
||||
bool isActive = true;
|
||||
|
||||
WorkerNode *workreNode = ModifiableWorkerNode(nodeName, nodePort);
|
||||
ActivateNodeList(list_make1(workreNode));
|
||||
WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);
|
||||
ActivateNodeList(list_make1(workerNode));
|
||||
|
||||
/* finally, let all other active metadata nodes to learn about this change */
|
||||
WorkerNode *newWorkerNode = SetNodeState(nodeName, nodePort, isActive);
|
||||
|
|
|
@ -1339,8 +1339,7 @@ AcquireDistributedLockOnRelations(List *relationList, LOCKMODE lockMode, uint32
|
|||
RangeVar *rangeVar = NULL;
|
||||
foreach_ptr(rangeVar, relationList)
|
||||
{
|
||||
Oid relationId = RangeVarGetRelidExtended(rangeVar, AccessShareLock, nowait ?
|
||||
RVR_NOWAIT : 0, NULL, NULL);
|
||||
Oid relationId = RangeVarGetRelid(rangeVar, NoLock, false);
|
||||
|
||||
LockRelationRecord *lockRelationRecord = MakeLockRelationRecord(relationId,
|
||||
rangeVar->inh);
|
||||
|
|
|
@ -154,7 +154,7 @@ step coor-begin:
|
|||
step coor-acquire-aggresive-lock-on-dist-table-nowait:
|
||||
LOCK dist_table IN ACCESS EXCLUSIVE MODE NOWAIT;
|
||||
|
||||
ERROR: could not obtain lock on relation "dist_table"
|
||||
ERROR: could not obtain lock on relation "public.dist_table"
|
||||
step coor-rollback:
|
||||
ROLLBACK;
|
||||
|
||||
|
@ -1086,3 +1086,106 @@ restore_isolation_tester_func
|
|||
|
||||
(1 row)
|
||||
|
||||
|
||||
starting permutation: coor-begin coor-read-dist-table w2-start-session-level-connection w2-begin w1-start-session-level-connection w1-begin w2-acquire-aggressive-lock-dist-table w1-acquire-aggressive-lock-dist-table coor-rollback w2-rollback w1-rollback w1-stop-connection w2-stop-connection
|
||||
step coor-begin:
|
||||
BEGIN;
|
||||
|
||||
step coor-read-dist-table:
|
||||
SELECT COUNT(*) FROM dist_table;
|
||||
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
step w2-start-session-level-connection:
|
||||
SELECT start_session_level_connection_to_node('localhost', 57638);
|
||||
|
||||
start_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w2-begin:
|
||||
SELECT run_commands_on_session_level_connection_to_node('BEGIN');
|
||||
|
||||
run_commands_on_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w1-start-session-level-connection:
|
||||
SELECT start_session_level_connection_to_node('localhost', 57637);
|
||||
|
||||
start_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w1-begin:
|
||||
SELECT run_commands_on_session_level_connection_to_node('BEGIN');
|
||||
|
||||
run_commands_on_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w2-acquire-aggressive-lock-dist-table:
|
||||
SELECT run_commands_on_session_level_connection_to_node('LOCK dist_table IN ACCESS EXCLUSIVE MODE');
|
||||
<waiting ...>
|
||||
step w1-acquire-aggressive-lock-dist-table:
|
||||
SELECT run_commands_on_session_level_connection_to_node('LOCK dist_table IN ACCESS EXCLUSIVE MODE');
|
||||
<waiting ...>
|
||||
step coor-rollback:
|
||||
ROLLBACK;
|
||||
|
||||
step w2-acquire-aggressive-lock-dist-table: <... completed>
|
||||
run_commands_on_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w2-rollback:
|
||||
SELECT run_commands_on_session_level_connection_to_node('ROLLBACK');
|
||||
|
||||
run_commands_on_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w1-acquire-aggressive-lock-dist-table: <... completed>
|
||||
run_commands_on_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w1-rollback:
|
||||
SELECT run_commands_on_session_level_connection_to_node('ROLLBACK');
|
||||
|
||||
run_commands_on_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w1-stop-connection:
|
||||
SELECT stop_session_level_connection_to_node();
|
||||
|
||||
stop_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
step w2-stop-connection:
|
||||
SELECT stop_session_level_connection_to_node();
|
||||
|
||||
stop_session_level_connection_to_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
restore_isolation_tester_func
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
|
|
|
@ -251,13 +251,18 @@ step s2-truncate:
|
|||
step s1-commit:
|
||||
COMMIT;
|
||||
|
||||
s2: WARNING: relation "public.dist_table" does not exist
|
||||
step s2-truncate: <... completed>
|
||||
ERROR: failure on connection marked as essential: localhost:xxxxx
|
||||
step s2-select:
|
||||
SELECT * FROM dist_table ORDER BY 1, 2;
|
||||
|
||||
a|b
|
||||
---------------------------------------------------------------------
|
||||
(0 rows)
|
||||
1|2
|
||||
3|4
|
||||
5|6
|
||||
(3 rows)
|
||||
|
||||
restore_isolation_tester_func
|
||||
---------------------------------------------------------------------
|
||||
|
|
|
@ -240,3 +240,4 @@ permutation "coor-begin" "coor-acquire-aggresive-lock-on-partitioned-table-with-
|
|||
permutation "coor-begin" "coor-acquire-aggresive-lock-on-only-partitioned-table" "w1-start-session-level-connection" "w1-begin" "w1-read-partitioned-table" "coor-rollback" "w1-rollback" "w1-stop-connection"
|
||||
permutation "coor-begin" "coor-acquire-aggresive-lock-on-only-partitioned-table" "w1-start-session-level-connection" "w1-begin" "w1-read-partition-of-partitioned-table" "coor-rollback" "w1-rollback" "w1-stop-connection"
|
||||
permutation "coor-begin" "coor-acquire-aggresive-lock-on-ref-table" "w1-start-session-level-connection" "w1-begin" "w1-read-main-view" "coor-rollback" "w1-rollback" "w1-stop-connection"
|
||||
permutation "coor-begin" "coor-read-dist-table" "w2-start-session-level-connection" "w2-begin" "w1-start-session-level-connection" "w1-begin" "w2-acquire-aggressive-lock-dist-table" "w1-acquire-aggressive-lock-dist-table" "coor-rollback" "w2-rollback" "w1-rollback" "w1-stop-connection" "w2-stop-connection"
|
||||
|
|
Loading…
Reference in New Issue