From 27ddb4fc8e2aae7923f1819eac98f2a9b0ae7063 Mon Sep 17 00:00:00 2001 From: Gledis Zeneli <43916939+gledis69@users.noreply.github.com> Date: Mon, 23 May 2022 13:06:38 +0300 Subject: [PATCH] Do not obtain AccessShareLock before actual lock (#5965) Do not obtain AccessShareLock before acquiring the distributed locks. Acquiring an AccessShareLock ensures that the relations which we are trying to get a distributed lock on will not be dropped in the time between when the LOCK command is issued and the LOCK commands are send to the worker. However, this also leads to distributed deadlocks in such scenarios: ```sql -- for dist lock acquiring order coor, w1, w2 -- on w2 LOCK t1 IN ACCESS EXLUSIVE MODE; -- acquire AccessShareLock locally on t1 to ensure it is not dropped while we get ready to distribute the lock -- concurrently on w1 LOCK t1 IN ACCESS EXLUSIVE MODE; -- acquire AccessShareLock locally on t1 to ensure it is not dropped while we get ready to distribute the lock -- acquire dist lock on coor, w1, gets blocked on local AccessShareLock on w2 -- on w2 continuation of the execution above -- starts to acquire dist locks and gets blocked on the coor by the lock acquired by w1 -- distributed deadlock ``` We opt for avoiding such deadlocks with the cost of the possibility of running into errors when the relations on which we are trying to acquire locks on get dropped. --- .../distributed/metadata/node_metadata.c | 4 +- src/backend/distributed/utils/resource_lock.c | 3 +- .../isolation_acquire_distributed_locks.out | 105 +++++++++++++++++- .../expected/isolation_undistribute_table.out | 7 +- .../isolation_acquire_distributed_locks.spec | 1 + 5 files changed, 114 insertions(+), 6 deletions(-) diff --git a/src/backend/distributed/metadata/node_metadata.c b/src/backend/distributed/metadata/node_metadata.c index da0656cc2..c19148317 100644 --- a/src/backend/distributed/metadata/node_metadata.c +++ b/src/backend/distributed/metadata/node_metadata.c @@ -1281,8 +1281,8 @@ ActivateNode(char *nodeName, int nodePort) { bool isActive = true; - WorkerNode *workreNode = ModifiableWorkerNode(nodeName, nodePort); - ActivateNodeList(list_make1(workreNode)); + WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort); + ActivateNodeList(list_make1(workerNode)); /* finally, let all other active metadata nodes to learn about this change */ WorkerNode *newWorkerNode = SetNodeState(nodeName, nodePort, isActive); diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c index b9e29711e..3a4beef17 100644 --- a/src/backend/distributed/utils/resource_lock.c +++ b/src/backend/distributed/utils/resource_lock.c @@ -1339,8 +1339,7 @@ AcquireDistributedLockOnRelations(List *relationList, LOCKMODE lockMode, uint32 RangeVar *rangeVar = NULL; foreach_ptr(rangeVar, relationList) { - Oid relationId = RangeVarGetRelidExtended(rangeVar, AccessShareLock, nowait ? - RVR_NOWAIT : 0, NULL, NULL); + Oid relationId = RangeVarGetRelid(rangeVar, NoLock, false); LockRelationRecord *lockRelationRecord = MakeLockRelationRecord(relationId, rangeVar->inh); diff --git a/src/test/regress/expected/isolation_acquire_distributed_locks.out b/src/test/regress/expected/isolation_acquire_distributed_locks.out index b82ad32e6..ed51f1da6 100644 --- a/src/test/regress/expected/isolation_acquire_distributed_locks.out +++ b/src/test/regress/expected/isolation_acquire_distributed_locks.out @@ -154,7 +154,7 @@ step coor-begin: step coor-acquire-aggresive-lock-on-dist-table-nowait: LOCK dist_table IN ACCESS EXCLUSIVE MODE NOWAIT; -ERROR: could not obtain lock on relation "dist_table" +ERROR: could not obtain lock on relation "public.dist_table" step coor-rollback: ROLLBACK; @@ -1086,3 +1086,106 @@ restore_isolation_tester_func (1 row) + +starting permutation: coor-begin coor-read-dist-table w2-start-session-level-connection w2-begin w1-start-session-level-connection w1-begin w2-acquire-aggressive-lock-dist-table w1-acquire-aggressive-lock-dist-table coor-rollback w2-rollback w1-rollback w1-stop-connection w2-stop-connection +step coor-begin: + BEGIN; + +step coor-read-dist-table: + SELECT COUNT(*) FROM dist_table; + +count +--------------------------------------------------------------------- + 5 +(1 row) + +step w2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w2-begin: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w1-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57637); + +start_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w1-begin: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w2-acquire-aggressive-lock-dist-table: + SELECT run_commands_on_session_level_connection_to_node('LOCK dist_table IN ACCESS EXCLUSIVE MODE'); + +step w1-acquire-aggressive-lock-dist-table: + SELECT run_commands_on_session_level_connection_to_node('LOCK dist_table IN ACCESS EXCLUSIVE MODE'); + +step coor-rollback: + ROLLBACK; + +step w2-acquire-aggressive-lock-dist-table: <... completed> +run_commands_on_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w2-rollback: + SELECT run_commands_on_session_level_connection_to_node('ROLLBACK'); + +run_commands_on_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w1-acquire-aggressive-lock-dist-table: <... completed> +run_commands_on_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w1-rollback: + SELECT run_commands_on_session_level_connection_to_node('ROLLBACK'); + +run_commands_on_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w1-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +step w2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node +--------------------------------------------------------------------- + +(1 row) + +restore_isolation_tester_func +--------------------------------------------------------------------- + +(1 row) + diff --git a/src/test/regress/expected/isolation_undistribute_table.out b/src/test/regress/expected/isolation_undistribute_table.out index d861b5af8..4048f0f52 100644 --- a/src/test/regress/expected/isolation_undistribute_table.out +++ b/src/test/regress/expected/isolation_undistribute_table.out @@ -251,13 +251,18 @@ step s2-truncate: step s1-commit: COMMIT; +s2: WARNING: relation "public.dist_table" does not exist step s2-truncate: <... completed> +ERROR: failure on connection marked as essential: localhost:xxxxx step s2-select: SELECT * FROM dist_table ORDER BY 1, 2; a|b --------------------------------------------------------------------- -(0 rows) +1|2 +3|4 +5|6 +(3 rows) restore_isolation_tester_func --------------------------------------------------------------------- diff --git a/src/test/regress/spec/isolation_acquire_distributed_locks.spec b/src/test/regress/spec/isolation_acquire_distributed_locks.spec index 66a74bc24..eb6d51e68 100644 --- a/src/test/regress/spec/isolation_acquire_distributed_locks.spec +++ b/src/test/regress/spec/isolation_acquire_distributed_locks.spec @@ -240,3 +240,4 @@ permutation "coor-begin" "coor-acquire-aggresive-lock-on-partitioned-table-with- permutation "coor-begin" "coor-acquire-aggresive-lock-on-only-partitioned-table" "w1-start-session-level-connection" "w1-begin" "w1-read-partitioned-table" "coor-rollback" "w1-rollback" "w1-stop-connection" permutation "coor-begin" "coor-acquire-aggresive-lock-on-only-partitioned-table" "w1-start-session-level-connection" "w1-begin" "w1-read-partition-of-partitioned-table" "coor-rollback" "w1-rollback" "w1-stop-connection" permutation "coor-begin" "coor-acquire-aggresive-lock-on-ref-table" "w1-start-session-level-connection" "w1-begin" "w1-read-main-view" "coor-rollback" "w1-rollback" "w1-stop-connection" +permutation "coor-begin" "coor-read-dist-table" "w2-start-session-level-connection" "w2-begin" "w1-start-session-level-connection" "w1-begin" "w2-acquire-aggressive-lock-dist-table" "w1-acquire-aggressive-lock-dist-table" "coor-rollback" "w2-rollback" "w1-rollback" "w1-stop-connection" "w2-stop-connection"