From 684b4c6b9666954bdf13c9eb597390eb71a66d48 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 31 Jan 2025 14:56:33 +0300 Subject: [PATCH] Release RowExclusiveLock on pg_dist_transaction as soon as remote xacts are recovered As of this commit, after recovering the remote transactions, now we release the lock on pg_dist_transaction while closing it to avoid deadlocks that might occur because of trying to acquire a lock on pg_dist_authinfo while holding a lock on pg_dist_transaction. Such a scenario can only cause a deadlock if another transaction is trying to acquire a strong lock on pg_dist_transaction while holding a lock on pg_dist_authinfo. As of today, we (implicitly) acquire a strong lock on pg_dist_transaction only when upgrading Citus to 11.3-1 and this happens when creating a REPLICA IDENTITY on pg_dist_transaction. And regardless of the code-path we are in, it should be okay to release the lock there because all we do after that point is to abort the prepared transactions that are not part of an in-progress distributed transaction and releasing the lock before doing so should be just fine. This also changes the blocking behavior between citus_create_restore_point and the transaction recovery code-path in the sense that now citus_create_restore_point doesn't until transaction recovery completes aborting the prepared transactions that are not part of an in-progress distributed transaction. However, this should be fine because even before this was possible, e.g., if transaction recovery fails to open a remote connection to a node. --- .../transaction/transaction_recovery.c | 18 +++++++++++++++++- .../isolation_create_restore_point.out | 7 +++---- .../spec/isolation_create_restore_point.spec | 5 ++++- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/backend/distributed/transaction/transaction_recovery.c b/src/backend/distributed/transaction/transaction_recovery.c index f25823b30..c114a2ed3 100644 --- a/src/backend/distributed/transaction/transaction_recovery.c +++ b/src/backend/distributed/transaction/transaction_recovery.c @@ -347,7 +347,23 @@ RecoverWorkerTransactions(WorkerNode *workerNode) } systable_endscan(scanDescriptor); - table_close(pgDistTransaction, NoLock); + + /* + * Here we release the lock on pg_dist_transaction while closing it to avoid + * deadlocks that might occur because of trying to acquire a lock on + * pg_dist_authinfo while holding a lock on pg_dist_transaction. Such a scenario + * can only cause a deadlock if another transaction is trying to acquire a strong + * lock on pg_dist_transaction while holding a lock on pg_dist_authinfo. As of + * today, we (implicitly) acquire a strong lock on pg_dist_transaction only when + * upgrading Citus to 11.3-1 and this happens when creating a REPLICA IDENTITY on + * pg_dist_transaction. + * + * And reglardless of the code-path we are in, it should be okay to release the + * lock now because all we do after this point is to abort the prepared + * transactions that are not part of an in-progress distributed transaction and + * releasing the lock before doing so should be just fine. + */ + table_close(pgDistTransaction, RowExclusiveLock); if (!recoveryFailed) { diff --git a/src/test/regress/expected/isolation_create_restore_point.out b/src/test/regress/expected/isolation_create_restore_point.out index 3b1bdf9eb..dce15a35d 100644 --- a/src/test/regress/expected/isolation_create_restore_point.out +++ b/src/test/regress/expected/isolation_create_restore_point.out @@ -147,16 +147,15 @@ recover_prepared_transactions step s2-create-restore: SELECT 1 FROM citus_create_restore_point('citus-test'); - -step s1-commit: - COMMIT; -step s2-create-restore: <... completed> ?column? --------------------------------------------------------------------- 1 (1 row) +step s1-commit: + COMMIT; + starting permutation: s1-begin s1-drop s2-create-restore s1-commit create_reference_table diff --git a/src/test/regress/spec/isolation_create_restore_point.spec b/src/test/regress/spec/isolation_create_restore_point.spec index 2cdc66f85..c62a64a44 100644 --- a/src/test/regress/spec/isolation_create_restore_point.spec +++ b/src/test/regress/spec/isolation_create_restore_point.spec @@ -154,7 +154,10 @@ permutation "s1-begin" "s1-ddl" "s2-create-restore" "s1-commit" // verify that citus_create_restore_point is not blocked by concurrent COPY (only commit) permutation "s1-begin" "s1-copy" "s2-create-restore" "s1-commit" -// verify that citus_create_restore_point is blocked by concurrent recover_prepared_transactions +// verify that citus_create_restore_point is partially blocked by concurrent recover_prepared_transactions. +// In the test output, we won't be able to explicitly observe this since +// recover_prepared_transactions unblocks citus_create_restore_point after in-progress prepared transactions +// are recovered. permutation "s1-begin" "s1-recover" "s2-create-restore" "s1-commit" // verify that citus_create_restore_point is blocked by concurrent DROP TABLE