From 550a5578d8655404ee425a8cc6ad08b3cbe9ca56 Mon Sep 17 00:00:00 2001
From: Onder Kalaci
Date: Thu, 17 Aug 2017 19:33:39 +0300
Subject: [PATCH 1/2] Skip deadlock detection on the workers

Do not run distributed deadlock detection on the worker nodes, to
prevent erroneous decisions to kill transactions caught in deadlocks.
---
 .../distributed_deadlock_detection.c         | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/backend/distributed/transaction/distributed_deadlock_detection.c b/src/backend/distributed/transaction/distributed_deadlock_detection.c
index 176a2c60b..6d8b108ed 100644
--- a/src/backend/distributed/transaction/distributed_deadlock_detection.c
+++ b/src/backend/distributed/transaction/distributed_deadlock_detection.c
@@ -101,12 +101,29 @@ check_distributed_deadlocks(PG_FUNCTION_ARGS)
 bool
 CheckForDistributedDeadlocks(void)
 {
-	WaitGraph *waitGraph = BuildGlobalWaitGraph();
-	HTAB *adjacencyLists = BuildAdjacencyListsForWaitGraph(waitGraph);
+	WaitGraph *waitGraph = NULL;
+	HTAB *adjacencyLists = NULL;
 	HASH_SEQ_STATUS status;
 	TransactionNode *transactionNode = NULL;
-	int edgeCount = waitGraph->edgeCount;
+	int edgeCount = 0;
 	int localGroupId = GetLocalGroupId();
+	List *workerNodeList = ActiveReadableNodeList();
+
+	/*
+	 * We don't need to do any distributed deadlock checking if there
+	 * are no worker nodes. It could even be problematic on a non-mx
+	 * worker node, which has the same group id as its master (i.e., 0)
+	 * and might erroneously decide to kill deadlocks happening on it.
+	 */
+	if (list_length(workerNodeList) == 0)
+	{
+		return false;
+	}
+
+	waitGraph = BuildGlobalWaitGraph();
+	adjacencyLists = BuildAdjacencyListsForWaitGraph(waitGraph);
+
+	edgeCount = waitGraph->edgeCount;
 
 	/*
 	 * We iterate on transaction nodes and search for deadlocks where the

From 20679c9e8b76e17ae282af5415dc011c9424c50c Mon Sep 17 00:00:00 2001
From: Onder Kalaci
Date: Thu, 17 Aug 2017 19:37:40 +0300
Subject: [PATCH 2/2] Relax assertion on deadlock detection to account for
 self-deadlocks

---
 .../transaction/distributed_deadlock_detection.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/backend/distributed/transaction/distributed_deadlock_detection.c b/src/backend/distributed/transaction/distributed_deadlock_detection.c
index 6d8b108ed..73bb3896c 100644
--- a/src/backend/distributed/transaction/distributed_deadlock_detection.c
+++ b/src/backend/distributed/transaction/distributed_deadlock_detection.c
@@ -152,8 +152,12 @@ CheckForDistributedDeadlocks(void)
 		TransactionNode *youngestTransaction = transactionNode;
 		ListCell *participantTransactionCell = NULL;
 
-		/* there should be at least two transactions to get into a deadlock */
-		Assert(list_length(deadlockPath) > 1);
+		/*
+		 * There should generally be at least two transactions to get into a
+		 * deadlock. However, in case Citus gets into a self-deadlock, we may
+		 * find a deadlock with a single transaction.
+		 */
+		Assert(list_length(deadlockPath) >= 1);
 
 		LogDistributedDeadlockDebugMessage("Distributed deadlock found among the "
 										   "following distributed transactions:");
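
Note (illustrative, not part of the patches): the relaxed assertion in
PATCH 2/2 admits deadlock paths containing a single transaction, i.e.,
self-deadlocks where a distributed transaction ends up waiting on itself.
The standalone C program below is a minimal sketch of that idea; the node
count and wait-for edges are hypothetical and it shares no code with Citus.
It shows how a self-edge in a wait-for graph yields a cycle of length 1,
while an ordinary deadlock yields a cycle of length 2 or more.

/*
 * Standalone sketch (hypothetical data, not Citus code): a self-edge in a
 * wait-for graph forms a cycle of length 1, which is why a deadlock path
 * may now contain a single transaction.
 */
#include <stdbool.h>
#include <stdio.h>

#define NODE_COUNT 3

/* waitsFor[a][b] is true when transaction a waits on transaction b */
static bool waitsFor[NODE_COUNT][NODE_COUNT];

/* returns the length of a cycle through start, or 0 if there is none */
static int
CycleLengthFrom(int start, int current, bool visited[], int depth)
{
	int next = 0;

	for (next = 0; next < NODE_COUNT; next++)
	{
		if (!waitsFor[current][next])
		{
			continue;
		}

		if (next == start)
		{
			/* closed the cycle after taking depth + 1 edges */
			return depth + 1;
		}

		if (!visited[next])
		{
			int length = 0;

			visited[next] = true;
			length = CycleLengthFrom(start, next, visited, depth + 1);
			if (length > 0)
			{
				return length;
			}
		}
	}

	return 0;
}

int
main(void)
{
	int node = 0;

	/* ordinary deadlock: 0 waits on 1 and 1 waits on 0 (cycle length 2) */
	waitsFor[0][1] = true;
	waitsFor[1][0] = true;

	/* self-deadlock: 2 waits on itself (cycle length 1) */
	waitsFor[2][2] = true;

	for (node = 0; node < NODE_COUNT; node++)
	{
		bool visited[NODE_COUNT] = { false };
		int length = CycleLengthFrom(node, node, visited, 0);

		if (length > 0)
		{
			printf("transaction %d is on a deadlock cycle of length %d\n",
				   node, length);
		}
	}

	return 0;
}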