From 658b8c4ec24711e27a1e098bb5eb35b29f71a1cc Mon Sep 17 00:00:00 2001 From: Hanefi Onaldi Date: Tue, 20 Jun 2023 14:47:36 +0300 Subject: [PATCH 1/3] Change worker list sort: coord before workers Before this change, we sort the list of worker nodes only by their host name and port number. This was good enough when we only had worker nodes in the metadata in the past, but now that we also have coordinator it is harder to prevent distributed deadlocks. Hence, we now make sure that the coordinator node(s) is always sorted to be in the beginning of the list of worker nodes. A DDL command that is run at the coordinator node will take locks on the coordinator, and if successful attempt to take locks on the worker nodes. Similarly, a DDL command that is run on a worker node will take the locks on itself first, and then attempt to take locks on the remaining of the workers. If the worker nodes are sorted before the coordinator node, then we can end up in a situation where the coordinator node is waiting for a lock on a worker node, while the worker node is waiting for a lock on the coordinator node. This is a distributed deadlock. --- .../distributed/operations/worker_node_manager.c | 14 +++++++++++++- src/include/distributed/worker_manager.h | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/backend/distributed/operations/worker_node_manager.c b/src/backend/distributed/operations/worker_node_manager.c index 76f2732ba..811f22731 100644 --- a/src/backend/distributed/operations/worker_node_manager.c +++ b/src/backend/distributed/operations/worker_node_manager.c @@ -108,7 +108,7 @@ ActiveReadableNodeCount(void) * NodeIsCoordinator returns true if the given node represents the coordinator. */ bool -NodeIsCoordinator(WorkerNode *node) +NodeIsCoordinator(const WorkerNode *node) { return node->groupId == COORDINATOR_GROUP_ID; } @@ -352,6 +352,9 @@ CompareWorkerNodes(const void *leftElement, const void *rightElement) * WorkerNodeCompare compares two worker nodes by their host name and port * number. Two nodes that only differ by their rack locations are considered to * be equal to each other. + * + * This function also makes sure that coordinator nodes are always considered + * lexicographically smaller than other worker nodes. */ int WorkerNodeCompare(const void *lhsKey, const void *rhsKey, Size keySize) @@ -359,6 +362,15 @@ WorkerNodeCompare(const void *lhsKey, const void *rhsKey, Size keySize) const WorkerNode *workerLhs = (const WorkerNode *) lhsKey; const WorkerNode *workerRhs = (const WorkerNode *) rhsKey; + if (NodeIsCoordinator(workerLhs)) + { + return -1; + } + if (NodeIsCoordinator(workerRhs)) + { + return 1; + } + return NodeNamePortCompare(workerLhs->workerName, workerRhs->workerName, workerLhs->workerPort, workerRhs->workerPort); } diff --git a/src/include/distributed/worker_manager.h b/src/include/distributed/worker_manager.h index 5ad7f4962..4d4c36ca2 100644 --- a/src/include/distributed/worker_manager.h +++ b/src/include/distributed/worker_manager.h @@ -95,7 +95,7 @@ extern bool NodeIsPrimaryAndRemote(WorkerNode *worker); extern bool NodeIsPrimary(WorkerNode *worker); extern bool NodeIsSecondary(WorkerNode *worker); extern bool NodeIsReadable(WorkerNode *worker); -extern bool NodeIsCoordinator(WorkerNode *node); +extern bool NodeIsCoordinator(const WorkerNode *node); extern WorkerNode * SetWorkerColumn(WorkerNode *workerNode, int columnIndex, Datum value); extern WorkerNode * SetWorkerColumnOptional(WorkerNode *workerNode, int columnIndex, Datum value); From 3bc1d1a120491889352e983cefc33ac5584c22cc Mon Sep 17 00:00:00 2001 From: Hanefi Onaldi Date: Tue, 5 Sep 2023 16:10:42 +0300 Subject: [PATCH 2/3] Fix a segmentation problem --- src/backend/distributed/metadata/metadata_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/distributed/metadata/metadata_sync.c b/src/backend/distributed/metadata/metadata_sync.c index 40bdae0ea..653ffde31 100644 --- a/src/backend/distributed/metadata/metadata_sync.c +++ b/src/backend/distributed/metadata/metadata_sync.c @@ -2750,7 +2750,7 @@ SyncNodeMetadataToNodesOptional(void) /* we fetch the same node again to check if it's synced or not */ WorkerNode *nodeUpdated = FindWorkerNode(workerNode->workerName, workerNode->workerPort); - if (!nodeUpdated->metadataSynced) + if (nodeUpdated==NULL || !nodeUpdated->metadataSynced) { /* set the result to FAILED to trigger the sync again */ result = NODE_METADATA_SYNC_FAILED_SYNC; From 6de73eb656cdb870c331b5209288932e9d377eec Mon Sep 17 00:00:00 2001 From: Hanefi Onaldi Date: Tue, 5 Sep 2023 16:12:19 +0300 Subject: [PATCH 3/3] Make citus_indent happy --- src/backend/distributed/metadata/metadata_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/distributed/metadata/metadata_sync.c b/src/backend/distributed/metadata/metadata_sync.c index 653ffde31..1631cc259 100644 --- a/src/backend/distributed/metadata/metadata_sync.c +++ b/src/backend/distributed/metadata/metadata_sync.c @@ -2750,7 +2750,7 @@ SyncNodeMetadataToNodesOptional(void) /* we fetch the same node again to check if it's synced or not */ WorkerNode *nodeUpdated = FindWorkerNode(workerNode->workerName, workerNode->workerPort); - if (nodeUpdated==NULL || !nodeUpdated->metadataSynced) + if (nodeUpdated == NULL || !nodeUpdated->metadataSynced) { /* set the result to FAILED to trigger the sync again */ result = NODE_METADATA_SYNC_FAILED_SYNC;