Do not acquire locks on reference tables when a node is removed/disabled

Before this commit, we acquired the metadata locks on the reference
tables on all the MX nodes while removing/disabling a node.

Although this has some marginal benefits, such as concurrent
modifications during remove/disable node blocking instead of erroring
out, the drawbacks seem worse: neither citus_remove_node nor
citus_disable_node is tolerant to multiple node failures.

With this commit, we relax the locks. The implication is that while
a node is being removed/disabled, users might see query errors. On the
other hand, this change makes removing/disabling nodes more
tolerant to multiple node failures.
pull/5467/head
Onder Kalaci 2021-11-16 17:24:48 +01:00
parent 76b8006a9e
commit b4931f7345
3 changed files with 7 additions and 17 deletions

View File

@ -434,7 +434,6 @@ void
DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId)
{
List *referenceTableList = CitusTableTypeIdList(REFERENCE_TABLE);
List *referenceShardIntervalList = NIL;
/* if there are no reference tables, we do not need to do anything */
if (list_length(referenceTableList) == 0)
@ -442,18 +441,6 @@ DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId)
return;
}
/*
* We sort the reference table list to prevent deadlocks in concurrent
* DeleteAllReferenceTablePlacementsFromNodeGroup calls.
*/
referenceTableList = SortList(referenceTableList, CompareOids);
if (ClusterHasKnownMetadataWorkers())
{
referenceShardIntervalList = GetSortedReferenceShardIntervals(referenceTableList);
BlockWritesToShardList(referenceShardIntervalList);
}
StringInfo deletePlacementCommand = makeStringInfo();
Oid referenceTableId = InvalidOid;
foreach_oid(referenceTableId, referenceTableList)

View File

@ -682,9 +682,13 @@ SELECT wait_until_metadata_sync(30000);
-- set metadatasynced so we try porpagating metadata changes
UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
-- should error out
-- should not error out, master_disable_node is tolerant for node failures
SELECT 1 FROM master_disable_node('localhost', 1);
ERROR: Disabling localhost:xxxxx failed
?column?
---------------------------------------------------------------------
1
(1 row)
-- try again after stopping metadata sync
SELECT stop_metadata_sync_to_node('localhost', 1);
NOTICE: dropping metadata on the node (localhost,1)

View File

@ -301,7 +301,7 @@ SELECT wait_until_metadata_sync(30000);
-- set metadatasynced so we try porpagating metadata changes
UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
-- should error out
-- should not error out, master_disable_node is tolerant for node failures
SELECT 1 FROM master_disable_node('localhost', 1);
-- try again after stopping metadata sync
@ -316,7 +316,6 @@ SELECT wait_until_metadata_sync(30000);
SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
SELECT verify_metadata('localhost', :worker_1_port);
------------------------------------------------------------------------------------
-- Test master_disable_node() when the other node is down
------------------------------------------------------------------------------------