mirror of https://github.com/citusdata/citus.git
Do not acquire locks on reference tables when a node is removed/disabled
Before this commit, we acquired the metadata locks on the reference tables while removing/disabling a node on all the MX nodes. Although this has some marginal benefits — for example, a concurrent modification during remove/disable node blocks instead of erroring out — the drawbacks seem worse. Both citus_remove_node and citus_disable_node are not tolerant to multiple node failures. With this commit, we relax the locks. The implication is that while a node is removed/disabled, users might see query errors. On the other hand, this change makes removing/disabling nodes more tolerant to multiple node failures.
pull/5467/head
parent
76b8006a9e
commit
b4931f7345
|
@ -434,7 +434,6 @@ void
|
|||
DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId)
|
||||
{
|
||||
List *referenceTableList = CitusTableTypeIdList(REFERENCE_TABLE);
|
||||
List *referenceShardIntervalList = NIL;
|
||||
|
||||
/* if there are no reference tables, we do not need to do anything */
|
||||
if (list_length(referenceTableList) == 0)
|
||||
|
@ -442,18 +441,6 @@ DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId)
|
|||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We sort the reference table list to prevent deadlocks in concurrent
|
||||
* DeleteAllReferenceTablePlacementsFromNodeGroup calls.
|
||||
*/
|
||||
referenceTableList = SortList(referenceTableList, CompareOids);
|
||||
if (ClusterHasKnownMetadataWorkers())
|
||||
{
|
||||
referenceShardIntervalList = GetSortedReferenceShardIntervals(referenceTableList);
|
||||
|
||||
BlockWritesToShardList(referenceShardIntervalList);
|
||||
}
|
||||
|
||||
StringInfo deletePlacementCommand = makeStringInfo();
|
||||
Oid referenceTableId = InvalidOid;
|
||||
foreach_oid(referenceTableId, referenceTableList)
|
||||
|
|
|
@ -682,9 +682,13 @@ SELECT wait_until_metadata_sync(30000);
|
|||
|
||||
-- set metadatasynced so we try porpagating metadata changes
|
||||
UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
|
||||
-- should error out
|
||||
-- should not error out, master_disable_node is tolerant for node failures
|
||||
SELECT 1 FROM master_disable_node('localhost', 1);
|
||||
ERROR: Disabling localhost:xxxxx failed
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
1
|
||||
(1 row)
|
||||
|
||||
-- try again after stopping metadata sync
|
||||
SELECT stop_metadata_sync_to_node('localhost', 1);
|
||||
NOTICE: dropping metadata on the node (localhost,1)
|
||||
|
|
|
@ -301,7 +301,7 @@ SELECT wait_until_metadata_sync(30000);
|
|||
-- set metadatasynced so we try porpagating metadata changes
|
||||
UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
|
||||
|
||||
-- should error out
|
||||
-- should not error out, master_disable_node is tolerant for node failures
|
||||
SELECT 1 FROM master_disable_node('localhost', 1);
|
||||
|
||||
-- try again after stopping metadata sync
|
||||
|
@ -316,7 +316,6 @@ SELECT wait_until_metadata_sync(30000);
|
|||
SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
|
||||
SELECT verify_metadata('localhost', :worker_1_port);
|
||||
|
||||
|
||||
------------------------------------------------------------------------------------
|
||||
-- Test master_disable_node() when the other node is down
|
||||
------------------------------------------------------------------------------------
|
||||
|
|
Loading…
Reference in New Issue