Do not warn unncessarily when a node is removed

In the past (pre-11), we allowed removing worker nodes
that had active placements for replicated distributed
table, without even checking if there are any other
replicas of the same placement.

However, with #5469, we prevent disabling nodes via a hard
error when there is the last active placement of shard, as we
do for reference tables. Note that otherwise, we'd allow
users to lose data.

As of today, the NOTICE is completely irrelevant.
pull/5912/head
Onder Kalaci 2022-05-16 10:05:29 +02:00
parent b4dbd84743
commit 127450466e
11 changed files with 44 additions and 77 deletions

View File

@ -453,8 +453,7 @@ citus_disable_node(PG_FUNCTION_ARGS)
{
text *nodeNameText = PG_GETARG_TEXT_P(0);
int32 nodePort = PG_GETARG_INT32(1);
bool forceDisableNode = PG_GETARG_BOOL(2);
bool synchronousDisableNode = PG_GETARG_BOOL(3);
bool synchronousDisableNode = PG_GETARG_BOOL(2);
char *nodeName = text_to_cstring(nodeNameText);
WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);
@ -465,8 +464,10 @@ citus_disable_node(PG_FUNCTION_ARGS)
"isactive");
WorkerNode *firstWorkerNode = GetFirstPrimaryWorkerNode();
if (!forceDisableNode && firstWorkerNode &&
firstWorkerNode->nodeId == workerNode->nodeId)
bool disablingFirstNode =
(firstWorkerNode && firstWorkerNode->nodeId == workerNode->nodeId);
if (disablingFirstNode && !synchronousDisableNode)
{
/*
* We sync metadata async and optionally in the background worker,
@ -480,21 +481,21 @@ citus_disable_node(PG_FUNCTION_ARGS)
* possibility of diverged shard placements for the same shard.
*
* To prevent that, we currently do not allow disabling the first
* worker node.
* worker node unless it is explicitly opted synchronous.
*/
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("disabling the first worker node in the "
"metadata is not allowed"),
errhint("You can force disabling node, SELECT "
"citus_disable_node('%s', %d, force:=true, "
"synchronous:=true); "
"Passing synchronous:=false might cause replicated shards "
"to diverge.", workerNode->workerName, nodePort),
"citus_disable_node('%s', %d, "
"synchronous:=true);", workerNode->workerName,
nodePort),
errdetail("Citus uses the first worker node in the "
"metadata for certain internal operations when "
"replicated tables are modified. Synchronous mode "
"ensures the first worker node is accurately "
"visible to all nodes.")));
"ensures that all nodes have the same view of the "
"first worker node, which is used for certain "
"locking operations.")));
}
/*
@ -513,20 +514,6 @@ citus_disable_node(PG_FUNCTION_ARGS)
* for any given shard.
*/
ErrorIfNodeContainsNonRemovablePlacements(workerNode);
bool onlyConsiderActivePlacements = false;
if (NodeGroupHasShardPlacements(workerNode->groupId,
onlyConsiderActivePlacements))
{
ereport(NOTICE, (errmsg(
"Node %s:%d has active shard placements. Some queries "
"may fail after this operation. Use "
"SELECT citus_activate_node('%s', %d) to activate this "
"node back.",
workerNode->workerName, nodePort,
workerNode->workerName,
nodePort)));
}
}
TransactionModifiedNodeMetadata = true;
@ -584,7 +571,8 @@ citus_disable_node(PG_FUNCTION_ARGS)
/*
* BlockDistributedQueriesOnMetadataNodes blocks all the
* BlockDistributedQueriesOnMetadataNodes blocks all the modification queries on
* all nodes. Hence, should be used with caution.
*/
static void
BlockDistributedQueriesOnMetadataNodes(void)
@ -592,36 +580,20 @@ BlockDistributedQueriesOnMetadataNodes(void)
/* first, block on the coordinator */
LockRelationOid(DistNodeRelationId(), ExclusiveLock);
List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock);
WorkerNode *workerNodeToLock = NULL;
foreach_ptr(workerNodeToLock, workerList)
{
if (!workerNodeToLock->hasMetadata)
{
/* non-metadata workers cannot run distributed queries, so skip */
continue;
}
/*
* Note that we might re-design this lock to be more granular than
* pg_dist_node, scoping only for modifications on the replicated
* tables. However, we currently do not have any such mechanism and
* given that citus_disable_node() runs instantly, it seems acceptable
* to block reads (or modifications on non-replicated tables) for
* a while.
*/
/*
* Note that we might re-design this lock to be more granular than
* pg_dist_node, scoping only for modifications on the replicated
* tables. However, we currently do not have any such mechanism and
* given that citus_disable_node() runs instantly, it seems acceptable
* to block reads (or modifications on non-replicated tables) for
* a while.
*/
/* only superuser can disable node */
Assert(superuser());
/* only superuser can disable node */
Assert(superuser());
List *commandList =
list_make1("LOCK TABLE pg_dist_node IN EXCLUSIVE MODE;");
SendMetadataCommandListToWorkerInCoordinatedTransaction(
workerNodeToLock->workerName,
workerNodeToLock->workerPort,
CurrentUserName(),
commandList);
}
SendCommandToWorkersWithMetadata(
"LOCK TABLE pg_catalog.pg_dist_node IN EXCLUSIVE MODE;");
}

View File

@ -1,9 +1,9 @@
DROP FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, force bool);
CREATE FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, force bool default false, synchronous bool default false)
CREATE FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, synchronous bool default false)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_disable_node$$;
COMMENT ON FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, force bool, synchronous bool)
COMMENT ON FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, synchronous bool)
IS 'removes node from the cluster temporarily';
REVOKE ALL ON FUNCTION pg_catalog.citus_disable_node(text,int, bool, bool) FROM PUBLIC;
REVOKE ALL ON FUNCTION pg_catalog.citus_disable_node(text,int, bool) FROM PUBLIC;

View File

@ -1,9 +1,9 @@
DROP FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, force bool);
CREATE FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, force bool default false, synchronous bool default false)
CREATE FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, synchronous bool default false)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_disable_node$$;
COMMENT ON FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, force bool, synchronous bool)
COMMENT ON FUNCTION pg_catalog.citus_disable_node(nodename text, nodeport integer, synchronous bool)
IS 'removes node from the cluster temporarily';
REVOKE ALL ON FUNCTION pg_catalog.citus_disable_node(text,int, bool, bool) FROM PUBLIC;
REVOKE ALL ON FUNCTION pg_catalog.citus_disable_node(text,int, bool) FROM PUBLIC;

View File

@ -54,7 +54,6 @@ ORDER BY placementid;
(2 rows)
SELECT citus_disable_node('localhost', :worker_2_proxy_port, true);
NOTICE: Node localhost:xxxxx has active shard placements. Some queries may fail after this operation. Use SELECT citus_activate_node('localhost', 9060) to activate this node back.
citus_disable_node
---------------------------------------------------------------------

View File

@ -132,9 +132,9 @@ SELECT 1 FROM citus_activate_node('localhost', :worker_2_port);
-- disable node with sync/force options
SELECT citus_disable_node('localhost', :worker_1_port);
ERROR: disabling the first worker node in the metadata is not allowed
DETAIL: Citus uses the first worker node in the metadata for certain internal operations when replicated tables are modified. Synchronous mode ensures the first worker node is accurately visible to all nodes.
HINT: You can force disabling node, SELECT citus_disable_node('localhost', 57637, force:=true, synchronous:=true); Passing synchronous:=false might cause replicated shards to diverge.
SELECT citus_disable_node('localhost', :worker_1_port, force:=true, synchronous:=true);
DETAIL: Citus uses the first worker node in the metadata for certain internal operations when replicated tables are modified. Synchronous mode ensures that all nodes have the same view of the first worker node, which is used for certain locking operations.
HINT: You can force disabling node, SELECT citus_disable_node('localhost', 57637, synchronous:=true);
SELECT citus_disable_node('localhost', :worker_1_port, synchronous:=true);
citus_disable_node
---------------------------------------------------------------------
@ -265,7 +265,7 @@ GRANT EXECUTE ON FUNCTION master_activate_node(text,int) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_add_inactive_node(text,int,int,noderole,name) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_add_node(text,int,int,noderole,name) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_add_secondary_node(text,int,text,int,name) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION citus_disable_node(text,int,bool,bool) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION citus_disable_node(text,int,bool) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION citus_disable_node_and_wait(text,int,bool) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_remove_node(text,int) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_update_node(int,text,int,bool,int) TO node_metadata_user;

View File

@ -1034,12 +1034,10 @@ SELECT * FROM multi_extension.print_extension_changes();
-- Snapshot of state at 11.0-2
ALTER EXTENSION citus UPDATE TO '11.0-2';
SELECT * FROM multi_extension.print_extension_changes();
previous_object | current_object
previous_object | current_object
---------------------------------------------------------------------
function citus_disable_node(text,integer,boolean) void |
| function citus_disable_node(text,integer,boolean,boolean) void
| function citus_is_coordinator() boolean
(3 rows)
| function citus_is_coordinator() boolean
(1 row)
-- Snapshot of state at 11.1-1
ALTER EXTENSION citus UPDATE TO '11.1-1';

View File

@ -1780,8 +1780,8 @@ ERROR: localhost:xxxxx is a metadata node, but is out of sync
HINT: If the node is up, wait until metadata gets synced to it and try again.
SELECT citus_disable_node_and_wait('localhost', :worker_1_port);
ERROR: disabling the first worker node in the metadata is not allowed
DETAIL: Citus uses the first worker node in the metadata for certain internal operations when replicated tables are modified. Synchronous mode ensures the first worker node is accurately visible to all nodes.
HINT: You can force disabling node, SELECT citus_disable_node('localhost', 57637, force:=true, synchronous:=true); Passing synchronous:=false might cause replicated shards to diverge.
DETAIL: Citus uses the first worker node in the metadata for certain internal operations when replicated tables are modified. Synchronous mode ensures that all nodes have the same view of the first worker node, which is used for certain locking operations.
HINT: You can force disabling node, SELECT citus_disable_node('localhost', 57637, synchronous:=true);
CONTEXT: SQL statement "SELECT pg_catalog.citus_disable_node(nodename, nodeport, force)"
PL/pgSQL function citus_disable_node_and_wait(text,integer,boolean) line XX at PERFORM
SELECT citus_disable_node_and_wait('localhost', :worker_2_port);

View File

@ -642,7 +642,6 @@ SELECT verify_metadata('localhost', :worker_1_port),
-- Don't drop the reference table so it has shards on the nodes being disabled
DROP TABLE dist_table_1, dist_table_2;
SELECT pg_catalog.citus_disable_node('localhost', :worker_2_port);
NOTICE: Node localhost:xxxxx has active shard placements. Some queries may fail after this operation. Use SELECT citus_activate_node('localhost', 57638) to activate this node back.
citus_disable_node
---------------------------------------------------------------------

View File

@ -20,7 +20,6 @@ INSERT INTO replicated SELECT i,i FROM generate_series(0,10)i;
INSERT INTO ref SELECT i,i FROM generate_series(0,10)i;
-- should be successfully disable node
SELECT citus_disable_node('localhost', :worker_2_port, true);
NOTICE: Node localhost:xxxxx has active shard placements. Some queries may fail after this operation. Use SELECT citus_activate_node('localhost', 57638) to activate this node back.
citus_disable_node
---------------------------------------------------------------------

View File

@ -47,7 +47,7 @@ ORDER BY 1;
function citus_coordinator_nodeid()
function citus_copy_shard_placement(bigint,text,integer,text,integer,boolean,citus.shard_transfer_mode)
function citus_create_restore_point(text)
function citus_disable_node(text,integer,boolean,boolean)
function citus_disable_node(text,integer,boolean)
function citus_dist_local_group_cache_invalidate()
function citus_dist_node_cache_invalidate()
function citus_dist_object_cache_invalidate()

View File

@ -58,7 +58,7 @@ SELECT 1 FROM citus_activate_node('localhost', :worker_2_port);
-- disable node with sync/force options
SELECT citus_disable_node('localhost', :worker_1_port);
SELECT citus_disable_node('localhost', :worker_1_port, force:=true, synchronous:=true);
SELECT citus_disable_node('localhost', :worker_1_port, synchronous:=true);
SELECT run_command_on_workers($$SELECT array_agg(isactive ORDER BY nodeport) FROM pg_dist_node WHERE hasmetadata and noderole='primary'::noderole AND nodecluster='default'$$);
SELECT 1 FROM citus_activate_node('localhost', :worker_1_port);
@ -112,7 +112,7 @@ GRANT EXECUTE ON FUNCTION master_activate_node(text,int) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_add_inactive_node(text,int,int,noderole,name) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_add_node(text,int,int,noderole,name) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_add_secondary_node(text,int,text,int,name) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION citus_disable_node(text,int,bool,bool) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION citus_disable_node(text,int,bool) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION citus_disable_node_and_wait(text,int,bool) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_remove_node(text,int) TO node_metadata_user;
GRANT EXECUTE ON FUNCTION master_update_node(int,text,int,bool,int) TO node_metadata_user;