diff --git a/src/backend/distributed/utils/node_metadata.c b/src/backend/distributed/utils/node_metadata.c index 9685f6376..18aedcb53 100644 --- a/src/backend/distributed/utils/node_metadata.c +++ b/src/backend/distributed/utils/node_metadata.c @@ -273,29 +273,51 @@ master_disable_node(PG_FUNCTION_ARGS) WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort); bool isActive = false; bool onlyConsiderActivePlacements = false; + MemoryContext savedContext = CurrentMemoryContext; - if (WorkerNodeIsPrimary(workerNode)) + PG_TRY(); { - /* - * Delete reference table placements so they are not taken into account - * for the check if there are placements after this - */ - DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId); - - if (NodeGroupHasShardPlacements(workerNode->groupId, - onlyConsiderActivePlacements)) + if (WorkerNodeIsPrimary(workerNode)) { - ereport(NOTICE, (errmsg( - "Node %s:%d has active shard placements. Some queries " - "may fail after this operation. Use " - "SELECT master_activate_node('%s', %d) to activate this " - "node back.", - workerNode->workerName, nodePort, workerNode->workerName, - nodePort))); - } - } + /* + * Delete reference table placements so they are not taken into account + * for the check if there are placements after this. + */ + DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId); - SetNodeState(nodeName, nodePort, isActive); + if (NodeGroupHasShardPlacements(workerNode->groupId, + onlyConsiderActivePlacements)) + { + ereport(NOTICE, (errmsg( + "Node %s:%d has active shard placements. Some queries " + "may fail after this operation. Use " + "SELECT master_activate_node('%s', %d) to activate this " + "node back.", + workerNode->workerName, nodePort, + workerNode->workerName, + nodePort))); + } + } + + SetNodeState(nodeName, nodePort, isActive); + } + PG_CATCH(); + { + ErrorData *edata = NULL; + + /* CopyErrorData() requires (CurrentMemoryContext != ErrorContext) */ + MemoryContextSwitchTo(savedContext); + edata = CopyErrorData(); + + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("Disabling %s:%d failed", workerNode->workerName, + nodePort), + errdetail("%s", edata->message), + errhint( + "If you are using MX, try stop_metadata_sync_to_node(hostname, port) " + "for nodes that are down before disabling them."))); + } + PG_END_TRY(); PG_RETURN_VOID(); } @@ -350,7 +372,7 @@ SetUpDistributedTableDependencies(WorkerNode *newWorkerNode) newWorkerNode->workerPort); /* - * Let the maintanince deamon do the hard work of syncing the metadata. + * Let the maintenance daemon do the hard work of syncing the metadata. * We prefer this because otherwise node activation might fail within * transaction blocks. */ @@ -1129,8 +1151,8 @@ SetWorkerColumn(WorkerNode *workerNode, int columnIndex, Datum value) { case Anum_pg_dist_node_isactive: { - metadataSyncCommand = ShouldHaveShardsUpdateCommand(workerNode->nodeId, - DatumGetBool(value)); + metadataSyncCommand = NodeStateUpdateCommand(workerNode->nodeId, + DatumGetBool(value)); break; } diff --git a/src/backend/distributed/utils/reference_table_utils.c b/src/backend/distributed/utils/reference_table_utils.c index 7e9055be5..1bd31ab8f 100644 --- a/src/backend/distributed/utils/reference_table_utils.c +++ b/src/backend/distributed/utils/reference_table_utils.c @@ -406,8 +406,7 @@ CreateReferenceTableColocationId() /* * DeleteAllReferenceTablePlacementsFromNodeGroup function iterates over list of reference * tables and deletes all reference table placements from pg_dist_placement table - * for given group. However, it does not modify replication factor of the colocation - * group of reference tables. It is caller's responsibility to do that if it is necessary. + * for given group. */ void DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId) diff --git a/src/test/regress/expected/multi_mx_master_update_node.out b/src/test/regress/expected/multi_mx_node_metadata.out similarity index 75% rename from src/test/regress/expected/multi_mx_master_update_node.out rename to src/test/regress/expected/multi_mx_node_metadata.out index d7f687186..f42130e35 100644 --- a/src/test/regress/expected/multi_mx_master_update_node.out +++ b/src/test/regress/expected/multi_mx_node_metadata.out @@ -210,7 +210,7 @@ SELECT nodeid, hasmetadata, metadatasynced FROM pg_dist_node; -------------------------------------------------------------------------- -- Test updating a node when another node is in readonly-mode -------------------------------------------------------------------------- -SELECT FROM master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset +SELECT master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset NOTICE: Replicating reference table "ref_table" to the node localhost:57638 SELECT 1 FROM start_metadata_sync_to_node('localhost', :worker_2_port); ?column? @@ -403,8 +403,166 @@ SELECT verify_metadata('localhost', :worker_1_port), t | t (1 row) +-------------------------------------------------------------------------- +-- Test that changes in isactive is propagated to the metadata nodes +-------------------------------------------------------------------------- +-- Don't drop the reference table so it has shards on the nodes being disabled +DROP TABLE dist_table_1, dist_table_2; +SELECT 1 FROM master_disable_node('localhost', :worker_2_port); + ?column? +---------- + 1 +(1 row) + +SELECT verify_metadata('localhost', :worker_1_port); + verify_metadata +----------------- + t +(1 row) + +SELECT 1 FROM master_activate_node('localhost', :worker_2_port); +NOTICE: Replicating reference table "ref_table" to the node localhost:57638 + ?column? +---------- + 1 +(1 row) + +SELECT verify_metadata('localhost', :worker_1_port); + verify_metadata +----------------- + t +(1 row) + +------------------------------------------------------------------------------------ +-- Test master_disable_node() when the node that is being disabled is actually down +------------------------------------------------------------------------------------ +SELECT master_update_node(:nodeid_2, 'localhost', 1); + master_update_node +-------------------- + +(1 row) + +SELECT wait_until_metadata_sync(); + wait_until_metadata_sync +-------------------------- + +(1 row) + +-- set metadatasynced so we try porpagating metadata changes +UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2); +-- should error out +SELECT 1 FROM master_disable_node('localhost', 1); +ERROR: Disabling localhost:1 failed +DETAIL: connection error: localhost:1 +HINT: If you are using MX, try stop_metadata_sync_to_node(hostname, port) for nodes that are down before disabling them. +-- try again after stopping metadata sync +SELECT stop_metadata_sync_to_node('localhost', 1); + stop_metadata_sync_to_node +---------------------------- + +(1 row) + +SELECT 1 FROM master_disable_node('localhost', 1); + ?column? +---------- + 1 +(1 row) + +SELECT verify_metadata('localhost', :worker_1_port); + verify_metadata +----------------- + t +(1 row) + +SELECT master_update_node(:nodeid_2, 'localhost', :worker_2_port); + master_update_node +-------------------- + +(1 row) + +SELECT wait_until_metadata_sync(); + wait_until_metadata_sync +-------------------------- + +(1 row) + +SELECT 1 FROM master_activate_node('localhost', :worker_2_port); +NOTICE: Replicating reference table "ref_table" to the node localhost:57638 + ?column? +---------- + 1 +(1 row) + +SELECT verify_metadata('localhost', :worker_1_port); + verify_metadata +----------------- + t +(1 row) + +------------------------------------------------------------------------------------ +-- Test master_disable_node() when the other node is down +------------------------------------------------------------------------------------ +-- node 1 is down. +SELECT master_update_node(:nodeid_1, 'localhost', 1); + master_update_node +-------------------- + +(1 row) + +SELECT wait_until_metadata_sync(); + wait_until_metadata_sync +-------------------------- + +(1 row) + +-- set metadatasynced so we try porpagating metadata changes +UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2); +-- should error out +SELECT 1 FROM master_disable_node('localhost', :worker_2_port); +ERROR: Disabling localhost:57638 failed +DETAIL: connection error: localhost:1 +HINT: If you are using MX, try stop_metadata_sync_to_node(hostname, port) for nodes that are down before disabling them. +-- try again after stopping metadata sync +SELECT stop_metadata_sync_to_node('localhost', 1); + stop_metadata_sync_to_node +---------------------------- + +(1 row) + +SELECT 1 FROM master_disable_node('localhost', :worker_2_port); + ?column? +---------- + 1 +(1 row) + +-- bring up node 1 +SELECT master_update_node(:nodeid_1, 'localhost', :worker_1_port); + master_update_node +-------------------- + +(1 row) + +SELECT wait_until_metadata_sync(); + wait_until_metadata_sync +-------------------------- + +(1 row) + +SELECT 1 FROM master_activate_node('localhost', :worker_2_port); +NOTICE: Replicating reference table "ref_table" to the node localhost:57638 + ?column? +---------- + 1 +(1 row) + +SELECT verify_metadata('localhost', :worker_1_port); + verify_metadata +----------------- + t +(1 row) + -- cleanup -DROP TABLE dist_table_1, ref_table, dist_table_2; +DROP TABLE ref_table; TRUNCATE pg_dist_colocation; SELECT count(*) FROM (SELECT master_remove_node(nodename, nodeport) FROM pg_dist_node) t; count diff --git a/src/test/regress/multi_mx_schedule b/src/test/regress/multi_mx_schedule index 43a2f2567..c82b45d17 100644 --- a/src/test/regress/multi_mx_schedule +++ b/src/test/regress/multi_mx_schedule @@ -14,7 +14,7 @@ # Tests around schema changes, these are run first, so there's no preexisting objects. # --- test: multi_extension -test: multi_mx_master_update_node +test: multi_mx_node_metadata test: multi_cluster_management test: multi_test_helpers diff --git a/src/test/regress/sql/multi_mx_master_update_node.sql b/src/test/regress/sql/multi_mx_node_metadata.sql similarity index 77% rename from src/test/regress/sql/multi_mx_master_update_node.sql rename to src/test/regress/sql/multi_mx_node_metadata.sql index 6b0afde97..5907b8a76 100644 --- a/src/test/regress/sql/multi_mx_master_update_node.sql +++ b/src/test/regress/sql/multi_mx_node_metadata.sql @@ -121,7 +121,7 @@ SELECT nodeid, hasmetadata, metadatasynced FROM pg_dist_node; -- Test updating a node when another node is in readonly-mode -------------------------------------------------------------------------- -SELECT FROM master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset +SELECT master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset SELECT 1 FROM start_metadata_sync_to_node('localhost', :worker_2_port); -- Create a table with shards on both nodes @@ -197,8 +197,70 @@ SELECT nodeid, hasmetadata, metadatasynced FROM pg_dist_node ORDER BY nodeid; SELECT verify_metadata('localhost', :worker_1_port), verify_metadata('localhost', :worker_2_port); +-------------------------------------------------------------------------- +-- Test that changes in isactive is propagated to the metadata nodes +-------------------------------------------------------------------------- +-- Don't drop the reference table so it has shards on the nodes being disabled +DROP TABLE dist_table_1, dist_table_2; + +SELECT 1 FROM master_disable_node('localhost', :worker_2_port); +SELECT verify_metadata('localhost', :worker_1_port); + +SELECT 1 FROM master_activate_node('localhost', :worker_2_port); +SELECT verify_metadata('localhost', :worker_1_port); + +------------------------------------------------------------------------------------ +-- Test master_disable_node() when the node that is being disabled is actually down +------------------------------------------------------------------------------------ +SELECT master_update_node(:nodeid_2, 'localhost', 1); +SELECT wait_until_metadata_sync(); + +-- set metadatasynced so we try porpagating metadata changes +UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2); + +-- should error out +SELECT 1 FROM master_disable_node('localhost', 1); + +-- try again after stopping metadata sync +SELECT stop_metadata_sync_to_node('localhost', 1); +SELECT 1 FROM master_disable_node('localhost', 1); + +SELECT verify_metadata('localhost', :worker_1_port); + +SELECT master_update_node(:nodeid_2, 'localhost', :worker_2_port); +SELECT wait_until_metadata_sync(); + +SELECT 1 FROM master_activate_node('localhost', :worker_2_port); +SELECT verify_metadata('localhost', :worker_1_port); + + +------------------------------------------------------------------------------------ +-- Test master_disable_node() when the other node is down +------------------------------------------------------------------------------------ +-- node 1 is down. +SELECT master_update_node(:nodeid_1, 'localhost', 1); +SELECT wait_until_metadata_sync(); + +-- set metadatasynced so we try porpagating metadata changes +UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2); + +-- should error out +SELECT 1 FROM master_disable_node('localhost', :worker_2_port); + +-- try again after stopping metadata sync +SELECT stop_metadata_sync_to_node('localhost', 1); +SELECT 1 FROM master_disable_node('localhost', :worker_2_port); + +-- bring up node 1 +SELECT master_update_node(:nodeid_1, 'localhost', :worker_1_port); +SELECT wait_until_metadata_sync(); + +SELECT 1 FROM master_activate_node('localhost', :worker_2_port); + +SELECT verify_metadata('localhost', :worker_1_port); + -- cleanup -DROP TABLE dist_table_1, ref_table, dist_table_2; +DROP TABLE ref_table; TRUNCATE pg_dist_colocation; SELECT count(*) FROM (SELECT master_remove_node(nodename, nodeport) FROM pg_dist_node) t; ALTER SEQUENCE pg_catalog.pg_dist_groupid_seq RESTART :last_group_id;