Merge pull request #3164 from citusdata/propagate_activate

Propagate isactive to metadata nodes.
Hadi Moshayedi 2019-11-15 05:57:35 -08:00 committed by GitHub
commit c8c68d719b
5 changed files with 270 additions and 29 deletions
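
In practice, this means master_disable_node() and master_activate_node() now replay the isactive flag to every metadata-synced worker, so pg_dist_node stays consistent across the cluster. A minimal psql sketch of the intended behavior (host names and ports below are illustrative, not taken from this PR):

    -- on the coordinator: add workers and sync metadata to the first one
    SELECT master_add_node('localhost', 9701);
    SELECT master_add_node('localhost', 9702);
    SELECT start_metadata_sync_to_node('localhost', 9701);

    -- disabling a node now propagates the flag change to metadata workers
    SELECT master_disable_node('localhost', 9702);

    -- on the metadata worker at 9701, the disabled node should show isactive = f
    SELECT nodename, nodeport, isactive FROM pg_dist_node ORDER BY nodeid;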


@@ -273,29 +273,51 @@ master_disable_node(PG_FUNCTION_ARGS)
 	WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);
 	bool isActive = false;
 	bool onlyConsiderActivePlacements = false;
+	MemoryContext savedContext = CurrentMemoryContext;
 
-	if (WorkerNodeIsPrimary(workerNode))
+	PG_TRY();
 	{
-		/*
-		 * Delete reference table placements so they are not taken into account
-		 * for the check if there are placements after this
-		 */
-		DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
-
-		if (NodeGroupHasShardPlacements(workerNode->groupId,
-										onlyConsiderActivePlacements))
+		if (WorkerNodeIsPrimary(workerNode))
 		{
-			ereport(NOTICE, (errmsg(
-								 "Node %s:%d has active shard placements. Some queries "
-								 "may fail after this operation. Use "
-								 "SELECT master_activate_node('%s', %d) to activate this "
-								 "node back.",
-								 workerNode->workerName, nodePort, workerNode->workerName,
-								 nodePort)));
-		}
-	}
+			/*
+			 * Delete reference table placements so they are not taken into account
+			 * for the check if there are placements after this.
+			 */
+			DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
 
-	SetNodeState(nodeName, nodePort, isActive);
+			if (NodeGroupHasShardPlacements(workerNode->groupId,
+											onlyConsiderActivePlacements))
+			{
+				ereport(NOTICE, (errmsg(
+									 "Node %s:%d has active shard placements. Some queries "
+									 "may fail after this operation. Use "
+									 "SELECT master_activate_node('%s', %d) to activate this "
+									 "node back.",
+									 workerNode->workerName, nodePort,
+									 workerNode->workerName,
+									 nodePort)));
+			}
+		}
+
+		SetNodeState(nodeName, nodePort, isActive);
+	}
+	PG_CATCH();
+	{
+		ErrorData *edata = NULL;
+
+		/* CopyErrorData() requires (CurrentMemoryContext != ErrorContext) */
+		MemoryContextSwitchTo(savedContext);
+		edata = CopyErrorData();
+
+		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						errmsg("Disabling %s:%d failed", workerNode->workerName,
+							   nodePort),
+						errdetail("%s", edata->message),
+						errhint(
+							"If you are using MX, try stop_metadata_sync_to_node(hostname, port) "
+							"for nodes that are down before disabling them.")));
+	}
+	PG_END_TRY();
 
 	PG_RETURN_VOID();
 }
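
If shipping the state change to a metadata worker fails (for example, because that worker is unreachable), the new PG_CATCH branch rethrows the error with the original message as DETAIL and a recovery hint. A hedged sketch of the recovery flow the hint suggests (host and port are placeholders):

    -- the node being disabled is down, so propagating isactive to it fails:
    SELECT master_disable_node('localhost', 9702);
    -- ERROR:  Disabling localhost:9702 failed
    -- per the hint, stop metadata sync to the down node, then retry:
    SELECT stop_metadata_sync_to_node('localhost', 9702);
    SELECT master_disable_node('localhost', 9702);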
@@ -350,7 +372,7 @@ SetUpDistributedTableDependencies(WorkerNode *newWorkerNode)
 					newWorkerNode->workerPort);
 
 	/*
-	 * Let the maintanince deamon do the hard work of syncing the metadata.
+	 * Let the maintenance daemon do the hard work of syncing the metadata.
 	 * We prefer this because otherwise node activation might fail within
 	 * transaction blocks.
 	 */
@@ -1129,8 +1151,8 @@ SetWorkerColumn(WorkerNode *workerNode, int columnIndex, Datum value)
 	{
 		case Anum_pg_dist_node_isactive:
 		{
-			metadataSyncCommand = ShouldHaveShardsUpdateCommand(workerNode->nodeId,
-																DatumGetBool(value));
+			metadataSyncCommand = NodeStateUpdateCommand(workerNode->nodeId,
+														 DatumGetBool(value));
 
 			break;
 		}
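
NodeStateUpdateCommand() builds the statement that SetWorkerColumn() then runs on all metadata workers. This diff does not show its body; as an assumption, the generated command plausibly reduces to an UPDATE of the node's pg_dist_node row, along these lines:

    -- assumed shape of the generated sync command (illustrative only):
    UPDATE pg_dist_node SET isactive = FALSE WHERE nodeid = 2;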


@@ -406,8 +406,7 @@ CreateReferenceTableColocationId()
 /*
  * DeleteAllReferenceTablePlacementsFromNodeGroup function iterates over list of reference
  * tables and deletes all reference table placements from pg_dist_placement table
- * for given group. However, it does not modify replication factor of the colocation
- * group of reference tables. It is caller's responsibility to do that if it is necessary.
+ * for given group.
  */
 void
 DeleteAllReferenceTablePlacementsFromNodeGroup(int32 groupId)
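
Expressed as plain SQL, the catalog effect of DeleteAllReferenceTablePlacementsFromNodeGroup() is roughly the sketch below. This is an illustration under assumptions, not the actual implementation: the C code works through Citus catalog APIs with locking and cache invalidation, and the sketch assumes reference tables are marked with partmethod = 'n' in pg_dist_partition.

    -- rough SQL equivalent for removing group 2's reference table placements:
    DELETE FROM pg_dist_placement p
    USING pg_dist_shard s, pg_dist_partition t
    WHERE p.shardid = s.shardid
      AND s.logicalrelid = t.logicalrelid
      AND t.partmethod = 'n'
      AND p.groupid = 2;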


@@ -210,7 +210,7 @@ SELECT nodeid, hasmetadata, metadatasynced FROM pg_dist_node;
 --------------------------------------------------------------------------
 -- Test updating a node when another node is in readonly-mode
 --------------------------------------------------------------------------
-SELECT FROM master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset
+SELECT master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset
 NOTICE:  Replicating reference table "ref_table" to the node localhost:57638
 SELECT 1 FROM start_metadata_sync_to_node('localhost', :worker_2_port);
  ?column? 
@@ -403,8 +403,166 @@ SELECT verify_metadata('localhost', :worker_1_port),
  t               | t
 (1 row)
 
+--------------------------------------------------------------------------
+-- Test that changes in isactive are propagated to the metadata nodes
+--------------------------------------------------------------------------
+-- Don't drop the reference table so it has shards on the nodes being disabled
+DROP TABLE dist_table_1, dist_table_2;
+SELECT 1 FROM master_disable_node('localhost', :worker_2_port);
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT verify_metadata('localhost', :worker_1_port);
+ verify_metadata 
+-----------------
+ t
+(1 row)
+
+SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
+NOTICE:  Replicating reference table "ref_table" to the node localhost:57638
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT verify_metadata('localhost', :worker_1_port);
+ verify_metadata 
+-----------------
+ t
+(1 row)
+
+------------------------------------------------------------------------------------
+-- Test master_disable_node() when the node that is being disabled is actually down
+------------------------------------------------------------------------------------
+SELECT master_update_node(:nodeid_2, 'localhost', 1);
+ master_update_node 
+--------------------
+ 
+(1 row)
+
+SELECT wait_until_metadata_sync();
+ wait_until_metadata_sync 
+--------------------------
+ 
+(1 row)
+
+-- set metadatasynced so we try propagating metadata changes
+UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
+-- should error out
+SELECT 1 FROM master_disable_node('localhost', 1);
+ERROR:  Disabling localhost:1 failed
+DETAIL:  connection error: localhost:1
+HINT:  If you are using MX, try stop_metadata_sync_to_node(hostname, port) for nodes that are down before disabling them.
+-- try again after stopping metadata sync
+SELECT stop_metadata_sync_to_node('localhost', 1);
+ stop_metadata_sync_to_node 
+----------------------------
+ 
+(1 row)
+
+SELECT 1 FROM master_disable_node('localhost', 1);
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT verify_metadata('localhost', :worker_1_port);
+ verify_metadata 
+-----------------
+ t
+(1 row)
+
+SELECT master_update_node(:nodeid_2, 'localhost', :worker_2_port);
+ master_update_node 
+--------------------
+ 
+(1 row)
+
+SELECT wait_until_metadata_sync();
+ wait_until_metadata_sync 
+--------------------------
+ 
+(1 row)
+
+SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
+NOTICE:  Replicating reference table "ref_table" to the node localhost:57638
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT verify_metadata('localhost', :worker_1_port);
+ verify_metadata 
+-----------------
+ t
+(1 row)
+
+------------------------------------------------------------------------------------
+-- Test master_disable_node() when the other node is down
+------------------------------------------------------------------------------------
+-- node 1 is down.
+SELECT master_update_node(:nodeid_1, 'localhost', 1);
+ master_update_node 
+--------------------
+ 
+(1 row)
+
+SELECT wait_until_metadata_sync();
+ wait_until_metadata_sync 
+--------------------------
+ 
+(1 row)
+
+-- set metadatasynced so we try propagating metadata changes
+UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
+-- should error out
+SELECT 1 FROM master_disable_node('localhost', :worker_2_port);
+ERROR:  Disabling localhost:57638 failed
+DETAIL:  connection error: localhost:1
+HINT:  If you are using MX, try stop_metadata_sync_to_node(hostname, port) for nodes that are down before disabling them.
+-- try again after stopping metadata sync
+SELECT stop_metadata_sync_to_node('localhost', 1);
+ stop_metadata_sync_to_node 
+----------------------------
+ 
+(1 row)
+
+SELECT 1 FROM master_disable_node('localhost', :worker_2_port);
+ ?column? 
+----------
+        1
+(1 row)
+
+-- bring up node 1
+SELECT master_update_node(:nodeid_1, 'localhost', :worker_1_port);
+ master_update_node 
+--------------------
+ 
+(1 row)
+
+SELECT wait_until_metadata_sync();
+ wait_until_metadata_sync 
+--------------------------
+ 
+(1 row)
+
+SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
+NOTICE:  Replicating reference table "ref_table" to the node localhost:57638
+ ?column? 
+----------
+        1
+(1 row)
+
+SELECT verify_metadata('localhost', :worker_1_port);
+ verify_metadata 
+-----------------
+ t
+(1 row)
+
 -- cleanup
-DROP TABLE dist_table_1, ref_table, dist_table_2;
+DROP TABLE ref_table;
 TRUNCATE pg_dist_colocation;
 SELECT count(*) FROM (SELECT master_remove_node(nodename, nodeport) FROM pg_dist_node) t;
  count 


@@ -14,7 +14,7 @@
 # Tests around schema changes, these are run first, so there's no preexisting objects.
 # ---
 test: multi_extension
-test: multi_mx_master_update_node
+test: multi_mx_node_metadata
 test: multi_cluster_management
 test: multi_test_helpers


@@ -121,7 +121,7 @@ SELECT nodeid, hasmetadata, metadatasynced FROM pg_dist_node;
 -- Test updating a node when another node is in readonly-mode
 --------------------------------------------------------------------------
-SELECT FROM master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset
+SELECT master_add_node('localhost', :worker_2_port) AS nodeid_2 \gset
 
 SELECT 1 FROM start_metadata_sync_to_node('localhost', :worker_2_port);
 
 -- Create a table with shards on both nodes
@@ -197,8 +197,70 @@ SELECT nodeid, hasmetadata, metadatasynced FROM pg_dist_node ORDER BY nodeid;
 SELECT verify_metadata('localhost', :worker_1_port),
        verify_metadata('localhost', :worker_2_port);
 
+--------------------------------------------------------------------------
+-- Test that changes in isactive are propagated to the metadata nodes
+--------------------------------------------------------------------------
+-- Don't drop the reference table so it has shards on the nodes being disabled
+DROP TABLE dist_table_1, dist_table_2;
+
+SELECT 1 FROM master_disable_node('localhost', :worker_2_port);
+SELECT verify_metadata('localhost', :worker_1_port);
+
+SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
+SELECT verify_metadata('localhost', :worker_1_port);
+
+------------------------------------------------------------------------------------
+-- Test master_disable_node() when the node that is being disabled is actually down
+------------------------------------------------------------------------------------
+SELECT master_update_node(:nodeid_2, 'localhost', 1);
+SELECT wait_until_metadata_sync();
+
+-- set metadatasynced so we try propagating metadata changes
+UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
+
+-- should error out
+SELECT 1 FROM master_disable_node('localhost', 1);
+
+-- try again after stopping metadata sync
+SELECT stop_metadata_sync_to_node('localhost', 1);
+SELECT 1 FROM master_disable_node('localhost', 1);
+SELECT verify_metadata('localhost', :worker_1_port);
+
+SELECT master_update_node(:nodeid_2, 'localhost', :worker_2_port);
+SELECT wait_until_metadata_sync();
+
+SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
+SELECT verify_metadata('localhost', :worker_1_port);
+
+------------------------------------------------------------------------------------
+-- Test master_disable_node() when the other node is down
+------------------------------------------------------------------------------------
+-- node 1 is down.
+SELECT master_update_node(:nodeid_1, 'localhost', 1);
+SELECT wait_until_metadata_sync();
+
+-- set metadatasynced so we try propagating metadata changes
+UPDATE pg_dist_node SET metadatasynced = TRUE WHERE nodeid IN (:nodeid_1, :nodeid_2);
+
+-- should error out
+SELECT 1 FROM master_disable_node('localhost', :worker_2_port);
+
+-- try again after stopping metadata sync
+SELECT stop_metadata_sync_to_node('localhost', 1);
+SELECT 1 FROM master_disable_node('localhost', :worker_2_port);
+
+-- bring up node 1
+SELECT master_update_node(:nodeid_1, 'localhost', :worker_1_port);
+SELECT wait_until_metadata_sync();
+
+SELECT 1 FROM master_activate_node('localhost', :worker_2_port);
+SELECT verify_metadata('localhost', :worker_1_port);
+
 -- cleanup
-DROP TABLE dist_table_1, ref_table, dist_table_2;
+DROP TABLE ref_table;
 TRUNCATE pg_dist_colocation;
 SELECT count(*) FROM (SELECT master_remove_node(nodename, nodeport) FROM pg_dist_node) t;
 
 ALTER SEQUENCE pg_catalog.pg_dist_groupid_seq RESTART :last_group_id;