Merge pull request #5469 from citusdata/make_errors_generic

Generalize the error checks while removing a node
pull/5493/head
Önder Kalacı 2021-11-26 14:31:26 +01:00 committed by GitHub
commit 7b6588fec0
16 changed files with 299 additions and 335 deletions
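
Net effect: master_remove_node / citus_remove_node now raise one shard-level error whenever the node being removed holds the last active placement of some shard, replacing the earlier group-level and reference-table-specific checks. A minimal sketch of the two paths, assuming a two-worker cluster; the table name, port, and shard id below are illustrative only and not taken from this commit:

SET citus.shard_replication_factor TO 1;
CREATE TABLE events (id bigint, payload text);
SELECT create_distributed_table('events', 'id');

-- Fails: some shards of "events" have their only active placement on this worker.
SELECT master_remove_node('localhost', 9701);
-- ERROR: cannot remove or disable the node localhost:9701 because it contains the only shard placement for shard 102008
-- DETAIL: One of the table(s) that prevents the operation from completing successfully is public.events
-- HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables

-- Succeeds: once every placement on the node has a healthy copy on another
-- group (or the table is dropped or undistributed), removal goes through.
DROP TABLE events;
SELECT master_remove_node('localhost', 9701);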


@ -1255,26 +1255,6 @@ ShardLength(uint64 shardId)
}
/*
* NodeGroupHasLivePlacements returns true if there is any placement
* on the given node group which is not a SHARD_STATE_TO_DELETE placement.
*/
bool
NodeGroupHasLivePlacements(int32 groupId)
{
List *shardPlacements = AllShardPlacementsOnNodeGroup(groupId);
GroupShardPlacement *placement = NULL;
foreach_ptr(placement, shardPlacements)
{
if (placement->shardState != SHARD_STATE_TO_DELETE)
{
return true;
}
}
return false;
}
/*
* NodeGroupHasShardPlacements returns whether any active shards are placed on the group
*/


@ -91,8 +91,10 @@ typedef struct NodeMetadata
/* local function forward declarations */
static int ActivateNode(char *nodeName, int nodePort);
static bool CanRemoveReferenceTablePlacements(void);
static void RemoveNodeFromCluster(char *nodeName, int32 nodePort);
static void ErrorIfNodeContainsNonRemovablePlacements(WorkerNode *workerNode);
static bool PlacementHasActivePlacementOnAnotherGroup(GroupShardPlacement
*sourcePlacement);
static int AddNodeMetadata(char *nodeName, int32 nodePort, NodeMetadata
*nodeMetadata, bool *nodeAlreadyExists);
static WorkerNode * SetNodeState(char *nodeName, int32 nodePort, bool isActive);
@ -1295,35 +1297,18 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);
if (NodeIsPrimary(workerNode))
{
if (CanRemoveReferenceTablePlacements())
{
ErrorIfNodeContainsNonRemovablePlacements(workerNode);
/*
* Delete reference table placements so they are not taken into account
* for the check if there are placements after this.
*/
DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
}
if (NodeGroupHasLivePlacements(workerNode->groupId))
{
if (ActivePrimaryNodeCount() == 1 && ClusterHasReferenceTable())
{
ereport(ERROR, (errmsg(
"cannot remove the last worker node because there are reference "
"tables and it would cause data loss on reference tables"),
errhint(
"To proceed, either drop the reference tables or use "
"undistribute_table() function to convert them to local tables")));
}
ereport(ERROR, (errmsg("cannot remove the primary node of a node group "
"which has shard placements"),
errhint(
"To proceed, either drop the distributed tables or use "
"undistribute_table() function to convert them to local tables")));
}
/*
* Secondary nodes are read-only, never 2PC is used.
* Hence, no items can be inserted to pg_dist_transaction
* for secondary nodes.
*/
DeleteWorkerTransactions(workerNode);
}
@ -1341,6 +1326,65 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
}
/*
* ErrorIfNodeContainsNonRemovablePlacements throws an error if the input node
* contains at least one shard placement that is the last active placement
* of its shard.
*/
static void
ErrorIfNodeContainsNonRemovablePlacements(WorkerNode *workerNode)
{
int32 groupId = workerNode->groupId;
List *shardPlacements = AllShardPlacementsOnNodeGroup(groupId);
GroupShardPlacement *placement = NULL;
foreach_ptr(placement, shardPlacements)
{
if (!PlacementHasActivePlacementOnAnotherGroup(placement))
{
Oid relationId = RelationIdForShard(placement->shardId);
char *qualifiedRelationName = generate_qualified_relation_name(relationId);
ereport(ERROR, (errmsg("cannot remove or disable the node "
"%s:%d because because it contains "
"the only shard placement for "
"shard " UINT64_FORMAT, workerNode->workerName,
workerNode->workerPort, placement->shardId),
errdetail("One of the table(s) that prevents the operation "
"complete successfully is %s",
qualifiedRelationName),
errhint("To proceed, either drop the tables or use "
"undistribute_table() function to convert "
"them to local tables")));
}
}
}
/*
* PlacementHasActivePlacementOnAnotherGroup returns true if the input
* sourcePlacement's shard has at least one other healthy placement on another group.
*/
static bool
PlacementHasActivePlacementOnAnotherGroup(GroupShardPlacement *sourcePlacement)
{
uint64 shardId = sourcePlacement->shardId;
List *activePlacementList = ActiveShardPlacementList(shardId);
bool foundHealthyPlacementOnAnotherGroup = false;
ShardPlacement *activePlacement = NULL;
foreach_ptr(activePlacement, activePlacementList)
{
if (activePlacement->groupId != sourcePlacement->groupId)
{
foundHealthyPlacementOnAnotherGroup = true;
break;
}
}
return foundHealthyPlacementOnAnotherGroup;
}
/*
* RemoveOldShardPlacementForNodeGroup removes all old shard placements
* for the given node group from pg_dist_placement.
@ -1364,18 +1408,6 @@ RemoveOldShardPlacementForNodeGroup(int groupId)
}
/*
* CanRemoveReferenceTablePlacements returns true if active primary
* node count is more than 1, which means that even if we remove a node
* we will still have some other node that has reference table placement.
*/
static bool
CanRemoveReferenceTablePlacements(void)
{
return ActivePrimaryNodeCount() > 1;
}
/* CountPrimariesWithMetadata returns the number of primary nodes which have metadata. */
uint32
CountPrimariesWithMetadata(void)


@ -207,7 +207,6 @@ extern int ShardIntervalCount(Oid relationId);
extern List * LoadShardList(Oid relationId);
extern ShardInterval * CopyShardInterval(ShardInterval *srcInterval);
extern uint64 ShardLength(uint64 shardId);
extern bool NodeGroupHasLivePlacements(int32 groupId);
extern bool NodeGroupHasShardPlacements(int32 groupId,
bool onlyConsiderActivePlacements);
extern List * ActiveShardPlacementListOnGroup(uint64 shardId, int32 groupId);
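
At the catalog level, the placements that trigger the new error can be approximated with a query along these lines. This is a hedged sketch for illustration, not part of this commit; it assumes shardstate = 1 marks an active (healthy) placement and that :group_id is the group of the node being removed:

-- Hypothetical diagnostic query: shards whose only active placement is on :group_id.
SELECT s.logicalrelid AS table_name, p.shardid
FROM pg_dist_placement p
JOIN pg_dist_shard s USING (shardid)
WHERE p.groupid = :group_id
  AND NOT EXISTS (
        SELECT 1
        FROM pg_dist_placement other
        WHERE other.shardid = p.shardid
          AND other.groupid <> p.groupid
          AND other.shardstate = 1
  );

An empty result means every shard on the node has an active placement on another group, which is the per-placement condition PlacementHasActivePlacementOnAnotherGroup checks.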


@ -27,7 +27,7 @@ SELECT citus_add_local_table_to_metadata('citus_local_table_1');
-- try to remove coordinator and observe failure as there exists a citus local table
SELECT 1 FROM master_remove_node('localhost', :master_port);
ERROR: cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DROP TABLE citus_local_table_1;
NOTICE: executing the command locally: DROP TABLE IF EXISTS citus_local_tables_test_schema.citus_local_table_1_xxxxx CASCADE
-- this should work now as the citus local table is dropped


@ -122,10 +122,18 @@ ORDER BY placementid;
200000 | 1
(1 row)
-- master_remove_node fails when there are shards on that worker
SELECT master_remove_node('localhost', :worker_2_proxy_port);
ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
BEGIN;
-- master_remove_node succeeds because the shards on
-- worker_2_proxy_port also have healthy placements on the
-- other worker (worker_1_port)
SELECT master_remove_node('localhost', :worker_2_proxy_port);
master_remove_node
---------------------------------------------------------------------
(1 row)
ROLLBACK;
-- drop event table and re-run remove
DROP TABLE event_table;
SELECT master_remove_node('localhost', :worker_2_proxy_port);


@ -162,7 +162,7 @@ citus_add_local_table_to_metadata
step s2-remove-coordinator: SELECT master_remove_node('localhost', 57636); <waiting ...>
step s1-commit: COMMIT;
step s2-remove-coordinator: <... completed>
ERROR: cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
step s2-commit: COMMIT;
master_remove_node
---------------------------------------------------------------------


@ -369,7 +369,7 @@ step s2-commit:
COMMIT;
step s1-remove-node-2: <... completed>
ERROR: cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
step s1-show-placements:
SELECT
nodename, nodeport
@ -429,7 +429,7 @@ master_remove_node
step s2-create-table-2:
SET citus.shard_count TO 4;
SET citus.shard_replication_factor TO 2;
SET citus.shard_replication_factor TO 1;
CREATE TABLE dist_table (x int, y int);
SELECT create_distributed_table('dist_table', 'x');
<waiting ...>
@ -437,11 +437,18 @@ step s1-commit:
COMMIT;
step s2-create-table-2: <... completed>
ERROR: replication_factor (2) exceeds number of worker nodes (1)
create_distributed_table
---------------------------------------------------------------------
(1 row)
step s2-select:
SELECT * FROM dist_table;
ERROR: relation "dist_table" does not exist
x|y
---------------------------------------------------------------------
(0 rows)
master_remove_node
---------------------------------------------------------------------
@ -467,7 +474,7 @@ step s2-begin:
step s2-create-table-2:
SET citus.shard_count TO 4;
SET citus.shard_replication_factor TO 2;
SET citus.shard_replication_factor TO 1;
CREATE TABLE dist_table (x int, y int);
SELECT create_distributed_table('dist_table', 'x');
@ -483,7 +490,7 @@ step s2-commit:
COMMIT;
step s1-remove-node-2: <... completed>
ERROR: cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
step s2-select:
SELECT * FROM dist_table;
@ -596,7 +603,7 @@ step s2-commit:
COMMIT;
step s1-remove-node-2: <... completed>
ERROR: cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
step s2-select:
SELECT * FROM dist_table;


@ -28,13 +28,13 @@ step detector-dump-wait-edges:
waiting_transaction_num|blocking_transaction_num|blocking_transaction_waiting
---------------------------------------------------------------------
405| 404|f
406| 405|f
(1 row)
transactionnumber|waitingtransactionnumbers
---------------------------------------------------------------------
404|
405| 404
405|
406| 405
(2 rows)
step s1-abort:
@ -84,16 +84,16 @@ step detector-dump-wait-edges:
waiting_transaction_num|blocking_transaction_num|blocking_transaction_waiting
---------------------------------------------------------------------
409| 408|f
410| 408|f
410| 409|t
410| 409|f
411| 409|f
411| 410|t
(3 rows)
transactionnumber|waitingtransactionnumbers
---------------------------------------------------------------------
408|
409|408
410|408,409
409|
410|409
411|409,410
(3 rows)
step s1-abort:


@ -134,8 +134,9 @@ SELECT shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement WHER
-- try to remove a node with active placements and see that node removal fails
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is public.cluster_management_test
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
SELECT master_get_active_worker_nodes();
master_get_active_worker_nodes
---------------------------------------------------------------------
@ -154,8 +155,9 @@ SELECT create_reference_table('test_reference_table');
INSERT INTO test_reference_table VALUES (1, '1');
-- try to remove a node with active placements and reference tables
SELECT citus_remove_node('localhost', :worker_2_port);
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is public.cluster_management_test
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
-- try to disable a node with active placements and see that the node is removed
-- observe that a notification is displayed
SELECT master_disable_node('localhost', :worker_2_port);
@ -335,8 +337,9 @@ SELECT create_distributed_table('cluster_management_test', 'col_1', 'hash');
-- try to remove a node with active placements and see that node removal fails
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is public.cluster_management_test
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
-- mark all placements in the candidate node as inactive
SELECT groupid AS worker_2_group FROM pg_dist_node WHERE nodeport=:worker_2_port \gset
UPDATE pg_dist_placement SET shardstate=3 WHERE groupid=:worker_2_group;
@ -363,8 +366,9 @@ SELECT shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement WHER
-- try to remove a node with only inactive placements and see that removal still fails
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is public.cluster_management_test
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
SELECT master_get_active_worker_nodes();
master_get_active_worker_nodes
---------------------------------------------------------------------
@ -440,68 +444,6 @@ SELECT logicalrelid, shardid, shardstate, nodename, nodeport FROM pg_dist_shard_
cluster_management_test | 1220015 | 4 | localhost | 57638
(24 rows)
SELECT * INTO removed_placements FROM pg_dist_placement WHERE shardstate = 4;
SELECT run_command_on_workers('SELECT * INTO removed_placements FROM pg_dist_placement WHERE shardstate = 4');
run_command_on_workers
---------------------------------------------------------------------
(localhost,57637,t,"SELECT 8")
(localhost,57638,t,"SELECT 8")
(2 rows)
-- try to remove a node with only to be deleted placements and see that removal succeeds
SELECT master_remove_node('localhost', :worker_2_port);
master_remove_node
---------------------------------------------------------------------
(1 row)
SELECT master_get_active_worker_nodes();
master_get_active_worker_nodes
---------------------------------------------------------------------
(localhost,57637)
(1 row)
SELECT master_add_node('localhost', :worker_2_port, groupId := :worker_2_group);
WARNING: citus.enable_object_propagation is off, not creating distributed objects on worker
DETAIL: distributed objects are only kept in sync when citus.enable_object_propagation is set to on. Newly activated nodes will not get these objects created
WARNING: could not find any shard placements for shardId 1220001
WARNING: could not find any shard placements for shardId 1220003
WARNING: could not find any shard placements for shardId 1220005
WARNING: could not find any shard placements for shardId 1220007
WARNING: could not find any shard placements for shardId 1220009
WARNING: could not find any shard placements for shardId 1220011
WARNING: could not find any shard placements for shardId 1220013
WARNING: could not find any shard placements for shardId 1220015
WARNING: could not find any shard placements for shardId 1220017
WARNING: could not find any shard placements for shardId 1220019
WARNING: could not find any shard placements for shardId 1220021
WARNING: could not find any shard placements for shardId 1220023
WARNING: could not find any shard placements for shardId 1220025
WARNING: could not find any shard placements for shardId 1220027
WARNING: could not find any shard placements for shardId 1220029
WARNING: could not find any shard placements for shardId 1220031
master_add_node
---------------------------------------------------------------------
7
(1 row)
-- put removed placements back for testing purposes (in practice we wouldn't have only old placements for a shard)
INSERT INTO pg_dist_placement SELECT * FROM removed_placements;
SELECT run_command_on_workers('INSERT INTO pg_dist_placement SELECT * FROM removed_placements');
run_command_on_workers
---------------------------------------------------------------------
(localhost,57637,f,"ERROR: duplicate key value violates unique constraint ""pg_dist_placement_placementid_index""")
(localhost,57638,t,"INSERT 0 8")
(2 rows)
DROP TABLE removed_placements;
SELECT run_command_on_workers('DROP TABLE removed_placements');
run_command_on_workers
---------------------------------------------------------------------
(localhost,57637,t,"DROP TABLE")
(localhost,57638,t,"DROP TABLE")
(2 rows)
-- clean-up
SELECT 1 FROM master_add_node('localhost', :worker_2_port);
?column?
@ -572,8 +514,9 @@ SELECT 1 FROM master_add_node('localhost', 9990, groupid => :new_group, noderole
(1 row)
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is public.cluster_management_test
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
SELECT master_remove_node('localhost', 9990);
master_remove_node
---------------------------------------------------------------------
@ -674,14 +617,14 @@ WARNING: citus.enable_object_propagation is off, not creating distributed objec
DETAIL: distributed objects are only kept in sync when citus.enable_object_propagation is set to on. Newly activated nodes will not get these objects created
master_add_node | master_add_node
---------------------------------------------------------------------
12 | 13
11 | 12
(1 row)
SELECT * FROM pg_dist_node ORDER BY nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
12 | 9 | localhost | 57637 | default | t | t | primary | default | t | t
13 | 10 | localhost | 57638 | default | t | t | primary | default | t | t
11 | 9 | localhost | 57637 | default | t | t | primary | default | t | t
12 | 10 | localhost | 57638 | default | t | t | primary | default | t | t
(2 rows)
-- check that mixed add/remove node commands work fine inside transaction
@ -881,13 +824,13 @@ SELECT 1 FROM master_add_inactive_node('localhost', 9996, groupid => :worker_2_g
SELECT master_add_inactive_node('localhost', 9999, groupid => :worker_2_group, nodecluster => 'olap', noderole => 'secondary');
master_add_inactive_node
---------------------------------------------------------------------
23
22
(1 row)
SELECT master_activate_node('localhost', 9999);
master_activate_node
---------------------------------------------------------------------
23
22
(1 row)
SELECT master_disable_node('localhost', 9999);
@ -915,17 +858,17 @@ CONTEXT: PL/pgSQL function citus_internal.pg_dist_node_trigger_func() line XX a
INSERT INTO pg_dist_node (nodename, nodeport, groupid, noderole, nodecluster)
VALUES ('localhost', 5000, 1000, 'primary', 'olap');
ERROR: new row for relation "pg_dist_node" violates check constraint "primaries_are_only_allowed_in_the_default_cluster"
DETAIL: Failing row contains (25, 1000, localhost, 5000, default, f, t, primary, olap, f, t).
DETAIL: Failing row contains (24, 1000, localhost, 5000, default, f, t, primary, olap, f, t).
UPDATE pg_dist_node SET nodecluster = 'olap'
WHERE nodeport = :worker_1_port;
ERROR: new row for relation "pg_dist_node" violates check constraint "primaries_are_only_allowed_in_the_default_cluster"
DETAIL: Failing row contains (17, 14, localhost, 57637, default, f, t, primary, olap, f, t).
DETAIL: Failing row contains (16, 14, localhost, 57637, default, f, t, primary, olap, f, t).
-- check that you /can/ add a secondary node to a non-default cluster
SELECT groupid AS worker_2_group FROM pg_dist_node WHERE nodeport = :worker_2_port \gset
SELECT master_add_node('localhost', 8888, groupid => :worker_1_group, noderole => 'secondary', nodecluster=> 'olap');
master_add_node
---------------------------------------------------------------------
26
25
(1 row)
-- check that super-long cluster names are truncated
@ -938,13 +881,13 @@ SELECT master_add_node('localhost', 8887, groupid => :worker_1_group, noderole =
);
master_add_node
---------------------------------------------------------------------
27
26
(1 row)
SELECT * FROM pg_dist_node WHERE nodeport=8887;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
27 | 14 | localhost | 8887 | default | f | t | secondary | thisisasixtyfourcharacterstringrepeatedfourtimestomake256chars. | f | t
26 | 14 | localhost | 8887 | default | f | t | secondary | thisisasixtyfourcharacterstringrepeatedfourtimestomake256chars. | f | t
(1 row)
-- don't remove the secondary and unavailable nodes, check that no commands are sent to
@ -953,13 +896,13 @@ SELECT * FROM pg_dist_node WHERE nodeport=8887;
SELECT master_add_secondary_node('localhost', 9995, 'localhost', :worker_1_port);
master_add_secondary_node
---------------------------------------------------------------------
28
27
(1 row)
SELECT master_add_secondary_node('localhost', 9994, primaryname => 'localhost', primaryport => :worker_2_port);
master_add_secondary_node
---------------------------------------------------------------------
29
28
(1 row)
SELECT master_add_secondary_node('localhost', 9993, 'localhost', 2000);
@ -967,7 +910,7 @@ ERROR: node at "localhost:xxxxx" does not exist
SELECT master_add_secondary_node('localhost', 9992, 'localhost', :worker_1_port, nodecluster => 'second-cluster');
master_add_secondary_node
---------------------------------------------------------------------
30
29
(1 row)
SELECT nodeid AS worker_1_node FROM pg_dist_node WHERE nodeport=:worker_1_port \gset
@ -987,7 +930,7 @@ SELECT master_update_node(:worker_1_node, 'somehost', 9000);
SELECT * FROM pg_dist_node WHERE nodeid = :worker_1_node;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
17 | 14 | somehost | 9000 | default | f | t | primary | default | f | t
16 | 14 | somehost | 9000 | default | f | t | primary | default | f | t
(1 row)
-- cleanup
@ -1000,7 +943,7 @@ SELECT master_update_node(:worker_1_node, 'localhost', :worker_1_port);
SELECT * FROM pg_dist_node WHERE nodeid = :worker_1_node;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
---------------------------------------------------------------------
17 | 14 | localhost | 57637 | default | f | t | primary | default | f | t
16 | 14 | localhost | 57637 | default | f | t | primary | default | f | t
(1 row)
SET client_min_messages TO ERROR;


@ -1594,11 +1594,13 @@ ERROR: Disabling localhost:xxxxx failed
DETAIL: localhost:xxxxx is a metadata node, but is out of sync
HINT: If you are using MX, try stop_metadata_sync_to_node(hostname, port) for nodes that are down before disabling them.
SELECT master_remove_node('localhost', :worker_1_port);
ERROR: localhost:xxxxx is a metadata node, but is out of sync
HINT: If the node is up, wait until metadata gets synced to it and try again.
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is mx_testing_schema.mx_test_table
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: localhost:xxxxx is a metadata node, but is out of sync
HINT: If the node is up, wait until metadata gets synced to it and try again.
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is mx_testing_schema.mx_test_table
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
-- master_update_node should succeed
SELECT nodeid AS worker_2_nodeid FROM pg_dist_node WHERE nodeport=:worker_2_port \gset
SELECT master_update_node(:worker_2_nodeid, 'localhost', 4444);


@ -214,8 +214,9 @@ WHERE colocationid IN
(1 row)
SELECT master_remove_node('localhost', :worker_1_port);
ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is public.remove_node_reference_table
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
\c - - - :worker_1_port
SELECT COUNT(*) FROM pg_dist_node WHERE nodeport = :worker_2_port;
count


@ -2118,8 +2118,9 @@ SELECT pg_reload_conf();
SET client_min_messages TO error;
-- cannot remove coordinator since a reference table exists on coordinator and no other worker nodes are added
SELECT 1 FROM master_remove_node('localhost', :master_port);
ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
ERROR: cannot remove or disable the node localhost:xxxxx because it contains the only shard placement for shard xxxxx
DETAIL: One of the table(s) that prevents the operation from completing successfully is single_node.ref
HINT: To proceed, either drop the tables or use undistribute_table() function to convert them to local tables
-- Cleanup
DROP SCHEMA single_node CASCADE;
-- Remove the coordinator again


@ -433,7 +433,7 @@ SET client_min_messages TO ERROR;
SELECT citus_activate_node('localhost', :worker_1_port);
citus_activate_node
---------------------------------------------------------------------
17
16
(1 row)
\c - - - :worker_2_port


@ -68,7 +68,7 @@ step "s2-create-table-1"
step "s2-create-table-2"
{
SET citus.shard_count TO 4;
SET citus.shard_replication_factor TO 2;
SET citus.shard_replication_factor TO 1;
CREATE TABLE dist_table (x int, y int);
SELECT create_distributed_table('dist_table', 'x');
}
@ -101,7 +101,7 @@ permutation "s1-add-node-2" "s1-begin" "s1-remove-node-2" "s2-create-table-1" "s
permutation "s1-add-node-2" "s1-begin" "s1-remove-node-2" "s2-create-table-1" "s1-abort" "s1-show-placements" "s2-select"
permutation "s1-add-node-2" "s2-begin" "s2-create-table-1" "s1-remove-node-2" "s2-commit" "s1-show-placements" "s2-select"
// session 1 removes a node, session 2 creates a distributed table with replication factor 2, should throw a sane error
// session 1 removes a node, session 2 creates a distributed table with replication factor 1, should throw a sane error
permutation "s1-add-node-2" "s1-begin" "s1-remove-node-2" "s2-create-table-2" "s1-commit" "s2-select"
permutation "s1-add-node-2" "s2-begin" "s2-create-table-2" "s1-remove-node-2" "s2-commit" "s2-select"


@ -63,8 +63,13 @@ FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- master_remove_node fails when there are shards on that worker
SELECT master_remove_node('localhost', :worker_2_proxy_port);
BEGIN;
-- master_remove_node succeeds because the shards on
-- worker_2_proxy_port also have healthy placements on the
-- other worker (worker_1_port)
SELECT master_remove_node('localhost', :worker_2_proxy_port);
ROLLBACK;
-- drop event table and re-run remove
DROP TABLE event_table;


@ -168,20 +168,6 @@ SELECT create_distributed_table('cluster_management_test_colocated', 'col_1', 'h
-- Check that colocated shards don't get created for shards that are to be deleted
SELECT logicalrelid, shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard ORDER BY shardstate, shardid;
SELECT * INTO removed_placements FROM pg_dist_placement WHERE shardstate = 4;
SELECT run_command_on_workers('SELECT * INTO removed_placements FROM pg_dist_placement WHERE shardstate = 4');
-- try to remove a node with only to be deleted placements and see that removal succeeds
SELECT master_remove_node('localhost', :worker_2_port);
SELECT master_get_active_worker_nodes();
SELECT master_add_node('localhost', :worker_2_port, groupId := :worker_2_group);
-- put removed placements back for testing purposes (in practice we wouldn't have only old placements for a shard)
INSERT INTO pg_dist_placement SELECT * FROM removed_placements;
SELECT run_command_on_workers('INSERT INTO pg_dist_placement SELECT * FROM removed_placements');
DROP TABLE removed_placements;
SELECT run_command_on_workers('DROP TABLE removed_placements');
-- clean-up
SELECT 1 FROM master_add_node('localhost', :worker_2_port);
UPDATE pg_dist_placement SET shardstate=1 WHERE groupid=:worker_2_group;