From dae2c69fd7901b01758fb3b8030ac101c88874a7 Mon Sep 17 00:00:00 2001
From: SaitTalhaNisanci
Date: Fri, 18 Sep 2020 15:35:59 +0300
Subject: [PATCH] Do not allow removing a single node with ref tables (#4127)

* Do not allow removing a single node with ref tables

We should not allow removing a node if it is the only node in the
cluster and there is data on it. We already have this check for
distributed tables, but we didn't have it for reference tables.

* Update src/test/regress/expected/single_node.out

Co-authored-by: Onur Tirtir

* Update src/test/regress/sql/single_node.sql

Co-authored-by: Onur Tirtir
---
 .../distributed/metadata/metadata_cache.c     | 11 +++++
 .../distributed/metadata/node_metadata.c      | 44 +++++++++++++++----
 .../operations/worker_node_manager.c          | 11 +++++
 src/include/distributed/metadata_cache.h      |  1 +
 src/include/distributed/worker_manager.h      |  1 +
 .../regress/expected/citus_local_tables.out   |  4 +-
 .../expected/failure_add_disable_node.out     |  3 +-
 .../isolation_create_citus_local_table.out    |  2 +-
 ...lation_create_table_vs_add_remove_node.out |  6 +--
 .../expected/multi_cluster_management.out     | 15 ++++---
 .../multi_remove_node_reference_table.out     |  3 ++
 src/test/regress/expected/single_node.out     |  6 +++
 .../sql/multi_remove_node_reference_table.sql |  2 +
 src/test/regress/sql/single_node.sql          |  3 ++
 14 files changed, 91 insertions(+), 21 deletions(-)

diff --git a/src/backend/distributed/metadata/metadata_cache.c b/src/backend/distributed/metadata/metadata_cache.c
index a57c752d1..58045747d 100644
--- a/src/backend/distributed/metadata/metadata_cache.c
+++ b/src/backend/distributed/metadata/metadata_cache.c
@@ -3867,6 +3867,17 @@ ReferenceTableOidList()
 }
 
 
+/*
+ * ClusterHasReferenceTable returns true if the cluster has
+ * any reference table.
+ */
+bool
+ClusterHasReferenceTable(void)
+{
+	return list_length(ReferenceTableOidList()) > 0;
+}
+
+
 /*
  * InvalidateNodeRelationCacheCallback destroys the WorkerNodeHash when
  * any change happens on pg_dist_node table. It also set WorkerNodeHash to
diff --git a/src/backend/distributed/metadata/node_metadata.c b/src/backend/distributed/metadata/node_metadata.c
index 72ea11d18..ba28fedb6 100644
--- a/src/backend/distributed/metadata/node_metadata.c
+++ b/src/backend/distributed/metadata/node_metadata.c
@@ -86,6 +86,7 @@ typedef struct NodeMetadata
 
 /* local function forward declarations */
 static int ActivateNode(char *nodeName, int nodePort);
+static bool CanRemoveReferenceTablePlacements(void);
 static void RemoveNodeFromCluster(char *nodeName, int32 nodePort);
 static int AddNodeMetadata(char *nodeName, int32 nodePort, NodeMetadata
 						   *nodeMetadata, bool *nodeAlreadyExists);
@@ -1053,19 +1054,32 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
 	WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);
 	if (NodeIsPrimary(workerNode))
 	{
+		if (CanRemoveReferenceTablePlacements())
+		{
+			/*
+			 * Delete reference table placements so they are not taken into account
+			 * for the check if there are placements after this.
+			 */
+			DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
+		}
 		bool onlyConsiderActivePlacements = false;
-
-		/*
-		 * Delete reference table placements so they are not taken into account
-		 * for the check if there are placements after this
-		 */
-		DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
-
 		if (NodeGroupHasShardPlacements(workerNode->groupId,
 										onlyConsiderActivePlacements))
 		{
-			ereport(ERROR, (errmsg("you cannot remove the primary node of a node group "
-								   "which has shard placements")));
+			if (ClusterHasReferenceTable())
+			{
+				ereport(ERROR, (errmsg(
+									"cannot remove the last worker node because there are reference "
+									"tables and it would cause data loss on reference tables"),
+								errhint(
+									"To proceed, either drop the reference tables or use "
+									"undistribute_table() function to convert them to local tables")));
+			}
+			ereport(ERROR, (errmsg("cannot remove the primary node of a node group "
+								   "which has shard placements"),
+							errhint(
+								"To proceed, either drop the distributed tables or use "
+								"undistribute_table() function to convert them to local tables")));
 		}
 	}
 
@@ -1080,6 +1094,18 @@
 }
 
 
+/*
+ * CanRemoveReferenceTablePlacements returns true if the active primary
+ * node count is more than 1, which means that even if we remove a node
+ * we will still have some other node that holds the reference table placements.
+ */
+static bool
+CanRemoveReferenceTablePlacements(void)
+{
+	return ActivePrimaryNodeCount() > 1;
+}
+
+
 /* CountPrimariesWithMetadata returns the number of primary nodes which have metadata. */
 uint32
 CountPrimariesWithMetadata(void)
diff --git a/src/backend/distributed/operations/worker_node_manager.c b/src/backend/distributed/operations/worker_node_manager.c
index e20a97729..a1637bf34 100644
--- a/src/backend/distributed/operations/worker_node_manager.c
+++ b/src/backend/distributed/operations/worker_node_manager.c
@@ -307,6 +307,17 @@ ActivePrimaryNonCoordinatorNodeCount(void)
 }
 
 
+/*
+ * ActivePrimaryNodeCount returns the number of groups with a primary in the cluster.
+ */
+uint32
+ActivePrimaryNodeCount(void)
+{
+	List *nodeList = ActivePrimaryNodeList(NoLock);
+	return list_length(nodeList);
+}
+
+
 /*
  * ActiveReadableNonCoordinatorNodeCount returns the number of groups with a node we can read from.
  * This method excludes coordinator even if it is added as a worker.
diff --git a/src/include/distributed/metadata_cache.h b/src/include/distributed/metadata_cache.h
index a160636e7..82e609abc 100644
--- a/src/include/distributed/metadata_cache.h
+++ b/src/include/distributed/metadata_cache.h
@@ -162,6 +162,7 @@ extern void InvalidateForeignKeyGraph(void);
 extern void FlushDistTableCache(void);
 extern void InvalidateMetadataSystemCache(void);
 extern Datum DistNodeMetadata(void);
+extern bool ClusterHasReferenceTable(void);
 extern bool HasUniformHashDistribution(ShardInterval **shardIntervalArray,
 									   int shardIntervalArrayLength);
 extern bool HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray,
diff --git a/src/include/distributed/worker_manager.h b/src/include/distributed/worker_manager.h
index 753ab9f83..db0045166 100644
--- a/src/include/distributed/worker_manager.h
+++ b/src/include/distributed/worker_manager.h
@@ -71,6 +71,7 @@ extern WorkerNode * WorkerGetRoundRobinCandidateNode(List *workerNodeList,
 													 uint32 placementIndex);
 extern WorkerNode * WorkerGetLocalFirstCandidateNode(List *currentNodeList);
 extern uint32 ActivePrimaryNonCoordinatorNodeCount(void);
+extern uint32 ActivePrimaryNodeCount(void);
 extern List * ActivePrimaryNonCoordinatorNodeList(LOCKMODE lockMode);
 extern List * ActivePrimaryNodeList(LOCKMODE lockMode);
 extern bool CoordinatorAddedAsWorkerNode(void);
diff --git a/src/test/regress/expected/citus_local_tables.out b/src/test/regress/expected/citus_local_tables.out
index 025196986..0d7685ed5 100644
--- a/src/test/regress/expected/citus_local_tables.out
+++ b/src/test/regress/expected/citus_local_tables.out
@@ -27,7 +27,7 @@ SELECT create_citus_local_table('citus_local_table_1');
 
 -- try to remove coordinator and observe failure as there exist a citus local table
 SELECT 1 FROM master_remove_node('localhost', :master_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the primary node of a node group which has shard placements
 DROP TABLE citus_local_table_1;
 NOTICE: executing the command locally: DROP TABLE IF EXISTS citus_local_tables_test_schema.citus_local_table_1_xxxxx CASCADE
 -- this should work now as the citus local table is dropped
@@ -559,7 +559,7 @@ FROM pg_dist_partition WHERE logicalrelid = 'citus_local_table_4'::regclass;
 (1 row)
 
 SELECT column_name_to_column('citus_local_table_4', 'a');
-  column_name_to_column
+ column_name_to_column
 ---------------------------------------------------------------------
 {VAR :varno 1 :varattno 1 :vartype 23 :vartypmod -1 :varcollid 0 :varlevelsup 0 :varnoold 1 :varoattno 1 :location -1}
 (1 row)
diff --git a/src/test/regress/expected/failure_add_disable_node.out b/src/test/regress/expected/failure_add_disable_node.out
index aec4d756e..229b72276 100644
--- a/src/test/regress/expected/failure_add_disable_node.out
+++ b/src/test/regress/expected/failure_add_disable_node.out
@@ -124,7 +124,8 @@ ORDER BY placementid;
 
 -- master_remove_node fails when there are shards on that worker
 SELECT master_remove_node('localhost', :worker_2_proxy_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
+HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
 -- drop event table and re-run remove
 DROP TABLE event_table;
 SELECT master_remove_node('localhost', :worker_2_proxy_port);
diff --git a/src/test/regress/expected/isolation_create_citus_local_table.out b/src/test/regress/expected/isolation_create_citus_local_table.out
index 1643ee521..54877d3fa 100644
--- a/src/test/regress/expected/isolation_create_citus_local_table.out
+++ b/src/test/regress/expected/isolation_create_citus_local_table.out
@@ -126,7 +126,7 @@ create_citus_local_table
 
 step s2-remove-coordinator: SELECT master_remove_node('localhost', 57636);
 step s1-commit: COMMIT;
 step s2-remove-coordinator: <... completed>
-error in steps s1-commit s2-remove-coordinator: ERROR: you cannot remove the primary node of a node group which has shard placements
+error in steps s1-commit s2-remove-coordinator: ERROR: cannot remove the primary node of a node group which has shard placements
 step s2-commit: COMMIT;
 master_remove_node
diff --git a/src/test/regress/expected/isolation_create_table_vs_add_remove_node.out b/src/test/regress/expected/isolation_create_table_vs_add_remove_node.out
index cdfcca4e6..b97e8c9cd 100644
--- a/src/test/regress/expected/isolation_create_table_vs_add_remove_node.out
+++ b/src/test/regress/expected/isolation_create_table_vs_add_remove_node.out
@@ -299,7 +299,7 @@ step s2-commit:
 	COMMIT;
 
 step s1-remove-node-2: <... completed>
-error in steps s2-commit s1-remove-node-2: ERROR: you cannot remove the primary node of a node group which has shard placements
+error in steps s2-commit s1-remove-node-2: ERROR: cannot remove the primary node of a node group which has shard placements
 step s1-show-placements:
 	SELECT
 		nodename, nodeport
@@ -393,7 +393,7 @@ step s2-commit:
 	COMMIT;
 
 step s1-remove-node-2: <... completed>
-error in steps s2-commit s1-remove-node-2: ERROR: you cannot remove the primary node of a node group which has shard placements
+error in steps s2-commit s1-remove-node-2: ERROR: cannot remove the primary node of a node group which has shard placements
 step s2-select:
 	SELECT * FROM dist_table;
 
@@ -480,7 +480,7 @@ step s2-commit:
 	COMMIT;
 
 step s1-remove-node-2: <... completed>
-error in steps s2-commit s1-remove-node-2: ERROR: you cannot remove the primary node of a node group which has shard placements
+error in steps s2-commit s1-remove-node-2: ERROR: cannot remove the primary node of a node group which has shard placements
 step s2-select:
 	SELECT * FROM dist_table;
 
diff --git a/src/test/regress/expected/multi_cluster_management.out b/src/test/regress/expected/multi_cluster_management.out
index 531bff5af..4ba2fc35a 100644
--- a/src/test/regress/expected/multi_cluster_management.out
+++ b/src/test/regress/expected/multi_cluster_management.out
@@ -117,7 +117,8 @@ SELECT shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement WHER
 
 -- try to remove a node with active placements and see that node removal is failed
 SELECT master_remove_node('localhost', :worker_2_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the primary node of a node group which has shard placements
+HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
 SELECT master_get_active_worker_nodes();
  master_get_active_worker_nodes
 ---------------------------------------------------------------------
@@ -256,7 +257,8 @@ DETAIL: distributed objects are only kept in sync when citus.enable_object_prop
 
 -- try to remove a node with active placements and see that node removal is failed
 SELECT master_remove_node('localhost', :worker_2_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the primary node of a node group which has shard placements
+HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
 -- mark all placements in the candidate node as inactive
 SELECT groupid AS worker_2_group FROM pg_dist_node WHERE nodeport=:worker_2_port \gset
 UPDATE pg_dist_placement SET shardstate=3 WHERE groupid=:worker_2_group;
@@ -275,7 +277,8 @@ SELECT shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement WHER
 
 -- try to remove a node with only inactive placements and see that removal still fails
 SELECT master_remove_node('localhost', :worker_2_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the primary node of a node group which has shard placements
+HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
 SELECT master_get_active_worker_nodes();
  master_get_active_worker_nodes
 ---------------------------------------------------------------------
@@ -337,7 +340,8 @@ SELECT logicalrelid, shardid, shardstate, nodename, nodeport FROM pg_dist_shard_
 
 -- try to remove a node with only to be deleted placements and see that removal still fails
 SELECT master_remove_node('localhost', :worker_2_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the primary node of a node group which has shard placements
+HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
 SELECT master_get_active_worker_nodes();
  master_get_active_worker_nodes
 ---------------------------------------------------------------------
@@ -378,7 +382,8 @@ SELECT 1 FROM master_add_node('localhost', 9990, groupid => :new_group, noderole
 (1 row)
 
 SELECT master_remove_node('localhost', :worker_2_port);
-ERROR: you cannot remove the primary node of a node group which has shard placements
+ERROR: cannot remove the primary node of a node group which has shard placements
+HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
 SELECT master_remove_node('localhost', 9990);
  master_remove_node
 ---------------------------------------------------------------------
diff --git a/src/test/regress/expected/multi_remove_node_reference_table.out b/src/test/regress/expected/multi_remove_node_reference_table.out
index e7722fd85..8e87d4e9c 100644
--- a/src/test/regress/expected/multi_remove_node_reference_table.out
+++ b/src/test/regress/expected/multi_remove_node_reference_table.out
@@ -199,6 +199,9 @@ WHERE colocationid IN
 1 | -1 | 0
 (1 row)
 
+SELECT master_remove_node('localhost', :worker_1_port);
+ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
+HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
 \c - - - :worker_1_port
 SELECT COUNT(*) FROM pg_dist_node WHERE nodeport = :worker_2_port;
  count
diff --git a/src/test/regress/expected/single_node.out b/src/test/regress/expected/single_node.out
index 96ec0758c..3eb117231 100644
--- a/src/test/regress/expected/single_node.out
+++ b/src/test/regress/expected/single_node.out
@@ -739,6 +739,12 @@ SELECT create_distributed_function('call_delegation(int)', '$1', 'test');
 (1 row)
 
 CALL call_delegation(1);
+DROP TABLE test CASCADE;
+NOTICE: drop cascades to view single_node_view
+-- cannot remove coordinator since a reference table exists on coordinator and no other worker nodes are added
+SELECT 1 FROM master_remove_node('localhost', :master_port);
+ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
+HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
 -- Cleanup
 SET client_min_messages TO WARNING;
 DROP SCHEMA single_node CASCADE;
diff --git a/src/test/regress/sql/multi_remove_node_reference_table.sql b/src/test/regress/sql/multi_remove_node_reference_table.sql
index b9581924d..5f7e8357c 100644
--- a/src/test/regress/sql/multi_remove_node_reference_table.sql
+++ b/src/test/regress/sql/multi_remove_node_reference_table.sql
@@ -104,6 +104,8 @@ WHERE colocationid IN
     FROM pg_dist_partition
     WHERE logicalrelid = 'remove_node_reference_table'::regclass);
 
+SELECT master_remove_node('localhost', :worker_1_port);
+
 \c - - - :worker_1_port
 
 SELECT COUNT(*) FROM pg_dist_node WHERE nodeport = :worker_2_port;
diff --git a/src/test/regress/sql/single_node.sql b/src/test/regress/sql/single_node.sql
index 187953a5c..2a744178f 100644
--- a/src/test/regress/sql/single_node.sql
+++ b/src/test/regress/sql/single_node.sql
@@ -361,6 +361,9 @@ END;$$;
 SELECT * FROM pg_dist_node;
 SELECT create_distributed_function('call_delegation(int)', '$1', 'test');
 CALL call_delegation(1);
+DROP TABLE test CASCADE;
+-- cannot remove coordinator since a reference table exists on coordinator and no other worker nodes are added
+SELECT 1 FROM master_remove_node('localhost', :master_port);
 
 -- Cleanup
 SET client_min_messages TO WARNING;
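For reference, a minimal psql sketch of the failure path this patch adds. It assumes a single-node cluster whose only entry in pg_dist_node is the coordinator on port 57636 (the port used by the isolation test above); the table name ref_settings is made up for illustration and does not appear in the patch.

-- create a reference table; its only placement lives on the coordinator
CREATE TABLE ref_settings (key text PRIMARY KEY, value text);
SELECT create_reference_table('ref_settings');
-- removing the only node would drop the last copy of every reference table,
-- so master_remove_node now errors out instead of silently losing data
SELECT master_remove_node('localhost', 57636);
-- ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
-- HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
-- either escape hatch named in the hint unblocks the removal
SELECT undistribute_table('ref_settings');  -- or: DROP TABLE ref_settings;
-- with no reference table placements left, the removal should now go through
SELECT master_remove_node('localhost', 57636);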
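And a companion sketch of the path where CanRemoveReferenceTablePlacements() returns true: with more than one active primary node, the placements on the node being removed can be deleted up front because another node still holds a copy. The worker ports 9701 and 9702 and the table name ref_settings are illustrative assumptions, not taken from the patch.

-- two active primary workers, so ActivePrimaryNodeCount() > 1
SELECT 1 FROM master_add_node('localhost', 9701);
SELECT 1 FROM master_add_node('localhost', 9702);
CREATE TABLE ref_settings (key text PRIMARY KEY, value text);
SELECT create_reference_table('ref_settings');  -- placed on both workers
-- removing one worker is allowed as long as it holds no distributed-table shards:
-- its reference table placements are deleted first, and the remaining-placements
-- check then passes because the other worker still keeps a full copy
SELECT master_remove_node('localhost', 9702);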