Not allow removing a single node with ref tables (#4127)

* Not allow removing a single node with ref tables

We should not allow removing a node if it is the only node in the
cluster and there is data on it. We already have this check for
distributed tables, but it was missing for reference tables.

* Update src/test/regress/expected/single_node.out

Co-authored-by: Onur Tirtir <onurcantirtir@gmail.com>

* Update src/test/regress/sql/single_node.sql

Co-authored-by: Onur Tirtir <onurcantirtir@gmail.com>
pull/4176/head
SaitTalhaNisanci 2020-09-18 15:35:59 +03:00 committed by GitHub
parent 6e316d46a2
commit dae2c69fd7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 91 additions and 21 deletions

View File

@ -3867,6 +3867,17 @@ ReferenceTableOidList()
}
/*
 * ClusterHasReferenceTable returns true when at least one reference
 * table exists anywhere in the cluster, i.e. the list of reference
 * table OIDs is non-empty.
 */
bool
ClusterHasReferenceTable(void)
{
	/* a non-empty OID list means at least one reference table exists */
	return list_length(ReferenceTableOidList()) != 0;
}
/*
* InvalidateNodeRelationCacheCallback destroys the WorkerNodeHash when
* any change happens on pg_dist_node table. It also set WorkerNodeHash to

View File

@ -86,6 +86,7 @@ typedef struct NodeMetadata
/* local function forward declarations */
static int ActivateNode(char *nodeName, int nodePort);
static bool CanRemoveReferenceTablePlacements(void);
static void RemoveNodeFromCluster(char *nodeName, int32 nodePort);
static int AddNodeMetadata(char *nodeName, int32 nodePort, NodeMetadata
*nodeMetadata, bool *nodeAlreadyExists);
@ -1053,19 +1054,32 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
WorkerNode *workerNode = ModifiableWorkerNode(nodeName, nodePort);
if (NodeIsPrimary(workerNode))
{
if (CanRemoveReferenceTablePlacements())
{
/*
* Delete reference table placements so they are not taken into account
* for the check if there are placements after this.
*/
DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
}
bool onlyConsiderActivePlacements = false;
/*
* Delete reference table placements so they are not taken into account
* for the check if there are placements after this
*/
DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
if (NodeGroupHasShardPlacements(workerNode->groupId,
onlyConsiderActivePlacements))
{
ereport(ERROR, (errmsg("you cannot remove the primary node of a node group "
"which has shard placements")));
if (ClusterHasReferenceTable())
{
ereport(ERROR, (errmsg(
"cannot remove the last worker node because there are reference "
"tables and it would cause data loss on reference tables"),
errhint(
"To proceed, either drop the reference tables or use "
"undistribute_table() function to convert them to local tables")));
}
ereport(ERROR, (errmsg("cannot remove the primary node of a node group "
"which has shard placements"),
errhint(
"To proceed, either drop the distributed tables or use "
"undistribute_table() function to convert them to local tables")));
}
}
@ -1080,6 +1094,18 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
}
/*
 * CanRemoveReferenceTablePlacements returns true when the cluster has at
 * least two active primary nodes. In that case removing one node still
 * leaves another node holding the reference table placements, so the
 * placements on the removed node can safely be dropped.
 */
static bool
CanRemoveReferenceTablePlacements(void)
{
	/* with two or more primaries, another replica of every reference table remains */
	return ActivePrimaryNodeCount() >= 2;
}
/* CountPrimariesWithMetadata returns the number of primary nodes which have metadata. */
uint32
CountPrimariesWithMetadata(void)

View File

@ -307,6 +307,17 @@ ActivePrimaryNonCoordinatorNodeCount(void)
}
/*
 * ActivePrimaryNodeCount returns how many active primary nodes the
 * cluster currently has (including the coordinator if it was added
 * as a node, since ActivePrimaryNodeList does not exclude it).
 */
uint32
ActivePrimaryNodeCount(void)
{
	/* NoLock: callers only need a point-in-time count, not a stable snapshot */
	return list_length(ActivePrimaryNodeList(NoLock));
}
/*
* ActiveReadableNonCoordinatorNodeCount returns the number of groups with a node we can read from.
* This method excludes coordinator even if it is added as a worker.

View File

@ -162,6 +162,7 @@ extern void InvalidateForeignKeyGraph(void);
extern void FlushDistTableCache(void);
extern void InvalidateMetadataSystemCache(void);
extern Datum DistNodeMetadata(void);
extern bool ClusterHasReferenceTable(void);
extern bool HasUniformHashDistribution(ShardInterval **shardIntervalArray,
int shardIntervalArrayLength);
extern bool HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray,

View File

@ -71,6 +71,7 @@ extern WorkerNode * WorkerGetRoundRobinCandidateNode(List *workerNodeList,
uint32 placementIndex);
extern WorkerNode * WorkerGetLocalFirstCandidateNode(List *currentNodeList);
extern uint32 ActivePrimaryNonCoordinatorNodeCount(void);
extern uint32 ActivePrimaryNodeCount(void);
extern List * ActivePrimaryNonCoordinatorNodeList(LOCKMODE lockMode);
extern List * ActivePrimaryNodeList(LOCKMODE lockMode);
extern bool CoordinatorAddedAsWorkerNode(void);

View File

@ -27,7 +27,7 @@ SELECT create_citus_local_table('citus_local_table_1');
-- try to remove coordinator and observe failure as there exist a citus local table
SELECT 1 FROM master_remove_node('localhost', :master_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the primary node of a node group which has shard placements
DROP TABLE citus_local_table_1;
NOTICE: executing the command locally: DROP TABLE IF EXISTS citus_local_tables_test_schema.citus_local_table_1_xxxxx CASCADE
-- this should work now as the citus local table is dropped
@ -559,7 +559,7 @@ FROM pg_dist_partition WHERE logicalrelid = 'citus_local_table_4'::regclass;
(1 row)
SELECT column_name_to_column('citus_local_table_4', 'a');
column_name_to_column
column_name_to_column
---------------------------------------------------------------------
{VAR :varno 1 :varattno 1 :vartype 23 :vartypmod -1 :varcollid 0 :varlevelsup 0 :varnoold 1 :varoattno 1 :location -1}
(1 row)

View File

@ -124,7 +124,8 @@ ORDER BY placementid;
-- master_remove_node fails when there are shards on that worker
SELECT master_remove_node('localhost', :worker_2_proxy_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
-- drop event table and re-run remove
DROP TABLE event_table;
SELECT master_remove_node('localhost', :worker_2_proxy_port);

View File

@ -126,7 +126,7 @@ create_citus_local_table
step s2-remove-coordinator: SELECT master_remove_node('localhost', 57636); <waiting ...>
step s1-commit: COMMIT;
step s2-remove-coordinator: <... completed>
error in steps s1-commit s2-remove-coordinator: ERROR: you cannot remove the primary node of a node group which has shard placements
error in steps s1-commit s2-remove-coordinator: ERROR: cannot remove the primary node of a node group which has shard placements
step s2-commit: COMMIT;
master_remove_node

View File

@ -299,7 +299,7 @@ step s2-commit:
COMMIT;
step s1-remove-node-2: <... completed>
error in steps s2-commit s1-remove-node-2: ERROR: you cannot remove the primary node of a node group which has shard placements
error in steps s2-commit s1-remove-node-2: ERROR: cannot remove the primary node of a node group which has shard placements
step s1-show-placements:
SELECT
nodename, nodeport
@ -393,7 +393,7 @@ step s2-commit:
COMMIT;
step s1-remove-node-2: <... completed>
error in steps s2-commit s1-remove-node-2: ERROR: you cannot remove the primary node of a node group which has shard placements
error in steps s2-commit s1-remove-node-2: ERROR: cannot remove the primary node of a node group which has shard placements
step s2-select:
SELECT * FROM dist_table;
@ -480,7 +480,7 @@ step s2-commit:
COMMIT;
step s1-remove-node-2: <... completed>
error in steps s2-commit s1-remove-node-2: ERROR: you cannot remove the primary node of a node group which has shard placements
error in steps s2-commit s1-remove-node-2: ERROR: cannot remove the primary node of a node group which has shard placements
step s2-select:
SELECT * FROM dist_table;

View File

@ -117,7 +117,8 @@ SELECT shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement WHER
-- try to remove a node with active placements and see that node removal is failed
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
SELECT master_get_active_worker_nodes();
master_get_active_worker_nodes
---------------------------------------------------------------------
@ -256,7 +257,8 @@ DETAIL: distributed objects are only kept in sync when citus.enable_object_prop
-- try to remove a node with active placements and see that node removal is failed
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
-- mark all placements in the candidate node as inactive
SELECT groupid AS worker_2_group FROM pg_dist_node WHERE nodeport=:worker_2_port \gset
UPDATE pg_dist_placement SET shardstate=3 WHERE groupid=:worker_2_group;
@ -275,7 +277,8 @@ SELECT shardid, shardstate, nodename, nodeport FROM pg_dist_shard_placement WHER
-- try to remove a node with only inactive placements and see that removal still fails
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
SELECT master_get_active_worker_nodes();
master_get_active_worker_nodes
---------------------------------------------------------------------
@ -337,7 +340,8 @@ SELECT logicalrelid, shardid, shardstate, nodename, nodeport FROM pg_dist_shard_
-- try to remove a node with only to be deleted placements and see that removal still fails
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
SELECT master_get_active_worker_nodes();
master_get_active_worker_nodes
---------------------------------------------------------------------
@ -378,7 +382,8 @@ SELECT 1 FROM master_add_node('localhost', 9990, groupid => :new_group, noderole
(1 row)
SELECT master_remove_node('localhost', :worker_2_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
ERROR: cannot remove the primary node of a node group which has shard placements
HINT: To proceed, either drop the distributed tables or use undistribute_table() function to convert them to local tables
SELECT master_remove_node('localhost', 9990);
master_remove_node
---------------------------------------------------------------------

View File

@ -199,6 +199,9 @@ WHERE colocationid IN
1 | -1 | 0
(1 row)
SELECT master_remove_node('localhost', :worker_1_port);
ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
\c - - - :worker_1_port
SELECT COUNT(*) FROM pg_dist_node WHERE nodeport = :worker_2_port;
count

View File

@ -739,6 +739,12 @@ SELECT create_distributed_function('call_delegation(int)', '$1', 'test');
(1 row)
CALL call_delegation(1);
DROP TABLE test CASCADE;
NOTICE: drop cascades to view single_node_view
-- cannot remove coordinator since a reference table exists on coordinator and no other worker nodes are added
SELECT 1 FROM master_remove_node('localhost', :master_port);
ERROR: cannot remove the last worker node because there are reference tables and it would cause data loss on reference tables
HINT: To proceed, either drop the reference tables or use undistribute_table() function to convert them to local tables
-- Cleanup
SET client_min_messages TO WARNING;
DROP SCHEMA single_node CASCADE;

View File

@ -104,6 +104,8 @@ WHERE colocationid IN
FROM pg_dist_partition
WHERE logicalrelid = 'remove_node_reference_table'::regclass);
SELECT master_remove_node('localhost', :worker_1_port);
\c - - - :worker_1_port
SELECT COUNT(*) FROM pg_dist_node WHERE nodeport = :worker_2_port;

View File

@ -361,6 +361,9 @@ END;$$;
SELECT * FROM pg_dist_node;
SELECT create_distributed_function('call_delegation(int)', '$1', 'test');
CALL call_delegation(1);
DROP TABLE test CASCADE;
-- cannot remove coordinator since a reference table exists on coordinator and no other worker nodes are added
SELECT 1 FROM master_remove_node('localhost', :master_port);
-- Cleanup
SET client_min_messages TO WARNING;