Add failure tests for master add/remove/disable/active node

pull/2277/head
Murat Tuncer 2018-07-11 15:42:04 +03:00
parent a05817e2ec
commit a837dde1a0
3 changed files with 569 additions and 0 deletions

View File

@ -0,0 +1,387 @@
--
-- failure_add_disable_node tests master_add_node, master_remove_node
-- master_activate_node for failures.
-- master_disable_node and master_add_inactive_node can not be
-- tested as they don't create network activity
--
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------
(1 row)
SET citus.next_shard_id TO 200000;
-- verify we have all worker nodes present
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
localhost | 57640
(2 rows)
-- verify there are no tables that could prevent add/remove node operations
SELECT * FROM pg_dist_partition;
logicalrelid | partmethod | partkey | colocationid | repmodel
--------------+------------+---------+--------------+----------
(0 rows)
CREATE SCHEMA add_remove_node;
SET SEARCH_PATH=add_remove_node;
CREATE TABLE user_table(user_id int, user_name text);
SELECT create_reference_table('user_table');
create_reference_table
------------------------
(1 row)
CREATE TABLE event_table(user_id int, event_id int, event_name text);
SELECT create_distributed_table('event_table', 'user_id');
create_distributed_table
--------------------------
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
200000 | 1
(2 rows)
SELECT master_disable_node('localhost', :worker_2_proxy_port);
NOTICE: Node localhost:57640 has active shard placements. Some queries may fail after this operation. Use SELECT master_activate_node('localhost', 57640) to activate this node back.
master_disable_node
---------------------
(1 row)
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
-- fail activate node by failing reference table creation
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()');
mitmproxy
-----------
(1 row)
SELECT master_activate_node('localhost', :worker_2_proxy_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57640
ERROR: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
CONTEXT: while executing command on localhost:57640
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------
(1 row)
-- verify node is not activated
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
-- fail create schema command
SELECT citus.mitmproxy('conn.onQuery(query="CREATE SCHEMA").kill()');
mitmproxy
-----------
(1 row)
SELECT master_activate_node('localhost', :worker_2_proxy_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57640
ERROR: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
CONTEXT: while executing command on localhost:57640
-- verify node is not activated
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
-- fail activate node by failing reference table creation
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')');
mitmproxy
-----------
(1 row)
SELECT master_activate_node('localhost', :worker_2_proxy_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57640
ERROR: canceling statement due to user request
-- verify node is not activated
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------
(1 row)
-- master_remove_node fails when there are shards on that worker
SELECT master_remove_node('localhost', :worker_2_proxy_port);
ERROR: you cannot remove the primary node of a node group which has shard placements
-- drop event table and re-run remove
DROP TABLE event_table;
SELECT master_remove_node('localhost', :worker_2_proxy_port);
master_remove_node
--------------------
(1 row)
-- verify node is removed
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
-- test master_add_inactive_node
-- it does not create any network activity therefore can not
-- be injected failure through network
SELECT master_add_inactive_node('localhost', :worker_2_proxy_port);
master_add_inactive_node
---------------------------------------------------
(3,3,localhost,57640,default,f,f,primary,default)
(1 row)
SELECT master_remove_node('localhost', :worker_2_proxy_port);
master_remove_node
--------------------
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
-- test master_add_node replicated a reference table
-- to newly added node.
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()');
mitmproxy
-----------
(1 row)
SELECT master_add_node('localhost', :worker_2_proxy_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57640
ERROR: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
CONTEXT: while executing command on localhost:57640
-- verify node is not added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')');
mitmproxy
-----------
(1 row)
SELECT master_add_node('localhost', :worker_2_proxy_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57640
ERROR: canceling statement due to user request
-- verify node is not added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
(1 row)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
(1 row)
-- reset cluster to original state
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------
(1 row)
SELECT master_add_node('localhost', :worker_2_proxy_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57640
master_add_node
---------------------------------------------------
(6,6,localhost,57640,default,f,t,primary,default)
(1 row)
-- verify node is added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
localhost | 57640
(2 rows)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
200000 | 1
(2 rows)
-- fail master_add_node by failing copy out operation
SELECT master_remove_node('localhost', :worker_1_port);
master_remove_node
--------------------
(1 row)
SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()');
mitmproxy
-----------
(1 row)
SELECT master_add_node('localhost', :worker_1_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57637
ERROR: could not copy table "user_table_200000" from "localhost:57640"
CONTEXT: while executing command on localhost:57637
-- verify node is not added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57640
(1 row)
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------
(1 row)
SELECT master_add_node('localhost', :worker_1_port);
NOTICE: Replicating reference table "user_table" to the node localhost:57637
master_add_node
---------------------------------------------------
(8,8,localhost,57637,default,f,t,primary,default)
(1 row)
-- verify node is added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
node_name | node_port
-----------+-----------
localhost | 57637
localhost | 57640
(2 rows)
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
shardid | shardstate
---------+------------
200000 | 1
200000 | 1
(2 rows)
RESET SEARCH_PATH;
DROP SCHEMA add_remove_node CASCADE;
NOTICE: drop cascades to table add_remove_node.user_table
SELECT * FROM run_command_on_workers('DROP SCHEMA IF EXISTS add_remove_node CASCADE')
ORDER BY nodeport;
nodename | nodeport | success | result
-----------+----------+---------+-------------
localhost | 57637 | t | DROP SCHEMA
localhost | 57640 | t | DROP SCHEMA
(2 rows)

View File

@ -8,3 +8,4 @@ test: multi_test_helpers
test: failure_ddl
test: failure_truncate
test: failure_create_index_concurrently
test: failure_add_disable_node

View File

@ -0,0 +1,181 @@
--
-- failure_add_disable_node tests master_add_node, master_remove_node
-- master_activate_node for failures.
-- master_disable_node and master_add_inactive_node can not be
-- tested as they don't create network activity
--
SELECT citus.mitmproxy('conn.allow()');
SET citus.next_shard_id TO 200000;
-- verify we have all worker nodes present
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
-- verify there are no tables that could prevent add/remove node operations
SELECT * FROM pg_dist_partition;
CREATE SCHEMA add_remove_node;
SET SEARCH_PATH=add_remove_node;
CREATE TABLE user_table(user_id int, user_name text);
SELECT create_reference_table('user_table');
CREATE TABLE event_table(user_id int, event_id int, event_name text);
SELECT create_distributed_table('event_table', 'user_id');
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
SELECT master_disable_node('localhost', :worker_2_proxy_port);
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- fail activate node by failing reference table creation
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()');
SELECT master_activate_node('localhost', :worker_2_proxy_port);
SELECT citus.mitmproxy('conn.allow()');
-- verify node is not activated
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- fail create schema command
SELECT citus.mitmproxy('conn.onQuery(query="CREATE SCHEMA").kill()');
SELECT master_activate_node('localhost', :worker_2_proxy_port);
-- verify node is not activated
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- fail activate node by failing reference table creation
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')');
SELECT master_activate_node('localhost', :worker_2_proxy_port);
-- verify node is not activated
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
SELECT citus.mitmproxy('conn.allow()');
-- master_remove_node fails when there are shards on that worker
SELECT master_remove_node('localhost', :worker_2_proxy_port);
-- drop event table and re-run remove
DROP TABLE event_table;
SELECT master_remove_node('localhost', :worker_2_proxy_port);
-- verify node is removed
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- test master_add_inactive_node
-- it does not create any network activity therefore can not
-- be injected failure through network
SELECT master_add_inactive_node('localhost', :worker_2_proxy_port);
SELECT master_remove_node('localhost', :worker_2_proxy_port);
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- test master_add_node replicated a reference table
-- to newly added node.
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()');
SELECT master_add_node('localhost', :worker_2_proxy_port);
-- verify node is not added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')');
SELECT master_add_node('localhost', :worker_2_proxy_port);
-- verify node is not added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- reset cluster to original state
SELECT citus.mitmproxy('conn.allow()');
SELECT master_add_node('localhost', :worker_2_proxy_port);
-- verify node is added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
-- fail master_add_node by failing copy out operation
SELECT master_remove_node('localhost', :worker_1_port);
SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()');
SELECT master_add_node('localhost', :worker_1_port);
-- verify node is not added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT citus.mitmproxy('conn.allow()');
SELECT master_add_node('localhost', :worker_1_port);
-- verify node is added
SELECT * FROM master_get_active_worker_nodes()
ORDER BY 1, 2;
SELECT shardid, shardstate
FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid)
WHERE s.logicalrelid = 'user_table'::regclass
ORDER BY placementid;
RESET SEARCH_PATH;
DROP SCHEMA add_remove_node CASCADE;
SELECT * FROM run_command_on_workers('DROP SCHEMA IF EXISTS add_remove_node CASCADE')
ORDER BY nodeport;