diff --git a/src/test/regress/expected/failure_add_disable_node.out b/src/test/regress/expected/failure_add_disable_node.out new file mode 100644 index 000000000..c81d0ab3f --- /dev/null +++ b/src/test/regress/expected/failure_add_disable_node.out @@ -0,0 +1,387 @@ +-- +-- failure_add_disable_node tests master_add_node, master_remove_node and +-- master_activate_node for failures. +-- master_disable_node and master_add_inactive_node can not be +-- tested with injected failures as they don't create network activity +-- + +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SET citus.next_shard_id TO 200000; +-- verify we have all worker nodes present +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 + localhost | 57640 +(2 rows) + +-- verify there are no tables that could prevent add/remove node operations +SELECT * FROM pg_dist_partition; + logicalrelid | partmethod | partkey | colocationid | repmodel +--------------+------------+---------+--------------+---------- +(0 rows) + +CREATE SCHEMA add_remove_node; +SET SEARCH_PATH=add_remove_node; +CREATE TABLE user_table(user_id int, user_name text); +SELECT create_reference_table('user_table'); + create_reference_table +------------------------ + +(1 row) + +CREATE TABLE event_table(user_id int, event_id int, event_name text); +SELECT create_distributed_table('event_table', 'user_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 + 200000 | 1 +(2 rows) + +SELECT master_disable_node('localhost', :worker_2_proxy_port); +NOTICE: Node localhost:57640 has active shard placements. Some queries may fail after this operation. Use SELECT master_activate_node('localhost', 57640) to activate this node back. 
+ master_disable_node +--------------------- + +(1 row) + +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +-- fail activate node by failing reference table creation +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()'); + mitmproxy +----------- + +(1 row) + +SELECT master_activate_node('localhost', :worker_2_proxy_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57640 +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- verify node is not activated +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +-- fail create schema command +SELECT citus.mitmproxy('conn.onQuery(query="CREATE SCHEMA").kill()'); + mitmproxy +----------- + +(1 row) + +SELECT master_activate_node('localhost', :worker_2_proxy_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57640 +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. 
+CONTEXT: while executing command on localhost:57640 +-- verify node is not activated +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +-- fail activate node by cancelling reference table creation +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +SELECT master_activate_node('localhost', :worker_2_proxy_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57640 +ERROR: canceling statement due to user request +-- verify node is not activated +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- master_remove_node fails when there are shards on that worker +SELECT master_remove_node('localhost', :worker_2_proxy_port); +ERROR: you cannot remove the primary node of a node group which has shard placements +-- drop event table and re-run remove +DROP TABLE event_table; +SELECT master_remove_node('localhost', :worker_2_proxy_port); + master_remove_node +-------------------- + +(1 row) + +-- verify node is removed +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) 
+WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +-- test master_add_inactive_node +-- it does not create any network activity, therefore failures can not +-- be injected through the network +SELECT master_add_inactive_node('localhost', :worker_2_proxy_port); + master_add_inactive_node +--------------------------------------------------- + (3,3,localhost,57640,default,f,f,primary,default) +(1 row) + +SELECT master_remove_node('localhost', :worker_2_proxy_port); + master_remove_node +-------------------- + +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +-- test failures while master_add_node replicates a reference table +-- to the newly added node. +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()'); + mitmproxy +----------- + +(1 row) + +SELECT master_add_node('localhost', :worker_2_proxy_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57640 +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. 
+CONTEXT: while executing command on localhost:57640 +-- verify node is not added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +SELECT master_add_node('localhost', :worker_2_proxy_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57640 +ERROR: canceling statement due to user request +-- verify node is not added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 +(1 row) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 +(1 row) + +-- reset cluster to original state +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT master_add_node('localhost', :worker_2_proxy_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57640 + master_add_node +--------------------------------------------------- + (6,6,localhost,57640,default,f,t,primary,default) +(1 row) + +-- verify node is added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 + localhost | 57640 +(2 rows) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 + 200000 | 1 +(2 
rows) + +-- fail master_add_node by failing copy out operation +SELECT master_remove_node('localhost', :worker_1_port); + master_remove_node +-------------------- + +(1 row) + +SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()'); + mitmproxy +----------- + +(1 row) + +SELECT master_add_node('localhost', :worker_1_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57637 +ERROR: could not copy table "user_table_200000" from "localhost:57640" +CONTEXT: while executing command on localhost:57637 +-- verify node is not added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57640 +(1 row) + +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT master_add_node('localhost', :worker_1_port); +NOTICE: Replicating reference table "user_table" to the node localhost:57637 + master_add_node +--------------------------------------------------- + (8,8,localhost,57637,default,f,t,primary,default) +(1 row) + +-- verify node is added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + node_name | node_port +-----------+----------- + localhost | 57637 + localhost | 57640 +(2 rows) + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + shardid | shardstate +---------+------------ + 200000 | 1 + 200000 | 1 +(2 rows) + +RESET SEARCH_PATH; +DROP SCHEMA add_remove_node CASCADE; +NOTICE: drop cascades to table add_remove_node.user_table +SELECT * FROM run_command_on_workers('DROP SCHEMA IF EXISTS add_remove_node CASCADE') +ORDER BY nodeport; + nodename | nodeport | success | result +-----------+----------+---------+------------- + localhost | 57637 | t | DROP SCHEMA + localhost | 57640 | t | DROP SCHEMA +(2 rows) + diff --git a/src/test/regress/failure_schedule b/src/test/regress/failure_schedule index afd672da1..d8e76d65f 
100644 --- a/src/test/regress/failure_schedule +++ b/src/test/regress/failure_schedule @@ -8,3 +8,4 @@ test: multi_test_helpers test: failure_ddl test: failure_truncate test: failure_create_index_concurrently +test: failure_add_disable_node diff --git a/src/test/regress/sql/failure_add_disable_node.sql b/src/test/regress/sql/failure_add_disable_node.sql new file mode 100644 index 000000000..d19fbe0ba --- /dev/null +++ b/src/test/regress/sql/failure_add_disable_node.sql @@ -0,0 +1,181 @@ +-- +-- failure_add_disable_node tests master_add_node, master_remove_node and +-- master_activate_node for failures. +-- master_disable_node and master_add_inactive_node can not be +-- tested with injected failures as they don't create network activity +-- + +SELECT citus.mitmproxy('conn.allow()'); + +SET citus.next_shard_id TO 200000; + +-- verify we have all worker nodes present +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +-- verify there are no tables that could prevent add/remove node operations +SELECT * FROM pg_dist_partition; + +CREATE SCHEMA add_remove_node; +SET SEARCH_PATH=add_remove_node; +CREATE TABLE user_table(user_id int, user_name text); +SELECT create_reference_table('user_table'); + +CREATE TABLE event_table(user_id int, event_id int, event_name text); +SELECT create_distributed_table('event_table', 'user_id'); + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +SELECT master_disable_node('localhost', :worker_2_proxy_port); + +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- fail activate node by failing reference table creation +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()'); + +SELECT master_activate_node('localhost', :worker_2_proxy_port); + +SELECT 
citus.mitmproxy('conn.allow()'); + +-- verify node is not activated +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- fail create schema command +SELECT citus.mitmproxy('conn.onQuery(query="CREATE SCHEMA").kill()'); + +SELECT master_activate_node('localhost', :worker_2_proxy_port); + +-- verify node is not activated +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- fail activate node by cancelling reference table creation +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')'); + +SELECT master_activate_node('localhost', :worker_2_proxy_port); + +-- verify node is not activated +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +SELECT citus.mitmproxy('conn.allow()'); + +-- master_remove_node fails when there are shards on that worker +SELECT master_remove_node('localhost', :worker_2_proxy_port); + +-- drop event table and re-run remove +DROP TABLE event_table; +SELECT master_remove_node('localhost', :worker_2_proxy_port); + +-- verify node is removed +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- test master_add_inactive_node +-- it does not create any network activity, therefore failures can not +-- be injected through the network +SELECT master_add_inactive_node('localhost', :worker_2_proxy_port); + +SELECT 
master_remove_node('localhost', :worker_2_proxy_port); + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- test failures while master_add_node replicates a reference table +-- to the newly added node. +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").kill()'); + +SELECT master_add_node('localhost', :worker_2_proxy_port); + +-- verify node is not added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +SELECT citus.mitmproxy('conn.onQuery(query="CREATE TABLE").cancel(' || pg_backend_pid() || ')'); + +SELECT master_add_node('localhost', :worker_2_proxy_port); + +-- verify node is not added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- reset cluster to original state +SELECT citus.mitmproxy('conn.allow()'); +SELECT master_add_node('localhost', :worker_2_proxy_port); + +-- verify node is added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +-- fail master_add_node by failing copy out operation +SELECT master_remove_node('localhost', :worker_1_port); +SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()'); +SELECT master_add_node('localhost', :worker_1_port); + +-- verify node is not added +SELECT * FROM master_get_active_worker_nodes() +ORDER BY 1, 2; + +SELECT citus.mitmproxy('conn.allow()'); +SELECT master_add_node('localhost', :worker_1_port); + +-- verify node is added +SELECT * FROM master_get_active_worker_nodes() 
+ORDER BY 1, 2; + +SELECT shardid, shardstate +FROM pg_dist_placement p JOIN pg_dist_shard s USING (shardid) +WHERE s.logicalrelid = 'user_table'::regclass +ORDER BY placementid; + +RESET SEARCH_PATH; +DROP SCHEMA add_remove_node CASCADE; +SELECT * FROM run_command_on_workers('DROP SCHEMA IF EXISTS add_remove_node CASCADE') +ORDER BY nodeport;