citus/src/test/regress/expected/failure_connection_establis...

438 lines
15 KiB
Plaintext

--
-- failure_connection_establishment.sql tests some behaviour of connection management when
-- it fails to connect.
--
-- Failure cases covered:
-- - timeout
-- - connection killed or cancelled during connectivity checks
--   (citus_check_connection_to_node, citus_check_cluster_node_health)
--
-- make sure the proxy is using the conn.allow() rule before any failure is injected
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
CREATE SCHEMA fail_connect;
SET search_path TO 'fail_connect';
SET citus.shard_count TO 4;
-- keep no cached connections per worker; presumably this forces every statement
-- below to establish a fresh connection through the proxy (TODO confirm)
SET citus.max_cached_conns_per_worker TO 0;
-- restart the shard id and placement id sequences at a fixed value so that the
-- ids printed by the placement query later in this file are stable
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1450000;
ALTER SEQUENCE pg_catalog.pg_dist_placement_placementid_seq RESTART 1450000;
CREATE TABLE products (
product_no integer,
name text,
price numeric
);
SELECT create_distributed_table('products', 'product_no');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- Can only add primary key constraint on distribution column (or group of columns
-- including distribution column)
-- Command below should error out since 'name' is not a distribution column
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(name);
ERROR: cannot create constraint on "products"
DETAIL: Distributed relations cannot have UNIQUE, EXCLUDE, or PRIMARY KEY constraints that do not include the partition column (with an equality operator if EXCLUDE).
-- r1 is filled with three rows before it is turned into a reference table, so
-- create_reference_table() has to copy the existing local data (see the NOTICEs)
CREATE TABLE r1 (
id int PRIMARY KEY,
name text
);
INSERT INTO r1 (id, name) VALUES
(1,'foo'),
(2,'bar'),
(3,'baz');
SELECT create_reference_table('r1');
NOTICE: Copying data from local table...
NOTICE: copying the data has completed
DETAIL: The local data in the table is no longer visible, but is still on disk.
HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$fail_connect.r1$$)
create_reference_table
---------------------------------------------------------------------
(1 row)
-- Confirm that the first placement for both tables is on the second worker
-- node. This is necessary so we can use the first-replica task assignment
-- policy to first hit the node that we generate timeouts for.
-- LEAST(2, groupid) caps the printed value at 2, so any group id of 2 or more
-- is shown as 2; presumably this keeps the output independent of the actual
-- group id assigned to the second worker (TODO confirm).
-- Every shard is listed with two placements, one per group; the replication
-- factor is not set in this file, so it presumably comes from the test setup.
SELECT placementid, p.shardid, logicalrelid, LEAST(2, groupid) groupid
FROM pg_dist_placement p JOIN pg_dist_shard s ON p.shardid = s.shardid
ORDER BY placementid;
placementid | shardid | logicalrelid | groupid
---------------------------------------------------------------------
1450000 | 1450000 | products | 2
1450001 | 1450000 | products | 1
1450002 | 1450001 | products | 1
1450003 | 1450001 | products | 2
1450004 | 1450002 | products | 2
1450005 | 1450002 | products | 1
1450006 | 1450003 | products | 1
1450007 | 1450003 | products | 2
1450008 | 1450004 | r1 | 2
1450009 | 1450004 | r1 | 1
(10 rows)
-- with first-replica, tasks go to the first placement of each shard
SET citus.task_assignment_policy TO 'first-replica';
-- we will insert a connection delay here as this query was the cause for an
-- investigation into connection establishment problems
-- The proxy delay (1400, presumably ms) is larger than the 900 ms connection
-- timeout, so the connection attempt is expected to time out and the DDL fails.
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1400)');
mitmproxy
---------------------------------------------------------------------
(1 row)
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
WARNING: could not establish connection after 900 ms
ERROR: connection to the remote node localhost:xxxxx failed
-- undo the failure injection before the next test
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- Make sure that we fall back to a working node for reads, even if it's not
-- the first choice in our task assignment policy.
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1400)');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- the read on the reference table should still return the row; only a WARNING
-- is expected for the node that could not be reached within the timeout
SELECT name FROM r1 WHERE id = 2;
WARNING: could not establish any connections to the node localhost:xxxxx after 900 ms
name
---------------------------------------------------------------------
bar
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- similar to the test above, but this time on a distributed table instead of
-- a reference table and with citus.force_max_query_parallelization set
-- NOTE: citus.force_max_query_parallelization is not reset at the end of this
-- test; it stays ON until the RESET that follows the single-replica read test.
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1400)');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- products has two placements per shard, so the query is expected to succeed
-- with a WARNING for the unreachable node (the table is empty, hence count 0)
SELECT count(*) FROM products;
WARNING: could not establish any connections to the node localhost:xxxxx after 900 ms
count
---------------------------------------------------------------------
0
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- create a table whose shards have a single placement each, so there is no
-- other placement to fall back to when a node cannot be reached
-- (the table name is misspelled; it is kept because it appears in the output)
SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int);
SELECT create_distributed_table('single_replicatated', 'key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- this time the table is single replicated and we're still using
-- the max parallelization flag, so the query should fail
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1400)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT count(*) FROM single_replicatated;
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
RESET citus.force_max_query_parallelization;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- one similar test, and this time on modification queries
-- to see that connection establishment failures could
-- fail the transaction (but not mark any placements as INVALID)
BEGIN;
-- baseline: no placement of the table has shardstate 3 before the failure
SELECT
count(*) as invalid_placement_count
FROM
pg_dist_shard_placement
WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
invalid_placement_count
---------------------------------------------------------------------
0
(1 row)
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1400)');
mitmproxy
---------------------------------------------------------------------
(1 row)
INSERT INTO single_replicatated VALUES (100);
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
-- the transaction is already aborted by the failed INSERT; COMMIT only ends it
COMMIT;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- after the failure there must still be no placement with shardstate 3
SELECT
count(*) as invalid_placement_count
FROM
pg_dist_shard_placement
WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
invalid_placement_count
---------------------------------------------------------------------
0
(1 row)
-- and the row from the failed INSERT must not be visible
SELECT count(*) FROM single_replicatated WHERE key = 100;
count
---------------------------------------------------------------------
0
(1 row)
-- NOTE(review): client_min_messages is not changed earlier in the visible part
-- of this file, so this RESET looks like a no-op — confirm before removing
RESET client_min_messages;
-- verify get_global_active_transactions works when a timeout happens on a connection
-- (rows with transaction_number = 0 are filtered out; no rows are expected to remain)
SELECT * FROM get_global_active_transactions() WHERE transaction_number != 0;
datid | process_id | initiator_node_identifier | worker_query | transaction_number | transaction_stamp | global_pid
---------------------------------------------------------------------
(0 rows)
-- tests for connectivity checks
-- Pattern in the cases below: when the proxy kills the connection,
-- citus_check_connection_to_node() returns false (f); when the proxy cancels
-- this backend (pg_backend_pid()), the statement fails with a cancellation error.
-- :worker_2_proxy_port is a psql variable that is defined outside this file.
-- Only ERRORs are shown from here on; presumably this hides the WARNINGs that
-- failed connection attempts would otherwise print (TODO confirm).
SET client_min_messages TO ERROR;
-- kill the connection after authentication is ok
SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- cancel the connection after authentication is ok
SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- kill the connection after connectivity check query is sent
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- cancel the connection after connectivity check query is sent
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- kill the connection after connectivity check command is complete
SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- cancel the connection after connectivity check command is complete
SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- verify that the checks are not successful when timeouts happen on a connection
-- (the 1400 proxy delay again exceeds the 900 ms connection timeout)
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1400)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- tests for citus_check_cluster_node_health
-- The function returns one row per (from node, to node) pair. In the outputs
-- below a check that could not be run has a NULL result (printed as empty),
-- a check that ran and failed has f, and a successful check has t.
-- kill all connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 |
localhost | 9060 | localhost | 57637 |
localhost | 57637 | localhost | 9060 | t
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- suggested summary queries for connectivity checks
-- coalesce() maps NULL results to false, so bool_and() is true only if every
-- check returned true
SELECT bool_and(coalesce(result, false)) FROM citus_check_cluster_node_health();
bool_and
---------------------------------------------------------------------
f
(1 row)
-- number of checks per result value; the NULL group is listed last
SELECT result, count(*) FROM citus_check_cluster_node_health() GROUP BY result ORDER BY 1;
result | count
---------------------------------------------------------------------
t | 2
| 2
(2 rows)
-- cancel all connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
-- kill all but first connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").after(1).kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 | t
localhost | 9060 | localhost | 57637 |
localhost | 57637 | localhost | 9060 | t
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- cancel all but first connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").after(1).cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
-- kill all connections to this node
SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 |
localhost | 9060 | localhost | 57637 |
localhost | 57637 | localhost | 9060 | f
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- kill connection checks to this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 | f
localhost | 9060 | localhost | 57637 | t
localhost | 57637 | localhost | 9060 | f
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- cancel connection checks to this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
-- teardown: restore the message level, stop injecting failures and drop the
-- objects created by this test
RESET client_min_messages;
-- NOTE(review): citus.node_connection_timeout was already reset after the last
-- timeout test, so this RESET looks redundant (harmless) — confirm
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
DROP SCHEMA fail_connect CASCADE;
NOTICE: drop cascades to 3 other objects
DETAIL: drop cascades to table products
drop cascades to table r1
drop cascades to table single_replicatated
SET search_path TO default;