citus/src/test/regress/expected/failure_connection_establis...

443 lines
15 KiB
Plaintext

--
-- failure_connection_establishment.sql tests some behaviour of connection management when
-- it fails to connect.
--
-- Failure cases covered:
-- - timeout
--
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
CREATE SCHEMA fail_connect;
SET search_path TO 'fail_connect';
SET citus.shard_count TO 4;
SET citus.max_cached_conns_per_worker TO 0;
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1450000;
ALTER SEQUENCE pg_catalog.pg_dist_placement_placementid_seq RESTART 1450000;
CREATE TABLE products (
product_no integer,
name text,
price numeric
);
SELECT create_distributed_table('products', 'product_no');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- Can only add primary key constraint on distribution column (or group of columns
-- including distribution column)
-- Command below should error out since 'name' is not a distribution column
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(name);
ERROR: cannot create constraint on "products"
DETAIL: Distributed relations cannot have UNIQUE, EXCLUDE, or PRIMARY KEY constraints that do not include the partition column (with an equality operator if EXCLUDE).
-- we will insert a connection delay here as this query was the cause for an investigation
-- into connection establishment problems
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
(1 row)
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
WARNING: could not establish connection after 400 ms
ERROR: connection to the remote node localhost:xxxxx failed
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
CREATE TABLE r1 (
id int PRIMARY KEY,
name text
);
INSERT INTO r1 (id, name) VALUES
(1,'foo'),
(2,'bar'),
(3,'baz');
SELECT create_reference_table('r1');
NOTICE: Copying data from local table...
NOTICE: copying the data has completed
DETAIL: The local data in the table is no longer visible, but is still on disk.
HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$fail_connect.r1$$)
create_reference_table
---------------------------------------------------------------------
(1 row)
SELECT citus.clear_network_traffic();
clear_network_traffic
---------------------------------------------------------------------
(1 row)
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- we cannot control which replica of the reference table will be queried and there is
-- only one specific client we can control the connection for.
-- by using round-robin task_assignment_policy we can force to hit both machines.
-- and in the end, dumping the network traffic shows that the connection establishment
-- is initiated to the node behind the proxy
SET client_min_messages TO ERROR;
SET citus.task_assignment_policy TO 'round-robin';
-- suppress the warning since we can't control which shard is chose first. Failure of this
-- test would be if one of the queries does not return the result but an error.
SELECT name FROM r1 WHERE id = 2;
name
---------------------------------------------------------------------
bar
(1 row)
SELECT name FROM r1 WHERE id = 2;
name
---------------------------------------------------------------------
bar
(1 row)
-- verify a connection attempt was made to the intercepted node, this would have cause the
-- connection to have been delayed and thus caused a timeout
SELECT * FROM citus.dump_network_traffic() WHERE conn=0;
conn | source | message
---------------------------------------------------------------------
0 | coordinator | [initial message]
(1 row)
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- similar test with the above but this time on a
-- distributed table instead of a reference table
-- and with citus.force_max_query_parallelization is set
SET citus.force_max_query_parallelization TO ON;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- suppress the warning since we can't control which shard is chose first. Failure of this
-- test would be if one of the queries does not return the result but an error.
SELECT count(*) FROM products;
count
---------------------------------------------------------------------
0
(1 row)
SELECT count(*) FROM products;
count
---------------------------------------------------------------------
0
(1 row)
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int);
SELECT create_distributed_table('single_replicatated', 'key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- this time the table is single replicated and we're still using the
-- the max parallelization flag, so the query should fail
SET citus.force_max_query_parallelization TO ON;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT count(*) FROM single_replicatated;
ERROR: could not establish any connections to the node localhost:xxxxx after 400 ms
SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries
-- to see that connection establishement failures could
-- fail the transaction (but not mark any placements as INVALID)
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
BEGIN;
SELECT
count(*) as invalid_placement_count
FROM
pg_dist_shard_placement
WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
invalid_placement_count
---------------------------------------------------------------------
0
(1 row)
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
(1 row)
INSERT INTO single_replicatated VALUES (100);
ERROR: could not establish any connections to the node localhost:xxxxx after 400 ms
COMMIT;
SELECT
count(*) as invalid_placement_count
FROM
pg_dist_shard_placement
WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
invalid_placement_count
---------------------------------------------------------------------
0
(1 row)
-- show that INSERT failed
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT count(*) FROM single_replicatated WHERE key = 100;
count
---------------------------------------------------------------------
0
(1 row)
RESET client_min_messages;
-- verify get_global_active_transactions works when a timeout happens on a connection
SELECT * FROM get_global_active_transactions() WHERE transaction_number != 0;
datid | process_id | initiator_node_identifier | worker_query | transaction_number | transaction_stamp | global_pid
---------------------------------------------------------------------
(0 rows)
-- tests for connectivity checks
SET client_min_messages TO ERROR;
-- kill the connection after authentication is ok
SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- cancel the connection after authentication is ok
SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- kill the connection after connectivity check query is sent
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- cancel the connection after connectivity check query is sent
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- kill the connection after connectivity check command is complete
SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- cancel the connection after connectivity check command is complete
SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- verify that the checks are not successful when timeouts happen on a connection
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
citus_check_connection_to_node
---------------------------------------------------------------------
f
(1 row)
-- tests for citus_check_cluster_node_health
-- kill all connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 |
localhost | 9060 | localhost | 57637 |
localhost | 57637 | localhost | 9060 | t
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- suggested summary queries for connectivity checks
SELECT bool_and(coalesce(result, false)) FROM citus_check_cluster_node_health();
bool_and
---------------------------------------------------------------------
f
(1 row)
SELECT result, count(*) FROM citus_check_cluster_node_health() GROUP BY result ORDER BY 1;
result | count
---------------------------------------------------------------------
t | 2
| 2
(2 rows)
-- cancel all connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
-- kill all but first connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").after(1).kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 | t
localhost | 9060 | localhost | 57637 |
localhost | 57637 | localhost | 9060 | t
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- cancel all but first connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").after(1).cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
-- kill all connections to this node
SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 |
localhost | 9060 | localhost | 57637 |
localhost | 57637 | localhost | 9060 | f
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- cancel all connections to this node
SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
-- kill connection checks to this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
from_nodename | from_nodeport | to_nodename | to_nodeport | result
---------------------------------------------------------------------
localhost | 9060 | localhost | 9060 | f
localhost | 9060 | localhost | 57637 | t
localhost | 57637 | localhost | 9060 | f
localhost | 57637 | localhost | 57637 | t
(4 rows)
-- cancel connection checks to this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
RESET client_min_messages;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET citus.node_connection_timeout TO DEFAULT;
DROP SCHEMA fail_connect CASCADE;
NOTICE: drop cascades to 3 other objects
DETAIL: drop cascades to table products
drop cascades to table r1
drop cascades to table single_replicatated
SET search_path TO default;