Increase timeouts

fix-flaky-failure_connection_establishment5
Jelte Fennema 2022-08-23 10:35:44 +02:00
parent e8b4db4236
commit d7b2dee56e
5 changed files with 149 additions and 37 deletions

View File

@ -755,3 +755,115 @@ workflows:
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-13b_check-failure'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-1'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-2'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-3'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-4'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-5'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-6'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13b_check-failure-7'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-14b_check-failure'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-1'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-2'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-3'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-4'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-5'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-6'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14b_check-failure-7'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]

View File

@ -75,27 +75,27 @@ ORDER BY placementid;
SET citus.task_assignment_policy TO 'first-replica';
-- we will insert a connection delay here as this query was the cause for an
-- investigation into connection establishment problems
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
(1 row)
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
WARNING: could not establish connection after 400 ms
WARNING: could not establish connection after 900 ms
ERROR: connection to the remote node localhost:xxxxx failed
-- Make sure that we fall back to a working node for reads, even if it's not
-- the first choice in our task assignment policy.
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT name FROM r1 WHERE id = 2;
WARNING: could not establish any connections to the node localhost:xxxxx after 400 ms
WARNING: could not establish any connections to the node localhost:xxxxx after 900 ms
name
---------------------------------------------------------------------
bar
@ -104,15 +104,15 @@ WARNING: could not establish any connections to the node localhost:xxxxx after
-- similar test with the above but this time on a distributed table instead of
-- a reference table and with citus.force_max_query_parallelization set
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT count(*) FROM products;
WARNING: could not establish any connections to the node localhost:xxxxx after 400 ms
WARNING: could not establish any connections to the node localhost:xxxxx after 900 ms
count
---------------------------------------------------------------------
0
@ -136,15 +136,15 @@ SELECT create_distributed_table('single_replicatated', 'key');
-- this time the table is single replicated and we're still using the
-- max parallelization flag, so the query should fail
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT count(*) FROM single_replicatated;
ERROR: could not establish any connections to the node localhost:xxxxx after 400 ms
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries
-- to see that connection establishment failures could
@ -169,15 +169,15 @@ WHERE
0
(1 row)
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
(1 row)
INSERT INTO single_replicatated VALUES (100);
ERROR: could not establish any connections to the node localhost:xxxxx after 400 ms
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
COMMIT;
SELECT
count(*) as invalid_placement_count
@ -281,8 +281,8 @@ SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || p
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- verify that the checks are not successful when timeouts happen on a connection
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------

View File

@ -35,8 +35,8 @@ SET citus.max_cached_conns_per_worker to 0;
INSERT INTO failover_to_local SELECT i, i::text FROM generate_series(0,20)i;
-- even if the connection establishment fails, Citus can
-- failover to local execution
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
@ -45,7 +45,7 @@ SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.log_local_commands TO ON;
SET client_min_messages TO DEBUG1;
SELECT count(*) FROM failover_to_local;
DEBUG: could not establish any connections to the node localhost:xxxxx after 400 ms
DEBUG: could not establish any connections to the node localhost:xxxxx after 900 ms
NOTICE: executing the command locally: SELECT count(*) AS count FROM failure_failover_to_local_execution.failover_to_local_1980000 failover_to_local WHERE true
NOTICE: executing the command locally: SELECT count(*) AS count FROM failure_failover_to_local_execution.failover_to_local_1980002 failover_to_local WHERE true
count
@ -68,7 +68,7 @@ CONTEXT: while executing command on localhost:xxxxx
-- if the local execution is disabled, Citus does
-- not try to fallback to local execution
SET citus.enable_local_execution TO false;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SELECT citus.mitmproxy('conn.connect_delay(1000)');
mitmproxy
---------------------------------------------------------------------
@ -76,7 +76,7 @@ SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.log_local_commands TO ON;
SELECT count(*) FROM failover_to_local;
ERROR: could not establish any connections to the node localhost:xxxxx after 400 ms
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------

View File

@ -49,23 +49,23 @@ SET citus.task_assignment_policy TO 'first-replica';
-- we will insert a connection delay here as this query was the cause for an
-- investigation into connection establishment problems
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
-- Make sure that we fall back to a working node for reads, even if it's not
-- the first choice in our task assignment policy.
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT name FROM r1 WHERE id = 2;
-- similar test with the above but this time on a distributed table instead of
-- a reference table and with citus.force_max_query_parallelization set
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT count(*) FROM products;
RESET citus.node_connection_timeout;
@ -77,8 +77,8 @@ SELECT create_distributed_table('single_replicatated', 'key');
-- this time the table is single replicated and we're still using the
-- max parallelization flag, so the query should fail
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT count(*) FROM single_replicatated;
SET citus.force_max_query_parallelization TO OFF;
@ -96,8 +96,8 @@ FROM
WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
INSERT INTO single_replicatated VALUES (100);
COMMIT;
SELECT
@ -147,8 +147,8 @@ SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || p
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
-- verify that the checks are not successful when timeouts happen on a connection
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
-- tests for citus_check_cluster_node_health

View File

@ -22,8 +22,8 @@ INSERT INTO failover_to_local SELECT i, i::text FROM generate_series(0,20)i;
-- even if the connection establishment fails, Citus can
-- failover to local execution
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SET citus.log_local_commands TO ON;
SET client_min_messages TO DEBUG1;
SELECT count(*) FROM failover_to_local;
@ -37,7 +37,7 @@ SELECT key / 0 FROM failover_to_local;
-- if the local execution is disabled, Citus does
-- not try to fallback to local execution
SET citus.enable_local_execution TO false;
SELECT citus.mitmproxy('conn.connect_delay(500)');
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SET citus.log_local_commands TO ON;
SELECT count(*) FROM failover_to_local;
SELECT citus.mitmproxy('conn.allow()');