RESET connection_timeout all the time

fix-flaky-failure_connection_establishment5
Jelte Fennema 2022-08-23 09:26:00 +02:00
parent ba1a67e8c3
commit a6ae5756ef
4 changed files with 146 additions and 6 deletions

View File

@ -642,3 +642,116 @@ workflows:
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-13a_check-failure'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-1'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-2'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-3'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-4'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-5'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-6'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-7'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-14a_check-failure'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-1'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-2'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-3'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-4'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-5'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-6'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-7'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]

View File

@ -46,6 +46,7 @@ SELECT citus.mitmproxy('conn.delay(500)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
WARNING: could not establish connection after 400 ms
ERROR: connection to the remote node localhost:xxxxx failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
@ -93,6 +94,7 @@ SELECT citus.clear_network_traffic();
(1 row)
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
@ -116,6 +118,7 @@ SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinato
0 | coordinator | [initial message]
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
@ -126,6 +129,7 @@ SELECT citus.mitmproxy('conn.allow()');
-- distributed table instead of a reference table
-- and with citus.force_max_query_parallelization is set
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
@ -139,6 +143,7 @@ WARNING: could not establish any connections to the node localhost:xxxxx after
0
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
@ -147,7 +152,6 @@ SELECT citus.mitmproxy('conn.allow()');
SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int);
RESET citus.node_connection_timeout; -- speed up test and make it less flaky in CI
SELECT create_distributed_table('single_replicatated', 'key');
create_distributed_table
---------------------------------------------------------------------
@ -156,8 +160,8 @@ SELECT create_distributed_table('single_replicatated', 'key');
-- this time the table is single replicated and we're still using
-- the max parallelization flag, so the query should fail
SET citus.node_connection_timeout TO 400;
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
@ -170,6 +174,7 @@ SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries
-- to see that connection establishment failures could
-- fail the transaction (but not mark any placements as INVALID)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
@ -189,6 +194,7 @@ WHERE
0
(1 row)
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
@ -211,6 +217,7 @@ WHERE
(1 row)
-- show that INSERT failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
@ -299,6 +306,7 @@ SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || p
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request
-- verify that the checks are not successful when timeouts happen on a connection
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy
---------------------------------------------------------------------
@ -427,13 +435,13 @@ SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").cancel(' || pg_backend_
SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request
RESET client_min_messages;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET citus.node_connection_timeout TO DEFAULT;
DROP SCHEMA fail_connect CASCADE;
NOTICE: drop cascades to 3 other objects
DETAIL: drop cascades to table products

View File

@ -28,3 +28,14 @@ test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment

View File

@ -36,6 +36,7 @@ SELECT citus.mitmproxy('conn.delay(500)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
CREATE TABLE r1 (
@ -55,6 +56,7 @@ ORDER BY placementid;
SELECT citus.clear_network_traffic();
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
SET citus.task_assignment_policy TO 'round-robin';
@ -65,25 +67,27 @@ SELECT name FROM r1 WHERE id = 2;
-- connection to have been delayed and thus caused a timeout
SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinator';
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
-- similar test with the above but this time on a
-- distributed table instead of a reference table
-- and with citus.force_max_query_parallelization is set
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
SELECT count(*) FROM products;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int);
RESET citus.node_connection_timeout; -- speed up test and make it less flaky in CI
SELECT create_distributed_table('single_replicatated', 'key');
-- this time the table is single replicated and we're still using
-- the max parallelization flag, so the query should fail
SET citus.node_connection_timeout TO 400;
SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
SELECT count(*) FROM single_replicatated;
@ -92,6 +96,7 @@ SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries
-- to see that connection establishment failures could
-- fail the transaction (but not mark any placements as INVALID)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
BEGIN;
SELECT
@ -101,6 +106,7 @@ FROM
WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
INSERT INTO single_replicatated VALUES (100);
COMMIT;
@ -113,6 +119,7 @@ WHERE
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
-- show that INSERT failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SELECT count(*) FROM single_replicatated WHERE key = 100;
@ -150,6 +157,7 @@ SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || p
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
-- verify that the checks are not successful when timeouts happen on a connection
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)');
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
@ -193,7 +201,7 @@ SELECT * FROM citus_check_cluster_node_health();
RESET client_min_messages;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SET citus.node_connection_timeout TO DEFAULT;
DROP SCHEMA fail_connect CASCADE;
SET search_path TO default;