RESET connection_timeout all the time

fix-flaky-failure_connection_establishment5
Jelte Fennema 2022-08-23 09:26:00 +02:00
parent ba1a67e8c3
commit a6ae5756ef
4 changed files with 146 additions and 6 deletions

View File

@ -642,3 +642,116 @@ workflows:
image_tag: '<< pipeline.parameters.pg14_version >>' image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure make: check-failure
requires: [build-14] requires: [build-14]
- test-citus:
name: 'test-13a_check-failure'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-1'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-2'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-3'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-4'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-5'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-6'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13a_check-failure-7'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-14a_check-failure'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-1'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-2'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-3'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-4'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-5'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-6'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14a_check-failure-7'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]

View File

@ -46,6 +46,7 @@ SELECT citus.mitmproxy('conn.delay(500)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no); ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
WARNING: could not establish connection after 400 ms WARNING: could not establish connection after 400 ms
ERROR: connection to the remote node localhost:xxxxx failed ERROR: connection to the remote node localhost:xxxxx failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -93,6 +94,7 @@ SELECT citus.clear_network_traffic();
(1 row) (1 row)
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -116,6 +118,7 @@ SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinato
0 | coordinator | [initial message] 0 | coordinator | [initial message]
(1 row) (1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -126,6 +129,7 @@ SELECT citus.mitmproxy('conn.allow()');
-- distributed table instead of a reference table -- distributed table instead of a reference table
-- and with citus.force_max_query_parallelization set -- and with citus.force_max_query_parallelization set
SET citus.force_max_query_parallelization TO ON; SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -139,6 +143,7 @@ WARNING: could not establish any connections to the node localhost:xxxxx after
0 0
(1 row) (1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -147,7 +152,6 @@ SELECT citus.mitmproxy('conn.allow()');
SET citus.shard_replication_factor TO 1; SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int); CREATE TABLE single_replicatated(key int);
RESET citus.node_connection_timeout; -- speed up test and make it less flaky in CI
SELECT create_distributed_table('single_replicatated', 'key'); SELECT create_distributed_table('single_replicatated', 'key');
create_distributed_table create_distributed_table
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -156,8 +160,8 @@ SELECT create_distributed_table('single_replicatated', 'key');
-- this time the table is single replicated and we're still using the -- this time the table is single replicated and we're still using the
-- max parallelization flag, so the query should fail -- max parallelization flag, so the query should fail
SET citus.node_connection_timeout TO 400;
SET citus.force_max_query_parallelization TO ON; SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -170,6 +174,7 @@ SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries -- one similar test, and this time on modification queries
-- to see that connection establishment failures could -- to see that connection establishment failures could
-- fail the transaction (but not mark any placements as INVALID) -- fail the transaction (but not mark any placements as INVALID)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -189,6 +194,7 @@ WHERE
0 0
(1 row) (1 row)
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -211,6 +217,7 @@ WHERE
(1 row) (1 row)
-- show that INSERT failed -- show that INSERT failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -299,6 +306,7 @@ SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || p
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port); SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
ERROR: canceling statement due to user request ERROR: canceling statement due to user request
-- verify that the checks are not successful when timeouts happen on a connection -- verify that the checks are not successful when timeouts happen on a connection
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
@ -427,13 +435,13 @@ SELECT citus.mitmproxy('conn.onQuery(query="^SELECT 1$").cancel(' || pg_backend_
SELECT * FROM citus_check_cluster_node_health(); SELECT * FROM citus_check_cluster_node_health();
ERROR: canceling statement due to user request ERROR: canceling statement due to user request
RESET client_min_messages; RESET client_min_messages;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
mitmproxy mitmproxy
--------------------------------------------------------------------- ---------------------------------------------------------------------
(1 row) (1 row)
SET citus.node_connection_timeout TO DEFAULT;
DROP SCHEMA fail_connect CASCADE; DROP SCHEMA fail_connect CASCADE;
NOTICE: drop cascades to 3 other objects NOTICE: drop cascades to 3 other objects
DETAIL: drop cascades to table products DETAIL: drop cascades to table products

View File

@ -28,3 +28,14 @@ test: failure_connection_establishment
test: failure_connection_establishment test: failure_connection_establishment
test: failure_connection_establishment test: failure_connection_establishment
test: failure_connection_establishment test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment
test: failure_connection_establishment

View File

@ -36,6 +36,7 @@ SELECT citus.mitmproxy('conn.delay(500)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no); ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
CREATE TABLE r1 ( CREATE TABLE r1 (
@ -55,6 +56,7 @@ ORDER BY placementid;
SELECT citus.clear_network_traffic(); SELECT citus.clear_network_traffic();
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
SET citus.task_assignment_policy TO 'round-robin'; SET citus.task_assignment_policy TO 'round-robin';
@ -65,25 +67,27 @@ SELECT name FROM r1 WHERE id = 2;
-- connection to have been delayed and thus caused a timeout -- connection to have been delayed and thus caused a timeout
SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinator'; SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinator';
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
-- similar test with the above but this time on a -- similar test with the above but this time on a
-- distributed table instead of a reference table -- distributed table instead of a reference table
-- and with citus.force_max_query_parallelization set -- and with citus.force_max_query_parallelization set
SET citus.force_max_query_parallelization TO ON; SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
SELECT count(*) FROM products; SELECT count(*) FROM products;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
SET citus.shard_replication_factor TO 1; SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int); CREATE TABLE single_replicatated(key int);
RESET citus.node_connection_timeout; -- speed up test and make it less flaky in CI
SELECT create_distributed_table('single_replicatated', 'key'); SELECT create_distributed_table('single_replicatated', 'key');
-- this time the table is single replicated and we're still using the -- this time the table is single replicated and we're still using the
-- max parallelization flag, so the query should fail -- max parallelization flag, so the query should fail
SET citus.node_connection_timeout TO 400;
SET citus.force_max_query_parallelization TO ON; SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
SELECT count(*) FROM single_replicatated; SELECT count(*) FROM single_replicatated;
@ -92,6 +96,7 @@ SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries -- one similar test, and this time on modification queries
-- to see that connection establishment failures could -- to see that connection establishment failures could
-- fail the transaction (but not mark any placements as INVALID) -- fail the transaction (but not mark any placements as INVALID)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
BEGIN; BEGIN;
SELECT SELECT
@ -101,6 +106,7 @@ FROM
WHERE WHERE
shardstate = 3 AND shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass); shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
INSERT INTO single_replicatated VALUES (100); INSERT INTO single_replicatated VALUES (100);
COMMIT; COMMIT;
@ -113,6 +119,7 @@ WHERE
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass); shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
-- show that INSERT failed -- show that INSERT failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
SELECT count(*) FROM single_replicatated WHERE key = 100; SELECT count(*) FROM single_replicatated WHERE key = 100;
@ -150,6 +157,7 @@ SELECT citus.mitmproxy('conn.onCommandComplete(command="SELECT 1").cancel(' || p
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port); SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
-- verify that the checks are not successful when timeouts happen on a connection -- verify that the checks are not successful when timeouts happen on a connection
SET citus.node_connection_timeout TO 400;
SELECT citus.mitmproxy('conn.delay(500)'); SELECT citus.mitmproxy('conn.delay(500)');
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port); SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
@ -193,7 +201,7 @@ SELECT * FROM citus_check_cluster_node_health();
RESET client_min_messages; RESET client_min_messages;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()'); SELECT citus.mitmproxy('conn.allow()');
SET citus.node_connection_timeout TO DEFAULT;
DROP SCHEMA fail_connect CASCADE; DROP SCHEMA fail_connect CASCADE;
SET search_path TO default; SET search_path TO default;