Try dump_network_traffic again

fix-flaky-failure_connection_establishment5
Jelte Fennema 2022-08-23 11:01:11 +02:00
parent d7b2dee56e
commit 3a373cb277
3 changed files with 294 additions and 21 deletions

View File

@ -867,3 +867,231 @@ workflows:
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
# Eight `test-citus` jobs for PostgreSQL 13 ("13c" series): the base job plus
# copies suffixed -1 .. -7. Every copy runs `make check-failure` on the
# citus/failtester image at the pg13 pipeline version and waits on build-13;
# the entries differ only in their `name`.
# NOTE(review): presumably duplicated so the failure suite runs several times
# per pipeline while chasing a flaky test — confirm, and drop before merging.
- test-citus:
name: 'test-13c_check-failure'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-1'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-2'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-3'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-4'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-5'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-6'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13c_check-failure-7'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
# Eight `test-citus` jobs for PostgreSQL 14 ("14c" series): the base job plus
# copies suffixed -1 .. -7. Every copy runs `make check-failure` on the
# citus/failtester image at the pg14 pipeline version and waits on build-14;
# the entries differ only in their `name`.
# NOTE(review): presumably duplicated so the failure suite runs several times
# per pipeline while chasing a flaky test — confirm, and drop before merging.
- test-citus:
name: 'test-14c_check-failure'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-1'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-2'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-3'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-4'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-5'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-6'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14c_check-failure-7'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
# Eight `test-citus` jobs for PostgreSQL 13 ("13d" series): the base job plus
# copies suffixed -1 .. -7. Every copy runs `make check-failure` on the
# citus/failtester image at the pg13 pipeline version and waits on build-13;
# the entries differ only in their `name`.
# NOTE(review): presumably duplicated so the failure suite runs several times
# per pipeline while chasing a flaky test — confirm, and drop before merging.
- test-citus:
name: 'test-13d_check-failure'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-1'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-2'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-3'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-4'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-5'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-6'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
- test-citus:
name: 'test-13d_check-failure-7'
pg_major: 13
image: citus/failtester
image_tag: '<< pipeline.parameters.pg13_version >>'
make: check-failure
requires: [build-13]
# Eight `test-citus` jobs for PostgreSQL 14 ("14d" series): the base job plus
# copies suffixed -1 .. -7. Every copy runs `make check-failure` on the
# citus/failtester image at the pg14 pipeline version and waits on build-14;
# the entries differ only in their `name`.
# NOTE(review): presumably duplicated so the failure suite runs several times
# per pipeline while chasing a flaky test — confirm, and drop before merging.
- test-citus:
name: 'test-14d_check-failure'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-1'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-2'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-3'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-4'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-5'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-6'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]
- test-citus:
name: 'test-14d_check-failure-7'
pg_major: 14
image: citus/failtester
image_tag: '<< pipeline.parameters.pg14_version >>'
make: check-failure
requires: [build-14]

View File

@ -85,6 +85,19 @@ SELECT citus.mitmproxy('conn.connect_delay(1000)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
WARNING: could not establish connection after 900 ms
ERROR: connection to the remote node localhost:xxxxx failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT citus.clear_network_traffic();
clear_network_traffic
---------------------------------------------------------------------
(1 row)
-- Make sure that we fall back to a working node for reads, even if it's not
-- the first choice in our task assignment policy.
SET citus.node_connection_timeout TO 900;
@ -94,6 +107,7 @@ SELECT citus.mitmproxy('conn.connect_delay(1000)');
(1 row)
-- tests for connectivity checks
SELECT name FROM r1 WHERE id = 2;
WARNING: could not establish any connections to the node localhost:xxxxx after 900 ms
name
@ -101,6 +115,21 @@ WARNING: could not establish any connections to the node localhost:xxxxx after
bar
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- verify a connection attempt was made to the intercepted node, this would
-- have cause the connection to have been delayed and thus caused a timeout
SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinator';
conn | source | message
---------------------------------------------------------------------
0 | coordinator | [initial message]
(1 row)
-- similar test with the above but this time on a distributed table instead of
-- a reference table and with citus.force_max_query_parallelization is set
SET citus.force_max_query_parallelization TO ON;
@ -145,10 +174,7 @@ SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT count(*) FROM single_replicatated;
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries
-- to see that connection establishement failures could
-- fail the transaction (but not mark any placements as INVALID)
RESET citus.force_max_query_parallelization;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
@ -156,6 +182,9 @@ SELECT citus.mitmproxy('conn.allow()');
(1 row)
-- one similar test, and this time on modification queries
-- to see that connection establishement failures could
-- fail the transaction (but not mark any placements as INVALID)
BEGIN;
SELECT
count(*) as invalid_placement_count
@ -179,6 +208,13 @@ SELECT citus.mitmproxy('conn.connect_delay(1000)');
INSERT INTO single_replicatated VALUES (100);
ERROR: could not establish any connections to the node localhost:xxxxx after 900 ms
COMMIT;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT
count(*) as invalid_placement_count
FROM
@ -191,14 +227,6 @@ WHERE
0
(1 row)
-- show that INSERT failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT count(*) FROM single_replicatated WHERE key = 100;
count
---------------------------------------------------------------------
@ -294,6 +322,13 @@ SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
f
(1 row)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
-- tests for citus_check_cluster_node_health
-- kill all connectivity checks that originate from this node
SELECT citus.mitmproxy('conn.onQuery(query="^SELECT citus_check_connection_to_node").kill()');

View File

@ -51,15 +51,24 @@ SET citus.task_assignment_policy TO 'first-replica';
-- investigation into connection establishment problems
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
ALTER TABLE products ADD CONSTRAINT p_key PRIMARY KEY(product_no);
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SELECT citus.clear_network_traffic();
-- Make sure that we fall back to a working node for reads, even if it's not
-- the first choice in our task assignment policy.
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
-- tests for connectivity checks
SELECT name FROM r1 WHERE id = 2;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
-- verify a connection attempt was made to the intercepted node, this would
-- have cause the connection to have been delayed and thus caused a timeout
SELECT * FROM citus.dump_network_traffic() WHERE conn=0 AND source = 'coordinator';
-- similar test with the above but this time on a distributed table instead of
-- a reference table and with citus.force_max_query_parallelization is set
@ -67,9 +76,9 @@ SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT count(*) FROM products;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SET citus.shard_replication_factor TO 1;
CREATE TABLE single_replicatated(key int);
SELECT create_distributed_table('single_replicatated', 'key');
@ -80,14 +89,14 @@ SET citus.force_max_query_parallelization TO ON;
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT count(*) FROM single_replicatated;
RESET citus.force_max_query_parallelization;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SET citus.force_max_query_parallelization TO OFF;
-- one similar test, and this time on modification queries
-- to see that connection establishement failures could
-- fail the transaction (but not mark any placements as INVALID)
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
BEGIN;
SELECT
count(*) as invalid_placement_count
@ -100,6 +109,8 @@ SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
INSERT INTO single_replicatated VALUES (100);
COMMIT;
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SELECT
count(*) as invalid_placement_count
FROM
@ -108,9 +119,6 @@ WHERE
shardstate = 3 AND
shardid IN (SELECT shardid from pg_dist_shard where logicalrelid = 'single_replicatated'::regclass);
-- show that INSERT failed
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
SELECT count(*) FROM single_replicatated WHERE key = 100;
@ -150,6 +158,8 @@ SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
SET citus.node_connection_timeout TO 900;
SELECT citus.mitmproxy('conn.connect_delay(1000)');
SELECT * FROM citus_check_connection_to_node('localhost', :worker_2_proxy_port);
RESET citus.node_connection_timeout;
SELECT citus.mitmproxy('conn.allow()');
-- tests for citus_check_cluster_node_health