mirror of https://github.com/citusdata/citus.git
Fix flakiness in adaptive_executor (#6275)
Sometimes in CI our adaptive_executor test would fail randomly with the following error: ```diff SELECT sum(result::bigint) FROM run_command_on_workers($$ SELECT count(*) FROM pg_stat_activity WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%' $$); sum ----- - 4 + 2 (1 row) END; ``` Source: https://app.circleci.com/pipelines/github/citusdata/citus/26665/workflows/40665680-0044-4852-8fe4-5fd628f9fb47/jobs/764371 This means that the low slow start interval did not have any effect on the number of connections being opened. I could see two possibilities for this to happen: 1. CI was slow in actually starting the second connection. I tried to solve this by doubling the time a query to the worker takes. 2. The second option is that the shards were queried in the opposite order from what we expect. This would mean that the first query to the worker completes quickly, without any sleep, because the shard doesn't contain any rows. I tried to solve this option by adding a row to each shard. After trying to reproduce the random failure in CI it turned out that I needed both of these fixes to resolve the random failure. pull/6272/head
parent
8354853dec
commit
f22a47981a
|
@ -10,8 +10,35 @@ SELECT create_distributed_table('test','x');
|
||||||
|
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
-- Add 1 row to each shard
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 1);
|
||||||
|
get_shard_id_for_distribution_column
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
801009000
|
||||||
|
(1 row)
|
||||||
|
|
||||||
INSERT INTO test VALUES (1,2);
|
INSERT INTO test VALUES (1,2);
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 3);
|
||||||
|
get_shard_id_for_distribution_column
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
801009001
|
||||||
|
(1 row)
|
||||||
|
|
||||||
INSERT INTO test VALUES (3,2);
|
INSERT INTO test VALUES (3,2);
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 6);
|
||||||
|
get_shard_id_for_distribution_column
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
801009002
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
INSERT INTO test VALUES (8,2);
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 11);
|
||||||
|
get_shard_id_for_distribution_column
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
801009003
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
INSERT INTO test VALUES (11,2);
|
||||||
-- Set a very high slow start to avoid opening parallel connections
|
-- Set a very high slow start to avoid opening parallel connections
|
||||||
SET citus.executor_slow_start_interval TO '60s';
|
SET citus.executor_slow_start_interval TO '60s';
|
||||||
SET citus.max_adaptive_executor_pool_size TO 2;
|
SET citus.max_adaptive_executor_pool_size TO 2;
|
||||||
|
@ -19,7 +46,7 @@ BEGIN;
|
||||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
||||||
count
|
count
|
||||||
---------------------------------------------------------------------
|
---------------------------------------------------------------------
|
||||||
2
|
4
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
||||||
|
@ -35,10 +62,10 @@ END;
|
||||||
-- SELECT takes longer than slow start interval, should open multiple connections
|
-- SELECT takes longer than slow start interval, should open multiple connections
|
||||||
SET citus.executor_slow_start_interval TO '10ms';
|
SET citus.executor_slow_start_interval TO '10ms';
|
||||||
BEGIN;
|
BEGIN;
|
||||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x);
|
||||||
count
|
count
|
||||||
---------------------------------------------------------------------
|
---------------------------------------------------------------------
|
||||||
2
|
4
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
||||||
|
|
|
@ -7,8 +7,15 @@ SET citus.shard_count TO 4;
|
||||||
SET citus.shard_replication_factor TO 1;
|
SET citus.shard_replication_factor TO 1;
|
||||||
SET citus.next_shard_id TO 801009000;
|
SET citus.next_shard_id TO 801009000;
|
||||||
SELECT create_distributed_table('test','x');
|
SELECT create_distributed_table('test','x');
|
||||||
|
-- Add 1 row to each shard
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 1);
|
||||||
INSERT INTO test VALUES (1,2);
|
INSERT INTO test VALUES (1,2);
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 3);
|
||||||
INSERT INTO test VALUES (3,2);
|
INSERT INTO test VALUES (3,2);
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 6);
|
||||||
|
INSERT INTO test VALUES (8,2);
|
||||||
|
SELECT get_shard_id_for_distribution_column('test', 11);
|
||||||
|
INSERT INTO test VALUES (11,2);
|
||||||
|
|
||||||
-- Set a very high slow start to avoid opening parallel connections
|
-- Set a very high slow start to avoid opening parallel connections
|
||||||
SET citus.executor_slow_start_interval TO '60s';
|
SET citus.executor_slow_start_interval TO '60s';
|
||||||
|
@ -26,7 +33,7 @@ END;
|
||||||
SET citus.executor_slow_start_interval TO '10ms';
|
SET citus.executor_slow_start_interval TO '10ms';
|
||||||
|
|
||||||
BEGIN;
|
BEGIN;
|
||||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x);
|
||||||
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
||||||
SELECT count(*) FROM pg_stat_activity
|
SELECT count(*) FROM pg_stat_activity
|
||||||
WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'
|
WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'
|
||||||
|
|
Loading…
Reference in New Issue