From 5901b815a267b95005587ef3f22aec612c83d557 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Tue, 30 Aug 2022 23:23:30 +0200 Subject: [PATCH] Fix flakiness in adaptive_executor (#6275) Sometimes in CI our adaptive_executor test would fail randomly with the following error: ```diff SELECT sum(result::bigint) FROM run_command_on_workers($$ SELECT count(*) FROM pg_stat_activity WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%' $$); sum ----- - 4 + 2 (1 row) END; ``` Source: https://app.circleci.com/pipelines/github/citusdata/citus/26665/workflows/40665680-0044-4852-8fe4-5fd628f9fb47/jobs/764371 This means that the low slow start interval did not have any effect on the number of connections being opened. I could see two possibilities for this to happen: 1. CI was slow in actually starting the second connection. I tried to solve this by doubling the time a query to the worker takes. 2. The second option is that the shards were queried in the opposite order from what we expect. This would mean that the first query to the worker completes quickly, because there's no sleep when the shard doesn't contain any rows. I tried to solve this option by adding a row to each shard. After trying to reproduce the random failure in CI it turned out that I needed both of these fixes to resolve the random failure. 
(cherry picked from commit f22a47981ad7e69d76aab6d3471f385e8ca87cc9) --- .../regress/expected/adaptive_executor.out | 33 +++++++++++++++++-- src/test/regress/sql/adaptive_executor.sql | 9 ++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/test/regress/expected/adaptive_executor.out b/src/test/regress/expected/adaptive_executor.out index 73ba772b5..aeaa553f2 100644 --- a/src/test/regress/expected/adaptive_executor.out +++ b/src/test/regress/expected/adaptive_executor.out @@ -10,8 +10,35 @@ SELECT create_distributed_table('test','x'); (1 row) +-- Add 1 row to each shard +SELECT get_shard_id_for_distribution_column('test', 1); + get_shard_id_for_distribution_column +--------------------------------------------------------------------- + 801009000 +(1 row) + INSERT INTO test VALUES (1,2); +SELECT get_shard_id_for_distribution_column('test', 3); + get_shard_id_for_distribution_column +--------------------------------------------------------------------- + 801009001 +(1 row) + INSERT INTO test VALUES (3,2); +SELECT get_shard_id_for_distribution_column('test', 6); + get_shard_id_for_distribution_column +--------------------------------------------------------------------- + 801009002 +(1 row) + +INSERT INTO test VALUES (6,2); +SELECT get_shard_id_for_distribution_column('test', 11); + get_shard_id_for_distribution_column +--------------------------------------------------------------------- + 801009003 +(1 row) + +INSERT INTO test VALUES (11,2); -- Set a very high slow start to avoid opening parallel connections SET citus.executor_slow_start_interval TO '60s'; SET citus.max_adaptive_executor_pool_size TO 2; @@ -19,7 +46,7 @@ BEGIN; SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x); count --------------------------------------------------------------------- - 2 + 4 (1 row) SELECT sum(result::bigint) FROM run_command_on_workers($$ @@ -35,10 +62,10 @@ END; -- SELECT takes longer than slow start interval, should open 
multiple connections SET citus.executor_slow_start_interval TO '10ms'; BEGIN; -SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x); +SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x); count --------------------------------------------------------------------- - 2 + 4 (1 row) SELECT sum(result::bigint) FROM run_command_on_workers($$ diff --git a/src/test/regress/sql/adaptive_executor.sql b/src/test/regress/sql/adaptive_executor.sql index a744437f1..f7d6c6f1e 100644 --- a/src/test/regress/sql/adaptive_executor.sql +++ b/src/test/regress/sql/adaptive_executor.sql @@ -7,8 +7,15 @@ SET citus.shard_count TO 4; SET citus.shard_replication_factor TO 1; SET citus.next_shard_id TO 801009000; SELECT create_distributed_table('test','x'); +-- Add 1 row to each shard +SELECT get_shard_id_for_distribution_column('test', 1); INSERT INTO test VALUES (1,2); +SELECT get_shard_id_for_distribution_column('test', 3); INSERT INTO test VALUES (3,2); +SELECT get_shard_id_for_distribution_column('test', 6); +INSERT INTO test VALUES (6,2); +SELECT get_shard_id_for_distribution_column('test', 11); +INSERT INTO test VALUES (11,2); -- Set a very high slow start to avoid opening parallel connections SET citus.executor_slow_start_interval TO '60s'; @@ -26,7 +33,7 @@ END; SET citus.executor_slow_start_interval TO '10ms'; BEGIN; -SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x); +SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x); SELECT sum(result::bigint) FROM run_command_on_workers($$ SELECT count(*) FROM pg_stat_activity WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'