Fix flakiness in adaptive_executor (#6275)

Sometimes in CI our adaptive_executor test would fail randomly with the
following error:

```diff
 SELECT sum(result::bigint) FROM run_command_on_workers($$
   SELECT count(*) FROM pg_stat_activity
   WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'
 $$);
  sum
 -----
-   4
+   2
 (1 row)

 END;
```
Source: https://app.circleci.com/pipelines/github/citusdata/citus/26665/workflows/40665680-0044-4852-8fe4-5fd628f9fb47/jobs/764371

This means that the low slow start interval did not have any effect on
the number of connections being opened. I could see two possibilities
for this to happen:
1. CI was slow at actually starting the second connection. I tried to
   solve this by doubling the time a query to the worker takes.
2. The second option is that the shards were queried in the opposite
   order from what we expect. This would mean that the first query to
   the worker completes quickly: there's no sleep, because the shard
   doesn't contain any rows. I tried to solve this option by adding a
   row to each shard.

After trying to reproduce the random failure in CI it turned out that I
needed both of these fixes to resolve the random failure.

(cherry picked from commit f22a47981a)
pull/6363/head
Jelte Fennema 2022-08-30 23:23:30 +02:00
parent 8451dd3554
commit 5901b815a2
2 changed files with 38 additions and 4 deletions

View File

@ -10,8 +10,35 @@ SELECT create_distributed_table('test','x');
(1 row)
-- Add 1 row to each shard
SELECT get_shard_id_for_distribution_column('test', 1);
get_shard_id_for_distribution_column
---------------------------------------------------------------------
801009000
(1 row)
INSERT INTO test VALUES (1,2);
SELECT get_shard_id_for_distribution_column('test', 3);
get_shard_id_for_distribution_column
---------------------------------------------------------------------
801009001
(1 row)
INSERT INTO test VALUES (3,2);
SELECT get_shard_id_for_distribution_column('test', 6);
get_shard_id_for_distribution_column
---------------------------------------------------------------------
801009002
(1 row)
INSERT INTO test VALUES (8,2);
SELECT get_shard_id_for_distribution_column('test', 11);
get_shard_id_for_distribution_column
---------------------------------------------------------------------
801009003
(1 row)
INSERT INTO test VALUES (11,2);
-- Set a very high slow start to avoid opening parallel connections
SET citus.executor_slow_start_interval TO '60s';
SET citus.max_adaptive_executor_pool_size TO 2;
@ -19,7 +46,7 @@ BEGIN;
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
count
---------------------------------------------------------------------
2
4
(1 row)
SELECT sum(result::bigint) FROM run_command_on_workers($$
@ -35,10 +62,10 @@ END;
-- SELECT takes longer than slow start interval, should open multiple connections
SET citus.executor_slow_start_interval TO '10ms';
BEGIN;
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x);
count
---------------------------------------------------------------------
2
4
(1 row)
SELECT sum(result::bigint) FROM run_command_on_workers($$

View File

@ -7,8 +7,15 @@ SET citus.shard_count TO 4;
SET citus.shard_replication_factor TO 1;
SET citus.next_shard_id TO 801009000;
SELECT create_distributed_table('test','x');
-- Add 1 row to each shard
SELECT get_shard_id_for_distribution_column('test', 1);
INSERT INTO test VALUES (1,2);
SELECT get_shard_id_for_distribution_column('test', 3);
INSERT INTO test VALUES (3,2);
SELECT get_shard_id_for_distribution_column('test', 6);
INSERT INTO test VALUES (8,2);
SELECT get_shard_id_for_distribution_column('test', 11);
INSERT INTO test VALUES (11,2);
-- Set a very high slow start to avoid opening parallel connections
SET citus.executor_slow_start_interval TO '60s';
@ -26,7 +33,7 @@ END;
SET citus.executor_slow_start_interval TO '10ms';
BEGIN;
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x);
SELECT sum(result::bigint) FROM run_command_on_workers($$
SELECT count(*) FROM pg_stat_activity
WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'