mirror of https://github.com/citusdata/citus.git
Fix flakiness in adaptive_executor (#6275)
Sometimes in CI our adaptive_executor test would fail randomly with the
following error:
```diff
SELECT sum(result::bigint) FROM run_command_on_workers($$
SELECT count(*) FROM pg_stat_activity
WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'
$$);
sum
-----
- 4
+ 2
(1 row)
END;
```
Source: https://app.circleci.com/pipelines/github/citusdata/citus/26665/workflows/40665680-0044-4852-8fe4-5fd628f9fb47/jobs/764371
This means that the low slow start interval did not have any effect on
the number of connections being opened. I could see two possibilities
for this to happen:
1. CI was slow in actually starting the second connection. I
tried to solve this by doubling the time a query to the worker takes.
2. The second option is that the shards were queried in the opposite
order from what we expect. This would mean that the first query to the
worker completes quickly because there's no sleep, since the shard
doesn't contain any rows. I tried to solve this option by adding a row
to each shard.
After trying to reproduce the random failure in CI it turned out that I
needed both of these fixes to resolve the random failure.
(cherry picked from commit f22a47981a
)
pull/6363/head
parent
8451dd3554
commit
5901b815a2
|
@ -10,8 +10,35 @@ SELECT create_distributed_table('test','x');
|
|||
|
||||
(1 row)
|
||||
|
||||
-- Add 1 row to each shard
|
||||
SELECT get_shard_id_for_distribution_column('test', 1);
|
||||
get_shard_id_for_distribution_column
|
||||
---------------------------------------------------------------------
|
||||
801009000
|
||||
(1 row)
|
||||
|
||||
INSERT INTO test VALUES (1,2);
|
||||
SELECT get_shard_id_for_distribution_column('test', 3);
|
||||
get_shard_id_for_distribution_column
|
||||
---------------------------------------------------------------------
|
||||
801009001
|
||||
(1 row)
|
||||
|
||||
INSERT INTO test VALUES (3,2);
|
||||
SELECT get_shard_id_for_distribution_column('test', 6);
|
||||
get_shard_id_for_distribution_column
|
||||
---------------------------------------------------------------------
|
||||
801009002
|
||||
(1 row)
|
||||
|
||||
INSERT INTO test VALUES (8,2);
|
||||
SELECT get_shard_id_for_distribution_column('test', 11);
|
||||
get_shard_id_for_distribution_column
|
||||
---------------------------------------------------------------------
|
||||
801009003
|
||||
(1 row)
|
||||
|
||||
INSERT INTO test VALUES (11,2);
|
||||
-- Set a very high slow start to avoid opening parallel connections
|
||||
SET citus.executor_slow_start_interval TO '60s';
|
||||
SET citus.max_adaptive_executor_pool_size TO 2;
|
||||
|
@ -19,7 +46,7 @@ BEGIN;
|
|||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
2
|
||||
4
|
||||
(1 row)
|
||||
|
||||
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
||||
|
@ -35,10 +62,10 @@ END;
|
|||
-- SELECT takes longer than slow start interval, should open multiple connections
|
||||
SET citus.executor_slow_start_interval TO '10ms';
|
||||
BEGIN;
|
||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x);
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
2
|
||||
4
|
||||
(1 row)
|
||||
|
||||
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
||||
|
|
|
@ -7,8 +7,15 @@ SET citus.shard_count TO 4;
|
|||
SET citus.shard_replication_factor TO 1;
|
||||
SET citus.next_shard_id TO 801009000;
|
||||
SELECT create_distributed_table('test','x');
|
||||
-- Add 1 row to each shard
|
||||
SELECT get_shard_id_for_distribution_column('test', 1);
|
||||
INSERT INTO test VALUES (1,2);
|
||||
SELECT get_shard_id_for_distribution_column('test', 3);
|
||||
INSERT INTO test VALUES (3,2);
|
||||
SELECT get_shard_id_for_distribution_column('test', 6);
|
||||
INSERT INTO test VALUES (8,2);
|
||||
SELECT get_shard_id_for_distribution_column('test', 11);
|
||||
INSERT INTO test VALUES (11,2);
|
||||
|
||||
-- Set a very high slow start to avoid opening parallel connections
|
||||
SET citus.executor_slow_start_interval TO '60s';
|
||||
|
@ -26,7 +33,7 @@ END;
|
|||
SET citus.executor_slow_start_interval TO '10ms';
|
||||
|
||||
BEGIN;
|
||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.1) FROM test) b USING (x);
|
||||
SELECT count(*) FROM test a JOIN (SELECT x, pg_sleep(0.2) FROM test) b USING (x);
|
||||
SELECT sum(result::bigint) FROM run_command_on_workers($$
|
||||
SELECT count(*) FROM pg_stat_activity
|
||||
WHERE pid <> pg_backend_pid() AND query LIKE '%8010090%'
|
||||
|
|
Loading…
Reference in New Issue