From 905fd46410544c98aac2be92d188c52cc0bb2802 Mon Sep 17 00:00:00 2001
From: Naisila Puka <37271756+naisila@users.noreply.github.com>
Date: Fri, 5 May 2023 16:47:01 +0300
Subject: [PATCH] Fixes flakiness in background_rebalance_parallel test (#6910)

Fixes the following flaky outputs by decreasing citus_task_wait loop
interval, and changing the order of wait commands.

https://app.circleci.com/pipelines/github/citusdata/citus/32102/workflows/19958297-6c7e-49ef-9bc2-8efe8aacb96f/jobs/1089589
``` diff
 SELECT job_id, task_id, status, nodes_involved
 FROM pg_dist_background_task WHERE job_id in (:job_id) ORDER BY task_id;
  job_id | task_id |  status  | nodes_involved
 --------+---------+----------+----------------
   17779 |    1013 | done     | {50,56}
   17779 |    1014 | running  | {50,57}
-  17779 |    1015 | running  | {50,56}
-  17779 |    1016 | blocked  | {50,57}
+  17779 |    1015 | done     | {50,56}
+  17779 |    1016 | running  | {50,57}
   17779 |    1017 | runnable | {50,56}
   17779 |    1018 | blocked  | {50,57}
   17779 |    1019 | runnable | {50,56}
   17779 |    1020 | blocked  | {50,57}
 (8 rows)
```

https://github.com/citusdata/citus/pull/6893#issuecomment-1525661408
```diff
 SELECT job_id, task_id, status, nodes_involved
 FROM pg_dist_background_task WHERE job_id in (:job_id) ORDER BY task_id;
  job_id | task_id |  status  | nodes_involved
 --------+---------+----------+----------------
   17779 |    1013 | done     | {50,56}
-  17779 |    1014 | running  | {50,57}
+  17779 |    1014 | runnable | {50,57}
   17779 |    1015 | running  | {50,56}
   17779 |    1016 | blocked  | {50,57}
   17779 |    1017 | runnable | {50,56}
   17779 |    1018 | blocked  | {50,57}
   17779 |    1019 | runnable | {50,56}
   17779 |    1020 | blocked  | {50,57}
 (8 rows)
```
---
 src/backend/distributed/utils/background_jobs.c        |  2 +-
 .../regress/expected/background_rebalance_parallel.out | 10 ++++++++--
 src/test/regress/sql/background_rebalance_parallel.sql |  4 +++-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/backend/distributed/utils/background_jobs.c
b/src/backend/distributed/utils/background_jobs.c
index 789732d21..84ef4229f 100644
--- a/src/backend/distributed/utils/background_jobs.c
+++ b/src/backend/distributed/utils/background_jobs.c
@@ -395,7 +395,7 @@ citus_task_wait_internal(int64 taskid, BackgroundTaskStatus *desiredStatus)
 
 		/* sleep for a while, before rechecking the task status */
 		CHECK_FOR_INTERRUPTS();
-		const long delay_ms = 1000;
+		const long delay_ms = 100;
 		(void) WaitLatch(MyLatch,
 						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
 						 delay_ms,
diff --git a/src/test/regress/expected/background_rebalance_parallel.out b/src/test/regress/expected/background_rebalance_parallel.out
index 187f709e4..dbdd963a9 100644
--- a/src/test/regress/expected/background_rebalance_parallel.out
+++ b/src/test/regress/expected/background_rebalance_parallel.out
@@ -513,6 +513,12 @@ FROM pg_dist_background_task WHERE job_id in (:job_id) ORDER BY task_id;
 (8 rows)
 
 -- increase citus.max_background_task_executors_per_node
+SELECT citus_task_wait(1013, desired_status => 'done');
+ citus_task_wait
+---------------------------------------------------------------------
+
+(1 row)
+
 ALTER SYSTEM SET citus.max_background_task_executors_per_node = 2;
 SELECT pg_reload_conf();
  pg_reload_conf
@@ -520,13 +526,13 @@ SELECT pg_reload_conf();
  t
 (1 row)
 
-SELECT citus_task_wait(1015, desired_status => 'running');
+SELECT citus_task_wait(1014, desired_status => 'running');
  citus_task_wait
 ---------------------------------------------------------------------
 
 (1 row)
 
-SELECT citus_task_wait(1013, desired_status => 'done');
+SELECT citus_task_wait(1015, desired_status => 'running');
  citus_task_wait
 ---------------------------------------------------------------------
 
diff --git a/src/test/regress/sql/background_rebalance_parallel.sql b/src/test/regress/sql/background_rebalance_parallel.sql
index e55fd93bb..2eb952b67 100644
--- a/src/test/regress/sql/background_rebalance_parallel.sql
+++ b/src/test/regress/sql/background_rebalance_parallel.sql
@@
-221,10 +221,12 @@ SELECT job_id, task_id, status, nodes_involved
 FROM pg_dist_background_task WHERE job_id in (:job_id) ORDER BY task_id;
 
 -- increase citus.max_background_task_executors_per_node
+SELECT citus_task_wait(1013, desired_status => 'done');
 ALTER SYSTEM SET citus.max_background_task_executors_per_node = 2;
 SELECT pg_reload_conf();
+
+SELECT citus_task_wait(1014, desired_status => 'running');
 SELECT citus_task_wait(1015, desired_status => 'running');
-SELECT citus_task_wait(1013, desired_status => 'done');
 
 -- show that at most 2 tasks per node are running
 -- among the tasks that are not blocked