From 47ff03123bd8c1df3d028be10ecbe9a0559f48d9 Mon Sep 17 00:00:00 2001 From: Hanefi Onaldi Date: Tue, 31 Jan 2023 15:26:52 +0300 Subject: [PATCH] Improve rebalance reporting for retried tasks (#6683) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, if there was a problem with an ongoing rebalance, we did not show details on background tasks that were stuck in the runnable state. Similar to how we show details for errored tasks, we now show details on tasks that are being retried. Earlier we showed the following output when a task was stuck: ``` ┌────────────────────────────┐ │ { ↵│ │ "tasks": [ ↵│ │ ], ↵│ │ "task_state_counts": {↵│ │ "done": 13, ↵│ │ "blocked": 2, ↵│ │ "runnable": 1 ↵│ │ } ↵│ │ } │ └────────────────────────────┘ ``` Now we show details like the following: ``` +----------------------------------------------------------------------- | { | "tasks": [ | { | "state": "runnable", | "command": "SELECT pg_catalog.citus_move_shard_placement(1 | "message": "ERROR: Moving shards to a node that shouldn't | "retried": 2, | "task_id": 3 | } | ], | "task_state_counts": { | "blocked": 1, | "runnable": 1 | } | } +----------------------------------------------------------------------- ``` --- .../distributed/sql/udfs/citus_job_status/11.2-1.sql | 6 +++--- .../distributed/sql/udfs/citus_job_status/latest.sql | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/backend/distributed/sql/udfs/citus_job_status/11.2-1.sql b/src/backend/distributed/sql/udfs/citus_job_status/11.2-1.sql index 07709a614..93496203a 100644 --- a/src/backend/distributed/sql/udfs/citus_job_status/11.2-1.sql +++ b/src/backend/distributed/sql/udfs/citus_job_status/11.2-1.sql @@ -74,7 +74,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status ( WHERE j.job_id = $1 AND t.status = 'running' ), - errored_task_details AS ( + errored_or_retried_task_details AS ( SELECT jsonb_agg(jsonb_build_object( 'state', t.status, 'retried', 
coalesce(t.retry_count,0), @@ -85,7 +85,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status ( pg_dist_background_task t JOIN pg_dist_background_job j ON t.job_id = j.job_id WHERE j.job_id = $1 AND NOT EXISTS (SELECT 1 FROM rp WHERE rp.sessionid = t.pid) - AND t.status = 'error' + AND (t.status = 'error' OR (t.status = 'runnable' AND t.retry_count > 0)) ) SELECT job_id, @@ -97,7 +97,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status ( jsonb_build_object( 'task_state_counts', (SELECT jsonb_object_agg(status, count) FROM task_state_occurence_counts), 'tasks', (COALESCE((SELECT tasks FROM running_task_details),'[]'::jsonb) || - COALESCE((SELECT tasks FROM errored_task_details),'[]'::jsonb))) AS details + COALESCE((SELECT tasks FROM errored_or_retried_task_details),'[]'::jsonb))) AS details FROM pg_dist_background_job j WHERE j.job_id = $1 $fn$; diff --git a/src/backend/distributed/sql/udfs/citus_job_status/latest.sql b/src/backend/distributed/sql/udfs/citus_job_status/latest.sql index 07709a614..93496203a 100644 --- a/src/backend/distributed/sql/udfs/citus_job_status/latest.sql +++ b/src/backend/distributed/sql/udfs/citus_job_status/latest.sql @@ -74,7 +74,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status ( WHERE j.job_id = $1 AND t.status = 'running' ), - errored_task_details AS ( + errored_or_retried_task_details AS ( SELECT jsonb_agg(jsonb_build_object( 'state', t.status, 'retried', coalesce(t.retry_count,0), @@ -85,7 +85,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status ( pg_dist_background_task t JOIN pg_dist_background_job j ON t.job_id = j.job_id WHERE j.job_id = $1 AND NOT EXISTS (SELECT 1 FROM rp WHERE rp.sessionid = t.pid) - AND t.status = 'error' + AND (t.status = 'error' OR (t.status = 'runnable' AND t.retry_count > 0)) ) SELECT job_id, @@ -97,7 +97,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status ( jsonb_build_object( 'task_state_counts', (SELECT jsonb_object_agg(status, count) FROM 
task_state_occurence_counts), 'tasks', (COALESCE((SELECT tasks FROM running_task_details),'[]'::jsonb) || - COALESCE((SELECT tasks FROM errored_task_details),'[]'::jsonb))) AS details + COALESCE((SELECT tasks FROM errored_or_retried_task_details),'[]'::jsonb))) AS details FROM pg_dist_background_job j WHERE j.job_id = $1 $fn$;