Improve rebalance reporting for retried tasks (#6683)

Previously, when an ongoing rebalance ran into a problem, we did not show
details for background tasks that were stuck in the runnable state. Similar
to how we show details for errored tasks, we now also show details for tasks
that are being retried.

Earlier we showed the following output when a task was stuck:

```
┌────────────────────────────┐
│ {                         ↵│
│     "tasks": [            ↵│
│     ],                    ↵│
│     "task_state_counts": {↵│
│         "done": 13,       ↵│
│         "blocked": 2,     ↵│
│         "runnable": 1     ↵│
│     }                     ↵│
│ }                          │
└────────────────────────────┘
```

Now we show details like the following:

```
+-----------------------------------------------------------------------
| {
|     "tasks": [
|         {
|             "state": "runnable",
|             "command": "SELECT pg_catalog.citus_move_shard_placement(1
|             "message": "ERROR: Moving shards to a node that shouldn't
|             "retried": 2,
|             "task_id": 3
|         }
|     ],
|     "task_state_counts": {
|         "blocked": 1,
|         "runnable": 1
|     }
| }
+-----------------------------------------------------------------------
```
pull/6684/head
Hanefi Onaldi 2023-01-31 15:26:52 +03:00 committed by GitHub
parent 14c31fbb07
commit 47ff03123b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 6 deletions

View File

@ -74,7 +74,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status (
WHERE j.job_id = $1
AND t.status = 'running'
),
errored_task_details AS (
errored_or_retried_task_details AS (
SELECT jsonb_agg(jsonb_build_object(
'state', t.status,
'retried', coalesce(t.retry_count,0),
@ -85,7 +85,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status (
pg_dist_background_task t JOIN pg_dist_background_job j ON t.job_id = j.job_id
WHERE j.job_id = $1
AND NOT EXISTS (SELECT 1 FROM rp WHERE rp.sessionid = t.pid)
AND t.status = 'error'
AND (t.status = 'error' OR (t.status = 'runnable' AND t.retry_count > 0))
)
SELECT
job_id,
@ -97,7 +97,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status (
jsonb_build_object(
'task_state_counts', (SELECT jsonb_object_agg(status, count) FROM task_state_occurence_counts),
'tasks', (COALESCE((SELECT tasks FROM running_task_details),'[]'::jsonb) ||
COALESCE((SELECT tasks FROM errored_task_details),'[]'::jsonb))) AS details
COALESCE((SELECT tasks FROM errored_or_retried_task_details),'[]'::jsonb))) AS details
FROM pg_dist_background_job j
WHERE j.job_id = $1
$fn$;

View File

@ -74,7 +74,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status (
WHERE j.job_id = $1
AND t.status = 'running'
),
errored_task_details AS (
errored_or_retried_task_details AS (
SELECT jsonb_agg(jsonb_build_object(
'state', t.status,
'retried', coalesce(t.retry_count,0),
@ -85,7 +85,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status (
pg_dist_background_task t JOIN pg_dist_background_job j ON t.job_id = j.job_id
WHERE j.job_id = $1
AND NOT EXISTS (SELECT 1 FROM rp WHERE rp.sessionid = t.pid)
AND t.status = 'error'
AND (t.status = 'error' OR (t.status = 'runnable' AND t.retry_count > 0))
)
SELECT
job_id,
@ -97,7 +97,7 @@ CREATE OR REPLACE FUNCTION pg_catalog.citus_job_status (
jsonb_build_object(
'task_state_counts', (SELECT jsonb_object_agg(status, count) FROM task_state_occurence_counts),
'tasks', (COALESCE((SELECT tasks FROM running_task_details),'[]'::jsonb) ||
COALESCE((SELECT tasks FROM errored_task_details),'[]'::jsonb))) AS details
COALESCE((SELECT tasks FROM errored_or_retried_task_details),'[]'::jsonb))) AS details
FROM pg_dist_background_job j
WHERE j.job_id = $1
$fn$;