* add SIGTERM handler to gracefully terminate task executors, \ (#6473)

Adds signal handlers for graceful termination, cancellation of
task executors and detecting config updates. Related to PR #6459.

#### How to handle termination signal?
Monitor need to gracefully terminate all running task executors before
terminating. Hence, we have sigterm handler for the monitor.

#### How to handle cancellation signal?
Monitor need to gracefully cancel all running task executors before
terminating. Hence, we have sigint handler for the monitor.

#### How to detect configuration changes?
Monitor has SIGHUP handler to reflect configuration changes while
executing tasks.
pull/6508/head
aykut-bozkurt 2022-12-02 18:15:31 +03:00 committed by GitHub
parent 6781ace3a1
commit 65f256eec4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 620 additions and 49 deletions

View File

@ -72,12 +72,14 @@
#define CITUS_BACKGROUND_TASK_KEY_COMMAND 2
#define CITUS_BACKGROUND_TASK_KEY_QUEUE 3
#define CITUS_BACKGROUND_TASK_KEY_TASK_ID 4
#define CITUS_BACKGROUND_TASK_NKEYS 5
#define CITUS_BACKGROUND_TASK_KEY_JOB_ID 5
#define CITUS_BACKGROUND_TASK_NKEYS 6
static BackgroundWorkerHandle * StartCitusBackgroundTaskExecutor(char *database,
char *user,
char *command,
int64 taskId,
int64 jobId,
dsm_segment **pSegment);
static void ExecuteSqlString(const char *sql);
static shm_mq_result ConsumeTaskWorkerOutput(shm_mq_handle *responseq, StringInfo message,
@ -106,6 +108,18 @@ static TaskExecutionStatus ConsumeExecutorQueue(
TaskExecutionContext *taskExecutionContext);
static void TaskHadError(TaskExecutionContext *taskExecutionContext);
static void TaskEnded(TaskExecutionContext *taskExecutionContext);
static void TerminateAllTaskExecutors(HTAB *currentExecutors);
static HTAB * GetRunningUniqueJobIds(HTAB *currentExecutors);
static void CancelAllTaskExecutors(HTAB *currentExecutors);
static bool MonitorGotTerminationOrCancellationRequest();
static void QueueMonitorSigTermHandler(SIGNAL_ARGS);
static void QueueMonitorSigIntHandler(SIGNAL_ARGS);
static void QueueMonitorSigHupHandler(SIGNAL_ARGS);
/* flags set by signal handlers */
static volatile sig_atomic_t GotSigterm = false;
static volatile sig_atomic_t GotSigint = false;
static volatile sig_atomic_t GotSighup = false;
PG_FUNCTION_INFO_V1(citus_job_cancel);
PG_FUNCTION_INFO_V1(citus_job_wait);
@ -337,8 +351,6 @@ CitusBackgroundTaskQueueMonitorErrorCallback(void *arg)
/*
* NewExecutorExceedsCitusLimit returns true if currently we reached Citus' max worker count.
* It also sleeps 1 sec to let running tasks progress so that we have better chance to not hit
* that limit again.
*/
static bool
NewExecutorExceedsCitusLimit(QueueMonitorExecutionContext *queueMonitorExecutionContext)
@ -372,8 +384,6 @@ NewExecutorExceedsCitusLimit(QueueMonitorExecutionContext *queueMonitorExecution
/*
* NewExecutorExceedsPgMaxWorkers returns true if currently we reached Postgres' max worker count.
* It also sleeps 1 sec to let running tasks progress so that we have better chance to not hit
* that limit again.
*/
static bool
NewExecutorExceedsPgMaxWorkers(BackgroundWorkerHandle *handle,
@ -389,9 +399,8 @@ NewExecutorExceedsPgMaxWorkers(BackgroundWorkerHandle *handle,
*/
if (queueMonitorExecutionContext->backgroundWorkerFailedStartTime == 0)
{
ereport(WARNING, (errmsg(
"unable to start background worker for "
"background task execution"),
ereport(WARNING, (errmsg("unable to start background worker for "
"background task execution"),
errdetail(
"Current number of task "
"executors: %ld/%d",
@ -432,7 +441,7 @@ AssignRunnableTaskToNewExecutor(BackgroundTask *runnableTask,
dsm_segment *seg = NULL;
BackgroundWorkerHandle *handle =
StartCitusBackgroundTaskExecutor(databaseName, userName, runnableTask->command,
runnableTask->taskid, &seg);
runnableTask->taskid, runnableTask->jobid, &seg);
MemoryContextSwitchTo(oldContext);
if (NewExecutorExceedsPgMaxWorkers(handle, queueMonitorExecutionContext))
@ -450,6 +459,7 @@ AssignRunnableTaskToNewExecutor(BackgroundTask *runnableTask,
Assert(!handleEntryFound);
handleEntry->handle = handle;
handleEntry->seg = seg;
handleEntry->jobid = runnableTask->jobid;
/* reset worker allocation timestamp and log time elapsed since the last failure */
CheckAndResetLastWorkerAllocationFailure(queueMonitorExecutionContext);
@ -741,6 +751,138 @@ TaskEnded(TaskExecutionContext *taskExecutionContext)
}
/*
* QueueMonitorSigHupHandler handles SIGHUP to update monitor related config params.
*/
static void
QueueMonitorSigHupHandler(SIGNAL_ARGS)
{
int saved_errno = errno;
GotSighup = true;
if (MyProc)
{
SetLatch(&MyProc->procLatch);
}
errno = saved_errno;
}
/*
* MonitorGotTerminationOrCancellationRequest returns true if monitor had SIGTERM or SIGINT signals
*/
static bool
MonitorGotTerminationOrCancellationRequest()
{
return GotSigterm || GotSigint;
}
/*
* QueueMonitorSigTermHandler handles SIGTERM by setting a flag to inform the monitor process
* so that it can terminate active task executors properly. It also sets the latch to awake the
* monitor if it waits on it.
*/
static void
QueueMonitorSigTermHandler(SIGNAL_ARGS)
{
int saved_errno = errno;
GotSigterm = true;
if (MyProc)
{
SetLatch(&MyProc->procLatch);
}
errno = saved_errno;
}
/*
* QueueMonitorSigIntHandler handles SIGINT by setting a flag to inform the monitor process
* so that it can terminate active task executors properly. It also sets the latch to awake the
* monitor if it waits on it.
*/
static void
QueueMonitorSigIntHandler(SIGNAL_ARGS)
{
int saved_errno = errno;
GotSigint = true;
if (MyProc)
{
SetLatch(&MyProc->procLatch);
}
errno = saved_errno;
}
/*
* TerminateAllTaskExecutors terminates task executors given in the hash map.
*/
static void
TerminateAllTaskExecutors(HTAB *currentExecutors)
{
HASH_SEQ_STATUS status;
BackgroundExecutorHashEntry *backgroundExecutorHashEntry;
foreach_htab(backgroundExecutorHashEntry, &status, currentExecutors)
{
TerminateBackgroundWorker(backgroundExecutorHashEntry->handle);
}
}
/*
* GetRunningUniqueJobIds returns unique job ids from currentExecutors
*/
static HTAB *
GetRunningUniqueJobIds(HTAB *currentExecutors)
{
/* create a set to store unique job ids for currently executing tasks */
HTAB *uniqueJobIds = CreateSimpleHashSetWithSize(int64, MAX_BG_TASK_EXECUTORS);
HASH_SEQ_STATUS status;
BackgroundExecutorHashEntry *backgroundExecutorHashEntry;
foreach_htab(backgroundExecutorHashEntry, &status, currentExecutors)
{
hash_search(uniqueJobIds, &backgroundExecutorHashEntry->jobid, HASH_ENTER, NULL);
}
return uniqueJobIds;
}
/*
* CancelAllTaskExecutors cancels task executors given in the hash map.
*/
static void
CancelAllTaskExecutors(HTAB *currentExecutors)
{
StartTransactionCommand();
PushActiveSnapshot(GetTransactionSnapshot());
/* get unique job id set for running tasks in currentExecutors */
HTAB *uniqueJobIds = GetRunningUniqueJobIds(currentExecutors);
HASH_SEQ_STATUS status;
int64 *uniqueJobId;
foreach_htab(uniqueJobId, &status, uniqueJobIds)
{
ereport(DEBUG1, (errmsg("cancelling job: %ld", *uniqueJobId)));
Datum jobidDatum = Int64GetDatum(*uniqueJobId);
DirectFunctionCall1(citus_job_cancel, jobidDatum);
}
PopActiveSnapshot();
CommitTransactionCommand();
}
/*
* CitusBackgroundTaskQueueMonitorMain is the main entry point for the background worker
* running the background tasks queue monitor.
@ -758,6 +900,18 @@ TaskEnded(TaskExecutionContext *taskExecutionContext)
void
CitusBackgroundTaskQueueMonitorMain(Datum arg)
{
/* handle SIGTERM to properly terminate active task executors */
pqsignal(SIGTERM, QueueMonitorSigTermHandler);
/* handle SIGINT to properly cancel active task executors */
pqsignal(SIGINT, QueueMonitorSigIntHandler);
/* handle SIGHUP to update MaxBackgroundTaskExecutors */
pqsignal(SIGHUP, QueueMonitorSigHupHandler);
/* ready to handle signals */
BackgroundWorkerUnblockSignals();
Oid databaseOid = DatumGetObjectId(arg);
/* extension owner is passed via bgw_extra */
@ -765,8 +919,6 @@ CitusBackgroundTaskQueueMonitorMain(Datum arg)
memcpy_s(&extensionOwner, sizeof(extensionOwner),
MyBgworkerEntry->bgw_extra, sizeof(Oid));
BackgroundWorkerUnblockSignals();
/* connect to database, after that we can actually access catalogs */
BackgroundWorkerInitializeConnectionByOid(databaseOid, extensionOwner, 0);
@ -807,10 +959,6 @@ CitusBackgroundTaskQueueMonitorMain(Datum arg)
* cause conflicts on processing the tasks in the catalog table as well as violate
* parallelism guarantees. To make sure there is at most, exactly one backend running
* we take a session lock on the CITUS_BACKGROUND_TASK_MONITOR operation.
*
* TODO now that we have a lock, we should install a term handler to terminate any
* 'child' backend when we are terminated. Otherwise we will still have a situation
* where the actual task could be running multiple times.
*/
LOCKTAG tag = { 0 };
SET_LOCKTAG_CITUS_OPERATION(tag, CITUS_BACKGROUND_TASK_MONITOR);
@ -841,11 +989,13 @@ CitusBackgroundTaskQueueMonitorMain(Datum arg)
PopActiveSnapshot();
CommitTransactionCommand();
/* create a map to store parallel task executors */
/* create a map to store parallel task executors. Persist it in monitor memory context */
oldContext = MemoryContextSwitchTo(backgroundTaskContext);
HTAB *currentExecutors = CreateSimpleHashWithNameAndSize(int64,
BackgroundExecutorHashEntry,
"Background Executor Hash",
MAX_BG_TASK_EXECUTORS);
MemoryContextSwitchTo(oldContext);
/*
* monitor execution context that is useful during the monitor loop.
@ -861,6 +1011,10 @@ CitusBackgroundTaskQueueMonitorMain(Datum arg)
.ctx = backgroundTaskContext
};
/* flag to prevent duplicate termination and cancellation of task executors */
bool terminateExecutorsStarted = false;
bool cancelExecutorsStarted = false;
/* loop exits if there is no running or runnable tasks left */
bool hasAnyTask = true;
while (hasAnyTask)
@ -868,15 +1022,47 @@ CitusBackgroundTaskQueueMonitorMain(Datum arg)
/* handle signals */
CHECK_FOR_INTERRUPTS();
/*
* if the flag is set, we should terminate all task executor workers to prevent duplicate
* runs of the same task on the next start of the monitor, which is dangerous for non-idempotent
* tasks. We do not break the loop here as we want to reflect tasks' messages. Hence, we wait until
* all tasks finish and also do not allow new runnable tasks to start running. After all current tasks
* finish, we can exit the loop safely.
*/
if (GotSigterm && !terminateExecutorsStarted)
{
ereport(LOG, (errmsg("handling termination signal")));
terminateExecutorsStarted = true;
TerminateAllTaskExecutors(queueMonitorExecutionContext.currentExecutors);
}
if (GotSigint && !cancelExecutorsStarted)
{
ereport(LOG, (errmsg("handling cancellation signal")));
cancelExecutorsStarted = true;
CancelAllTaskExecutors(queueMonitorExecutionContext.currentExecutors);
}
if (GotSighup)
{
GotSighup = false;
/* update max_background_task_executors if changed */
ProcessConfigFile(PGC_SIGHUP);
}
/* invalidate cache for new data in catalog */
InvalidateMetadataSystemCache();
/* assign runnable tasks, if any, to new task executors in a transaction */
StartTransactionCommand();
PushActiveSnapshot(GetTransactionSnapshot());
AssignRunnableTasks(&queueMonitorExecutionContext);
PopActiveSnapshot();
CommitTransactionCommand();
/* assign runnable tasks, if any, to new task executors in a transaction if we do not have SIGTERM or SIGINT */
if (!MonitorGotTerminationOrCancellationRequest())
{
StartTransactionCommand();
PushActiveSnapshot(GetTransactionSnapshot());
AssignRunnableTasks(&queueMonitorExecutionContext);
PopActiveSnapshot();
CommitTransactionCommand();
}
/* get running task entries from hash table */
List *runningTaskEntries = GetRunningTaskEntries(
@ -1268,7 +1454,8 @@ ConsumeTaskWorkerOutput(shm_mq_handle *responseq, StringInfo message, bool *hadE
* environment to the executor.
*/
static dsm_segment *
StoreArgumentsInDSM(char *database, char *username, char *command, int64 taskId)
StoreArgumentsInDSM(char *database, char *username, char *command,
int64 taskId, int64 jobId)
{
/*
* Create the shared memory that we will pass to the background
@ -1284,6 +1471,7 @@ StoreArgumentsInDSM(char *database, char *username, char *command, int64 taskId)
#define QUEUE_SIZE ((Size) 65536)
shm_toc_estimate_chunk(&e, QUEUE_SIZE);
shm_toc_estimate_chunk(&e, sizeof(int64));
shm_toc_estimate_chunk(&e, sizeof(int64));
shm_toc_estimate_keys(&e, CITUS_BACKGROUND_TASK_NKEYS);
Size segsize = shm_toc_estimate(&e);
@ -1330,6 +1518,10 @@ StoreArgumentsInDSM(char *database, char *username, char *command, int64 taskId)
*taskIdTarget = taskId;
shm_toc_insert(toc, CITUS_BACKGROUND_TASK_KEY_TASK_ID, taskIdTarget);
int64 *jobIdTarget = shm_toc_allocate(toc, sizeof(int64));
*jobIdTarget = jobId;
shm_toc_insert(toc, CITUS_BACKGROUND_TASK_KEY_JOB_ID, jobIdTarget);
shm_mq_attach(mq, seg, NULL);
return seg;
@ -1343,17 +1535,17 @@ StoreArgumentsInDSM(char *database, char *username, char *command, int64 taskId)
* pointer to the dynamic shared memory.
*/
static BackgroundWorkerHandle *
StartCitusBackgroundTaskExecutor(char *database, char *user, char *command, int64 taskId,
dsm_segment **pSegment)
StartCitusBackgroundTaskExecutor(char *database, char *user, char *command,
int64 taskId, int64 jobId, dsm_segment **pSegment)
{
dsm_segment *seg = StoreArgumentsInDSM(database, user, command, taskId);
dsm_segment *seg = StoreArgumentsInDSM(database, user, command, taskId, jobId);
/* Configure a worker. */
BackgroundWorker worker = { 0 };
memset(&worker, 0, sizeof(worker));
SafeSnprintf(worker.bgw_name, BGW_MAXLEN,
"Citus Background Task Queue Executor: %s/%s",
database, user);
"Citus Background Task Queue Executor: %s/%s for (%ld/%ld)",
database, user, jobId, taskId);
worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
worker.bgw_start_time = BgWorkerStart_ConsistentState;
@ -1391,6 +1583,8 @@ typedef struct CitusBackgroundJobExecutorErrorCallbackContext
{
const char *database;
const char *username;
int64 taskId;
int64 jobId;
} CitusBackgroundJobExecutorErrorCallbackContext;
@ -1403,8 +1597,9 @@ CitusBackgroundJobExecutorErrorCallback(void *arg)
{
CitusBackgroundJobExecutorErrorCallbackContext *context =
(CitusBackgroundJobExecutorErrorCallbackContext *) arg;
errcontext("Citus Background Task Queue Executor: %s/%s", context->database,
context->username);
errcontext("Citus Background Task Queue Executor: %s/%s for (%ld/%ld)",
context->database, context->username,
context->jobId, context->taskId);
}
@ -1418,10 +1613,6 @@ CitusBackgroundJobExecutorErrorCallback(void *arg)
void
CitusBackgroundTaskExecutor(Datum main_arg)
{
/*
* TODO figure out if we need this signal handler that is in pgcron
* pqsignal(SIGTERM, pg_cron_background_worker_sigterm);
*/
BackgroundWorkerUnblockSignals();
/* Set up a dynamic shared memory segment. */
@ -1445,6 +1636,7 @@ CitusBackgroundTaskExecutor(Datum main_arg)
char *username = shm_toc_lookup(toc, CITUS_BACKGROUND_TASK_KEY_USERNAME, false);
char *command = shm_toc_lookup(toc, CITUS_BACKGROUND_TASK_KEY_COMMAND, false);
int64 *taskId = shm_toc_lookup(toc, CITUS_BACKGROUND_TASK_KEY_TASK_ID, false);
int64 *jobId = shm_toc_lookup(toc, CITUS_BACKGROUND_TASK_KEY_JOB_ID, false);
shm_mq *mq = shm_toc_lookup(toc, CITUS_BACKGROUND_TASK_KEY_QUEUE, false);
shm_mq_set_sender(mq, MyProc);
@ -1456,6 +1648,8 @@ CitusBackgroundTaskExecutor(Datum main_arg)
CitusBackgroundJobExecutorErrorCallbackContext context = {
.database = database,
.username = username,
.taskId = *taskId,
.jobId = *jobId,
};
errorCallback.callback = CitusBackgroundJobExecutorErrorCallback;
errorCallback.arg = (void *) &context;
@ -1482,8 +1676,8 @@ CitusBackgroundTaskExecutor(Datum main_arg)
/* Prepare to execute the query. */
SetCurrentStatementStartTimestamp();
debug_query_string = command;
char *appname = psprintf("citus background task queue executor (taskId %ld)",
*taskId);
char *appname = psprintf("citus background task queue executor (%ld/%ld)",
*jobId, *taskId);
pgstat_report_appname(appname);
pgstat_report_activity(STATE_RUNNING, command);
StartTransactionCommand();

View File

@ -28,6 +28,7 @@ typedef struct BackgroundExecutorHashEntry
BackgroundWorkerHandle *handle;
dsm_segment *seg;
int64 jobid;
StringInfo message;
} BackgroundExecutorHashEntry;

View File

@ -125,6 +125,12 @@ SELECT citus_job_cancel(:job_id);
(1 row)
SELECT citus_job_wait(:job_id);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT state, NOT(started_at IS NULL) AS did_start FROM pg_dist_background_job WHERE job_id = :job_id;
state | did_start
---------------------------------------------------------------------
@ -242,6 +248,12 @@ SELECT citus_job_cancel(:job_id2); -- when a job with 1 task is cancelled, the l
(1 row)
SELECT citus_job_wait(:job_id2); -- wait for the job to be cancelled
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id3, desired_status => 'running');
citus_job_wait
---------------------------------------------------------------------
@ -278,12 +290,6 @@ SELECT citus_job_wait(:job_id1);
(1 row)
SELECT citus_job_wait(:job_id2);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id3);
citus_job_wait
---------------------------------------------------------------------
@ -302,11 +308,262 @@ SELECT job_id, task_id, status FROM pg_dist_background_task
9 | 15 | cancelled
(5 rows)
-- verify that task is not starved by currently long running task
-- verify that a task, previously not started due to lack of workers, is executed after we increase max worker count
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify max parallel background execution') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5000); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id3 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify max parallel background execution') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id4 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify max parallel background execution') RETURNING job_id AS job_id3 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id3, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id5 \gset
COMMIT;
SELECT pg_sleep(2); -- we assume this is enough time for all tasks to be in running status except the last one due to parallel worker limit
pg_sleep
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY task_id; -- show that last task is not running but ready to run(runnable)
task_id | status
---------------------------------------------------------------------
16 | running
17 | running
18 | running
19 | running
20 | runnable
(5 rows)
ALTER SYSTEM SET citus.max_background_task_executors TO 5;
SELECT pg_reload_conf(); -- the last runnable task will be running after change
pg_reload_conf
---------------------------------------------------------------------
t
(1 row)
SELECT citus_job_wait(:job_id3, desired_status => 'running');
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY task_id; -- show that last task is running
task_id | status
---------------------------------------------------------------------
16 | running
17 | running
18 | running
19 | running
20 | running
(5 rows)
SELECT citus_job_cancel(:job_id1);
citus_job_cancel
---------------------------------------------------------------------
(1 row)
SELECT citus_job_cancel(:job_id2);
citus_job_cancel
---------------------------------------------------------------------
(1 row)
SELECT citus_job_cancel(:job_id3);
citus_job_cancel
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id1);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id2);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id3);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY task_id; -- show that all tasks are cancelled
task_id | status
---------------------------------------------------------------------
16 | cancelled
17 | cancelled
18 | cancelled
19 | cancelled
20 | cancelled
(5 rows)
-- verify that upon termination signal, all tasks fail and retry policy sets their status back to runnable
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify termination on monitor') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify termination on monitor') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id2 \gset
COMMIT;
SELECT citus_job_wait(:job_id1, desired_status => 'running');
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id2, desired_status => 'running');
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id;
task_id | status
---------------------------------------------------------------------
21 | running
22 | running
(2 rows)
SELECT pid AS monitor_pid FROM pg_stat_activity WHERE application_name ~ 'task queue monitor' \gset
SELECT pg_terminate_backend(:monitor_pid); -- terminate monitor process
pg_terminate_backend
---------------------------------------------------------------------
t
(1 row)
SELECT pg_sleep(2); -- wait enough to show that tasks are terminated
pg_sleep
---------------------------------------------------------------------
(1 row)
SELECT task_id, status, retry_count, message FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id; -- show that all tasks are runnable by retry policy after termination signal
task_id | status | retry_count | message
---------------------------------------------------------------------
21 | runnable | 1 | FATAL: terminating background worker "Citus Background Task Queue Executor: regression/postgres for (13/21)" due to administrator command+
| | | CONTEXT: Citus Background Task Queue Executor: regression/postgres for (13/21) +
| | |
22 | runnable | 1 | FATAL: terminating background worker "Citus Background Task Queue Executor: regression/postgres for (14/22)" due to administrator command+
| | | CONTEXT: Citus Background Task Queue Executor: regression/postgres for (14/22) +
| | |
(2 rows)
SELECT citus_job_cancel(:job_id1);
citus_job_cancel
---------------------------------------------------------------------
(1 row)
SELECT citus_job_cancel(:job_id2);
citus_job_cancel
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id1);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id2);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id; -- show that all tasks are cancelled
task_id | status
---------------------------------------------------------------------
21 | cancelled
22 | cancelled
(2 rows)
-- verify that upon cancellation signal, all tasks are cancelled
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify cancellation on monitor') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify cancellation on monitor') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id2 \gset
COMMIT;
SELECT citus_job_wait(:job_id1, desired_status => 'running');
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id2, desired_status => 'running');
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id;
task_id | status
---------------------------------------------------------------------
23 | running
24 | running
(2 rows)
SELECT pid AS monitor_pid FROM pg_stat_activity WHERE application_name ~ 'task queue monitor' \gset
SELECT pg_cancel_backend(:monitor_pid); -- cancel monitor process
pg_cancel_backend
---------------------------------------------------------------------
t
(1 row)
SELECT pg_sleep(2); -- wait enough to show that tasks are cancelled
pg_sleep
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id1);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT citus_job_wait(:job_id2);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id; -- show that all tasks are cancelled
task_id | status
---------------------------------------------------------------------
23 | cancelled
24 | cancelled
(2 rows)
-- verify that task is not starved by currently long running task
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify task execution starvation') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5000); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify task execution starvation') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT 1; $job$) RETURNING task_id AS task_id2 \gset
COMMIT;
SELECT citus_job_wait(:job_id1, desired_status => 'running');
@ -326,8 +583,8 @@ SELECT job_id, task_id, status FROM pg_dist_background_task
ORDER BY job_id, task_id; -- show that last task is finished without starvation
job_id | task_id | status
---------------------------------------------------------------------
10 | 16 | running
11 | 17 | done
17 | 25 | running
18 | 26 | done
(2 rows)
SELECT citus_job_cancel(:job_id1);
@ -336,6 +593,21 @@ SELECT citus_job_cancel(:job_id1);
(1 row)
SELECT citus_job_wait(:job_id1);
citus_job_wait
---------------------------------------------------------------------
(1 row)
SELECT job_id, task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY job_id, task_id; -- show that task is cancelled
job_id | task_id | status
---------------------------------------------------------------------
17 | 25 | cancelled
18 | 26 | done
(2 rows)
SET client_min_messages TO WARNING;
DROP SCHEMA background_task_queue_monitor CASCADE;
ALTER SYSTEM RESET citus.background_task_queue_interval;

View File

@ -54,6 +54,7 @@ SELECT status, pid, retry_count, NOT(message = '') AS has_message, (not_before >
-- test cancelling a failed/retrying job
SELECT citus_job_cancel(:job_id);
SELECT citus_job_wait(:job_id);
SELECT state, NOT(started_at IS NULL) AS did_start FROM pg_dist_background_job WHERE job_id = :job_id;
SELECT status, NOT(message = '') AS did_start FROM pg_dist_background_task WHERE job_id = :job_id ORDER BY task_id ASC;
@ -110,6 +111,7 @@ SELECT job_id, task_id, status FROM pg_dist_background_task
ORDER BY job_id, task_id; -- show that last task is not running but ready to run(runnable)
SELECT citus_job_cancel(:job_id2); -- when a job with 1 task is cancelled, the last runnable task will be running
SELECT citus_job_wait(:job_id2); -- wait for the job to be cancelled
SELECT citus_job_wait(:job_id3, desired_status => 'running');
SELECT job_id, task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
@ -118,18 +120,115 @@ SELECT job_id, task_id, status FROM pg_dist_background_task
SELECT citus_job_cancel(:job_id1);
SELECT citus_job_cancel(:job_id3);
SELECT citus_job_wait(:job_id1);
SELECT citus_job_wait(:job_id2);
SELECT citus_job_wait(:job_id3);
SELECT job_id, task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY job_id, task_id; -- show that multiple cancels worked
-- verify that task is not starved by currently long running task
-- verify that a task, previously not started due to lack of workers, is executed after we increase max worker count
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify max parallel background execution') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5000); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id3 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify max parallel background execution') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id4 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify max parallel background execution') RETURNING job_id AS job_id3 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id3, $job$ SELECT pg_sleep(5); $job$) RETURNING task_id AS task_id5 \gset
COMMIT;
SELECT pg_sleep(2); -- we assume this is enough time for all tasks to be in running status except the last one due to parallel worker limit
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY task_id; -- show that last task is not running but ready to run(runnable)
ALTER SYSTEM SET citus.max_background_task_executors TO 5;
SELECT pg_reload_conf(); -- the last runnable task will be running after change
SELECT citus_job_wait(:job_id3, desired_status => 'running');
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY task_id; -- show that last task is running
SELECT citus_job_cancel(:job_id1);
SELECT citus_job_cancel(:job_id2);
SELECT citus_job_cancel(:job_id3);
SELECT citus_job_wait(:job_id1);
SELECT citus_job_wait(:job_id2);
SELECT citus_job_wait(:job_id3);
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2, :task_id3, :task_id4, :task_id5)
ORDER BY task_id; -- show that all tasks are cancelled
-- verify that upon termination signal, all tasks fail and retry policy sets their status back to runnable
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify termination on monitor') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify termination on monitor') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id2 \gset
COMMIT;
SELECT citus_job_wait(:job_id1, desired_status => 'running');
SELECT citus_job_wait(:job_id2, desired_status => 'running');
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id;
SELECT pid AS monitor_pid FROM pg_stat_activity WHERE application_name ~ 'task queue monitor' \gset
SELECT pg_terminate_backend(:monitor_pid); -- terminate monitor process
SELECT pg_sleep(2); -- wait enough to show that tasks are terminated
SELECT task_id, status, retry_count, message FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id; -- show that all tasks are runnable by retry policy after termination signal
SELECT citus_job_cancel(:job_id1);
SELECT citus_job_cancel(:job_id2);
SELECT citus_job_wait(:job_id1);
SELECT citus_job_wait(:job_id2);
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id; -- show that all tasks are cancelled
-- verify that upon cancellation signal, all tasks are cancelled
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify cancellation on monitor') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify cancellation on monitor') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT pg_sleep(500); $job$) RETURNING task_id AS task_id2 \gset
COMMIT;
SELECT citus_job_wait(:job_id1, desired_status => 'running');
SELECT citus_job_wait(:job_id2, desired_status => 'running');
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id;
SELECT pid AS monitor_pid FROM pg_stat_activity WHERE application_name ~ 'task queue monitor' \gset
SELECT pg_cancel_backend(:monitor_pid); -- cancel monitor process
SELECT pg_sleep(2); -- wait enough to show that tasks are cancelled
SELECT citus_job_wait(:job_id1);
SELECT citus_job_wait(:job_id2);
SELECT task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY task_id; -- show that all tasks are cancelled
-- verify that task is not starved by currently long running task
BEGIN;
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify task execution starvation') RETURNING job_id AS job_id1 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id1, $job$ SELECT pg_sleep(5000); $job$) RETURNING task_id AS task_id1 \gset
INSERT INTO pg_dist_background_job (job_type, description) VALUES ('test_job', 'simple test to verify task execution starvation') RETURNING job_id AS job_id2 \gset
INSERT INTO pg_dist_background_task (job_id, command) VALUES (:job_id2, $job$ SELECT 1; $job$) RETURNING task_id AS task_id2 \gset
COMMIT;
@ -139,6 +238,11 @@ SELECT job_id, task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY job_id, task_id; -- show that last task is finished without starvation
SELECT citus_job_cancel(:job_id1);
SELECT citus_job_wait(:job_id1);
SELECT job_id, task_id, status FROM pg_dist_background_task
WHERE task_id IN (:task_id1, :task_id2)
ORDER BY job_id, task_id; -- show that task is cancelled
SET client_min_messages TO WARNING;
DROP SCHEMA background_task_queue_monitor CASCADE;