mirror of https://github.com/citusdata/citus.git
Enable distributed deadlock detection on the maintenance deamon
With this commit, the maintenance deamon starts to check for distributed deadlocks. We also introduced a GUC variable (distributed_deadlock_detection_factor) whose value is multiplied with Postgres' deadlock_timeout. Setting it to -1 disables the distributed deadlock detection.pull/1529/head
parent
66936053a0
commit
e5d5bdff51
|
@ -61,6 +61,8 @@ static void multi_log_hook(ErrorData *edata);
|
|||
static void CreateRequiredDirectories(void);
|
||||
static void RegisterCitusConfigVariables(void);
|
||||
static void WarningForEnableDeadlockPrevention(bool newval, void *extra);
|
||||
static bool ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra,
|
||||
GucSource source);
|
||||
static void NormalizeWorkerListPath(void);
|
||||
|
||||
|
||||
|
@ -394,6 +396,19 @@ RegisterCitusConfigVariables(void)
|
|||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomRealVariable(
|
||||
"citus.distributed_deadlock_detection_factor",
|
||||
gettext_noop("Sets the time to wait before checking for distributed "
|
||||
"deadlocks. Postgres' deadlock_timeout setting is "
|
||||
"multiplied with the value. If the value is set to"
|
||||
"1000, distributed deadlock detection is disabled."),
|
||||
NULL,
|
||||
&DistributedDeadlockDetectionTimeoutFactor,
|
||||
2.0, -1.0, 1000.0,
|
||||
PGC_SIGHUP,
|
||||
0,
|
||||
ErrorIfNotASuitableDeadlockFactor, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"citus.enable_deadlock_prevention",
|
||||
gettext_noop("Prevents transactions from expanding to multiple nodes"),
|
||||
|
@ -775,6 +790,27 @@ WarningForEnableDeadlockPrevention(bool newval, void *extra)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* We don't want to allow values less than 1.0. However, we define -1 as the value to disable
|
||||
* distributed deadlock checking. Here we enforce our special constraint.
|
||||
*/
|
||||
static bool
|
||||
ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra, GucSource source)
|
||||
{
|
||||
if (*newval <= 1.0 && *newval != -1.0)
|
||||
{
|
||||
ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg(
|
||||
"citus.distributed_deadlock_detection_factor cannot be less than 1. "
|
||||
"To disable distributed deadlock detection set the value to -1.")));
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* NormalizeWorkerListPath converts the path configured via
|
||||
* citus.worker_list_file into an absolute path, falling back to the default
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
#include "access/xact.h"
|
||||
#include "libpq/pqsignal.h"
|
||||
#include "distributed/distributed_deadlock_detection.h"
|
||||
#include "distributed/maintenanced.h"
|
||||
#include "distributed/metadata_cache.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
|
@ -72,6 +73,8 @@ typedef struct MaintenanceDaemonDBData
|
|||
Latch *latch; /* pointer to the background worker's latch */
|
||||
} MaintenanceDaemonDBData;
|
||||
|
||||
/* config variable for distributed deadlock detection timeout */
|
||||
double DistributedDeadlockDetectionTimeoutFactor = 2.0;
|
||||
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
|
||||
static MaintenanceDaemonControlData *MaintenanceDaemonControl = NULL;
|
||||
|
@ -248,7 +251,8 @@ CitusMaintenanceDaemonMain(Datum main_arg)
|
|||
{
|
||||
int rc;
|
||||
int latchFlags = WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH;
|
||||
int timeout = 10000; /* wake up at least every so often */
|
||||
double timeout = 10000.0; /* use this if the deadlock detection is disabled */
|
||||
bool foundDeadlock = false;
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
|
@ -258,13 +262,40 @@ CitusMaintenanceDaemonMain(Datum main_arg)
|
|||
* tasks should do their own time math about whether to re-run checks.
|
||||
*/
|
||||
|
||||
/* the config value -1 disables the distributed deadlock detection */
|
||||
if (DistributedDeadlockDetectionTimeoutFactor != -1.0)
|
||||
{
|
||||
StartTransactionCommand();
|
||||
foundDeadlock = CheckForDistributedDeadlocks();
|
||||
CommitTransactionCommand();
|
||||
|
||||
/*
|
||||
* If we find any deadlocks, run the distributed deadlock detection
|
||||
* more often since it is quite possible that there are other
|
||||
* deadlocks need to be resolved.
|
||||
*
|
||||
* Thus, we use 1/20 of the calculated value. With the default
|
||||
* values (i.e., deadlock_timeout 1 seconds,
|
||||
* citus.distributed_deadlock_detection_factor 2), we'd be able to cancel
|
||||
* ~10 distributed deadlocks per second.
|
||||
*/
|
||||
timeout =
|
||||
DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout;
|
||||
|
||||
if (foundDeadlock)
|
||||
{
|
||||
timeout = timeout / 20.0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait until timeout, or until somebody wakes us up.
|
||||
* Wait until timeout, or until somebody wakes us up. Also cast the timeout to
|
||||
* integer where we've calculated it using double for not losing the precision.
|
||||
*/
|
||||
#if (PG_VERSION_NUM >= 100000)
|
||||
rc = WaitLatch(MyLatch, latchFlags, timeout, PG_WAIT_EXTENSION);
|
||||
rc = WaitLatch(MyLatch, latchFlags, (long) timeout, PG_WAIT_EXTENSION);
|
||||
#else
|
||||
rc = WaitLatch(MyLatch, latchFlags, timeout);
|
||||
rc = WaitLatch(MyLatch, latchFlags, (long) timeout);
|
||||
#endif
|
||||
|
||||
/* emergency bailout if postmaster has died */
|
||||
|
|
|
@ -12,6 +12,9 @@
|
|||
#ifndef MAINTENANCED_H
|
||||
#define MAINTENANCED_H
|
||||
|
||||
/* config variable for */
|
||||
extern double DistributedDeadlockDetectionTimeoutFactor;
|
||||
|
||||
extern void InitializeMaintenanceDaemon(void);
|
||||
extern void InitializeMaintenanceDaemonBackend(void);
|
||||
|
||||
|
|
Loading…
Reference in New Issue