mirror of https://github.com/citusdata/citus.git
Enable distributed deadlock detection on the maintenance daemon
With this commit, the maintenance daemon starts to check for distributed deadlocks. We also introduced a GUC variable (citus.distributed_deadlock_detection_factor) whose value is multiplied with Postgres' deadlock_timeout. Setting it to -1 disables the distributed deadlock detection.
pull/1529/head
parent
66936053a0
commit
e5d5bdff51
|
@ -61,6 +61,8 @@ static void multi_log_hook(ErrorData *edata);
|
||||||
static void CreateRequiredDirectories(void);
|
static void CreateRequiredDirectories(void);
|
||||||
static void RegisterCitusConfigVariables(void);
|
static void RegisterCitusConfigVariables(void);
|
||||||
static void WarningForEnableDeadlockPrevention(bool newval, void *extra);
|
static void WarningForEnableDeadlockPrevention(bool newval, void *extra);
|
||||||
|
static bool ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra,
|
||||||
|
GucSource source);
|
||||||
static void NormalizeWorkerListPath(void);
|
static void NormalizeWorkerListPath(void);
|
||||||
|
|
||||||
|
|
||||||
|
@ -394,6 +396,19 @@ RegisterCitusConfigVariables(void)
|
||||||
0,
|
0,
|
||||||
NULL, NULL, NULL);
|
NULL, NULL, NULL);
|
||||||
|
|
||||||
|
DefineCustomRealVariable(
|
||||||
|
"citus.distributed_deadlock_detection_factor",
|
||||||
|
gettext_noop("Sets the time to wait before checking for distributed "
|
||||||
|
"deadlocks. Postgres' deadlock_timeout setting is "
|
||||||
|
"multiplied with the value. If the value is set to"
|
||||||
|
"1000, distributed deadlock detection is disabled."),
|
||||||
|
NULL,
|
||||||
|
&DistributedDeadlockDetectionTimeoutFactor,
|
||||||
|
2.0, -1.0, 1000.0,
|
||||||
|
PGC_SIGHUP,
|
||||||
|
0,
|
||||||
|
ErrorIfNotASuitableDeadlockFactor, NULL, NULL);
|
||||||
|
|
||||||
DefineCustomBoolVariable(
|
DefineCustomBoolVariable(
|
||||||
"citus.enable_deadlock_prevention",
|
"citus.enable_deadlock_prevention",
|
||||||
gettext_noop("Prevents transactions from expanding to multiple nodes"),
|
gettext_noop("Prevents transactions from expanding to multiple nodes"),
|
||||||
|
@ -775,6 +790,27 @@ WarningForEnableDeadlockPrevention(bool newval, void *extra)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We don't want to allow values less than 1.0. However, we define -1 as the value to disable
|
||||||
|
* distributed deadlock checking. Here we enforce our special constraint.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra, GucSource source)
|
||||||
|
{
|
||||||
|
if (*newval <= 1.0 && *newval != -1.0)
|
||||||
|
{
|
||||||
|
ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||||
|
errmsg(
|
||||||
|
"citus.distributed_deadlock_detection_factor cannot be less than 1. "
|
||||||
|
"To disable distributed deadlock detection set the value to -1.")));
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NormalizeWorkerListPath converts the path configured via
|
* NormalizeWorkerListPath converts the path configured via
|
||||||
* citus.worker_list_file into an absolute path, falling back to the default
|
* citus.worker_list_file into an absolute path, falling back to the default
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
|
|
||||||
#include "access/xact.h"
|
#include "access/xact.h"
|
||||||
#include "libpq/pqsignal.h"
|
#include "libpq/pqsignal.h"
|
||||||
|
#include "distributed/distributed_deadlock_detection.h"
|
||||||
#include "distributed/maintenanced.h"
|
#include "distributed/maintenanced.h"
|
||||||
#include "distributed/metadata_cache.h"
|
#include "distributed/metadata_cache.h"
|
||||||
#include "postmaster/bgworker.h"
|
#include "postmaster/bgworker.h"
|
||||||
|
@ -72,6 +73,8 @@ typedef struct MaintenanceDaemonDBData
|
||||||
Latch *latch; /* pointer to the background worker's latch */
|
Latch *latch; /* pointer to the background worker's latch */
|
||||||
} MaintenanceDaemonDBData;
|
} MaintenanceDaemonDBData;
|
||||||
|
|
||||||
|
/* config variable for distributed deadlock detection timeout */
|
||||||
|
double DistributedDeadlockDetectionTimeoutFactor = 2.0;
|
||||||
|
|
||||||
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
|
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
|
||||||
static MaintenanceDaemonControlData *MaintenanceDaemonControl = NULL;
|
static MaintenanceDaemonControlData *MaintenanceDaemonControl = NULL;
|
||||||
|
@ -248,7 +251,8 @@ CitusMaintenanceDaemonMain(Datum main_arg)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
int latchFlags = WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH;
|
int latchFlags = WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH;
|
||||||
int timeout = 10000; /* wake up at least every so often */
|
double timeout = 10000.0; /* use this if the deadlock detection is disabled */
|
||||||
|
bool foundDeadlock = false;
|
||||||
|
|
||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
|
@ -258,13 +262,40 @@ CitusMaintenanceDaemonMain(Datum main_arg)
|
||||||
* tasks should do their own time math about whether to re-run checks.
|
* tasks should do their own time math about whether to re-run checks.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* the config value -1 disables the distributed deadlock detection */
|
||||||
|
if (DistributedDeadlockDetectionTimeoutFactor != -1.0)
|
||||||
|
{
|
||||||
|
StartTransactionCommand();
|
||||||
|
foundDeadlock = CheckForDistributedDeadlocks();
|
||||||
|
CommitTransactionCommand();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Wait until timeout, or until somebody wakes us up.
|
* If we find any deadlocks, run the distributed deadlock detection
|
||||||
|
* more often since it is quite possible that there are other
|
||||||
|
* deadlocks that need to be resolved.
|
||||||
|
*
|
||||||
|
* Thus, we use 1/20 of the calculated value. With the default
|
||||||
|
* values (i.e., deadlock_timeout 1 seconds,
|
||||||
|
* citus.distributed_deadlock_detection_factor 2), we'd be able to cancel
|
||||||
|
* ~10 distributed deadlocks per second.
|
||||||
|
*/
|
||||||
|
timeout =
|
||||||
|
DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout;
|
||||||
|
|
||||||
|
if (foundDeadlock)
|
||||||
|
{
|
||||||
|
timeout = timeout / 20.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Wait until timeout, or until somebody wakes us up. Also cast the timeout to
|
||||||
|
* an integer; we calculate it as a double so as not to lose precision.
|
||||||
*/
|
*/
|
||||||
#if (PG_VERSION_NUM >= 100000)
|
#if (PG_VERSION_NUM >= 100000)
|
||||||
rc = WaitLatch(MyLatch, latchFlags, timeout, PG_WAIT_EXTENSION);
|
rc = WaitLatch(MyLatch, latchFlags, (long) timeout, PG_WAIT_EXTENSION);
|
||||||
#else
|
#else
|
||||||
rc = WaitLatch(MyLatch, latchFlags, timeout);
|
rc = WaitLatch(MyLatch, latchFlags, (long) timeout);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* emergency bailout if postmaster has died */
|
/* emergency bailout if postmaster has died */
|
||||||
|
|
|
@ -12,6 +12,9 @@
|
||||||
#ifndef MAINTENANCED_H
|
#ifndef MAINTENANCED_H
|
||||||
#define MAINTENANCED_H
|
#define MAINTENANCED_H
|
||||||
|
|
||||||
|
/* config variable for the distributed deadlock detection timeout factor */
|
||||||
|
extern double DistributedDeadlockDetectionTimeoutFactor;
|
||||||
|
|
||||||
extern void InitializeMaintenanceDaemon(void);
|
extern void InitializeMaintenanceDaemon(void);
|
||||||
extern void InitializeMaintenanceDaemonBackend(void);
|
extern void InitializeMaintenanceDaemonBackend(void);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue