Enable distributed deadlock detection on the maintenance daemon

With this commit, the maintenance daemon starts to check for
distributed deadlocks.

We also introduced a GUC variable (distributed_deadlock_detection_factor)
whose value is multiplied with Postgres' deadlock_timeout. Setting
it to -1 disables the distributed deadlock detection.
pull/1529/head
Onder Kalaci 2017-08-11 11:54:36 +03:00
parent 66936053a0
commit e5d5bdff51
3 changed files with 74 additions and 4 deletions

View File

@ -61,6 +61,8 @@ static void multi_log_hook(ErrorData *edata);
static void CreateRequiredDirectories(void);
static void RegisterCitusConfigVariables(void);
static void WarningForEnableDeadlockPrevention(bool newval, void *extra);
static bool ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra,
GucSource source);
static void NormalizeWorkerListPath(void);
@ -394,6 +396,19 @@ RegisterCitusConfigVariables(void)
0,
NULL, NULL, NULL);
/*
 * Register citus.distributed_deadlock_detection_factor, backed by
 * DistributedDeadlockDetectionTimeoutFactor. The boot value is 2.0 and the
 * declared range is [-1.0, 1000.0]; ErrorIfNotASuitableDeadlockFactor is the
 * check hook that further restricts which values inside that range are
 * accepted. -1 is the value that disables distributed deadlock detection.
 */
DefineCustomRealVariable(
	"citus.distributed_deadlock_detection_factor",
	gettext_noop("Sets the time to wait before checking for distributed "
				 "deadlocks. Postgres' deadlock_timeout setting is "
				 "multiplied with the value. If the value is set to "
				 "-1, distributed deadlock detection is disabled."),
	NULL,
	&DistributedDeadlockDetectionTimeoutFactor,
	2.0, -1.0, 1000.0,
	PGC_SIGHUP,
	0,
	ErrorIfNotASuitableDeadlockFactor, NULL, NULL);
DefineCustomBoolVariable(
"citus.enable_deadlock_prevention",
gettext_noop("Prevents transactions from expanding to multiple nodes"),
@ -775,6 +790,27 @@ WarningForEnableDeadlockPrevention(bool newval, void *extra)
}
/*
 * ErrorIfNotASuitableDeadlockFactor is the check hook registered for
 * citus.distributed_deadlock_detection_factor.
 *
 * We don't want to allow values less than 1.0. However, we define -1 as the
 * value to disable distributed deadlock checking. Here we enforce that special
 * constraint: a proposed value is accepted when it is -1.0 or when it is at
 * least 1.0, and rejected otherwise.
 *
 * newval points to the proposed setting. extra and source are part of the
 * check hook signature and are not used here.
 *
 * Returns true when the proposed value is acceptable. Otherwise a WARNING is
 * reported and false is returned.
 */
static bool
ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra, GucSource source)
{
	/*
	 * -1.0 is exactly representable as a double, so the equality test against
	 * the "disabled" sentinel is reliable. Note that 1.0 itself is a valid
	 * factor; only values strictly below it are refused.
	 */
	if (*newval < 1.0 && *newval != -1.0)
	{
		ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						  errmsg(
							  "citus.distributed_deadlock_detection_factor cannot be less than 1. "
							  "To disable distributed deadlock detection set the value to -1.")));

		return false;
	}

	return true;
}
/*
* NormalizeWorkerListPath converts the path configured via
* citus.worker_list_file into an absolute path, falling back to the default

View File

@ -22,6 +22,7 @@
#include "access/xact.h"
#include "libpq/pqsignal.h"
#include "distributed/distributed_deadlock_detection.h"
#include "distributed/maintenanced.h"
#include "distributed/metadata_cache.h"
#include "postmaster/bgworker.h"
@ -72,6 +73,8 @@ typedef struct MaintenanceDaemonDBData
Latch *latch; /* pointer to the background worker's latch */
} MaintenanceDaemonDBData;
/* config variable for distributed deadlock detection timeout */
double DistributedDeadlockDetectionTimeoutFactor = 2.0;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static MaintenanceDaemonControlData *MaintenanceDaemonControl = NULL;
@ -248,7 +251,8 @@ CitusMaintenanceDaemonMain(Datum main_arg)
{
int rc;
int latchFlags = WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH;
int timeout = 10000; /* wake up at least every so often */
double timeout = 10000.0; /* use this if the deadlock detection is disabled */
bool foundDeadlock = false;
CHECK_FOR_INTERRUPTS();
@ -258,13 +262,40 @@ CitusMaintenanceDaemonMain(Datum main_arg)
* tasks should do their own time math about whether to re-run checks.
*/
/* the config value -1 disables the distributed deadlock detection */
if (DistributedDeadlockDetectionTimeoutFactor != -1.0)
{
StartTransactionCommand();
foundDeadlock = CheckForDistributedDeadlocks();
CommitTransactionCommand();
/*
* If we find any deadlocks, run the distributed deadlock detection
* more often, since it is quite possible that there are other
* deadlocks that need to be resolved.
*
* Thus, we use 1/20 of the calculated value. With the default
* values (i.e., deadlock_timeout of 1 second and
* citus.distributed_deadlock_detection_factor of 2), we'd be able to
* cancel ~10 distributed deadlocks per second.
*/
timeout =
DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout;
if (foundDeadlock)
{
timeout = timeout / 20.0;
}
}
/*
* Wait until timeout, or until somebody wakes us up.
* Wait until timeout, or until somebody wakes us up. The timeout is calculated
* as a double to avoid losing precision, so cast it to an integer here.
*/
#if (PG_VERSION_NUM >= 100000)
rc = WaitLatch(MyLatch, latchFlags, timeout, PG_WAIT_EXTENSION);
rc = WaitLatch(MyLatch, latchFlags, (long) timeout, PG_WAIT_EXTENSION);
#else
rc = WaitLatch(MyLatch, latchFlags, timeout);
rc = WaitLatch(MyLatch, latchFlags, (long) timeout);
#endif
/* emergency bailout if postmaster has died */

View File

@ -12,6 +12,9 @@
#ifndef MAINTENANCED_H
#define MAINTENANCED_H
/* config variable for distributed deadlock detection timeout */
extern double DistributedDeadlockDetectionTimeoutFactor;
extern void InitializeMaintenanceDaemon(void);
extern void InitializeMaintenanceDaemonBackend(void);