diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 537e9307f..e1ccc6cbb 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -61,6 +61,8 @@ static void multi_log_hook(ErrorData *edata); static void CreateRequiredDirectories(void); static void RegisterCitusConfigVariables(void); static void WarningForEnableDeadlockPrevention(bool newval, void *extra); +static bool ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra, + GucSource source); static void NormalizeWorkerListPath(void); @@ -394,6 +396,19 @@ RegisterCitusConfigVariables(void) 0, NULL, NULL, NULL); + DefineCustomRealVariable( + "citus.distributed_deadlock_detection_factor", + gettext_noop("Sets the time to wait before checking for distributed " + "deadlocks. Postgres' deadlock_timeout setting is " + "multiplied with the value. If the value is set to" + "1000, distributed deadlock detection is disabled."), + NULL, + &DistributedDeadlockDetectionTimeoutFactor, + 2.0, -1.0, 1000.0, + PGC_SIGHUP, + 0, + ErrorIfNotASuitableDeadlockFactor, NULL, NULL); + DefineCustomBoolVariable( "citus.enable_deadlock_prevention", gettext_noop("Prevents transactions from expanding to multiple nodes"), @@ -775,6 +790,27 @@ WarningForEnableDeadlockPrevention(bool newval, void *extra) } +/* + * We don't want to allow values less than 1.0. However, we define -1 as the value to disable + * distributed deadlock checking. Here we enforce our special constraint. + */ +static bool +ErrorIfNotASuitableDeadlockFactor(double *newval, void **extra, GucSource source) +{ + if (*newval <= 1.0 && *newval != -1.0) + { + ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg( + "citus.distributed_deadlock_detection_factor cannot be less than 1. " + "To disable distributed deadlock detection set the value to -1."))); + + return false; + } + + return true; +} + + /* * NormalizeWorkerListPath converts the path configured via * citus.worker_list_file into an absolute path, falling back to the default diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index da5652157..2bf847455 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -22,6 +22,7 @@ #include "access/xact.h" #include "libpq/pqsignal.h" +#include "distributed/distributed_deadlock_detection.h" #include "distributed/maintenanced.h" #include "distributed/metadata_cache.h" #include "postmaster/bgworker.h" @@ -72,6 +73,8 @@ typedef struct MaintenanceDaemonDBData Latch *latch; /* pointer to the background worker's latch */ } MaintenanceDaemonDBData; +/* config variable for distributed deadlock detection timeout */ +double DistributedDeadlockDetectionTimeoutFactor = 2.0; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; static MaintenanceDaemonControlData *MaintenanceDaemonControl = NULL; @@ -248,7 +251,8 @@ CitusMaintenanceDaemonMain(Datum main_arg) { int rc; int latchFlags = WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH; - int timeout = 10000; /* wake up at least every so often */ + double timeout = 10000.0; /* use this if the deadlock detection is disabled */ + bool foundDeadlock = false; CHECK_FOR_INTERRUPTS(); @@ -258,13 +262,40 @@ CitusMaintenanceDaemonMain(Datum main_arg) * tasks should do their own time math about whether to re-run checks. */ + /* the config value -1 disables the distributed deadlock detection */ + if (DistributedDeadlockDetectionTimeoutFactor != -1.0) + { + StartTransactionCommand(); + foundDeadlock = CheckForDistributedDeadlocks(); + CommitTransactionCommand(); + + /* + * If we find any deadlocks, run the distributed deadlock detection + * more often since it is quite possible that there are other + * deadlocks need to be resolved. + * + * Thus, we use 1/20 of the calculated value. With the default + * values (i.e., deadlock_timeout 1 seconds, + * citus.distributed_deadlock_detection_factor 2), we'd be able to cancel + * ~10 distributed deadlocks per second. + */ + timeout = + DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout; + + if (foundDeadlock) + { + timeout = timeout / 20.0; + } + } + /* - * Wait until timeout, or until somebody wakes us up. + * Wait until timeout, or until somebody wakes us up. Also cast the timeout to + * integer where we've calculated it using double for not losing the precision. */ #if (PG_VERSION_NUM >= 100000) - rc = WaitLatch(MyLatch, latchFlags, timeout, PG_WAIT_EXTENSION); + rc = WaitLatch(MyLatch, latchFlags, (long) timeout, PG_WAIT_EXTENSION); #else - rc = WaitLatch(MyLatch, latchFlags, timeout); + rc = WaitLatch(MyLatch, latchFlags, (long) timeout); #endif /* emergency bailout if postmaster has died */ diff --git a/src/include/distributed/maintenanced.h b/src/include/distributed/maintenanced.h index bb6ebcd6b..f8fa3c6e9 100644 --- a/src/include/distributed/maintenanced.h +++ b/src/include/distributed/maintenanced.h @@ -12,6 +12,9 @@ #ifndef MAINTENANCED_H #define MAINTENANCED_H +/* config variable for */ +extern double DistributedDeadlockDetectionTimeoutFactor; + extern void InitializeMaintenanceDaemon(void); extern void InitializeMaintenanceDaemonBackend(void);