From f6e5006dfda0a201f18ecfe1a9c39f25537d889a Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Wed, 7 Oct 2020 17:47:18 +0200 Subject: [PATCH] Fix a bug that could lead to multiple maintenance daemons --- src/backend/distributed/utils/maintenanced.c | 71 ++++++++++++++----- src/test/regress/expected/multi_extension.out | 7 ++ src/test/regress/sql/multi_extension.sql | 2 + 3 files changed, 63 insertions(+), 17 deletions(-) diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index f3a3edc40..ffd51a3be 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -104,6 +104,9 @@ static HTAB *MaintenanceDaemonDBHash; static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t got_SIGTERM = false; +/* set to true when becoming a maintenance daemon */ +static bool IsMaintenanceDaemon = false; + static void MaintenanceDaemonSigTermHandler(SIGNAL_ARGS); static void MaintenanceDaemonSigHupHandler(SIGNAL_ARGS); static size_t MaintenanceDaemonShmemSize(void); @@ -160,15 +163,31 @@ InitializeMaintenanceDaemonBackend(void) return; } - /* maintenance daemon can ignore itself */ - if (dbData->workerPid == MyProcPid) + if (!found) { + /* ensure the values in MaintenanceDaemonDBData are zero */ + memset(((char *) dbData) + sizeof(Oid), 0, + sizeof(MaintenanceDaemonDBData) - sizeof(Oid)); + } + + if (IsMaintenanceDaemon) + { + /* + * InitializeMaintenanceDaemonBackend is called by the maintenance daemon + * itself. In that case, we clearly don't need to start another maintenance + * daemon. + */ + Assert(found); + Assert(dbData->workerPid == MyProcPid); + LWLockRelease(&MaintenanceDaemonControl->lock); return; } if (!found || !dbData->daemonStarted) { + Assert(dbData->workerPid == 0); + BackgroundWorker worker; BackgroundWorkerHandle *handle = NULL; @@ -287,13 +306,33 @@ CitusMaintenanceDaemonMain(Datum main_arg) proc_exit(0); } + if (myDbData->workerPid != 0) + { + /* + * Another maintenance daemon is running. This usually happens because + * postgres restarts the daemon after an non-zero exit, and + * InitializeMaintenanceDaemonBackend started one before postgres did. + * In that case, the first one stays and the last one exits. + */ + + proc_exit(0); + } + before_shmem_exit(MaintenanceDaemonShmemExit, main_arg); - Assert(myDbData->workerPid == 0); - - /* from this point, DROP DATABASE will attempt to kill the worker */ + /* + * Signal that I am the maintenance daemon now. + * + * From this point, DROP DATABASE/EXTENSION will send a SIGTERM to me. + */ myDbData->workerPid = MyProcPid; + /* + * Signal that we are running. This in mainly needed in case of restart after + * an error, otherwise the daemonStarted flag is already true. + */ + myDbData->daemonStarted = true; + /* wire up signals */ pqsignal(SIGTERM, MaintenanceDaemonSigTermHandler); pqsignal(SIGHUP, MaintenanceDaemonSigHupHandler); @@ -301,6 +340,8 @@ CitusMaintenanceDaemonMain(Datum main_arg) myDbData->latch = MyLatch; + IsMaintenanceDaemon = true; + LWLockRelease(&MaintenanceDaemonControl->lock); /* @@ -334,8 +375,6 @@ CitusMaintenanceDaemonMain(Datum main_arg) CHECK_FOR_INTERRUPTS(); - Assert(myDbData->workerPid == MyProcPid); - CitusTableCacheFlushInvalidatedEntries(); /* @@ -562,15 +601,6 @@ CitusMaintenanceDaemonMain(Datum main_arg) /* check for changed configuration */ if (myDbData->userOid != GetSessionUserId()) { - /* - * Reset myDbData->daemonStarted so InitializeMaintenanceDaemonBackend() - * notices this is a restart. - */ - LWLockAcquire(&MaintenanceDaemonControl->lock, LW_EXCLUSIVE); - myDbData->daemonStarted = false; - myDbData->workerPid = 0; - LWLockRelease(&MaintenanceDaemonControl->lock); - /* return code of 1 requests worker restart */ proc_exit(1); } @@ -682,8 +712,15 @@ MaintenanceDaemonShmemExit(int code, Datum arg) MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) hash_search(MaintenanceDaemonDBHash, &databaseOid, HASH_FIND, NULL); - if (myDbData && myDbData->workerPid == MyProcPid) + + /* myDbData is NULL after StopMaintenanceDaemon */ + if (myDbData != NULL) { + /* + * Confirm that I am still the registered maintenance daemon before exiting. + */ + Assert(myDbData->workerPid == MyProcPid); + myDbData->daemonStarted = false; myDbData->workerPid = 0; } diff --git a/src/test/regress/expected/multi_extension.out b/src/test/regress/expected/multi_extension.out index 9e31dc247..0d503d993 100644 --- a/src/test/regress/expected/multi_extension.out +++ b/src/test/regress/expected/multi_extension.out @@ -718,6 +718,13 @@ SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Ci -- reconnect \c - - - :master_port +-- run something that goes through planner hook and therefore kicks of maintenance daemon +SELECT 1; + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + -- wait for maintenance daemon restart SELECT datname, current_database(), usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus') diff --git a/src/test/regress/sql/multi_extension.sql b/src/test/regress/sql/multi_extension.sql index 7acbb2186..78fbd4e55 100644 --- a/src/test/regress/sql/multi_extension.sql +++ b/src/test/regress/sql/multi_extension.sql @@ -448,6 +448,8 @@ SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Ci -- reconnect \c - - - :master_port +-- run something that goes through planner hook and therefore kicks of maintenance daemon +SELECT 1; -- wait for maintenance daemon restart SELECT datname, current_database(),