From 18219843d0b9bda90b8e3a03fe9fd8e45f6a2148 Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Wed, 7 Oct 2020 17:39:57 +0200 Subject: [PATCH 1/2] Add maintenance daemon error tests --- src/test/regress/expected/multi_extension.out | 59 +++++++++++++++++++ src/test/regress/sql/multi_extension.sql | 32 ++++++++++ 2 files changed, 91 insertions(+) diff --git a/src/test/regress/expected/multi_extension.out b/src/test/regress/expected/multi_extension.out index e41389f3b..b73b635ae 100644 --- a/src/test/regress/expected/multi_extension.out +++ b/src/test/regress/expected/multi_extension.out @@ -723,3 +723,62 @@ CONTEXT: PL/pgSQL function inline_code_block line 6 at RAISE DROP DATABASE another; \c - - - :worker_1_port DROP DATABASE another; +\c - - - :master_port +-- only the regression database should have a maintenance daemon +SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + count +--------------------------------------------------------------------- + 1 +(1 row) + +-- recreate the extension immediately after the maintenancae daemon errors +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + pg_cancel_backend +--------------------------------------------------------------------- + t +(1 row) + +DROP EXTENSION citus; +CREATE EXTENSION citus; +-- wait for maintenance daemon restart +SELECT datname, current_database(), + usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus') +FROM test.maintenance_worker(); + datname | current_database | usename | extowner +--------------------------------------------------------------------- + regression | regression | postgres | postgres +(1 row) + +-- confirm that there is only one maintenance daemon +SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + count +--------------------------------------------------------------------- + 1 +(1 row) + +-- kill the maintenance daemon +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + pg_cancel_backend +--------------------------------------------------------------------- + t +(1 row) + +-- reconnect +\c - - - :master_port +-- wait for maintenance daemon restart +SELECT datname, current_database(), + usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus') +FROM test.maintenance_worker(); + datname | current_database | usename | extowner +--------------------------------------------------------------------- + regression | regression | postgres | postgres +(1 row) + +-- confirm that there is only one maintenance daemon +SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + count +--------------------------------------------------------------------- + 1 +(1 row) + +DROP TABLE version_mismatch_table; diff --git a/src/test/regress/sql/multi_extension.sql b/src/test/regress/sql/multi_extension.sql index da73f0171..748179d22 100644 --- a/src/test/regress/sql/multi_extension.sql +++ b/src/test/regress/sql/multi_extension.sql @@ -448,3 +448,35 @@ DROP DATABASE another; \c - - - :worker_1_port DROP DATABASE another; +\c - - - :master_port +-- only the regression database should have a maintenance daemon +SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + +-- recreate the extension immediately after the maintenancae daemon errors +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; +DROP EXTENSION citus; +CREATE EXTENSION citus; + +-- wait for maintenance daemon restart +SELECT datname, current_database(), + usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus') +FROM test.maintenance_worker(); + +-- confirm that there is only one maintenance daemon +SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + +-- kill the maintenance daemon +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + +-- reconnect +\c - - - :master_port + +-- wait for maintenance daemon restart +SELECT datname, current_database(), + usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus') +FROM test.maintenance_worker(); + +-- confirm that there is only one maintenance daemon +SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + +DROP TABLE version_mismatch_table; From 881e5df780122149d895cd75d2b8033b39ed702a Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Wed, 7 Oct 2020 17:47:18 +0200 Subject: [PATCH 2/2] Fix a bug that could lead to multiple maintenance daemons --- src/backend/distributed/utils/maintenanced.c | 71 ++++++++++++++----- src/test/regress/expected/multi_extension.out | 7 ++ src/test/regress/sql/multi_extension.sql | 2 + 3 files changed, 63 insertions(+), 17 deletions(-) diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 4d6b79d3e..74ac7fbe5 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -109,6 +109,9 @@ static HTAB *MaintenanceDaemonDBHash; static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t got_SIGTERM = false; +/* set to true when becoming a maintenance daemon */ +static bool IsMaintenanceDaemon = false; + static void MaintenanceDaemonSigTermHandler(SIGNAL_ARGS); static void MaintenanceDaemonSigHupHandler(SIGNAL_ARGS); static size_t MaintenanceDaemonShmemSize(void); @@ -165,15 +168,31 @@ InitializeMaintenanceDaemonBackend(void) return; } - /* maintenance daemon can ignore itself */ - if (dbData->workerPid == MyProcPid) + if (!found) { + /* ensure the values in MaintenanceDaemonDBData are zero */ + memset(((char *) dbData) + sizeof(Oid), 0, + sizeof(MaintenanceDaemonDBData) - sizeof(Oid)); + } + + if (IsMaintenanceDaemon) + { + /* + * InitializeMaintenanceDaemonBackend is called by the maintenance daemon + * itself. In that case, we clearly don't need to start another maintenance + * daemon. + */ + Assert(found); + Assert(dbData->workerPid == MyProcPid); + LWLockRelease(&MaintenanceDaemonControl->lock); return; } if (!found || !dbData->daemonStarted) { + Assert(dbData->workerPid == 0); + BackgroundWorker worker; BackgroundWorkerHandle *handle = NULL; @@ -292,13 +311,33 @@ CitusMaintenanceDaemonMain(Datum main_arg) proc_exit(0); } + if (myDbData->workerPid != 0) + { + /* + * Another maintenance daemon is running. This usually happens because + * postgres restarts the daemon after an non-zero exit, and + * InitializeMaintenanceDaemonBackend started one before postgres did. + * In that case, the first one stays and the last one exits. + */ + + proc_exit(0); + } + before_shmem_exit(MaintenanceDaemonShmemExit, main_arg); - Assert(myDbData->workerPid == 0); - - /* from this point, DROP DATABASE will attempt to kill the worker */ + /* + * Signal that I am the maintenance daemon now. + * + * From this point, DROP DATABASE/EXTENSION will send a SIGTERM to me. + */ myDbData->workerPid = MyProcPid; + /* + * Signal that we are running. This in mainly needed in case of restart after + * an error, otherwise the daemonStarted flag is already true. + */ + myDbData->daemonStarted = true; + /* wire up signals */ pqsignal(SIGTERM, MaintenanceDaemonSigTermHandler); pqsignal(SIGHUP, MaintenanceDaemonSigHupHandler); @@ -306,6 +345,8 @@ CitusMaintenanceDaemonMain(Datum main_arg) myDbData->latch = MyLatch; + IsMaintenanceDaemon = true; + LWLockRelease(&MaintenanceDaemonControl->lock); /* @@ -339,8 +380,6 @@ CitusMaintenanceDaemonMain(Datum main_arg) CHECK_FOR_INTERRUPTS(); - Assert(myDbData->workerPid == MyProcPid); - CitusTableCacheFlushInvalidatedEntries(); /* @@ -567,15 +606,6 @@ CitusMaintenanceDaemonMain(Datum main_arg) /* check for changed configuration */ if (myDbData->userOid != GetSessionUserId()) { - /* - * Reset myDbData->daemonStarted so InitializeMaintenanceDaemonBackend() - * notices this is a restart. - */ - LWLockAcquire(&MaintenanceDaemonControl->lock, LW_EXCLUSIVE); - myDbData->daemonStarted = false; - myDbData->workerPid = 0; - LWLockRelease(&MaintenanceDaemonControl->lock); - /* return code of 1 requests worker restart */ proc_exit(1); } @@ -687,8 +717,15 @@ MaintenanceDaemonShmemExit(int code, Datum arg) MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) hash_search(MaintenanceDaemonDBHash, &databaseOid, HASH_FIND, NULL); - if (myDbData && myDbData->workerPid == MyProcPid) + + /* myDbData is NULL after StopMaintenanceDaemon */ + if (myDbData != NULL) { + /* + * Confirm that I am still the registered maintenance daemon before exiting. + */ + Assert(myDbData->workerPid == MyProcPid); + myDbData->daemonStarted = false; myDbData->workerPid = 0; } diff --git a/src/test/regress/expected/multi_extension.out b/src/test/regress/expected/multi_extension.out index b73b635ae..b640a8aa3 100644 --- a/src/test/regress/expected/multi_extension.out +++ b/src/test/regress/expected/multi_extension.out @@ -765,6 +765,13 @@ SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Ci -- reconnect \c - - - :master_port +-- run something that goes through planner hook and therefore kicks of maintenance daemon +SELECT 1; + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + -- wait for maintenance daemon restart SELECT datname, current_database(), usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus') diff --git a/src/test/regress/sql/multi_extension.sql b/src/test/regress/sql/multi_extension.sql index 748179d22..fe4163f15 100644 --- a/src/test/regress/sql/multi_extension.sql +++ b/src/test/regress/sql/multi_extension.sql @@ -470,6 +470,8 @@ SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Ci -- reconnect \c - - - :master_port +-- run something that goes through planner hook and therefore kicks of maintenance daemon +SELECT 1; -- wait for maintenance daemon restart SELECT datname, current_database(),