Merge pull request #4222 from citusdata/fix/multiple-maintenanced

pull/4218/head^2
Marco Slot 2020-10-08 16:45:39 +02:00 committed by GitHub
commit fd40605745
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 154 additions and 17 deletions

View File

@ -109,6 +109,9 @@ static HTAB *MaintenanceDaemonDBHash;
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t got_SIGTERM = false;
/* set to true when becoming a maintenance daemon */
static bool IsMaintenanceDaemon = false;
static void MaintenanceDaemonSigTermHandler(SIGNAL_ARGS);
static void MaintenanceDaemonSigHupHandler(SIGNAL_ARGS);
static size_t MaintenanceDaemonShmemSize(void);
@ -165,15 +168,31 @@ InitializeMaintenanceDaemonBackend(void)
return;
}
/* maintenance daemon can ignore itself */
if (dbData->workerPid == MyProcPid)
if (!found)
{
/* ensure the values in MaintenanceDaemonDBData are zero */
memset(((char *) dbData) + sizeof(Oid), 0,
sizeof(MaintenanceDaemonDBData) - sizeof(Oid));
}
if (IsMaintenanceDaemon)
{
/*
* InitializeMaintenanceDaemonBackend is called by the maintenance daemon
* itself. In that case, we clearly don't need to start another maintenance
* daemon.
*/
Assert(found);
Assert(dbData->workerPid == MyProcPid);
LWLockRelease(&MaintenanceDaemonControl->lock);
return;
}
if (!found || !dbData->daemonStarted)
{
Assert(dbData->workerPid == 0);
BackgroundWorker worker;
BackgroundWorkerHandle *handle = NULL;
@ -292,13 +311,33 @@ CitusMaintenanceDaemonMain(Datum main_arg)
proc_exit(0);
}
if (myDbData->workerPid != 0)
{
/*
* Another maintenance daemon is running. This usually happens because
* postgres restarts the daemon after a non-zero exit, and
* InitializeMaintenanceDaemonBackend started one before postgres did.
* In that case, the first one stays and the last one exits.
*/
proc_exit(0);
}
before_shmem_exit(MaintenanceDaemonShmemExit, main_arg);
Assert(myDbData->workerPid == 0);
/* from this point, DROP DATABASE will attempt to kill the worker */
/*
* Signal that I am the maintenance daemon now.
*
* From this point, DROP DATABASE/EXTENSION will send a SIGTERM to me.
*/
myDbData->workerPid = MyProcPid;
/*
* Signal that we are running. This is mainly needed in case of restart after
* an error, otherwise the daemonStarted flag is already true.
*/
myDbData->daemonStarted = true;
/* wire up signals */
pqsignal(SIGTERM, MaintenanceDaemonSigTermHandler);
pqsignal(SIGHUP, MaintenanceDaemonSigHupHandler);
@ -306,6 +345,8 @@ CitusMaintenanceDaemonMain(Datum main_arg)
myDbData->latch = MyLatch;
IsMaintenanceDaemon = true;
LWLockRelease(&MaintenanceDaemonControl->lock);
/*
@ -339,8 +380,6 @@ CitusMaintenanceDaemonMain(Datum main_arg)
CHECK_FOR_INTERRUPTS();
Assert(myDbData->workerPid == MyProcPid);
CitusTableCacheFlushInvalidatedEntries();
/*
@ -567,15 +606,6 @@ CitusMaintenanceDaemonMain(Datum main_arg)
/* check for changed configuration */
if (myDbData->userOid != GetSessionUserId())
{
/*
* Reset myDbData->daemonStarted so InitializeMaintenanceDaemonBackend()
* notices this is a restart.
*/
LWLockAcquire(&MaintenanceDaemonControl->lock, LW_EXCLUSIVE);
myDbData->daemonStarted = false;
myDbData->workerPid = 0;
LWLockRelease(&MaintenanceDaemonControl->lock);
/* return code of 1 requests worker restart */
proc_exit(1);
}
@ -687,8 +717,15 @@ MaintenanceDaemonShmemExit(int code, Datum arg)
MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *)
hash_search(MaintenanceDaemonDBHash, &databaseOid,
HASH_FIND, NULL);
if (myDbData && myDbData->workerPid == MyProcPid)
/* myDbData is NULL after StopMaintenanceDaemon */
if (myDbData != NULL)
{
/*
* Confirm that I am still the registered maintenance daemon before exiting.
*/
Assert(myDbData->workerPid == MyProcPid);
myDbData->daemonStarted = false;
myDbData->workerPid = 0;
}

View File

@ -723,3 +723,69 @@ CONTEXT: PL/pgSQL function inline_code_block line 6 at RAISE
DROP DATABASE another;
\c - - - :worker_1_port
DROP DATABASE another;
\c - - - :master_port
-- only the regression database should have a maintenance daemon
SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
count
---------------------------------------------------------------------
1
(1 row)
-- recreate the extension immediately after the maintenancae daemon errors
SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
pg_cancel_backend
---------------------------------------------------------------------
t
(1 row)
DROP EXTENSION citus;
CREATE EXTENSION citus;
-- wait for maintenance daemon restart
SELECT datname, current_database(),
usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus')
FROM test.maintenance_worker();
datname | current_database | usename | extowner
---------------------------------------------------------------------
regression | regression | postgres | postgres
(1 row)
-- confirm that there is only one maintenance daemon
SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
count
---------------------------------------------------------------------
1
(1 row)
-- kill the maintenance daemon
SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
pg_cancel_backend
---------------------------------------------------------------------
t
(1 row)
-- reconnect
\c - - - :master_port
-- run something that goes through planner hook and therefore kicks of maintenance daemon
SELECT 1;
?column?
---------------------------------------------------------------------
1
(1 row)
-- wait for maintenance daemon restart
SELECT datname, current_database(),
usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus')
FROM test.maintenance_worker();
datname | current_database | usename | extowner
---------------------------------------------------------------------
regression | regression | postgres | postgres
(1 row)
-- confirm that there is only one maintenance daemon
SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
count
---------------------------------------------------------------------
1
(1 row)
DROP TABLE version_mismatch_table;

View File

@ -448,3 +448,37 @@ DROP DATABASE another;
\c - - - :worker_1_port
DROP DATABASE another;
\c - - - :master_port
-- only the regression database should have a maintenance daemon
SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
-- recreate the extension immediately after the maintenancae daemon errors
SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
DROP EXTENSION citus;
CREATE EXTENSION citus;
-- wait for maintenance daemon restart
SELECT datname, current_database(),
usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus')
FROM test.maintenance_worker();
-- confirm that there is only one maintenance daemon
SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
-- kill the maintenance daemon
SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
-- reconnect
\c - - - :master_port
-- run something that goes through planner hook and therefore kicks of maintenance daemon
SELECT 1;
-- wait for maintenance daemon restart
SELECT datname, current_database(),
usename, (SELECT extowner::regrole::text FROM pg_extension WHERE extname = 'citus')
FROM test.maintenance_worker();
-- confirm that there is only one maintenance daemon
SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';
DROP TABLE version_mismatch_table;