From 0c658b73fc3d99e65576c6a306ab7f4d6c42eb95 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Thu, 4 Sep 2025 15:13:57 +0300 Subject: [PATCH] Fix an assertion failure in Citus maintenance daemon that can happen in very slow systems (#8158) Fixes #5808. DESCRIPTION: Fixes an assertion failure in Citus maintenance daemon that can happen in very slow systems. Try running `make -C src/test/regress/ check-multi-1-vg` - while the tests will exit with code 2 at least 50% of the time in the very early stages of the test suite by producing a core-dump on main, it won't be the case on this branch, at least based on my trials :) --- src/backend/distributed/utils/maintenanced.c | 28 ++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 4dcc201d2..5ca7fd6b5 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -1040,12 +1040,30 @@ MaintenanceDaemonShmemExit(int code, Datum arg) if (myDbData != NULL) { /* - * Confirm that I am still the registered maintenance daemon before exiting. + * Once the maintenance daemon fails (e.g., due to an error in the main loop), + * Postgres tries to restart the failed daemon and Citus also attempts to start + * a new one. In that case, the one started by Citus ends up here. + * + * As the maintenance daemon that Citus tried to start, we might see the entry + * for the daemon restarted by Postgres if the system was so slow that it + * took a long time for us to be re-scheduled to call MaintenanceDaemonShmemExit(), + * e.g., under valgrind testing. + * + * In that case, we should unregister ourselves only if we are still the registered + * maintenance daemon. 
*/ - Assert(myDbData->workerPid == MyProcPid); - - myDbData->daemonStarted = false; - myDbData->workerPid = 0; + if (myDbData->workerPid == MyProcPid) + { + myDbData->daemonStarted = false; + myDbData->workerPid = 0; + } + else + { + ereport(LOG, (errmsg( + "maintenance daemon for database %u has already been replaced by " + "Postgres, skipping to unregister this maintenance daemon", + databaseOid))); + } } LWLockRelease(&MaintenanceDaemonControl->lock);