mirror of https://github.com/citusdata/citus.git
Fix an assertion failure in Citus maintenance daemon that can happen in very slow systems (#8158)
Fixes #5808. DESCRIPTION: Fixes an assertion failure in Citus maintenance daemon that can happen in very slow systems. Try running `make -C src/test/regress/ check-multi-1-vg`: on main, the tests exit with code 2 at least 50% of the time in the very early stages of the test suite by producing a core dump, but that won't be the case on this branch, at least based on my trials :) pull/8176/head
parent
2834fa26c9
commit
0c658b73fc
|
|
@ -1040,13 +1040,31 @@ MaintenanceDaemonShmemExit(int code, Datum arg)
|
|||
if (myDbData != NULL)
|
||||
{
|
||||
/*
|
||||
* Confirm that I am still the registered maintenance daemon before exiting.
|
||||
* Once the maintenance daemon fails (e.g., due to an error in the main loop),
|
||||
* Postgres tries to restart the failed daemon and Citus also attempts to start
|
||||
* a new one. In that case, the one started by Citus ends up here.
|
||||
*
|
||||
* As the maintenance daemon that Citus tried to start, we might see the entry
|
||||
* for the daemon restarted by Postgres if the system was so slow that it
|
||||
* took a long time for us to be re-scheduled to call MaintenanceDaemonShmemExit(),
|
||||
* e.g., under valgrind testing.
|
||||
*
|
||||
* In that case, we should unregister ourselves only if we are still the registered
|
||||
* maintenance daemon.
|
||||
*/
|
||||
Assert(myDbData->workerPid == MyProcPid);
|
||||
|
||||
if (myDbData->workerPid == MyProcPid)
|
||||
{
|
||||
myDbData->daemonStarted = false;
|
||||
myDbData->workerPid = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(LOG, (errmsg(
|
||||
"maintenance daemon for database %u has already been replaced by "
|
||||
"Postgres, skipping to unregister this maintenance daemon",
|
||||
databaseOid)));
|
||||
}
|
||||
}
|
||||
|
||||
LWLockRelease(&MaintenanceDaemonControl->lock);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue