mirror of https://github.com/citusdata/citus.git
Avoid re-assigning the global pid for client backends and bg workers when the application_name changes (#7791)
DESCRIPTION: Fixes a crash that happens because of unsafe catalog access when re-assigning the global pid after application_name changes. When application_name changes, we don't actually need to try re-assigning the global pid for external client backends because application_name doesn't affect the global pid for such backends. Plus, trying to re-assign the global pid for external client backends would unnecessarily cause performing a catalog access when the cached local node id is invalidated. However, accessing to the catalog tables is dangerous in certain situations like when we're not in a transaction block. And for the other types of backends, i.e., the Citus internal backends, we need to re-assign the global pid when the application_name changes because for such backends we simply extract the global pid inherited from the originating backend from the application_name -that's specified by originating backend when openning that connection- and this doesn't require catalog access.7787-segfault-loj
parent
665d72a2f5
commit
73411915a4
|
@ -2890,14 +2890,27 @@ ApplicationNameAssignHook(const char *newval, void *extra)
|
||||||
DetermineCitusBackendType(newval);
|
DetermineCitusBackendType(newval);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* AssignGlobalPID might read from catalog tables to get the the local
|
* We use StartupCitusBackend to initialize the global pid after catalogs
|
||||||
* nodeid. But ApplicationNameAssignHook might be called before catalog
|
* are available. After that happens this hook becomes responsible to update
|
||||||
* access is available to the backend (such as in early stages of
|
* the global pid on later application_name changes. So we set the
|
||||||
* authentication). We use StartupCitusBackend to initialize the global pid
|
* FinishedStartupCitusBackend flag in StartupCitusBackend to indicate when
|
||||||
* after catalogs are available. After that happens this hook becomes
|
* this responsibility handoff has happened.
|
||||||
* responsible to update the global pid on later application_name changes.
|
*
|
||||||
* So we set the FinishedStartupCitusBackend flag in StartupCitusBackend to
|
* Also note that when application_name changes, we don't actually need to
|
||||||
* indicate when this responsibility handoff has happened.
|
* try re-assigning the global pid for external client backends and
|
||||||
|
* background workers because application_name doesn't affect the global
|
||||||
|
* pid for such backends - note that !IsExternalClientBackend() check covers
|
||||||
|
* both types of backends. Plus,
|
||||||
|
* trying to re-assign the global pid for such backends would unnecessarily
|
||||||
|
* cause performing a catalog access when the cached local node id is
|
||||||
|
* invalidated. However, accessing to the catalog tables is dangerous in
|
||||||
|
* certain situations like when we're not in a transaction block. And for
|
||||||
|
* the other types of backends, i.e., the Citus internal backends, we need
|
||||||
|
* to re-assign the global pid when the application_name changes because for
|
||||||
|
* such backends we simply extract the global pid inherited from the
|
||||||
|
* originating backend from the application_name -that's specified by
|
||||||
|
* originating backend when openning that connection- and this doesn't require
|
||||||
|
* catalog access.
|
||||||
*
|
*
|
||||||
* Another solution to the catalog table acccess problem would be to update
|
* Another solution to the catalog table acccess problem would be to update
|
||||||
* global pid lazily, like we do for HideShards. But that's not possible
|
* global pid lazily, like we do for HideShards. But that's not possible
|
||||||
|
@ -2907,7 +2920,7 @@ ApplicationNameAssignHook(const char *newval, void *extra)
|
||||||
* as reasonably possible, which is also why we extract global pids in the
|
* as reasonably possible, which is also why we extract global pids in the
|
||||||
* AuthHook already (extracting doesn't require catalog access).
|
* AuthHook already (extracting doesn't require catalog access).
|
||||||
*/
|
*/
|
||||||
if (FinishedStartupCitusBackend)
|
if (FinishedStartupCitusBackend && !IsExternalClientBackend())
|
||||||
{
|
{
|
||||||
AssignGlobalPID(newval);
|
AssignGlobalPID(newval);
|
||||||
}
|
}
|
||||||
|
|
|
@ -190,6 +190,9 @@ run_commands_on_session_level_connection_to_node(PG_FUNCTION_ARGS)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* override_backend_data_gpid is a wrapper around SetBackendDataGpid().
|
* override_backend_data_gpid is a wrapper around SetBackendDataGpid().
|
||||||
|
* Also sets distributedCommandOriginator to true since the only caller of
|
||||||
|
* this method calls this function actually wants this backend to
|
||||||
|
* be treated as a distributed command originator with the given global pid.
|
||||||
*/
|
*/
|
||||||
Datum
|
Datum
|
||||||
override_backend_data_gpid(PG_FUNCTION_ARGS)
|
override_backend_data_gpid(PG_FUNCTION_ARGS)
|
||||||
|
@ -199,6 +202,7 @@ override_backend_data_gpid(PG_FUNCTION_ARGS)
|
||||||
uint64 gpid = PG_GETARG_INT64(0);
|
uint64 gpid = PG_GETARG_INT64(0);
|
||||||
|
|
||||||
SetBackendDataGlobalPID(gpid);
|
SetBackendDataGlobalPID(gpid);
|
||||||
|
SetBackendDataDistributedCommandOriginator(true);
|
||||||
|
|
||||||
PG_RETURN_VOID();
|
PG_RETURN_VOID();
|
||||||
}
|
}
|
||||||
|
|
|
@ -964,6 +964,23 @@ SetBackendDataGlobalPID(uint64 gpid)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SetBackendDataDistributedCommandOriginator sets the distributedCommandOriginator
|
||||||
|
* field on MyBackendData.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator)
|
||||||
|
{
|
||||||
|
if (!MyBackendData)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
SpinLockAcquire(&MyBackendData->mutex);
|
||||||
|
MyBackendData->distributedCommandOriginator = distributedCommandOriginator;
|
||||||
|
SpinLockRelease(&MyBackendData->mutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* GetGlobalPID returns the global process id of the current backend.
|
* GetGlobalPID returns the global process id of the current backend.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -61,6 +61,7 @@ extern void AssignGlobalPID(const char *applicationName);
|
||||||
extern uint64 GetGlobalPID(void);
|
extern uint64 GetGlobalPID(void);
|
||||||
extern void SetBackendDataDatabaseId(void);
|
extern void SetBackendDataDatabaseId(void);
|
||||||
extern void SetBackendDataGlobalPID(uint64 gpid);
|
extern void SetBackendDataGlobalPID(uint64 gpid);
|
||||||
|
extern void SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator);
|
||||||
extern uint64 ExtractGlobalPID(const char *applicationName);
|
extern uint64 ExtractGlobalPID(const char *applicationName);
|
||||||
extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk);
|
extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk);
|
||||||
extern int ExtractProcessIdFromGlobalPID(uint64 globalPID);
|
extern int ExtractProcessIdFromGlobalPID(uint64 globalPID);
|
||||||
|
|
|
@ -5,6 +5,37 @@ SELECT master_remove_node('localhost', :master_port);
|
||||||
|
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
-- to silence -potentially flaky- "could not establish connection after" warnings in below test
|
||||||
|
SET client_min_messages TO ERROR;
|
||||||
|
-- to fail fast when the hostname is not resolvable, as it will be the case below
|
||||||
|
SET citus.node_connection_timeout to '1s';
|
||||||
|
BEGIN;
|
||||||
|
SET application_name TO 'new_app_name';
|
||||||
|
-- that should fail because of bad hostname & port
|
||||||
|
SELECT citus_add_node('200.200.200.200', 1, 200);
|
||||||
|
ERROR: connection to the remote node postgres@200.200.200.200:1 failed
|
||||||
|
-- Since above command failed, now Postgres will need to revert the
|
||||||
|
-- application_name change made in this transaction and this will
|
||||||
|
-- happen within abort-transaction callback, so we won't be in a
|
||||||
|
-- transaction block while Postgres does that.
|
||||||
|
--
|
||||||
|
-- And when the application_name changes, Citus tries to re-assign
|
||||||
|
-- the global pid but it does so only for Citus internal backends,
|
||||||
|
-- and doing so for Citus internal backends doesn't require being
|
||||||
|
-- in a transaction block and is safe.
|
||||||
|
--
|
||||||
|
-- However, for the client external backends (like us here), Citus
|
||||||
|
-- doesn't re-assign the global pid because it's not needed and it's
|
||||||
|
-- not safe to do so outside of a transaction block. This is because,
|
||||||
|
-- it would require performing a catalog access to retrive the local
|
||||||
|
-- node id when the cached local node is invalidated like what just
|
||||||
|
-- happened here because of the failed citus_add_node() call made
|
||||||
|
-- above.
|
||||||
|
--
|
||||||
|
-- So by failing here (rather than crashing), we ensure this behavior.
|
||||||
|
ROLLBACK;
|
||||||
|
RESET client_min_messages;
|
||||||
|
RESET citus.node_connection_timeout;
|
||||||
-- restore coordinator for the rest of the tests
|
-- restore coordinator for the rest of the tests
|
||||||
SELECT citus_set_coordinator_host('localhost', :master_port);
|
SELECT citus_set_coordinator_host('localhost', :master_port);
|
||||||
citus_set_coordinator_host
|
citus_set_coordinator_host
|
||||||
|
|
|
@ -1,5 +1,41 @@
|
||||||
-- removing coordinator from pg_dist_node should update pg_dist_colocation
|
-- removing coordinator from pg_dist_node should update pg_dist_colocation
|
||||||
SELECT master_remove_node('localhost', :master_port);
|
SELECT master_remove_node('localhost', :master_port);
|
||||||
|
|
||||||
|
-- to silence -potentially flaky- "could not establish connection after" warnings in below test
|
||||||
|
SET client_min_messages TO ERROR;
|
||||||
|
|
||||||
|
-- to fail fast when the hostname is not resolvable, as it will be the case below
|
||||||
|
SET citus.node_connection_timeout to '1s';
|
||||||
|
|
||||||
|
BEGIN;
|
||||||
|
SET application_name TO 'new_app_name';
|
||||||
|
|
||||||
|
-- that should fail because of bad hostname & port
|
||||||
|
SELECT citus_add_node('200.200.200.200', 1, 200);
|
||||||
|
|
||||||
|
-- Since above command failed, now Postgres will need to revert the
|
||||||
|
-- application_name change made in this transaction and this will
|
||||||
|
-- happen within abort-transaction callback, so we won't be in a
|
||||||
|
-- transaction block while Postgres does that.
|
||||||
|
--
|
||||||
|
-- And when the application_name changes, Citus tries to re-assign
|
||||||
|
-- the global pid but it does so only for Citus internal backends,
|
||||||
|
-- and doing so for Citus internal backends doesn't require being
|
||||||
|
-- in a transaction block and is safe.
|
||||||
|
--
|
||||||
|
-- However, for the client external backends (like us here), Citus
|
||||||
|
-- doesn't re-assign the global pid because it's not needed and it's
|
||||||
|
-- not safe to do so outside of a transaction block. This is because,
|
||||||
|
-- it would require performing a catalog access to retrive the local
|
||||||
|
-- node id when the cached local node is invalidated like what just
|
||||||
|
-- happened here because of the failed citus_add_node() call made
|
||||||
|
-- above.
|
||||||
|
--
|
||||||
|
-- So by failing here (rather than crashing), we ensure this behavior.
|
||||||
|
ROLLBACK;
|
||||||
|
|
||||||
|
RESET client_min_messages;
|
||||||
|
RESET citus.node_connection_timeout;
|
||||||
|
|
||||||
-- restore coordinator for the rest of the tests
|
-- restore coordinator for the rest of the tests
|
||||||
SELECT citus_set_coordinator_host('localhost', :master_port);
|
SELECT citus_set_coordinator_host('localhost', :master_port);
|
||||||
|
|
Loading…
Reference in New Issue