Avoid re-assigning the global pid for client backends and bg workers when the application_name changes (#7791)

DESCRIPTION: Fixes a crash that happens because of unsafe catalog access when re-assigning the global pid after application_name changes. When application_name changes, we don't actually need to try re-assigning the global pid for external client backends because application_name doesn't affect the global pid for such backends. Plus, trying to re-assign the global pid for external client backends would unnecessarily cause performing a catalog access when the cached local node id is invalidated. However, accessing to the catalog tables is dangerous in certain situations like when we're not in a transaction block. And for the other types of backends, i.e., the Citus internal backends, we need to re-assign the global pid when the application_name changes because for such backends we simply extract the global pid inherited from the originating backend from the application_name -that's specified by originating backend when openning that connection- and this doesn't require catalog access. (cherry picked from commit 73411915a4)
2024-12-23 17:01:53 +03:00 · 2024-12-23 17:01:53 +03:00 · a19e180212
parent cdded256ef
commit a19e180212
6 changed files with 111 additions and 9 deletions
--- a/src/backend/distributed/shared_library_init.c
+++ b/src/backend/distributed/shared_library_init.c
@ -2853,14 +2853,27 @@ ApplicationNameAssignHook(const char *newval, void *extra)
 	DetermineCitusBackendType(newval);

 	/*
-	 * AssignGlobalPID might read from catalog tables to get the the local
-	 * nodeid. But ApplicationNameAssignHook might be called before catalog
-	 * access is available to the backend (such as in early stages of
-	 * authentication). We use StartupCitusBackend to initialize the global pid
-	 * after catalogs are available. After that happens this hook becomes
-	 * responsible to update the global pid on later application_name changes.
-	 * So we set the FinishedStartupCitusBackend flag in StartupCitusBackend to
-	 * indicate when this responsibility handoff has happened.
+	 * We use StartupCitusBackend to initialize the global pid after catalogs
+	 * are available. After that happens this hook becomes responsible to update
+	 * the global pid on later application_name changes. So we set the
+	 * FinishedStartupCitusBackend flag in StartupCitusBackend to indicate when
+	 * this responsibility handoff has happened.
+	 *
+	 * Also note that when application_name changes, we don't actually need to
+	 * try re-assigning the global pid for external client backends and
+	 * background workers because application_name doesn't affect the global
+	 * pid for such backends - note that !IsExternalClientBackend() check covers
+	 * both types of backends. Plus,
+	 * trying to re-assign the global pid for such backends would unnecessarily
+	 * cause performing a catalog access when the cached local node id is
+	 * invalidated. However, accessing to the catalog tables is dangerous in
+	 * certain situations like when we're not in a transaction block. And for
+	 * the other types of backends, i.e., the Citus internal backends, we need
+	 * to re-assign the global pid when the application_name changes because for
+	 * such backends we simply extract the global pid inherited from the
+	 * originating backend from the application_name -that's specified by
+	 * originating backend when openning that connection- and this doesn't require
+	 * catalog access.
 	 *
 	 * Another solution to the catalog table acccess problem would be to update
 	 * global pid lazily, like we do for HideShards. But that's not possible
@ -2870,7 +2883,7 @@ ApplicationNameAssignHook(const char *newval, void *extra)
 	 * as reasonably possible, which is also why we extract global pids in the
 	 * AuthHook already (extracting doesn't require catalog access).
 	 */
-	if (FinishedStartupCitusBackend)
+	if (FinishedStartupCitusBackend && !IsExternalClientBackend())
 	{
 		AssignGlobalPID(newval);
 	}
--- a/src/backend/distributed/test/run_from_same_connection.c
+++ b/src/backend/distributed/test/run_from_same_connection.c
@ -190,6 +190,9 @@ run_commands_on_session_level_connection_to_node(PG_FUNCTION_ARGS)

 /*
 * override_backend_data_gpid is a wrapper around SetBackendDataGpid().
+ * Also sets distributedCommandOriginator to true since the only caller of
+ * this method calls this function actually wants this backend to
+ * be treated as a distributed command originator with the given global pid.
 */
 Datum
 override_backend_data_gpid(PG_FUNCTION_ARGS)
@ -199,6 +202,7 @@ override_backend_data_gpid(PG_FUNCTION_ARGS)
 	uint64 gpid = PG_GETARG_INT64(0);

 	SetBackendDataGlobalPID(gpid);
+	SetBackendDataDistributedCommandOriginator(true);

 	PG_RETURN_VOID();
 }
--- a/src/backend/distributed/transaction/backend_data.c
+++ b/src/backend/distributed/transaction/backend_data.c
@ -964,6 +964,23 @@ SetBackendDataGlobalPID(uint64 gpid)
 }


+/*
+ * SetBackendDataDistributedCommandOriginator sets the distributedCommandOriginator
+ * field on MyBackendData.
+ */
+void
+SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator)
+{
+	if (!MyBackendData)
+	{
+		return;
+	}
+	SpinLockAcquire(&MyBackendData->mutex);
+	MyBackendData->distributedCommandOriginator = distributedCommandOriginator;
+	SpinLockRelease(&MyBackendData->mutex);
+}
+
+
 /*
 * GetGlobalPID returns the global process id of the current backend.
 */
--- a/src/include/distributed/backend_data.h
+++ b/src/include/distributed/backend_data.h
@ -61,6 +61,7 @@ extern void AssignGlobalPID(const char *applicationName);
 extern uint64 GetGlobalPID(void);
 extern void SetBackendDataDatabaseId(void);
 extern void SetBackendDataGlobalPID(uint64 gpid);
+extern void SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator);
 extern uint64 ExtractGlobalPID(const char *applicationName);
 extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk);
 extern int ExtractProcessIdFromGlobalPID(uint64 globalPID);
--- a/src/test/regress/expected/remove_coordinator.out
+++ b/src/test/regress/expected/remove_coordinator.out
@ -5,6 +5,37 @@ SELECT master_remove_node('localhost', :master_port);

 (1 row)

+-- to silence -potentially flaky- "could not establish connection after" warnings in below test
+SET client_min_messages TO ERROR;
+-- to fail fast when the hostname is not resolvable, as it will be the case below
+SET citus.node_connection_timeout to '1s';
+BEGIN;
+  SET application_name TO 'new_app_name';
+  -- that should fail because of bad hostname & port
+  SELECT citus_add_node('200.200.200.200', 1, 200);
+ERROR:  connection to the remote node postgres@200.200.200.200:1 failed
+  -- Since above command failed, now Postgres will need to revert the
+  -- application_name change made in this transaction and this will
+  -- happen within abort-transaction callback, so we won't be in a
+  -- transaction block while Postgres does that.
+  --
+  -- And when the application_name changes, Citus tries to re-assign
+  -- the global pid but it does so only for Citus internal backends,
+  -- and doing so for Citus internal backends doesn't require being
+  -- in a transaction block and is safe.
+  --
+  -- However, for the client external backends (like us here), Citus
+  -- doesn't re-assign the global pid because it's not needed and it's
+  -- not safe to do so outside of a transaction block. This is because,
+  -- it would require performing a catalog access to retrive the local
+  -- node id when the cached local node is invalidated like what just
+  -- happened here because of the failed citus_add_node() call made
+  -- above.
+  --
+  -- So by failing here (rather than crashing), we ensure this behavior.
+ROLLBACK;
+RESET client_min_messages;
+RESET citus.node_connection_timeout;
 -- restore coordinator for the rest of the tests
 SELECT citus_set_coordinator_host('localhost', :master_port);
 citus_set_coordinator_host
--- a/src/test/regress/sql/remove_coordinator.sql
+++ b/src/test/regress/sql/remove_coordinator.sql
@ -1,5 +1,41 @@
 -- removing coordinator from pg_dist_node should update pg_dist_colocation
 SELECT master_remove_node('localhost', :master_port);

+-- to silence -potentially flaky- "could not establish connection after" warnings in below test
+SET client_min_messages TO ERROR;
+
+-- to fail fast when the hostname is not resolvable, as it will be the case below
+SET citus.node_connection_timeout to '1s';
+
+BEGIN;
+  SET application_name TO 'new_app_name';
+
+  -- that should fail because of bad hostname & port
+  SELECT citus_add_node('200.200.200.200', 1, 200);
+
+  -- Since above command failed, now Postgres will need to revert the
+  -- application_name change made in this transaction and this will
+  -- happen within abort-transaction callback, so we won't be in a
+  -- transaction block while Postgres does that.
+  --
+  -- And when the application_name changes, Citus tries to re-assign
+  -- the global pid but it does so only for Citus internal backends,
+  -- and doing so for Citus internal backends doesn't require being
+  -- in a transaction block and is safe.
+  --
+  -- However, for the client external backends (like us here), Citus
+  -- doesn't re-assign the global pid because it's not needed and it's
+  -- not safe to do so outside of a transaction block. This is because,
+  -- it would require performing a catalog access to retrive the local
+  -- node id when the cached local node is invalidated like what just
+  -- happened here because of the failed citus_add_node() call made
+  -- above.
+  --
+  -- So by failing here (rather than crashing), we ensure this behavior.
+ROLLBACK;
+
+RESET client_min_messages;
+RESET citus.node_connection_timeout;
+
 -- restore coordinator for the rest of the tests
 SELECT citus_set_coordinator_host('localhost', :master_port);