diff --git a/src/backend/distributed/metadata/metadata_cache.c b/src/backend/distributed/metadata/metadata_cache.c index cffa666fc..85e827f61 100644 --- a/src/backend/distributed/metadata/metadata_cache.c +++ b/src/backend/distributed/metadata/metadata_cache.c @@ -4622,7 +4622,13 @@ InvalidateDistRelationCacheCallback(Datum argument, Oid relationId) /* * If pg_dist_partition is being invalidated drop all state * This happens pretty rarely, but most importantly happens during - * DROP EXTENSION citus; + * DROP EXTENSION citus; This isn't the only time when this happens + * though, it can happen for multiple other reasons, such as an + * autovacuum running ANALYZE on pg_dist_partition. Such an ANALYZE + * wouldn't really need a full Metadata cache invalidation, but we + * don't know how to differentiate between DROP EXTENSION and ANALYZE. + * So for now we simply drop it in both cases and take the slight + * temporary performance hit. */ if (relationId == MetadataCache.distPartitionRelationId) { diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index f03e7ab1d..a5868dc31 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -649,7 +649,9 @@ multi_log_hook(ErrorData *edata) * * NB: All code here has to be able to cope with this routine being called * multiple times in the same backend. This will e.g. happen when the - * extension is created or upgraded. + * extension is created, upgraded or dropped. Due to the way we detect the + * extension being dropped this can also happen when autovacuum runs ANALYZE on + * pg_dist_partition, see InvalidateDistRelationCacheCallback for details. */ void StartupCitusBackend(void) @@ -657,13 +659,18 @@ StartupCitusBackend(void) InitializeMaintenanceDaemonBackend(); /* - * For queries this will be a no-op. But for background daemons we might - * still need to initialize the backend data. 
For those backaground daemons - * it doesn't really matter that we temporarily assign - * INVALID_CITUS_INTERNAL_BACKEND_GPID, since we override it again two - * lines below. + * For query backends this will be a no-op, because InitializeBackendData + * is already called from the CitusAuthHook. But for background workers we + * still need to initialize the backend data. + */ + InitializeBackendData(application_name); + + /* + * If this is an external connection or a background worker this will + * generate the global PID for this connection. For internal connections + * this is a no-op, since InitializeBackendData will already have extracted + * the gpid from the application_name. + */ - InitializeBackendData(INVALID_CITUS_INTERNAL_BACKEND_GPID); AssignGlobalPID(); RegisterConnectionCleanup(); } diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index e641b7a9b..90b15ad1c 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -871,17 +871,40 @@ AssignDistributedTransactionId(void) * If this is a Citus initiated backend, which means it is distributed part of a distributed * query, then this function assigns the global pid extracted from the application name. * If not, this function assigns a new generated global pid. + * + * If a global PID is already assigned to this backend, then this function is a + * no-op. In most scenarios this would already be the case, because a newly + * assigned global PID would be the same as a previously assigned one. But + * there are two important cases where the newly assigned global PID would be + * different from the previous one: + * 1. The current backend is an internal backend and in the meantime the + * application_name was changed to one without a gpid, e.g. + * citus_rebalancer. 
In this case we don't want to throw away the original + * gpid of the query originator, because that would mess up distributed + * deadlock detection involving this backend. + * 2. The current backend is an external backend and the node id of the current + * node changed. Updating the gpid to match the nodeid might actually seem + * like a desirable property, but that's not the case. Updating the gpid + * with the new nodeid would mess up distributed deadlock and originator + * detection of queries too. Because if this backend already opened + * connections to other nodes, then those backends will still have the old + * gpid. */ void AssignGlobalPID(void) { + if (GetGlobalPID() != INVALID_CITUS_INTERNAL_BACKEND_GPID) + { + return; + } + uint64 globalPID = INVALID_CITUS_INTERNAL_BACKEND_GPID; bool distributedCommandOriginator = false; if (!IsCitusInternalBackend()) { globalPID = GenerateGlobalPID(); - distributedCommandOriginator = true; + distributedCommandOriginator = IsExternalClientBackend(); } else { diff --git a/src/test/regress/expected/global_cancel.out b/src/test/regress/expected/global_cancel.out index 7ca531820..df50dbe3f 100644 --- a/src/test/regress/expected/global_cancel.out +++ b/src/test/regress/expected/global_cancel.out @@ -1,3 +1,19 @@ +-- add coordinator in idempotent way, this is needed so we get a nodeid +SET client_min_messages TO ERROR; +SELECT 1 FROM master_add_node('localhost', :master_port, groupid => 0); + ?column? 
+--------------------------------------------------------------------- + 1 +(1 row) + +RESET client_min_messages; +-- Kill maintenance daemon so it gets restarted and gets a gpid containing our +-- nodeid +SELECT pg_terminate_backend(pid) +FROM pg_stat_activity +WHERE application_name = 'Citus Maintenance Daemon' \gset +-- reconnect to make sure we get a session with the gpid containing our nodeid +\c - - - - CREATE SCHEMA global_cancel; SET search_path TO global_cancel; SET citus.next_shard_id TO 56789000; diff --git a/src/test/regress/sql/global_cancel.sql b/src/test/regress/sql/global_cancel.sql index d89b83175..4a6157489 100644 --- a/src/test/regress/sql/global_cancel.sql +++ b/src/test/regress/sql/global_cancel.sql @@ -1,3 +1,16 @@ +-- add coordinator in idempotent way, this is needed so we get a nodeid +SET client_min_messages TO ERROR; +SELECT 1 FROM master_add_node('localhost', :master_port, groupid => 0); +RESET client_min_messages; + +-- Kill maintenance daemon so it gets restarted and gets a gpid containing our +-- nodeid +SELECT pg_terminate_backend(pid) +FROM pg_stat_activity +WHERE application_name = 'Citus Maintenance Daemon' \gset + +-- reconnect to make sure we get a session with the gpid containing our nodeid +\c - - - - CREATE SCHEMA global_cancel; SET search_path TO global_cancel; SET citus.next_shard_id TO 56789000;