From 454179e0bd722724a6d9a9f39313ef43f269a627 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Wed, 18 Dec 2024 14:54:12 +0300 Subject: [PATCH] Add a test that reproduces the bug --- .../regress/expected/remove_coordinator.out | 18 ++++++---- src/test/regress/sql/remove_coordinator.sql | 35 +++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/test/regress/expected/remove_coordinator.out b/src/test/regress/expected/remove_coordinator.out index 0226a7cd0..907eef8a2 100644 --- a/src/test/regress/expected/remove_coordinator.out +++ b/src/test/regress/expected/remove_coordinator.out @@ -5,10 +5,14 @@ SELECT master_remove_node('localhost', :master_port); (1 row) --- restore coordinator for the rest of the tests -SELECT citus_set_coordinator_host('localhost', :master_port); - citus_set_coordinator_host ---------------------------------------------------------------------- - -(1 row) - +-- to silence -potentially flaky- "could not establish connection after" warnings in below test +SET client_min_messages TO ERROR; +-- to fail fast if the hostname is not resolvable +SET citus.node_connection_timeout to '1s'; +BEGIN; + SET application_name TO 'new_app_name'; + -- that should fail because of bad hostname & port + SELECT citus_add_node('200.200.200.200', 1, 200); +ERROR: connection to the remote node postgres@200.200.200.200:1 failed +SSL SYSCALL error: EOF detected +connection to server was lost diff --git a/src/test/regress/sql/remove_coordinator.sql b/src/test/regress/sql/remove_coordinator.sql index b0df327d1..02df7c28b 100644 --- a/src/test/regress/sql/remove_coordinator.sql +++ b/src/test/regress/sql/remove_coordinator.sql @@ -1,5 +1,40 @@ -- removing coordinator from pg_dist_node should update pg_dist_colocation SELECT master_remove_node('localhost', :master_port); +-- to silence -potentially flaky- "could not establish connection after" warnings in below test +SET client_min_messages TO ERROR; + +-- to fail fast if the hostname is not resolvable +SET citus.node_connection_timeout to '1s'; + +BEGIN; + SET application_name TO 'new_app_name'; + + -- that should fail because of bad hostname & port + SELECT citus_add_node('200.200.200.200', 1, 200); + + -- Since above command failed, now Postgres will need to revert the + -- application_name change made in this transaction and this will + -- happen within abort-transaction callback, so we won't be in a + -- transaction block while Postgres does that. + -- + -- And when the application_name changes, Citus tries to re-assign + -- the global pid but it does so only for Citus internal backends, + -- and doing so for Citus internal backends doesn't require being + -- in a transaction block and is safe. + -- + -- However, for the client external backends (like us here), Citus + -- doesn't re-assign the global pid because it's not safe to do so + -- outside of a transaction block. This is because, it would require + -- performing a catalog access to retrive the local node id when the + -- cached local node is invalidated like what just happened here + -- because of the failed citus_add_node() call made above. + -- + -- So by failing here (rather than crashing), we ensure this behavior. +ROLLBACK; + +RESET client_min_messages; +RESET citus.node_connection_timeout; + -- restore coordinator for the rest of the tests SELECT citus_set_coordinator_host('localhost', :master_port);