diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c
index 3fb201343..29578e8a6 100644
--- a/src/backend/distributed/connection/shared_connection_stats.c
+++ b/src/backend/distributed/connection/shared_connection_stats.c
@@ -171,6 +171,11 @@ StoreAllConnections(Tuplestorestate *tupleStore, TupleDesc tupleDescriptor)
 		memset(isNulls, false, sizeof(isNulls));
 
 		char *databaseName = get_database_name(connectionEntry->key.databaseOid);
+		if (databaseName == NULL)
+		{
+			/* database might have been dropped */
+			continue;
+		}
 
 		values[0] = PointerGetDatum(cstring_to_text(connectionEntry->key.hostname));
 		values[1] = Int32GetDatum(connectionEntry->key.port);
diff --git a/src/test/regress/expected/ensure_no_shared_connection_leak.out b/src/test/regress/expected/ensure_no_shared_connection_leak.out
new file mode 100644
index 000000000..04f0c7c62
--- /dev/null
+++ b/src/test/regress/expected/ensure_no_shared_connection_leak.out
@@ -0,0 +1,160 @@
+-- this test file is intended to be called at the end
+-- of any test schedule, ensuring that there is no
+-- leak or miscalculation of the connection stats
+-- in the shared memory
+CREATE SCHEMA ensure_no_shared_connection_leak;
+SET search_path TO ensure_no_shared_connection_leak;
+-- set the cached connections to zero
+-- and execute a distributed query so that
+-- we end up with zero cached connections afterwards
+ALTER SYSTEM SET citus.max_cached_conns_per_worker TO 0;
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+-- disable deadlock detection and re-trigger 2PC recovery
+-- once more when citus.max_cached_conns_per_worker is zero
+-- so that we can be sure that the connections established for
+-- the maintenance daemon are closed properly.
+-- this is to prevent random failures in the tests (otherwise, we
+-- might see connections established for these operations)
+ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1ms';
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT pg_sleep(0.1);
+ pg_sleep
+---------------------------------------------------------------------
+
+(1 row)
+
+-- now that the last 2PC recovery is done, we're good to disable it
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1h';
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+CREATE TABLE test (a int);
+SELECT create_distributed_table('test', 'a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+SELECT count(*) FROM test;
+ count
+---------------------------------------------------------------------
+ 0
+(1 row)
+
+-- in case of MX, we should prevent deadlock detection and
+-- 2PC recovery from the workers as well
+\c - - - :worker_1_port
+ALTER SYSTEM SET citus.max_cached_conns_per_worker TO 0;
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1ms';
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT pg_sleep(0.1);
+ pg_sleep
+---------------------------------------------------------------------
+
+(1 row)
+
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1h';
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+\c - - - :worker_2_port
+ALTER SYSTEM SET citus.max_cached_conns_per_worker TO 0;
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1ms';
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT pg_sleep(0.1);
+ pg_sleep
+---------------------------------------------------------------------
+
+(1 row)
+
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1h';
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+\c - - - :master_port
+SET search_path TO ensure_no_shared_connection_leak;
+-- ensure that we have at most citus.max_cached_conns_per_worker
+-- connections per node
+SELECT
+    (connection_count_to_node = 0) as no_connection_to_node
+FROM
+    citus_remote_connection_stats()
+WHERE
+    port IN (SELECT node_port FROM master_get_active_worker_nodes()) AND
+    database_name = 'regression'
+ORDER BY 1;
+ no_connection_to_node
+---------------------------------------------------------------------
+ t
+ t
+(2 rows)
+
+-- now, ensure this from the workers' perspective
+-- we should only see the connection/backend that is running the command below
+SELECT
+    result, success
+FROM
+    run_command_on_workers($$select count(*) from pg_stat_activity WHERE backend_type = 'client backend';$$)
+ORDER BY 1, 2;
+ result | success
+---------------------------------------------------------------------
+ 1 | t
+ 1 | t
+(2 rows)
+
+-- in case other tests rely on these settings, reset them
+ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;
+ALTER SYSTEM RESET citus.recover_2pc_interval;
+ALTER SYSTEM RESET citus.max_cached_conns_per_worker;
+SELECT pg_reload_conf();
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+DROP SCHEMA ensure_no_shared_connection_leak CASCADE;
+NOTICE: drop cascades to table test
diff --git a/src/test/regress/failure_schedule b/src/test/regress/failure_schedule
index fb0ecd0bf..7fd540e95 100644
--- a/src/test/regress/failure_schedule
+++ b/src/test/regress/failure_schedule
@@ -37,3 +37,9 @@ test: failure_connection_establishment
 
 # test that no tests leaked intermediate results. This should always be last
 test: ensure_no_intermediate_data_leak
+
+# ---------
+# ensures that we never leak any connection counts
+# in the shared memory
+# ---------
+test: ensure_no_shared_connection_leak
diff --git a/src/test/regress/multi_mx_schedule b/src/test/regress/multi_mx_schedule
index 0e6a38a69..7c1d3d61c 100644
--- a/src/test/regress/multi_mx_schedule
+++ b/src/test/regress/multi_mx_schedule
@@ -50,3 +50,9 @@ test: locally_execute_intermediate_results
 
 # test that no tests leaked intermediate results. This should always be last
 test: ensure_no_intermediate_data_leak
+
+# ---------
+# ensures that we never leak any connection counts
+# in the shared memory
+# ---------
+test: ensure_no_shared_connection_leak
diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule
index 753343e8f..fba9c3986 100644
--- a/src/test/regress/multi_schedule
+++ b/src/test/regress/multi_schedule
@@ -344,3 +344,9 @@ test: multi_deparse_function multi_deparse_procedure
 # test that no tests leaked intermediate results. This should always be last
 # ---------
 test: ensure_no_intermediate_data_leak
+
+# ---------
+# ensures that we never leak any connection counts
+# in the shared memory
+# ---------
+test: ensure_no_shared_connection_leak
diff --git a/src/test/regress/multi_task_tracker_extra_schedule b/src/test/regress/multi_task_tracker_extra_schedule
index 6cefdd098..4a45bcd11 100644
--- a/src/test/regress/multi_task_tracker_extra_schedule
+++ b/src/test/regress/multi_task_tracker_extra_schedule
@@ -114,3 +114,9 @@ test: multi_schema_support
 # test that no tests leaked intermediate results. This should always be last
 # ----------
 test: ensure_no_intermediate_data_leak
+
+# ---------
+# ensures that we never leak any connection counts
+# in the shared memory
+# ---------
+test: ensure_no_shared_connection_leak
diff --git a/src/test/regress/sql/ensure_no_shared_connection_leak.sql b/src/test/regress/sql/ensure_no_shared_connection_leak.sql
new file mode 100644
index 000000000..5d42d8820
--- /dev/null
+++ b/src/test/regress/sql/ensure_no_shared_connection_leak.sql
@@ -0,0 +1,83 @@
+-- this test file is intended to be called at the end
+-- of any test schedule, ensuring that there is no
+-- leak or miscalculation of the connection stats
+-- in the shared memory
+CREATE SCHEMA ensure_no_shared_connection_leak;
+SET search_path TO ensure_no_shared_connection_leak;
+
+-- set the cached connections to zero
+-- and execute a distributed query so that
+-- we end up with zero cached connections afterwards
+ALTER SYSTEM SET citus.max_cached_conns_per_worker TO 0;
+SELECT pg_reload_conf();
+
+-- disable deadlock detection and re-trigger 2PC recovery
+-- once more when citus.max_cached_conns_per_worker is zero
+-- so that we can be sure that the connections established for
+-- the maintenance daemon are closed properly.
+-- this is to prevent random failures in the tests (otherwise, we
+-- might see connections established for these operations)
+ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1ms';
+SELECT pg_reload_conf();
+SELECT pg_sleep(0.1);
+
+-- now that the last 2PC recovery is done, we're good to disable it
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1h';
+SELECT pg_reload_conf();
+
+CREATE TABLE test (a int);
+SELECT create_distributed_table('test', 'a');
+SELECT count(*) FROM test;
+
+-- in case of MX, we should prevent deadlock detection and
+-- 2PC recovery from the workers as well
+\c - - - :worker_1_port
+ALTER SYSTEM SET citus.max_cached_conns_per_worker TO 0;
+SELECT pg_reload_conf();
+ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1ms';
+SELECT pg_reload_conf();
+SELECT pg_sleep(0.1);
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1h';
+SELECT pg_reload_conf();
+\c - - - :worker_2_port
+ALTER SYSTEM SET citus.max_cached_conns_per_worker TO 0;
+SELECT pg_reload_conf();
+ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1ms';
+SELECT pg_reload_conf();
+SELECT pg_sleep(0.1);
+ALTER SYSTEM SET citus.recover_2pc_interval TO '1h';
+SELECT pg_reload_conf();
+
+\c - - - :master_port
+SET search_path TO ensure_no_shared_connection_leak;
+
+-- ensure that we have at most citus.max_cached_conns_per_worker
+-- connections per node
+SELECT
+    (connection_count_to_node = 0) as no_connection_to_node
+FROM
+    citus_remote_connection_stats()
+WHERE
+    port IN (SELECT node_port FROM master_get_active_worker_nodes()) AND
+    database_name = 'regression'
+ORDER BY 1;
+
+-- now, ensure this from the workers' perspective
+-- we should only see the connection/backend that is running the command below
+SELECT
+    result, success
+FROM
+    run_command_on_workers($$select count(*) from pg_stat_activity WHERE backend_type = 'client backend';$$)
+ORDER BY 1, 2;
+
+
+-- in case other tests rely on these settings, reset them
+ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;
+ALTER SYSTEM RESET citus.recover_2pc_interval;
+ALTER SYSTEM RESET citus.max_cached_conns_per_worker;
+SELECT pg_reload_conf();
+
+DROP SCHEMA ensure_no_shared_connection_leak CASCADE;
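
For reference, below is a minimal, hypothetical sketch (not part of the patch) of how the same leak check could be run by hand on the coordinator. It uses only functions and columns that already appear in the test above (citus_remote_connection_stats(), master_get_active_worker_nodes(), port, database_name, connection_count_to_node); filtering on current_database() instead of the hard-coded 'regression' database name is an assumption for interactive use, not something the patch does.

-- hypothetical interactive check: run on the coordinator after setting
-- citus.max_cached_conns_per_worker to 0 and issuing a distributed query,
-- mirroring the setup in ensure_no_shared_connection_leak.sql
SELECT
    port,
    database_name,
    connection_count_to_node
FROM
    citus_remote_connection_stats()
WHERE
    port IN (SELECT node_port FROM master_get_active_worker_nodes())
    AND database_name = current_database()  -- assumption: current database, not 'regression'
ORDER BY
    port;
-- if the shared-memory counters are not leaking, every row returned here
-- should report connection_count_to_node = 0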