- Fix limits check for local nodes

- WIP test_multiple_databases_distributed_deadlock_detection
pull/7286/head
ivyazmitinov 2024-06-27 17:50:04 +02:00
parent bdc7bead09
commit 4312b0656b
3 changed files with 77 additions and 34 deletions

View File

@ -461,8 +461,8 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags,
currentConnectionsCount = workerNodeConnectionEntry->regularConnectionsCount; currentConnectionsCount = workerNodeConnectionEntry->regularConnectionsCount;
} }
bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > bool currentConnectionsLimitExceeded = currentConnectionsCount + 1 >
currentConnectionsLimit; currentConnectionsLimit;
/* /*
* For local nodes, solely relying on citus.max_shared_pool_size or * For local nodes, solely relying on citus.max_shared_pool_size or
@ -476,11 +476,11 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags,
* a reasonable pace. The latter limit typically kicks in when the database * a reasonable pace. The latter limit typically kicks in when the database
* is issued lots of concurrent sessions at the same time, such as benchmarks. * is issued lots of concurrent sessions at the same time, such as benchmarks.
*/ */
bool localNodeLimitExceeded = bool localNodeConnectionsLimitExceeded =
connectionToLocalNode && connectionToLocalNode &&
(GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES || (GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES ||
GetExternalClientBackendCount() + 1 > currentConnectionsLimit); GetExternalClientBackendCount() + 1 > GetLocalSharedPoolSize());
if (remoteNodeLimitExceeded || localNodeLimitExceeded) if (currentConnectionsLimitExceeded || localNodeConnectionsLimitExceeded)
{ {
connectionSlotAvailable = false; connectionSlotAvailable = false;
} }
@ -502,9 +502,10 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags,
if (IsLoggableLevel(DEBUG4)) if (IsLoggableLevel(DEBUG4))
{ {
ereport(DEBUG4, errmsg( ereport(DEBUG4, errmsg(
"Incrementing connection counter. " "Incrementing %s connection counter. "
"Current regular connections: %i, maintenance connections: %i. " "Current regular connections: %i, maintenance connections: %i. "
"Connection slot to %s:%i database %i is %s", "Connection slot to %s:%i database %i is %s",
maintenanceConnection ? "maintenance" : "regular",
workerNodeConnectionEntry->regularConnectionsCount, workerNodeConnectionEntry->regularConnectionsCount,
workerNodeConnectionEntry->maintenanceConnectionsCount, workerNodeConnectionEntry->maintenanceConnectionsCount,
hostname, hostname,
@ -568,7 +569,8 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags,
Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 || Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 ||
workerNodeConnectionEntry->maintenanceConnectionsCount > 0); workerNodeConnectionEntry->maintenanceConnectionsCount > 0);
if (externalFlags & MAINTENANCE_CONNECTION) bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION;
if (maintenanceConnection)
{ {
workerNodeConnectionEntry->maintenanceConnectionsCount -= 1; workerNodeConnectionEntry->maintenanceConnectionsCount -= 1;
} }
@ -580,9 +582,10 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags,
if (IsLoggableLevel(DEBUG4)) if (IsLoggableLevel(DEBUG4))
{ {
ereport(DEBUG4, errmsg( ereport(DEBUG4, errmsg(
"Decrementing connection counter. " "Decrementing %s connection counter. "
"Current regular connections: %i, maintenance connections: %i. " "Current regular connections: %i, maintenance connections: %i. "
"Connection slot to %s:%i database %i is released", "Connection slot to %s:%i database %i is released",
maintenanceConnection ? "maintenance" : "regular",
workerNodeConnectionEntry->regularConnectionsCount, workerNodeConnectionEntry->regularConnectionsCount,
workerNodeConnectionEntry->maintenanceConnectionsCount, workerNodeConnectionEntry->maintenanceConnectionsCount,
hostname, hostname,

View File

@ -977,6 +977,14 @@ class Postgres(QueryRunner):
for config in configs: for config in configs:
self.sql(f"alter system set {config}") self.sql(f"alter system set {config}")
def reset_configuration(self, *configs):
    """Reset specific Postgres settings using ALTER SYSTEM RESET.

    Each positional argument is a setting name (e.g.
    "citus.recover_2pc_interval"), reset one at a time via self.sql().

    NOTE: after configuring, a call to reload or restart is needed for the
    settings to become effective.
    """
    for config in configs:
        # Setting names come from test code, not external input, so direct
        # f-string interpolation into the SQL text is acceptable here.
        self.sql(f"alter system reset {config}")
def log_handle(self): def log_handle(self):
"""Returns the opened logfile at the current end of the log """Returns the opened logfile at the current end of the log

View File

@ -10,29 +10,38 @@ DATABASES_NUMBER = 40
async def test_multiple_databases_distributed_deadlock_detection(cluster): async def test_multiple_databases_distributed_deadlock_detection(cluster):
# Disable maintenance on all nodes # Disable maintenance on all nodes
for node in cluster.nodes: for node in cluster.nodes:
node.sql("ALTER SYSTEM SET citus.recover_2pc_interval TO '-1';") node.configure(
node.sql("ALTER SYSTEM SET citus.distributed_deadlock_detection_factor = '-1';") "citus.recover_2pc_interval = '-1'",
node.sql("ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10;") "citus.distributed_deadlock_detection_factor = '-1'",
node.sql("SELECT pg_reload_conf();") "citus.max_maintenance_shared_pool_size = 5",
# "log_min_messages = 'debug4'",
# "citus.main_db='postgres'"
)
node.restart()
# Prepare database names for test # Prepare database names for test
db_names = [f'db{db_index}' for db_index in range(1, DATABASES_NUMBER + 1)] db_names = [f"db{db_index}" for db_index in range(1, DATABASES_NUMBER + 1)]
# Create and configure databases # Create and configure databases
for db_name in db_names: for db_name in db_names:
nodes = cluster.workers + [cluster.coordinator] nodes = cluster.workers + [cluster.coordinator]
for node in nodes: for node in nodes:
node.sql(f'CREATE DATABASE {db_name}') node.sql(f"CREATE DATABASE {db_name}")
with node.cur(dbname=db_name) as node_cursor: with node.cur(dbname=db_name) as node_cursor:
node_cursor.execute("CREATE EXTENSION citus;") node_cursor.execute("CREATE EXTENSION citus;")
if node == cluster.coordinator: if node == cluster.coordinator:
for worker in cluster.workers: for worker in cluster.workers:
node_cursor.execute(f"SELECT citus_add_node('localhost', {worker.port});") node_cursor.execute(
node_cursor.execute(""" "SELECT pg_catalog.citus_add_node(%s, %s)",
(worker.host, worker.port),
)
node_cursor.execute(
"""
CREATE TABLE public.deadlock_detection_test (user_id int UNIQUE, some_val int); CREATE TABLE public.deadlock_detection_test (user_id int UNIQUE, some_val int);
SELECT create_distributed_table('public.deadlock_detection_test', 'user_id'); SELECT create_distributed_table('public.deadlock_detection_test', 'user_id');
INSERT INTO public.deadlock_detection_test SELECT i, i FROM generate_series(1,2) i; INSERT INTO public.deadlock_detection_test SELECT i, i FROM generate_series(1,2) i;
""") """
)
print("Setup is done") print("Setup is done")
@ -40,26 +49,43 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster):
"""Function to prepare a deadlock query in a given database""" """Function to prepare a deadlock query in a given database"""
# Init connections and store for later commits # Init connections and store for later commits
if run_on_coordinator: if run_on_coordinator:
first_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False) first_connection = await cluster.coordinator.aconn(
dbname=db_name, autocommit=False
)
first_cursor = first_connection.cursor() first_cursor = first_connection.cursor()
second_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False) second_connection = await cluster.coordinator.aconn(
dbname=db_name, autocommit=False
)
second_cursor = second_connection.cursor() second_cursor = second_connection.cursor()
else: else:
first_connection = await cluster.workers[0].aconn(dbname=db_name, autocommit=False) first_connection = await cluster.workers[0].aconn(
dbname=db_name, autocommit=False
)
first_cursor = first_connection.cursor() first_cursor = first_connection.cursor()
second_connection = await cluster.workers[1].aconn(dbname=db_name, autocommit=False) second_connection = await cluster.workers[1].aconn(
dbname=db_name, autocommit=False
)
second_cursor = second_connection.cursor() second_cursor = second_connection.cursor()
# initiate deadlock # initiate deadlock
await first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 1;") await first_cursor.execute(
await second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 2;") "UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 1;"
)
await second_cursor.execute(
"UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 2;"
)
# Test that deadlock is resolved by a maintenance daemon # Test that deadlock is resolved by a maintenance daemon
with pytest.raises(DeadlockDetected): with pytest.raises(DeadlockDetected):
async def run_deadlocked_queries(): async def run_deadlocked_queries():
await asyncio.gather( await asyncio.gather(
second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 1;"), second_cursor.execute(
first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 2;") "UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 1;"
),
first_cursor.execute(
"UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 2;"
),
) )
await asyncio.wait_for(run_deadlocked_queries(), 300) await asyncio.wait_for(run_deadlocked_queries(), 300)
@ -72,16 +98,18 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster):
# Check that queries are deadlocked # Check that queries are deadlocked
databases_with_deadlock = set() databases_with_deadlock = set()
while len(databases_with_deadlock) < DATABASES_NUMBER: while len(databases_with_deadlock) < DATABASES_NUMBER:
for db_name in (db for db in db_names if for db_name in (db for db in db_names if db not in databases_with_deadlock):
db not in databases_with_deadlock):
for node in cluster.nodes: for node in cluster.nodes:
async with node.acur(dbname=db_name) as cursor: async with node.acur(dbname=db_name) as cursor:
expected_lock_count = 4 if node == cluster.coordinator else 2 expected_lock_count = 4 if node == cluster.coordinator else 2
await cursor.execute(f""" await cursor.execute(
SELECT count(*) = {expected_lock_count} AS deadlock_created """
SELECT count(*) = %s AS deadlock_created
FROM pg_locks FROM pg_locks
INNER JOIN pg_class pc ON relation = oid INNER JOIN pg_class pc ON relation = oid
WHERE relname LIKE 'deadlock_detection_test%'""") WHERE relname LIKE 'deadlock_detection_test%%'""",
(expected_lock_count,),
)
queries_deadlocked = await cursor.fetchone() queries_deadlocked = await cursor.fetchone()
if queries_deadlocked[0]: if queries_deadlocked[0]:
print(f"Queries are deadlocked on {db_name}") print(f"Queries are deadlocked on {db_name}")
@ -91,14 +119,18 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster):
# Enable maintenance back # Enable maintenance back
for node in cluster.nodes: for node in cluster.nodes:
node.sql("ALTER SYSTEM RESET citus.recover_2pc_interval;") node.reset_configuration(
node.sql("ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;") "citus.recover_2pc_interval",
node.sql("SELECT pg_reload_conf();") "citus.distributed_deadlock_detection_factor",
)
node.reload()
tasks = list() tasks = list()
for idx, db_name in enumerate(db_names): for idx, db_name in enumerate(db_names):
run_on_coordinator = True if idx % 3 == 0 else False run_on_coordinator = True if idx % 3 == 0 else False
tasks.append(test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator)) tasks.append(
test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator)
)
tasks.append(enable_maintenance_when_deadlocks_ready()) tasks.append(enable_maintenance_when_deadlocks_ready())