test_multiple_databases_distributed_deadlock_detection WIP

pull/7286/head
ivyazmitinov 2024-06-26 16:27:13 +02:00
parent f584ecc1aa
commit bdc7bead09
3 changed files with 109 additions and 2 deletions

@@ -708,7 +708,9 @@ WaitForSharedConnection(uint32 flags)
 			ereport(ERROR, (errmsg("Failed to acquire maintenance connection for %i ms",
 								   MaintenanceConnectionPoolTimeout),
 							errhint(
-								"Try to increase citus.maintenance_connection_pool_timeout")));
+								"Try increasing citus.max_maintenance_shared_pool_size or "
+								"citus.maintenance_connection_pool_timeout"
+								)));
 		}
 	}
 	else
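
The new hint names the two GUCs that govern the maintenance connection pool. A minimal sketch of tuning them before a maintenance-heavy test, reusing the node.sql / cluster.nodes helpers that the new test below relies on; the pool size of 10 is the value the test itself uses, while the timeout value is only an illustration and is assumed to be in milliseconds, per the error message above:

# Hypothetical pre-test tuning; not part of this commit.
for node in cluster.nodes:
    node.sql("ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10;")
    node.sql("ALTER SYSTEM SET citus.maintenance_connection_pool_timeout = 30000;")
    node.sql("SELECT pg_reload_conf();")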

@@ -823,7 +823,7 @@ class Postgres(QueryRunner):
         # of our tests
         pgconf.write("max_logical_replication_workers = 50\n")
         pgconf.write("max_wal_senders = 50\n")
-        pgconf.write("max_worker_processes = 50\n")
+        pgconf.write("max_worker_processes = 150\n")
         pgconf.write("max_replication_slots = 50\n")
 
         # We need to make the log go to stderr so that the tests can
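
A rough worker-process budget behind this bump (an estimate, not stated in the commit): each of the 40 Citus-enabled databases created by the new test runs its own maintenance daemon as a background worker, so those 40 daemons plus up to 50 logical replication workers and the usual autovacuum and parallel workers would not fit under the old max_worker_processes = 50; 150 leaves comfortable headroom.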

@@ -0,0 +1,105 @@
import asyncio

import psycopg
import pytest
from psycopg.errors import DeadlockDetected

DATABASES_NUMBER = 40

async def test_multiple_databases_distributed_deadlock_detection(cluster):
    # Disable maintenance on all nodes
    for node in cluster.nodes:
        node.sql("ALTER SYSTEM SET citus.recover_2pc_interval TO '-1';")
        node.sql("ALTER SYSTEM SET citus.distributed_deadlock_detection_factor = '-1';")
        node.sql("ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10;")
        node.sql("SELECT pg_reload_conf();")

    # Prepare database names for the test
    db_names = [f"db{db_index}" for db_index in range(1, DATABASES_NUMBER + 1)]

    # Create and configure databases
    for db_name in db_names:
        nodes = cluster.workers + [cluster.coordinator]
        for node in nodes:
            node.sql(f"CREATE DATABASE {db_name}")
            with node.cur(dbname=db_name) as node_cursor:
                node_cursor.execute("CREATE EXTENSION citus;")
                if node == cluster.coordinator:
                    for worker in cluster.workers:
                        node_cursor.execute(
                            f"SELECT citus_add_node('localhost', {worker.port});"
                        )
                    node_cursor.execute(
                        """
                        CREATE TABLE public.deadlock_detection_test (user_id int UNIQUE, some_val int);
                        SELECT create_distributed_table('public.deadlock_detection_test', 'user_id');
                        INSERT INTO public.deadlock_detection_test SELECT i, i FROM generate_series(1,2) i;
                        """
                    )
    print("Setup is done")

    async def test_deadlock(db_name, run_on_coordinator):
        """Function to prepare a deadlock query in a given database"""
        # Init connections and store for later commits
        if run_on_coordinator:
            first_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False)
            first_cursor = first_connection.cursor()
            second_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False)
            second_cursor = second_connection.cursor()
        else:
            first_connection = await cluster.workers[0].aconn(dbname=db_name, autocommit=False)
            first_cursor = first_connection.cursor()
            second_connection = await cluster.workers[1].aconn(dbname=db_name, autocommit=False)
            second_cursor = second_connection.cursor()

        # Initiate the deadlock: each session locks one row
        await first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 1;")
        await second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 2;")

        # Test that the deadlock is resolved by a maintenance daemon
        with pytest.raises(DeadlockDetected):

            async def run_deadlocked_queries():
                # Each session now tries to update the row held by the other one
                await asyncio.gather(
                    second_cursor.execute(
                        "UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 1;"
                    ),
                    first_cursor.execute(
                        "UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 2;"
                    ),
                )

            await asyncio.wait_for(run_deadlocked_queries(), 300)

    async def enable_maintenance_when_deadlocks_ready():
        """Function to enable maintenance daemons when all the expected deadlock queries are ready"""
        # Let deadlocks commence
        await asyncio.sleep(2)
        # cluster.debug()
        # Check that queries are deadlocked
        databases_with_deadlock = set()
        while len(databases_with_deadlock) < DATABASES_NUMBER:
            for db_name in (db for db in db_names if db not in databases_with_deadlock):
                for node in cluster.nodes:
                    async with node.acur(dbname=db_name) as cursor:
                        expected_lock_count = 4 if node == cluster.coordinator else 2
                        await cursor.execute(
                            f"""
                            SELECT count(*) = {expected_lock_count} AS deadlock_created
                            FROM pg_locks
                            INNER JOIN pg_class pc ON relation = oid
                            WHERE relname LIKE 'deadlock_detection_test%'"""
                        )
                        queries_deadlocked = await cursor.fetchone()
                        if queries_deadlocked[0]:
                            print(f"Queries are deadlocked on {db_name}")
                            databases_with_deadlock.add(db_name)
        print("Queries on all databases are deadlocked, enabling maintenance")

        # Enable maintenance back
        for node in cluster.nodes:
            node.sql("ALTER SYSTEM RESET citus.recover_2pc_interval;")
            node.sql("ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;")
            node.sql("SELECT pg_reload_conf();")

    tasks = []
    for idx, db_name in enumerate(db_names):
        # Run every third deadlock through the coordinator, the rest across the workers
        run_on_coordinator = idx % 3 == 0
        tasks.append(test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator))
    tasks.append(enable_maintenance_when_deadlocks_ready())

    await asyncio.gather(*tasks)
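
For illustration, a distilled standalone version of the per-database scenario each test_deadlock task orchestrates, using plain psycopg instead of the cluster fixture's aconn() helper. This sketch is not part of the commit; the connection strings and ports are placeholders:

import asyncio

import psycopg
from psycopg.errors import DeadlockDetected


async def create_deadlock(dsn_first, dsn_second):
    # autocommit=False keeps each session's row lock held until commit/rollback
    first = await psycopg.AsyncConnection.connect(dsn_first, autocommit=False)
    second = await psycopg.AsyncConnection.connect(dsn_second, autocommit=False)
    first_cur = first.cursor()
    second_cur = second.cursor()

    # Each session locks one row of the distributed table
    await first_cur.execute("UPDATE deadlock_detection_test SET some_val = 1 WHERE user_id = 1")
    await second_cur.execute("UPDATE deadlock_detection_test SET some_val = 2 WHERE user_id = 2")

    try:
        # Each session then tries to update the row the other one holds; one of
        # the two statements is expected to fail with DeadlockDetected once the
        # Citus maintenance daemon resolves the distributed deadlock.
        await asyncio.gather(
            second_cur.execute("UPDATE deadlock_detection_test SET some_val = 2 WHERE user_id = 1"),
            first_cur.execute("UPDATE deadlock_detection_test SET some_val = 1 WHERE user_id = 2"),
        )
    except DeadlockDetected:
        print("distributed deadlock detected and resolved")
    finally:
        await first.close()
        await second.close()


# Example invocation (placeholder DSNs pointing at two workers of one database):
# asyncio.run(create_deadlock("dbname=db1 port=9701", "dbname=db1 port=9702"))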