From b2051bffd7bc7a3afd06474ee0e32f38031bb44b Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 5 Oct 2023 15:50:20 +0700 Subject: [PATCH 01/41] Move databaseOid from SharedConnStatsHashKey to SharedConnStatsHashEntry --- .../connection/shared_connection_stats.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 26598b465..b748fdf02 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -68,18 +68,13 @@ typedef struct SharedConnStatsHashKey */ char hostname[MAX_NODE_LENGTH]; int32 port; - - /* - * Given that citus.shared_max_pool_size can be defined per database, we - * should keep track of shared connections per database. - */ - Oid databaseOid; } SharedConnStatsHashKey; /* hash entry for per worker stats */ typedef struct SharedConnStatsHashEntry { SharedConnStatsHashKey key; + Oid databaseOid; int connectionCount; } SharedConnStatsHashEntry; @@ -169,7 +164,7 @@ StoreAllRemoteConnectionStats(Tuplestorestate *tupleStore, TupleDesc tupleDescri memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); - char *databaseName = get_database_name(connectionEntry->key.databaseOid); + char *databaseName = get_database_name(connectionEntry->databaseOid); if (databaseName == NULL) { /* database might have been dropped */ @@ -308,7 +303,6 @@ TryToIncrementSharedConnectionCounter(const char *hostname, int port) } connKey.port = port; - connKey.databaseOid = MyDatabaseId; /* * Handle adaptive connection management for the local node slightly different @@ -363,6 +357,7 @@ TryToIncrementSharedConnectionCounter(const char *hostname, int port) { /* we successfully allocated the entry for the first time, so initialize it */ connectionEntry->connectionCount = 1; + connectionEntry->databaseOid = MyDatabaseId; counterIncremented = true; } @@ -435,7 +430,6 @@ IncrementSharedConnectionCounter(const char *hostname, int port) } connKey.port = port; - connKey.databaseOid = MyDatabaseId; LockConnectionSharedMemory(LW_EXCLUSIVE); @@ -467,6 +461,7 @@ IncrementSharedConnectionCounter(const char *hostname, int port) { /* we successfully allocated the entry for the first time, so initialize it */ connectionEntry->connectionCount = 0; + connectionEntry->databaseOid = MyDatabaseId; } connectionEntry->connectionCount += 1; @@ -503,7 +498,6 @@ DecrementSharedConnectionCounter(const char *hostname, int port) } connKey.port = port; - connKey.databaseOid = MyDatabaseId; LockConnectionSharedMemory(LW_EXCLUSIVE); @@ -807,7 +801,6 @@ SharedConnectionHashHash(const void *key, Size keysize) uint32 hash = string_hash(entry->hostname, NAMEDATALEN); hash = hash_combine(hash, hash_uint32(entry->port)); - hash = hash_combine(hash, hash_uint32(entry->databaseOid)); return hash; } @@ -820,8 +813,7 @@ SharedConnectionHashCompare(const void *a, const void *b, Size keysize) SharedConnStatsHashKey *cb = (SharedConnStatsHashKey *) b; if (strncmp(ca->hostname, cb->hostname, MAX_NODE_LENGTH) != 0 || - ca->port != cb->port || - ca->databaseOid != cb->databaseOid) + ca->port != cb->port) { return 1; } From 23e6b36e23012d66e67cd351f25a25cddff0df45 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 13 Oct 2023 21:34:55 +0700 Subject: [PATCH 02/41] - Separate HTAB for connection management and statistics. 
- Refactoring --- .../connection/connection_management.c | 11 +- .../locally_reserved_shared_connections.c | 11 +- .../connection/shared_connection_stats.c | 644 ++++++++++-------- .../operations/worker_node_manager.c | 1 + .../distributed/connection_management.h | 4 +- .../distributed/shared_connection_stats.h | 11 +- src/include/distributed/worker_manager.h | 1 + 7 files changed, 392 insertions(+), 291 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index f8e4816ed..96e750059 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -379,7 +379,10 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, if (flags & WAIT_FOR_CONNECTION) { - WaitLoopForSharedConnection(hostname, port); + int sharedCounterFlags = (flags & REQUIRE_MAINTENANCE_CONNECTION) + ? MAINTENANCE_CONNECTION_POOL + : 0; + WaitLoopForSharedConnection(sharedCounterFlags, hostname, port); } else if (flags & OPTIONAL_CONNECTION) { @@ -389,7 +392,8 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, * cannot reserve the right to establish a connection, we prefer to * error out. */ - if (!TryToIncrementSharedConnectionCounter(hostname, port)) + int sharedCounterFlags = 0; + if (!TryToIncrementSharedConnectionCounter(sharedCounterFlags, hostname, port)) { /* do not track the connection anymore */ dlist_delete(&connection->connectionNode); @@ -409,7 +413,8 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, * * Still, we keep track of the connection counter. */ - IncrementSharedConnectionCounter(hostname, port); + int sharedCounterFlags = 0; + IncrementSharedConnectionCounter(sharedCounterFlags, hostname, port); } diff --git a/src/backend/distributed/connection/locally_reserved_shared_connections.c b/src/backend/distributed/connection/locally_reserved_shared_connections.c index a64930b32..a2d2fac24 100644 --- a/src/backend/distributed/connection/locally_reserved_shared_connections.c +++ b/src/backend/distributed/connection/locally_reserved_shared_connections.c @@ -439,13 +439,16 @@ EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnectio * Increment the shared counter, we may need to wait if there are * no space left. 
 	 */
-	WaitLoopForSharedConnection(workerNode->workerName, workerNode->workerPort);
+	int sharedCounterFlags = 0;
+	WaitLoopForSharedConnection(sharedCounterFlags, workerNode->workerName, workerNode->workerPort);
 }
 else
 {
-	bool incremented =
-		TryToIncrementSharedConnectionCounter(workerNode->workerName,
-											  workerNode->workerPort);
+	int sharedCounterFlags = 0;
+	bool incremented = TryToIncrementSharedConnectionCounter(
+		sharedCounterFlags,
+		workerNode->workerName,
+		workerNode->workerPort);
 	if (!incremented)
 	{
 		/*
diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c
index b748fdf02..40d74ca11 100644
--- a/src/backend/distributed/connection/shared_connection_stats.c
+++ b/src/backend/distributed/connection/shared_connection_stats.c
@@ -18,7 +18,6 @@
 #include "access/hash.h"
 #include "access/htup_details.h"
-#include "catalog/pg_authid.h"
 #include "commands/dbcommands.h"
 #include "common/hashfn.h"
 #include "storage/ipc.h"
@@ -27,15 +26,13 @@
 #include "pg_version_constants.h"
 
 #include "distributed/backend_data.h"
-#include "distributed/cancel_utils.h"
 #include "distributed/connection_management.h"
-#include "distributed/listutils.h"
 #include "distributed/locally_reserved_shared_connections.h"
 #include "distributed/metadata_cache.h"
-#include "distributed/multi_executor.h"
 #include "distributed/placement_connection.h"
 #include "distributed/shared_connection_stats.h"
 #include "distributed/time_constants.h"
+#include "distributed/worker_manager.h"
 #include "distributed/tuplestore.h"
 #include "distributed/worker_manager.h"
 
@@ -58,8 +55,14 @@ typedef struct ConnectionStatsSharedData
 	ConditionVariable waitersConditionVariable;
 } ConnectionStatsSharedData;
 
-
-typedef struct SharedConnStatsHashKey
+/*
+ * There are two hash tables:
+ *
+ * 1. The first one tracks the connection count per worker node and is used for connection throttling
+ * 2. The second one tracks the connection count per database on a worker node and is used for statistics
+ *
+ */
+typedef struct SharedWorkerNodeConnStatsHashKey
 {
 	/*
 	 * We keep the entries in the shared memory even after master_update_node()
@@ -68,16 +71,28 @@ typedef struct SharedConnStatsHashKey
 	 */
 	char hostname[MAX_NODE_LENGTH];
 	int32 port;
-} SharedConnStatsHashKey;
+} SharedWorkerNodeConnStatsHashKey;
+
+typedef struct SharedWorkerNodeDatabaseConnStatsHashKey
+{
+	SharedWorkerNodeConnStatsHashKey workerNodeKey;
+	Oid database;
+} SharedWorkerNodeDatabaseConnStatsHashKey;
 
 /* hash entry for per worker stats */
-typedef struct SharedConnStatsHashEntry
+typedef struct SharedWorkerNodeConnStatsHashEntry
 {
-	SharedConnStatsHashKey key;
-	Oid databaseOid;
+	SharedWorkerNodeConnStatsHashKey key;
 
-	int connectionCount;
-} SharedConnStatsHashEntry;
+	int count;
+} SharedWorkerNodeConnStatsHashEntry;
+
+/* hash entry for per database on worker stats */
+typedef struct SharedWorkerNodeDatabaseConnStatsHashEntry
+{
+	SharedWorkerNodeDatabaseConnStatsHashKey key;
+	int count;
+} SharedWorkerNodeDatabaseConnStatsHashEntry;
 
 
 /*
@@ -88,6 +103,11 @@ typedef struct SharedConnStatsHashEntry
  */
 int MaxSharedPoolSize = 0;
 
+/*
+ * Percent of MaxSharedPoolSize reserved for maintenance operations.
+ */
+float SharedPoolSizeMaintenancePercent = 10.0f;
+
 /*
  * Controlled via a GUC, never access directly, use GetLocalSharedPoolSize().
  * "0" means adjust LocalSharedPoolSize automatically by using MaxConnections.
@@ -101,7 +121,8 @@ int MaxClientConnections = ALLOW_ALL_EXTERNAL_CONNECTIONS; /* the following two structs are used for accessing shared memory */ -static HTAB *SharedConnStatsHash = NULL; +static HTAB *SharedWorkerNodeConnStatsHash = NULL; +static HTAB *SharedWorkerNodeDatabaseConnStatsHash = NULL; static ConnectionStatsSharedData *ConnectionStatsSharedState = NULL; @@ -116,10 +137,25 @@ static void UnLockConnectionSharedMemory(void); static bool ShouldWaitForConnection(int currentConnectionCount); static uint32 SharedConnectionHashHash(const void *key, Size keysize); static int SharedConnectionHashCompare(const void *a, const void *b, Size keysize); +static uint32 SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize); +static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize); +static bool IsConnectionToLocalNode(SharedWorkerNodeConnStatsHashKey *connKey); +static bool isConnectionSlotAvailable(SharedWorkerNodeConnStatsHashKey *connKey, + const SharedWorkerNodeConnStatsHashEntry *connectionEntry); +static bool +IncrementSharedConnectionCounterInternal(uint32 externalFlags, bool checkLimits, const char *hostname, int port, + Oid database); +static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int port); +static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const char *hostname, + int port, + Oid database); +static void +DecrementSharedConnectionCounterInternal(const char *hostname, int port); PG_FUNCTION_INFO_V1(citus_remote_connection_stats); + /* * citus_remote_connection_stats returns all the avaliable information about all * the remote connections (a.k.a., connections to remote nodes). @@ -155,26 +191,26 @@ StoreAllRemoteConnectionStats(Tuplestorestate *tupleStore, TupleDesc tupleDescri LockConnectionSharedMemory(LW_SHARED); HASH_SEQ_STATUS status; - SharedConnStatsHashEntry *connectionEntry = NULL; + SharedWorkerNodeDatabaseConnStatsHashEntry *connectionEntry = NULL; - hash_seq_init(&status, SharedConnStatsHash); - while ((connectionEntry = (SharedConnStatsHashEntry *) hash_seq_search(&status)) != 0) + hash_seq_init(&status, SharedWorkerNodeDatabaseConnStatsHash); + while ((connectionEntry = (SharedWorkerNodeDatabaseConnStatsHashEntry *) hash_seq_search(&status)) != 0) { /* get ready for the next tuple */ memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); - char *databaseName = get_database_name(connectionEntry->databaseOid); + char *databaseName = get_database_name(connectionEntry->key.database); if (databaseName == NULL) { /* database might have been dropped */ continue; } - values[0] = PointerGetDatum(cstring_to_text(connectionEntry->key.hostname)); - values[1] = Int32GetDatum(connectionEntry->key.port); + values[0] = PointerGetDatum(cstring_to_text(connectionEntry->key.workerNodeKey.hostname)); + values[1] = Int32GetDatum(connectionEntry->key.workerNodeKey.port); values[2] = PointerGetDatum(cstring_to_text(databaseName)); - values[3] = Int32GetDatum(connectionEntry->connectionCount); + values[3] = Int32GetDatum(connectionEntry->count); tuplestore_putvalues(tupleStore, tupleDescriptor, values, isNulls); } @@ -220,6 +256,12 @@ GetMaxSharedPoolSize(void) return MaxSharedPoolSize; } +float +GetSharedPoolSizeMaintenancePercent(void) +{ + return SharedPoolSizeMaintenancePercent; +} + /* * GetLocalSharedPoolSize is a wrapper around LocalSharedPoolSize which is @@ -243,14 +285,14 @@ GetLocalSharedPoolSize(void) /* * WaitLoopForSharedConnection tries to 
increment the shared connection * counter for the given hostname/port and the current database in - * SharedConnStatsHash. + * SharedWorkerNodeConnStatsHash. * * The function implements a retry mechanism via a condition variable. */ void -WaitLoopForSharedConnection(const char *hostname, int port) +WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port) { - while (!TryToIncrementSharedConnectionCounter(hostname, port)) + while (!TryToIncrementSharedConnectionCounter(flags, hostname, port)) { CHECK_FOR_INTERRUPTS(); @@ -260,149 +302,37 @@ WaitLoopForSharedConnection(const char *hostname, int port) ConditionVariableCancelSleep(); } - /* * TryToIncrementSharedConnectionCounter tries to increment the shared * connection counter for the given nodeId and the current database in - * SharedConnStatsHash. + * SharedWorkerNodeConnStatsHash. * * If the function returns true, the caller is allowed (and expected) * to establish a new connection to the given node. Else, the caller * is not allowed to establish a new connection. */ bool -TryToIncrementSharedConnectionCounter(const char *hostname, int port) +TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - if (GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING) + if (GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING) { /* connection throttling disabled */ return true; } - bool counterIncremented = false; - SharedConnStatsHashKey connKey; - - strlcpy(connKey.hostname, hostname, MAX_NODE_LENGTH); - if (strlen(hostname) > MAX_NODE_LENGTH) - { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("hostname exceeds the maximum length of %d", - MAX_NODE_LENGTH))); - } - - /* + /* * The local session might already have some reserved connections to the given * node. In that case, we don't need to go through the shared memory. */ - Oid userId = GetUserId(); - if (CanUseReservedConnection(hostname, port, userId, MyDatabaseId)) - { - MarkReservedConnectionUsed(hostname, port, userId, MyDatabaseId); + Oid userId = GetUserId(); + if (CanUseReservedConnection(hostname, port, userId, MyDatabaseId)) + { + MarkReservedConnectionUsed(hostname, port, userId, MyDatabaseId); - return true; - } + return true; + } - connKey.port = port; - - /* - * Handle adaptive connection management for the local node slightly different - * as local node can failover to local execution. - */ - bool connectionToLocalNode = false; - int activeBackendCount = 0; - WorkerNode *workerNode = FindWorkerNode(hostname, port); - if (workerNode) - { - connectionToLocalNode = (workerNode->groupId == GetLocalGroupId()); - if (connectionToLocalNode && - GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES) - { - /* - * This early return is required as LocalNodeParallelExecutionFactor - * is ignored for the first connection below. This check makes the - * user experience is more accurate and also makes it easy for - * having regression tests which emulates the local node adaptive - * connection management. - */ - return false; - } - - activeBackendCount = GetExternalClientBackendCount(); - } - - LockConnectionSharedMemory(LW_EXCLUSIVE); - - /* - * As the hash map is allocated in shared memory, it doesn't rely on palloc for - * memory allocation, so we could get NULL via HASH_ENTER_NULL when there is no - * space in the shared memory. That's why we prefer continuing the execution - * instead of throwing an error. 
- */ - bool entryFound = false; - SharedConnStatsHashEntry *connectionEntry = - hash_search(SharedConnStatsHash, &connKey, HASH_ENTER_NULL, &entryFound); - - /* - * It is possible to throw an error at this point, but that doesn't help us in anyway. - * Instead, we try our best, let the connection establishment continue by-passing the - * connection throttling. - */ - if (!connectionEntry) - { - UnLockConnectionSharedMemory(); - return true; - } - - if (!entryFound) - { - /* we successfully allocated the entry for the first time, so initialize it */ - connectionEntry->connectionCount = 1; - connectionEntry->databaseOid = MyDatabaseId; - - counterIncremented = true; - } - else if (connectionToLocalNode) - { - /* - * For local nodes, solely relying on citus.max_shared_pool_size or - * max_connections might not be sufficient. The former gives us - * a preview of the future (e.g., we let the new connections to establish, - * but they are not established yet). The latter gives us the close to - * precise view of the past (e.g., the active number of client backends). - * - * Overall, we want to limit both of the metrics. The former limit typically - * kicks in under regular loads, where the load of the database increases in - * a reasonable pace. The latter limit typically kicks in when the database - * is issued lots of concurrent sessions at the same time, such as benchmarks. - */ - if (activeBackendCount + 1 > GetLocalSharedPoolSize()) - { - counterIncremented = false; - } - else if (connectionEntry->connectionCount + 1 > GetLocalSharedPoolSize()) - { - counterIncremented = false; - } - else - { - connectionEntry->connectionCount++; - counterIncremented = true; - } - } - else if (connectionEntry->connectionCount + 1 > GetMaxSharedPoolSize()) - { - /* there is no space left for this connection */ - counterIncremented = false; - } - else - { - connectionEntry->connectionCount++; - counterIncremented = true; - } - - UnLockConnectionSharedMemory(); - - return counterIncremented; + return IncrementSharedConnectionCounterInternal(flags, true, hostname, port, MyDatabaseId); } @@ -411,64 +341,162 @@ TryToIncrementSharedConnectionCounter(const char *hostname, int port) * for the given hostname and port. */ void -IncrementSharedConnectionCounter(const char *hostname, int port) +IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - SharedConnStatsHashKey connKey; + if (MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING) + { + /* connection throttling disabled */ + return; + } - if (MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING) - { - /* connection throttling disabled */ - return; - } - - strlcpy(connKey.hostname, hostname, MAX_NODE_LENGTH); - if (strlen(hostname) > MAX_NODE_LENGTH) - { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("hostname exceeds the maximum length of %d", - MAX_NODE_LENGTH))); - } - - connKey.port = port; - - LockConnectionSharedMemory(LW_EXCLUSIVE); - - /* - * As the hash map is allocated in shared memory, it doesn't rely on palloc for - * memory allocation, so we could get NULL via HASH_ENTER_NULL. That's why we prefer - * continuing the execution instead of throwing an error. - */ - bool entryFound = false; - SharedConnStatsHashEntry *connectionEntry = - hash_search(SharedConnStatsHash, &connKey, HASH_ENTER_NULL, &entryFound); - - /* - * It is possible to throw an error at this point, but that doesn't help us in anyway. 
- * Instead, we try our best, let the connection establishment continue by-passing the - * connection throttling. - */ - if (!connectionEntry) - { - UnLockConnectionSharedMemory(); - - ereport(DEBUG4, (errmsg("No entry found for node %s:%d while incrementing " - "connection counter", hostname, port))); - - return; - } - - if (!entryFound) - { - /* we successfully allocated the entry for the first time, so initialize it */ - connectionEntry->connectionCount = 0; - connectionEntry->databaseOid = MyDatabaseId; - } - - connectionEntry->connectionCount += 1; - - UnLockConnectionSharedMemory(); + IncrementSharedConnectionCounterInternal(flags, false, hostname, port, MyDatabaseId); } +static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int port) +{ + SharedWorkerNodeConnStatsHashKey key; + strlcpy(key.hostname, hostname, MAX_NODE_LENGTH); + if (strlen(hostname) > MAX_NODE_LENGTH) + { + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("hostname exceeds the maximum length of %d", + MAX_NODE_LENGTH))); + } + key.port = port; + return key; +} + +static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const char *hostname, + int port, + Oid database) +{ + SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey; + workerNodeDatabaseKey.workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); + workerNodeDatabaseKey.database = database; + return workerNodeDatabaseKey; +} + +static bool +IncrementSharedConnectionCounterInternal(uint32 externalFlags, + bool checkLimits, + const char *hostname, + int port, + Oid database) +{ + LockConnectionSharedMemory(LW_EXCLUSIVE); + + /* + * As the hash map is allocated in shared memory, it doesn't rely on palloc for + * memory allocation, so we could get NULL via HASH_ENTER_NULL when there is no + * space in the shared memory. That's why we prefer continuing the execution + * instead of throwing an error. + */ + SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); + bool workerNodeEntryFound = false; + SharedWorkerNodeConnStatsHashEntry *workerNodeConnectionEntry = + hash_search(SharedWorkerNodeConnStatsHash, + &workerNodeKey, + HASH_ENTER_NULL, + &workerNodeEntryFound); + + /* + * It is possible to throw an error at this point, but that doesn't help us in anyway. + * Instead, we try our best, let the connection establishment continue by-passing the + * connection throttling. 
+ */ + if (!workerNodeConnectionEntry) + { + UnLockConnectionSharedMemory(); + return true; + } + + if (!workerNodeEntryFound) + { + /* we successfully allocated the entry for the first time, so initialize it */ + workerNodeConnectionEntry->count = 0; + } + + /* Initialized SharedWorkerNodeDatabaseConnStatsHash the same way */ + SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey = + PrepareWorkerNodeDatabaseHashKey(hostname, port, database); + bool workerNodeDatabaseEntryFound = false; + SharedWorkerNodeDatabaseConnStatsHashEntry *workerNodeDatabaseEntry = + hash_search(SharedWorkerNodeDatabaseConnStatsHash, + &workerNodeDatabaseKey, + HASH_ENTER_NULL, + &workerNodeDatabaseEntryFound); + + if (!workerNodeDatabaseEntry) + { + UnLockConnectionSharedMemory(); + return true; + } + + if (!workerNodeDatabaseEntryFound) + { + workerNodeDatabaseEntry->count = 0; + } + + /* Increment counter if possible */ + bool connectionSlotAvailable = true; + connectionSlotAvailable = !checkLimits || + isConnectionSlotAvailable(&workerNodeKey, workerNodeConnectionEntry); + + if (connectionSlotAvailable) + { + workerNodeConnectionEntry->count += 1; + workerNodeDatabaseEntry->count += 1; + } + + UnLockConnectionSharedMemory(); + + return connectionSlotAvailable; +} + +static bool IsConnectionToLocalNode(SharedWorkerNodeConnStatsHashKey *connKey) +{ + WorkerNode *workerNode = FindWorkerNode(connKey->hostname, connKey->port); + return workerNode && (workerNode->groupId == GetLocalGroupId()); +} + + +static bool isConnectionSlotAvailable(SharedWorkerNodeConnStatsHashKey *connKey, + const SharedWorkerNodeConnStatsHashEntry *connectionEntry) +{ + bool connectionSlotAvailable = true; + bool connectionToLocalNode = IsConnectionToLocalNode(connKey); + if (connectionToLocalNode) + { + bool remoteConnectionsForLocalQueriesDisabled = + GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES; + /* + * For local nodes, solely relying on citus.max_shared_pool_size or + * max_connections might not be sufficient. The former gives us + * a preview of the future (e.g., we let the new connections to establish, + * but they are not established yet). The latter gives us the close to + * precise view of the past (e.g., the active number of client backends). + * + * Overall, we want to limit both of the metrics. The former limit typically + * kicks in under regular loads, where the load of the database increases in + * a reasonable pace. The latter limit typically kicks in when the database + * is issued lots of concurrent sessions at the same time, such as benchmarks. + */ + bool localConnectionLimitExceeded = + GetExternalClientBackendCount() + 1 > GetLocalSharedPoolSize() || + connectionEntry->count + 1 > GetLocalSharedPoolSize(); + if (remoteConnectionsForLocalQueriesDisabled || localConnectionLimitExceeded) + { + connectionSlotAvailable = false; + } + } + else if (connectionEntry->count + 1 > GetMaxSharedPoolSize()) + { + connectionSlotAvailable = false; + } + return connectionSlotAvailable; +} + + /* * DecrementSharedConnectionCounter decrements the shared counter @@ -477,79 +505,96 @@ IncrementSharedConnectionCounter(const char *hostname, int port) void DecrementSharedConnectionCounter(const char *hostname, int port) { - SharedConnStatsHashKey connKey; - /* - * Do not call GetMaxSharedPoolSize() here, since it may read from - * the catalog and we may be in the process exit handler. 
- */ - if (MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING) - { - /* connection throttling disabled */ - return; - } + DecrementSharedConnectionCounterInternal(hostname, port); + UnLockConnectionSharedMemory(); + WakeupWaiterBackendsForSharedConnection(); +} - strlcpy(connKey.hostname, hostname, MAX_NODE_LENGTH); - if (strlen(hostname) > MAX_NODE_LENGTH) - { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("hostname exceeds the maximum length of %d", - MAX_NODE_LENGTH))); - } +static void +DecrementSharedConnectionCounterInternal(const char *hostname, int port) +{ + /* + * Do not call GetMaxSharedPoolSize() here, since it may read from + * the catalog and we may be in the process exit handler. + */ + if (MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING) + { + /* connection throttling disabled */ + return; + } - connKey.port = port; + LockConnectionSharedMemory(LW_EXCLUSIVE); - LockConnectionSharedMemory(LW_EXCLUSIVE); + bool workerNodeEntryFound = false; + SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); + SharedWorkerNodeConnStatsHashEntry *workerNodeEntry = + hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_FIND, &workerNodeEntryFound); - bool entryFound = false; - SharedConnStatsHashEntry *connectionEntry = - hash_search(SharedConnStatsHash, &connKey, HASH_FIND, &entryFound); + /* this worker node is removed or updated, no need to care */ + if (!workerNodeEntryFound) + { + ereport(DEBUG4, (errmsg("No entry found for node %s:%d while decrementing " + "connection counter", hostname, port))); + return; + } - /* this worker node is removed or updated, no need to care */ - if (!entryFound) - { - UnLockConnectionSharedMemory(); + /* we should never go below 0 */ + Assert(workerNodeEntry->count > 0); - /* wake up any waiters in case any backend is waiting for this node */ - WakeupWaiterBackendsForSharedConnection(); - ereport(DEBUG4, (errmsg("No entry found for node %s:%d while decrementing " - "connection counter", hostname, port))); + workerNodeEntry->count -= 1; - return; - } - /* we should never go below 0 */ - Assert(connectionEntry->connectionCount > 0); + /* + * We don't have to remove at this point as the node might be still active + * and will have new connections open to it. Still, this seems like a convenient + * place to remove the entry, as count == 0 implies that the server is + * not busy, and given the default value of MaxCachedConnectionsPerWorker = 1, + * we're unlikely to trigger this often. + */ + if (workerNodeEntry->count == 0) + { + hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_REMOVE, NULL); + } - connectionEntry->connectionCount -= 1; + /* + * Perform the same with SharedWorkerNodeDatabaseConnStatsHashKey + */ - if (connectionEntry->connectionCount == 0) - { - /* - * We don't have to remove at this point as the node might be still active - * and will have new connections open to it. Still, this seems like a convenient - * place to remove the entry, as connectionCount == 0 implies that the server is - * not busy, and given the default value of MaxCachedConnectionsPerWorker = 1, - * we're unlikely to trigger this often. 
- */ - hash_search(SharedConnStatsHash, &connKey, HASH_REMOVE, &entryFound); - } + SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey = + PrepareWorkerNodeDatabaseHashKey(hostname, port, MyDatabaseId); + bool workerNodeDatabaseEntryFound = false; + SharedWorkerNodeDatabaseConnStatsHashEntry *workerNodeDatabaseEntry = + hash_search(SharedWorkerNodeDatabaseConnStatsHash, + &workerNodeDatabaseKey, + HASH_FIND, + &workerNodeDatabaseEntryFound); - UnLockConnectionSharedMemory(); + if (!workerNodeDatabaseEntryFound) + { + return; + } - WakeupWaiterBackendsForSharedConnection(); + Assert(workerNodeDatabaseEntry->count > 0); + + workerNodeDatabaseEntry->count -= 1; + + if (workerNodeDatabaseEntry->count == 0) + { + hash_search(SharedWorkerNodeDatabaseConnStatsHash, &workerNodeDatabaseKey, HASH_REMOVE, NULL); + } } /* * LockConnectionSharedMemory is a utility function that should be used when - * accessing to the SharedConnStatsHash, which is in the shared memory. + * accessing to the SharedWorkerNodeConnStatsHash, which is in the shared memory. */ static void LockConnectionSharedMemory(LWLockMode lockMode) { - LWLockAcquire(&ConnectionStatsSharedState->sharedConnectionHashLock, lockMode); + LWLockAcquire(&ConnectionStatsSharedState->sharedConnectionHashLock, lockMode); } @@ -634,12 +679,17 @@ SharedConnectionStatsShmemSize(void) size = add_size(size, sizeof(ConnectionStatsSharedData)); - Size hashSize = hash_estimate_size(MaxWorkerNodesTracked, - sizeof(SharedConnStatsHashEntry)); + Size workerNodeConnHashSize = hash_estimate_size(MaxWorkerNodesTracked, + sizeof(SharedWorkerNodeConnStatsHashEntry)); - size = add_size(size, hashSize); + size = add_size(size, workerNodeConnHashSize); - return size; + Size workerNodeDatabaseConnSize = hash_estimate_size(DatabasesPerWorker, + sizeof(SharedWorkerNodeDatabaseConnStatsHashEntry)); + + size = add_size(size, workerNodeDatabaseConnSize); + + return size; } @@ -651,15 +701,6 @@ void SharedConnectionStatsShmemInit(void) { bool alreadyInitialized = false; - HASHCTL info; - - /* create (hostname, port, database) -> [counter] */ - memset(&info, 0, sizeof(info)); - info.keysize = sizeof(SharedConnStatsHashKey); - info.entrysize = sizeof(SharedConnStatsHashEntry); - info.hash = SharedConnectionHashHash; - info.match = SharedConnectionHashCompare; - uint32 hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); /* * Currently the lock isn't required because allocation only happens at @@ -688,14 +729,42 @@ SharedConnectionStatsShmemInit(void) ConditionVariableInit(&ConnectionStatsSharedState->waitersConditionVariable); } - /* allocate hash table */ - SharedConnStatsHash = - ShmemInitHash("Shared Conn. Stats Hash", MaxWorkerNodesTracked, - MaxWorkerNodesTracked, &info, hashFlags); + /* allocate hash tables */ + + /* create (hostname, port) -> [counter] */ + HASHCTL sharedWorkerNodeConnStatsHashInfo; + memset(&sharedWorkerNodeConnStatsHashInfo, 0, sizeof(sharedWorkerNodeConnStatsHashInfo)); + sharedWorkerNodeConnStatsHashInfo.keysize = sizeof(SharedWorkerNodeConnStatsHashKey); + sharedWorkerNodeConnStatsHashInfo.entrysize = sizeof(SharedWorkerNodeConnStatsHashEntry); + sharedWorkerNodeConnStatsHashInfo.hash = SharedConnectionHashHash; + sharedWorkerNodeConnStatsHashInfo.match = SharedConnectionHashCompare; + SharedWorkerNodeConnStatsHash = + ShmemInitHash("Shared Conn. 
Stats Hash", + MaxWorkerNodesTracked, + MaxWorkerNodesTracked, + &sharedWorkerNodeConnStatsHashInfo, + (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE)); + + /* create (hostname, port, database) -> [counter] */ + HASHCTL sharedWorkerNodeDatabaseConnStatsHashInfo; + memset(&sharedWorkerNodeDatabaseConnStatsHashInfo, 0, sizeof(sharedWorkerNodeDatabaseConnStatsHashInfo)); + sharedWorkerNodeDatabaseConnStatsHashInfo.keysize = sizeof(SharedWorkerNodeDatabaseConnStatsHashKey); + sharedWorkerNodeDatabaseConnStatsHashInfo.entrysize = sizeof(SharedWorkerNodeDatabaseConnStatsHashEntry); + sharedWorkerNodeDatabaseConnStatsHashInfo.hash = SharedWorkerNodeDatabaseHashHash; + sharedWorkerNodeDatabaseConnStatsHashInfo.match = SharedWorkerNodeDatabaseHashCompare; + + int sharedWorkerNodeDatabaseConnStatsHashSize = MaxWorkerNodesTracked * DatabasesPerWorker; + SharedWorkerNodeDatabaseConnStatsHash = + ShmemInitHash("Shared Conn Per Database. Stats Hash", + sharedWorkerNodeDatabaseConnStatsHashSize, + sharedWorkerNodeDatabaseConnStatsHashSize, + &sharedWorkerNodeDatabaseConnStatsHashInfo, + (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE)); LWLockRelease(AddinShmemInitLock); - Assert(SharedConnStatsHash != NULL); + Assert(SharedWorkerNodeConnStatsHash != NULL); + Assert(SharedWorkerNodeDatabaseConnStatsHash != NULL); Assert(ConnectionStatsSharedState->sharedConnectionHashTrancheId != 0); if (prev_shmem_startup_hook != NULL) @@ -797,7 +866,7 @@ ShouldWaitForConnection(int currentConnectionCount) static uint32 SharedConnectionHashHash(const void *key, Size keysize) { - SharedConnStatsHashKey *entry = (SharedConnStatsHashKey *) key; + SharedWorkerNodeConnStatsHashKey *entry = (SharedWorkerNodeConnStatsHashKey *) key; uint32 hash = string_hash(entry->hostname, NAMEDATALEN); hash = hash_combine(hash, hash_uint32(entry->port)); @@ -805,20 +874,35 @@ SharedConnectionHashHash(const void *key, Size keysize) return hash; } +static uint32 +SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize) +{ + SharedWorkerNodeDatabaseConnStatsHashKey *entry = (SharedWorkerNodeDatabaseConnStatsHashKey *) key; + uint32 hash = string_hash(entry->workerNodeKey.hostname, NAMEDATALEN); + hash = hash_combine(hash, hash_uint32(entry->workerNodeKey.port)); + hash = hash_combine(hash, hash_uint32(entry->database)); + + return hash; +} + static int SharedConnectionHashCompare(const void *a, const void *b, Size keysize) { - SharedConnStatsHashKey *ca = (SharedConnStatsHashKey *) a; - SharedConnStatsHashKey *cb = (SharedConnStatsHashKey *) b; + SharedWorkerNodeConnStatsHashKey *ca = (SharedWorkerNodeConnStatsHashKey *) a; + SharedWorkerNodeConnStatsHashKey *cb = (SharedWorkerNodeConnStatsHashKey *) b; - if (strncmp(ca->hostname, cb->hostname, MAX_NODE_LENGTH) != 0 || - ca->port != cb->port) - { - return 1; - } - else - { - return 0; - } + return strncmp(ca->hostname, cb->hostname, MAX_NODE_LENGTH) != 0 || + ca->port != cb->port; +} + +static int +SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize) +{ + SharedWorkerNodeDatabaseConnStatsHashKey *ca = (SharedWorkerNodeDatabaseConnStatsHashKey *) a; + SharedWorkerNodeDatabaseConnStatsHashKey *cb = (SharedWorkerNodeDatabaseConnStatsHashKey *) b; + + return strncmp(ca->workerNodeKey.hostname, cb->workerNodeKey.hostname, MAX_NODE_LENGTH) != 0 || + ca->workerNodeKey.port != cb->workerNodeKey.port || + ca->database != cb->database; } diff --git a/src/backend/distributed/operations/worker_node_manager.c b/src/backend/distributed/operations/worker_node_manager.c index 
ba622e4d7..594e9f23f 100644
--- a/src/backend/distributed/operations/worker_node_manager.c
+++ b/src/backend/distributed/operations/worker_node_manager.c
@@ -39,6 +39,7 @@
 /* Config variables managed via guc.c */
 char *WorkerListFileName;
 int MaxWorkerNodesTracked = 2048;    /* determines worker node hash table size */
+int DatabasesPerWorker = 10;    /* determines database-per-worker hash table size */
 
 
 /* Local functions forward declarations */
diff --git a/src/include/distributed/connection_management.h b/src/include/distributed/connection_management.h
index d93e4483a..4f6c3d82f 100644
--- a/src/include/distributed/connection_management.h
+++ b/src/include/distributed/connection_management.h
@@ -123,7 +123,9 @@ enum MultiConnectionMode
 	 *
 	 * This is need to run 'CREATE_REPLICATION_SLOT' command.
 	 */
-	REQUIRE_REPLICATION_CONNECTION_PARAM = 1 << 8
+	REQUIRE_REPLICATION_CONNECTION_PARAM = 1 << 8,
+
+	REQUIRE_MAINTENANCE_CONNECTION = 1 << 9
 };
 
 
diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h
index 007691e16..b23dee905 100644
--- a/src/include/distributed/shared_connection_stats.h
+++ b/src/include/distributed/shared_connection_stats.h
@@ -16,8 +16,13 @@
 #define DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES -1
 #define ALLOW_ALL_EXTERNAL_CONNECTIONS -1
 
+enum SharedPoolCounterMode
+{
+	MAINTENANCE_CONNECTION_POOL = 1 << 0
+};
 
 extern int MaxSharedPoolSize;
+extern float SharedPoolSizeMaintenancePercent;
 extern int LocalSharedPoolSize;
 extern int MaxClientConnections;
 
@@ -30,10 +35,10 @@ extern void SharedConnectionStatsShmemInit(void);
 extern int GetMaxClientConnections(void);
 extern int GetMaxSharedPoolSize(void);
 extern int GetLocalSharedPoolSize(void);
-extern bool TryToIncrementSharedConnectionCounter(const char *hostname, int port);
-extern void WaitLoopForSharedConnection(const char *hostname, int port);
+extern bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port);
+extern void WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port);
 extern void DecrementSharedConnectionCounter(const char *hostname, int port);
-extern void IncrementSharedConnectionCounter(const char *hostname, int port);
+extern void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port);
 extern int AdaptiveConnectionManagementFlag(bool connectToLocalNode, int
 											activeConnectionCount);
 
diff --git a/src/include/distributed/worker_manager.h b/src/include/distributed/worker_manager.h
index 02a43fe0b..aad2f157c 100644
--- a/src/include/distributed/worker_manager.h
+++ b/src/include/distributed/worker_manager.h
@@ -59,6 +59,7 @@ typedef struct WorkerNode
 
 /* Config variables managed via guc.c */
 extern int MaxWorkerNodesTracked;
+extern int DatabasesPerWorker;
 extern char *WorkerListFileName;
 extern char *CurrentCluster;
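Patch 02 above replaces the single (hostname, port, database)-keyed counter with two tables: a per-worker counter that connection throttling is enforced against, and a per-(worker, database) counter kept only for citus_remote_connection_stats(). The sketch below is a simplified, dependency-free illustration of that two-level accounting; the struct and function names are illustrative, and plain counters stand in for the shared-memory HTABs and locking.

#include <stdbool.h>
#include <stdio.h>

/* stands in for SharedWorkerNodeConnStatsHashEntry: the throttling counter */
typedef struct WorkerCounter { int count; } WorkerCounter;

/* stands in for SharedWorkerNodeDatabaseConnStatsHashEntry: statistics only */
typedef struct WorkerDatabaseCounter { int count; } WorkerDatabaseCounter;

/*
 * Mirrors the shape of IncrementSharedConnectionCounterInternal(): the limit
 * is checked against the per-worker counter alone, while the per-database
 * counter is bumped alongside it purely for the statistics view.
 */
static bool
TryToIncrement(WorkerCounter *worker, WorkerDatabaseCounter *workerDb, int maxSharedPoolSize)
{
	if (worker->count + 1 > maxSharedPoolSize)
	{
		return false;           /* no slot: neither counter moves */
	}

	worker->count += 1;         /* throttling state */
	workerDb->count += 1;       /* statistics state */
	return true;
}

int
main(void)
{
	WorkerCounter worker = { 0 };
	WorkerDatabaseCounter db1 = { 0 };
	WorkerDatabaseCounter db2 = { 0 };

	/* two databases share one worker-level pool of size 2 */
	printf("%d", TryToIncrement(&worker, &db1, 2));   /* 1: slot granted */
	printf("%d", TryToIncrement(&worker, &db2, 2));   /* 1: slot granted */
	printf("%d\n", TryToIncrement(&worker, &db1, 2)); /* 0: worker pool exhausted */
	return 0;
}

The point of the split is visible in the last call: the second request for db1 is refused not because of that database's own usage but because the worker-level pool is shared across all databases.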
From d44b1dd99a38f7ec081e9c832652771a2924a67f Mon Sep 17 00:00:00 2001
From: ivyazmitinov
Date: Fri, 13 Oct 2023 21:52:08 +0700
Subject: [PATCH 03/41] Small fixes

---
 .../connection/shared_connection_stats.c      | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c
index 40d74ca11..577dd86d3 100644
--- a/src/backend/distributed/connection/shared_connection_stats.c
+++ b/src/backend/distributed/connection/shared_connection_stats.c
@@ -416,7 +416,7 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags,
 		workerNodeConnectionEntry->count = 0;
 	}
 
-	/* Initialized SharedWorkerNodeDatabaseConnStatsHash the same way */
+	/* Initialize SharedWorkerNodeDatabaseConnStatsHash the same way */
 	SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey =
@@ -504,15 +504,6 @@ static bool isConnectionSlotAvailable(SharedWorkerNodeConnStatsHashKey *connKey,
  */
 void
 DecrementSharedConnectionCounter(const char *hostname, int port)
-{
-
-	DecrementSharedConnectionCounterInternal(hostname, port);
-	UnLockConnectionSharedMemory();
-	WakeupWaiterBackendsForSharedConnection();
-}
-
-static void
-DecrementSharedConnectionCounterInternal(const char *hostname, int port)
 {
 	/*
 	 * Do not call GetMaxSharedPoolSize() here, since it may read from
@@ -526,6 +517,17 @@ DecrementSharedConnectionCounter(const char *hostname, int port)
 
 	LockConnectionSharedMemory(LW_EXCLUSIVE);
 
+	DecrementSharedConnectionCounterInternal(hostname, port);
+
+	UnLockConnectionSharedMemory();
+	WakeupWaiterBackendsForSharedConnection();
+}
+
+static void
+DecrementSharedConnectionCounterInternal(const char *hostname, int port)
+{
+
+
 	bool workerNodeEntryFound = false;
 	SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port);
 	SharedWorkerNodeConnStatsHashEntry *workerNodeEntry =
From 0bb5876a8797de7cd1d709af595dc0aae72e86e4 Mon Sep 17 00:00:00 2001
From: ivyazmitinov
Date: Fri, 13 Oct 2023 21:52:31 +0700
Subject: [PATCH 04/41] Small fixes

---
 src/backend/distributed/connection/shared_connection_stats.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c
index 577dd86d3..359d76b5a 100644
--- a/src/backend/distributed/connection/shared_connection_stats.c
+++ b/src/backend/distributed/connection/shared_connection_stats.c
@@ -526,8 +526,6 @@ DecrementSharedConnectionCounter(const char *hostname, int port)
 static void
 DecrementSharedConnectionCounterInternal(const char *hostname, int port)
 {
-
-
 	bool workerNodeEntryFound = false;
 	SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port);
 	SharedWorkerNodeConnStatsHashEntry *workerNodeEntry =
From 4d775ab3616d3357ae07bb30c292f4d0a6ee1a70 Mon Sep 17 00:00:00 2001
From: ivyazmitinov
Date: Fri, 13 Oct 2023 22:37:20 +0700
Subject: [PATCH 05/41] Support for maintenance quota

---
 .../connection/connection_management.c        | 10 +-
 .../connection/shared_connection_stats.c      | 84 +++++++++++--------
 .../distributed/connection_management.h       |  5 ++
 .../distributed/shared_connection_stats.h     |  6 +-
 4 files changed, 63 insertions(+), 42 deletions(-)

diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c
index 96e750059..6066ef4bf 100644
--- a/src/backend/distributed/connection/connection_management.c
+++ b/src/backend/distributed/connection/connection_management.c
@@ -327,7 +327,7 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port,
 	 */
 	ConnectionHashEntry *entry = hash_search(ConnectionHash, &key, HASH_ENTER, &found);
-	if (!found || !entry->isValid)
+	if (!(found && entry->isValid))
 	{
 		/*
 		 * We are just building hash entry or previously it was left in an
@@ -377,11 +377,11 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port,
 	/* these two flags are by nature cannot happen at the same time */
Assert(!((flags & WAIT_FOR_CONNECTION) && (flags & OPTIONAL_CONNECTION))); + int sharedCounterFlags = (flags & REQUIRE_MAINTENANCE_CONNECTION) + ? MAINTENANCE_CONNECTION + : 0; if (flags & WAIT_FOR_CONNECTION) { - int sharedCounterFlags = (flags & REQUIRE_MAINTENANCE_CONNECTION) - ? MAINTENANCE_CONNECTION_POOL - : 0; WaitLoopForSharedConnection(sharedCounterFlags, hostname, port); } else if (flags & OPTIONAL_CONNECTION) @@ -392,7 +392,6 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, * cannot reserve the right to establish a connection, we prefer to * error out. */ - int sharedCounterFlags = 0; if (!TryToIncrementSharedConnectionCounter(sharedCounterFlags, hostname, port)) { /* do not track the connection anymore */ @@ -413,7 +412,6 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, * * Still, we keep track of the connection counter. */ - int sharedCounterFlags = 0; IncrementSharedConnectionCounter(sharedCounterFlags, hostname, port); } diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 359d76b5a..4ea94abdb 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -35,7 +35,7 @@ #include "distributed/worker_manager.h" #include "distributed/tuplestore.h" #include "distributed/worker_manager.h" - +#include "math.h" #define REMOTE_CONNECTION_STATS_COLUMNS 4 @@ -104,9 +104,11 @@ typedef struct SharedWorkerNodeDatabaseConnStatsHashEntry int MaxSharedPoolSize = 0; /* - * + * Controlled via a GUC, never access directly, use GetSharedPoolSizeMaintenanceQuota(). + * Percent of MaxSharedPoolSize reserved for maintenance operations. + * "0" effectively means that regular and maintenance connection will compete over the common pool */ -float SharedPoolSizeMaintenancePercent = 10.0f; +double SharedPoolSizeMaintenanceQuota = 0.1; /* * Controlled via a GUC, never access directly, use GetLocalSharedPoolSize(). 
@@ -140,7 +142,7 @@ static int SharedConnectionHashCompare(const void *a, const void *b, Size keysiz static uint32 SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize); static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize); static bool IsConnectionToLocalNode(SharedWorkerNodeConnStatsHashKey *connKey); -static bool isConnectionSlotAvailable(SharedWorkerNodeConnStatsHashKey *connKey, +static bool isConnectionSlotAvailable(uint32 flags, SharedWorkerNodeConnStatsHashKey *connKey, const SharedWorkerNodeConnStatsHashEntry *connectionEntry); static bool IncrementSharedConnectionCounterInternal(uint32 externalFlags, bool checkLimits, const char *hostname, int port, @@ -256,10 +258,10 @@ GetMaxSharedPoolSize(void) return MaxSharedPoolSize; } -float -GetSharedPoolSizeMaintenancePercent(void) +double +GetSharedPoolSizeMaintenanceQuota(void) { - return SharedPoolSizeMaintenancePercent; + return SharedPoolSizeMaintenanceQuota; } @@ -352,29 +354,6 @@ IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) IncrementSharedConnectionCounterInternal(flags, false, hostname, port, MyDatabaseId); } -static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int port) -{ - SharedWorkerNodeConnStatsHashKey key; - strlcpy(key.hostname, hostname, MAX_NODE_LENGTH); - if (strlen(hostname) > MAX_NODE_LENGTH) - { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("hostname exceeds the maximum length of %d", - MAX_NODE_LENGTH))); - } - key.port = port; - return key; -} - -static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const char *hostname, - int port, - Oid database) -{ - SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey; - workerNodeDatabaseKey.workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); - workerNodeDatabaseKey.database = database; - return workerNodeDatabaseKey; -} static bool IncrementSharedConnectionCounterInternal(uint32 externalFlags, @@ -437,10 +416,13 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, workerNodeDatabaseEntry->count = 0; } - /* Increment counter if possible */ + /* Increment counter if a slot available */ bool connectionSlotAvailable = true; - connectionSlotAvailable = !checkLimits || - isConnectionSlotAvailable(&workerNodeKey, workerNodeConnectionEntry); + connectionSlotAvailable = + !checkLimits || + isConnectionSlotAvailable(externalFlags, + &workerNodeKey, + workerNodeConnectionEntry); if (connectionSlotAvailable) { @@ -460,11 +442,19 @@ static bool IsConnectionToLocalNode(SharedWorkerNodeConnStatsHashKey *connKey) } -static bool isConnectionSlotAvailable(SharedWorkerNodeConnStatsHashKey *connKey, +static bool isConnectionSlotAvailable(uint32 flags, + SharedWorkerNodeConnStatsHashKey *connKey, const SharedWorkerNodeConnStatsHashEntry *connectionEntry) { bool connectionSlotAvailable = true; bool connectionToLocalNode = IsConnectionToLocalNode(connKey); + /* + * Use full capacity for maintenance connections, + */ + int maintenanceConnectionsQuota = + (flags & MAINTENANCE_CONNECTION) + ? 
0
+		  : (int) floor((double) GetMaxSharedPoolSize() * GetSharedPoolSizeMaintenanceQuota());
 	if (connectionToLocalNode)
 	{
 		bool remoteConnectionsForLocalQueriesDisabled =
@@ -489,7 +479,7 @@ static bool isConnectionSlotAvailable(SharedWorkerNodeConnStatsHashKey *connKey,
 			connectionSlotAvailable = false;
 		}
 	}
-	else if (connectionEntry->count + 1 > GetMaxSharedPoolSize())
+	else if (connectionEntry->count + 1 > (GetMaxSharedPoolSize() - maintenanceConnectionsQuota))
 	{
 		connectionSlotAvailable = false;
 	}
@@ -862,6 +852,30 @@ ShouldWaitForConnection(int currentConnectionCount)
 	return false;
 }
 
+static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int port)
+{
+	SharedWorkerNodeConnStatsHashKey key;
+	strlcpy(key.hostname, hostname, MAX_NODE_LENGTH);
+	if (strlen(hostname) > MAX_NODE_LENGTH)
+	{
+		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						errmsg("hostname exceeds the maximum length of %d",
+							   MAX_NODE_LENGTH)));
+	}
+	key.port = port;
+	return key;
+}
+
+static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const char *hostname,
+																				 int port,
+																				 Oid database)
+{
+	SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey;
+	workerNodeDatabaseKey.workerNodeKey = PrepareWorkerNodeHashKey(hostname, port);
+	workerNodeDatabaseKey.database = database;
+	return workerNodeDatabaseKey;
+}
+
 
 static uint32
 SharedConnectionHashHash(const void *key, Size keysize)
diff --git a/src/include/distributed/connection_management.h b/src/include/distributed/connection_management.h
index 4f6c3d82f..487b5f12f 100644
--- a/src/include/distributed/connection_management.h
+++ b/src/include/distributed/connection_management.h
@@ -125,6 +125,11 @@ enum MultiConnectionMode
 	 */
 	REQUIRE_REPLICATION_CONNECTION_PARAM = 1 << 8,
 
+	/*
+	 * This flag specifies that the connection is required for maintenance operations,
+	 * e.g. transaction recovery or distributed deadlock detection. Such connections
+	 * may receive special treatment, such as a dedicated share of the pool.
+	 */
 	REQUIRE_MAINTENANCE_CONNECTION = 1 << 9
 };
 
 
diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h
index b23dee905..33e1e31b4 100644
--- a/src/include/distributed/shared_connection_stats.h
+++ b/src/include/distributed/shared_connection_stats.h
@@ -18,7 +18,10 @@
 
 enum SharedPoolCounterMode
 {
-	MAINTENANCE_CONNECTION_POOL = 1 << 0
+	/*
+	 * Use this flag to reserve a connection from a maintenance quota
+	 */
+	MAINTENANCE_CONNECTION = 1 << 0
 };
 
 extern int MaxSharedPoolSize;
@@ -34,6 +37,7 @@ extern size_t SharedConnectionStatsShmemSize(void);
 extern void SharedConnectionStatsShmemInit(void);
 extern int GetMaxClientConnections(void);
 extern int GetMaxSharedPoolSize(void);
+extern double GetSharedPoolSizeMaintenanceQuota(void);
 extern int GetLocalSharedPoolSize(void);
 extern bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port);
 extern void WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port);
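Patch 05 above reserves a share of the pool for maintenance work: a regular request must leave floor(citus.max_shared_pool_size * quota) slots untouched, while a MAINTENANCE_CONNECTION request subtracts nothing and may use the full pool. The standalone sketch below reduces that arithmetic from isConnectionSlotAvailable(), omitting the local-node branch; SlotAvailable is an illustrative name, and the file builds with -lm.

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

#define MAINTENANCE_CONNECTION (1 << 0)

/* the remote-node branch of patch 05's slot check */
static bool
SlotAvailable(unsigned int flags, int currentCount, int maxSharedPoolSize,
			  double maintenanceQuota)
{
	/* maintenance connections bypass the reservation entirely */
	int reserved = (flags & MAINTENANCE_CONNECTION)
				   ? 0
				   : (int) floor(maxSharedPoolSize * maintenanceQuota);

	return currentCount + 1 <= maxSharedPoolSize - reserved;
}

int
main(void)
{
	/* a pool of 100 with a 10% maintenance reservation */
	printf("%d\n", SlotAvailable(0, 89, 100, 0.1));                      /* 1: 90th regular slot fits */
	printf("%d\n", SlotAvailable(0, 90, 100, 0.1));                      /* 0: regular cap is 90 */
	printf("%d\n", SlotAvailable(MAINTENANCE_CONNECTION, 99, 100, 0.1)); /* 1: maintenance sees all 100 */
	return 0;
}

Note that under this scheme both connection classes still share a single counter, so maintenance work is guaranteed headroom only as long as regular traffic respects its reduced cap; patch 06 below tightens this by splitting the counter itself.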
From 76d10cc4139424e9fba1227331a83a7ac3429a3e Mon Sep 17 00:00:00 2001
From: ivyazmitinov
Date: Tue, 17 Oct 2023 21:08:57 +0700
Subject: [PATCH 06/41] Implementation of a dedicated maintenance quota

---
 .../connection/connection_management.c        |  17 +-
 .../locally_reserved_shared_connections.c     |   3 +-
 .../connection/shared_connection_stats.c      | 179 +++++++++---------
 .../operations/worker_node_manager.c          |   2 +-
 .../transaction/transaction_recovery.c        |   2 +-
 .../distributed/connection_management.h       |   7 +-
 .../distributed/shared_connection_stats.h     |   2 +-
 .../expected/shared_connection_stats.out      |  18 +-
 .../regress/sql/shared_connection_stats.sql   |   2 +-
 9 files changed, 125 insertions(+), 107 deletions(-)

diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c
index 6066ef4bf..6e39c8589 100644
--- a/src/backend/distributed/connection/connection_management.c
+++ b/src/backend/distributed/connection/connection_management.c
@@ -61,8 +61,8 @@ static MultiConnection * FindAvailableConnection(dlist_head *connections, uint32
 static void ErrorIfMultipleMetadataConnectionExists(dlist_head *connections);
 static void FreeConnParamsHashEntryFields(ConnParamsHashEntry *entry);
 static void AfterXactHostConnectionHandling(ConnectionHashEntry *entry, bool isCommit);
-static bool ShouldShutdownConnection(MultiConnection *connection, const int
-									 cachedConnectionCount);
+static bool ShouldShutdownConnection(MultiConnection *connection,
+									 const int cachedConnectionCount);
 static bool RemoteTransactionIdle(MultiConnection *connection);
 static int EventSetSizeForConnectionList(List *connections);
 
@@ -427,10 +427,14 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port,
 
 	ResetShardPlacementAssociation(connection);
 
-	if ((flags & REQUIRE_METADATA_CONNECTION))
+	if (flags & REQUIRE_METADATA_CONNECTION)
 	{
 		connection->useForMetadataOperations = true;
-	}
+	}
+	else if (flags & REQUIRE_MAINTENANCE_CONNECTION)
+	{
+		connection->useForMaintenanceOperations = true;
+	}
 
 	/* fully initialized the connection, record it */
 	connection->initializationState = POOL_STATE_INITIALIZED;
@@ -1194,7 +1198,10 @@ CitusPQFinish(MultiConnection *connection)
 	/* behave idempotently, there is no gurantee that CitusPQFinish() is called once */
 	if (connection->initializationState >= POOL_STATE_COUNTER_INCREMENTED)
 	{
-		DecrementSharedConnectionCounter(connection->hostname, connection->port);
+		int sharedCounterFlags =
+			(connection->useForMaintenanceOperations) ? MAINTENANCE_CONNECTION : 0;
+		DecrementSharedConnectionCounter(sharedCounterFlags, connection->hostname, connection->port);
 		connection->initializationState = POOL_STATE_NOT_INITIALIZED;
 	}
 }
diff --git a/src/backend/distributed/connection/locally_reserved_shared_connections.c b/src/backend/distributed/connection/locally_reserved_shared_connections.c
index a2d2fac24..dc31ad080 100644
--- a/src/backend/distributed/connection/locally_reserved_shared_connections.c
+++ b/src/backend/distributed/connection/locally_reserved_shared_connections.c
@@ -240,7 +240,8 @@ DeallocateReservedConnections(void)
 		 * We have not used this reservation, make sure to clean-up from
 		 * the shared memory as well.
 		 */
-		DecrementSharedConnectionCounter(entry->key.hostname, entry->key.port);
+		int sharedCounterFlags = 0;
+		DecrementSharedConnectionCounter(sharedCounterFlags, entry->key.hostname, entry->key.port);
 
 		/* for completeness, set it to true */
 		entry->usedReservation = true;
diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c
index 4ea94abdb..55df40680 100644
--- a/src/backend/distributed/connection/shared_connection_stats.c
+++ b/src/backend/distributed/connection/shared_connection_stats.c
@@ -84,7 +84,8 @@ typedef struct SharedWorkerNodeConnStatsHashEntry
 {
 	SharedWorkerNodeConnStatsHashKey key;
 
-	int count;
+	int regularConnectionsCount;
+	int maintenanceConnectionsCount;
 } SharedWorkerNodeConnStatsHashEntry;
 
 /* hash entry for per database on worker stats */
@@ -141,9 +142,7 @@ static uint32 SharedConnectionHashHash(const void *key, Size keysize);
 static int SharedConnectionHashCompare(const void *a, const void *b, Size keysize);
 static uint32 SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize);
 static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize);
-static bool IsConnectionToLocalNode(SharedWorkerNodeConnStatsHashKey *connKey);
-static bool isConnectionSlotAvailable(uint32 flags, SharedWorkerNodeConnStatsHashKey *connKey,
-									  const SharedWorkerNodeConnStatsHashEntry *connectionEntry);
+static bool isConnectionThrottlingDisabled();
 static bool
 IncrementSharedConnectionCounterInternal(uint32 externalFlags, bool checkLimits, const char *hostname, int port,
 										 Oid database);
@@ -152,7 +151,7 @@ static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey
 																				 int port,
 																				 Oid database);
 static void
-DecrementSharedConnectionCounterInternal(const char *hostname, int port);
+DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostname, int port);
 
 PG_FUNCTION_INFO_V1(citus_remote_connection_stats);
 
@@ -316,11 +315,10 @@ WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port)
 bool
 TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port)
 {
-	if (GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING)
-	{
-		/* connection throttling disabled */
-		return true;
-	}
+	if (isConnectionThrottlingDisabled())
+	{
+		return true;
+	}
 
 	/*
 	 * The local session might already have some reserved connections to the given
 	 * node. In that case, we don't need to go through the shared memory.
 	 */
-	Oid userId = GetUserId();
-	if (CanUseReservedConnection(hostname, port, userId, MyDatabaseId))
-	{
-		MarkReservedConnectionUsed(hostname, port, userId, MyDatabaseId);
+	Oid userId = GetUserId();
+	if (CanUseReservedConnection(hostname, port, userId, MyDatabaseId))
+	{
+		MarkReservedConnectionUsed(hostname, port, userId, MyDatabaseId);
 
-		return true;
-	}
+		return true;
+	}
 
-	return IncrementSharedConnectionCounterInternal(flags, true, hostname, port, MyDatabaseId);
+	return IncrementSharedConnectionCounterInternal(flags,
+													true,
+													hostname,
+													port,
+													MyDatabaseId);
 }
 
 
@@ -345,13 +347,16 @@
TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int po void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - if (MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING) + if (isConnectionThrottlingDisabled()) { - /* connection throttling disabled */ return; } - IncrementSharedConnectionCounterInternal(flags, false, hostname, port, MyDatabaseId); + IncrementSharedConnectionCounterInternal(flags, + false, + hostname, + port, + MyDatabaseId); } @@ -392,7 +397,8 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, if (!workerNodeEntryFound) { /* we successfully allocated the entry for the first time, so initialize it */ - workerNodeConnectionEntry->count = 0; + workerNodeConnectionEntry->regularConnectionsCount = 0; + workerNodeConnectionEntry->maintenanceConnectionsCount = 0; } /* Initialize SharedWorkerNodeDatabaseConnStatsHash the same way */ @@ -418,15 +424,57 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, /* Increment counter if a slot available */ bool connectionSlotAvailable = true; - connectionSlotAvailable = - !checkLimits || - isConnectionSlotAvailable(externalFlags, - &workerNodeKey, - workerNodeConnectionEntry); + + /* When GetSharedPoolSizeMaintenanceQuota() == 0, treat maintenance connections as regular */ + bool maintenanceConnection = (GetSharedPoolSizeMaintenanceQuota() > 0 && (externalFlags & MAINTENANCE_CONNECTION)); + if (checkLimits) + { + WorkerNode *workerNode = FindWorkerNode(hostname, port); + bool connectionToLocalNode = workerNode && (workerNode->groupId == GetLocalGroupId()); + int currentConnectionsLimit = connectionToLocalNode + ? GetLocalSharedPoolSize() + : GetMaxSharedPoolSize(); + int maintenanceQuota = (int) ceil((double) currentConnectionsLimit * GetSharedPoolSizeMaintenanceQuota()); + /* Connections limit should never go below 1 */ + currentConnectionsLimit = Max(maintenanceConnection + ? maintenanceQuota + : currentConnectionsLimit - maintenanceQuota, 1); + int currentConnectionsCount = maintenanceConnection + ? workerNodeConnectionEntry->maintenanceConnectionsCount + : workerNodeConnectionEntry->regularConnectionsCount; + bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > currentConnectionsLimit; + /* + * For local nodes, solely relying on citus.max_shared_pool_size or + * max_connections might not be sufficient. The former gives us + * a preview of the future (e.g., we let the new connections to establish, + * but they are not established yet). The latter gives us the close to + * precise view of the past (e.g., the active number of client backends). + * + * Overall, we want to limit both of the metrics. The former limit typically + * kicks in under regular loads, where the load of the database increases in + * a reasonable pace. The latter limit typically kicks in when the database + * is issued lots of concurrent sessions at the same time, such as benchmarks. 
+ */ + bool localNodeLimitExceeded = + connectionToLocalNode && + (GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES || + GetExternalClientBackendCount() + 1 > currentConnectionsLimit); + if (remoteNodeLimitExceeded || localNodeLimitExceeded) + { + connectionSlotAvailable = false; + } + } if (connectionSlotAvailable) { - workerNodeConnectionEntry->count += 1; + if (maintenanceConnection) + { + workerNodeConnectionEntry->maintenanceConnectionsCount += 1; + } + else + { + workerNodeConnectionEntry->regularConnectionsCount += 1; + } workerNodeDatabaseEntry->count += 1; } @@ -435,86 +483,29 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, return connectionSlotAvailable; } -static bool IsConnectionToLocalNode(SharedWorkerNodeConnStatsHashKey *connKey) -{ - WorkerNode *workerNode = FindWorkerNode(connKey->hostname, connKey->port); - return workerNode && (workerNode->groupId == GetLocalGroupId()); -} - - -static bool isConnectionSlotAvailable(uint32 flags, - SharedWorkerNodeConnStatsHashKey *connKey, - const SharedWorkerNodeConnStatsHashEntry *connectionEntry) -{ - bool connectionSlotAvailable = true; - bool connectionToLocalNode = IsConnectionToLocalNode(connKey); - /* - * Use full capacity for maintenance connections, - */ - int maintenanceConnectionsQuota = - (flags & MAINTENANCE_CONNECTION) - ? 0 - : (int) floor((double) GetMaxSharedPoolSize() * GetSharedPoolSizeMaintenanceQuota()); - if (connectionToLocalNode) - { - bool remoteConnectionsForLocalQueriesDisabled = - GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES; - /* - * For local nodes, solely relying on citus.max_shared_pool_size or - * max_connections might not be sufficient. The former gives us - * a preview of the future (e.g., we let the new connections to establish, - * but they are not established yet). The latter gives us the close to - * precise view of the past (e.g., the active number of client backends). - * - * Overall, we want to limit both of the metrics. The former limit typically - * kicks in under regular loads, where the load of the database increases in - * a reasonable pace. The latter limit typically kicks in when the database - * is issued lots of concurrent sessions at the same time, such as benchmarks. - */ - bool localConnectionLimitExceeded = - GetExternalClientBackendCount() + 1 > GetLocalSharedPoolSize() || - connectionEntry->count + 1 > GetLocalSharedPoolSize(); - if (remoteConnectionsForLocalQueriesDisabled || localConnectionLimitExceeded) - { - connectionSlotAvailable = false; - } - } - else if (connectionEntry->count + 1 > (GetMaxSharedPoolSize() - maintenanceConnectionsQuota)) - { - connectionSlotAvailable = false; - } - return connectionSlotAvailable; -} - - /* * DecrementSharedConnectionCounter decrements the shared counter * for the given hostname and port for the given count. */ void -DecrementSharedConnectionCounter(const char *hostname, int port) +DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int port) { - /* - * Do not call GetMaxSharedPoolSize() here, since it may read from - * the catalog and we may be in the process exit handler. 
- */ - if (MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING) + if (isConnectionThrottlingDisabled()) { - /* connection throttling disabled */ return; } LockConnectionSharedMemory(LW_EXCLUSIVE); - DecrementSharedConnectionCounterInternal(hostname, port); + DecrementSharedConnectionCounterInternal(externalFlags, hostname, port); UnLockConnectionSharedMemory(); WakeupWaiterBackendsForSharedConnection(); } static void -DecrementSharedConnectionCounterInternal(const char *hostname, int port) +DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostname, int port) { bool workerNodeEntryFound = false; SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); @@ -530,10 +521,17 @@ DecrementSharedConnectionCounterInternal(const char *hostname, int port) } /* we should never go below 0 */ - Assert(workerNodeEntry->count > 0); + Assert(workerNodeEntry->regularConnectionsCount > 0 || workerNodeEntry->maintenanceConnectionsCount > 0); - - workerNodeEntry->count -= 1; + /* When GetSharedPoolSizeMaintenanceQuota() == 0, treat maintenance connections as regular */ + if ((GetSharedPoolSizeMaintenanceQuota() > 0 && (externalFlags & MAINTENANCE_CONNECTION))) + { + workerNodeEntry->maintenanceConnectionsCount -= 1; + } + else + { + workerNodeEntry->regularConnectionsCount -= 1; + } /* @@ -543,7 +541,7 @@ DecrementSharedConnectionCounterInternal(const char *hostname, int port) * not busy, and given the default value of MaxCachedConnectionsPerWorker = 1, * we're unlikely to trigger this often. */ - if (workerNodeEntry->count == 0) + if (workerNodeEntry->regularConnectionsCount == 0 && workerNodeEntry->maintenanceConnectionsCount == 0) { hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_REMOVE, NULL); } @@ -920,3 +918,12 @@ SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize) ca->workerNodeKey.port != cb->workerNodeKey.port || ca->database != cb->database; } + +static bool isConnectionThrottlingDisabled() +{ + /* + * Do not call Get*PoolSize() functions here, since it may read from + * the catalog and we may be in the process exit handler. 
+ */ + return MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING; +} diff --git a/src/backend/distributed/operations/worker_node_manager.c b/src/backend/distributed/operations/worker_node_manager.c index 594e9f23f..32eb28b94 100644 --- a/src/backend/distributed/operations/worker_node_manager.c +++ b/src/backend/distributed/operations/worker_node_manager.c @@ -39,7 +39,7 @@ /* Config variables managed via guc.c */ char *WorkerListFileName; int MaxWorkerNodesTracked = 2048; /* determines worker node hash table size */ -int DatabasesPerWorker = 10; /* determine database per worker hash table size */ +int DatabasesPerWorker = 1; /* determine database per worker hash table size */ /* Local functions forward declarations */ diff --git a/src/backend/distributed/transaction/transaction_recovery.c b/src/backend/distributed/transaction/transaction_recovery.c index 653b962db..bc960796d 100644 --- a/src/backend/distributed/transaction/transaction_recovery.c +++ b/src/backend/distributed/transaction/transaction_recovery.c @@ -159,7 +159,7 @@ RecoverWorkerTransactions(WorkerNode *workerNode) bool recoveryFailed = false; - int connectionFlags = 0; + int connectionFlags = WAIT_FOR_CONNECTION | REQUIRE_MAINTENANCE_CONNECTION; MultiConnection *connection = GetNodeConnection(connectionFlags, nodeName, nodePort); if (connection->pgConn == NULL || PQstatus(connection->pgConn) != CONNECTION_OK) { diff --git a/src/include/distributed/connection_management.h b/src/include/distributed/connection_management.h index 487b5f12f..91e1e9222 100644 --- a/src/include/distributed/connection_management.h +++ b/src/include/distributed/connection_management.h @@ -127,8 +127,8 @@ enum MultiConnectionMode /* * This flag specifies that connection is required for maintenance operations, e.g. - * transaction recovery, distributed deadlock detection. Such connections may have - * special treatment, like dedicated share of pool, etc. + * transaction recovery, distributed deadlock detection. Such connections have + * a reserved quota of the MaxSharedPoolSize. 
*/ REQUIRE_MAINTENANCE_CONNECTION = 1 << 9 }; @@ -230,6 +230,9 @@ typedef struct MultiConnection /* replication option */ bool requiresReplication; + /* See REQUIRE_MAINTENANCE_CONNECTION */ + bool useForMaintenanceOperations; + MultiConnectionStructInitializationState initializationState; } MultiConnection; diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h index 33e1e31b4..817f80b54 100644 --- a/src/include/distributed/shared_connection_stats.h +++ b/src/include/distributed/shared_connection_stats.h @@ -41,7 +41,7 @@ extern double GetSharedPoolSizeMaintenanceQuota(void); extern int GetLocalSharedPoolSize(void); extern bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port); extern void WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port); -extern void DecrementSharedConnectionCounter(const char *hostname, int port); +extern void DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int port); extern void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port); extern int AdaptiveConnectionManagementFlag(bool connectToLocalNode, int activeConnectionCount); diff --git a/src/test/regress/expected/shared_connection_stats.out b/src/test/regress/expected/shared_connection_stats.out index 0ce22548f..91d35db5d 100644 --- a/src/test/regress/expected/shared_connection_stats.out +++ b/src/test/regress/expected/shared_connection_stats.out @@ -224,7 +224,7 @@ BEGIN; COMMIT; -- pg_sleep forces almost 1 connection per placement -- now, some of the optional connections would be skipped, --- and only 5 connections are used per node +-- and only 4 connections (5 minus the maintenance quota) are used per node BEGIN; SET LOCAL citus.max_adaptive_executor_pool_size TO 16; with cte_1 as (select pg_sleep(0.1) is null, a from test) SELECT a from cte_1 ORDER By 1 LIMIT 1; @@ -244,8 +244,8 @@ BEGIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 5 - 5 + 4 + 4 (2 rows) COMMIT; @@ -382,8 +382,8 @@ COPY test FROM PROGRAM 'seq 32'; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 3 - 3 + 2 + 2 (2 rows) ROLLBACK; @@ -404,7 +404,7 @@ BEGIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 3 + 2 1 (2 rows) @@ -423,7 +423,7 @@ COPY test FROM STDIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 3 + 2 1 (2 rows) @@ -450,7 +450,7 @@ BEGIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 3 + 2 (1 row) -- in this second COPY, we access the same node but different shards @@ -468,7 +468,7 @@ COPY test FROM STDIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 3 + 2 1 (2 rows) diff --git a/src/test/regress/sql/shared_connection_stats.sql b/src/test/regress/sql/shared_connection_stats.sql index 7c653e788..7c040af5c 100644 --- a/src/test/regress/sql/shared_connection_stats.sql +++ b/src/test/regress/sql/shared_connection_stats.sql @@ -146,7 +146,7 @@ COMMIT; -- pg_sleep forces almost 1 connection per placement -- now, some of the optional connections would be skipped, --- and only 5 connections are used per node +-- and only 4 connections (5 minus the maintenance quota) are used per 
node BEGIN; SET LOCAL citus.max_adaptive_executor_pool_size TO 16; with cte_1 as (select pg_sleep(0.1) is null, a from test) SELECT a from cte_1 ORDER By 1 LIMIT 1; From 19681ca5920c8957f530cac0a34e0eadffb33631 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 18 Oct 2023 15:30:14 +0700 Subject: [PATCH 07/41] Introducing new backend type and disabling caching for daemon backends --- .../distributed/connection/connection_management.c | 13 +++++++++---- src/backend/distributed/transaction/backend_data.c | 14 +++++++++++++- src/backend/distributed/utils/maintenanced.c | 3 ++- src/include/distributed/backend_data.h | 1 + src/include/distributed/connection_management.h | 2 ++ 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index 6e39c8589..226ad366a 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -502,6 +502,12 @@ FindAvailableConnection(dlist_head *connections, uint32 flags) continue; } + if ((flags & REQUIRE_MAINTENANCE_CONNECTION) && + !connection->useForMaintenanceOperations) + { + continue; + } + if ((flags & REQUIRE_METADATA_CONNECTION) && !connection->useForMetadataOperations) { @@ -1498,7 +1504,7 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection * escalating the number of cached connections. We can recognize such backends * from their application name. */ - return (IsCitusInternalBackend() || IsRebalancerInternalBackend()) || + return (isCitusMaintenanceDaemonBackend() || IsCitusInternalBackend() || IsRebalancerInternalBackend()) || connection->initializationState != POOL_STATE_INITIALIZED || cachedConnectionCount >= MaxCachedConnectionsPerWorker || connection->forceCloseAtTransactionEnd || @@ -1506,9 +1512,8 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection !RemoteTransactionIdle(connection) || connection->requiresReplication || connection->isReplicationOriginSessionSetup || - (MaxCachedConnectionLifetime >= 0 && - MillisecondsToTimeout(connection->connectionEstablishmentStart, - MaxCachedConnectionLifetime) <= 0); + (MaxCachedConnectionLifetime >= 0 + && MillisecondsToTimeout(connection->connectionEstablishmentStart, MaxCachedConnectionLifetime) <= 0); } diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 5f868f548..6db15e794 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -86,6 +86,7 @@ typedef struct BackendManagementShmemData typedef enum CitusBackendType { CITUS_BACKEND_NOT_ASSIGNED, + CITUS_MAINTENANCE_DAEMON_BACKEND, CITUS_INTERNAL_BACKEND, CITUS_REBALANCER_BACKEND, CITUS_RUN_COMMAND_BACKEND, @@ -96,6 +97,7 @@ static const char *CitusBackendPrefixes[] = { CITUS_APPLICATION_NAME_PREFIX, CITUS_REBALANCER_APPLICATION_NAME_PREFIX, CITUS_RUN_COMMAND_APPLICATION_NAME_PREFIX, + CITUS_MAINTENANCE_DAEMON_APPLICATION_NAME_PREFIX, }; static const CitusBackendType CitusBackendTypes[] = { @@ -1071,7 +1073,6 @@ citus_pid_for_gpid(PG_FUNCTION_ARGS) PG_RETURN_INT32(ExtractProcessIdFromGlobalPID(globalPID)); } - /* * ExtractGlobalPID extracts the global process id from the application name and returns it * if the application name is not compatible with Citus' application names returns 0. 
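The backend typing that this patch extends works by comparing application_name against a table of known Citus prefixes, keeping the CitusBackendTypes array parallel to CitusBackendPrefixes. The following standalone sketch illustrates that prefix-matching idea; ClassifyBackend and the two local arrays are illustrative stand-ins, not the actual Citus implementation:

    #include <stdio.h>
    #include <string.h>

    typedef enum BackendType
    {
        BACKEND_EXTERNAL_CLIENT,
        BACKEND_MAINTENANCE_DAEMON
    } BackendType;

    /* parallel arrays: prefixes[i] identifies types[i] */
    static const char *prefixes[] = { "Citus Maintenance Daemon" };
    static const BackendType types[] = { BACKEND_MAINTENANCE_DAEMON };

    static BackendType
    ClassifyBackend(const char *applicationName)
    {
        for (size_t i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++)
        {
            /* a backend matches if its application_name starts with the prefix */
            if (strncmp(applicationName, prefixes[i], strlen(prefixes[i])) == 0)
            {
                return types[i];
            }
        }
        return BACKEND_EXTERNAL_CLIENT;
    }

    int main(void)
    {
        printf("%d\n", ClassifyBackend("Citus Maintenance Daemon")); /* 1 */
        printf("%d\n", ClassifyBackend("psql"));                     /* 0 */
        return 0;
    }

As the ShouldShutdownConnection change above shows, any backend classified this way gives up connection caching entirely.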
@@ -1445,6 +1446,17 @@ IsCitusShardTransferBackend(void) prefixLength) == 0; } +bool isCitusMaintenanceDaemonBackend(void) +{ + if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) + { + DetermineCitusBackendType(application_name); + } + + return CurrentBackendType == CITUS_MAINTENANCE_DAEMON_BACKEND; +} + + /* * DetermineCitusBackendType determines the type of backend based on the application_name. diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 9cef13539..23667a062 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -60,6 +60,7 @@ #include "distributed/statistics_collection.h" #include "distributed/transaction_recovery.h" #include "distributed/version_compat.h" +#include "distributed/connection_management.h" /* * Shared memory data for all maintenance workers. @@ -506,7 +507,7 @@ CitusMaintenanceDaemonMain(Datum main_arg) MaintenanceDaemonDBData *myDbData = ConnectToDatabase(databaseOid); /* make worker recognizable in pg_stat_activity */ - pgstat_report_appname("Citus Maintenance Daemon"); + pgstat_report_appname(CITUS_MAINTENANCE_DAEMON_APPLICATION_NAME_PREFIX); /* * Terminate orphaned metadata sync daemons spawned from previously terminated diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h index 8014fe5a6..1e244a52e 100644 --- a/src/include/distributed/backend_data.h +++ b/src/include/distributed/backend_data.h @@ -78,6 +78,7 @@ extern bool IsRebalancerInternalBackend(void); extern bool IsCitusRunCommandBackend(void); extern bool IsExternalClientBackend(void); extern bool IsCitusShardTransferBackend(void); +extern bool isCitusMaintenanceDaemonBackend(void); #define INVALID_CITUS_INTERNAL_BACKEND_GPID 0 #define GLOBAL_PID_NODE_ID_FOR_NODES_NOT_IN_METADATA 99999999 diff --git a/src/include/distributed/connection_management.h b/src/include/distributed/connection_management.h index 91e1e9222..089d477d5 100644 --- a/src/include/distributed/connection_management.h +++ b/src/include/distributed/connection_management.h @@ -44,6 +44,8 @@ /* application name used for connections made by run_command_on_* */ #define CITUS_RUN_COMMAND_APPLICATION_NAME_PREFIX "citus_run_command gpid=" +#define CITUS_MAINTENANCE_DAEMON_APPLICATION_NAME_PREFIX "Citus Maintenance Daemon" + /* * application name prefix for move/split replication connections. 
* From 5880ecc6802cc6c901ed308b33abc4d64a8bee91 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 18 Oct 2023 16:18:11 +0700 Subject: [PATCH 08/41] Introduce new GUC --- .../connection/shared_connection_stats.c | 4 +-- .../operations/worker_node_manager.c | 2 +- src/backend/distributed/shared_library_init.c | 28 +++++++++++++++++++ .../distributed/shared_connection_stats.h | 2 +- src/include/distributed/worker_manager.h | 2 +- 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 55df40680..249dfbd4e 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -672,7 +672,7 @@ SharedConnectionStatsShmemSize(void) size = add_size(size, workerNodeConnHashSize); - Size workerNodeDatabaseConnSize = hash_estimate_size(DatabasesPerWorker, + Size workerNodeDatabaseConnSize = hash_estimate_size(MaxWorkerNodesTracked * MaxDatabasesPerWorkerNodesTracked, sizeof(SharedWorkerNodeDatabaseConnStatsHashEntry)); size = add_size(size, workerNodeDatabaseConnSize); @@ -741,7 +741,7 @@ SharedConnectionStatsShmemInit(void) sharedWorkerNodeDatabaseConnStatsHashInfo.hash = SharedWorkerNodeDatabaseHashHash; sharedWorkerNodeDatabaseConnStatsHashInfo.match = SharedWorkerNodeDatabaseHashCompare; - int sharedWorkerNodeDatabaseConnStatsHashSize = MaxWorkerNodesTracked * DatabasesPerWorker; + int sharedWorkerNodeDatabaseConnStatsHashSize = MaxWorkerNodesTracked * MaxDatabasesPerWorkerNodesTracked; SharedWorkerNodeDatabaseConnStatsHash = ShmemInitHash("Shared Conn Per Database. Stats Hash", sharedWorkerNodeDatabaseConnStatsHashSize, diff --git a/src/backend/distributed/operations/worker_node_manager.c b/src/backend/distributed/operations/worker_node_manager.c index 32eb28b94..53109e324 100644 --- a/src/backend/distributed/operations/worker_node_manager.c +++ b/src/backend/distributed/operations/worker_node_manager.c @@ -39,7 +39,7 @@ /* Config variables managed via guc.c */ char *WorkerListFileName; int MaxWorkerNodesTracked = 2048; /* determines worker node hash table size */ -int DatabasesPerWorker = 1; /* determine database per worker hash table size */ +int MaxDatabasesPerWorkerNodesTracked = 1; /* determine database per worker hash table size */ /* Local functions forward declarations */ diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 45e212e8b..97f78defd 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1993,6 +1993,20 @@ RegisterCitusConfigVariables(void) GUC_SUPERUSER_ONLY, NULL, NULL, MaxSharedPoolSizeGucShowHook); + DefineCustomRealVariable( + "citus.shared_pool_size_maintenance_quota", + gettext_noop("Sets the maximum number of connections allowed per worker node " + "across all the backends from this node. Setting to -1 disables " + "connections throttling. 
Setting to 0 makes it auto-adjust, meaning " + "equal to max_connections on the coordinator."), + gettext_noop("As a rule of thumb, the value should be at most equal to the " + "max_connections on the remote nodes."), + &SharedPoolSizeMaintenanceQuota, + 0.1, 0, 1, + PGC_SIGHUP, + GUC_SUPERUSER_ONLY, + NULL, NULL, MaxSharedPoolSizeGucShowHook); + DefineCustomIntVariable( "citus.max_worker_nodes_tracked", gettext_noop("Sets the maximum number of worker nodes that are tracked."), @@ -2012,6 +2026,20 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); + DefineCustomIntVariable( + "citus.max_databases_per_worker_tracked", + gettext_noop("Sets the amount of databases per worker tracked."), + gettext_noop( + "This configuration value compliments the citus.max_worker_nodes_tracked." + "It should be used when there are more then one database with Citus in cluster," + "and, effectively, limits the size of the hash table with connections per worker + database." + "Currently, it does not affect the connection management logic and serves only statistical purposes."), + &MaxDatabasesPerWorkerNodesTracked, + 1, 1, INT_MAX, + PGC_POSTMASTER, + GUC_STANDARD, + NULL, NULL, NULL); + DefineCustomIntVariable( "citus.metadata_sync_interval", gettext_noop("Sets the time to wait between metadata syncs."), diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h index 817f80b54..5a0be6ea2 100644 --- a/src/include/distributed/shared_connection_stats.h +++ b/src/include/distributed/shared_connection_stats.h @@ -25,7 +25,7 @@ enum SharedPoolCounterMode }; extern int MaxSharedPoolSize; -extern float MaintenanceSharedPoolSizePercent; +extern double SharedPoolSizeMaintenanceQuota; extern int LocalSharedPoolSize; extern int MaxClientConnections; diff --git a/src/include/distributed/worker_manager.h b/src/include/distributed/worker_manager.h index aad2f157c..4e1287e07 100644 --- a/src/include/distributed/worker_manager.h +++ b/src/include/distributed/worker_manager.h @@ -59,7 +59,7 @@ typedef struct WorkerNode /* Config variables managed via guc.c */ extern int MaxWorkerNodesTracked; -extern int DatabasesPerWorker; +extern int MaxDatabasesPerWorkerNodesTracked; extern char *WorkerListFileName; extern char *CurrentCluster; From 9f26f744b26f8247e9f006572f8d4810ffa93bf2 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 19 Oct 2023 18:35:11 +0700 Subject: [PATCH 09/41] Implementation of a dedicated maintenance database --- .../connection/connection_management.c | 18 ++-- src/backend/distributed/shared_library_init.c | 10 +++ .../distributed/transaction/backend_data.c | 3 +- src/backend/distributed/utils/maintenanced.c | 86 ++++++++++++++----- src/include/distributed/backend_data.h | 2 +- src/include/distributed/maintenanced.h | 2 + 6 files changed, 91 insertions(+), 30 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index 226ad366a..608c60ba2 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -42,6 +42,12 @@ #include "distributed/time_constants.h" #include "distributed/version_compat.h" #include "distributed/worker_log_messages.h" +#include "mb/pg_wchar.h" +#include "pg_config.h" +#include "portability/instr_time.h" +#include "storage/ipc.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" int NodeConnectionTimeout = 30000; @@ -1504,14 +1510,16 @@ 
ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection * escalating the number of cached connections. We can recognize such backends * from their application name. */ - return (isCitusMaintenanceDaemonBackend() || IsCitusInternalBackend() || IsRebalancerInternalBackend()) || + return ((IsCitusMaintenanceDaemonBackend() && !IsMaintenanceManagementDatabase(MyDatabaseId)) || + IsCitusInternalBackend() || + IsRebalancerInternalBackend()) || connection->initializationState != POOL_STATE_INITIALIZED || cachedConnectionCount >= MaxCachedConnectionsPerWorker || - connection->forceCloseAtTransactionEnd || + connection->forceCloseAtTransactionEnd || PQstatus(connection->pgConn) != CONNECTION_OK || - !RemoteTransactionIdle(connection) || - connection->requiresReplication || - connection->isReplicationOriginSessionSetup || + !RemoteTransactionIdle(connection) || + connection->requiresReplication || + connection->isReplicationOriginSessionSetup || (MaxCachedConnectionLifetime >= 0 && MillisecondsToTimeout(connection->connectionEstablishmentStart, MaxCachedConnectionLifetime) <= 0); } diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 97f78defd..0bd0c6592 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -2706,6 +2706,16 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); + DefineCustomStringVariable( + "citus.maintenance_management_database", + gettext_noop("Database for cluster-wide maintenance operations across all databases"), + NULL, + &MaintenanceManagementDatabase, + "", + PGC_SIGHUP, + GUC_STANDARD, + NULL, NULL, NULL); + /* warn about config items in the citus namespace that are not registered above */ EmitWarningsOnPlaceholders("citus"); diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 6db15e794..16d381acb 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -1446,7 +1446,8 @@ IsCitusShardTransferBackend(void) prefixLength) == 0; } -bool isCitusMaintenanceDaemonBackend(void) +bool +IsCitusMaintenanceDaemonBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 23667a062..a4656c0c8 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -96,6 +96,7 @@ typedef struct MaintenanceDaemonDBData /* config variable for distributed deadlock detection timeout */ double DistributedDeadlockDetectionTimeoutFactor = 2.0; +char *MaintenanceManagementDatabase = ""; int Recover2PCInterval = 60000; int DeferShardDeleteInterval = 15000; int BackgroundTaskQueueCheckInterval = 5000; @@ -705,32 +706,39 @@ CitusMaintenanceDaemonMain(Datum main_arg) timeout = Min(timeout, Recover2PCInterval); } - /* the config value -1 disables the distributed deadlock detection */ - if (DistributedDeadlockDetectionTimeoutFactor != -1.0) + /* + * Execute only on the maintenance database, if it configured, otherwise run from every daemon. 
+ * The config value -1 disables the distributed deadlock detection + */ + if (DistributedDeadlockDetectionTimeoutFactor != -1.0) { - double deadlockTimeout = - DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout; + double deadlockTimeout = + DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout; - InvalidateMetadataSystemCache(); - StartTransactionCommand(); + InvalidateMetadataSystemCache(); + StartTransactionCommand(); - /* - * We skip the deadlock detection if citus extension - * is not accessible. - * - * Similarly, we skip to run the deadlock checks if - * there exists any version mismatch or the extension - * is not fully created yet. - */ - if (!LockCitusExtension()) - { - ereport(DEBUG1, (errmsg("could not lock the citus extension, " - "skipping deadlock detection"))); - } - else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) - { - foundDeadlock = CheckForDistributedDeadlocks(); - } + + if ((strcmp(GetMaintenanceManagementDatabase(), "") == 0 || IsMaintenanceManagementDatabase(databaseOid))) + { + /* + * We skip the deadlock detection if citus extension + * is not accessible. + * + * Similarly, we skip to run the deadlock checks if + * there exists any version mismatch or the extension + * is not fully created yet. + */ + if (!LockCitusExtension()) + { + ereport(DEBUG1, (errmsg("could not lock the citus extension, " + "skipping deadlock detection"))); + } + else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) + { + foundDeadlock = CheckForDistributedDeadlocks(); + } + } CommitTransactionCommand(); @@ -1228,3 +1236,35 @@ MetadataSyncTriggeredCheckAndReset(MaintenanceDaemonDBData *dbData) return metadataSyncTriggered; } + +char +*GetMaintenanceManagementDatabase(void) +{ + char *result = MaintenanceManagementDatabase; + /* If MaintenanceManagementDatabase is not set, all maintenance daemons are considered independent */ + if (strcmp(MaintenanceManagementDatabase, "") != 0) + { + Oid maintenanceDatabaseOid = get_database_oid(MaintenanceManagementDatabase, true); + if (!maintenanceDatabaseOid) + { + ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Database %s doesn't exists, please check the citus.maintenance_management_database parameter.", + MaintenanceManagementDatabase))); + result = ""; + } + } + return result; +} + +bool +IsMaintenanceManagementDatabase(Oid databaseOid) +{ + if (strcmp(GetMaintenanceManagementDatabase(), "") == 0) + { + /* If MaintenanceManagementDatabase is not set, all maintenance daemons are considered independent */ + return false; + } + Oid maintenanceDatabaseOid = get_database_oid(MaintenanceManagementDatabase, true); + return maintenanceDatabaseOid == databaseOid; +} + diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h index 1e244a52e..716c4024b 100644 --- a/src/include/distributed/backend_data.h +++ b/src/include/distributed/backend_data.h @@ -78,7 +78,7 @@ extern bool IsRebalancerInternalBackend(void); extern bool IsCitusRunCommandBackend(void); extern bool IsExternalClientBackend(void); extern bool IsCitusShardTransferBackend(void); -extern bool isCitusMaintenanceDaemonBackend(void); +extern bool IsCitusMaintenanceDaemonBackend(void); #define INVALID_CITUS_INTERNAL_BACKEND_GPID 0 #define GLOBAL_PID_NODE_ID_FOR_NODES_NOT_IN_METADATA 99999999 diff --git a/src/include/distributed/maintenanced.h b/src/include/distributed/maintenanced.h index 07387a7fd..7e7a1d474 100644 --- a/src/include/distributed/maintenanced.h +++ 
b/src/include/distributed/maintenanced.h @@ -12,6 +12,8 @@ #ifndef MAINTENANCED_H #define MAINTENANCED_H +#include "commands/dbcommands.h" + /* collect statistics every 24 hours */ #define STATS_COLLECTION_TIMEOUT_MILLIS (24 * 60 * 60 * 1000) From c8ec1b603a3defdcaf6f3c905eac81342dc03556 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 19 Oct 2023 19:55:34 +0700 Subject: [PATCH 10/41] Style fixes --- src/backend/distributed/connection/connection_management.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index 608c60ba2..5996a128f 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -1520,8 +1520,8 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection !RemoteTransactionIdle(connection) || connection->requiresReplication || connection->isReplicationOriginSessionSetup || - (MaxCachedConnectionLifetime >= 0 - && MillisecondsToTimeout(connection->connectionEstablishmentStart, MaxCachedConnectionLifetime) <= 0); + (MaxCachedConnectionLifetime >= 0 && + MillisecondsToTimeout(connection->connectionEstablishmentStart, MaxCachedConnectionLifetime) <= 0); } From 481aa99205782f37fcad103c728ce975e49a242f Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 27 Oct 2023 21:59:09 +0700 Subject: [PATCH 11/41] Changes: - Changed the way to disable connection caching for maintenance daemons - Implemented a test for basic use cases - Brushed up before MR --- .../connection/connection_management.c | 16 +- .../connection/shared_connection_stats.c | 50 +- src/backend/distributed/shared_library_init.c | 16 +- .../distributed/transaction/backend_data.c | 14 - .../distributed/transaction/lock_graph.c | 2 +- src/backend/distributed/utils/maintenanced.c | 7 +- src/include/distributed/backend_data.h | 1 - .../distributed/connection_management.h | 2 - .../multi_maintenance_multiple_databases.out | 502 ++++++++++++++++++ src/test/regress/multi_1_schedule | 1 + src/test/regress/pg_regress_multi.pl | 2 +- .../multi_maintenance_multiple_databases.sql | 423 +++++++++++++++ 12 files changed, 994 insertions(+), 42 deletions(-) create mode 100644 src/test/regress/expected/multi_maintenance_multiple_databases.out create mode 100644 src/test/regress/sql/multi_maintenance_multiple_databases.sql diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index 5996a128f..0ed71d198 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -360,6 +360,13 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, MultiConnection *connection = FindAvailableConnection(entry->connections, flags); if (connection) { + if ((flags & REQUIRE_MAINTENANCE_CONNECTION) && + IsMaintenanceDaemon && + !IsMaintenanceManagementDatabase(MyDatabaseId)) + { + // Maintenance database may have changed, so cached connection should be closed + connection->forceCloseAtTransactionEnd = true; + } return connection; } } @@ -432,7 +439,6 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, ResetShardPlacementAssociation(connection); - if (flags & REQUIRE_METADATA_CONNECTION) { connection->useForMetadataOperations = true; @@ -440,6 +446,10 @@ StartNodeUserDatabaseConnection(uint32 flags, const 
char *hostname, int32 port, else if (flags & REQUIRE_MAINTENANCE_CONNECTION) { connection->useForMaintenanceOperations = true; + if (IsMaintenanceDaemon && !IsMaintenanceManagementDatabase(MyDatabaseId)) + { + connection->forceCloseAtTransactionEnd = true; + } } /* fully initialized the connection, record it */ @@ -1510,9 +1520,7 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection * escalating the number of cached connections. We can recognize such backends * from their application name. */ - return ((IsCitusMaintenanceDaemonBackend() && !IsMaintenanceManagementDatabase(MyDatabaseId)) || - IsCitusInternalBackend() || - IsRebalancerInternalBackend()) || + return (IsCitusInternalBackend() || IsRebalancerInternalBackend()) || connection->initializationState != POOL_STATE_INITIALIZED || cachedConnectionCount >= MaxCachedConnectionsPerWorker || connection->forceCloseAtTransactionEnd || diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 249dfbd4e..b8a84bd0d 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -151,7 +151,7 @@ static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey int port, Oid database); static void -DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostname, int port); +DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostname, int port, Oid database); PG_FUNCTION_INFO_V1(citus_remote_connection_stats); @@ -478,8 +478,24 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, workerNodeDatabaseEntry->count += 1; } + if (IsLoggableLevel(DEBUG4)) + { + ereport(DEBUG4, errmsg( + "Incrementing connection counter. " + "Current regular connections: %i, maintenance connections: %i. " + "Connection slot to %s:%i database %i is %s", + workerNodeConnectionEntry->regularConnectionsCount, + workerNodeConnectionEntry->maintenanceConnectionsCount, + hostname, + port, + database, + connectionSlotAvailable ? 
"available" : "not available" + )); + } + UnLockConnectionSharedMemory(); + return connectionSlotAvailable; } @@ -498,18 +514,21 @@ DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int LockConnectionSharedMemory(LW_EXCLUSIVE); - DecrementSharedConnectionCounterInternal(externalFlags, hostname, port); + DecrementSharedConnectionCounterInternal(externalFlags, hostname, port, MyDatabaseId); UnLockConnectionSharedMemory(); WakeupWaiterBackendsForSharedConnection(); } static void -DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostname, int port) +DecrementSharedConnectionCounterInternal(uint32 externalFlags, + const char *hostname, + int port, + Oid database) { bool workerNodeEntryFound = false; SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); - SharedWorkerNodeConnStatsHashEntry *workerNodeEntry = + SharedWorkerNodeConnStatsHashEntry *workerNodeConnectionEntry = hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_FIND, &workerNodeEntryFound); /* this worker node is removed or updated, no need to care */ @@ -521,18 +540,32 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostn } /* we should never go below 0 */ - Assert(workerNodeEntry->regularConnectionsCount > 0 || workerNodeEntry->maintenanceConnectionsCount > 0); + Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 || + workerNodeConnectionEntry->maintenanceConnectionsCount > 0); /* When GetSharedPoolSizeMaintenanceQuota() == 0, treat maintenance connections as regular */ if ((GetSharedPoolSizeMaintenanceQuota() > 0 && (externalFlags & MAINTENANCE_CONNECTION))) { - workerNodeEntry->maintenanceConnectionsCount -= 1; + workerNodeConnectionEntry->maintenanceConnectionsCount -= 1; } else { - workerNodeEntry->regularConnectionsCount -= 1; + workerNodeConnectionEntry->regularConnectionsCount -= 1; } + if (IsLoggableLevel(DEBUG4)) + { + ereport(DEBUG4, errmsg( + "Decrementing connection counter. " + "Current regular connections: %i, maintenance connections: %i. " + "Connection slot to %s:%i database %i is released", + workerNodeConnectionEntry->regularConnectionsCount, + workerNodeConnectionEntry->maintenanceConnectionsCount, + hostname, + port, + database + )); + } /* * We don't have to remove at this point as the node might be still active @@ -541,7 +574,8 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostn * not busy, and given the default value of MaxCachedConnectionsPerWorker = 1, * we're unlikely to trigger this often. */ - if (workerNodeEntry->regularConnectionsCount == 0 && workerNodeEntry->maintenanceConnectionsCount == 0) + if (workerNodeConnectionEntry->regularConnectionsCount == 0 && + workerNodeConnectionEntry->maintenanceConnectionsCount == 0) { hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_REMOVE, NULL); } diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 0bd0c6592..186ac886d 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1995,12 +1995,11 @@ RegisterCitusConfigVariables(void) DefineCustomRealVariable( "citus.shared_pool_size_maintenance_quota", - gettext_noop("Sets the maximum number of connections allowed per worker node " - "across all the backends from this node. Setting to -1 disables " - "connections throttling. 
Setting to 0 makes it auto-adjust, meaning " - "equal to max_connections on the coordinator."), - gettext_noop("As a rule of thumb, the value should be at most equal to the " - "max_connections on the remote nodes."), + gettext_noop("Sets the fraction of citus.max_shared_pool_size reserved " + "for maintenance operations only. " + "Setting it to 0 disables the quota. " + "This way the maintenance and regular connections will share the same pool"), + NULL, &SharedPoolSizeMaintenanceQuota, 0.1, 0, 1, PGC_SIGHUP, @@ -2030,7 +2029,7 @@ RegisterCitusConfigVariables(void) "citus.max_databases_per_worker_tracked", gettext_noop("Sets the amount of databases per worker tracked."), gettext_noop( - "This configuration value compliments the citus.max_worker_nodes_tracked." + "This configuration value complements the citus.max_worker_nodes_tracked." "It should be used when there are more then one database with Citus in cluster," "and, effectively, limits the size of the hash table with connections per worker + database." "Currently, it does not affect the connection management logic and serves only statistical purposes."), @@ -2709,7 +2708,8 @@ RegisterCitusConfigVariables(void) DefineCustomStringVariable( "citus.maintenance_management_database", gettext_noop("Database for cluster-wide maintenance operations across all databases"), - NULL, + gettext_noop("It should be enabled when there are more than " + "one database with Citus in a cluster."), &MaintenanceManagementDatabase, "", PGC_SIGHUP, diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 16d381acb..829f19850 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -86,7 +86,6 @@ typedef struct BackendManagementShmemData typedef enum CitusBackendType { CITUS_BACKEND_NOT_ASSIGNED, - CITUS_MAINTENANCE_DAEMON_BACKEND, CITUS_INTERNAL_BACKEND, CITUS_REBALANCER_BACKEND, CITUS_RUN_COMMAND_BACKEND, @@ -97,7 +96,6 @@ static const char *CitusBackendPrefixes[] = { CITUS_APPLICATION_NAME_PREFIX, CITUS_REBALANCER_APPLICATION_NAME_PREFIX, CITUS_RUN_COMMAND_APPLICATION_NAME_PREFIX, - CITUS_MAINTENANCE_DAEMON_APPLICATION_NAME_PREFIX, }; static const CitusBackendType CitusBackendTypes[] = { @@ -1446,18 +1444,6 @@ IsCitusShardTransferBackend(void) prefixLength) == 0; } -bool -IsCitusMaintenanceDaemonBackend(void) -{ - if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) - { - DetermineCitusBackendType(application_name); - } - - return CurrentBackendType == CITUS_MAINTENANCE_DAEMON_BACKEND; -} - - /* * DetermineCitusBackendType determines the type of backend based on the application_name. 
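Before the test changes below, it helps to pin down the arithmetic behind the quota this patch series documents: IncrementSharedConnectionCounterInternal rounds the maintenance share up with ceil() and clamps both shares to at least one slot. A self-contained sketch follows; EffectiveLimit and MaxInt are hypothetical helper names, and poolSize stands for whichever limit applies (citus.max_shared_pool_size, or the local pool size for local nodes):

    #include <math.h>
    #include <stdio.h>

    /* equivalent of PostgreSQL's Max() macro for ints */
    static int MaxInt(int a, int b) { return a > b ? a : b; }

    /*
     * Returns the connection limit for one class of connections after the
     * pool is split: the maintenance share is ceil(poolSize * quota), the
     * regular share is the remainder, and neither may drop below 1.
     */
    static int
    EffectiveLimit(int poolSize, double maintenanceQuota, int isMaintenanceConnection)
    {
        int maintenanceShare = (int) ceil(poolSize * maintenanceQuota);
        return MaxInt(isMaintenanceConnection
                      ? maintenanceShare
                      : poolSize - maintenanceShare, 1);
    }

    int main(void)
    {
        /* with citus.max_shared_pool_size = 5 and the default quota of 0.1 */
        printf("maintenance: %d\n", EffectiveLimit(5, 0.1, 1)); /* 1 */
        printf("regular: %d\n", EffectiveLimit(5, 0.1, 0));     /* 4 */
        return 0;
    }

The second result is exactly why the shared_connection_stats expected output in patch 06 drops from 5 to 4 connections per node.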
diff --git a/src/backend/distributed/transaction/lock_graph.c b/src/backend/distributed/transaction/lock_graph.c
index b55a72843..a1c524ae1 100644
--- a/src/backend/distributed/transaction/lock_graph.c
+++ b/src/backend/distributed/transaction/lock_graph.c
@@ -153,7 +153,7 @@ BuildGlobalWaitGraph(bool onlyDistributedTx)
 {
 const char *nodeName = workerNode->workerName;
 int nodePort = workerNode->workerPort;
- int connectionFlags = 0;
+ int connectionFlags = WAIT_FOR_CONNECTION | REQUIRE_MAINTENANCE_CONNECTION;
 if (workerNode->groupId == localGroupId)
 {
diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c
index a4656c0c8..0ab30e529 100644
--- a/src/backend/distributed/utils/maintenanced.c
+++ b/src/backend/distributed/utils/maintenanced.c
@@ -120,7 +120,7 @@ static volatile sig_atomic_t got_SIGHUP = false;
 static volatile sig_atomic_t got_SIGTERM = false;
 /* set to true when becoming a maintenance daemon */
-static bool IsMaintenanceDaemon = false;
+bool IsMaintenanceDaemon = false;
 static void MaintenanceDaemonSigTermHandler(SIGNAL_ARGS);
 static void MaintenanceDaemonSigHupHandler(SIGNAL_ARGS);
@@ -508,7 +508,7 @@ CitusMaintenanceDaemonMain(Datum main_arg)
 MaintenanceDaemonDBData *myDbData = ConnectToDatabase(databaseOid);
 /* make worker recognizable in pg_stat_activity */
- pgstat_report_appname(CITUS_MAINTENANCE_DAEMON_APPLICATION_NAME_PREFIX);
+ pgstat_report_appname("Citus Maintenance Daemon");
 /*
 * Terminate orphaned metadata sync daemons spawned from previously terminated
@@ -1248,7 +1248,8 @@ char
 if (!maintenanceDatabaseOid)
 {
 ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("Database %s doesn't exists, please check the citus.maintenance_management_database parameter.",
+ errmsg("Database \"%s\" doesn't exist, please check the citus.maintenance_management_database parameter. "
+ "Applying a default value instead.",
 MaintenanceManagementDatabase)));
 result = "";
 }
diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h
index 716c4024b..8014fe5a6 100644
--- a/src/include/distributed/backend_data.h
+++ b/src/include/distributed/backend_data.h
@@ -78,7 +78,6 @@ extern bool IsRebalancerInternalBackend(void);
 extern bool IsCitusRunCommandBackend(void);
 extern bool IsExternalClientBackend(void);
 extern bool IsCitusShardTransferBackend(void);
-extern bool IsCitusMaintenanceDaemonBackend(void);
 #define INVALID_CITUS_INTERNAL_BACKEND_GPID 0
 #define GLOBAL_PID_NODE_ID_FOR_NODES_NOT_IN_METADATA 99999999
diff --git a/src/include/distributed/connection_management.h b/src/include/distributed/connection_management.h
index 089d477d5..91e1e9222 100644
--- a/src/include/distributed/connection_management.h
+++ b/src/include/distributed/connection_management.h
@@ -44,8 +44,6 @@
 /* application name used for connections made by run_command_on_* */
 #define CITUS_RUN_COMMAND_APPLICATION_NAME_PREFIX "citus_run_command gpid="
-#define CITUS_MAINTENANCE_DAEMON_APPLICATION_NAME_PREFIX "Citus Maintenance Daemon"
 /*
 * application name prefix for move/split replication connections.
 *
diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out
new file mode 100644
index 000000000..8b37d7e25
--- /dev/null
+++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out
@@ -0,0 +1,502 @@
+-- This test verifies the behavior of the maintenance daemon in a multi-database environment
+-- It checks three things:
+-- 1. 
Maintenance daemons should not cache connections, except the one for the citus.maintenance_management_database
+-- 2. 2PC transaction recovery should respect the citus.shared_pool_size_maintenance_quota
+-- 3. Distributed deadlock detection should run only on citus.maintenance_management_database.
+--
+-- To do that, it creates 100 databases and synthetically generates distributed transactions in various states there.
+SELECT $definition$
+    ALTER SYSTEM SET citus.recover_2pc_interval TO '-1';
+    ALTER SYSTEM SET citus.distributed_deadlock_detection_factor = '-1';
+    SELECT pg_reload_conf();
+    $definition$ AS turn_off_maintenance
+\gset
+SELECT $deinition$
+ALTER SYSTEM SET citus.recover_2pc_interval TO '5s';
+ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;
+ALTER SYSTEM SET citus.maintenance_management_database = 'regression';
+SELECT pg_reload_conf();
+$deinition$ AS turn_on_maintenance
+\gset
+SELECT $definition$
+    DO
+    $do$
+        DECLARE
+            index int;
+            db_name text;
+            current_port int;
+            db_create_statement text;
+        BEGIN
+            SELECT setting::int FROM pg_settings WHERE name = 'port'
+            INTO current_port;
+            FOR index IN 1..100
+                LOOP
+                    SELECT format('db%s', index)
+                    INTO db_name;
+
+                    SELECT format('CREATE DATABASE %I', db_name)
+                    INTO db_create_statement;
+
+                    PERFORM dblink(format('dbname=regression host=localhost port=%s user=postgres', current_port),
+                                   db_create_statement);
+                    PERFORM dblink(format('dbname=%s host=localhost port=%s user=postgres', db_name, current_port),
+                                   'CREATE EXTENSION citus;');
+                    IF (SELECT groupid = 0 FROM pg_dist_node WHERE nodeport = current_port) THEN
+                        PERFORM dblink_exec(format('dbname=%s host=localhost port=%s user=postgres', db_name, current_port),
+                                            format($add_workers$SELECT citus_add_node('localhost', %s);COMMIT;$add_workers$, nodeport))
+                        FROM pg_dist_node
+                        WHERE groupid != 0 AND isactive AND noderole = 'primary';
+                    END IF;
+                END LOOP;
+        END;
+    $do$;
+    $definition$ AS create_databases
+\gset
+-- Code reiles heavily on dblink for cross-db and cross-node queries
+CREATE EXTENSION IF NOT EXISTS dblink;
+-- Disable maintenance operations to prepare the environment
+:turn_off_maintenance
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+\c - - - :worker_1_port
+:turn_off_maintenance
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+\c - - - :worker_2_port
+:turn_off_maintenance
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+-- Create databases
+\c - - - :worker_1_port
+:create_databases
+SELECT count(*)
+FROM pg_database
+WHERE datname LIKE 'db%';
+ count
+---------------------------------------------------------------------
+ 100
+(1 row)
+
+\c - - - :worker_2_port
+:create_databases
+SELECT count(*)
+FROM pg_database
+WHERE datname LIKE 'db%';
+ count
+---------------------------------------------------------------------
+ 100
+(1 row)
+
+\c - - - :master_port
+:create_databases
+SELECT count(*)
+FROM pg_database
+WHERE datname LIKE 'db%';
+ count
+---------------------------------------------------------------------
+ 100
+(1 row)
+
+-- Generate distributed transactions
+\c - - - :master_port
+DO
+$do$
+    DECLARE
+        index int;
+        db_name text;
+        transaction_to_abort_name text;
+        transaction_to_commit_name text;
+        transaction_to_be_forgotten text;
+        coordinator_port int;
+    BEGIN
+        FOR index IN 1..100
+            LOOP
+                SELECT format('db%s', index)
+                INTO db_name;
+
+                SELECT 
format('citus_0_1234_3_0_%s', oid) + FROM pg_database + WHERE datname = db_name + INTO transaction_to_abort_name; + + SELECT format('citus_0_1234_4_0_%s', oid) + FROM pg_database + WHERE datname = db_name + INTO transaction_to_commit_name; + + SELECT format('citus_0_should_be_forgotten_%s', oid) + FROM pg_database + WHERE datname = db_name + INTO transaction_to_be_forgotten; + + SELECT setting::int + FROM pg_settings + WHERE name = 'port' + INTO coordinator_port; + + -- Prepare transactions on workers + PERFORM dblink_exec(format('dbname=%s host=localhost port=%s user=postgres', db_name, nodeport), + format($worker_cmd$ + BEGIN; + CREATE TABLE should_abort + (value int); + PREPARE TRANSACTION '%s'; + + BEGIN; + CREATE TABLE should_commit + (value int); + PREPARE TRANSACTION '%s'; + $worker_cmd$, transaction_to_abort_name, transaction_to_commit_name)) + FROM pg_dist_node + WHERE groupid != 0 + AND isactive + AND noderole = 'primary'; + + -- Fill the pg_dist_transaction + PERFORM dblink_exec(format('dbname=%s host=localhost port=%s user=postgres', db_name, coordinator_port), + format($coordinator_cmd$ + INSERT INTO pg_dist_transaction + SELECT groupid, '%s' FROM pg_dist_node + UNION ALL + SELECT groupid, '%s' FROM pg_dist_node; + $coordinator_cmd$, transaction_to_commit_name, transaction_to_be_forgotten)); + END LOOP; + END; +$do$; +-- Verify state before enabling maintenance +\c - - - :master_port +SELECT count(*) = 400 AS pg_dist_transaction_before_recovery_coordinator_test +FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT * + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) +WHERE datname LIKE 'db%'; + pg_dist_transaction_before_recovery_coordinator_test +--------------------------------------------------------------------- + t +(1 row) + +SELECT count(*) = 0 AS cached_connections_before_recovery_coordinator_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + cached_connections_before_recovery_coordinator_test +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :worker_1_port +SELECT count(*) = 100 AS pg_prepared_xacts_before_recover_worker_1_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + pg_prepared_xacts_before_recover_worker_1_test +--------------------------------------------------------------------- + t +(1 row) + +SELECT count(*) = 0 AS cached_connections_before_recovery_worker_1_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + cached_connections_before_recovery_worker_1_test +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :worker_2_port +SELECT count(*) = 100 AS pg_prepared_xacts_before_recover_worker_2_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + pg_prepared_xacts_before_recover_worker_2_test +--------------------------------------------------------------------- + t +(1 row) + +SELECT count(*) = 0 AS cached_connections_before_recovery_worker_2_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + cached_connections_before_recovery_worker_2_test 
+--------------------------------------------------------------------- + t +(1 row) + +-- Turn on the maintenance +\c - - - :master_port +:turn_on_maintenance + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :worker_1_port +:turn_on_maintenance + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :worker_2_port +:turn_on_maintenance + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :master_port +-- Let maintenance do it's work... +SELECT pg_sleep_for('10 seconds'::interval); + pg_sleep_for +--------------------------------------------------------------------- + +(1 row) + +-- Verify maintenance result +SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test +FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT * + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) +WHERE datname LIKE 'db%'; + pg_dist_transaction_after_recovery_coordinator_test +--------------------------------------------------------------------- + t +(1 row) + +SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_coordinator_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + cached_connections_after_recovery_coordinator_test +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :worker_1_port +SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_1_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + pg_prepared_xacts_after_recover_worker_1_test +--------------------------------------------------------------------- + t +(1 row) + +SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_1_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + cached_connections_after_recovery_worker_1_test +--------------------------------------------------------------------- + t +(1 row) + +\c - - - :worker_2_port +SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_2_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + pg_prepared_xacts_after_recover_worker_2_test +--------------------------------------------------------------------- + t +(1 row) + +SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_2_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + cached_connections_after_recovery_worker_2_test +--------------------------------------------------------------------- + t +(1 row) + +-- Cleanup +\c - - - :master_port +SELECT $definition$ + ALTER SYSTEM RESET citus.recover_2pc_interval; + ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; + ALTER SYSTEM RESET citus.maintenance_management_database; + SELECT pg_reload_conf(); + + DO + $do$ + DECLARE + index int; + db_name text; + current_port int; + BEGIN + SELECT setting::int FROM pg_settings WHERE name = 'port' + INTO current_port; + FOR index IN 1..100 + LOOP + SELECT format('db%s', index) + INTO db_name; + + PERFORM dblink(format('dbname=%s 
host=localhost port=%s user=postgres', db_name, current_port), + 'DROP EXTENSION citus;'); + END LOOP; + END; + $do$; + + -- Dropping tables explicitly because ProcSignalBarrier prevents from using dblink + DROP DATABASE db1 WITH (FORCE); + DROP DATABASE db2 WITH (FORCE); + DROP DATABASE db3 WITH (FORCE); + DROP DATABASE db4 WITH (FORCE); + DROP DATABASE db5 WITH (FORCE); + DROP DATABASE db6 WITH (FORCE); + DROP DATABASE db7 WITH (FORCE); + DROP DATABASE db8 WITH (FORCE); + DROP DATABASE db9 WITH (FORCE); + DROP DATABASE db10 WITH (FORCE); + DROP DATABASE db11 WITH (FORCE); + DROP DATABASE db12 WITH (FORCE); + DROP DATABASE db13 WITH (FORCE); + DROP DATABASE db14 WITH (FORCE); + DROP DATABASE db15 WITH (FORCE); + DROP DATABASE db16 WITH (FORCE); + DROP DATABASE db17 WITH (FORCE); + DROP DATABASE db18 WITH (FORCE); + DROP DATABASE db19 WITH (FORCE); + DROP DATABASE db20 WITH (FORCE); + DROP DATABASE db21 WITH (FORCE); + DROP DATABASE db22 WITH (FORCE); + DROP DATABASE db23 WITH (FORCE); + DROP DATABASE db24 WITH (FORCE); + DROP DATABASE db25 WITH (FORCE); + DROP DATABASE db26 WITH (FORCE); + DROP DATABASE db27 WITH (FORCE); + DROP DATABASE db28 WITH (FORCE); + DROP DATABASE db29 WITH (FORCE); + DROP DATABASE db30 WITH (FORCE); + DROP DATABASE db31 WITH (FORCE); + DROP DATABASE db32 WITH (FORCE); + DROP DATABASE db33 WITH (FORCE); + DROP DATABASE db34 WITH (FORCE); + DROP DATABASE db35 WITH (FORCE); + DROP DATABASE db36 WITH (FORCE); + DROP DATABASE db37 WITH (FORCE); + DROP DATABASE db38 WITH (FORCE); + DROP DATABASE db39 WITH (FORCE); + DROP DATABASE db40 WITH (FORCE); + DROP DATABASE db41 WITH (FORCE); + DROP DATABASE db42 WITH (FORCE); + DROP DATABASE db43 WITH (FORCE); + DROP DATABASE db44 WITH (FORCE); + DROP DATABASE db45 WITH (FORCE); + DROP DATABASE db46 WITH (FORCE); + DROP DATABASE db47 WITH (FORCE); + DROP DATABASE db48 WITH (FORCE); + DROP DATABASE db49 WITH (FORCE); + DROP DATABASE db50 WITH (FORCE); + DROP DATABASE db51 WITH (FORCE); + DROP DATABASE db52 WITH (FORCE); + DROP DATABASE db53 WITH (FORCE); + DROP DATABASE db54 WITH (FORCE); + DROP DATABASE db55 WITH (FORCE); + DROP DATABASE db56 WITH (FORCE); + DROP DATABASE db57 WITH (FORCE); + DROP DATABASE db58 WITH (FORCE); + DROP DATABASE db59 WITH (FORCE); + DROP DATABASE db60 WITH (FORCE); + DROP DATABASE db61 WITH (FORCE); + DROP DATABASE db62 WITH (FORCE); + DROP DATABASE db63 WITH (FORCE); + DROP DATABASE db64 WITH (FORCE); + DROP DATABASE db65 WITH (FORCE); + DROP DATABASE db66 WITH (FORCE); + DROP DATABASE db67 WITH (FORCE); + DROP DATABASE db68 WITH (FORCE); + DROP DATABASE db69 WITH (FORCE); + DROP DATABASE db70 WITH (FORCE); + DROP DATABASE db71 WITH (FORCE); + DROP DATABASE db72 WITH (FORCE); + DROP DATABASE db73 WITH (FORCE); + DROP DATABASE db74 WITH (FORCE); + DROP DATABASE db75 WITH (FORCE); + DROP DATABASE db76 WITH (FORCE); + DROP DATABASE db77 WITH (FORCE); + DROP DATABASE db78 WITH (FORCE); + DROP DATABASE db79 WITH (FORCE); + DROP DATABASE db80 WITH (FORCE); + DROP DATABASE db81 WITH (FORCE); + DROP DATABASE db82 WITH (FORCE); + DROP DATABASE db83 WITH (FORCE); + DROP DATABASE db84 WITH (FORCE); + DROP DATABASE db85 WITH (FORCE); + DROP DATABASE db86 WITH (FORCE); + DROP DATABASE db87 WITH (FORCE); + DROP DATABASE db88 WITH (FORCE); + DROP DATABASE db89 WITH (FORCE); + DROP DATABASE db90 WITH (FORCE); + DROP DATABASE db91 WITH (FORCE); + DROP DATABASE db92 WITH (FORCE); + DROP DATABASE db93 WITH (FORCE); + DROP DATABASE db94 WITH (FORCE); + DROP DATABASE db95 WITH (FORCE); + DROP DATABASE db96 WITH 
(FORCE);
+ DROP DATABASE db97 WITH (FORCE);
+ DROP DATABASE db98 WITH (FORCE);
+ DROP DATABASE db99 WITH (FORCE);
+ DROP DATABASE db100 WITH (FORCE);
+ SELECT count(*)
+ FROM pg_database
+ WHERE datname LIKE 'db%';
+ $definition$ AS cleanup
+\gset
+:cleanup
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ count
+---------------------------------------------------------------------
+ 0
+(1 row)
+
+\c - - - :worker_1_port
+:cleanup
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ count
+---------------------------------------------------------------------
+ 0
+(1 row)
+
+\c - - - :worker_2_port
+:cleanup
+ pg_reload_conf
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ count
+---------------------------------------------------------------------
+ 0
+(1 row)
+
diff --git a/src/test/regress/multi_1_schedule b/src/test/regress/multi_1_schedule
index 015f74973..cbd8ecf2f 100644
--- a/src/test/regress/multi_1_schedule
+++ b/src/test/regress/multi_1_schedule
@@ -220,6 +220,7 @@
 test: multi_generate_ddl_commands
 test: multi_create_shards
 test: multi_transaction_recovery
 test: multi_transaction_recovery_multiple_databases
+test: multi_maintenance_multiple_databases
 test: local_dist_join_modifications
 test: local_table_join
diff --git a/src/test/regress/pg_regress_multi.pl b/src/test/regress/pg_regress_multi.pl
index c9a85d523..4d01bc698 100755
--- a/src/test/regress/pg_regress_multi.pl
+++ b/src/test/regress/pg_regress_multi.pl
@@ -462,7 +462,7 @@
 push(@pgOptions, "wal_retrieve_retry_interval=250");
 push(@pgOptions, "max_logical_replication_workers=50");
 push(@pgOptions, "max_wal_senders=50");
-push(@pgOptions, "max_worker_processes=50");
+push(@pgOptions, "max_worker_processes=150");

 if ($majorversion >= "14") {
     # disable compute_query_id so that we don't get Query Identifiers
diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql
new file mode 100644
index 000000000..fce49acdc
--- /dev/null
+++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql
@@ -0,0 +1,423 @@
+-- This test verifies the behavior of the maintenance daemon in a multi-database environment
+-- It checks three things:
+-- 1. Maintenance daemons should not cache connections, except the one for the citus.maintenance_management_database
+-- 2. 2PC transaction recovery should respect the citus.shared_pool_size_maintenance_quota
+-- 3. Distributed deadlock detection should run only on citus.maintenance_management_database.
+--
+-- To do that, it creates 100 databases and synthetically generates distributed transactions in various states there.
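+--
+-- How the generated GIDs below interact with 2PC recovery (our reading of the
+-- naming used in this test; the exact recovery rules are an assumption of this
+-- note, not a specification):
+--   * 'citus_0_1234_3_0_<oid>' is prepared on the workers but never recorded in
+--     pg_dist_transaction on the coordinator, so recovery should ROLLBACK PREPARED it;
+--   * 'citus_0_1234_4_0_<oid>' is prepared on the workers and recorded in
+--     pg_dist_transaction, so recovery should COMMIT PREPARED it;
+--   * 'citus_0_should_be_forgotten_<oid>' exists only in pg_dist_transaction,
+--     so recovery should simply delete ("forget") that row.
+-- The trailing database oid keeps the GIDs unique across the 100 databases, e.g.:
+--
+--     SELECT format('citus_0_1234_4_0_%s', oid)
+--     FROM pg_database
+--     WHERE datname = 'db1';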
+ +SELECT $definition$ + ALTER SYSTEM SET citus.recover_2pc_interval TO '-1'; + ALTER SYSTEM SET citus.distributed_deadlock_detection_factor = '-1'; + SELECT pg_reload_conf(); + $definition$ AS turn_off_maintenance +\gset + +SELECT $deinition$ +ALTER SYSTEM SET citus.recover_2pc_interval TO '5s'; +ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; +ALTER SYSTEM SET citus.maintenance_management_database = 'regression'; +SELECT pg_reload_conf(); +$deinition$ AS turn_on_maintenance +\gset + +SELECT $definition$ + DO + $do$ + DECLARE + index int; + db_name text; + current_port int; + db_create_statement text; + BEGIN + SELECT setting::int FROM pg_settings WHERE name = 'port' + INTO current_port; + FOR index IN 1..100 + LOOP + SELECT format('db%s', index) + INTO db_name; + + SELECT format('CREATE DATABASE %I', db_name) + INTO db_create_statement; + + PERFORM dblink(format('dbname=regression host=localhost port=%s user=postgres', current_port), + db_create_statement); + PERFORM dblink(format('dbname=%s host=localhost port=%s user=postgres', db_name, current_port), + 'CREATE EXTENSION citus;'); + IF (SELECT groupid = 0 FROM pg_dist_node WHERE nodeport = current_port) THEN + PERFORM dblink_exec(format('dbname=%s host=localhost port=%s user=postgres', db_name, current_port), + format($add_workers$SELECT citus_add_node('localhost', %s);COMMIT;$add_workers$, nodeport)) + FROM pg_dist_node + WHERE groupid != 0 AND isactive AND noderole = 'primary'; + END IF; + END LOOP; + END; + $do$; + $definition$ AS create_databases +\gset + +-- Code reiles heavily on dblink for cross-db and cross-node queries +CREATE EXTENSION IF NOT EXISTS dblink; + +-- Disable maintenance operations to prepare the environment +:turn_off_maintenance + +\c - - - :worker_1_port + +:turn_off_maintenance + +\c - - - :worker_2_port + +:turn_off_maintenance + +-- Create databases + +\c - - - :worker_1_port + +:create_databases + +SELECT count(*) +FROM pg_database +WHERE datname LIKE 'db%'; + +\c - - - :worker_2_port + +:create_databases + +SELECT count(*) +FROM pg_database +WHERE datname LIKE 'db%'; + +\c - - - :master_port + +:create_databases + +SELECT count(*) +FROM pg_database +WHERE datname LIKE 'db%'; + +-- Generate distributed transactions + +\c - - - :master_port + +DO +$do$ + DECLARE + index int; + db_name text; + transaction_to_abort_name text; + transaction_to_commit_name text; + transaction_to_be_forgotten text; + coordinator_port int; + BEGIN + FOR index IN 1..100 + LOOP + SELECT format('db%s', index) + INTO db_name; + + SELECT format('citus_0_1234_3_0_%s', oid) + FROM pg_database + WHERE datname = db_name + INTO transaction_to_abort_name; + + SELECT format('citus_0_1234_4_0_%s', oid) + FROM pg_database + WHERE datname = db_name + INTO transaction_to_commit_name; + + SELECT format('citus_0_should_be_forgotten_%s', oid) + FROM pg_database + WHERE datname = db_name + INTO transaction_to_be_forgotten; + + SELECT setting::int + FROM pg_settings + WHERE name = 'port' + INTO coordinator_port; + + -- Prepare transactions on workers + PERFORM dblink_exec(format('dbname=%s host=localhost port=%s user=postgres', db_name, nodeport), + format($worker_cmd$ + BEGIN; + CREATE TABLE should_abort + (value int); + PREPARE TRANSACTION '%s'; + + BEGIN; + CREATE TABLE should_commit + (value int); + PREPARE TRANSACTION '%s'; + $worker_cmd$, transaction_to_abort_name, transaction_to_commit_name)) + FROM pg_dist_node + WHERE groupid != 0 + AND isactive + AND noderole = 'primary'; + + -- Fill the pg_dist_transaction + PERFORM 
dblink_exec(format('dbname=%s host=localhost port=%s user=postgres', db_name, coordinator_port), + format($coordinator_cmd$ + INSERT INTO pg_dist_transaction + SELECT groupid, '%s' FROM pg_dist_node + UNION ALL + SELECT groupid, '%s' FROM pg_dist_node; + $coordinator_cmd$, transaction_to_commit_name, transaction_to_be_forgotten)); + END LOOP; + END; +$do$; + +-- Verify state before enabling maintenance +\c - - - :master_port + +SELECT count(*) = 400 AS pg_dist_transaction_before_recovery_coordinator_test +FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT * + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) +WHERE datname LIKE 'db%'; + +SELECT count(*) = 0 AS cached_connections_before_recovery_coordinator_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + +\c - - - :worker_1_port + +SELECT count(*) = 100 AS pg_prepared_xacts_before_recover_worker_1_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + +SELECT count(*) = 0 AS cached_connections_before_recovery_worker_1_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + +\c - - - :worker_2_port + +SELECT count(*) = 100 AS pg_prepared_xacts_before_recover_worker_2_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + +SELECT count(*) = 0 AS cached_connections_before_recovery_worker_2_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + +-- Turn on the maintenance + +\c - - - :master_port + +:turn_on_maintenance + +\c - - - :worker_1_port + +:turn_on_maintenance + +\c - - - :worker_2_port + +:turn_on_maintenance + + + +\c - - - :master_port + +-- Let maintenance do it's work... 
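+-- (With citus.recover_2pc_interval set to '5s' in :turn_on_maintenance, a 10 second
+-- sleep should give every maintenance daemon at least one full recovery cycle; the
+-- margin is an assumption of this note, not a guarantee, which is why the checks
+-- below only assert the eventual state.)
+--
+-- A less timing-sensitive alternative would be to poll until recovery converges.
+-- A rough sketch (hypothetical, not part of the test; it only watches the local
+-- node, while the prepared transactions of this test live on the workers and in
+-- the other databases, so a real poll would need dblink loops like the ones above):
+--
+--     DO $$
+--     BEGIN
+--         -- wait until no citus-generated prepared transactions remain locally
+--         WHILE EXISTS (SELECT 1 FROM pg_prepared_xacts WHERE gid LIKE 'citus_%') LOOP
+--             PERFORM pg_sleep(0.5);
+--         END LOOP;
+--     END $$;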
+ +SELECT pg_sleep_for('10 seconds'::interval); + +-- Verify maintenance result + +SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test +FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT * + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) +WHERE datname LIKE 'db%'; + +SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_coordinator_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + +\c - - - :worker_1_port + +SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_1_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + +SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_1_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + +\c - - - :worker_2_port + +SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_2_test +FROM pg_prepared_xacts +WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%'; + +SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_2_test +FROM pg_stat_activity +WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval; + + +-- Cleanup + +\c - - - :master_port + +SELECT $definition$ + ALTER SYSTEM RESET citus.recover_2pc_interval; + ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; + ALTER SYSTEM RESET citus.maintenance_management_database; + SELECT pg_reload_conf(); + + DO + $do$ + DECLARE + index int; + db_name text; + current_port int; + BEGIN + SELECT setting::int FROM pg_settings WHERE name = 'port' + INTO current_port; + FOR index IN 1..100 + LOOP + SELECT format('db%s', index) + INTO db_name; + + PERFORM dblink(format('dbname=%s host=localhost port=%s user=postgres', db_name, current_port), + 'DROP EXTENSION citus;'); + END LOOP; + END; + $do$; + + -- Dropping tables explicitly because ProcSignalBarrier prevents from using dblink + DROP DATABASE db1 WITH (FORCE); + DROP DATABASE db2 WITH (FORCE); + DROP DATABASE db3 WITH (FORCE); + DROP DATABASE db4 WITH (FORCE); + DROP DATABASE db5 WITH (FORCE); + DROP DATABASE db6 WITH (FORCE); + DROP DATABASE db7 WITH (FORCE); + DROP DATABASE db8 WITH (FORCE); + DROP DATABASE db9 WITH (FORCE); + DROP DATABASE db10 WITH (FORCE); + DROP DATABASE db11 WITH (FORCE); + DROP DATABASE db12 WITH (FORCE); + DROP DATABASE db13 WITH (FORCE); + DROP DATABASE db14 WITH (FORCE); + DROP DATABASE db15 WITH (FORCE); + DROP DATABASE db16 WITH (FORCE); + DROP DATABASE db17 WITH (FORCE); + DROP DATABASE db18 WITH (FORCE); + DROP DATABASE db19 WITH (FORCE); + DROP DATABASE db20 WITH (FORCE); + DROP DATABASE db21 WITH (FORCE); + DROP DATABASE db22 WITH (FORCE); + DROP DATABASE db23 WITH (FORCE); + DROP DATABASE db24 WITH (FORCE); + DROP DATABASE db25 WITH (FORCE); + DROP DATABASE db26 WITH (FORCE); + DROP DATABASE db27 WITH (FORCE); + DROP DATABASE db28 WITH (FORCE); + DROP DATABASE db29 WITH (FORCE); + DROP DATABASE db30 WITH (FORCE); + DROP DATABASE db31 WITH (FORCE); + DROP DATABASE db32 WITH (FORCE); + DROP DATABASE db33 WITH (FORCE); + DROP DATABASE db34 WITH (FORCE); + DROP DATABASE db35 WITH (FORCE); + DROP DATABASE db36 WITH (FORCE); + DROP DATABASE db37 WITH (FORCE); + DROP DATABASE db38 WITH 
(FORCE);
+ DROP DATABASE db39 WITH (FORCE);
+ DROP DATABASE db40 WITH (FORCE);
+ DROP DATABASE db41 WITH (FORCE);
+ DROP DATABASE db42 WITH (FORCE);
+ DROP DATABASE db43 WITH (FORCE);
+ DROP DATABASE db44 WITH (FORCE);
+ DROP DATABASE db45 WITH (FORCE);
+ DROP DATABASE db46 WITH (FORCE);
+ DROP DATABASE db47 WITH (FORCE);
+ DROP DATABASE db48 WITH (FORCE);
+ DROP DATABASE db49 WITH (FORCE);
+ DROP DATABASE db50 WITH (FORCE);
+ DROP DATABASE db51 WITH (FORCE);
+ DROP DATABASE db52 WITH (FORCE);
+ DROP DATABASE db53 WITH (FORCE);
+ DROP DATABASE db54 WITH (FORCE);
+ DROP DATABASE db55 WITH (FORCE);
+ DROP DATABASE db56 WITH (FORCE);
+ DROP DATABASE db57 WITH (FORCE);
+ DROP DATABASE db58 WITH (FORCE);
+ DROP DATABASE db59 WITH (FORCE);
+ DROP DATABASE db60 WITH (FORCE);
+ DROP DATABASE db61 WITH (FORCE);
+ DROP DATABASE db62 WITH (FORCE);
+ DROP DATABASE db63 WITH (FORCE);
+ DROP DATABASE db64 WITH (FORCE);
+ DROP DATABASE db65 WITH (FORCE);
+ DROP DATABASE db66 WITH (FORCE);
+ DROP DATABASE db67 WITH (FORCE);
+ DROP DATABASE db68 WITH (FORCE);
+ DROP DATABASE db69 WITH (FORCE);
+ DROP DATABASE db70 WITH (FORCE);
+ DROP DATABASE db71 WITH (FORCE);
+ DROP DATABASE db72 WITH (FORCE);
+ DROP DATABASE db73 WITH (FORCE);
+ DROP DATABASE db74 WITH (FORCE);
+ DROP DATABASE db75 WITH (FORCE);
+ DROP DATABASE db76 WITH (FORCE);
+ DROP DATABASE db77 WITH (FORCE);
+ DROP DATABASE db78 WITH (FORCE);
+ DROP DATABASE db79 WITH (FORCE);
+ DROP DATABASE db80 WITH (FORCE);
+ DROP DATABASE db81 WITH (FORCE);
+ DROP DATABASE db82 WITH (FORCE);
+ DROP DATABASE db83 WITH (FORCE);
+ DROP DATABASE db84 WITH (FORCE);
+ DROP DATABASE db85 WITH (FORCE);
+ DROP DATABASE db86 WITH (FORCE);
+ DROP DATABASE db87 WITH (FORCE);
+ DROP DATABASE db88 WITH (FORCE);
+ DROP DATABASE db89 WITH (FORCE);
+ DROP DATABASE db90 WITH (FORCE);
+ DROP DATABASE db91 WITH (FORCE);
+ DROP DATABASE db92 WITH (FORCE);
+ DROP DATABASE db93 WITH (FORCE);
+ DROP DATABASE db94 WITH (FORCE);
+ DROP DATABASE db95 WITH (FORCE);
+ DROP DATABASE db96 WITH (FORCE);
+ DROP DATABASE db97 WITH (FORCE);
+ DROP DATABASE db98 WITH (FORCE);
+ DROP DATABASE db99 WITH (FORCE);
+ DROP DATABASE db100 WITH (FORCE);
+ SELECT count(*)
+ FROM pg_database
+ WHERE datname LIKE 'db%';
+ $definition$ AS cleanup
+\gset
+
+:cleanup
+
+\c - - - :worker_1_port
+
+:cleanup
+
+\c - - - :worker_2_port
+
+:cleanup
From b3bfca9ac619768de60b0ca08dc7055af83ca9cf Mon Sep 17 00:00:00 2001
From: ivyazmitinov
Date: Sun, 29 Oct 2023 01:40:38 +0700
Subject: [PATCH 12/41] Adapt implementation to changing MaintenanceQuota on
 the fly

---
 .../connection/shared_connection_stats.c | 31 +++++++++++++-------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c
index b8a84bd0d..a2849caec 100644
--- a/src/backend/distributed/connection/shared_connection_stats.c
+++ b/src/backend/distributed/connection/shared_connection_stats.c
@@ -425,8 +425,7 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags,
 	/* Increment counter if a slot available */
 	bool connectionSlotAvailable = true;
-	/* When GetSharedPoolSizeMaintenanceQuota() == 0, treat maintenance connections as regular */
-	bool maintenanceConnection = (GetSharedPoolSizeMaintenanceQuota() > 0 && (externalFlags & MAINTENANCE_CONNECTION));
+	bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION;
 	if (checkLimits)
 	{
 		WorkerNode *workerNode = FindWorkerNode(hostname, port);
@@ -434,15 +433,28 @@
IncrementSharedConnectionCounterInternal(uint32 externalFlags, int currentConnectionsLimit = connectionToLocalNode ? GetLocalSharedPoolSize() : GetMaxSharedPoolSize(); - int maintenanceQuota = (int) ceil((double) currentConnectionsLimit * GetSharedPoolSizeMaintenanceQuota()); - /* Connections limit should never go below 1 */ - currentConnectionsLimit = Max(maintenanceConnection - ? maintenanceQuota - : currentConnectionsLimit - maintenanceQuota, 1); - int currentConnectionsCount = maintenanceConnection + int currentConnectionsCount; + + if (GetSharedPoolSizeMaintenanceQuota() > 0) + { + int maintenanceQuota = (int) ceil((double) currentConnectionsLimit * GetSharedPoolSizeMaintenanceQuota()); + /* Connections limit should never go below 1 */ + currentConnectionsLimit = Max(maintenanceConnection + ? maintenanceQuota + : currentConnectionsLimit - maintenanceQuota, 1); + currentConnectionsCount = maintenanceConnection ? workerNodeConnectionEntry->maintenanceConnectionsCount : workerNodeConnectionEntry->regularConnectionsCount; + + } + else + { + /* When maintenance quota disabled, all connections treated equally*/ + currentConnectionsCount = (workerNodeConnectionEntry->maintenanceConnectionsCount + + workerNodeConnectionEntry->regularConnectionsCount); + } bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > currentConnectionsLimit; + /* * For local nodes, solely relying on citus.max_shared_pool_size or * max_connections might not be sufficient. The former gives us @@ -543,8 +555,7 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags, Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 || workerNodeConnectionEntry->maintenanceConnectionsCount > 0); - /* When GetSharedPoolSizeMaintenanceQuota() == 0, treat maintenance connections as regular */ - if ((GetSharedPoolSizeMaintenanceQuota() > 0 && (externalFlags & MAINTENANCE_CONNECTION))) + if (externalFlags & MAINTENANCE_CONNECTION) { workerNodeConnectionEntry->maintenanceConnectionsCount -= 1; } From f447b39b84e0afd94b8073453a1d67a6fd0a5158 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Tue, 28 Nov 2023 14:16:39 +0300 Subject: [PATCH 13/41] - Synced with main - Removed maintenance_management_database GUC and logic --- .../connection/connection_management.c | 9 +-- src/backend/distributed/shared_library_init.c | 11 --- src/backend/distributed/utils/maintenanced.c | 67 +++++-------------- .../multi_maintenance_multiple_databases.out | 16 ++--- .../multi_maintenance_multiple_databases.sql | 18 +++-- 5 files changed, 32 insertions(+), 89 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index 0ed71d198..a7d4c605d 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -360,9 +360,7 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, MultiConnection *connection = FindAvailableConnection(entry->connections, flags); if (connection) { - if ((flags & REQUIRE_MAINTENANCE_CONNECTION) && - IsMaintenanceDaemon && - !IsMaintenanceManagementDatabase(MyDatabaseId)) + if (flags & REQUIRE_MAINTENANCE_CONNECTION) { // Maintenance database may have changed, so cached connection should be closed connection->forceCloseAtTransactionEnd = true; @@ -446,10 +444,7 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, else if (flags & REQUIRE_MAINTENANCE_CONNECTION) { 
connection->useForMaintenanceOperations = true; - if (IsMaintenanceDaemon && !IsMaintenanceManagementDatabase(MyDatabaseId)) - { - connection->forceCloseAtTransactionEnd = true; - } + connection->forceCloseAtTransactionEnd = true; } /* fully initialized the connection, record it */ diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 186ac886d..3c9b2d97c 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -2705,17 +2705,6 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); - DefineCustomStringVariable( - "citus.maintenance_management_database", - gettext_noop("Database for cluster-wide maintenance operations across all databases"), - gettext_noop("It should be enabled when there are more than " - "one database with Citus in a cluster."), - &MaintenanceManagementDatabase, - "", - PGC_SIGHUP, - GUC_STANDARD, - NULL, NULL, NULL); - /* warn about config items in the citus namespace that are not registered above */ EmitWarningsOnPlaceholders("citus"); diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 0ab30e529..117507c15 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -718,27 +718,24 @@ CitusMaintenanceDaemonMain(Datum main_arg) InvalidateMetadataSystemCache(); StartTransactionCommand(); - - if ((strcmp(GetMaintenanceManagementDatabase(), "") == 0 || IsMaintenanceManagementDatabase(databaseOid))) + /* + * We skip the deadlock detection if citus extension + * is not accessible. + * + * Similarly, we skip to run the deadlock checks if + * there exists any version mismatch or the extension + * is not fully created yet. + */ + if (!LockCitusExtension()) { - /* - * We skip the deadlock detection if citus extension - * is not accessible. - * - * Similarly, we skip to run the deadlock checks if - * there exists any version mismatch or the extension - * is not fully created yet. - */ - if (!LockCitusExtension()) - { - ereport(DEBUG1, (errmsg("could not lock the citus extension, " - "skipping deadlock detection"))); - } - else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) - { - foundDeadlock = CheckForDistributedDeadlocks(); - } + ereport(DEBUG1, (errmsg("could not lock the citus extension, " + "skipping deadlock detection"))); } + else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) + { + foundDeadlock = CheckForDistributedDeadlocks(); + } + CommitTransactionCommand(); @@ -1237,35 +1234,3 @@ MetadataSyncTriggeredCheckAndReset(MaintenanceDaemonDBData *dbData) return metadataSyncTriggered; } -char -*GetMaintenanceManagementDatabase(void) -{ - char *result = MaintenanceManagementDatabase; - /* If MaintenanceManagementDatabase is not set, all maintenance daemons are considered independent */ - if (strcmp(MaintenanceManagementDatabase, "") != 0) - { - Oid maintenanceDatabaseOid = get_database_oid(MaintenanceManagementDatabase, true); - if (!maintenanceDatabaseOid) - { - ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("Database \"%s\" doesn't exists, please check the citus.maintenance_management_database parameter. 
" - "Applying a default value instead.", - MaintenanceManagementDatabase))); - result = ""; - } - } - return result; -} - -bool -IsMaintenanceManagementDatabase(Oid databaseOid) -{ - if (strcmp(GetMaintenanceManagementDatabase(), "") == 0) - { - /* If MaintenanceManagementDatabase is not set, all maintenance daemons are considered independent */ - return false; - } - Oid maintenanceDatabaseOid = get_database_oid(MaintenanceManagementDatabase, true); - return maintenanceDatabaseOid == databaseOid; -} - diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index 8b37d7e25..854807508 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -1,9 +1,5 @@ -- This test verfies a behavioir of maintenance daemon in multi-database environment --- It checks two things: --- 1. Maintenance daemons should not cache connections, except the one for the citus.maintenance_management_database --- 2. 2PC transaction recovery should respect the citus.shared_pool_size_maintenance_quota --- 2. Distributed deadlock detection should run only on citus.maintenance_management_database. --- +-- It checks that distributed deadlock detection and 2PC transaction recovery should respect the citus.shared_pool_size_maintenance_quota. -- To do that, it created 100 databases and syntactically generates distributed transactions in various states there. SELECT $definition$ ALTER SYSTEM SET citus.recover_2pc_interval TO '-1'; @@ -14,7 +10,6 @@ SELECT $definition$ SELECT $deinition$ ALTER SYSTEM SET citus.recover_2pc_interval TO '5s'; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; -ALTER SYSTEM SET citus.maintenance_management_database = 'regression'; SELECT pg_reload_conf(); $deinition$ AS turn_on_maintenance \gset @@ -284,7 +279,7 @@ WHERE datname LIKE 'db%'; t (1 row) -SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_coordinator_test +SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test FROM pg_stat_activity WHERE state = 'idle' AND now() - backend_start > '5 seconds'::interval; @@ -303,7 +298,7 @@ WHERE gid LIKE 'citus_0_1234_4_0_%' t (1 row) -SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_1_test +SELECT count(*) = 0 AS cached_connections_after_recovery_worker_1_test FROM pg_stat_activity WHERE state = 'idle' AND now() - backend_start > '5 seconds'::interval; @@ -322,7 +317,7 @@ WHERE gid LIKE 'citus_0_1234_4_0_%' t (1 row) -SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_2_test +SELECT count(*) = 0 AS cached_connections_after_recovery_worker_2_test FROM pg_stat_activity WHERE state = 'idle' AND now() - backend_start > '5 seconds'::interval; @@ -336,7 +331,6 @@ WHERE state = 'idle' SELECT $definition$ ALTER SYSTEM RESET citus.recover_2pc_interval; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; - ALTER SYSTEM RESET citus.maintenance_management_database; SELECT pg_reload_conf(); DO @@ -500,3 +494,5 @@ SELECT $definition$ 0 (1 row) +\c - - - :master_port +DROP EXTENSION IF EXISTS dblink; diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index fce49acdc..25c26c738 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -1,9 +1,5 @@ -- 
This test verfies a behavioir of maintenance daemon in multi-database environment --- It checks two things: --- 1. Maintenance daemons should not cache connections, except the one for the citus.maintenance_management_database --- 2. 2PC transaction recovery should respect the citus.shared_pool_size_maintenance_quota --- 2. Distributed deadlock detection should run only on citus.maintenance_management_database. --- +-- It checks that distributed deadlock detection and 2PC transaction recovery should respect the citus.shared_pool_size_maintenance_quota. -- To do that, it created 100 databases and syntactically generates distributed transactions in various states there. SELECT $definition$ @@ -16,7 +12,6 @@ SELECT $definition$ SELECT $deinition$ ALTER SYSTEM SET citus.recover_2pc_interval TO '5s'; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; -ALTER SYSTEM SET citus.maintenance_management_database = 'regression'; SELECT pg_reload_conf(); $deinition$ AS turn_on_maintenance \gset @@ -245,7 +240,7 @@ FROM pg_database, $statement$) AS t(groupid integer, gid text) WHERE datname LIKE 'db%'; -SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_coordinator_test +SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test FROM pg_stat_activity WHERE state = 'idle' AND now() - backend_start > '5 seconds'::interval; @@ -257,7 +252,7 @@ FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%'; -SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_1_test +SELECT count(*) = 0 AS cached_connections_after_recovery_worker_1_test FROM pg_stat_activity WHERE state = 'idle' AND now() - backend_start > '5 seconds'::interval; @@ -269,7 +264,7 @@ FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%'; -SELECT count(*) BETWEEN 1 AND 3 AS cached_connections_after_recovery_worker_2_test +SELECT count(*) = 0 AS cached_connections_after_recovery_worker_2_test FROM pg_stat_activity WHERE state = 'idle' AND now() - backend_start > '5 seconds'::interval; @@ -282,7 +277,6 @@ WHERE state = 'idle' SELECT $definition$ ALTER SYSTEM RESET citus.recover_2pc_interval; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; - ALTER SYSTEM RESET citus.maintenance_management_database; SELECT pg_reload_conf(); DO @@ -421,3 +415,7 @@ SELECT $definition$ \c - - - :worker_2_port :cleanup + +\c - - - :master_port + +DROP EXTENSION IF EXISTS dblink; From 09917f846ff3131c3c5272dda499e116af5eaca5 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 29 Nov 2023 17:34:38 +0300 Subject: [PATCH 14/41] - Change the implementation from quota of the shared pool to a separate pool - Improve tests --- .../locally_reserved_shared_connections.c | 1 + .../connection/shared_connection_stats.c | 57 +++++++++---------- src/backend/distributed/shared_library_init.c | 17 +++--- .../distributed/shared_connection_stats.h | 4 +- .../multi_maintenance_multiple_databases.out | 24 ++++++++ .../multi_maintenance_multiple_databases.sql | 14 ++++- 6 files changed, 74 insertions(+), 43 deletions(-) diff --git a/src/backend/distributed/connection/locally_reserved_shared_connections.c b/src/backend/distributed/connection/locally_reserved_shared_connections.c index dc31ad080..023c9b7c0 100644 --- a/src/backend/distributed/connection/locally_reserved_shared_connections.c +++ b/src/backend/distributed/connection/locally_reserved_shared_connections.c @@ -481,6 +481,7 @@ 
EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnectio bool IsReservationPossible(void) { + // TODO add check for maintenance connection if (GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING) { /* connection throttling disabled */ diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index a2849caec..1cab33306 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -105,11 +105,11 @@ typedef struct SharedWorkerNodeDatabaseConnStatsHashEntry int MaxSharedPoolSize = 0; /* - * Controlled via a GUC, never access directly, use GetSharedPoolSizeMaintenanceQuota(). - * Percent of MaxSharedPoolSize reserved for maintenance operations. - * "0" effectively means that regular and maintenance connection will compete over the common pool + * Controlled via a GUC, never access directly, use GetMaxMaintenanceSharedPoolSize(). + * Pool size for maintenance connections exclusively + * "0" or "-1" means do not apply connection throttling */ -double SharedPoolSizeMaintenanceQuota = 0.1; +int MaxMaintenanceSharedPoolSize = 5; /* * Controlled via a GUC, never access directly, use GetLocalSharedPoolSize(). @@ -142,7 +142,7 @@ static uint32 SharedConnectionHashHash(const void *key, Size keysize); static int SharedConnectionHashCompare(const void *a, const void *b, Size keysize); static uint32 SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize); static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize); -static bool isConnectionThrottlingDisabled(); +static bool isConnectionThrottlingDisabled(uint32 externalFlags); static bool IncrementSharedConnectionCounterInternal(uint32 externalFlags, bool checkLimits, const char *hostname, int port, Oid database); @@ -257,10 +257,10 @@ GetMaxSharedPoolSize(void) return MaxSharedPoolSize; } -double -GetSharedPoolSizeMaintenanceQuota(void) +int +GetMaxMaintenanceSharedPoolSize(void) { - return SharedPoolSizeMaintenanceQuota; + return MaxMaintenanceSharedPoolSize; } @@ -315,7 +315,7 @@ WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port) bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - if (isConnectionThrottlingDisabled()) + if (isConnectionThrottlingDisabled(flags)) { return true; } @@ -347,7 +347,7 @@ TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int po void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - if (isConnectionThrottlingDisabled()) + if (isConnectionThrottlingDisabled(flags)) { return; } @@ -430,29 +430,22 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, { WorkerNode *workerNode = FindWorkerNode(hostname, port); bool connectionToLocalNode = workerNode && (workerNode->groupId == GetLocalGroupId()); - int currentConnectionsLimit = connectionToLocalNode - ? GetLocalSharedPoolSize() - : GetMaxSharedPoolSize(); + + int currentConnectionsLimit; int currentConnectionsCount; - - if (GetSharedPoolSizeMaintenanceQuota() > 0) + if (maintenanceConnection) { - int maintenanceQuota = (int) ceil((double) currentConnectionsLimit * GetSharedPoolSizeMaintenanceQuota()); - /* Connections limit should never go below 1 */ - currentConnectionsLimit = Max(maintenanceConnection - ? maintenanceQuota - : currentConnectionsLimit - maintenanceQuota, 1); - currentConnectionsCount = maintenanceConnection - ? 
workerNodeConnectionEntry->maintenanceConnectionsCount - : workerNodeConnectionEntry->regularConnectionsCount; - + currentConnectionsLimit = GetMaxMaintenanceSharedPoolSize(); + currentConnectionsCount = workerNodeConnectionEntry->maintenanceConnectionsCount; } else { - /* When maintenance quota disabled, all connections treated equally*/ - currentConnectionsCount = (workerNodeConnectionEntry->maintenanceConnectionsCount + - workerNodeConnectionEntry->regularConnectionsCount); + currentConnectionsLimit = connectionToLocalNode + ? GetLocalSharedPoolSize() + : GetMaxSharedPoolSize(); + currentConnectionsCount = workerNodeConnectionEntry->regularConnectionsCount; } + bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > currentConnectionsLimit; /* @@ -519,7 +512,8 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, void DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int port) { - if (isConnectionThrottlingDisabled()) + // TODO: possible bug, remove this check? + if (isConnectionThrottlingDisabled(externalFlags)) { return; } @@ -964,11 +958,14 @@ SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize) ca->database != cb->database; } -static bool isConnectionThrottlingDisabled() +static bool isConnectionThrottlingDisabled(uint32 externalFlags) { + bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION; /* * Do not call Get*PoolSize() functions here, since it may read from * the catalog and we may be in the process exit handler. */ - return MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING; + return maintenanceConnection + ? MaxMaintenanceSharedPoolSize <= 0 + : MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING; } diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 3c9b2d97c..d0f8dfbd6 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1993,18 +1993,17 @@ RegisterCitusConfigVariables(void) GUC_SUPERUSER_ONLY, NULL, NULL, MaxSharedPoolSizeGucShowHook); - DefineCustomRealVariable( - "citus.shared_pool_size_maintenance_quota", - gettext_noop("Sets the fraction of citus.max_shared_pool_size reserved " - "for maintenance operations only. " - "Setting it to 0 disables the quota. " - "This way the maintenance and regular connections will share the same pool"), + DefineCustomIntVariable( + "citus.max_maintenance_shared_pool_size", + gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " + "for maintenance operations only." 
+ "Setting it to 0 or -1 disables maintenance connection throttling."), NULL, - &SharedPoolSizeMaintenanceQuota, - 0.1, 0, 1, + &MaxMaintenanceSharedPoolSize, + 5, -1, INT_MAX, PGC_SIGHUP, GUC_SUPERUSER_ONLY, - NULL, NULL, MaxSharedPoolSizeGucShowHook); + NULL, NULL, NULL); DefineCustomIntVariable( "citus.max_worker_nodes_tracked", diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h index 5a0be6ea2..d5901014a 100644 --- a/src/include/distributed/shared_connection_stats.h +++ b/src/include/distributed/shared_connection_stats.h @@ -25,7 +25,7 @@ enum SharedPoolCounterMode }; extern int MaxSharedPoolSize; -extern double SharedPoolSizeMaintenanceQuota; +extern int MaxMaintenanceSharedPoolSize; extern int LocalSharedPoolSize; extern int MaxClientConnections; @@ -37,7 +37,7 @@ extern size_t SharedConnectionStatsShmemSize(void); extern void SharedConnectionStatsShmemInit(void); extern int GetMaxClientConnections(void); extern int GetMaxSharedPoolSize(void); -extern double GetSharedPoolSizeMaintenanceQuota(void); +extern int GetMaxMaintenanceSharedPoolSize(void); extern int GetLocalSharedPoolSize(void); extern bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port); extern void WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port); diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index 854807508..5013c3a84 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -263,6 +263,14 @@ SELECT pg_sleep_for('10 seconds'::interval); (1 row) -- Verify maintenance result +SELECT count(*) = 0 AS too_many_clients_test +FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) +WHERE log_line LIKE '%sorry, too many clients already%'; + too_many_clients_test +--------------------------------------------------------------------- + t +(1 row) + SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test FROM pg_database, dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, @@ -289,6 +297,14 @@ WHERE state = 'idle' (1 row) \c - - - :worker_1_port +SELECT count(*) = 0 AS too_many_clients_test +FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) +WHERE log_line LIKE '%sorry, too many clients already%'; + too_many_clients_test +--------------------------------------------------------------------- + t +(1 row) + SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_1_test FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' @@ -308,6 +324,14 @@ WHERE state = 'idle' (1 row) \c - - - :worker_2_port +SELECT count(*) = 0 AS too_many_clients_test +FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) +WHERE log_line LIKE '%sorry, too many clients already%'; + too_many_clients_test +--------------------------------------------------------------------- + t +(1 row) + SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_2_test FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index 25c26c738..20a6aa132 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ 
b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -218,8 +218,6 @@ WHERE state = 'idle' :turn_on_maintenance - - \c - - - :master_port -- Let maintenance do it's work... @@ -228,6 +226,10 @@ SELECT pg_sleep_for('10 seconds'::interval); -- Verify maintenance result +SELECT count(*) = 0 AS too_many_clients_test +FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) +WHERE log_line LIKE '%sorry, too many clients already%'; + SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test FROM pg_database, dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, @@ -247,6 +249,10 @@ WHERE state = 'idle' \c - - - :worker_1_port +SELECT count(*) = 0 AS too_many_clients_test +FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) +WHERE log_line LIKE '%sorry, too many clients already%'; + SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_1_test FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' @@ -259,6 +265,10 @@ WHERE state = 'idle' \c - - - :worker_2_port +SELECT count(*) = 0 AS too_many_clients_test +FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) +WHERE log_line LIKE '%sorry, too many clients already%'; + SELECT count(*) = 0 AS pg_prepared_xacts_after_recover_worker_2_test FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' From 115ed00c069ed3f3f1a4c78952ffb02295bfb1e3 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 30 Nov 2023 15:16:48 +0300 Subject: [PATCH 15/41] - Adapt locally_reserved_shared_connections to maintenance connection pool - Fixed tests --- src/backend/distributed/commands/multi_copy.c | 7 +++-- .../locally_reserved_shared_connections.c | 30 +++++++++++-------- .../locally_reserved_shared_connections.h | 6 ++-- .../expected/shared_connection_stats.out | 18 +++++------ .../regress/sql/shared_connection_stats.sql | 2 +- 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/backend/distributed/commands/multi_copy.c b/src/backend/distributed/commands/multi_copy.c index 23847ac01..0db780d1b 100644 --- a/src/backend/distributed/commands/multi_copy.c +++ b/src/backend/distributed/commands/multi_copy.c @@ -2160,6 +2160,7 @@ CitusCopyDestReceiverStartup(DestReceiver *dest, int operation, copyDest->connectionStateHash = CreateConnectionStateHash(TopTransactionContext); RecordRelationAccessIfNonDistTable(tableId, PLACEMENT_ACCESS_DML); + uint32 connectionFlags = 0; /* * Colocated intermediate results do not honor citus.max_shared_pool_size, @@ -2181,7 +2182,7 @@ CitusCopyDestReceiverStartup(DestReceiver *dest, int operation, * and cannot switch to local execution (e.g., disabled by user), * COPY would fail hinting the user to change the relevant settiing. 
*/ - EnsureConnectionPossibilityForRemotePrimaryNodes(); + EnsureConnectionPossibilityForRemotePrimaryNodes(connectionFlags); } LocalCopyStatus localCopyStatus = GetLocalCopyStatus(); @@ -2211,7 +2212,7 @@ CitusCopyDestReceiverStartup(DestReceiver *dest, int operation, */ if (ShardIntervalListHasLocalPlacements(shardIntervalList)) { - bool reservedConnection = TryConnectionPossibilityForLocalPrimaryNode(); + bool reservedConnection = TryConnectionPossibilityForLocalPrimaryNode(connectionFlags); copyDest->shouldUseLocalCopy = !reservedConnection; } } @@ -3634,7 +3635,7 @@ CopyGetPlacementConnection(HTAB *connectionStateHash, ShardPlacement *placement, return connection; } - if (IsReservationPossible()) + if (IsReservationPossible(connectionFlags)) { /* * Enforce the requirements for adaptive connection management diff --git a/src/backend/distributed/connection/locally_reserved_shared_connections.c b/src/backend/distributed/connection/locally_reserved_shared_connections.c index 023c9b7c0..a9079f752 100644 --- a/src/backend/distributed/connection/locally_reserved_shared_connections.c +++ b/src/backend/distributed/connection/locally_reserved_shared_connections.c @@ -92,9 +92,10 @@ static ReservedConnectionHashEntry * AllocateOrGetReservedConnectionEntry(char * userId, Oid databaseOid, bool *found); -static void EnsureConnectionPossibilityForNodeList(List *nodeList); +static void EnsureConnectionPossibilityForNodeList(List *nodeList, uint32 connectionFlags); static bool EnsureConnectionPossibilityForNode(WorkerNode *workerNode, - bool waitForConnection); + bool waitForConnection, + uint32 connectionFlags); static uint32 LocalConnectionReserveHashHash(const void *key, Size keysize); static int LocalConnectionReserveHashCompare(const void *a, const void *b, Size keysize); @@ -296,7 +297,7 @@ MarkReservedConnectionUsed(const char *hostName, int nodePort, Oid userId, * EnsureConnectionPossibilityForNodeList. */ void -EnsureConnectionPossibilityForRemotePrimaryNodes(void) +EnsureConnectionPossibilityForRemotePrimaryNodes(uint32 connectionFlags) { /* * By using NoLock there is a tiny risk of that we miss to reserve a @@ -305,7 +306,7 @@ EnsureConnectionPossibilityForRemotePrimaryNodes(void) * going to access would be on the new node. */ List *remoteNodeList = ActivePrimaryRemoteNodeList(NoLock); - EnsureConnectionPossibilityForNodeList(remoteNodeList); + EnsureConnectionPossibilityForNodeList(remoteNodeList, connectionFlags); } @@ -315,7 +316,7 @@ EnsureConnectionPossibilityForRemotePrimaryNodes(void) * If not, the function returns false. 
*/ bool -TryConnectionPossibilityForLocalPrimaryNode(void) +TryConnectionPossibilityForLocalPrimaryNode(uint32 connectionFlags) { bool nodeIsInMetadata = false; WorkerNode *localNode = @@ -331,7 +332,7 @@ TryConnectionPossibilityForLocalPrimaryNode(void) } bool waitForConnection = false; - return EnsureConnectionPossibilityForNode(localNode, waitForConnection); + return EnsureConnectionPossibilityForNode(localNode, waitForConnection, connectionFlags); } @@ -345,7 +346,7 @@ TryConnectionPossibilityForLocalPrimaryNode(void) * single reservation per backend) */ static void -EnsureConnectionPossibilityForNodeList(List *nodeList) +EnsureConnectionPossibilityForNodeList(List *nodeList, uint32 connectionFlags) { /* * We sort the workerList because adaptive connection management @@ -364,7 +365,7 @@ EnsureConnectionPossibilityForNodeList(List *nodeList) foreach_ptr(workerNode, nodeList) { bool waitForConnection = true; - EnsureConnectionPossibilityForNode(workerNode, waitForConnection); + EnsureConnectionPossibilityForNode(workerNode, waitForConnection, connectionFlags); } } @@ -383,9 +384,9 @@ EnsureConnectionPossibilityForNodeList(List *nodeList) * return false. */ static bool -EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnection) +EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnection, uint32 connectionFlags) { - if (!IsReservationPossible()) + if (!IsReservationPossible(connectionFlags)) { return false; } @@ -479,10 +480,13 @@ EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnectio * session is eligible for shared connection reservation. */ bool -IsReservationPossible(void) +IsReservationPossible(uint32 connectionFlags) { - // TODO add check for maintenance connection - if (GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING) + bool connectionThrottlingDisabled = + connectionFlags & REQUIRE_MAINTENANCE_CONNECTION + ? 
GetMaxMaintenanceSharedPoolSize() <= 0 + : GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING; + if (connectionThrottlingDisabled) { /* connection throttling disabled */ return false; diff --git a/src/include/distributed/locally_reserved_shared_connections.h b/src/include/distributed/locally_reserved_shared_connections.h index adec8c9c4..637cbce47 100644 --- a/src/include/distributed/locally_reserved_shared_connections.h +++ b/src/include/distributed/locally_reserved_shared_connections.h @@ -20,8 +20,8 @@ extern bool CanUseReservedConnection(const char *hostName, int nodePort, extern void MarkReservedConnectionUsed(const char *hostName, int nodePort, Oid userId, Oid databaseOid); extern void DeallocateReservedConnections(void); -extern void EnsureConnectionPossibilityForRemotePrimaryNodes(void); -extern bool TryConnectionPossibilityForLocalPrimaryNode(void); -extern bool IsReservationPossible(void); +extern void EnsureConnectionPossibilityForRemotePrimaryNodes(uint32 connectionFlags); +extern bool TryConnectionPossibilityForLocalPrimaryNode(uint32 connectionFlags); +extern bool IsReservationPossible(uint32 connectionFlags); #endif /* LOCALLY_RESERVED_SHARED_CONNECTIONS_H_ */ diff --git a/src/test/regress/expected/shared_connection_stats.out b/src/test/regress/expected/shared_connection_stats.out index 91d35db5d..0ce22548f 100644 --- a/src/test/regress/expected/shared_connection_stats.out +++ b/src/test/regress/expected/shared_connection_stats.out @@ -224,7 +224,7 @@ BEGIN; COMMIT; -- pg_sleep forces almost 1 connection per placement -- now, some of the optional connections would be skipped, --- and only 4 connections (5 minus the maintenance quota) are used per node +-- and only 5 connections are used per node BEGIN; SET LOCAL citus.max_adaptive_executor_pool_size TO 16; with cte_1 as (select pg_sleep(0.1) is null, a from test) SELECT a from cte_1 ORDER By 1 LIMIT 1; @@ -244,8 +244,8 @@ BEGIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 4 - 4 + 5 + 5 (2 rows) COMMIT; @@ -382,8 +382,8 @@ COPY test FROM PROGRAM 'seq 32'; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 2 - 2 + 3 + 3 (2 rows) ROLLBACK; @@ -404,7 +404,7 @@ BEGIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 2 + 3 1 (2 rows) @@ -423,7 +423,7 @@ COPY test FROM STDIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 2 + 3 1 (2 rows) @@ -450,7 +450,7 @@ BEGIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 2 + 3 (1 row) -- in this second COPY, we access the same node but different shards @@ -468,7 +468,7 @@ COPY test FROM STDIN; hostname, port; connection_count_to_node --------------------------------------------------------------------- - 2 + 3 1 (2 rows) diff --git a/src/test/regress/sql/shared_connection_stats.sql b/src/test/regress/sql/shared_connection_stats.sql index 7c040af5c..7c653e788 100644 --- a/src/test/regress/sql/shared_connection_stats.sql +++ b/src/test/regress/sql/shared_connection_stats.sql @@ -146,7 +146,7 @@ COMMIT; -- pg_sleep forces almost 1 connection per placement -- now, some of the optional connections would be skipped, --- and only 4 connections (5 minus the maintenance quota) are used per node +-- and only 5 connections are used per node BEGIN; SET LOCAL 
citus.max_adaptive_executor_pool_size TO 16; with cte_1 as (select pg_sleep(0.1) is null, a from test) SELECT a from cte_1 ORDER By 1 LIMIT 1; From 61714862a6d899b1494ae811f1c49a2cf08df77b Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Thu, 18 Jan 2024 12:36:13 +0100 Subject: [PATCH 16/41] Run make reindent --- src/backend/distributed/commands/multi_copy.c | 7 +- .../connection/connection_management.c | 72 +- .../locally_reserved_shared_connections.c | 48 +- .../connection/shared_connection_stats.c | 710 ++++++++++-------- src/backend/distributed/shared_library_init.c | 48 +- .../distributed/transaction/backend_data.c | 1 + .../transaction/transaction_recovery.c | 2 +- src/backend/distributed/utils/maintenanced.c | 53 +- .../distributed/connection_management.h | 16 +- .../distributed/shared_connection_stats.h | 21 +- 10 files changed, 516 insertions(+), 462 deletions(-) diff --git a/src/backend/distributed/commands/multi_copy.c b/src/backend/distributed/commands/multi_copy.c index 0db780d1b..1c2408a1a 100644 --- a/src/backend/distributed/commands/multi_copy.c +++ b/src/backend/distributed/commands/multi_copy.c @@ -2160,7 +2160,7 @@ CitusCopyDestReceiverStartup(DestReceiver *dest, int operation, copyDest->connectionStateHash = CreateConnectionStateHash(TopTransactionContext); RecordRelationAccessIfNonDistTable(tableId, PLACEMENT_ACCESS_DML); - uint32 connectionFlags = 0; + uint32 connectionFlags = 0; /* * Colocated intermediate results do not honor citus.max_shared_pool_size, @@ -2212,7 +2212,8 @@ CitusCopyDestReceiverStartup(DestReceiver *dest, int operation, */ if (ShardIntervalListHasLocalPlacements(shardIntervalList)) { - bool reservedConnection = TryConnectionPossibilityForLocalPrimaryNode(connectionFlags); + bool reservedConnection = TryConnectionPossibilityForLocalPrimaryNode( + connectionFlags); copyDest->shouldUseLocalCopy = !reservedConnection; } } @@ -3635,7 +3636,7 @@ CopyGetPlacementConnection(HTAB *connectionStateHash, ShardPlacement *placement, return connection; } - if (IsReservationPossible(connectionFlags)) + if (IsReservationPossible(connectionFlags)) { /* * Enforce the requirements for adaptive connection management diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index a7d4c605d..fa7f9f3bf 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -68,7 +68,7 @@ static void ErrorIfMultipleMetadataConnectionExists(dlist_head *connections); static void FreeConnParamsHashEntryFields(ConnParamsHashEntry *entry); static void AfterXactHostConnectionHandling(ConnectionHashEntry *entry, bool isCommit); static bool ShouldShutdownConnection(MultiConnection *connection, - const int cachedConnectionCount); + const int cachedConnectionCount); static bool RemoteTransactionIdle(MultiConnection *connection); static int EventSetSizeForConnectionList(List *connections); @@ -360,11 +360,11 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, MultiConnection *connection = FindAvailableConnection(entry->connections, flags); if (connection) { - if (flags & REQUIRE_MAINTENANCE_CONNECTION) - { - // Maintenance database may have changed, so cached connection should be closed - connection->forceCloseAtTransactionEnd = true; - } + if (flags & REQUIRE_MAINTENANCE_CONNECTION) + { + /* Maintenance database may have changed, so cached connection should be closed */ + 
connection->forceCloseAtTransactionEnd = true; + } return connection; } } @@ -388,12 +388,12 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, /* these two flags are by nature cannot happen at the same time */ Assert(!((flags & WAIT_FOR_CONNECTION) && (flags & OPTIONAL_CONNECTION))); - int sharedCounterFlags = (flags & REQUIRE_MAINTENANCE_CONNECTION) - ? MAINTENANCE_CONNECTION - : 0; + int sharedCounterFlags = (flags & REQUIRE_MAINTENANCE_CONNECTION) + ? MAINTENANCE_CONNECTION + : 0; if (flags & WAIT_FOR_CONNECTION) { - WaitLoopForSharedConnection(sharedCounterFlags, hostname, port); + WaitLoopForSharedConnection(sharedCounterFlags, hostname, port); } else if (flags & OPTIONAL_CONNECTION) { @@ -403,7 +403,7 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, * cannot reserve the right to establish a connection, we prefer to * error out. */ - if (!TryToIncrementSharedConnectionCounter(sharedCounterFlags, hostname, port)) + if (!TryToIncrementSharedConnectionCounter(sharedCounterFlags, hostname, port)) { /* do not track the connection anymore */ dlist_delete(&connection->connectionNode); @@ -423,7 +423,7 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, * * Still, we keep track of the connection counter. */ - IncrementSharedConnectionCounter(sharedCounterFlags, hostname, port); + IncrementSharedConnectionCounter(sharedCounterFlags, hostname, port); } @@ -437,15 +437,15 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, ResetShardPlacementAssociation(connection); - if (flags & REQUIRE_METADATA_CONNECTION) + if (flags & REQUIRE_METADATA_CONNECTION) { connection->useForMetadataOperations = true; - } - else if (flags & REQUIRE_MAINTENANCE_CONNECTION) - { - connection->useForMaintenanceOperations = true; - connection->forceCloseAtTransactionEnd = true; - } + } + else if (flags & REQUIRE_MAINTENANCE_CONNECTION) + { + connection->useForMaintenanceOperations = true; + connection->forceCloseAtTransactionEnd = true; + } /* fully initialized the connection, record it */ connection->initializationState = POOL_STATE_INITIALIZED; @@ -513,11 +513,11 @@ FindAvailableConnection(dlist_head *connections, uint32 flags) continue; } - if ((flags & REQUIRE_MAINTENANCE_CONNECTION) && - !connection->useForMaintenanceOperations) - { - continue; - } + if ((flags & REQUIRE_MAINTENANCE_CONNECTION) && + !connection->useForMaintenanceOperations) + { + continue; + } if ((flags & REQUIRE_METADATA_CONNECTION) && !connection->useForMetadataOperations) @@ -1215,10 +1215,11 @@ CitusPQFinish(MultiConnection *connection) /* behave idempotently, there is no gurantee that CitusPQFinish() is called once */ if (connection->initializationState >= POOL_STATE_COUNTER_INCREMENTED) { - int sharedCounterFlags = (connection->useForMaintenanceOperations) - ? MAINTENANCE_CONNECTION - : 0; - DecrementSharedConnectionCounter(sharedCounterFlags, connection->hostname, connection->port); + int sharedCounterFlags = (connection->useForMaintenanceOperations) + ? MAINTENANCE_CONNECTION + : 0; + DecrementSharedConnectionCounter(sharedCounterFlags, connection->hostname, + connection->port); connection->initializationState = POOL_STATE_NOT_INITIALIZED; } } @@ -1515,16 +1516,17 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection * escalating the number of cached connections. We can recognize such backends * from their application name. 
*/ - return (IsCitusInternalBackend() || IsRebalancerInternalBackend()) || + return (IsCitusInternalBackend() || IsRebalancerInternalBackend()) || connection->initializationState != POOL_STATE_INITIALIZED || cachedConnectionCount >= MaxCachedConnectionsPerWorker || - connection->forceCloseAtTransactionEnd || + connection->forceCloseAtTransactionEnd || PQstatus(connection->pgConn) != CONNECTION_OK || - !RemoteTransactionIdle(connection) || - connection->requiresReplication || - connection->isReplicationOriginSessionSetup || - (MaxCachedConnectionLifetime >= 0 && - MillisecondsToTimeout(connection->connectionEstablishmentStart, MaxCachedConnectionLifetime) <= 0); + !RemoteTransactionIdle(connection) || + connection->requiresReplication || + connection->isReplicationOriginSessionSetup || + (MaxCachedConnectionLifetime >= 0 && + MillisecondsToTimeout(connection->connectionEstablishmentStart, + MaxCachedConnectionLifetime) <= 0); } diff --git a/src/backend/distributed/connection/locally_reserved_shared_connections.c b/src/backend/distributed/connection/locally_reserved_shared_connections.c index a9079f752..cbe125dda 100644 --- a/src/backend/distributed/connection/locally_reserved_shared_connections.c +++ b/src/backend/distributed/connection/locally_reserved_shared_connections.c @@ -92,10 +92,11 @@ static ReservedConnectionHashEntry * AllocateOrGetReservedConnectionEntry(char * userId, Oid databaseOid, bool *found); -static void EnsureConnectionPossibilityForNodeList(List *nodeList, uint32 connectionFlags); +static void EnsureConnectionPossibilityForNodeList(List *nodeList, uint32 + connectionFlags); static bool EnsureConnectionPossibilityForNode(WorkerNode *workerNode, - bool waitForConnection, - uint32 connectionFlags); + bool waitForConnection, + uint32 connectionFlags); static uint32 LocalConnectionReserveHashHash(const void *key, Size keysize); static int LocalConnectionReserveHashCompare(const void *a, const void *b, Size keysize); @@ -241,8 +242,9 @@ DeallocateReservedConnections(void) * We have not used this reservation, make sure to clean-up from * the shared memory as well. */ - int sharedCounterFlags = 0; - DecrementSharedConnectionCounter(sharedCounterFlags, entry->key.hostname, entry->key.port); + int sharedCounterFlags = 0; + DecrementSharedConnectionCounter(sharedCounterFlags, entry->key.hostname, + entry->key.port); /* for completeness, set it to true */ entry->usedReservation = true; @@ -332,7 +334,8 @@ TryConnectionPossibilityForLocalPrimaryNode(uint32 connectionFlags) } bool waitForConnection = false; - return EnsureConnectionPossibilityForNode(localNode, waitForConnection, connectionFlags); + return EnsureConnectionPossibilityForNode(localNode, waitForConnection, + connectionFlags); } @@ -365,7 +368,8 @@ EnsureConnectionPossibilityForNodeList(List *nodeList, uint32 connectionFlags) foreach_ptr(workerNode, nodeList) { bool waitForConnection = true; - EnsureConnectionPossibilityForNode(workerNode, waitForConnection, connectionFlags); + EnsureConnectionPossibilityForNode(workerNode, waitForConnection, + connectionFlags); } } @@ -384,9 +388,10 @@ EnsureConnectionPossibilityForNodeList(List *nodeList, uint32 connectionFlags) * return false. 
*/ static bool -EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnection, uint32 connectionFlags) +EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnection, uint32 + connectionFlags) { - if (!IsReservationPossible(connectionFlags)) + if (!IsReservationPossible(connectionFlags)) { return false; } @@ -441,16 +446,17 @@ EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnectio * Increment the shared counter, we may need to wait if there are * no space left. */ - int sharedCounterFlags = 0; - WaitLoopForSharedConnection(sharedCounterFlags, workerNode->workerName, workerNode->workerPort); + int sharedCounterFlags = 0; + WaitLoopForSharedConnection(sharedCounterFlags, workerNode->workerName, + workerNode->workerPort); } else { - int sharedCounterFlags = 0; - bool incremented = TryToIncrementSharedConnectionCounter( - sharedCounterFlags, - workerNode->workerName, - workerNode->workerPort); + int sharedCounterFlags = 0; + bool incremented = TryToIncrementSharedConnectionCounter( + sharedCounterFlags, + workerNode->workerName, + workerNode->workerPort); if (!incremented) { /* @@ -482,11 +488,11 @@ EnsureConnectionPossibilityForNode(WorkerNode *workerNode, bool waitForConnectio bool IsReservationPossible(uint32 connectionFlags) { - bool connectionThrottlingDisabled = - connectionFlags & REQUIRE_MAINTENANCE_CONNECTION - ? GetMaxMaintenanceSharedPoolSize() <= 0 - : GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING; - if (connectionThrottlingDisabled) + bool connectionThrottlingDisabled = + connectionFlags & REQUIRE_MAINTENANCE_CONNECTION + ? GetMaxMaintenanceSharedPoolSize() <= 0 + : GetMaxSharedPoolSize() == DISABLE_CONNECTION_THROTTLING; + if (connectionThrottlingDisabled) { /* connection throttling disabled */ return false; diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 1cab33306..69ad0166a 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -75,24 +75,24 @@ typedef struct SharedWorkerNodeConnStatsHashKey typedef struct SharedWorkerNodeDatabaseConnStatsHashKey { - SharedWorkerNodeConnStatsHashKey workerNodeKey; - Oid database; + SharedWorkerNodeConnStatsHashKey workerNodeKey; + Oid database; } SharedWorkerNodeDatabaseConnStatsHashKey; /* hash entry for per worker stats */ typedef struct SharedWorkerNodeConnStatsHashEntry { - SharedWorkerNodeConnStatsHashKey key; + SharedWorkerNodeConnStatsHashKey key; - int regularConnectionsCount; - int maintenanceConnectionsCount; + int regularConnectionsCount; + int maintenanceConnectionsCount; } SharedWorkerNodeConnStatsHashEntry; /* hash entry for per database on worker stats */ typedef struct SharedWorkerNodeDatabaseConnStatsHashEntry { - SharedWorkerNodeDatabaseConnStatsHashKey key; - int count; + SharedWorkerNodeDatabaseConnStatsHashKey key; + int count; } SharedWorkerNodeDatabaseConnStatsHashEntry; @@ -141,17 +141,24 @@ static bool ShouldWaitForConnection(int currentConnectionCount); static uint32 SharedConnectionHashHash(const void *key, Size keysize); static int SharedConnectionHashCompare(const void *a, const void *b, Size keysize); static uint32 SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize); -static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize); +static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size + 
keysize); static bool isConnectionThrottlingDisabled(uint32 externalFlags); -static bool -IncrementSharedConnectionCounterInternal(uint32 externalFlags, bool checkLimits, const char *hostname, int port, - Oid database); -static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int port); -static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const char *hostname, - int port, - Oid database); -static void -DecrementSharedConnectionCounterInternal(uint32 externalFlags, const char *hostname, int port, Oid database); +static bool IncrementSharedConnectionCounterInternal(uint32 externalFlags, bool + checkLimits, const char *hostname, + int port, + Oid database); +static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int + port); +static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const + char * + hostname, + int port, + Oid + database); +static void DecrementSharedConnectionCounterInternal(uint32 externalFlags, const + char *hostname, int port, Oid + database); PG_FUNCTION_INFO_V1(citus_remote_connection_stats); @@ -192,26 +199,29 @@ StoreAllRemoteConnectionStats(Tuplestorestate *tupleStore, TupleDesc tupleDescri LockConnectionSharedMemory(LW_SHARED); HASH_SEQ_STATUS status; - SharedWorkerNodeDatabaseConnStatsHashEntry *connectionEntry = NULL; + SharedWorkerNodeDatabaseConnStatsHashEntry *connectionEntry = NULL; - hash_seq_init(&status, SharedWorkerNodeDatabaseConnStatsHash); - while ((connectionEntry = (SharedWorkerNodeDatabaseConnStatsHashEntry *) hash_seq_search(&status)) != 0) + hash_seq_init(&status, SharedWorkerNodeDatabaseConnStatsHash); + while ((connectionEntry = + (SharedWorkerNodeDatabaseConnStatsHashEntry *) hash_seq_search( + &status)) != 0) { /* get ready for the next tuple */ memset(values, 0, sizeof(values)); memset(isNulls, false, sizeof(isNulls)); - char *databaseName = get_database_name(connectionEntry->key.database); + char *databaseName = get_database_name(connectionEntry->key.database); if (databaseName == NULL) { /* database might have been dropped */ continue; } - values[0] = PointerGetDatum(cstring_to_text(connectionEntry->key.workerNodeKey.hostname)); - values[1] = Int32GetDatum(connectionEntry->key.workerNodeKey.port); + values[0] = PointerGetDatum(cstring_to_text( + connectionEntry->key.workerNodeKey.hostname)); + values[1] = Int32GetDatum(connectionEntry->key.workerNodeKey.port); values[2] = PointerGetDatum(cstring_to_text(databaseName)); - values[3] = Int32GetDatum(connectionEntry->count); + values[3] = Int32GetDatum(connectionEntry->count); tuplestore_putvalues(tupleStore, tupleDescriptor, values, isNulls); } @@ -257,10 +267,11 @@ GetMaxSharedPoolSize(void) return MaxSharedPoolSize; } + int GetMaxMaintenanceSharedPoolSize(void) { - return MaxMaintenanceSharedPoolSize; + return MaxMaintenanceSharedPoolSize; } @@ -293,7 +304,7 @@ GetLocalSharedPoolSize(void) void WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port) { - while (!TryToIncrementSharedConnectionCounter(flags, hostname, port)) + while (!TryToIncrementSharedConnectionCounter(flags, hostname, port)) { CHECK_FOR_INTERRUPTS(); @@ -303,6 +314,7 @@ WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port) ConditionVariableCancelSleep(); } + /* * TryToIncrementSharedConnectionCounter tries to increment the shared * connection counter for the given nodeId and the current database in @@ -315,28 +327,28 @@ WaitLoopForSharedConnection(uint32 flags, const char 
*hostname, int port) bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - if (isConnectionThrottlingDisabled(flags)) - { - return true; - } + if (isConnectionThrottlingDisabled(flags)) + { + return true; + } - /* + /* * The local session might already have some reserved connections to the given * node. In that case, we don't need to go through the shared memory. */ - Oid userId = GetUserId(); - if (CanUseReservedConnection(hostname, port, userId, MyDatabaseId)) - { - MarkReservedConnectionUsed(hostname, port, userId, MyDatabaseId); + Oid userId = GetUserId(); + if (CanUseReservedConnection(hostname, port, userId, MyDatabaseId)) + { + MarkReservedConnectionUsed(hostname, port, userId, MyDatabaseId); - return true; - } + return true; + } - return IncrementSharedConnectionCounterInternal(flags, - true, - hostname, - port, - MyDatabaseId); + return IncrementSharedConnectionCounterInternal(flags, + true, + hostname, + port, + MyDatabaseId); } @@ -347,161 +359,165 @@ TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int po void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port) { - if (isConnectionThrottlingDisabled(flags)) - { - return; - } + if (isConnectionThrottlingDisabled(flags)) + { + return; + } - IncrementSharedConnectionCounterInternal(flags, - false, - hostname, - port, - MyDatabaseId); + IncrementSharedConnectionCounterInternal(flags, + false, + hostname, + port, + MyDatabaseId); } static bool IncrementSharedConnectionCounterInternal(uint32 externalFlags, - bool checkLimits, - const char *hostname, - int port, - Oid database) + bool checkLimits, + const char *hostname, + int port, + Oid database) { - LockConnectionSharedMemory(LW_EXCLUSIVE); + LockConnectionSharedMemory(LW_EXCLUSIVE); - /* - * As the hash map is allocated in shared memory, it doesn't rely on palloc for - * memory allocation, so we could get NULL via HASH_ENTER_NULL when there is no - * space in the shared memory. That's why we prefer continuing the execution - * instead of throwing an error. - */ - SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); - bool workerNodeEntryFound = false; - SharedWorkerNodeConnStatsHashEntry *workerNodeConnectionEntry = - hash_search(SharedWorkerNodeConnStatsHash, - &workerNodeKey, - HASH_ENTER_NULL, - &workerNodeEntryFound); + /* + * As the hash map is allocated in shared memory, it doesn't rely on palloc for + * memory allocation, so we could get NULL via HASH_ENTER_NULL when there is no + * space in the shared memory. That's why we prefer continuing the execution + * instead of throwing an error. + */ + SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, + port); + bool workerNodeEntryFound = false; + SharedWorkerNodeConnStatsHashEntry *workerNodeConnectionEntry = + hash_search(SharedWorkerNodeConnStatsHash, + &workerNodeKey, + HASH_ENTER_NULL, + &workerNodeEntryFound); - /* - * It is possible to throw an error at this point, but that doesn't help us in anyway. - * Instead, we try our best, let the connection establishment continue by-passing the - * connection throttling. - */ - if (!workerNodeConnectionEntry) - { - UnLockConnectionSharedMemory(); - return true; - } + /* + * It is possible to throw an error at this point, but that doesn't help us in anyway. + * Instead, we try our best, let the connection establishment continue by-passing the + * connection throttling. 
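+	 * In other words, when the shared hash has no free slots we fail open:
+	 * the connection is admitted without ever being counted against the pool.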
+ */ + if (!workerNodeConnectionEntry) + { + UnLockConnectionSharedMemory(); + return true; + } - if (!workerNodeEntryFound) - { - /* we successfully allocated the entry for the first time, so initialize it */ - workerNodeConnectionEntry->regularConnectionsCount = 0; - workerNodeConnectionEntry->maintenanceConnectionsCount = 0; - } + if (!workerNodeEntryFound) + { + /* we successfully allocated the entry for the first time, so initialize it */ + workerNodeConnectionEntry->regularConnectionsCount = 0; + workerNodeConnectionEntry->maintenanceConnectionsCount = 0; + } - /* Initialize SharedWorkerNodeDatabaseConnStatsHash the same way */ - SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey = - PrepareWorkerNodeDatabaseHashKey(hostname, port, database); - bool workerNodeDatabaseEntryFound = false; - SharedWorkerNodeDatabaseConnStatsHashEntry *workerNodeDatabaseEntry = - hash_search(SharedWorkerNodeDatabaseConnStatsHash, - &workerNodeDatabaseKey, - HASH_ENTER_NULL, - &workerNodeDatabaseEntryFound); + /* Initialize SharedWorkerNodeDatabaseConnStatsHash the same way */ + SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey = + PrepareWorkerNodeDatabaseHashKey(hostname, port, database); + bool workerNodeDatabaseEntryFound = false; + SharedWorkerNodeDatabaseConnStatsHashEntry *workerNodeDatabaseEntry = + hash_search(SharedWorkerNodeDatabaseConnStatsHash, + &workerNodeDatabaseKey, + HASH_ENTER_NULL, + &workerNodeDatabaseEntryFound); - if (!workerNodeDatabaseEntry) - { - UnLockConnectionSharedMemory(); - return true; - } + if (!workerNodeDatabaseEntry) + { + UnLockConnectionSharedMemory(); + return true; + } - if (!workerNodeDatabaseEntryFound) - { - workerNodeDatabaseEntry->count = 0; - } + if (!workerNodeDatabaseEntryFound) + { + workerNodeDatabaseEntry->count = 0; + } - /* Increment counter if a slot available */ - bool connectionSlotAvailable = true; + /* Increment counter if a slot available */ + bool connectionSlotAvailable = true; - bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION; - if (checkLimits) - { - WorkerNode *workerNode = FindWorkerNode(hostname, port); - bool connectionToLocalNode = workerNode && (workerNode->groupId == GetLocalGroupId()); + bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION; + if (checkLimits) + { + WorkerNode *workerNode = FindWorkerNode(hostname, port); + bool connectionToLocalNode = workerNode && (workerNode->groupId == + GetLocalGroupId()); - int currentConnectionsLimit; - int currentConnectionsCount; - if (maintenanceConnection) - { - currentConnectionsLimit = GetMaxMaintenanceSharedPoolSize(); - currentConnectionsCount = workerNodeConnectionEntry->maintenanceConnectionsCount; - } - else - { - currentConnectionsLimit = connectionToLocalNode - ? GetLocalSharedPoolSize() - : GetMaxSharedPoolSize(); - currentConnectionsCount = workerNodeConnectionEntry->regularConnectionsCount; - } + int currentConnectionsLimit; + int currentConnectionsCount; + if (maintenanceConnection) + { + currentConnectionsLimit = GetMaxMaintenanceSharedPoolSize(); + currentConnectionsCount = + workerNodeConnectionEntry->maintenanceConnectionsCount; + } + else + { + currentConnectionsLimit = connectionToLocalNode + ? 
GetLocalSharedPoolSize() + : GetMaxSharedPoolSize(); + currentConnectionsCount = workerNodeConnectionEntry->regularConnectionsCount; + } - bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > currentConnectionsLimit; + bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > + currentConnectionsLimit; - /* - * For local nodes, solely relying on citus.max_shared_pool_size or - * max_connections might not be sufficient. The former gives us - * a preview of the future (e.g., we let the new connections to establish, - * but they are not established yet). The latter gives us the close to - * precise view of the past (e.g., the active number of client backends). - * - * Overall, we want to limit both of the metrics. The former limit typically - * kicks in under regular loads, where the load of the database increases in - * a reasonable pace. The latter limit typically kicks in when the database - * is issued lots of concurrent sessions at the same time, such as benchmarks. - */ - bool localNodeLimitExceeded = - connectionToLocalNode && - (GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES || - GetExternalClientBackendCount() + 1 > currentConnectionsLimit); - if (remoteNodeLimitExceeded || localNodeLimitExceeded) - { - connectionSlotAvailable = false; - } - } + /* + * For local nodes, solely relying on citus.max_shared_pool_size or + * max_connections might not be sufficient. The former gives us + * a preview of the future (e.g., we let the new connections to establish, + * but they are not established yet). The latter gives us the close to + * precise view of the past (e.g., the active number of client backends). + * + * Overall, we want to limit both of the metrics. The former limit typically + * kicks in under regular loads, where the load of the database increases in + * a reasonable pace. The latter limit typically kicks in when the database + * is issued lots of concurrent sessions at the same time, such as benchmarks. + */ + bool localNodeLimitExceeded = + connectionToLocalNode && + (GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES || + GetExternalClientBackendCount() + 1 > currentConnectionsLimit); + if (remoteNodeLimitExceeded || localNodeLimitExceeded) + { + connectionSlotAvailable = false; + } + } - if (connectionSlotAvailable) - { - if (maintenanceConnection) - { - workerNodeConnectionEntry->maintenanceConnectionsCount += 1; - } - else - { - workerNodeConnectionEntry->regularConnectionsCount += 1; - } - workerNodeDatabaseEntry->count += 1; - } + if (connectionSlotAvailable) + { + if (maintenanceConnection) + { + workerNodeConnectionEntry->maintenanceConnectionsCount += 1; + } + else + { + workerNodeConnectionEntry->regularConnectionsCount += 1; + } + workerNodeDatabaseEntry->count += 1; + } - if (IsLoggableLevel(DEBUG4)) - { - ereport(DEBUG4, errmsg( - "Incrementing connection counter. " - "Current regular connections: %i, maintenance connections: %i. " - "Connection slot to %s:%i database %i is %s", - workerNodeConnectionEntry->regularConnectionsCount, - workerNodeConnectionEntry->maintenanceConnectionsCount, - hostname, - port, - database, - connectionSlotAvailable ? "available" : "not available" - )); - } + if (IsLoggableLevel(DEBUG4)) + { + ereport(DEBUG4, errmsg( + "Incrementing connection counter. " + "Current regular connections: %i, maintenance connections: %i. 
" + "Connection slot to %s:%i database %i is %s", + workerNodeConnectionEntry->regularConnectionsCount, + workerNodeConnectionEntry->maintenanceConnectionsCount, + hostname, + port, + database, + connectionSlotAvailable ? "available" : "not available" + )); + } - UnLockConnectionSharedMemory(); + UnLockConnectionSharedMemory(); - return connectionSlotAvailable; + return connectionSlotAvailable; } @@ -512,105 +528,109 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, void DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int port) { - // TODO: possible bug, remove this check? - if (isConnectionThrottlingDisabled(externalFlags)) - { - return; - } + /* TODO: possible bug, remove this check? */ + if (isConnectionThrottlingDisabled(externalFlags)) + { + return; + } - LockConnectionSharedMemory(LW_EXCLUSIVE); + LockConnectionSharedMemory(LW_EXCLUSIVE); - DecrementSharedConnectionCounterInternal(externalFlags, hostname, port, MyDatabaseId); + DecrementSharedConnectionCounterInternal(externalFlags, hostname, port, MyDatabaseId); - UnLockConnectionSharedMemory(); - WakeupWaiterBackendsForSharedConnection(); + UnLockConnectionSharedMemory(); + WakeupWaiterBackendsForSharedConnection(); } + static void DecrementSharedConnectionCounterInternal(uint32 externalFlags, - const char *hostname, - int port, - Oid database) + const char *hostname, + int port, + Oid database) { - bool workerNodeEntryFound = false; - SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); - SharedWorkerNodeConnStatsHashEntry *workerNodeConnectionEntry = - hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_FIND, &workerNodeEntryFound); + bool workerNodeEntryFound = false; + SharedWorkerNodeConnStatsHashKey workerNodeKey = PrepareWorkerNodeHashKey(hostname, + port); + SharedWorkerNodeConnStatsHashEntry *workerNodeConnectionEntry = + hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_FIND, + &workerNodeEntryFound); - /* this worker node is removed or updated, no need to care */ - if (!workerNodeEntryFound) - { - ereport(DEBUG4, (errmsg("No entry found for node %s:%d while decrementing " - "connection counter", hostname, port))); - return; - } + /* this worker node is removed or updated, no need to care */ + if (!workerNodeEntryFound) + { + ereport(DEBUG4, (errmsg("No entry found for node %s:%d while decrementing " + "connection counter", hostname, port))); + return; + } - /* we should never go below 0 */ - Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 || - workerNodeConnectionEntry->maintenanceConnectionsCount > 0); + /* we should never go below 0 */ + Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 || + workerNodeConnectionEntry->maintenanceConnectionsCount > 0); - if (externalFlags & MAINTENANCE_CONNECTION) - { - workerNodeConnectionEntry->maintenanceConnectionsCount -= 1; - } - else - { - workerNodeConnectionEntry->regularConnectionsCount -= 1; - } + if (externalFlags & MAINTENANCE_CONNECTION) + { + workerNodeConnectionEntry->maintenanceConnectionsCount -= 1; + } + else + { + workerNodeConnectionEntry->regularConnectionsCount -= 1; + } - if (IsLoggableLevel(DEBUG4)) - { - ereport(DEBUG4, errmsg( - "Decrementing connection counter. " - "Current regular connections: %i, maintenance connections: %i. 
" - "Connection slot to %s:%i database %i is released", - workerNodeConnectionEntry->regularConnectionsCount, - workerNodeConnectionEntry->maintenanceConnectionsCount, - hostname, - port, - database - )); - } + if (IsLoggableLevel(DEBUG4)) + { + ereport(DEBUG4, errmsg( + "Decrementing connection counter. " + "Current regular connections: %i, maintenance connections: %i. " + "Connection slot to %s:%i database %i is released", + workerNodeConnectionEntry->regularConnectionsCount, + workerNodeConnectionEntry->maintenanceConnectionsCount, + hostname, + port, + database + )); + } - /* - * We don't have to remove at this point as the node might be still active - * and will have new connections open to it. Still, this seems like a convenient - * place to remove the entry, as count == 0 implies that the server is - * not busy, and given the default value of MaxCachedConnectionsPerWorker = 1, - * we're unlikely to trigger this often. - */ - if (workerNodeConnectionEntry->regularConnectionsCount == 0 && - workerNodeConnectionEntry->maintenanceConnectionsCount == 0) - { - hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_REMOVE, NULL); - } + /* + * We don't have to remove at this point as the node might be still active + * and will have new connections open to it. Still, this seems like a convenient + * place to remove the entry, as count == 0 implies that the server is + * not busy, and given the default value of MaxCachedConnectionsPerWorker = 1, + * we're unlikely to trigger this often. + */ + if (workerNodeConnectionEntry->regularConnectionsCount == 0 && + workerNodeConnectionEntry->maintenanceConnectionsCount == 0) + { + hash_search(SharedWorkerNodeConnStatsHash, &workerNodeKey, HASH_REMOVE, NULL); + } - /* - * Perform the same with SharedWorkerNodeDatabaseConnStatsHashKey - */ + /* + * Perform the same with SharedWorkerNodeDatabaseConnStatsHashKey + */ - SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey = - PrepareWorkerNodeDatabaseHashKey(hostname, port, MyDatabaseId); - bool workerNodeDatabaseEntryFound = false; - SharedWorkerNodeDatabaseConnStatsHashEntry *workerNodeDatabaseEntry = - hash_search(SharedWorkerNodeDatabaseConnStatsHash, - &workerNodeDatabaseKey, - HASH_FIND, - &workerNodeDatabaseEntryFound); + SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey = + PrepareWorkerNodeDatabaseHashKey(hostname, port, MyDatabaseId); + bool workerNodeDatabaseEntryFound = false; + SharedWorkerNodeDatabaseConnStatsHashEntry *workerNodeDatabaseEntry = + hash_search(SharedWorkerNodeDatabaseConnStatsHash, + &workerNodeDatabaseKey, + HASH_FIND, + &workerNodeDatabaseEntryFound); - if (!workerNodeDatabaseEntryFound) - { - return; - } + if (!workerNodeDatabaseEntryFound) + { + return; + } - Assert(workerNodeDatabaseEntry->count > 0); + Assert(workerNodeDatabaseEntry->count > 0); - workerNodeDatabaseEntry->count -= 1; + workerNodeDatabaseEntry->count -= 1; - if (workerNodeDatabaseEntry->count == 0) - { - hash_search(SharedWorkerNodeDatabaseConnStatsHash, &workerNodeDatabaseKey, HASH_REMOVE, NULL); - } + if (workerNodeDatabaseEntry->count == 0) + { + hash_search(SharedWorkerNodeDatabaseConnStatsHash, &workerNodeDatabaseKey, + HASH_REMOVE, NULL); + } } @@ -621,7 +641,7 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags, static void LockConnectionSharedMemory(LWLockMode lockMode) { - LWLockAcquire(&ConnectionStatsSharedState->sharedConnectionHashLock, lockMode); + LWLockAcquire(&ConnectionStatsSharedState->sharedConnectionHashLock, lockMode); } @@ -706,17 
+726,20 @@ SharedConnectionStatsShmemSize(void) size = add_size(size, sizeof(ConnectionStatsSharedData)); - Size workerNodeConnHashSize = hash_estimate_size(MaxWorkerNodesTracked, - sizeof(SharedWorkerNodeConnStatsHashEntry)); + Size workerNodeConnHashSize = hash_estimate_size(MaxWorkerNodesTracked, + sizeof( + SharedWorkerNodeConnStatsHashEntry)); - size = add_size(size, workerNodeConnHashSize); + size = add_size(size, workerNodeConnHashSize); - Size workerNodeDatabaseConnSize = hash_estimate_size(MaxWorkerNodesTracked * MaxDatabasesPerWorkerNodesTracked, - sizeof(SharedWorkerNodeDatabaseConnStatsHashEntry)); + Size workerNodeDatabaseConnSize = hash_estimate_size(MaxWorkerNodesTracked * + MaxDatabasesPerWorkerNodesTracked, + sizeof( + SharedWorkerNodeDatabaseConnStatsHashEntry)); - size = add_size(size, workerNodeDatabaseConnSize); + size = add_size(size, workerNodeDatabaseConnSize); - return size; + return size; } @@ -756,42 +779,48 @@ SharedConnectionStatsShmemInit(void) ConditionVariableInit(&ConnectionStatsSharedState->waitersConditionVariable); } - /* allocate hash tables */ + /* allocate hash tables */ - /* create (hostname, port) -> [counter] */ - HASHCTL sharedWorkerNodeConnStatsHashInfo; - memset(&sharedWorkerNodeConnStatsHashInfo, 0, sizeof(sharedWorkerNodeConnStatsHashInfo)); - sharedWorkerNodeConnStatsHashInfo.keysize = sizeof(SharedWorkerNodeConnStatsHashKey); - sharedWorkerNodeConnStatsHashInfo.entrysize = sizeof(SharedWorkerNodeConnStatsHashEntry); - sharedWorkerNodeConnStatsHashInfo.hash = SharedConnectionHashHash; - sharedWorkerNodeConnStatsHashInfo.match = SharedConnectionHashCompare; - SharedWorkerNodeConnStatsHash = - ShmemInitHash("Shared Conn. Stats Hash", - MaxWorkerNodesTracked, - MaxWorkerNodesTracked, - &sharedWorkerNodeConnStatsHashInfo, - (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE)); + /* create (hostname, port) -> [counter] */ + HASHCTL sharedWorkerNodeConnStatsHashInfo; + memset(&sharedWorkerNodeConnStatsHashInfo, 0, + sizeof(sharedWorkerNodeConnStatsHashInfo)); + sharedWorkerNodeConnStatsHashInfo.keysize = sizeof(SharedWorkerNodeConnStatsHashKey); + sharedWorkerNodeConnStatsHashInfo.entrysize = + sizeof(SharedWorkerNodeConnStatsHashEntry); + sharedWorkerNodeConnStatsHashInfo.hash = SharedConnectionHashHash; + sharedWorkerNodeConnStatsHashInfo.match = SharedConnectionHashCompare; + SharedWorkerNodeConnStatsHash = + ShmemInitHash("Shared Conn. 
Stats Hash", + MaxWorkerNodesTracked, + MaxWorkerNodesTracked, + &sharedWorkerNodeConnStatsHashInfo, + (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE)); - /* create (hostname, port, database) -> [counter] */ - HASHCTL sharedWorkerNodeDatabaseConnStatsHashInfo; - memset(&sharedWorkerNodeDatabaseConnStatsHashInfo, 0, sizeof(sharedWorkerNodeDatabaseConnStatsHashInfo)); - sharedWorkerNodeDatabaseConnStatsHashInfo.keysize = sizeof(SharedWorkerNodeDatabaseConnStatsHashKey); - sharedWorkerNodeDatabaseConnStatsHashInfo.entrysize = sizeof(SharedWorkerNodeDatabaseConnStatsHashEntry); - sharedWorkerNodeDatabaseConnStatsHashInfo.hash = SharedWorkerNodeDatabaseHashHash; - sharedWorkerNodeDatabaseConnStatsHashInfo.match = SharedWorkerNodeDatabaseHashCompare; + /* create (hostname, port, database) -> [counter] */ + HASHCTL sharedWorkerNodeDatabaseConnStatsHashInfo; + memset(&sharedWorkerNodeDatabaseConnStatsHashInfo, 0, + sizeof(sharedWorkerNodeDatabaseConnStatsHashInfo)); + sharedWorkerNodeDatabaseConnStatsHashInfo.keysize = + sizeof(SharedWorkerNodeDatabaseConnStatsHashKey); + sharedWorkerNodeDatabaseConnStatsHashInfo.entrysize = + sizeof(SharedWorkerNodeDatabaseConnStatsHashEntry); + sharedWorkerNodeDatabaseConnStatsHashInfo.hash = SharedWorkerNodeDatabaseHashHash; + sharedWorkerNodeDatabaseConnStatsHashInfo.match = SharedWorkerNodeDatabaseHashCompare; - int sharedWorkerNodeDatabaseConnStatsHashSize = MaxWorkerNodesTracked * MaxDatabasesPerWorkerNodesTracked; - SharedWorkerNodeDatabaseConnStatsHash = - ShmemInitHash("Shared Conn Per Database. Stats Hash", - sharedWorkerNodeDatabaseConnStatsHashSize, - sharedWorkerNodeDatabaseConnStatsHashSize, - &sharedWorkerNodeDatabaseConnStatsHashInfo, - (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE)); + int sharedWorkerNodeDatabaseConnStatsHashSize = MaxWorkerNodesTracked * + MaxDatabasesPerWorkerNodesTracked; + SharedWorkerNodeDatabaseConnStatsHash = + ShmemInitHash("Shared Conn Per Database. 
Stats Hash", + sharedWorkerNodeDatabaseConnStatsHashSize, + sharedWorkerNodeDatabaseConnStatsHashSize, + &sharedWorkerNodeDatabaseConnStatsHashInfo, + (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE)); LWLockRelease(AddinShmemInitLock); - Assert(SharedWorkerNodeConnStatsHash != NULL); - Assert(SharedWorkerNodeDatabaseConnStatsHash != NULL); + Assert(SharedWorkerNodeConnStatsHash != NULL); + Assert(SharedWorkerNodeDatabaseConnStatsHash != NULL); Assert(ConnectionStatsSharedState->sharedConnectionHashTrancheId != 0); if (prev_shmem_startup_hook != NULL) @@ -889,35 +918,39 @@ ShouldWaitForConnection(int currentConnectionCount) return false; } -static SharedWorkerNodeConnStatsHashKey PrepareWorkerNodeHashKey(const char *hostname, int port) + +static SharedWorkerNodeConnStatsHashKey +PrepareWorkerNodeHashKey(const char *hostname, int port) { - SharedWorkerNodeConnStatsHashKey key; - strlcpy(key.hostname, hostname, MAX_NODE_LENGTH); - if (strlen(hostname) > MAX_NODE_LENGTH) - { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("hostname exceeds the maximum length of %d", - MAX_NODE_LENGTH))); - } - key.port = port; - return key; + SharedWorkerNodeConnStatsHashKey key; + strlcpy(key.hostname, hostname, MAX_NODE_LENGTH); + if (strlen(hostname) > MAX_NODE_LENGTH) + { + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("hostname exceeds the maximum length of %d", + MAX_NODE_LENGTH))); + } + key.port = port; + return key; } -static SharedWorkerNodeDatabaseConnStatsHashKey PrepareWorkerNodeDatabaseHashKey(const char *hostname, - int port, - Oid database) + +static SharedWorkerNodeDatabaseConnStatsHashKey +PrepareWorkerNodeDatabaseHashKey(const char *hostname, + int port, + Oid database) { - SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey; - workerNodeDatabaseKey.workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); - workerNodeDatabaseKey.database = database; - return workerNodeDatabaseKey; + SharedWorkerNodeDatabaseConnStatsHashKey workerNodeDatabaseKey; + workerNodeDatabaseKey.workerNodeKey = PrepareWorkerNodeHashKey(hostname, port); + workerNodeDatabaseKey.database = database; + return workerNodeDatabaseKey; } static uint32 SharedConnectionHashHash(const void *key, Size keysize) { - SharedWorkerNodeConnStatsHashKey *entry = (SharedWorkerNodeConnStatsHashKey *) key; + SharedWorkerNodeConnStatsHashKey *entry = (SharedWorkerNodeConnStatsHashKey *) key; uint32 hash = string_hash(entry->hostname, NAMEDATALEN); hash = hash_combine(hash, hash_uint32(entry->port)); @@ -925,47 +958,56 @@ SharedConnectionHashHash(const void *key, Size keysize) return hash; } + static uint32 SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize) { - SharedWorkerNodeDatabaseConnStatsHashKey *entry = (SharedWorkerNodeDatabaseConnStatsHashKey *) key; - uint32 hash = string_hash(entry->workerNodeKey.hostname, NAMEDATALEN); - hash = hash_combine(hash, hash_uint32(entry->workerNodeKey.port)); - hash = hash_combine(hash, hash_uint32(entry->database)); + SharedWorkerNodeDatabaseConnStatsHashKey *entry = + (SharedWorkerNodeDatabaseConnStatsHashKey *) key; + uint32 hash = string_hash(entry->workerNodeKey.hostname, NAMEDATALEN); + hash = hash_combine(hash, hash_uint32(entry->workerNodeKey.port)); + hash = hash_combine(hash, hash_uint32(entry->database)); - return hash; + return hash; } static int SharedConnectionHashCompare(const void *a, const void *b, Size keysize) { - SharedWorkerNodeConnStatsHashKey *ca = (SharedWorkerNodeConnStatsHashKey *) a; - 
SharedWorkerNodeConnStatsHashKey *cb = (SharedWorkerNodeConnStatsHashKey *) b; + SharedWorkerNodeConnStatsHashKey *ca = (SharedWorkerNodeConnStatsHashKey *) a; + SharedWorkerNodeConnStatsHashKey *cb = (SharedWorkerNodeConnStatsHashKey *) b; - return strncmp(ca->hostname, cb->hostname, MAX_NODE_LENGTH) != 0 || - ca->port != cb->port; + return strncmp(ca->hostname, cb->hostname, MAX_NODE_LENGTH) != 0 || + ca->port != cb->port; } + static int SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize) { - SharedWorkerNodeDatabaseConnStatsHashKey *ca = (SharedWorkerNodeDatabaseConnStatsHashKey *) a; - SharedWorkerNodeDatabaseConnStatsHashKey *cb = (SharedWorkerNodeDatabaseConnStatsHashKey *) b; + SharedWorkerNodeDatabaseConnStatsHashKey *ca = + (SharedWorkerNodeDatabaseConnStatsHashKey *) a; + SharedWorkerNodeDatabaseConnStatsHashKey *cb = + (SharedWorkerNodeDatabaseConnStatsHashKey *) b; - return strncmp(ca->workerNodeKey.hostname, cb->workerNodeKey.hostname, MAX_NODE_LENGTH) != 0 || - ca->workerNodeKey.port != cb->workerNodeKey.port || - ca->database != cb->database; + return strncmp(ca->workerNodeKey.hostname, cb->workerNodeKey.hostname, + MAX_NODE_LENGTH) != 0 || + ca->workerNodeKey.port != cb->workerNodeKey.port || + ca->database != cb->database; } -static bool isConnectionThrottlingDisabled(uint32 externalFlags) + +static bool +isConnectionThrottlingDisabled(uint32 externalFlags) { - bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION; - /* - * Do not call Get*PoolSize() functions here, since it may read from - * the catalog and we may be in the process exit handler. - */ - return maintenanceConnection - ? MaxMaintenanceSharedPoolSize <= 0 - : MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING; + bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION; + + /* + * Do not call Get*PoolSize() functions here, since it may read from + * the catalog and we may be in the process exit handler. + */ + return maintenanceConnection + ? MaxMaintenanceSharedPoolSize <= 0 + : MaxSharedPoolSize == DISABLE_CONNECTION_THROTTLING; } diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index d0f8dfbd6..f06acf009 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1993,17 +1993,17 @@ RegisterCitusConfigVariables(void) GUC_SUPERUSER_ONLY, NULL, NULL, MaxSharedPoolSizeGucShowHook); - DefineCustomIntVariable( - "citus.max_maintenance_shared_pool_size", - gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " - "for maintenance operations only." - "Setting it to 0 or -1 disables maintenance connection throttling."), - NULL, - &MaxMaintenanceSharedPoolSize, - 5, -1, INT_MAX, - PGC_SIGHUP, - GUC_SUPERUSER_ONLY, - NULL, NULL, NULL); + DefineCustomIntVariable( + "citus.max_maintenance_shared_pool_size", + gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " + "for maintenance operations only." 
+ "Setting it to 0 or -1 disables maintenance connection throttling."), + NULL, + &MaxMaintenanceSharedPoolSize, + 5, -1, INT_MAX, + PGC_SIGHUP, + GUC_SUPERUSER_ONLY, + NULL, NULL, NULL); DefineCustomIntVariable( "citus.max_worker_nodes_tracked", @@ -2024,19 +2024,19 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); - DefineCustomIntVariable( - "citus.max_databases_per_worker_tracked", - gettext_noop("Sets the amount of databases per worker tracked."), - gettext_noop( - "This configuration value complements the citus.max_worker_nodes_tracked." - "It should be used when there are more then one database with Citus in cluster," - "and, effectively, limits the size of the hash table with connections per worker + database." - "Currently, it does not affect the connection management logic and serves only statistical purposes."), - &MaxDatabasesPerWorkerNodesTracked, - 1, 1, INT_MAX, - PGC_POSTMASTER, - GUC_STANDARD, - NULL, NULL, NULL); + DefineCustomIntVariable( + "citus.max_databases_per_worker_tracked", + gettext_noop("Sets the amount of databases per worker tracked."), + gettext_noop( + "This configuration value complements the citus.max_worker_nodes_tracked." + "It should be used when there are more then one database with Citus in cluster," + "and, effectively, limits the size of the hash table with connections per worker + database." + "Currently, it does not affect the connection management logic and serves only statistical purposes."), + &MaxDatabasesPerWorkerNodesTracked, + 1, 1, INT_MAX, + PGC_POSTMASTER, + GUC_STANDARD, + NULL, NULL, NULL); DefineCustomIntVariable( "citus.metadata_sync_interval", diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 829f19850..5f868f548 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -1071,6 +1071,7 @@ citus_pid_for_gpid(PG_FUNCTION_ARGS) PG_RETURN_INT32(ExtractProcessIdFromGlobalPID(globalPID)); } + /* * ExtractGlobalPID extracts the global process id from the application name and returns it * if the application name is not compatible with Citus' application names returns 0. diff --git a/src/backend/distributed/transaction/transaction_recovery.c b/src/backend/distributed/transaction/transaction_recovery.c index bc960796d..8054d2927 100644 --- a/src/backend/distributed/transaction/transaction_recovery.c +++ b/src/backend/distributed/transaction/transaction_recovery.c @@ -159,7 +159,7 @@ RecoverWorkerTransactions(WorkerNode *workerNode) bool recoveryFailed = false; - int connectionFlags = WAIT_FOR_CONNECTION | REQUIRE_MAINTENANCE_CONNECTION; + int connectionFlags = WAIT_FOR_CONNECTION | REQUIRE_MAINTENANCE_CONNECTION; MultiConnection *connection = GetNodeConnection(connectionFlags, nodeName, nodePort); if (connection->pgConn == NULL || PQstatus(connection->pgConn) != CONNECTION_OK) { diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 117507c15..f329ae001 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -706,35 +706,35 @@ CitusMaintenanceDaemonMain(Datum main_arg) timeout = Min(timeout, Recover2PCInterval); } - /* - * Execute only on the maintenance database, if it configured, otherwise run from every daemon. 
- * The config value -1 disables the distributed deadlock detection - */ - if (DistributedDeadlockDetectionTimeoutFactor != -1.0) + /* + * Execute only on the maintenance database, if it configured, otherwise run from every daemon. + * The config value -1 disables the distributed deadlock detection + */ + if (DistributedDeadlockDetectionTimeoutFactor != -1.0) { - double deadlockTimeout = - DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout; + double deadlockTimeout = + DistributedDeadlockDetectionTimeoutFactor * (double) DeadlockTimeout; - InvalidateMetadataSystemCache(); - StartTransactionCommand(); + InvalidateMetadataSystemCache(); + StartTransactionCommand(); - /* - * We skip the deadlock detection if citus extension - * is not accessible. - * - * Similarly, we skip to run the deadlock checks if - * there exists any version mismatch or the extension - * is not fully created yet. - */ - if (!LockCitusExtension()) - { - ereport(DEBUG1, (errmsg("could not lock the citus extension, " - "skipping deadlock detection"))); - } - else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) - { - foundDeadlock = CheckForDistributedDeadlocks(); - } + /* + * We skip the deadlock detection if citus extension + * is not accessible. + * + * Similarly, we skip to run the deadlock checks if + * there exists any version mismatch or the extension + * is not fully created yet. + */ + if (!LockCitusExtension()) + { + ereport(DEBUG1, (errmsg("could not lock the citus extension, " + "skipping deadlock detection"))); + } + else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) + { + foundDeadlock = CheckForDistributedDeadlocks(); + } CommitTransactionCommand(); @@ -1233,4 +1233,3 @@ MetadataSyncTriggeredCheckAndReset(MaintenanceDaemonDBData *dbData) return metadataSyncTriggered; } - diff --git a/src/include/distributed/connection_management.h b/src/include/distributed/connection_management.h index 91e1e9222..cc2fa3403 100644 --- a/src/include/distributed/connection_management.h +++ b/src/include/distributed/connection_management.h @@ -125,12 +125,12 @@ enum MultiConnectionMode */ REQUIRE_REPLICATION_CONNECTION_PARAM = 1 << 8, - /* - * This flag specifies that connection is required for maintenance operations, e.g. - * transaction recovery, distributed deadlock detection. Such connections have - * a reserved quota of the MaxSharedPoolSize. - */ - REQUIRE_MAINTENANCE_CONNECTION = 1 << 9 + /* + * This flag specifies that connection is required for maintenance operations, e.g. + * transaction recovery, distributed deadlock detection. Such connections have + * a reserved quota of the MaxSharedPoolSize. 
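+ * The size of that quota is controlled by the citus.max_maintenance_shared_pool_size GUC.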
+ */ + REQUIRE_MAINTENANCE_CONNECTION = 1 << 9 }; @@ -230,8 +230,8 @@ typedef struct MultiConnection /* replication option */ bool requiresReplication; - /* See REQUIRE_MAINTENANCE_CONNECTION */ - bool useForMaintenanceOperations; + /* See REQUIRE_MAINTENANCE_CONNECTION */ + bool useForMaintenanceOperations; MultiConnectionStructInitializationState initializationState; } MultiConnection; diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h index d5901014a..ea8346769 100644 --- a/src/include/distributed/shared_connection_stats.h +++ b/src/include/distributed/shared_connection_stats.h @@ -18,10 +18,10 @@ enum SharedPoolCounterMode { - /* - * Use this flag to reserve a connection from a maintenance quota - */ - MAINTENANCE_CONNECTION = 1 << 0 + /* + * Use this flag to reserve a connection from a maintenance quota + */ + MAINTENANCE_CONNECTION = 1 << 0 }; extern int MaxSharedPoolSize; @@ -39,11 +39,14 @@ extern int GetMaxClientConnections(void); extern int GetMaxSharedPoolSize(void); extern int GetMaxMaintenanceSharedPoolSize(void); extern int GetLocalSharedPoolSize(void); -extern bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port); +extern bool TryToIncrementSharedConnectionCounter(uint32 flags, const char *hostname, + int port); extern void WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port); -extern void DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int port); -extern void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, int port); -extern int AdaptiveConnectionManagementFlag(bool connectToLocalNode, int - activeConnectionCount); +extern void DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, + int port); +extern void IncrementSharedConnectionCounter(uint32 flags, const char *hostname, + int port); +extern int AdaptiveConnectionManagementFlag(bool connectToLocalNode, + int activeConnectionCount); #endif /* SHARED_CONNECTION_STATS_H */ From 728ce2b36eff021d50a7bebe9e26e83010f24cd3 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Tue, 30 Jan 2024 21:50:49 +0100 Subject: [PATCH 17/41] Fix tests --- .../regress/expected/multi_maintenance_multiple_databases.out | 4 ++-- src/test/regress/sql/multi_maintenance_multiple_databases.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index 5013c3a84..fb5ea712d 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -174,7 +174,7 @@ FROM pg_database, dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, (SELECT setting::int FROM pg_settings WHERE name = 'port')), $statement$ - SELECT * + SELECT groupid, gid FROM pg_dist_transaction WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%' @@ -276,7 +276,7 @@ FROM pg_database, dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, (SELECT setting::int FROM pg_settings WHERE name = 'port')), $statement$ - SELECT * + SELECT groupid, gid FROM pg_dist_transaction WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%' diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index 
20a6aa132..11b56a999 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -168,7 +168,7 @@ FROM pg_database, dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, (SELECT setting::int FROM pg_settings WHERE name = 'port')), $statement$ - SELECT * + SELECT groupid, gid FROM pg_dist_transaction WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%' @@ -235,7 +235,7 @@ FROM pg_database, dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, (SELECT setting::int FROM pg_settings WHERE name = 'port')), $statement$ - SELECT * + SELECT groupid, gid FROM pg_dist_transaction WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%' From 5b8dcb154b3faafa52bb96a7971fa6c5f36e8052 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 1 Feb 2024 12:08:57 +0100 Subject: [PATCH 18/41] Fix failure_single_select --- src/test/regress/expected/failure_single_select.out | 8 ++++++-- src/test/regress/sql/failure_single_select.sql | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/failure_single_select.out b/src/test/regress/expected/failure_single_select.out index 586dd4756..76edf6e17 100644 --- a/src/test/regress/expected/failure_single_select.out +++ b/src/test/regress/expected/failure_single_select.out @@ -144,6 +144,7 @@ INSERT INTO select_test VALUES (3, 'even more data'); SELECT * FROM select_test WHERE key = 3; ERROR: connection to the remote node postgres@localhost:xxxxx failed with the following error: connection not open COMMIT; +-- Maintenance connections are not cached SELECT citus.mitmproxy('conn.onQuery(query="SELECT.*pg_prepared_xacts").after(2).kill()'); mitmproxy --------------------------------------------------------------------- @@ -157,8 +158,11 @@ SELECT recover_prepared_transactions(); (1 row) SELECT recover_prepared_transactions(); -ERROR: connection not open -CONTEXT: while executing command on localhost:xxxxx + recover_prepared_transactions +--------------------------------------------------------------------- + 0 +(1 row) + -- bug from https://github.com/citusdata/citus/issues/1926 SET citus.max_cached_conns_per_worker TO 0; -- purge cache DROP TABLE select_test; diff --git a/src/test/regress/sql/failure_single_select.sql b/src/test/regress/sql/failure_single_select.sql index c8218c950..80e3f9ca8 100644 --- a/src/test/regress/sql/failure_single_select.sql +++ b/src/test/regress/sql/failure_single_select.sql @@ -77,6 +77,7 @@ INSERT INTO select_test VALUES (3, 'even more data'); SELECT * FROM select_test WHERE key = 3; COMMIT; +-- Maintenance connections are not cached SELECT citus.mitmproxy('conn.onQuery(query="SELECT.*pg_prepared_xacts").after(2).kill()'); SELECT recover_prepared_transactions(); SELECT recover_prepared_transactions(); From f1b550948f6244aefed6ec83e200086abaeedcf2 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 1 Feb 2024 13:15:36 +0100 Subject: [PATCH 19/41] Fix pg16 build --- src/include/pg_version_compat.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/include/pg_version_compat.h b/src/include/pg_version_compat.h index 665cd30c2..255ff804c 100644 --- a/src/include/pg_version_compat.h +++ b/src/include/pg_version_compat.h @@ -48,8 +48,6 @@ get_guc_variables_compat(int *gucCount) #define pgstat_fetch_stat_local_beentry(a) pgstat_get_local_beentry_by_index(a) -#define have_createdb_privilege() have_createdb_privilege() - #else #include 
"miscadmin.h" From d3f358ed7766b6e8d641fa669fde7febdfbfa019 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 1 Feb 2024 15:10:57 +0100 Subject: [PATCH 20/41] - Address review - run citus_indent --- .../distributed/connection/connection_management.c | 6 ------ .../connection/shared_connection_stats.c | 13 +++++-------- src/backend/distributed/utils/maintenanced.c | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index fa7f9f3bf..09e50bfa7 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -42,12 +42,6 @@ #include "distributed/time_constants.h" #include "distributed/version_compat.h" #include "distributed/worker_log_messages.h" -#include "mb/pg_wchar.h" -#include "pg_config.h" -#include "portability/instr_time.h" -#include "storage/ipc.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" int NodeConnectionTimeout = 30000; diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 69ad0166a..4ad3f287e 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -13,6 +13,7 @@ #include "postgres.h" #include "libpq-fe.h" +#include "math.h" #include "miscadmin.h" #include "pgstat.h" @@ -32,10 +33,8 @@ #include "distributed/placement_connection.h" #include "distributed/shared_connection_stats.h" #include "distributed/time_constants.h" -#include "distributed/worker_manager.h" #include "distributed/tuplestore.h" #include "distributed/worker_manager.h" -#include "math.h" #define REMOTE_CONNECTION_STATS_COLUMNS 4 @@ -163,7 +162,6 @@ static void DecrementSharedConnectionCounterInternal(uint32 externalFlags, const PG_FUNCTION_INFO_V1(citus_remote_connection_stats); - /* * citus_remote_connection_stats returns all the avaliable information about all * the remote connections (a.k.a., connections to remote nodes). 
@@ -964,8 +962,7 @@ SharedWorkerNodeDatabaseHashHash(const void *key, Size keysize) { SharedWorkerNodeDatabaseConnStatsHashKey *entry = (SharedWorkerNodeDatabaseConnStatsHashKey *) key; - uint32 hash = string_hash(entry->workerNodeKey.hostname, NAMEDATALEN); - hash = hash_combine(hash, hash_uint32(entry->workerNodeKey.port)); + uint32 hash = SharedConnectionHashHash(&(entry->workerNodeKey), keysize); hash = hash_combine(hash, hash_uint32(entry->database)); return hash; @@ -991,9 +988,9 @@ SharedWorkerNodeDatabaseHashCompare(const void *a, const void *b, Size keysize) SharedWorkerNodeDatabaseConnStatsHashKey *cb = (SharedWorkerNodeDatabaseConnStatsHashKey *) b; - return strncmp(ca->workerNodeKey.hostname, cb->workerNodeKey.hostname, - MAX_NODE_LENGTH) != 0 || - ca->workerNodeKey.port != cb->workerNodeKey.port || + int sharedConnectionHashCompare = + SharedConnectionHashCompare(&(ca->workerNodeKey), &(cb->workerNodeKey), keysize); + return sharedConnectionHashCompare || ca->database != cb->database; } diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index f329ae001..785608729 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -49,6 +49,7 @@ #include "distributed/background_jobs.h" #include "distributed/citus_safe_lib.h" +#include "distributed/connection_management.h" #include "distributed/coordinator_protocol.h" #include "distributed/distributed_deadlock_detection.h" #include "distributed/maintenanced.h" @@ -60,7 +61,6 @@ #include "distributed/statistics_collection.h" #include "distributed/transaction_recovery.h" #include "distributed/version_compat.h" -#include "distributed/connection_management.h" /* * Shared memory data for all maintenance workers. From 62c5090fd85582892a105fa4cb0669c510ec7d19 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 7 Feb 2024 09:20:43 +0100 Subject: [PATCH 21/41] Fix gucs order --- src/backend/distributed/shared_library_init.c | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index f06acf009..c672c26a3 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1933,6 +1933,20 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); + DefineCustomIntVariable( + "citus.max_databases_per_worker_tracked", + gettext_noop("Sets the amount of databases per worker tracked."), + gettext_noop( + "This configuration value complements the citus.max_worker_nodes_tracked." + "It should be used when there are more then one database with Citus in cluster," + "and, effectively, limits the size of the hash table with connections per worker + database." + "Currently, it does not affect the connection management logic and serves only statistical purposes."), + &MaxDatabasesPerWorkerNodesTracked, + 1, 1, INT_MAX, + PGC_POSTMASTER, + GUC_STANDARD, + NULL, NULL, NULL); + DefineCustomIntVariable( "citus.max_high_priority_background_processes", gettext_noop("Sets the maximum number of background processes " @@ -1958,6 +1972,18 @@ RegisterCitusConfigVariables(void) GUC_UNIT_KB | GUC_STANDARD, NULL, NULL, NULL); + DefineCustomIntVariable( + "citus.max_maintenance_shared_pool_size", + gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " + "for maintenance operations only." 
+ "Setting it to 0 or -1 disables maintenance connection throttling."), + NULL, + &MaxMaintenanceSharedPoolSize, + 5, -1, INT_MAX, + PGC_SIGHUP, + GUC_SUPERUSER_ONLY, + NULL, NULL, NULL); + DefineCustomIntVariable( "citus.max_matview_size_to_auto_recreate", gettext_noop("Sets the maximum size of materialized views in MB to " @@ -1993,18 +2019,6 @@ RegisterCitusConfigVariables(void) GUC_SUPERUSER_ONLY, NULL, NULL, MaxSharedPoolSizeGucShowHook); - DefineCustomIntVariable( - "citus.max_maintenance_shared_pool_size", - gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " - "for maintenance operations only." - "Setting it to 0 or -1 disables maintenance connection throttling."), - NULL, - &MaxMaintenanceSharedPoolSize, - 5, -1, INT_MAX, - PGC_SIGHUP, - GUC_SUPERUSER_ONLY, - NULL, NULL, NULL); - DefineCustomIntVariable( "citus.max_worker_nodes_tracked", gettext_noop("Sets the maximum number of worker nodes that are tracked."), @@ -2024,20 +2038,6 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); - DefineCustomIntVariable( - "citus.max_databases_per_worker_tracked", - gettext_noop("Sets the amount of databases per worker tracked."), - gettext_noop( - "This configuration value complements the citus.max_worker_nodes_tracked." - "It should be used when there are more then one database with Citus in cluster," - "and, effectively, limits the size of the hash table with connections per worker + database." - "Currently, it does not affect the connection management logic and serves only statistical purposes."), - &MaxDatabasesPerWorkerNodesTracked, - 1, 1, INT_MAX, - PGC_POSTMASTER, - GUC_STANDARD, - NULL, NULL, NULL); - DefineCustomIntVariable( "citus.metadata_sync_interval", gettext_noop("Sets the time to wait between metadata syncs."), From 37198369cdd14ecb70e5f28cea9062099bde184d Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 7 Feb 2024 09:54:22 +0100 Subject: [PATCH 22/41] Run citus_indent --- src/backend/distributed/shared_library_init.c | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index c672c26a3..2a9b73f20 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1934,18 +1934,18 @@ RegisterCitusConfigVariables(void) NULL, NULL, NULL); DefineCustomIntVariable( - "citus.max_databases_per_worker_tracked", - gettext_noop("Sets the amount of databases per worker tracked."), - gettext_noop( - "This configuration value complements the citus.max_worker_nodes_tracked." - "It should be used when there are more then one database with Citus in cluster," - "and, effectively, limits the size of the hash table with connections per worker + database." - "Currently, it does not affect the connection management logic and serves only statistical purposes."), - &MaxDatabasesPerWorkerNodesTracked, - 1, 1, INT_MAX, - PGC_POSTMASTER, - GUC_STANDARD, - NULL, NULL, NULL); + "citus.max_databases_per_worker_tracked", + gettext_noop("Sets the amount of databases per worker tracked."), + gettext_noop( + "This configuration value complements the citus.max_worker_nodes_tracked." + "It should be used when there are more then one database with Citus in cluster," + "and, effectively, limits the size of the hash table with connections per worker + database." 
+ "Currently, it does not affect the connection management logic and serves only statistical purposes."), + &MaxDatabasesPerWorkerNodesTracked, + 1, 1, INT_MAX, + PGC_POSTMASTER, + GUC_STANDARD, + NULL, NULL, NULL); DefineCustomIntVariable( "citus.max_high_priority_background_processes", @@ -1973,16 +1973,16 @@ RegisterCitusConfigVariables(void) NULL, NULL, NULL); DefineCustomIntVariable( - "citus.max_maintenance_shared_pool_size", - gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " - "for maintenance operations only." - "Setting it to 0 or -1 disables maintenance connection throttling."), - NULL, - &MaxMaintenanceSharedPoolSize, - 5, -1, INT_MAX, - PGC_SIGHUP, - GUC_SUPERUSER_ONLY, - NULL, NULL, NULL); + "citus.max_maintenance_shared_pool_size", + gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " + "for maintenance operations only." + "Setting it to 0 or -1 disables maintenance connection throttling."), + NULL, + &MaxMaintenanceSharedPoolSize, + 5, -1, INT_MAX, + PGC_SIGHUP, + GUC_SUPERUSER_ONLY, + NULL, NULL, NULL); DefineCustomIntVariable( "citus.max_matview_size_to_auto_recreate", From 5b2c297ca4f6e4bdb90a651090314a53a5d034c0 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Mon, 12 Feb 2024 12:05:52 +0100 Subject: [PATCH 23/41] Fix flakiness in multi_maintenance_multiple_databases.sql --- .../multi_maintenance_multiple_databases.out | 47 ++++++++++--------- .../multi_maintenance_multiple_databases.sql | 38 ++++++++++----- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index fb5ea712d..9ce129838 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -256,12 +256,31 @@ WHERE state = 'idle' \c - - - :master_port -- Let maintenance do it's work... -SELECT pg_sleep_for('10 seconds'::interval); - pg_sleep_for ---------------------------------------------------------------------- - -(1 row) - +DO +$$ + BEGIN + FOR i IN 0 .. 
300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + PERFORM pg_stat_clear_snapshot(); + PERFORM * FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + IF (SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test + FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT groupid, gid + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) + WHERE datname LIKE 'db%') THEN + EXIT; + END IF; + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; -- Verify maintenance result SELECT count(*) = 0 AS too_many_clients_test FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) @@ -271,22 +290,6 @@ WHERE log_line LIKE '%sorry, too many clients already%'; t (1 row) -SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test -FROM pg_database, - dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, - (SELECT setting::int FROM pg_settings WHERE name = 'port')), - $statement$ - SELECT groupid, gid - FROM pg_dist_transaction - WHERE gid LIKE 'citus_0_1234_4_0_%' - OR gid LIKE 'citus_0_should_be_forgotten_%' - $statement$) AS t(groupid integer, gid text) -WHERE datname LIKE 'db%'; - pg_dist_transaction_after_recovery_coordinator_test ---------------------------------------------------------------------- - t -(1 row) - SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test FROM pg_stat_activity WHERE state = 'idle' diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index 11b56a999..d67f58296 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -222,7 +222,31 @@ WHERE state = 'idle' -- Let maintenance do it's work... -SELECT pg_sleep_for('10 seconds'::interval); +DO +$$ + BEGIN + FOR i IN 0 .. 
300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + PERFORM pg_stat_clear_snapshot(); + PERFORM * FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; + IF (SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test + FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT groupid, gid + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) + WHERE datname LIKE 'db%') THEN + EXIT; + END IF; + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; -- Verify maintenance result @@ -230,18 +254,6 @@ SELECT count(*) = 0 AS too_many_clients_test FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) WHERE log_line LIKE '%sorry, too many clients already%'; -SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test -FROM pg_database, - dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, - (SELECT setting::int FROM pg_settings WHERE name = 'port')), - $statement$ - SELECT groupid, gid - FROM pg_dist_transaction - WHERE gid LIKE 'citus_0_1234_4_0_%' - OR gid LIKE 'citus_0_should_be_forgotten_%' - $statement$) AS t(groupid integer, gid text) -WHERE datname LIKE 'db%'; - SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test FROM pg_stat_activity WHERE state = 'idle' From 10f4777aaf089f5cf449df783c5d5d4a3e93f49a Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Mon, 12 Feb 2024 13:48:44 +0100 Subject: [PATCH 24/41] Fix flakiness in multi_maintenance_multiple_databases.sql x2 --- .../multi_maintenance_multiple_databases.out | 120 ++++++++++-------- .../multi_maintenance_multiple_databases.sql | 110 ++++++++++------ 2 files changed, 137 insertions(+), 93 deletions(-) diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index 9ce129838..04cb9c1a0 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -255,32 +255,6 @@ WHERE state = 'idle' (1 row) \c - - - :master_port --- Let maintenance do it's work... -DO -$$ - BEGIN - FOR i IN 0 .. 
300 - LOOP - IF i = 300 THEN RAISE 'Waited too long'; END IF; - PERFORM pg_stat_clear_snapshot(); - PERFORM * FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; - IF (SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test - FROM pg_database, - dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, - (SELECT setting::int FROM pg_settings WHERE name = 'port')), - $statement$ - SELECT groupid, gid - FROM pg_dist_transaction - WHERE gid LIKE 'citus_0_1234_4_0_%' - OR gid LIKE 'citus_0_should_be_forgotten_%' - $statement$) AS t(groupid integer, gid text) - WHERE datname LIKE 'db%') THEN - EXIT; - END IF; - PERFORM pg_sleep_for('1 SECOND'::interval); - END LOOP; - END -$$; -- Verify maintenance result SELECT count(*) = 0 AS too_many_clients_test FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) @@ -290,15 +264,43 @@ WHERE log_line LIKE '%sorry, too many clients already%'; t (1 row) -SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test -FROM pg_stat_activity -WHERE state = 'idle' - AND now() - backend_start > '5 seconds'::interval; - cached_connections_after_recovery_coordinator_test ---------------------------------------------------------------------- - t -(1 row) +DO +$$ + DECLARE + pg_dist_transaction_after_recovery_coordinator_test boolean; + cached_connections_after_recovery_coordinator_test boolean; + BEGIN + FOR i IN 0 .. 300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + SELECT count(*) = 0 + FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT groupid, gid + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) + WHERE datname LIKE 'db%' + INTO pg_dist_transaction_after_recovery_coordinator_test; + SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test + FROM pg_stat_activity + WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval + INTO cached_connections_after_recovery_coordinator_test; + + IF (pg_dist_transaction_after_recovery_coordinator_test + AND cached_connections_after_recovery_coordinator_test) THEN + EXIT; + END IF; + + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; \c - - - :worker_1_port SELECT count(*) = 0 AS too_many_clients_test FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) @@ -317,15 +319,22 @@ WHERE gid LIKE 'citus_0_1234_4_0_%' t (1 row) -SELECT count(*) = 0 AS cached_connections_after_recovery_worker_1_test -FROM pg_stat_activity -WHERE state = 'idle' - AND now() - backend_start > '5 seconds'::interval; - cached_connections_after_recovery_worker_1_test ---------------------------------------------------------------------- - t -(1 row) - +DO +$$ + BEGIN + FOR i IN 0 .. 
300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + IF (SELECT count(*) = 0 AS cached_connections_after_recovery_worker_1_test + FROM pg_stat_activity + WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval) THEN + EXIT; + END IF; + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; \c - - - :worker_2_port SELECT count(*) = 0 AS too_many_clients_test FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) @@ -344,15 +353,22 @@ WHERE gid LIKE 'citus_0_1234_4_0_%' t (1 row) -SELECT count(*) = 0 AS cached_connections_after_recovery_worker_2_test -FROM pg_stat_activity -WHERE state = 'idle' - AND now() - backend_start > '5 seconds'::interval; - cached_connections_after_recovery_worker_2_test ---------------------------------------------------------------------- - t -(1 row) - +DO +$$ + BEGIN + FOR i IN 0 .. 300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + IF (SELECT count(*) = 0 AS cached_connections_after_recovery_worker_2_test + FROM pg_stat_activity + WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval) THEN + EXIT; + END IF; + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; -- Cleanup \c - - - :master_port SELECT $definition$ diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index d67f58296..4c29c6bc9 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -220,44 +220,49 @@ WHERE state = 'idle' \c - - - :master_port --- Let maintenance do it's work... - -DO -$$ - BEGIN - FOR i IN 0 .. 300 - LOOP - IF i = 300 THEN RAISE 'Waited too long'; END IF; - PERFORM pg_stat_clear_snapshot(); - PERFORM * FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon'; - IF (SELECT count(*) = 0 AS pg_dist_transaction_after_recovery_coordinator_test - FROM pg_database, - dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, - (SELECT setting::int FROM pg_settings WHERE name = 'port')), - $statement$ - SELECT groupid, gid - FROM pg_dist_transaction - WHERE gid LIKE 'citus_0_1234_4_0_%' - OR gid LIKE 'citus_0_should_be_forgotten_%' - $statement$) AS t(groupid integer, gid text) - WHERE datname LIKE 'db%') THEN - EXIT; - END IF; - PERFORM pg_sleep_for('1 SECOND'::interval); - END LOOP; - END -$$; - -- Verify maintenance result SELECT count(*) = 0 AS too_many_clients_test FROM regexp_split_to_table(pg_read_file('../log/postmaster.log'), E'\n') AS t(log_line) WHERE log_line LIKE '%sorry, too many clients already%'; -SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test -FROM pg_stat_activity -WHERE state = 'idle' - AND now() - backend_start > '5 seconds'::interval; +DO +$$ + DECLARE + pg_dist_transaction_after_recovery_coordinator_test boolean; + cached_connections_after_recovery_coordinator_test boolean; + BEGIN + FOR i IN 0 .. 
300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + SELECT count(*) = 0 + FROM pg_database, + dblink(format('dbname=%s host=localhost port=%s user=postgres', datname, + (SELECT setting::int FROM pg_settings WHERE name = 'port')), + $statement$ + SELECT groupid, gid + FROM pg_dist_transaction + WHERE gid LIKE 'citus_0_1234_4_0_%' + OR gid LIKE 'citus_0_should_be_forgotten_%' + $statement$) AS t(groupid integer, gid text) + WHERE datname LIKE 'db%' + INTO pg_dist_transaction_after_recovery_coordinator_test; + + SELECT count(*) = 0 AS cached_connections_after_recovery_coordinator_test + FROM pg_stat_activity + WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval + INTO cached_connections_after_recovery_coordinator_test; + + IF (pg_dist_transaction_after_recovery_coordinator_test + AND cached_connections_after_recovery_coordinator_test) THEN + EXIT; + END IF; + + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; \c - - - :worker_1_port @@ -270,10 +275,22 @@ FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%'; -SELECT count(*) = 0 AS cached_connections_after_recovery_worker_1_test -FROM pg_stat_activity -WHERE state = 'idle' - AND now() - backend_start > '5 seconds'::interval; +DO +$$ + BEGIN + FOR i IN 0 .. 300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + IF (SELECT count(*) = 0 AS cached_connections_after_recovery_worker_1_test + FROM pg_stat_activity + WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval) THEN + EXIT; + END IF; + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; \c - - - :worker_2_port @@ -286,11 +303,22 @@ FROM pg_prepared_xacts WHERE gid LIKE 'citus_0_1234_4_0_%' OR gid LIKE 'citus_0_should_be_forgotten_%'; -SELECT count(*) = 0 AS cached_connections_after_recovery_worker_2_test -FROM pg_stat_activity -WHERE state = 'idle' - AND now() - backend_start > '5 seconds'::interval; - +DO +$$ + BEGIN + FOR i IN 0 .. 
300 + LOOP + IF i = 300 THEN RAISE 'Waited too long'; END IF; + IF (SELECT count(*) = 0 AS cached_connections_after_recovery_worker_2_test + FROM pg_stat_activity + WHERE state = 'idle' + AND now() - backend_start > '5 seconds'::interval) THEN + EXIT; + END IF; + PERFORM pg_sleep_for('1 SECOND'::interval); + END LOOP; + END +$$; -- Cleanup From a48f2743bfcf887254860af385f2009596d8ca53 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 28 Feb 2024 09:18:49 +0100 Subject: [PATCH 25/41] Address review --- src/backend/distributed/connection/connection_management.c | 2 +- src/backend/distributed/utils/maintenanced.c | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/backend/distributed/connection/connection_management.c b/src/backend/distributed/connection/connection_management.c index 09e50bfa7..1d3eee0c4 100644 --- a/src/backend/distributed/connection/connection_management.c +++ b/src/backend/distributed/connection/connection_management.c @@ -327,7 +327,7 @@ StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port, */ ConnectionHashEntry *entry = hash_search(ConnectionHash, &key, HASH_ENTER, &found); - if (!(found && entry->isValid)) + if (!found || !entry->isValid) { /* * We are just building hash entry or previously it was left in an diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 785608729..49ed61763 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -706,10 +706,7 @@ CitusMaintenanceDaemonMain(Datum main_arg) timeout = Min(timeout, Recover2PCInterval); } - /* - * Execute only on the maintenance database, if it configured, otherwise run from every daemon. - * The config value -1 disables the distributed deadlock detection - */ + /* the config value -1 disables the distributed deadlock detection */ if (DistributedDeadlockDetectionTimeoutFactor != -1.0) { double deadlockTimeout = @@ -736,7 +733,6 @@ CitusMaintenanceDaemonMain(Datum main_arg) foundDeadlock = CheckForDistributedDeadlocks(); } - CommitTransactionCommand(); /* From 725bdce5bdbbc4cf0dbff2898c4fb171ac3377bb Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 9 May 2024 16:20:33 +0200 Subject: [PATCH 26/41] Preliminary add "citus.maintenance_connection_pool_timeout" --- .../connection/shared_connection_stats.c | 52 +++++++++++++++---- src/backend/distributed/shared_library_init.c | 17 +++++- .../test/shared_connection_counters.c | 3 +- .../distributed/shared_connection_stats.h | 5 +- .../multi_maintenance_multiple_databases.out | 2 + src/test/regress/multi_1_schedule | 1 + .../sql/maintenance_connection_timeout.sql | 18 +++++++ .../multi_maintenance_multiple_databases.sql | 2 + 8 files changed, 85 insertions(+), 15 deletions(-) create mode 100644 src/test/regress/sql/maintenance_connection_timeout.sql diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 4ad3f287e..8812f0e75 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -51,7 +51,8 @@ typedef struct ConnectionStatsSharedData char *sharedConnectionHashTrancheName; LWLock sharedConnectionHashLock; - ConditionVariable waitersConditionVariable; + ConditionVariable regularConnectionWaitersConditionVariable; + ConditionVariable maintenanceConnectionWaitersConditionVariable; } ConnectionStatsSharedData; /* @@ -108,7 +109,8 @@ int 
MaxSharedPoolSize = 0; * Pool size for maintenance connections exclusively * "0" or "-1" means do not apply connection throttling */ -int MaxMaintenanceSharedPoolSize = 5; +int MaxMaintenanceSharedPoolSize = -1; +int MaintenanceConnectionPoolTimeout = 30 * MS_PER_SECOND; /* * Controlled via a GUC, never access directly, use GetLocalSharedPoolSize(). @@ -306,7 +308,7 @@ WaitLoopForSharedConnection(uint32 flags, const char *hostname, int port) { CHECK_FOR_INTERRUPTS(); - WaitForSharedConnection(); + WaitForSharedConnection(flags); } ConditionVariableCancelSleep(); @@ -537,7 +539,7 @@ DecrementSharedConnectionCounter(uint32 externalFlags, const char *hostname, int DecrementSharedConnectionCounterInternal(externalFlags, hostname, port, MyDatabaseId); UnLockConnectionSharedMemory(); - WakeupWaiterBackendsForSharedConnection(); + WakeupWaiterBackendsForSharedConnection(externalFlags); } @@ -671,9 +673,18 @@ UnLockConnectionSharedMemory(void) * this function. */ void -WakeupWaiterBackendsForSharedConnection(void) +WakeupWaiterBackendsForSharedConnection(uint32 flags) { - ConditionVariableBroadcast(&ConnectionStatsSharedState->waitersConditionVariable); + if (flags & MAINTENANCE_CONNECTION) + { + ConditionVariableBroadcast( + &ConnectionStatsSharedState->maintenanceConnectionWaitersConditionVariable); + } + else + { + ConditionVariableBroadcast( + &ConnectionStatsSharedState->regularConnectionWaitersConditionVariable); + } } @@ -684,10 +695,28 @@ WakeupWaiterBackendsForSharedConnection(void) * WakeupWaiterBackendsForSharedConnection(). */ void -WaitForSharedConnection(void) +WaitForSharedConnection(uint32 flags) { - ConditionVariableSleep(&ConnectionStatsSharedState->waitersConditionVariable, - PG_WAIT_EXTENSION); + if (flags & MAINTENANCE_CONNECTION) + { + bool connectionSlotNotAcquired = ConditionVariableTimedSleep( + &ConnectionStatsSharedState->maintenanceConnectionWaitersConditionVariable, + MaintenanceConnectionPoolTimeout, + PG_WAIT_EXTENSION); + if (connectionSlotNotAcquired) + { + ereport(ERROR, (errmsg("Failed to acquire maintenance connection for %i ms", + MaintenanceConnectionPoolTimeout), + errhint( + "Try to increase citus.maintenance_connection_pool_timeout"))); + } + } + else + { + ConditionVariableSleep( + &ConnectionStatsSharedState->regularConnectionWaitersConditionVariable, + PG_WAIT_EXTENSION); + } } @@ -774,7 +803,10 @@ SharedConnectionStatsShmemInit(void) LWLockInitialize(&ConnectionStatsSharedState->sharedConnectionHashLock, ConnectionStatsSharedState->sharedConnectionHashTrancheId); - ConditionVariableInit(&ConnectionStatsSharedState->waitersConditionVariable); + ConditionVariableInit( + &ConnectionStatsSharedState->regularConnectionWaitersConditionVariable); + ConditionVariableInit( + &ConnectionStatsSharedState->maintenanceConnectionWaitersConditionVariable); } /* allocate hash tables */ diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 2a9b73f20..0f1815a5c 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1844,6 +1844,19 @@ RegisterCitusConfigVariables(void) GUC_STANDARD, NULL, NULL, NULL); + DefineCustomIntVariable( + "citus.maintenance_connection_pool_timeout", + gettext_noop( + "Timeout for acquiring a connection from a maintenance shared pool size. " + "Applicable only when the maintenance pool is enabled via citus.max_maintenance_shared_pool_size. 
" + "Setting it to 0 or -1 disables the timeout"), + NULL, + &MaintenanceConnectionPoolTimeout, + 30 * MS_PER_SECOND, -1, MS_PER_HOUR, + PGC_SIGHUP, + GUC_SUPERUSER_ONLY, + NULL, NULL, NULL); + DefineCustomIntVariable( "citus.max_adaptive_executor_pool_size", gettext_noop("Sets the maximum number of connections per worker node used by " @@ -1975,11 +1988,11 @@ RegisterCitusConfigVariables(void) DefineCustomIntVariable( "citus.max_maintenance_shared_pool_size", gettext_noop("Similar to citus.max_shared_pool_size, but applies to connections " - "for maintenance operations only." + "for maintenance operations only. " "Setting it to 0 or -1 disables maintenance connection throttling."), NULL, &MaxMaintenanceSharedPoolSize, - 5, -1, INT_MAX, + -1, -1, INT_MAX, PGC_SIGHUP, GUC_SUPERUSER_ONLY, NULL, NULL, NULL); diff --git a/src/backend/distributed/test/shared_connection_counters.c b/src/backend/distributed/test/shared_connection_counters.c index c59602887..917ffc19d 100644 --- a/src/backend/distributed/test/shared_connection_counters.c +++ b/src/backend/distributed/test/shared_connection_counters.c @@ -33,7 +33,8 @@ PG_FUNCTION_INFO_V1(set_max_shared_pool_size); Datum wake_up_connection_pool_waiters(PG_FUNCTION_ARGS) { - WakeupWaiterBackendsForSharedConnection(); + WakeupWaiterBackendsForSharedConnection(0); + WakeupWaiterBackendsForSharedConnection(MAINTENANCE_CONNECTION); PG_RETURN_VOID(); } diff --git a/src/include/distributed/shared_connection_stats.h b/src/include/distributed/shared_connection_stats.h index ea8346769..681e5bf5b 100644 --- a/src/include/distributed/shared_connection_stats.h +++ b/src/include/distributed/shared_connection_stats.h @@ -26,13 +26,14 @@ enum SharedPoolCounterMode extern int MaxSharedPoolSize; extern int MaxMaintenanceSharedPoolSize; +extern int MaintenanceConnectionPoolTimeout; extern int LocalSharedPoolSize; extern int MaxClientConnections; extern void InitializeSharedConnectionStats(void); -extern void WaitForSharedConnection(void); -extern void WakeupWaiterBackendsForSharedConnection(void); +extern void WaitForSharedConnection(uint32); +extern void WakeupWaiterBackendsForSharedConnection(uint32); extern size_t SharedConnectionStatsShmemSize(void); extern void SharedConnectionStatsShmemInit(void); extern int GetMaxClientConnections(void); diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index 04cb9c1a0..6c5a4c4d3 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -9,6 +9,7 @@ SELECT $definition$ \gset SELECT $deinition$ ALTER SYSTEM SET citus.recover_2pc_interval TO '5s'; +ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; SELECT pg_reload_conf(); $deinition$ AS turn_on_maintenance @@ -374,6 +375,7 @@ $$; SELECT $definition$ ALTER SYSTEM RESET citus.recover_2pc_interval; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; + ALTER SYSTEM RESET citus.max_maintenance_shared_pool_size; SELECT pg_reload_conf(); DO diff --git a/src/test/regress/multi_1_schedule b/src/test/regress/multi_1_schedule index cbd8ecf2f..50ddd3e7a 100644 --- a/src/test/regress/multi_1_schedule +++ b/src/test/regress/multi_1_schedule @@ -221,6 +221,7 @@ test: multi_create_shards test: multi_transaction_recovery test: multi_transaction_recovery_multiple_databases test: multi_maintenance_multiple_databases 
+#test: maintenance_connection_timeout test: local_dist_join_modifications test: local_table_join diff --git a/src/test/regress/sql/maintenance_connection_timeout.sql b/src/test/regress/sql/maintenance_connection_timeout.sql new file mode 100644 index 000000000..b293c0b95 --- /dev/null +++ b/src/test/regress/sql/maintenance_connection_timeout.sql @@ -0,0 +1,18 @@ +-- Create a new database +ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 1; +SELECT pg_reload_conf(); +set citus.enable_create_database_propagation to on; +CREATE DATABASE role_operations_test_db; +SET citus.superuser TO 'postgres'; +-- Connect to the new database +\c role_operations_test_db + +CREATE ROLE test_role1; + +\c regression - - :master_port +-- Clean up: drop the database +set citus.enable_create_database_propagation to on; +DROP DATABASE role_operations_test_db; +reset citus.enable_create_database_propagation; +ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 1; +SELECT pg_reload_conf(); diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index 4c29c6bc9..bfcf6d788 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -11,6 +11,7 @@ SELECT $definition$ SELECT $deinition$ ALTER SYSTEM SET citus.recover_2pc_interval TO '5s'; +ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; SELECT pg_reload_conf(); $deinition$ AS turn_on_maintenance @@ -327,6 +328,7 @@ $$; SELECT $definition$ ALTER SYSTEM RESET citus.recover_2pc_interval; ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; + ALTER SYSTEM RESET citus.max_maintenance_shared_pool_size; SELECT pg_reload_conf(); DO From 53869e0ae6c274941621afece56853981ea6a95d Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 12 Jun 2024 13:59:10 +0200 Subject: [PATCH 27/41] Delete maintenance_connection_timeout.sql temporarily to pass the tests --- .../sql/maintenance_connection_timeout.sql | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 src/test/regress/sql/maintenance_connection_timeout.sql diff --git a/src/test/regress/sql/maintenance_connection_timeout.sql b/src/test/regress/sql/maintenance_connection_timeout.sql deleted file mode 100644 index b293c0b95..000000000 --- a/src/test/regress/sql/maintenance_connection_timeout.sql +++ /dev/null @@ -1,18 +0,0 @@ --- Create a new database -ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 1; -SELECT pg_reload_conf(); -set citus.enable_create_database_propagation to on; -CREATE DATABASE role_operations_test_db; -SET citus.superuser TO 'postgres'; --- Connect to the new database -\c role_operations_test_db - -CREATE ROLE test_role1; - -\c regression - - :master_port --- Clean up: drop the database -set citus.enable_create_database_propagation to on; -DROP DATABASE role_operations_test_db; -reset citus.enable_create_database_propagation; -ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 1; -SELECT pg_reload_conf(); From 9f63b5a379b1c284ee4753b063b4fed7678f43b0 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Tue, 18 Jun 2024 17:20:41 +0200 Subject: [PATCH 28/41] Fix StopMaintenanceDaemon by introduction of dbData.daemonShuttingDown flag --- src/backend/distributed/utils/maintenanced.c | 28 +++++++++----------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/backend/distributed/utils/maintenanced.c 
b/src/backend/distributed/utils/maintenanced.c index 49ed61763..fe173bd21 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -90,6 +90,7 @@ typedef struct MaintenanceDaemonDBData Oid userOid; pid_t workerPid; bool daemonStarted; + bool daemonShuttingDown; bool triggerNodeMetadataSync; Latch *latch; /* pointer to the background worker's latch */ } MaintenanceDaemonDBData; @@ -243,6 +244,14 @@ InitializeMaintenanceDaemonBackend(void) return; } + if (dbData->daemonShuttingDown) + { + elog(DEBUG1, "Another maintenance daemon for database %u is shutting down. " + "Aborting current initialization", MyDatabaseId); + LWLockRelease(&MaintenanceDaemonControl->lock); + return; + } + if (IsMaintenanceDaemon) { /* @@ -1058,20 +1067,8 @@ MaintenanceDaemonShmemExit(int code, Datum arg) MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) hash_search(MaintenanceDaemonDBHash, &databaseOid, - HASH_FIND, NULL); - - /* myDbData is NULL after StopMaintenanceDaemon */ - if (myDbData != NULL) - { - /* - * Confirm that I am still the registered maintenance daemon before exiting. - */ - Assert(myDbData->workerPid == MyProcPid); - - myDbData->daemonStarted = false; - myDbData->workerPid = 0; - } - + HASH_REMOVE, NULL); + Assert(myDbData->workerPid == MyProcPid); LWLockRelease(&MaintenanceDaemonControl->lock); } @@ -1170,11 +1167,12 @@ StopMaintenanceDaemon(Oid databaseId) MaintenanceDaemonDBData *dbData = (MaintenanceDaemonDBData *) hash_search( MaintenanceDaemonDBHash, &databaseId, - HASH_REMOVE, &found); + HASH_FIND, &found); if (found) { workerPid = dbData->workerPid; + dbData->daemonShuttingDown = true; } LWLockRelease(&MaintenanceDaemonControl->lock); From b1ca0080d620826017f9930fe6db8d83d1104a81 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Tue, 18 Jun 2024 17:31:35 +0200 Subject: [PATCH 29/41] Workaround for -Werror=unused-variable --- src/backend/distributed/utils/maintenanced.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index fe173bd21..3eff4b197 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -1068,6 +1068,8 @@ MaintenanceDaemonShmemExit(int code, Datum arg) MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) hash_search(MaintenanceDaemonDBHash, &databaseOid, HASH_REMOVE, NULL); + // Workaround for -Werror=unused-variable + (void ) myDbData; Assert(myDbData->workerPid == MyProcPid); LWLockRelease(&MaintenanceDaemonControl->lock); } From 028702e6c299142e3f8b3faea73ed24e7bfc63fa Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Tue, 18 Jun 2024 17:35:12 +0200 Subject: [PATCH 30/41] Fix formatting --- src/backend/distributed/utils/maintenanced.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 3eff4b197..54a8db6c3 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -1068,8 +1068,8 @@ MaintenanceDaemonShmemExit(int code, Datum arg) MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) hash_search(MaintenanceDaemonDBHash, &databaseOid, HASH_REMOVE, NULL); - // Workaround for -Werror=unused-variable - (void ) myDbData; + /* Workaround for -Werror=unused-variable */ + (void) myDbData; Assert(myDbData->workerPid == MyProcPid); LWLockRelease(&MaintenanceDaemonControl->lock); } 
From f584ecc1aa690c08972187c13c907ba8fa556524 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 19 Jun 2024 15:02:52 +0200 Subject: [PATCH 31/41] Fix flaky test and checkstyle --- src/backend/distributed/utils/maintenanced.c | 1 + .../multi_maintenance_multiple_databases.out | 36 ++++++++++--------- .../multi_maintenance_multiple_databases.sql | 14 ++++---- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 54a8db6c3..cc808eb89 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -1068,6 +1068,7 @@ MaintenanceDaemonShmemExit(int code, Datum arg) MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) hash_search(MaintenanceDaemonDBHash, &databaseOid, HASH_REMOVE, NULL); + /* Workaround for -Werror=unused-variable */ (void) myDbData; Assert(myDbData->workerPid == MyProcPid); diff --git a/src/test/regress/expected/multi_maintenance_multiple_databases.out b/src/test/regress/expected/multi_maintenance_multiple_databases.out index 6c5a4c4d3..1b0a65046 100644 --- a/src/test/regress/expected/multi_maintenance_multiple_databases.out +++ b/src/test/regress/expected/multi_maintenance_multiple_databases.out @@ -373,11 +373,6 @@ $$; -- Cleanup \c - - - :master_port SELECT $definition$ - ALTER SYSTEM RESET citus.recover_2pc_interval; - ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; - ALTER SYSTEM RESET citus.max_maintenance_shared_pool_size; - SELECT pg_reload_conf(); - DO $do$ DECLARE @@ -499,45 +494,52 @@ SELECT $definition$ DROP DATABASE db98 WITH (FORCE); DROP DATABASE db99 WITH (FORCE); DROP DATABASE db100 WITH (FORCE); - SELECT count(*) + + SELECT count(*) = 0 as all_databases_dropped FROM pg_database WHERE datname LIKE 'db%'; + + ALTER SYSTEM RESET citus.recover_2pc_interval; + ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; + ALTER SYSTEM RESET citus.max_maintenance_shared_pool_size; + SELECT pg_reload_conf(); + $definition$ AS cleanup \gset :cleanup - pg_reload_conf + all_databases_dropped --------------------------------------------------------------------- t (1 row) - count + pg_reload_conf --------------------------------------------------------------------- - 0 + t (1 row) \c - - - :worker_1_port :cleanup - pg_reload_conf + all_databases_dropped --------------------------------------------------------------------- t (1 row) - count + pg_reload_conf --------------------------------------------------------------------- - 0 + t (1 row) \c - - - :worker_2_port :cleanup + all_databases_dropped +--------------------------------------------------------------------- + t +(1 row) + pg_reload_conf --------------------------------------------------------------------- t (1 row) - count ---------------------------------------------------------------------- - 0 -(1 row) - \c - - - :master_port DROP EXTENSION IF EXISTS dblink; diff --git a/src/test/regress/sql/multi_maintenance_multiple_databases.sql b/src/test/regress/sql/multi_maintenance_multiple_databases.sql index bfcf6d788..7b27c755d 100644 --- a/src/test/regress/sql/multi_maintenance_multiple_databases.sql +++ b/src/test/regress/sql/multi_maintenance_multiple_databases.sql @@ -326,11 +326,6 @@ $$; \c - - - :master_port SELECT $definition$ - ALTER SYSTEM RESET citus.recover_2pc_interval; - ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; - ALTER SYSTEM RESET citus.max_maintenance_shared_pool_size; - SELECT 
pg_reload_conf(); - DO $do$ DECLARE @@ -452,9 +447,16 @@ SELECT $definition$ DROP DATABASE db98 WITH (FORCE); DROP DATABASE db99 WITH (FORCE); DROP DATABASE db100 WITH (FORCE); - SELECT count(*) + + SELECT count(*) = 0 as all_databases_dropped FROM pg_database WHERE datname LIKE 'db%'; + + ALTER SYSTEM RESET citus.recover_2pc_interval; + ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor; + ALTER SYSTEM RESET citus.max_maintenance_shared_pool_size; + SELECT pg_reload_conf(); + $definition$ AS cleanup \gset From bdc7bead096cace4087234873b3f4ed8353577d6 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Wed, 26 Jun 2024 16:27:13 +0200 Subject: [PATCH 32/41] test_multiple_databases_distributed_deadlock_detection WIP --- .../connection/shared_connection_stats.c | 4 +- src/test/regress/citus_tests/common.py | 2 +- ...atabases_distributed_deadlock_detection.py | 105 ++++++++++++++++++ 3 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 8812f0e75..33c6943c1 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -708,7 +708,9 @@ WaitForSharedConnection(uint32 flags) ereport(ERROR, (errmsg("Failed to acquire maintenance connection for %i ms", MaintenanceConnectionPoolTimeout), errhint( - "Try to increase citus.maintenance_connection_pool_timeout"))); + "Try increasing citus.max_maintenance_shared_pool_size or " + "citus.maintenance_connection_pool_timeout" + ))); } } else diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index 6c09e0b38..322788462 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -823,7 +823,7 @@ class Postgres(QueryRunner): # of our tests pgconf.write("max_logical_replication_workers = 50\n") pgconf.write("max_wal_senders = 50\n") - pgconf.write("max_worker_processes = 50\n") + pgconf.write("max_worker_processes = 150\n") pgconf.write("max_replication_slots = 50\n") # We need to make the log go to stderr so that the tests can diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py new file mode 100644 index 000000000..35b0e36fe --- /dev/null +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -0,0 +1,105 @@ +import asyncio + +import psycopg +import pytest +from psycopg.errors import DeadlockDetected + +DATABASES_NUMBER = 40 + + +async def test_multiple_databases_distributed_deadlock_detection(cluster): + # Disable maintenance on all nodes + for node in cluster.nodes: + node.sql("ALTER SYSTEM SET citus.recover_2pc_interval TO '-1';") + node.sql("ALTER SYSTEM SET citus.distributed_deadlock_detection_factor = '-1';") + node.sql("ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10;") + node.sql("SELECT pg_reload_conf();") + + # Prepare database names for test + db_names = [f'db{db_index}' for db_index in range(1, DATABASES_NUMBER + 1)] + + # Create and configure databases + for db_name in db_names: + nodes = cluster.workers + [cluster.coordinator] + for node in nodes: + node.sql(f'CREATE DATABASE {db_name}') + with node.cur(dbname=db_name) as 
node_cursor: + node_cursor.execute("CREATE EXTENSION citus;") + if node == cluster.coordinator: + for worker in cluster.workers: + node_cursor.execute(f"SELECT citus_add_node('localhost', {worker.port});") + node_cursor.execute(""" + CREATE TABLE public.deadlock_detection_test (user_id int UNIQUE, some_val int); + SELECT create_distributed_table('public.deadlock_detection_test', 'user_id'); + INSERT INTO public.deadlock_detection_test SELECT i, i FROM generate_series(1,2) i; + """) + + print("Setup is done") + + async def test_deadlock(db_name, run_on_coordinator): + """Function to prepare a deadlock query in a given database""" + # Init connections and store for later commits + if run_on_coordinator: + first_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False) + first_cursor = first_connection.cursor() + second_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False) + second_cursor = second_connection.cursor() + else: + first_connection = await cluster.workers[0].aconn(dbname=db_name, autocommit=False) + first_cursor = first_connection.cursor() + second_connection = await cluster.workers[1].aconn(dbname=db_name, autocommit=False) + second_cursor = second_connection.cursor() + + # initiate deadlock + await first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 1;") + await second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 2;") + + # Test that deadlock is resolved by a maintenance daemon + with pytest.raises(DeadlockDetected): + async def run_deadlocked_queries(): + await asyncio.gather( + second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 1;"), + first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 2;") + ) + + await asyncio.wait_for(run_deadlocked_queries(), 300) + + async def enable_maintenance_when_deadlocks_ready(): + """Function to enable maintenance daemons, when all the expected deadlock queries are ready""" + # Let deadlocks commence + await asyncio.sleep(2) + # cluster.debug() + # Check that queries are deadlocked + databases_with_deadlock = set() + while len(databases_with_deadlock) < DATABASES_NUMBER: + for db_name in (db for db in db_names if + db not in databases_with_deadlock): + for node in cluster.nodes: + async with node.acur(dbname=db_name) as cursor: + expected_lock_count = 4 if node == cluster.coordinator else 2 + await cursor.execute(f""" + SELECT count(*) = {expected_lock_count} AS deadlock_created + FROM pg_locks + INNER JOIN pg_class pc ON relation = oid + WHERE relname LIKE 'deadlock_detection_test%'""") + queries_deadlocked = await cursor.fetchone() + if queries_deadlocked[0]: + print(f"Queries are deadlocked on {db_name}") + databases_with_deadlock.add(db_name) + + print("Queries on all databases are deadlocked, enabling maintenance") + + # Enable maintenance back + for node in cluster.nodes: + node.sql("ALTER SYSTEM RESET citus.recover_2pc_interval;") + node.sql("ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;") + node.sql("SELECT pg_reload_conf();") + + tasks = list() + for idx, db_name in enumerate(db_names): + run_on_coordinator = True if idx % 3 == 0 else False + tasks.append(test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator)) + + tasks.append(enable_maintenance_when_deadlocks_ready()) + + await asyncio.gather(*tasks) From 4312b0656b6d8487e1cadc9d0ff0870c4bdd0af2 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: 
Thu, 27 Jun 2024 17:50:04 +0200 Subject: [PATCH 33/41] - Fix limits check for local nodes - WIP test_multiple_databases_distributed_deadlock_detection --- .../connection/shared_connection_stats.c | 19 +++-- src/test/regress/citus_tests/common.py | 8 ++ ...atabases_distributed_deadlock_detection.py | 84 +++++++++++++------ 3 files changed, 77 insertions(+), 34 deletions(-) diff --git a/src/backend/distributed/connection/shared_connection_stats.c b/src/backend/distributed/connection/shared_connection_stats.c index 33c6943c1..4338a7860 100644 --- a/src/backend/distributed/connection/shared_connection_stats.c +++ b/src/backend/distributed/connection/shared_connection_stats.c @@ -461,8 +461,8 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, currentConnectionsCount = workerNodeConnectionEntry->regularConnectionsCount; } - bool remoteNodeLimitExceeded = currentConnectionsCount + 1 > - currentConnectionsLimit; + bool currentConnectionsLimitExceeded = currentConnectionsCount + 1 > + currentConnectionsLimit; /* * For local nodes, solely relying on citus.max_shared_pool_size or @@ -476,11 +476,11 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, * a reasonable pace. The latter limit typically kicks in when the database * is issued lots of concurrent sessions at the same time, such as benchmarks. */ - bool localNodeLimitExceeded = + bool localNodeConnectionsLimitExceeded = connectionToLocalNode && (GetLocalSharedPoolSize() == DISABLE_REMOTE_CONNECTIONS_FOR_LOCAL_QUERIES || - GetExternalClientBackendCount() + 1 > currentConnectionsLimit); - if (remoteNodeLimitExceeded || localNodeLimitExceeded) + GetExternalClientBackendCount() + 1 > GetLocalSharedPoolSize()); + if (currentConnectionsLimitExceeded || localNodeConnectionsLimitExceeded) { connectionSlotAvailable = false; } @@ -502,9 +502,10 @@ IncrementSharedConnectionCounterInternal(uint32 externalFlags, if (IsLoggableLevel(DEBUG4)) { ereport(DEBUG4, errmsg( - "Incrementing connection counter. " + "Incrementing %s connection counter. " "Current regular connections: %i, maintenance connections: %i. " "Connection slot to %s:%i database %i is %s", + maintenanceConnection ? "maintenance" : "regular", workerNodeConnectionEntry->regularConnectionsCount, workerNodeConnectionEntry->maintenanceConnectionsCount, hostname, @@ -568,7 +569,8 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags, Assert(workerNodeConnectionEntry->regularConnectionsCount > 0 || workerNodeConnectionEntry->maintenanceConnectionsCount > 0); - if (externalFlags & MAINTENANCE_CONNECTION) + bool maintenanceConnection = externalFlags & MAINTENANCE_CONNECTION; + if (maintenanceConnection) { workerNodeConnectionEntry->maintenanceConnectionsCount -= 1; } @@ -580,9 +582,10 @@ DecrementSharedConnectionCounterInternal(uint32 externalFlags, if (IsLoggableLevel(DEBUG4)) { ereport(DEBUG4, errmsg( - "Decrementing connection counter. " + "Decrementing %s connection counter. " "Current regular connections: %i, maintenance connections: %i. " "Connection slot to %s:%i database %i is released", + maintenanceConnection ? 
"maintenance" : "regular", workerNodeConnectionEntry->regularConnectionsCount, workerNodeConnectionEntry->maintenanceConnectionsCount, hostname, diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index 322788462..12cc8d54d 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -977,6 +977,14 @@ class Postgres(QueryRunner): for config in configs: self.sql(f"alter system set {config}") + def reset_configuration(self, *configs): + """Reset specific Postgres settings using ALTER SYSTEM RESET + NOTE: after configuring a call to reload or restart is needed for the + settings to become effective. + """ + for config in configs: + self.sql(f"alter system reset {config}") + def log_handle(self): """Returns the opened logfile at the current end of the log diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py index 35b0e36fe..b71e0211f 100644 --- a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -10,29 +10,38 @@ DATABASES_NUMBER = 40 async def test_multiple_databases_distributed_deadlock_detection(cluster): # Disable maintenance on all nodes for node in cluster.nodes: - node.sql("ALTER SYSTEM SET citus.recover_2pc_interval TO '-1';") - node.sql("ALTER SYSTEM SET citus.distributed_deadlock_detection_factor = '-1';") - node.sql("ALTER SYSTEM SET citus.max_maintenance_shared_pool_size = 10;") - node.sql("SELECT pg_reload_conf();") + node.configure( + "citus.recover_2pc_interval = '-1'", + "citus.distributed_deadlock_detection_factor = '-1'", + "citus.max_maintenance_shared_pool_size = 5", + # "log_min_messages = 'debug4'", + # "citus.main_db='postgres'" + ) + node.restart() # Prepare database names for test - db_names = [f'db{db_index}' for db_index in range(1, DATABASES_NUMBER + 1)] + db_names = [f"db{db_index}" for db_index in range(1, DATABASES_NUMBER + 1)] # Create and configure databases for db_name in db_names: nodes = cluster.workers + [cluster.coordinator] for node in nodes: - node.sql(f'CREATE DATABASE {db_name}') + node.sql(f"CREATE DATABASE {db_name}") with node.cur(dbname=db_name) as node_cursor: node_cursor.execute("CREATE EXTENSION citus;") if node == cluster.coordinator: for worker in cluster.workers: - node_cursor.execute(f"SELECT citus_add_node('localhost', {worker.port});") - node_cursor.execute(""" + node_cursor.execute( + "SELECT pg_catalog.citus_add_node(%s, %s)", + (worker.host, worker.port), + ) + node_cursor.execute( + """ CREATE TABLE public.deadlock_detection_test (user_id int UNIQUE, some_val int); SELECT create_distributed_table('public.deadlock_detection_test', 'user_id'); INSERT INTO public.deadlock_detection_test SELECT i, i FROM generate_series(1,2) i; - """) + """ + ) print("Setup is done") @@ -40,26 +49,43 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): """Function to prepare a deadlock query in a given database""" # Init connections and store for later commits if run_on_coordinator: - first_connection = await cluster.coordinator.aconn(dbname=db_name, autocommit=False) + first_connection = await cluster.coordinator.aconn( + dbname=db_name, autocommit=False + ) first_cursor = first_connection.cursor() - second_connection = await cluster.coordinator.aconn(dbname=db_name, 
autocommit=False) + second_connection = await cluster.coordinator.aconn( + dbname=db_name, autocommit=False + ) second_cursor = second_connection.cursor() else: - first_connection = await cluster.workers[0].aconn(dbname=db_name, autocommit=False) + first_connection = await cluster.workers[0].aconn( + dbname=db_name, autocommit=False + ) first_cursor = first_connection.cursor() - second_connection = await cluster.workers[1].aconn(dbname=db_name, autocommit=False) + second_connection = await cluster.workers[1].aconn( + dbname=db_name, autocommit=False + ) second_cursor = second_connection.cursor() # initiate deadlock - await first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 1;") - await second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 2;") + await first_cursor.execute( + "UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 1;" + ) + await second_cursor.execute( + "UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 2;" + ) # Test that deadlock is resolved by a maintenance daemon with pytest.raises(DeadlockDetected): + async def run_deadlocked_queries(): await asyncio.gather( - second_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 1;"), - first_cursor.execute("UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 2;") + second_cursor.execute( + "UPDATE public.deadlock_detection_test SET some_val = 2 WHERE user_id = 1;" + ), + first_cursor.execute( + "UPDATE public.deadlock_detection_test SET some_val = 1 WHERE user_id = 2;" + ), ) await asyncio.wait_for(run_deadlocked_queries(), 300) @@ -72,16 +98,18 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): # Check that queries are deadlocked databases_with_deadlock = set() while len(databases_with_deadlock) < DATABASES_NUMBER: - for db_name in (db for db in db_names if - db not in databases_with_deadlock): + for db_name in (db for db in db_names if db not in databases_with_deadlock): for node in cluster.nodes: async with node.acur(dbname=db_name) as cursor: expected_lock_count = 4 if node == cluster.coordinator else 2 - await cursor.execute(f""" - SELECT count(*) = {expected_lock_count} AS deadlock_created + await cursor.execute( + """ + SELECT count(*) = %s AS deadlock_created FROM pg_locks INNER JOIN pg_class pc ON relation = oid - WHERE relname LIKE 'deadlock_detection_test%'""") + WHERE relname LIKE 'deadlock_detection_test%%'""", + (expected_lock_count,), + ) queries_deadlocked = await cursor.fetchone() if queries_deadlocked[0]: print(f"Queries are deadlocked on {db_name}") @@ -91,14 +119,18 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): # Enable maintenance back for node in cluster.nodes: - node.sql("ALTER SYSTEM RESET citus.recover_2pc_interval;") - node.sql("ALTER SYSTEM RESET citus.distributed_deadlock_detection_factor;") - node.sql("SELECT pg_reload_conf();") + node.reset_configuration( + "citus.recover_2pc_interval", + "citus.distributed_deadlock_detection_factor", + ) + node.reload() tasks = list() for idx, db_name in enumerate(db_names): run_on_coordinator = True if idx % 3 == 0 else False - tasks.append(test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator)) + tasks.append( + test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator) + ) tasks.append(enable_maintenance_when_deadlocks_ready()) From 76a2776feefe13326f69653f9b302a48de5af9d0 Mon Sep 17 00:00:00 2001 From: 
ivyazmitinov Date: Thu, 27 Jun 2024 17:53:56 +0200 Subject: [PATCH 34/41] Fix checkstyle --- .../test_multiple_databases_distributed_deadlock_detection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py index b71e0211f..99ff15492 100644 --- a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -1,6 +1,5 @@ import asyncio -import psycopg import pytest from psycopg.errors import DeadlockDetected From 3517850441774cf81dd83246d0834e29145aceb5 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 27 Jun 2024 18:26:37 +0200 Subject: [PATCH 35/41] WIP test_multiple_databases_distributed_deadlock_detection --- src/test/regress/citus_tests/common.py | 2 +- .../test_multiple_databases_distributed_deadlock_detection.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index 12cc8d54d..785ebcc1f 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -1056,7 +1056,7 @@ class Postgres(QueryRunner): def cleanup_databases(self): for database in self.databases: self.sql( - sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(database)) + sql.SQL("DROP DATABASE IF EXISTS {} WITH (FORCE)").format(sql.Identifier(database)) ) def cleanup_schemas(self): diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py index 99ff15492..6db6db69f 100644 --- a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -25,7 +25,7 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): for db_name in db_names: nodes = cluster.workers + [cluster.coordinator] for node in nodes: - node.sql(f"CREATE DATABASE {db_name}") + node.create_database(f"{db_name}") with node.cur(dbname=db_name) as node_cursor: node_cursor.execute("CREATE EXTENSION citus;") if node == cluster.coordinator: From f25b4b294c45711252c1aba89fce5e9d98078c41 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Thu, 27 Jun 2024 18:28:26 +0200 Subject: [PATCH 36/41] Fix checkstyle --- src/test/regress/citus_tests/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index 785ebcc1f..e3dbcaf8b 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -1056,7 +1056,9 @@ class Postgres(QueryRunner): def cleanup_databases(self): for database in self.databases: self.sql( - sql.SQL("DROP DATABASE IF EXISTS {} WITH (FORCE)").format(sql.Identifier(database)) + sql.SQL("DROP DATABASE IF EXISTS {} WITH (FORCE)").format( + sql.Identifier(database) + ) ) def cleanup_schemas(self): From 1c219fe9454c778e0f4e444b4652b733483797f0 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 28 Jun 2024 16:49:46 +0200 Subject: [PATCH 37/41] Done test_multiple_databases_distributed_deadlock_detection --- src/test/regress/citus_tests/common.py | 11 ++++++ 
.../test/test_maintenancedeamon.py | 2 +- ...atabases_distributed_deadlock_detection.py | 35 +++++++++++++------ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index e3dbcaf8b..a1990203f 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -1026,6 +1026,15 @@ class Postgres(QueryRunner): self.databases.add(name) self.sql(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(name))) + def drop_database(self, name): + self.sql("DROP EXTENSION IF EXISTS citus CASCADE", dbname=name) + self.sql( + sql.SQL("DROP DATABASE IF EXISTS {} WITH (FORCE)").format( + sql.Identifier(name) + ) + ) + self.databases.remove(name) + def create_schema(self, name): self.schemas.add(name) self.sql(sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(name))) @@ -1055,11 +1064,13 @@ class Postgres(QueryRunner): def cleanup_databases(self): for database in self.databases: + self.sql("DROP EXTENSION IF EXISTS citus CASCADE", dbname=database) self.sql( sql.SQL("DROP DATABASE IF EXISTS {} WITH (FORCE)").format( sql.Identifier(database) ) ) + self.databases.clear() def cleanup_schemas(self): for schema in self.schemas: diff --git a/src/test/regress/citus_tests/test/test_maintenancedeamon.py b/src/test/regress/citus_tests/test/test_maintenancedeamon.py index 3f6cb501e..1eb4e28c9 100644 --- a/src/test/regress/citus_tests/test/test_maintenancedeamon.py +++ b/src/test/regress/citus_tests/test/test_maintenancedeamon.py @@ -62,7 +62,7 @@ def test_set_maindb(cluster_factory): wait_until_maintenance_deamons_start(2, cluster) - cluster.coordinator.sql("DROP DATABASE mymaindb;") + cluster.coordinator.drop_database("mymaindb") wait_until_maintenance_deamons_start(1, cluster) diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py index 6db6db69f..b9cd9596b 100644 --- a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -3,18 +3,20 @@ import asyncio import pytest from psycopg.errors import DeadlockDetected +# For every database there is expected to be 2 queries, +# so ~80 connections will be held by deadlocks. 
Another 5 is expected to be used by maintenance daemon, +# leaving ~15 available DATABASES_NUMBER = 40 async def test_multiple_databases_distributed_deadlock_detection(cluster): + # Disable maintenance on all nodes for node in cluster.nodes: node.configure( "citus.recover_2pc_interval = '-1'", "citus.distributed_deadlock_detection_factor = '-1'", "citus.max_maintenance_shared_pool_size = 5", - # "log_min_messages = 'debug4'", - # "citus.main_db='postgres'" ) node.restart() @@ -42,9 +44,7 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): """ ) - print("Setup is done") - - async def test_deadlock(db_name, run_on_coordinator): + async def create_deadlock(db_name, run_on_coordinator): """Function to prepare a deadlock query in a given database""" # Init connections and store for later commits if run_on_coordinator: @@ -89,11 +89,13 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): await asyncio.wait_for(run_deadlocked_queries(), 300) + await first_connection.commit() + await second_connection.commit() + async def enable_maintenance_when_deadlocks_ready(): """Function to enable maintenance daemons, when all the expected deadlock queries are ready""" # Let deadlocks commence await asyncio.sleep(2) - # cluster.debug() # Check that queries are deadlocked databases_with_deadlock = set() while len(databases_with_deadlock) < DATABASES_NUMBER: @@ -111,11 +113,7 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): ) queries_deadlocked = await cursor.fetchone() if queries_deadlocked[0]: - print(f"Queries are deadlocked on {db_name}") databases_with_deadlock.add(db_name) - - print("Queries on all databases are deadlocked, enabling maintenance") - # Enable maintenance back for node in cluster.nodes: node.reset_configuration( @@ -124,13 +122,28 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): ) node.reload() + # Distribute deadlocked queries among all nodes in the cluster tasks = list() for idx, db_name in enumerate(db_names): run_on_coordinator = True if idx % 3 == 0 else False tasks.append( - test_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator) + create_deadlock(db_name=db_name, run_on_coordinator=run_on_coordinator) ) tasks.append(enable_maintenance_when_deadlocks_ready()) + # await for the results await asyncio.gather(*tasks) + + # Check for "too many clients" on all nodes + for node in cluster.nodes: + with node.cur() as cursor: + cursor.execute( + """ + SELECT count(*) AS too_many_clients_errors_count + FROM regexp_split_to_table(pg_read_file(%s), E'\n') AS t(log_line) + WHERE log_line LIKE '%%sorry, too many clients already%%';""", + (node.log_path.as_posix(),), + ) + too_many_clients_errors_count = cursor.fetchone()[0] + assert too_many_clients_errors_count == 0 From f7516d7e2680c99c768f9b06950a9413133821d5 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 28 Jun 2024 18:26:01 +0200 Subject: [PATCH 38/41] Fix tests --- src/test/regress/citus_tests/common.py | 2 ++ src/test/regress/expected/global_cancel.out | 10 ++++++++-- src/test/regress/sql/global_cancel.sql | 5 +++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index a1990203f..420ddb5c0 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -846,6 +846,8 @@ class Postgres(QueryRunner): # happened pgconf.write("restart_after_crash = off\n") + # prevent tests from hanging + 
pgconf.write("statement_timeout= '5min'\n") os.truncate(self.hba_path, 0) self.ssl_access("all", "trust") self.nossl_access("all", "trust") diff --git a/src/test/regress/expected/global_cancel.out b/src/test/regress/expected/global_cancel.out index e5ce4fbc6..3687871d6 100644 --- a/src/test/regress/expected/global_cancel.out +++ b/src/test/regress/expected/global_cancel.out @@ -56,8 +56,8 @@ SELECT global_pid AS maintenance_daemon_gpid FROM pg_stat_activity psa JOIN get_all_active_transactions() gaat ON psa.pid = gaat.process_id WHERE application_name = 'Citus Maintenance Daemon' \gset SET client_min_messages TO ERROR; -CREATE USER global_cancel_user; -SELECT 1 FROM run_command_on_workers('CREATE USER global_cancel_user'); +CREATE USER global_cancel_user NOSUPERUSER ; +SELECT 1 FROM run_command_on_workers('CREATE USER global_cancel_user NOSUPERUSER ;'); ?column? --------------------------------------------------------------------- 1 @@ -66,6 +66,12 @@ SELECT 1 FROM run_command_on_workers('CREATE USER global_cancel_user'); RESET client_min_messages; \c - global_cancel_user - :master_port +SELECT current_user; + current_user +--------------------------------------------------------------------- + global_cancel_user +(1 row) + SELECT pg_typeof(:maintenance_daemon_gpid); pg_typeof --------------------------------------------------------------------- diff --git a/src/test/regress/sql/global_cancel.sql b/src/test/regress/sql/global_cancel.sql index 12330baf2..2288cd8b5 100644 --- a/src/test/regress/sql/global_cancel.sql +++ b/src/test/regress/sql/global_cancel.sql @@ -39,11 +39,12 @@ FROM pg_stat_activity psa JOIN get_all_active_transactions() gaat ON psa.pid = g WHERE application_name = 'Citus Maintenance Daemon' \gset SET client_min_messages TO ERROR; -CREATE USER global_cancel_user; -SELECT 1 FROM run_command_on_workers('CREATE USER global_cancel_user'); +CREATE USER global_cancel_user NOSUPERUSER ; +SELECT 1 FROM run_command_on_workers('CREATE USER global_cancel_user NOSUPERUSER ;'); RESET client_min_messages; \c - global_cancel_user - :master_port +SELECT current_user; SELECT pg_typeof(:maintenance_daemon_gpid); From 4d54ee80a3d4137206683a12ae59254d7fbdad74 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 28 Jun 2024 18:58:18 +0200 Subject: [PATCH 39/41] Fix tests --- ...test_multiple_databases_distributed_deadlock_detection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py index b9cd9596b..58ad565c5 100644 --- a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -147,3 +147,8 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): ) too_many_clients_errors_count = cursor.fetchone()[0] assert too_many_clients_errors_count == 0 + + for db_name in db_names: + cluster.coordinator.sql( + "DROP TABLE public.deadlock_detection_test", dbname=db_name + ) From a227bcc8059c6c43602061543be13c5b0c9fdbcf Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 28 Jun 2024 21:30:09 +0200 Subject: [PATCH 40/41] - Fix tests - Investigate citus_local_tables --- src/test/regress/citus_tests/run_test.py | 3 ++- ..._multiple_databases_distributed_deadlock_detection.py | 9 ++------- src/test/regress/sql/citus_local_tables.sql | 1 + 3 files changed, 5 
insertions(+), 8 deletions(-) diff --git a/src/test/regress/citus_tests/run_test.py b/src/test/regress/citus_tests/run_test.py index 9a648c0ab..fa430fa0a 100755 --- a/src/test/regress/citus_tests/run_test.py +++ b/src/test/regress/citus_tests/run_test.py @@ -241,7 +241,8 @@ def run_python_test(test_name, args): "pytest", "pytest", "--numprocesses", - "auto", + # Tests may be heavy, so limit the concurrency + "2", "--count", str(args["repeat"]), str(test_path), diff --git a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py index 58ad565c5..6df95b4f7 100644 --- a/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py +++ b/src/test/regress/citus_tests/test/test_multiple_databases_distributed_deadlock_detection.py @@ -89,8 +89,8 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): await asyncio.wait_for(run_deadlocked_queries(), 300) - await first_connection.commit() - await second_connection.commit() + await first_connection.rollback() + await second_connection.rollback() async def enable_maintenance_when_deadlocks_ready(): """Function to enable maintenance daemons, when all the expected deadlock queries are ready""" @@ -147,8 +147,3 @@ async def test_multiple_databases_distributed_deadlock_detection(cluster): ) too_many_clients_errors_count = cursor.fetchone()[0] assert too_many_clients_errors_count == 0 - - for db_name in db_names: - cluster.coordinator.sql( - "DROP TABLE public.deadlock_detection_test", dbname=db_name - ) diff --git a/src/test/regress/sql/citus_local_tables.sql b/src/test/regress/sql/citus_local_tables.sql index e33fbc6cc..ace8c22c9 100644 --- a/src/test/regress/sql/citus_local_tables.sql +++ b/src/test/regress/sql/citus_local_tables.sql @@ -579,6 +579,7 @@ alter table parent_1 add constraint fkey2 foreign key(a) references parent_2(a); SELECT create_reference_table('ref_table'); alter table parent_1 drop constraint fkey_test_drop; select count(*) from pg_constraint where conname = 'fkey_test_drop'; +SELECT * FROM pg_dist_shard ORDER BY shardid; -- verify we still preserve the child-parent hierarchy after all conversions -- check the shard partition select inhrelid::regclass from pg_inherits where (select inhparent::regclass::text) ~ '^parent_1_\d{7}$' order by 1; From bf2572a92764b32a5622f3f849e04cbc9fb88909 Mon Sep 17 00:00:00 2001 From: ivyazmitinov Date: Fri, 28 Jun 2024 22:08:49 +0200 Subject: [PATCH 41/41] Fix citus_local_tables --- src/test/regress/expected/citus_local_tables.out | 11 ++++++++--- src/test/regress/sql/citus_local_tables.sql | 8 ++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/test/regress/expected/citus_local_tables.out b/src/test/regress/expected/citus_local_tables.out index 4f3053094..08784f789 100644 --- a/src/test/regress/expected/citus_local_tables.out +++ b/src/test/regress/expected/citus_local_tables.out @@ -949,10 +949,15 @@ select count(*) from pg_constraint where conname = 'fkey_test_drop'; -- verify we still preserve the child-parent hierarchy after all conversions -- check the shard partition -select inhrelid::regclass from pg_inherits where (select inhparent::regclass::text) ~ '^parent_1_\d{7}$' order by 1; - inhrelid +SELECT count(*) = 1 AS child_parent_hierarchy_test +FROM pg_inherits +WHERE (SELECT inhparent::regclass::text) ~ '^parent_1_\d{7}$' + -- order of the child shard id is not guaranteed, but should be 
either 1904004 or 1904006
+    AND (inhrelid::regclass::text) IN ('parent_1_child_1_1904004', 'parent_1_child_1_1904006')
+ORDER BY 1;
+ child_parent_hierarchy_test
 ---------------------------------------------------------------------
- parent_1_child_1_1904006
+ t
 (1 row)
 
 -- check the shell partition
diff --git a/src/test/regress/sql/citus_local_tables.sql b/src/test/regress/sql/citus_local_tables.sql
index ace8c22c9..f61c97f9d 100644
--- a/src/test/regress/sql/citus_local_tables.sql
+++ b/src/test/regress/sql/citus_local_tables.sql
@@ -579,10 +579,14 @@ alter table parent_1 add constraint fkey2 foreign key(a) references parent_2(a);
 SELECT create_reference_table('ref_table');
 alter table parent_1 drop constraint fkey_test_drop;
 select count(*) from pg_constraint where conname = 'fkey_test_drop';
-SELECT * FROM pg_dist_shard ORDER BY shardid;
 -- verify we still preserve the child-parent hierarchy after all conversions
 -- check the shard partition
+SELECT count(*) = 1 AS child_parent_hierarchy_test
+FROM pg_inherits
+WHERE (SELECT inhparent::regclass::text) ~ '^parent_1_\d{7}$'
+    -- order of the child shard id is not guaranteed, but should be either 1904004 or 1904006
+    AND (inhrelid::regclass::text) IN ('parent_1_child_1_1904004', 'parent_1_child_1_1904006')
+ORDER BY 1;
 -- check the shell partition
 select inhrelid::regclass from pg_inherits where inhparent='parent_1'::regclass order by 1;
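
A minimal sketch of the connection budget behind the comment added in PATCH 37, assuming the stock max_connections = 100; the limit itself is never stated in the patches, so treat that value as an assumption:

# Connection budget for test_multiple_databases_distributed_deadlock_detection.
# ASSUMED_MAX_CONNECTIONS is an assumption (PostgreSQL's default), not a value
# taken from the patches above.
DATABASES_NUMBER = 40          # databases created by the test
QUERIES_PER_DATABASE = 2       # each deadlock pins two backends
MAINTENANCE_POOL_SIZE = 5      # citus.max_maintenance_shared_pool_size
ASSUMED_MAX_CONNECTIONS = 100  # assumption: stock postgresql.conf default

held_by_deadlocks = DATABASES_NUMBER * QUERIES_PER_DATABASE      # 80
headroom = ASSUMED_MAX_CONNECTIONS - held_by_deadlocks - MAINTENANCE_POOL_SIZE
print(f"deadlocked: {held_by_deadlocks}, headroom: {headroom}")  # 80, 15

This arithmetic is why the test both caps citus.max_maintenance_shared_pool_size at 5 and asserts that no "sorry, too many clients already" errors appear in any server log: with an unbounded maintenance pool, 40 maintenance daemons would exhaust the remaining slots.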
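
The GUC lifecycle used throughout these tests follows one pattern: ALTER SYSTEM SET via configure(), a restart or reload to apply it, and ALTER SYSTEM RESET via the new reset_configuration() helper, again followed by a reload. A hypothetical usage sketch; the scenario body is made up, "node" stands for any Postgres instance from the citus_tests fixtures, and the GUC is only an example:

def guc_roundtrip_example(node):
    # Runs ALTER SYSTEM SET for each argument; inert until restart/reload
    node.configure("citus.max_maintenance_shared_pool_size = 5")
    node.restart()

    # ... exercise whatever depends on the lowered pool size ...

    # Runs ALTER SYSTEM RESET for each argument; again inert until applied
    node.reset_configuration("citus.max_maintenance_shared_pool_size")
    node.reload()

A restart is only required for settings that cannot change at runtime; for the two deadlock-detection GUCs a plain reload() suffices, which is what enable_maintenance_when_deadlocks_ready() does once all deadlocks are in place.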
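
The teardown helpers also settle on dropping the citus extension before force-dropping the database, presumably so that the per-database maintenance daemon shuts down cleanly instead of being terminated mid-flight by WITH (FORCE), which PostgreSQL 13+ supports. A standalone sketch of the same two-step teardown outside the harness; the connection parameters and function name are placeholders:

import psycopg
from psycopg import sql

def force_drop_citus_database(conninfo, name):
    # Step 1: drop citus inside the target database so its maintenance
    # daemon stops on its own.
    with psycopg.connect(conninfo, dbname=name, autocommit=True) as conn:
        conn.execute("DROP EXTENSION IF EXISTS citus CASCADE")

    # Step 2: force-drop from a different database; FORCE terminates any
    # straggler backends still connected to the target.
    with psycopg.connect(conninfo, dbname="postgres", autocommit=True) as conn:
        conn.execute(
            sql.SQL("DROP DATABASE IF EXISTS {} WITH (FORCE)").format(
                sql.Identifier(name)
            )
        )

Autocommit matters here: DROP DATABASE refuses to run inside a transaction block, and psycopg otherwise opens one implicitly.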
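
Finally, the "too many clients" assertion reads the server log back through pg_read_file(), which needs superuser rights (or a pg_read_server_files grant). When the test runner shares a filesystem with the node, the same check can be expressed without SQL; a minimal sketch, assuming log_path points at the node's logfile as in the harness:

from pathlib import Path

def assert_no_connection_exhaustion(log_path: Path):
    # Scan the whole logfile for the connection-exhaustion message that
    # PATCH 37 asserts is absent on every node.
    log_text = log_path.read_text(errors="replace")
    assert "sorry, too many clients already" not in log_text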