/*-------------------------------------------------------------------------
 *
 * connection_management.c
 *   Central management of connections and their life-cycle
 *
 * Copyright (c) 2016, Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#ifdef HAVE_POLL_H
#include <poll.h>
#endif

#include "libpq-fe.h"

#include "miscadmin.h"

#include "access/hash.h"
#include "commands/dbcommands.h"
#include "distributed/connection_management.h"
#include "distributed/metadata_cache.h"
#include "distributed/hash_helpers.h"
#include "mb/pg_wchar.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"


/* (host,port,user,database) -> list of connections; see ConnectionHashEntry */
HTAB *ConnectionHash = NULL;

/* memory context holding the hash, its connection lists and the connections */
MemoryContext ConnectionContext = NULL;

static uint32 ConnectionHashHash(const void *key, Size keysize);
static int ConnectionHashCompare(const void *a, const void *b, Size keysize);
static MultiConnection * StartConnectionEstablishment(ConnectionHashKey *key);


/*
 * Initialize per-backend connection management infrastructure.
 *
 * Creates ConnectionContext as a child of TopMemoryContext and, inside it,
 * the ConnectionHash table keyed by ConnectionHashKey.
 */
void
InitializeConnectionManagement(void)
{
	HASHCTL info;
	uint32 hashFlags = 0;

	/*
	 * Create a single context for connection and transaction related memory
	 * management. Doing so, instead of allocating in TopMemoryContext, makes
	 * it easier to associate used memory.
	 */
	ConnectionContext = AllocSetContextCreate(TopMemoryContext,
											  "Connection Context",
											  ALLOCSET_DEFAULT_MINSIZE,
											  ALLOCSET_DEFAULT_INITSIZE,
											  ALLOCSET_DEFAULT_MAXSIZE);

	/* create (host,port,user,database) -> [connection] hash */
	memset(&info, 0, sizeof(info));
	info.keysize = sizeof(ConnectionHashKey);
	info.entrysize = sizeof(ConnectionHashEntry);
	info.hash = ConnectionHashHash;
	info.match = ConnectionHashCompare;
	info.hcxt = ConnectionContext;
	hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_COMPARE);

	ConnectionHash = hash_create("citus connection cache (host,port,user,database)",
								 64, &info, hashFlags);
}


/*
 * Perform connection management activity after the end of a transaction. Both
 * COMMIT and ABORT paths are handled here.
 *
 * Every cached connection is either closed and freed (not session lifetime,
 * or not in a reusable state), or has its per-transaction state reset.
 *
 * This is called by Citus' global transaction callback.
 */
void
AtEOXact_Connections(bool isCommit)
{
	HASH_SEQ_STATUS status;
	ConnectionHashEntry *entry;

	/*
	 * Close all remote connections if not necessary anymore (i.e. not session
	 * lifetime), or if in a failed state.
	 */
	hash_seq_init(&status, ConnectionHash);
	while ((entry = (ConnectionHashEntry *) hash_seq_search(&status)) != NULL)
	{
		ListCell *previousCell = NULL;
		ListCell *nextCell = NULL;
		ListCell *connectionCell = NULL;

		/*
		 * Have to iterate "manually", to be able to delete connections in the
		 * middle of the list.
		 */
		for (connectionCell = list_head(entry->connections);
			 connectionCell != NULL;
			 connectionCell = nextCell)
		{
			MultiConnection *connection =
				(MultiConnection *) lfirst(connectionCell);

			/* fetch the successor first, the current cell may get deleted */
			nextCell = lnext(connectionCell);

			/*
			 * To avoid code leaking connections we warn if connections are
			 * still claimed exclusively. We can only do so if the transaction
			 * committed, as it's normal that code didn't have chance to clean
			 * up after errors.
			 */
			if (isCommit && connection->claimedExclusively)
			{
				ereport(WARNING,
						(errmsg("connection claimed exclusively at transaction commit")));
			}

			/*
			 * Only let a connection live longer than a single transaction if
			 * instructed to do so by the caller. We also skip doing so if
			 * it's in a state that wouldn't allow us to run queries again.
			 */
			if (!connection->sessionLifespan ||
				PQstatus(connection->conn) != CONNECTION_OK ||
				PQtransactionStatus(connection->conn) != PQTRANS_IDLE)
			{
				PQfinish(connection->conn);
				connection->conn = NULL;

				/* previousCell intentionally stays put, it is still the predecessor */
				entry->connections =
					list_delete_cell(entry->connections, connectionCell, previousCell);

				pfree(connection);
			}
			else
			{
				/* reset per-transaction state */
				connection->activeInTransaction = false;
				UnclaimConnection(connection);

				previousCell = connectionCell;
			}
		}

		/*
		 * NB: We leave the hash entry in place, even if there's no individual
		 * connections in it anymore. There seems no benefit in deleting it,
		 * and it'll save a bit of work in the next transaction.
		 */
	}
}


/*
 * GetNodeConnection() establishes a connection to remote node, using default
 * user and database.
 *
 * See StartNodeUserDatabaseConnection for details.
 */
MultiConnection *
GetNodeConnection(uint32 flags, const char *hostname, int32 port)
{
	return GetNodeUserDatabaseConnection(flags, hostname, port, NULL, NULL);
}


/*
 * StartNodeConnection initiates a connection to remote node, using default
 * user and database.
 *
 * See StartNodeUserDatabaseConnection for details.
 */
MultiConnection *
StartNodeConnection(uint32 flags, const char *hostname, int32 port)
{
	return StartNodeUserDatabaseConnection(flags, hostname, port, NULL, NULL);
}


/*
 * GetNodeUserDatabaseConnection establishes connection to remote node.
 *
 * This starts the connection and then synchronously waits for establishment
 * to finish via FinishConnectionEstablishment(). The result of
 * StartNodeUserDatabaseConnection() is passed on unchecked, so flag
 * combinations for which that function returns NULL (CACHED_CONNECTION
 * without NEW_CONNECTION) must not be used here.
 *
 * See StartNodeUserDatabaseConnection for details.
 */
MultiConnection *
GetNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port,
							  const char *user, const char *database)
{
	MultiConnection *connection;

	connection = StartNodeUserDatabaseConnection(flags, hostname, port, user, database);

	FinishConnectionEstablishment(connection);

	return connection;
}


/*
 * StartNodeUserDatabaseConnection() initiates a connection to a remote node.
 *
 * If user or database are NULL, the current session's defaults are used. The
 * following flags influence connection establishment behaviour:
 * - NEW_CONNECTION - it is permitted to establish a new connection
 * - CACHED_CONNECTION - it is permitted to re-use an established connection
 * - SESSION_LIFESPAN - the connection should persist after transaction end
 * - FOR_DML - only meaningful for placement associated connections
 * - FOR_DDL - only meaningful for placement associated connections
 * - CRITICAL_CONNECTION - transaction failures on this connection fail the entire
 *   coordinated transaction
 *
 * The returned connection has only been initiated, not fully
 * established. That's useful to allow parallel connection establishment. If
 * that's not desired use the Get* variant.
 *
 * Returns NULL if only a cached connection was requested (CACHED_CONNECTION
 * without NEW_CONNECTION) and no unclaimed one exists. Raises an ERROR if a
 * connection that was not yet used in this transaction would have to be
 * handed out while XactModificationLevel > XACT_MODIFICATION_DATA.
 */
MultiConnection *
StartNodeUserDatabaseConnection(uint32 flags, const char *hostname, int32 port,
								const char *user, const char *database)
{
	ConnectionHashKey key;
	ConnectionHashEntry *entry = NULL;
	MultiConnection *connection = NULL;
	MemoryContext oldContext = NULL;
	bool found = false;

	/*
	 * Zero the key first: the whole struct is copied into the hash table on
	 * HASH_ENTER, so bytes behind the string terminators and any padding
	 * should have defined contents.
	 */
	memset(&key, 0, sizeof(key));

	strlcpy(key.hostname, hostname, MAX_NODE_LENGTH);
	key.port = port;
	if (user)
	{
		strlcpy(key.user, user, NAMEDATALEN);
	}
	else
	{
		strlcpy(key.user, CurrentUserName(), NAMEDATALEN);
	}
	if (database)
	{
		strlcpy(key.database, database, NAMEDATALEN);
	}
	else
	{
		strlcpy(key.database, get_database_name(MyDatabaseId), NAMEDATALEN);
	}

	if (CurrentCoordinatedTransactionState == COORD_TRANS_NONE)
	{
		CurrentCoordinatedTransactionState = COORD_TRANS_IDLE;
	}

	/*
	 * Lookup relevant hash entry. We always enter. If only a cached
	 * connection is desired, and there's none, we'll simply leave the
	 * connection list empty.
	 */
	entry = hash_search(ConnectionHash, &key, HASH_ENTER, &found);
	if (!found)
	{
		entry->connections = NIL;
	}

	if (flags & CACHED_CONNECTION)
	{
		ListCell *connectionCell = NULL;

		/* check connection cache for a connection that's not already in use */
		foreach(connectionCell, entry->connections)
		{
			connection = (MultiConnection *) lfirst(connectionCell);

			/* don't return claimed connections */
			if (!connection->claimedExclusively)
			{
				/*
				 * Check whether we're right now allowed to open new
				 * connections. A cached connection counts as new if it hasn't
				 * been used in this transaction.
				 *
				 * This has to happen before the connection is marked as
				 * active below, otherwise the check could never trigger.
				 *
				 * FIXME: This should be removed soon, once all connections go
				 * through this API.
				 */
				if (!connection->activeInTransaction &&
					XactModificationLevel > XACT_MODIFICATION_DATA)
				{
					ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
									errmsg("cannot open new connections after the first "
										   "modification command within a transaction")));
				}

				if (flags & SESSION_LIFESPAN)
				{
					connection->sessionLifespan = true;
				}
				connection->activeInTransaction = true;

				return connection;
			}

			/*
			 * One could argue for erroring out when the connection is in a
			 * failed state. But that'd be a bad idea for two reasons:
			 *
			 * 1) Generally starting a connection might fail, after calling
			 *    this function, so calling code needs to handle that anyway.
			 * 2) This might be used in code that transparently handles
			 *    connection failure.
			 */
		}

		/* no connection available, done if a new connection isn't desirable */
		if (!(flags & NEW_CONNECTION))
		{
			return NULL;
		}
	}

	/*
	 * Check whether we're right now allowed to open new connections.
	 *
	 * FIXME: This should be removed soon, once all connections go through
	 * this API.
	 */
	if (XactModificationLevel > XACT_MODIFICATION_DATA)
	{
		ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
						errmsg("cannot open new connections after the first modification "
							   "command within a transaction")));
	}

	/*
	 * Either no caching desired, or no pre-established, non-claimed,
	 * connection present. Initiate connection establishment.
	 */
	connection = StartConnectionEstablishment(&key);

	/* the list cell has to live as long as the hash entry pointing to it */
	oldContext = MemoryContextSwitchTo(ConnectionContext);
	entry->connections = lappend(entry->connections, connection);
	MemoryContextSwitchTo(oldContext);

	if (flags & SESSION_LIFESPAN)
	{
		connection->sessionLifespan = true;
	}

	connection->activeInTransaction = true;

	return connection;
}


/*
 * Synchronously finish connection establishment of an individual connection.
 *
 * Returns once the connection is established, has failed, or a single wait
 * for socket readiness exceeded the connect timeout. The function does not
 * report which of these happened; callers have to inspect
 * PQstatus(connection->conn) afterwards. An ERROR is raised only if poll()
 * itself fails for a reason other than being interrupted by a signal.
 *
 * TODO: Replace with variant waiting for multiple connections.
 */
void
FinishConnectionEstablishment(MultiConnection *connection)
{
	/*
	 * Loop until connection is established, or failed (possibly just timed
	 * out).
	 */
	while (true)
	{
		ConnStatusType status = PQstatus(connection->conn);
		PostgresPollingStatusType pollmode;

		if (status == CONNECTION_OK)
		{
			return;
		}

		/* FIXME: retries? */
		if (status == CONNECTION_BAD)
		{
			return;
		}

		pollmode = PQconnectPoll(connection->conn);

		/*
		 * FIXME: Do we want to add transparent retry support here?
		 */
		if (pollmode == PGRES_POLLING_FAILED)
		{
			return;
		}
		else if (pollmode == PGRES_POLLING_OK)
		{
			return;
		}
		else
		{
			Assert(pollmode == PGRES_POLLING_WRITING ||
				   pollmode == PGRES_POLLING_READING);
		}

		/* Loop, to handle poll() being interrupted by signals (EINTR) */
		while (true)
		{
			struct pollfd pollFileDescriptor;
			int pollResult = 0;

			pollFileDescriptor.fd = PQsocket(connection->conn);
			if (pollmode == PGRES_POLLING_READING)
			{
				pollFileDescriptor.events = POLLIN;
			}
			else
			{
				pollFileDescriptor.events = POLLOUT;
			}
			pollFileDescriptor.revents = 0;

			/*
			 * poll() expects its timeout in milliseconds.
			 * NOTE(review): assumes CLIENT_CONNECT_TIMEOUT_SECONDS_INT is in
			 * seconds, as its name says - confirm against its definition.
			 */
			pollResult = poll(&pollFileDescriptor, 1,
							  CLIENT_CONNECT_TIMEOUT_SECONDS_INT * 1000);

			if (pollResult == 0)
			{
				/*
				 * Timeout exceeded. Give up and hand the, not established,
				 * connection back; the caller detects that via PQstatus().
				 */
				return;
			}
			else if (pollResult > 0)
			{
				/* IO possible, continue connection establishment */
				break;
			}
			else if (errno == EINTR)
			{
				/* interrupted by a signal, retry the poll() */
			}
			else
			{
				/*
				 * We ERROR here, instead of just returning a failed
				 * connection, because this shouldn't happen, and indicates a
				 * programming error somewhere, not a network etc. issue.
				 */
				ereport(ERROR, (errcode_for_socket_access(),
								errmsg("poll() failed: %m")));
			}
		}
	}
}


/*
 * ClaimConnectionExclusively signals that this connection is actively being
 * used. That means it'll not be, again, returned by
 * StartNodeUserDatabaseConnection() et al until released with
 * UnclaimConnection().
 */
void
ClaimConnectionExclusively(MultiConnection *connection)
{
	Assert(!connection->claimedExclusively);
	connection->claimedExclusively = true;
}


/*
 * UnclaimConnection signals that this connection is not being used
 * anymore. That means it again may be returned by
 * StartNodeUserDatabaseConnection() et al.
 */
void
UnclaimConnection(MultiConnection *connection)
{
	connection->claimedExclusively = false;
}


/*
 * ConnectionHashHash is the hash callback for ConnectionHash. It combines the
 * hashes of all four key components.
 *
 * The hostname is hashed over MAX_NODE_LENGTH, the size it is stored with,
 * rather than NAMEDATALEN, so the hash covers the same bytes that
 * ConnectionHashCompare() looks at.
 */
static uint32
ConnectionHashHash(const void *key, Size keysize)
{
	ConnectionHashKey *entry = (ConnectionHashKey *) key;
	uint32 hash = 0;

	hash = string_hash(entry->hostname, MAX_NODE_LENGTH);
	hash = hash_combine(hash, hash_uint32(entry->port));
	hash = hash_combine(hash, string_hash(entry->user, NAMEDATALEN));
	hash = hash_combine(hash, string_hash(entry->database, NAMEDATALEN));

	return hash;
}


/*
 * ConnectionHashCompare is the match callback for ConnectionHash. It returns
 * 0 if both keys are equal in hostname, port, user and database, and 1
 * otherwise.
 *
 * The hostname has to be compared over MAX_NODE_LENGTH: comparing only
 * NAMEDATALEN bytes would treat distinct hosts sharing a long prefix as the
 * same node.
 */
static int
ConnectionHashCompare(const void *a, const void *b, Size keysize)
{
	ConnectionHashKey *ca = (ConnectionHashKey *) a;
	ConnectionHashKey *cb = (ConnectionHashKey *) b;

	if (strncmp(ca->hostname, cb->hostname, MAX_NODE_LENGTH) != 0 ||
		ca->port != cb->port ||
		strncmp(ca->user, cb->user, NAMEDATALEN) != 0 ||
		strncmp(ca->database, cb->database, NAMEDATALEN) != 0)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}


/*
 * Asynchronously establish connection to a remote node, but don't wait for
 * that to finish. DNS lookups etc. are performed synchronously though.
 *
 * The returned MultiConnection is allocated, zero-initialized, in
 * ConnectionContext. Its conn field holds whatever PQconnectStartParams()
 * returned; that result is not checked here.
 */
static MultiConnection *
StartConnectionEstablishment(ConnectionHashKey *key)
{
	char nodePortString[12];
	const char *clientEncoding = GetDatabaseEncodingName();
	MultiConnection *connection = NULL;

	const char *keywords[] = {
		"host", "port", "dbname", "user",
		"client_encoding", "fallback_application_name",
		NULL
	};

	/* nodePortString is referenced here, and filled in further below */
	const char *values[] = {
		key->hostname, nodePortString, key->database, key->user,
		clientEncoding, "citus",
		NULL
	};

	connection = MemoryContextAllocZero(ConnectionContext, sizeof(MultiConnection));

	snprintf(nodePortString, sizeof(nodePortString), "%d", key->port);

	strlcpy(connection->hostname, key->hostname, MAX_NODE_LENGTH);
	connection->port = key->port;
	strlcpy(connection->database, key->database, NAMEDATALEN);
	strlcpy(connection->user, key->user, NAMEDATALEN);

	connection->conn = PQconnectStartParams(keywords, values, false);

	return connection;
}