/*
 * citus/src/backend/distributed/connection/placement_connection.c
 * (1138 lines, 34 KiB, C)
 */

/*-------------------------------------------------------------------------
*
* placement_connection.c
* Per placement connection handling.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/hash.h"
#include "common/hashfn.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_version_constants.h"
#include "distributed/colocation_utils.h"
#include "distributed/connection_management.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/distributed_planner.h"
#include "distributed/hash_helpers.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/placement_connection.h"
#include "distributed/relation_access_tracking.h"
/*
* A connection reference is used to register that a connection has been used
* to read or modify either a) a shard placement as a particular user b) a
* group of colocated placements (which depend on whether the reference is
* from ConnectionPlacementHashEntry or ColocatedPlacementHashEntry).
*/
typedef struct ConnectionReference
{
/*
* The user used to read/modify the placement. We cannot reuse connections
* that were performed using a different role, since it would not have the
* right permissions.
*
* The string is copied into TopTransactionContext when a connection is
* assigned (see AssignPlacementListToConnection).
*/
const char *userName;
/*
* The connection. NULL until a connection is assigned, and set back to
* NULL by CloseShardPlacementAssociation when the connection is closed.
*/
MultiConnection *connection;
/*
* Information about what the connection is used for. There can only be
* one connection executing DDL/DML for a placement to avoid deadlock
* issues/read-your-own-writes violations. The difference between DDL/DML
* currently is only used to emit more precise error messages.
*/
bool hadDML;
bool hadDDL;
/*
* Colocation group of the placement, if any, and the value identifying the
* placement's range within that group. Both are filled in only for
* references created for a set of co-located placements (see
* FindOrCreatePlacementEntry) and are compared in
* ConnectionAccessedDifferentPlacement.
*/
uint32 colocationGroupId;
uint32 representativeValue;
/* placementId of the placement, used only for append distributed tables */
uint64 placementId;
/* membership in MultiConnection->referencedPlacements */
dlist_node connectionNode;
} ConnectionReference;
/*
* Forward declaration: ConnectionPlacementHashEntry holds a pointer to this
* struct, which is only defined further down in this file.
*/
struct ColocatedPlacementsHashEntry;
/*
* Hash table mapping placements to a list of connections.
*
* This stores a list of connections for each placement, because multiple
* connections to the same placement may exist at the same time. E.g. an
* adaptive executor query may reference the same placement in several
* sub-tasks.
*
* We keep track of whether a connection has executed DML or DDL, since we can
* only ever allow a single connection to do either, to prevent deadlocks and
* consistency violations (e.g. read-your-own-writes).
*/
/* hash key of ConnectionPlacementHash */
typedef struct ConnectionPlacementHashKey
{
/* copied from ShardPlacement->placementId in FindOrCreatePlacementEntry */
uint64 placementId;
} ConnectionPlacementHashKey;
/* hash entry of ConnectionPlacementHash: per-placement connection state */
typedef struct ConnectionPlacementHashEntry
{
ConnectionPlacementHashKey key;
/*
* did any remote transactions fail?
*
* Initialized to false in FindOrCreatePlacementEntry; it is not otherwise
* read or written in the visible part of this file.
*/
bool failed;
/*
* Primary connection used to access the placement.
*
* For DISTRIBUTE_BY_HASH / DISTRIBUTE_BY_NONE placements this points to the
* reference owned by the ColocatedPlacementsHashEntry and is therefore shared
* by all co-located placements; otherwise it is a reference private to this
* placement (see FindOrCreatePlacementEntry).
*/
ConnectionReference *primaryConnection;
/* are any other connections reading from the placements? */
bool hasSecondaryConnections;
/* entry for the set of co-located placements, or NULL if there is none */
struct ColocatedPlacementsHashEntry *colocatedEntry;
/* membership in ConnectionShardHashEntry->placementConnections */
dlist_node shardNode;
} ConnectionPlacementHashEntry;
/*
* hash table: ConnectionPlacementHashKey -> ConnectionPlacementHashEntry.
* Created in InitPlacementConnectionManagement and emptied in
* ResetPlacementConnectionManagement.
*/
static HTAB *ConnectionPlacementHash;
/*
* A hash-table mapping colocated placements to connections. Colocated
* placements being the set of placements on a single node that represent the
* same value range. This is needed because connections for colocated
* placements (i.e. the corresponding placements for different colocated
* distributed tables) need to share connections. Otherwise things like
* foreign keys can very easily lead to unprincipled deadlocks. This means
* that there can only be one DML/DDL connection for a set of colocated
* placements.
*
* A set of colocated placements is identified, besides node identifying
* information, by the associated colocation group id and the placement's
* 'representativeValue', which currently is the lower boundary of its
* hash-range.
*
* Note that this hash-table only contains entries for placements whose
* partition method is DISTRIBUTE_BY_HASH or DISTRIBUTE_BY_NONE (see
* FindOrCreatePlacementEntry), because others so far don't support colocation.
*/
/*
* hash key of ColocatedPlacementsHash
*
* The key is hashed and compared field by field by
* ColocatedPlacementsHashHash and ColocatedPlacementsHashCompare.
*/
typedef struct ColocatedPlacementsHashKey
{
/* to identify host - database can't differ */
uint32 nodeId;
/* colocation group, or invalid */
uint32 colocationGroupId;
/* to represent the value range */
uint32 representativeValue;
} ColocatedPlacementsHashKey;
/* hash entry of ColocatedPlacementsHash: state shared by co-located placements */
typedef struct ColocatedPlacementsHashEntry
{
ColocatedPlacementsHashKey key;
/*
* Primary connection used to access the co-located placements. The
* reference is allocated in TopTransactionContext when the entry is created
* and is also handed to every ConnectionPlacementHashEntry of the set (see
* FindOrCreatePlacementEntry).
*/
ConnectionReference *primaryConnection;
/* are any other connections reading from the placements? */
bool hasSecondaryConnections;
} ColocatedPlacementsHashEntry;
/*
* hash table: ColocatedPlacementsHashKey -> ColocatedPlacementsHashEntry.
* Created in InitPlacementConnectionManagement and emptied in
* ResetPlacementConnectionManagement.
*/
static HTAB *ColocatedPlacementsHash;
/*
* Hash table mapping shard ids to placements.
*
* This is used to track whether placements of a shard have to be marked
* invalid after a failure, or whether a coordinated transaction has to be
* aborted, to avoid all placements of a shard being marked invalid.
*/
/* hash key of ConnectionShardHash */
typedef struct ConnectionShardHashKey
{
/* copied from ShardPlacement->shardId in AssociatePlacementWithShard */
uint64 shardId;
} ConnectionShardHashKey;
/* hash entry of ConnectionShardHash */
typedef struct ConnectionShardHashEntry
{
ConnectionShardHashKey key;
/*
* List of ConnectionPlacementHashEntry structs (linked through their
* shardNode member) for the placements of this shard that were looked up
* in the current transaction.
*/
dlist_head placementConnections;
} ConnectionShardHashEntry;
/*
* hash table: ConnectionShardHashKey -> ConnectionShardHashEntry.
* Created in InitPlacementConnectionManagement and emptied in
* ResetPlacementConnectionManagement.
*/
static HTAB *ConnectionShardHash;
/* forward declarations of functions that are local to this file */
static MultiConnection * FindPlacementListConnection(int flags, List *placementAccessList,
const char *userName);
static ConnectionPlacementHashEntry * FindOrCreatePlacementEntry(
ShardPlacement *placement);
static bool CanUseExistingConnection(uint32 flags, const char *userName,
ConnectionReference *placementConnection);
static bool ConnectionAccessedDifferentPlacement(MultiConnection *connection,
ShardPlacement *placement);
static void AssociatePlacementWithShard(ConnectionPlacementHashEntry *placementEntry,
ShardPlacement *placement);
static bool HasModificationFailedForShard(ConnectionShardHashEntry *shardEntry);
/* hash and match callbacks of ColocatedPlacementsHash */
static uint32 ColocatedPlacementsHashHash(const void *key, Size keysize);
static int ColocatedPlacementsHashCompare(const void *a, const void *b, Size keysize);
/*
 * GetPlacementConnection returns a connection for the given placement after
 * finishing its establishment.
 *
 * The connection is obtained via StartPlacementConnection; see that function
 * for the meaning of flags and userName. NULL is returned when no connection
 * could be started, which is only expected when OPTIONAL_CONNECTION is set.
 */
MultiConnection *
GetPlacementConnection(uint32 flags, ShardPlacement *placement, const char *userName)
{
	MultiConnection *placementConnection =
		StartPlacementConnection(flags, placement, userName);

	if (placementConnection != NULL)
	{
		/* complete the connection establishment before handing it out */
		FinishConnectionEstablishment(placementConnection);

		return placementConnection;
	}

	/* only optional connections are allowed to come back as NULL */
	Assert((flags & OPTIONAL_CONNECTION));

	return NULL;
}
/*
 * StartPlacementConnection initiates a connection to a remote node,
 * associated with the placement and transaction.
 *
 * The connection is established for the current database. If userName is NULL
 * the current user is used, otherwise the provided one.
 *
 * Flags have the corresponding meaning from StartNodeUserDatabaseConnection,
 * except that two additional flags have an effect:
 * - FOR_DML - signal that connection is going to be used for DML (modifications)
 * - FOR_DDL - signal that connection is going to be used for DDL
 * When both are set, FOR_DDL wins; when neither is set, the access is a SELECT.
 *
 * Only one connection associated with the placement may have FOR_DML or
 * FOR_DDL set. For hash-partitioned tables only one connection for a set of
 * colocated placements may have FOR_DML/DDL set. This restriction prevents
 * deadlocks and wrong results due to in-progress transactions.
 *
 * The work is delegated to StartPlacementListConnection with a one-element
 * placement access list.
 */
MultiConnection *
StartPlacementConnection(uint32 flags, ShardPlacement *placement, const char *userName)
{
	ShardPlacementAccess *singleAccess = palloc0(sizeof(ShardPlacementAccess));

	singleAccess->placement = placement;

	/* plain reads are the default, the write flags override it */
	singleAccess->accessType = PLACEMENT_ACCESS_SELECT;
	if (flags & FOR_DDL)
	{
		singleAccess->accessType = PLACEMENT_ACCESS_DDL;
	}
	else if (flags & FOR_DML)
	{
		singleAccess->accessType = PLACEMENT_ACCESS_DML;
	}

	List *singleAccessList = list_make1(singleAccess);

	return StartPlacementListConnection(flags, singleAccessList, userName);
}
/*
 * StartPlacementListConnection returns a connection to a remote node that is
 * suitable for the given placement accesses (SELECT, DML, DDL).
 *
 * If the placements were already accessed in this transaction, the connection
 * that has to be reused is returned (FindPlacementListConnection throws an
 * error when reusing it would cause a self-deadlock or consistency
 * violation). Otherwise a connection to the node of the first placement in the
 * list is started via StartNodeUserDatabaseConnection.
 *
 * placementAccessList is a list of ShardPlacementAccess pointers and must not
 * be empty. If userName is NULL the current user is used.
 *
 * Returns NULL only if no connection could be started, which is only expected
 * when OPTIONAL_CONNECTION is set in flags. In that case nothing is recorded
 * for the placements.
 */
MultiConnection *
StartPlacementListConnection(uint32 flags, List *placementAccessList,
							 const char *userName)
{
	char *freeUserName = NULL;

	if (userName == NULL)
	{
		userName = freeUserName = CurrentUserName();
	}

	MultiConnection *chosenConnection = FindPlacementListConnection(flags,
																	placementAccessList,
																	userName);
	if (chosenConnection == NULL)
	{
		/* use the first placement from the list to extract nodename and nodeport */
		ShardPlacementAccess *placementAccess =
			(ShardPlacementAccess *) linitial(placementAccessList);
		ShardPlacement *placement = placementAccess->placement;
		char *nodeName = placement->nodeName;
		int nodePort = placement->nodePort;

		/*
		 * No suitable connection in the placement->connection mapping, get one from
		 * the node->connection pool.
		 */
		chosenConnection = StartNodeUserDatabaseConnection(flags, nodeName, nodePort,
														   userName, NULL);

		if (chosenConnection != NULL &&
			(flags & REQUIRE_CLEAN_CONNECTION) &&
			ConnectionAccessedDifferentPlacement(chosenConnection, placement))
		{
			/*
			 * Cached connection accessed a non-co-located placement in the same
			 * table or co-location group, while the caller asked for a clean
			 * connection. Open a new connection instead.
			 *
			 * We use this for situations in which we want to use a different
			 * connection for every placement, such as COPY. If we blindly returned
			 * a cached connection that already modified a different, non-co-located
			 * placement B in the same table or in a table with the same co-location
			 * ID as the current placement, then we'd no longer be able to write to
			 * placement B later in the COPY.
			 */
			chosenConnection = StartNodeUserDatabaseConnection(flags |
															   FORCE_NEW_CONNECTION,
															   nodeName, nodePort,
															   userName, NULL);

			Assert(chosenConnection == NULL ||
				   !ConnectionAccessedDifferentPlacement(chosenConnection, placement));
		}
	}

	if (chosenConnection != NULL)
	{
		/* remember which connection we're going to use to access the placements */
		AssignPlacementListToConnection(placementAccessList, chosenConnection);
	}
	else
	{
		/* connection can only be NULL for optional connections */
		Assert((flags & OPTIONAL_CONNECTION));
	}

	/*
	 * Single exit point, so that the user name we allocated ourselves is
	 * released on the NULL-returning paths as well.
	 */
	if (freeUserName)
	{
		pfree(freeUserName);
	}

	return chosenConnection;
}
/*
* AssignPlacementListToConnection assigns a set of shard placement accesses to a
* given connection, meaning that connection must be used for all (conflicting)
* accesses of the same shard placements to make sure reads see writes and to
* make sure we don't take conflicting locks.
*
* placementAccessList is a list of ShardPlacementAccess pointers. For every
* placement in it (except dummy placements with INVALID_SHARD_ID) the function
* updates the placement's primary ConnectionReference, records whether the
* access is DDL or DML, and records the relation access.
*/
void
AssignPlacementListToConnection(List *placementAccessList, MultiConnection *connection)
{
/* the reference remembers the user of the connection, not of the caller */
const char *userName = connection->user;
ShardPlacementAccess *placementAccess = NULL;
foreach_ptr(placementAccess, placementAccessList)
{
ShardPlacement *placement = placementAccess->placement;
ShardPlacementAccessType accessType = placementAccess->accessType;
if (placement->shardId == INVALID_SHARD_ID)
{
/*
* When a SELECT prunes down to 0 shard, we use a dummy placement
* which is only used to route the query to a worker node, but
* the SELECT doesn't actually access any shard placement.
*
* FIXME: this can be removed if we evaluate empty SELECTs locally.
*/
continue;
}
ConnectionPlacementHashEntry *placementEntry = FindOrCreatePlacementEntry(
placement);
/*
* For placements that have a colocatedEntry this reference is shared with
* all co-located placements (see FindOrCreatePlacementEntry).
*/
ConnectionReference *placementConnection = placementEntry->primaryConnection;
if (placementConnection->connection == connection)
{
/* using the connection that was already assigned to the placement */
}
else if (placementConnection->connection == NULL)
{
/* placement does not have a connection assigned yet */
placementConnection->connection = connection;
placementConnection->hadDDL = false;
placementConnection->hadDML = false;
placementConnection->userName = MemoryContextStrdup(TopTransactionContext,
userName);
placementConnection->placementId = placementAccess->placement->placementId;
/* record association with connection */
dlist_push_tail(&connection->referencedPlacements,
&placementConnection->connectionNode);
}
else
{
/* using a different connection than the one assigned to the placement */
if (accessType != PLACEMENT_ACCESS_SELECT)
{
/*
* We previously read from the placement, but now we're writing to
* it (if we had written to the placement, we would have either chosen
* the same connection, or errored out). Update the connection reference
* to point to the connection used for writing. We don't need to remember
* the existing connection since we won't be able to reuse it for
* accessing the placement. However, we do register that it exists in
* hasSecondaryConnections.
*/
placementConnection->connection = connection;
placementConnection->userName = MemoryContextStrdup(TopTransactionContext,
userName);
Assert(!placementConnection->hadDDL);
Assert(!placementConnection->hadDML);
/*
* record association with connection
*
* NOTE(review): connectionNode was linked into the previous
* connection's referencedPlacements list when that connection was
* assigned, and it is not removed from that list before being pushed
* here - confirm that this is intended.
*/
dlist_push_tail(&connection->referencedPlacements,
&placementConnection->connectionNode);
}
/*
* There are now multiple connections that read from the placement
* and DDL commands are forbidden.
*/
placementEntry->hasSecondaryConnections = true;
if (placementEntry->colocatedEntry != NULL)
{
/* we also remember this for co-located placements */
placementEntry->colocatedEntry->hasSecondaryConnections = true;
}
}
/*
* Remember that we used the current connection for writes.
*
* Note that the flags are set on the placement's primary reference, even
* when a SELECT over a different connection went through the branch above
* (in which case accessType matches neither check below).
*/
if (accessType == PLACEMENT_ACCESS_DDL)
{
placementConnection->hadDDL = true;
}
if (accessType == PLACEMENT_ACCESS_DML)
{
placementConnection->hadDML = true;
}
/* record the relation access */
Oid relationId = RelationIdForShard(placement->shardId);
RecordRelationAccessIfNonDistTable(relationId, accessType);
}
}
/*
 * GetConnectionIfPlacementAccessedInXact returns the connection over which
 * the placements in placementAccessList have been accessed in the current
 * transaction, as determined by FindPlacementListConnection. If there is no
 * such connection, NULL is returned.
 *
 * If userName is NULL, the current user name is used for the lookup.
 */
MultiConnection *
GetConnectionIfPlacementAccessedInXact(int flags, List *placementAccessList,
									   const char *userName)
{
	if (userName != NULL)
	{
		return FindPlacementListConnection(flags, placementAccessList, userName);
	}

	/* no user given: look up with the current user and release the name after */
	char *currentUserName = CurrentUserName();
	MultiConnection *accessedConnection =
		FindPlacementListConnection(flags, placementAccessList, currentUserName);

	if (currentUserName != NULL)
	{
		pfree(currentUserName);
	}

	return accessedConnection;
}
/*
* FindPlacementListConnection determines whether there is a connection that must
* be used to perform the given placement accesses.
*
* If a placement was only read in this transaction, then the same connection must
* be used for DDL to prevent self-deadlock. If a placement was modified in this
* transaction, then the same connection must be used for all subsequent accesses
* to ensure read-your-writes consistency and prevent self-deadlock. If those
* conditions cannot be met, because a connection is in use or the placements in
* the placement access list were modified over multiple connections, then this
* function throws an error.
*
* flags and userName are only used to decide whether an existing connection can
* be reused (see CanUseExistingConnection). placementAccessList is a list of
* ShardPlacementAccess pointers.
*
* The function returns the connection that needs to be used, if such a connection
* exists, and NULL otherwise.
*/
static MultiConnection *
FindPlacementListConnection(int flags, List *placementAccessList, const char *userName)
{
/* set once chosenConnection is a connection that performed DDL or DML */
bool foundModifyingConnection = false;
MultiConnection *chosenConnection = NULL;
/*
* Go through all placement accesses to find a suitable connection.
*
* If none of the placements have been accessed in this transaction, connection
* remains NULL.
*
* If one or more of the placements have been modified in this transaction, then
* use the connection that performed the write. If placements have been written
* over multiple connections or the connection is not available, error out.
*
* If placements have only been read in this transaction, then use the last
* suitable connection found for a placement in the placementAccessList.
*/
ShardPlacementAccess *placementAccess = NULL;
foreach_ptr(placementAccess, placementAccessList)
{
ShardPlacement *placement = placementAccess->placement;
ShardPlacementAccessType accessType = placementAccess->accessType;
if (placement->shardId == INVALID_SHARD_ID)
{
/*
* When a SELECT prunes down to 0 shard, we use a dummy placement.
* In that case, we can fall back to the default connection.
*
* FIXME: this can be removed if we evaluate empty SELECTs locally.
*/
continue;
}
/* note that this adds hash entries for placements that were not seen yet */
ConnectionPlacementHashEntry *placementEntry = FindOrCreatePlacementEntry(
placement);
ColocatedPlacementsHashEntry *colocatedEntry = placementEntry->colocatedEntry;
ConnectionReference *placementConnection = placementEntry->primaryConnection;
/*
* The order of the branches below matters: the first matching one decides.
*
* note: the Asserts below are primarily for clarifying the conditions
*/
if (placementConnection->connection == NULL)
{
/* no connection has been chosen for the placement */
}
else if (accessType == PLACEMENT_ACCESS_DDL &&
placementEntry->hasSecondaryConnections)
{
/*
* If a placement has been read over multiple connections (typically as
* a result of a reference table join) then a DDL command on the placement
* would create a self-deadlock.
*/
Assert(placementConnection != NULL);
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform DDL on placement " UINT64_FORMAT
", which has been read over multiple connections",
placement->placementId)));
}
else if (accessType == PLACEMENT_ACCESS_DDL && colocatedEntry != NULL &&
colocatedEntry->hasSecondaryConnections)
{
/*
* If a placement has been read over multiple (uncommitted) connections
* then a DDL command on a co-located placement may create a self-deadlock
* if there exists some relationship between the co-located placements
* (e.g. foreign key, partitioning).
*/
Assert(placementConnection != NULL);
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform DDL on placement " UINT64_FORMAT
" since a co-located placement has been read over multiple connections",
placement->placementId)));
}
else if (foundModifyingConnection)
{
/*
* We already found a connection that performed writes on one of the
* placements and must use it.
*/
if ((placementConnection->hadDDL || placementConnection->hadDML) &&
placementConnection->connection != chosenConnection)
{
/*
* The current placement may have been modified over a different
* connection. Neither connection is guaranteed to see all uncommitted
* writes and therefore we cannot proceed.
*/
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform query with placements that were "
"modified over multiple connections")));
}
}
else if (accessType == PLACEMENT_ACCESS_SELECT &&
placementEntry->hasSecondaryConnections &&
!placementConnection->hadDDL && !placementConnection->hadDML)
{
/*
* Two separate connections have already selected from this placement
* and it was not modified. There is no benefit to using this connection.
*/
}
else if (CanUseExistingConnection(flags, userName, placementConnection))
{
/*
* There is an existing connection for the placement and we can use it.
*/
Assert(placementConnection != NULL);
chosenConnection = placementConnection->connection;
if (placementConnection->hadDDL || placementConnection->hadDML)
{
/* this connection performed writes, we must use it */
foundModifyingConnection = true;
}
}
else if (placementConnection->hadDDL || placementConnection->hadDML)
{
/*
* The placement was modified, but the connection that did so cannot be
* reused. Report the user mismatch specifically, since that is the one
* cause we can name; anything else is unexpected.
*/
if (strcmp(placementConnection->userName, userName) != 0)
{
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform query on placements that were "
"modified in this transaction by a different "
"user")));
}
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform query, because modifications were "
"made over a connection that cannot be used at "
"this time. This is most likely a Citus bug so "
"please report it"
)));
}
}
return chosenConnection;
}
/*
 * FindOrCreatePlacementEntry returns the ConnectionPlacementHash entry of the
 * given placement, adding and initializing it first if the placement has not
 * been looked up in the current transaction yet.
 *
 * A new entry for a DISTRIBUTE_BY_HASH or DISTRIBUTE_BY_NONE placement gets
 * the connection reference of its ColocatedPlacementsHash entry (which is
 * created on demand as well), such that all co-located placements share a
 * single reference. A new entry for any other placement gets a reference of
 * its own. References are allocated in TopTransactionContext.
 *
 * In all cases the entry is associated with the shard of the placement.
 */
static ConnectionPlacementHashEntry *
FindOrCreatePlacementEntry(ShardPlacement *placement)
{
	ConnectionPlacementHashKey placementKey = {
		.placementId = placement->placementId
	};
	bool placementEntryFound = false;

	ConnectionPlacementHashEntry *entry =
		hash_search(ConnectionPlacementHash, &placementKey, HASH_ENTER,
					&placementEntryFound);

	if (!placementEntryFound)
	{
		/* start out without any connection chosen for this placement */
		entry->failed = false;
		entry->primaryConnection = NULL;
		entry->hasSecondaryConnections = false;
		entry->colocatedEntry = NULL;

		bool sharesColocatedReference =
			placement->partitionMethod == DISTRIBUTE_BY_HASH ||
			placement->partitionMethod == DISTRIBUTE_BY_NONE;

		if (!sharesColocatedReference)
		{
			/* the placement gets a zeroed reference of its own */
			entry->primaryConnection =
				MemoryContextAllocZero(TopTransactionContext,
									   sizeof(ConnectionReference));
		}
		else
		{
			ColocatedPlacementsHashKey colocatedKey = {
				.nodeId = placement->nodeId,
				.colocationGroupId = placement->colocationGroupId,
				.representativeValue = placement->representativeValue
			};
			bool colocatedEntryFound = false;

			/* find, or add, the entry for the set of co-located placements */
			ColocatedPlacementsHashEntry *colocatedEntry =
				hash_search(ColocatedPlacementsHash, &colocatedKey, HASH_ENTER,
							&colocatedEntryFound);

			if (!colocatedEntryFound)
			{
				/* one reference serves the entire set of co-located placements */
				ConnectionReference *sharedReference =
					MemoryContextAllocZero(TopTransactionContext,
										   sizeof(ConnectionReference));

				/*
				 * Keep the co-location information in the reference, which
				 * later allows checking whether a connection accessed
				 * different placements of the same co-location group.
				 */
				sharedReference->colocationGroupId = placement->colocationGroupId;
				sharedReference->representativeValue = placement->representativeValue;

				colocatedEntry->primaryConnection = sharedReference;
				colocatedEntry->hasSecondaryConnections = false;
			}

			/* the placement uses the reference of its co-located set */
			entry->primaryConnection = colocatedEntry->primaryConnection;
			entry->colocatedEntry = colocatedEntry;
		}
	}

	/* record association with shard, for invalidation */
	AssociatePlacementWithShard(entry, placement);

	return entry;
}
/*
 * CanUseExistingConnection is a helper function for
 * FindPlacementListConnection() that checks whether the connection of the
 * given connection reference can be reused.
 *
 * The connection is reusable only if it still exists, is not claimed
 * exclusively, the caller did not ask for FORCE_NEW_CONNECTION, and it was
 * assigned for the same user name.
 */
static bool
CanUseExistingConnection(uint32 flags, const char *userName,
						 ConnectionReference *connectionReference)
{
	MultiConnection *connection = connectionReference->connection;

	if (connection == NULL)
	{
		/* the connection was already closed, so there is nothing to reuse */
		return false;
	}

	if (connection->claimedExclusively)
	{
		/* somebody else is using the connection */
		return false;
	}

	if (flags & FORCE_NEW_CONNECTION)
	{
		/* the caller does not want to reuse connections */
		return false;
	}

	/* lastly, the connection must have been assigned for the same user */
	return strcmp(connectionReference->userName, userName) == 0;
}
/*
 * ConnectionAccessedDifferentPlacement returns true if the connection
 * references a placement that conflicts with the given one, which is the
 * case when either:
 * - the given placement is not hash distributed and the connection references
 *   a different placementId, or
 * - the given placement has a valid colocation group and the connection
 *   references a placement in the same group with a different representative
 *   value, meaning the two are not strictly colocated.
 */
static bool
ConnectionAccessedDifferentPlacement(MultiConnection *connection,
									 ShardPlacement *placement)
{
	/* these properties of the placement do not change while iterating */
	bool isHashDistributed = (placement->partitionMethod == DISTRIBUTE_BY_HASH);
	bool hasColocationGroup =
		(placement->colocationGroupId != INVALID_COLOCATION_ID);

	dlist_iter referenceIter;
	dlist_foreach(referenceIter, &connection->referencedPlacements)
	{
		ConnectionReference *reference =
			dlist_container(ConnectionReference, connectionNode, referenceIter.cur);

		/* tables that are not hash distributed are compared by placementId */
		if (!isHashDistributed &&
			reference->placementId != placement->placementId)
		{
			return true;
		}

		/* within a co-location group the representative values must match */
		bool inSameColocationGroup =
			hasColocationGroup &&
			reference->colocationGroupId == placement->colocationGroupId;

		if (inSameColocationGroup &&
			reference->representativeValue != placement->representativeValue)
		{
			/* non-co-located placements from the same co-location group */
			return true;
		}
	}

	return false;
}
/*
 * ConnectionModifiedPlacement returns true if DML or DDL has been recorded for
 * any placement referenced by the connection.
 *
 * Two special cases are decided before looking at the references:
 * - a connection whose remote transaction has not started yet is never
 *   considered to have modified anything, and
 * - a connection without any referenced placements is always considered to
 *   have modified something.
 */
bool
ConnectionModifiedPlacement(MultiConnection *connection)
{
	bool remoteTransactionStarted =
		connection->remoteTransaction.transactionState != REMOTE_TRANS_NOT_STARTED;

	if (!remoteTransactionStarted)
	{
		/*
		 * StartPlacementListConnection() sets hadDDL/hadDML before the
		 * actual command is sent to the remote node. When we are called at
		 * that point, the connection cannot have done any modification yet.
		 */
		return false;
	}

	if (dlist_is_empty(&connection->referencedPlacements))
	{
		/*
		 * An empty referencedPlacements list means that we got here via an
		 * API that uses a node connection (e.g., not a placement connection),
		 * which doesn't set placements. The command sent could have been a
		 * write or a read, so we assume a write to be on the safe side.
		 */
		return true;
	}

	dlist_iter referenceIter;
	dlist_foreach(referenceIter, &connection->referencedPlacements)
	{
		ConnectionReference *reference =
			dlist_container(ConnectionReference, connectionNode, referenceIter.cur);
		bool performedWrite = reference->hadDDL || reference->hadDML;

		if (performedWrite)
		{
			return true;
		}
	}

	return false;
}
/*
 * AssociatePlacementWithShard records the shard->placement relation in
 * ConnectionShardHash, adding the entry for the shard if necessary.
 *
 * The per-shard list of placement entries is later walked by
 * HasModificationFailedForShard.
 */
static void
AssociatePlacementWithShard(ConnectionPlacementHashEntry *placementEntry,
							ShardPlacement *placement)
{
	ConnectionShardHashKey shardKey = {
		.shardId = placement->shardId
	};
	bool shardEntryFound = false;

	ConnectionShardHashEntry *shardEntry =
		hash_search(ConnectionShardHash, &shardKey, HASH_ENTER, &shardEntryFound);

	if (!shardEntryFound)
	{
		/* first placement we see for this shard */
		dlist_init(&shardEntry->placementConnections);
	}

	/*
	 * The placement may be associated with the shard already (happens if
	 * there are multiple connections for a placement). A shard usually has
	 * only a few placements, so scanning the list is cheap.
	 */
	bool alreadyAssociated = false;
	dlist_iter entryIter;
	dlist_foreach(entryIter, &shardEntry->placementConnections)
	{
		ConnectionPlacementHashEntry *associatedEntry =
			dlist_container(ConnectionPlacementHashEntry, shardNode, entryIter.cur);

		if (associatedEntry->key.placementId == placement->placementId)
		{
			alreadyAssociated = true;
			break;
		}
	}

	if (!alreadyAssociated)
	{
		dlist_push_tail(&shardEntry->placementConnections,
						&placementEntry->shardNode);
	}
}
/*
 * CloseShardPlacementAssociation handles a connection being closed before
 * transaction end, by clearing the connection pointer in every connection
 * reference that is linked into the connection's referencedPlacements list.
 *
 * This should only be called by connection_management.c.
 */
void
CloseShardPlacementAssociation(struct MultiConnection *connection)
{
	dlist_iter referenceIter;

	dlist_foreach(referenceIter, &connection->referencedPlacements)
	{
		ConnectionReference *closedReference =
			dlist_container(ConnectionReference, connectionNode, referenceIter.cur);

		/*
		 * Only the reference itself is cleared.
		 * ConnectionPlacementHashEntry's primaryConnection is deliberately
		 * left alone, since resetting it would be more complicated than it
		 * seems worth. As a consequence we error out spuriously if a
		 * connection that executed DML/DDL is closed earlier in a
		 * transaction.
		 */
		closedReference->connection = NULL;
	}
}
/*
* ResetShardPlacementAssociation resets the association of connections to
* shard placements at the end of a transaction.
*
* The connection's referencedPlacements list is re-initialized to an empty
* list; the ConnectionReference structs that were linked into it are not
* touched here.
*
* This should only be called by connection_management.c.
*/
void
ResetShardPlacementAssociation(struct MultiConnection *connection)
{
dlist_init(&connection->referencedPlacements);
}
/*
 * ResetPlacementConnectionManagement() dissociates connections from
 * placements and shards by emptying the three hash tables of this file. This
 * will be called at the end of XACT_EVENT_COMMIT and XACT_EVENT_ABORT.
 */
void
ResetPlacementConnectionManagement(void)
{
	/*
	 * Only the hash entries are removed here. The ConnectionReference structs
	 * and their subordinate data live in TopTransactionContext, so their
	 * memory is released together with that context.
	 */

	/* placement -> connection mappings, per placement and per co-located set */
	hash_delete_all(ConnectionPlacementHash);
	hash_delete_all(ColocatedPlacementsHash);

	/* shard -> placement mapping */
	hash_delete_all(ConnectionShardHash);
}
/*
* ErrorIfPostCommitFailedShardPlacements throws an error if any of the placements
* that modified the database and involved in the transaction has failed.
*
* Note that Citus already fails queries/commands in case of any failures during query
* processing. However, there are certain failures that can only be detected on the
* COMMIT time. And, this check mainly ensures to catch errors that happens on the
* COMMIT time on the placements.
*
* The most common example for this case is the deferred errors that are thrown by
* triggers or constraints at the COMMIT time.
*/
void
ErrorIfPostCommitFailedShardPlacements(void)
{
HASH_SEQ_STATUS status;
ConnectionShardHashEntry *shardEntry = NULL;
hash_seq_init(&status, ConnectionShardHash);
while ((shardEntry = (ConnectionShardHashEntry *) hash_seq_search(&status)) != 0)
{
if (HasModificationFailedForShard(shardEntry))
{
ereport(ERROR,
(errmsg("could not commit transaction for shard " INT64_FORMAT
" on at least one active node", shardEntry->key.shardId)));
}
}
}
/*
 * HasModificationFailedForShard is a helper function for
 * ErrorIfPostCommitFailedShardPlacements that performs the per-shard work.
 *
 * The function returns true if at least one placement of the input shard was
 * modified (DDL or DML) and its connection either does not exist anymore or
 * has a failed remote transaction.
 */
static bool
HasModificationFailedForShard(ConnectionShardHashEntry *shardEntry)
{
	dlist_iter entryIter;

	dlist_foreach(entryIter, &shardEntry->placementConnections)
	{
		ConnectionPlacementHashEntry *entry =
			dlist_container(ConnectionPlacementHashEntry, shardNode, entryIter.cur);
		ConnectionReference *reference = entry->primaryConnection;

		/* placements that were not modified are of no interest */
		bool placementModified =
			reference != NULL && (reference->hadDDL || reference->hadDML);

		if (placementModified)
		{
			MultiConnection *connection = reference->connection;

			if (connection == NULL || connection->remoteTransaction.transactionFailed)
			{
				return true;
			}
		}
	}

	return false;
}
/*
 * InitPlacementConnectionManagement performs initialization of the
 * infrastructure in this file at server start: it creates the three hash
 * tables used for tracking, all of them in ConnectionContext.
 */
void
InitPlacementConnectionManagement(void)
{
	/* (placementId) -> ConnectionPlacementHashEntry */
	HASHCTL placementHashInfo;
	memset(&placementHashInfo, 0, sizeof(placementHashInfo));
	placementHashInfo.keysize = sizeof(ConnectionPlacementHashKey);
	placementHashInfo.entrysize = sizeof(ConnectionPlacementHashEntry);
	placementHashInfo.hash = tag_hash;
	placementHashInfo.hcxt = ConnectionContext;

	ConnectionPlacementHash =
		hash_create("citus connection cache (placementid)", 64,
					&placementHashInfo,
					(HASH_ELEM | HASH_BLOBS | HASH_CONTEXT));

	/*
	 * (colocated placement identity) -> ColocatedPlacementsHashEntry
	 *
	 * This table brings its own hash and match functions.
	 */
	HASHCTL colocatedHashInfo;
	memset(&colocatedHashInfo, 0, sizeof(colocatedHashInfo));
	colocatedHashInfo.keysize = sizeof(ColocatedPlacementsHashKey);
	colocatedHashInfo.entrysize = sizeof(ColocatedPlacementsHashEntry);
	colocatedHashInfo.hash = ColocatedPlacementsHashHash;
	colocatedHashInfo.match = ColocatedPlacementsHashCompare;
	colocatedHashInfo.hcxt = ConnectionContext;

	ColocatedPlacementsHash =
		hash_create("citus connection cache (colocated placements)", 64,
					&colocatedHashInfo,
					(HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_COMPARE));

	/* (shardId) -> ConnectionShardHashEntry */
	HASHCTL shardHashInfo;
	memset(&shardHashInfo, 0, sizeof(shardHashInfo));
	shardHashInfo.keysize = sizeof(ConnectionShardHashKey);
	shardHashInfo.entrysize = sizeof(ConnectionShardHashEntry);
	shardHashInfo.hash = tag_hash;
	shardHashInfo.hcxt = ConnectionContext;

	ConnectionShardHash =
		hash_create("citus connection cache (shardid)", 64,
					&shardHashInfo,
					(HASH_ELEM | HASH_BLOBS | HASH_CONTEXT));
}
/*
 * UseConnectionPerPlacement returns whether we should use a separate connection
 * per placement even if another connection is idle. We mostly use this in testing
 * scenarios.
 *
 * This is the case when ForceMaxQueryParallelization is enabled and the
 * multi-shard connection type is not SEQUENTIAL_CONNECTION.
 */
bool
UseConnectionPerPlacement(void)
{
	if (!ForceMaxQueryParallelization)
	{
		return false;
	}

	return MultiShardConnectionType != SEQUENTIAL_CONNECTION;
}
/*
 * ColocatedPlacementsHashHash is the hash function of ColocatedPlacementsHash.
 * It combines the hashes of the three fields of the ColocatedPlacementsHashKey
 * that key points to; keysize is not used.
 */
static uint32
ColocatedPlacementsHashHash(const void *key, Size keysize)
{
	const ColocatedPlacementsHashKey *placementsKey = key;

	uint32 nodeHash = hash_uint32(placementsKey->nodeId);
	uint32 groupHash = hash_uint32(placementsKey->colocationGroupId);
	uint32 valueHash = hash_uint32(placementsKey->representativeValue);

	return hash_combine(hash_combine(nodeHash, groupHash), valueHash);
}
/*
 * ColocatedPlacementsHashCompare is the match function of
 * ColocatedPlacementsHash. It returns 0 if the two ColocatedPlacementsHashKey
 * structs that a and b point to agree in all three fields, and 1 otherwise;
 * keysize is not used.
 */
static int
ColocatedPlacementsHashCompare(const void *a, const void *b, Size keysize)
{
	const ColocatedPlacementsHashKey *leftKey = a;
	const ColocatedPlacementsHashKey *rightKey = b;

	bool keysMatch =
		leftKey->nodeId == rightKey->nodeId &&
		leftKey->colocationGroupId == rightKey->colocationGroupId &&
		leftKey->representativeValue == rightKey->representativeValue;

	return keysMatch ? 0 : 1;
}