citus/src/backend/distributed/connection/placement_connection.c

1176 lines
36 KiB
C

/*-------------------------------------------------------------------------
*
* placement_connection.c
* Per placement connection handling.
*
* Copyright (c) 2016-2017, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/hash.h"
#include "distributed/colocation_utils.h"
#include "distributed/connection_management.h"
#include "distributed/hash_helpers.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/distributed_planner.h"
#include "distributed/placement_connection.h"
#include "distributed/relation_access_tracking.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
/*
* A connection reference is used to register that a connection has been used
* to read or modify either a) a shard placement as a particular user b) a
* group of colocated placements (which depend on whether the reference is
* from ConnectionPlacementHashEntry or ColocatedPlacementHashEntry).
*/
typedef struct ConnectionReference
{
/*
* The user used to read/modify the placement. We cannot reuse connections
* that were performed using a different role, since it would not have the
* right permissions.
*/
const char *userName;
/* the connection */
MultiConnection *connection;
/*
* Information about what the connection is used for. There can only be
* one connection executing DDL/DML for a placement to avoid deadlock
* issues/read-your-own-writes violations. The difference between DDL/DML
* currently is only used to emit more precise error messages.
*/
bool hadDML;
bool hadDDL;
/* colocation group of the placement, if any */
uint32 colocationGroupId;
uint32 representativeValue;
/* placementId of the placement, used only for append distributed tables */
uint64 placementId;
/* membership in MultiConnection->referencedPlacements */
dlist_node connectionNode;
} ConnectionReference;
struct ColocatedPlacementsHashEntry;
/*
* Hash table mapping placements to a list of connections.
*
* This stores a list of connections for each placement, because multiple
* connections to the same placement may exist at the same time. E.g. a
* real-time executor query may reference the same placement in several
* sub-tasks.
*
* We keep track about a connection having executed DML or DDL, since we can
* only ever allow a single transaction to do either to prevent deadlocks and
* consistency violations (e.g. read-your-own-writes).
*/
/* hash key */
typedef struct ConnectionPlacementHashKey
{
uint64 placementId;
} ConnectionPlacementHashKey;
/* hash entry */
typedef struct ConnectionPlacementHashEntry
{
ConnectionPlacementHashKey key;
/* did any remote transactions fail? */
bool failed;
/* primary connection used to access the placement */
ConnectionReference *primaryConnection;
/* are any other connections reading from the placements? */
bool hasSecondaryConnections;
/* entry for the set of co-located placements */
struct ColocatedPlacementsHashEntry *colocatedEntry;
/* membership in ConnectionShardHashEntry->placementConnections */
dlist_node shardNode;
} ConnectionPlacementHashEntry;
/* hash table */
static HTAB *ConnectionPlacementHash;
/*
* A hash-table mapping colocated placements to connections. Colocated
* placements being the set of placements on a single node that represent the
* same value range. This is needed because connections for colocated
* placements (i.e. the corresponding placements for different colocated
* distributed tables) need to share connections. Otherwise things like
* foreign keys can very easily lead to unprincipled deadlocks. This means
* that there can only one DML/DDL connection for a set of colocated
* placements.
*
* A set of colocated placements is identified, besides node identifying
* information, by the associated colocation group id and the placement's
* 'representativeValue' which currently is the lower boundary of it's
* hash-range.
*
* Note that this hash-table only contains entries for hash-partitioned
* tables, because others so far don't support colocation.
*/
/* hash key */
typedef struct ColocatedPlacementsHashKey
{
/* to identify host - database can't differ */
char nodeName[MAX_NODE_LENGTH];
uint32 nodePort;
/* colocation group, or invalid */
uint32 colocationGroupId;
/* to represent the value range */
uint32 representativeValue;
} ColocatedPlacementsHashKey;
/* hash entry */
typedef struct ColocatedPlacementsHashEntry
{
ColocatedPlacementsHashKey key;
/* primary connection used to access the co-located placements */
ConnectionReference *primaryConnection;
/* are any other connections reading from the placements? */
bool hasSecondaryConnections;
} ColocatedPlacementsHashEntry;
static HTAB *ColocatedPlacementsHash;
/*
* Hash table mapping shard ids to placements.
*
* This is used to track whether placements of a shard have to be marked
* invalid after a failure, or whether a coordinated transaction has to be
* aborted, to avoid all placements of a shard to be marked invalid.
*/
/* hash key */
typedef struct ConnectionShardHashKey
{
uint64 shardId;
} ConnectionShardHashKey;
/* hash entry */
typedef struct ConnectionShardHashEntry
{
ConnectionShardHashKey key;
dlist_head placementConnections;
} ConnectionShardHashEntry;
/* hash table */
static HTAB *ConnectionShardHash;
static MultiConnection * FindPlacementListConnection(int flags, List *placementAccessList,
const char *userName,
List **placementEntryList);
static ConnectionPlacementHashEntry * FindOrCreatePlacementEntry(
ShardPlacement *placement);
static bool CanUseExistingConnection(uint32 flags, const char *userName,
ConnectionReference *placementConnection);
static bool ConnectionAccessedDifferentPlacement(MultiConnection *connection,
ShardPlacement *placement);
static void AssociatePlacementWithShard(ConnectionPlacementHashEntry *placementEntry,
ShardPlacement *placement);
static bool CheckShardPlacements(ConnectionShardHashEntry *shardEntry);
static uint32 ColocatedPlacementsHashHash(const void *key, Size keysize);
static int ColocatedPlacementsHashCompare(const void *a, const void *b, Size keysize);
/*
* GetPlacementConnection establishes a connection for a placement.
*
* See StartPlacementConnection for details.
*/
MultiConnection *
GetPlacementConnection(uint32 flags, ShardPlacement *placement, const char *userName)
{
MultiConnection *connection = StartPlacementConnection(flags, placement, userName);
FinishConnectionEstablishment(connection);
return connection;
}
/*
* StartPlacementConnection initiates a connection to a remote node,
* associated with the placement and transaction.
*
* The connection is established for the current database. If userName is NULL
* the current user is used, otherwise the provided one.
*
* See StartNodeUserDatabaseConnection for details.
*
* Flags have the corresponding meaning from StartNodeUserDatabaseConnection,
* except that two additional flags have an effect:
* - FOR_DML - signal that connection is going to be used for DML (modifications)
* - FOR_DDL - signal that connection is going to be used for DDL
*
* Only one connection associated with the placement may have FOR_DML or
* FOR_DDL set. For hash-partitioned tables only one connection for a set of
* colocated placements may have FOR_DML/DDL set. This restriction prevents
* deadlocks and wrong results due to in-progress transactions.
*/
MultiConnection *
StartPlacementConnection(uint32 flags, ShardPlacement *placement, const char *userName)
{
ShardPlacementAccess *placementAccess =
(ShardPlacementAccess *) palloc0(sizeof(ShardPlacementAccess));
placementAccess->placement = placement;
if (flags & FOR_DDL)
{
placementAccess->accessType = PLACEMENT_ACCESS_DDL;
}
else if (flags & FOR_DML)
{
placementAccess->accessType = PLACEMENT_ACCESS_DML;
}
else
{
placementAccess->accessType = PLACEMENT_ACCESS_SELECT;
}
return StartPlacementListConnection(flags, list_make1(placementAccess), userName);
}
/*
* GetPlacementListConnection establishes a connection for a set of placement
* accesses.
*
* See StartPlacementListConnection for details.
*/
MultiConnection *
GetPlacementListConnection(uint32 flags, List *placementAccessList, const char *userName)
{
MultiConnection *connection = StartPlacementListConnection(flags, placementAccessList,
userName);
FinishConnectionEstablishment(connection);
return connection;
}
/*
* StartPlacementListConnection returns a connection to a remote node suitable for
* a placement accesses (SELECT, DML, DDL) or throws an error if no suitable
* connection can be established if would cause a self-deadlock or consistency
* violation.
*/
MultiConnection *
StartPlacementListConnection(uint32 flags, List *placementAccessList,
const char *userName)
{
char *freeUserName = NULL;
ListCell *placementAccessCell = NULL;
List *placementEntryList = NIL;
ListCell *placementEntryCell = NULL;
MultiConnection *chosenConnection = NULL;
if (userName == NULL)
{
userName = freeUserName = CurrentUserName();
}
chosenConnection = FindPlacementListConnection(flags, placementAccessList, userName,
&placementEntryList);
if (chosenConnection == NULL)
{
/* use the first placement from the list to extract nodename and nodeport */
ShardPlacementAccess *placementAccess =
(ShardPlacementAccess *) linitial(placementAccessList);
ShardPlacement *placement = placementAccess->placement;
char *nodeName = placement->nodeName;
int nodePort = placement->nodePort;
/*
* No suitable connection in the placement->connection mapping, get one from
* the node->connection pool.
*/
chosenConnection = StartNodeConnection(flags, nodeName, nodePort);
if (flags & CONNECTION_PER_PLACEMENT &&
ConnectionAccessedDifferentPlacement(chosenConnection, placement))
{
/*
* Cached connection accessed a non-co-located placement in the same
* table or co-location group, while the caller asked for a connection
* per placement. Open a new connection instead.
*
* We use this for situations in which we want to use a different
* connection for every placement, such as COPY. If we blindly returned
* a cached conection that already modified a different, non-co-located
* placement B in the same table or in a table with the same co-location
* ID as the current placement, then we'd no longer able to write to
* placement B later in the COPY.
*/
chosenConnection = StartNodeConnection(flags | FORCE_NEW_CONNECTION, nodeName,
nodePort);
Assert(!ConnectionAccessedDifferentPlacement(chosenConnection, placement));
}
}
/*
* Now that a connection has been chosen, initialise or update the connection
* references for all placements.
*/
forboth(placementAccessCell, placementAccessList,
placementEntryCell, placementEntryList)
{
ShardPlacementAccess *placementAccess =
(ShardPlacementAccess *) lfirst(placementAccessCell);
ShardPlacementAccessType accessType = placementAccess->accessType;
ConnectionPlacementHashEntry *placementEntry =
(ConnectionPlacementHashEntry *) lfirst(placementEntryCell);
ConnectionReference *placementConnection = placementEntry->primaryConnection;
if (placementConnection->connection == chosenConnection)
{
/* using the connection that was already assigned to the placement */
}
else if (placementConnection->connection == NULL)
{
/* placement does not have a connection assigned yet */
placementConnection->connection = chosenConnection;
placementConnection->hadDDL = false;
placementConnection->hadDML = false;
placementConnection->userName = MemoryContextStrdup(TopTransactionContext,
userName);
placementConnection->placementId = placementAccess->placement->placementId;
/* record association with connection */
dlist_push_tail(&chosenConnection->referencedPlacements,
&placementConnection->connectionNode);
}
else
{
/* using a different connection than the one assigned to the placement */
if (accessType != PLACEMENT_ACCESS_SELECT)
{
/*
* We previously read from the placement, but now we're writing to
* it (if we had written to the placement, we would have either chosen
* the same connection, or errored out). Update the connection reference
* to point to the connection used for writing. We don't need to remember
* the existing connection since we won't be able to reuse it for
* accessing the placement. However, we do register that it exists in
* hasSecondaryConnections.
*/
placementConnection->connection = chosenConnection;
placementConnection->userName = MemoryContextStrdup(TopTransactionContext,
userName);
Assert(!placementConnection->hadDDL);
Assert(!placementConnection->hadDML);
/* record association with connection */
dlist_push_tail(&chosenConnection->referencedPlacements,
&placementConnection->connectionNode);
}
/*
* There are now multiple connections that read from the placement
* and DDL commands are forbidden.
*/
placementEntry->hasSecondaryConnections = true;
if (placementEntry->colocatedEntry != NULL)
{
/* we also remember this for co-located placements */
placementEntry->colocatedEntry->hasSecondaryConnections = true;
}
}
/*
* Remember that we used the current connection for writes.
*/
if (accessType == PLACEMENT_ACCESS_DDL)
{
placementConnection->hadDDL = true;
}
if (accessType == PLACEMENT_ACCESS_DML)
{
placementConnection->hadDML = true;
}
}
if (freeUserName)
{
pfree(freeUserName);
}
return chosenConnection;
}
/*
* FindPlacementListConnection determines whether there is a connection that must
* be used to perform the given placement accesses.
*
* If a placement was only read in this transaction, then the same connection must
* be used for DDL to prevent self-deadlock. If a placement was modified in this
* transaction, then the same connection must be used for all subsequent accesses
* to ensure read-your-writes consistency and prevent self-deadlock. If those
* conditions cannot be met, because a connection is in use or the placements in
* the placement access list were modified over multiple connections, then this
* function throws an error.
*
* The function returns the connection that needs to be used, if such a connection
* exists, and the current placement entries for all placements in the placement
* access list.
*/
static MultiConnection *
FindPlacementListConnection(int flags, List *placementAccessList, const char *userName,
List **placementEntryList)
{
bool foundModifyingConnection = false;
ListCell *placementAccessCell = NULL;
MultiConnection *chosenConnection = NULL;
/*
* Go through all placement accesses to find a suitable connection.
*
* If none of the placements have been accessed in this transaction, connection
* remains NULL.
*
* If one or more of the placements have been modified in this transaction, then
* use the connection that performed the write. If placements have been written
* over multiple connections or the connection is not available, error out.
*
* If placements have only been read in this transaction, then use the last
* suitable connection found for a placement in the placementAccessList.
*/
foreach(placementAccessCell, placementAccessList)
{
ShardPlacementAccess *placementAccess =
(ShardPlacementAccess *) lfirst(placementAccessCell);
ShardPlacement *placement = placementAccess->placement;
ShardPlacementAccessType accessType = placementAccess->accessType;
ConnectionPlacementHashEntry *placementEntry = NULL;
ColocatedPlacementsHashEntry *colocatedEntry = NULL;
ConnectionReference *placementConnection = NULL;
if (placement->shardId == INVALID_SHARD_ID)
{
/*
* When a SELECT prunes down to 0 shard, we use a dummy placement.
* In that case, we can fall back to the default connection.
*
* FIXME: this can be removed if we evaluate empty SELECTs locally.
*/
continue;
}
placementEntry = FindOrCreatePlacementEntry(placement);
colocatedEntry = placementEntry->colocatedEntry;
placementConnection = placementEntry->primaryConnection;
/* note: the Asserts below are primarily for clarifying the conditions */
if (placementConnection->connection == NULL)
{
/* no connection has been chosen for the placement */
}
else if (accessType == PLACEMENT_ACCESS_DDL &&
placementEntry->hasSecondaryConnections)
{
/*
* If a placement has been read over multiple connections (typically as
* a result of a reference table join) then a DDL command on the placement
* would create a self-deadlock.
*/
Assert(placementConnection != NULL);
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform DDL on placement " UINT64_FORMAT
", which has been read over multiple connections",
placement->placementId)));
}
else if (accessType == PLACEMENT_ACCESS_DDL && colocatedEntry != NULL &&
colocatedEntry->hasSecondaryConnections)
{
/*
* If a placement has been read over multiple (uncommitted) connections
* then a DDL command on a co-located placement may create a self-deadlock
* if there exist some relationship between the co-located placements
* (e.g. foreign key, partitioning).
*/
Assert(placementConnection != NULL);
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform DDL on placement " UINT64_FORMAT
" since a co-located placement has been read over multiple connections",
placement->placementId)));
}
else if (foundModifyingConnection)
{
/*
* We already found a connection that performed writes on of the placements
* and must use it.
*/
if ((placementConnection->hadDDL || placementConnection->hadDML) &&
placementConnection->connection != chosenConnection)
{
/*
* The current placement may have been modified over a different
* connection. Neither connection is guaranteed to see all uncomitted
* writes and therefore we cannot proceed.
*/
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform query with placements that were "
"modified over multiple connections")));
}
}
else if (CanUseExistingConnection(flags, userName, placementConnection))
{
/*
* There is an existing connection for the placement and we can use it.
*/
Assert(placementConnection != NULL);
chosenConnection = placementConnection->connection;
if (placementConnection->hadDDL || placementConnection->hadDML)
{
/* this connection performed writes, we must use it */
foundModifyingConnection = true;
}
}
else if (placementConnection->hadDDL)
{
/*
* There is an existing connection, but we cannot use it and it executed
* DDL. Any subsequent operation needs to be able to see the results of
* the DDL command and thus cannot proceed if it cannot use the connection.
*/
Assert(placementConnection != NULL);
Assert(!CanUseExistingConnection(flags, userName, placementConnection));
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot establish a new connection for "
"placement " UINT64_FORMAT
", since DDL has been executed on a connection that is in use",
placement->placementId)));
}
else if (placementConnection->hadDML)
{
/*
* There is an existing connection, but we cannot use it and it executed
* DML. Any subsequent operation needs to be able to see the results of
* the DML command and thus cannot proceed if it cannot use the connection.
*
* Note that this is not meaningfully different from the previous case. We
* just produce a different error message based on whether DDL was or only
* DML was executed.
*/
Assert(placementConnection != NULL);
Assert(!CanUseExistingConnection(flags, userName, placementConnection));
Assert(!placementConnection->hadDDL);
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot establish a new connection for "
"placement " UINT64_FORMAT
", since DML has been executed on a connection that is in use",
placement->placementId)));
}
else if (accessType == PLACEMENT_ACCESS_DDL)
{
/*
* There is an existing connection, but we cannot use it and we want to
* execute DDL. The operation on the existing connection might conflict
* with the DDL statement.
*/
Assert(placementConnection != NULL);
Assert(!CanUseExistingConnection(flags, userName, placementConnection));
Assert(!placementConnection->hadDDL);
Assert(!placementConnection->hadDML);
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("cannot perform a parallel DDL command because multiple "
"placements have been accessed over the same connection")));
}
else
{
/*
* The placement has a connection assigned to it, but it cannot be used,
* most likely because it has been claimed exclusively. Fortunately, it
* has only been used for reads and we're not performing a DDL command.
* We can therefore use a different connection for this placement.
*/
Assert(placementConnection != NULL);
Assert(!CanUseExistingConnection(flags, userName, placementConnection));
Assert(!placementConnection->hadDDL);
Assert(!placementConnection->hadDML);
Assert(accessType != PLACEMENT_ACCESS_DDL);
}
*placementEntryList = lappend(*placementEntryList, placementEntry);
/* record the relation access mapping */
AssociatePlacementAccessWithRelation(placement, accessType);
}
return chosenConnection;
}
/*
* FindOrCreatePlacementEntry finds a placement entry in either the
* placement->connection hash or the co-located placements->connection hash,
* or adds a new entry if the placement has not yet been accessed in the
* current transaction.
*/
static ConnectionPlacementHashEntry *
FindOrCreatePlacementEntry(ShardPlacement *placement)
{
ConnectionPlacementHashKey key;
ConnectionPlacementHashEntry *placementEntry = NULL;
bool found = false;
key.placementId = placement->placementId;
placementEntry = hash_search(ConnectionPlacementHash, &key, HASH_ENTER, &found);
if (!found)
{
/* no connection has been chosen for this placement */
placementEntry->failed = false;
placementEntry->primaryConnection = NULL;
placementEntry->hasSecondaryConnections = false;
placementEntry->colocatedEntry = NULL;
if (placement->partitionMethod == DISTRIBUTE_BY_HASH ||
placement->partitionMethod == DISTRIBUTE_BY_NONE)
{
ColocatedPlacementsHashKey key;
ColocatedPlacementsHashEntry *colocatedEntry = NULL;
strcpy(key.nodeName, placement->nodeName);
key.nodePort = placement->nodePort;
key.colocationGroupId = placement->colocationGroupId;
key.representativeValue = placement->representativeValue;
/* look for a connection assigned to co-located placements */
colocatedEntry = hash_search(ColocatedPlacementsHash, &key, HASH_ENTER,
&found);
if (!found)
{
void *conRef = MemoryContextAllocZero(TopTransactionContext,
sizeof(ConnectionReference));
ConnectionReference *connectionReference = (ConnectionReference *) conRef;
/*
* Store the co-location group information such that we can later
* determine whether a connection accessed different placements
* of the same co-location group.
*/
connectionReference->colocationGroupId = placement->colocationGroupId;
connectionReference->representativeValue = placement->representativeValue;
/*
* Create a connection reference that can be used for the entire
* set of co-located placements.
*/
colocatedEntry->primaryConnection = connectionReference;
colocatedEntry->hasSecondaryConnections = false;
}
/*
* Assign the connection reference for the set of co-located placements
* to the current placement.
*/
placementEntry->primaryConnection = colocatedEntry->primaryConnection;
placementEntry->colocatedEntry = colocatedEntry;
}
else
{
void *conRef = MemoryContextAllocZero(TopTransactionContext,
sizeof(ConnectionReference));
placementEntry->primaryConnection = (ConnectionReference *) conRef;
}
}
/* record association with shard, for invalidation */
AssociatePlacementWithShard(placementEntry, placement);
return placementEntry;
}
/*
* CanUseExistingConnection is a helper function for CheckExistingConnections()
* that checks whether an existing connection can be reused.
*/
static bool
CanUseExistingConnection(uint32 flags, const char *userName,
ConnectionReference *connectionReference)
{
MultiConnection *connection = connectionReference->connection;
if (!connection)
{
/* if already closed connection obviously not usable */
return false;
}
else if (connection->claimedExclusively)
{
/* already used */
return false;
}
else if (flags & FORCE_NEW_CONNECTION)
{
/* no connection reuse desired */
return false;
}
else if (strcmp(connectionReference->userName, userName) != 0)
{
/* connection for different user, check for conflict */
return false;
}
else
{
return true;
}
}
/*
* ConnectionAccessedDifferentPlacement returns true if the connection accessed another
* placement in the same colocation group with a different representative value,
* meaning it's not strictly colocated.
*/
static bool
ConnectionAccessedDifferentPlacement(MultiConnection *connection,
ShardPlacement *placement)
{
dlist_iter placementIter;
dlist_foreach(placementIter, &connection->referencedPlacements)
{
ConnectionReference *connectionReference =
dlist_container(ConnectionReference, connectionNode, placementIter.cur);
/* handle append and range distributed tables */
if (placement->partitionMethod != DISTRIBUTE_BY_HASH &&
placement->placementId != connectionReference->placementId)
{
return true;
}
/* handle hash distributed tables */
if (placement->colocationGroupId != INVALID_COLOCATION_ID &&
placement->colocationGroupId == connectionReference->colocationGroupId &&
placement->representativeValue != connectionReference->representativeValue)
{
/* non-co-located placements from the same co-location group */
return true;
}
}
return false;
}
/*
* ConnectionUsedForAnyPlacements returns true if the connection
* has not been associated with any placement.
*/
bool
ConnectionUsedForAnyPlacements(MultiConnection *connection)
{
return !dlist_is_empty(&connection->referencedPlacements);
}
/*
* AssociatePlacementWithShard records shard->placement relation in
* ConnectionShardHash.
*
* That association is later used, in CheckForFailedPlacements, to invalidate
* shard placements if necessary.
*/
static void
AssociatePlacementWithShard(ConnectionPlacementHashEntry *placementEntry,
ShardPlacement *placement)
{
ConnectionShardHashKey shardKey;
ConnectionShardHashEntry *shardEntry = NULL;
bool found = false;
dlist_iter placementIter;
shardKey.shardId = placement->shardId;
shardEntry = hash_search(ConnectionShardHash, &shardKey, HASH_ENTER, &found);
if (!found)
{
dlist_init(&shardEntry->placementConnections);
}
/*
* Check if placement is already associated with shard (happens if there's
* multiple connections for a placement). There'll usually only be few
* placement per shard, so the price of iterating isn't large.
*/
dlist_foreach(placementIter, &shardEntry->placementConnections)
{
ConnectionPlacementHashEntry *placementEntry =
dlist_container(ConnectionPlacementHashEntry, shardNode, placementIter.cur);
if (placementEntry->key.placementId == placement->placementId)
{
return;
}
}
/* otherwise add */
dlist_push_tail(&shardEntry->placementConnections, &placementEntry->shardNode);
}
/*
* CloseShardPlacementAssociation handles a connection being closed before
* transaction end.
*
* This should only be called by connection_management.c.
*/
void
CloseShardPlacementAssociation(struct MultiConnection *connection)
{
dlist_iter placementIter;
/* set connection to NULL for all references to the connection */
dlist_foreach(placementIter, &connection->referencedPlacements)
{
ConnectionReference *reference =
dlist_container(ConnectionReference, connectionNode, placementIter.cur);
reference->connection = NULL;
/*
* Note that we don't reset ConnectionPlacementHashEntry's
* primaryConnection here, that'd more complicated than it seems
* worth. That means we'll error out spuriously if a DML/DDL
* executing connection is closed earlier in a transaction.
*/
}
}
/*
* ResetShardPlacementAssociation resets the association of connections to
* shard placements at the end of a transaction.
*
* This should only be called by connection_management.c.
*/
void
ResetShardPlacementAssociation(struct MultiConnection *connection)
{
dlist_init(&connection->referencedPlacements);
}
/*
* ResetPlacementConnectionManagement() disassociates connections from
* placements and shards. This will be called at the end of XACT_EVENT_COMMIT
* and XACT_EVENT_ABORT.
*/
void
ResetPlacementConnectionManagement(void)
{
/* Simply delete all entries */
hash_delete_all(ConnectionPlacementHash);
hash_delete_all(ConnectionShardHash);
hash_delete_all(ColocatedPlacementsHash);
ResetRelationAccessHash();
/*
* NB: memory for ConnectionReference structs and subordinate data is
* deleted by virtue of being allocated in TopTransactionContext.
*/
}
/*
* MarkFailedShardPlacements looks through every connection in the connection shard hash
* and marks the placements associated with failed connections invalid.
*
* Every shard must have at least one placement connection which did not fail. If all
* modifying connections for a shard failed then the transaction will be aborted.
*
* This will be called just before commit, so we can abort before executing remote
* commits. It should also be called after modification statements, to ensure that we
* don't run future statements against placements which are not up to date.
*/
void
MarkFailedShardPlacements()
{
HASH_SEQ_STATUS status;
ConnectionShardHashEntry *shardEntry = NULL;
hash_seq_init(&status, ConnectionShardHash);
while ((shardEntry = (ConnectionShardHashEntry *) hash_seq_search(&status)) != 0)
{
if (!CheckShardPlacements(shardEntry))
{
ereport(ERROR,
(errmsg("could not make changes to shard " INT64_FORMAT
" on any node",
shardEntry->key.shardId)));
}
}
}
/*
* PostCommitMarkFailedShardPlacements marks placements invalid and checks whether
* sufficiently many placements have failed to abort the entire coordinated
* transaction.
*
* This will be called just after a coordinated commit so we can handle remote
* transactions which failed during commit.
*
* When using2PC is set as least one placement must succeed per shard. If all placements
* fail for a shard the entire transaction is aborted. If using2PC is not set then a only
* a warning will be emitted; we cannot abort because some remote transactions might have
* already been committed.
*/
void
PostCommitMarkFailedShardPlacements(bool using2PC)
{
HASH_SEQ_STATUS status;
ConnectionShardHashEntry *shardEntry = NULL;
int successes = 0;
int attempts = 0;
int elevel = using2PC ? ERROR : WARNING;
hash_seq_init(&status, ConnectionShardHash);
while ((shardEntry = (ConnectionShardHashEntry *) hash_seq_search(&status)) != 0)
{
attempts++;
if (CheckShardPlacements(shardEntry))
{
successes++;
}
else
{
/*
* Only error out if we're using 2PC. If we're not using 2PC we can't error
* out otherwise we can end up with a state where some shard modifications
* have already committed successfully.
*/
ereport(elevel,
(errmsg("could not commit transaction for shard " INT64_FORMAT
" on any active node",
shardEntry->key.shardId)));
}
}
/*
* If no shards could be modified at all, error out. Doesn't matter whether
* we're post-commit - there's nothing to invalidate.
*/
if (attempts > 0 && successes == 0)
{
ereport(ERROR, (errmsg("could not commit transaction on any active node")));
}
}
/*
* CheckShardPlacements is a helper function for CheckForFailedPlacements that
* performs the per-shard work.
*/
static bool
CheckShardPlacements(ConnectionShardHashEntry *shardEntry)
{
int failures = 0;
int successes = 0;
dlist_iter placementIter;
dlist_foreach(placementIter, &shardEntry->placementConnections)
{
ConnectionPlacementHashEntry *placementEntry =
dlist_container(ConnectionPlacementHashEntry, shardNode, placementIter.cur);
ConnectionReference *primaryConnection = placementEntry->primaryConnection;
MultiConnection *connection = NULL;
/* we only consider shards that are modified */
if (primaryConnection == NULL ||
!(primaryConnection->hadDDL || primaryConnection->hadDML))
{
continue;
}
connection = primaryConnection->connection;
if (!connection || connection->remoteTransaction.transactionFailed)
{
placementEntry->failed = true;
failures++;
}
else
{
successes++;
}
}
if (failures > 0 && successes == 0)
{
return false;
}
/* mark all failed placements invalid */
dlist_foreach(placementIter, &shardEntry->placementConnections)
{
ConnectionPlacementHashEntry *placementEntry =
dlist_container(ConnectionPlacementHashEntry, shardNode, placementIter.cur);
if (placementEntry->failed)
{
uint64 shardId = shardEntry->key.shardId;
uint64 placementId = placementEntry->key.placementId;
GroupShardPlacement *shardPlacement =
LoadGroupShardPlacement(shardId, placementId);
/*
* We only set shard state if its current state is FILE_FINALIZED, which
* prevents overwriting shard state if it is already set at somewhere else.
*/
if (shardPlacement->shardState == FILE_FINALIZED)
{
UpdateShardPlacementState(placementEntry->key.placementId, FILE_INACTIVE);
}
}
}
return true;
}
/*
* InitPlacementConnectionManagement performs initialization of the
* infrastructure in this file at server start.
*/
void
InitPlacementConnectionManagement(void)
{
HASHCTL info;
uint32 hashFlags = 0;
/* create (placementId) -> [ConnectionReference] hash */
memset(&info, 0, sizeof(info));
info.keysize = sizeof(ConnectionPlacementHashKey);
info.entrysize = sizeof(ConnectionPlacementHashEntry);
info.hash = tag_hash;
info.hcxt = ConnectionContext;
hashFlags = (HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
ConnectionPlacementHash = hash_create("citus connection cache (placementid)",
64, &info, hashFlags);
/* create (colocated placement identity) -> [ConnectionReference] hash */
memset(&info, 0, sizeof(info));
info.keysize = sizeof(ColocatedPlacementsHashKey);
info.entrysize = sizeof(ColocatedPlacementsHashEntry);
info.hash = ColocatedPlacementsHashHash;
info.match = ColocatedPlacementsHashCompare;
info.hcxt = ConnectionContext;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_COMPARE);
ColocatedPlacementsHash = hash_create("citus connection cache (colocated placements)",
64, &info, hashFlags);
/* create (shardId) -> [ConnectionShardHashEntry] hash */
memset(&info, 0, sizeof(info));
info.keysize = sizeof(ConnectionShardHashKey);
info.entrysize = sizeof(ConnectionShardHashEntry);
info.hash = tag_hash;
info.hcxt = ConnectionContext;
hashFlags = (HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
ConnectionShardHash = hash_create("citus connection cache (shardid)",
64, &info, hashFlags);
/* (relationId) = [relationAccessMode] hash */
AllocateRelationAccessHash();
}
static uint32
ColocatedPlacementsHashHash(const void *key, Size keysize)
{
ColocatedPlacementsHashKey *entry = (ColocatedPlacementsHashKey *) key;
uint32 hash = 0;
hash = string_hash(entry->nodeName, NAMEDATALEN);
hash = hash_combine(hash, hash_uint32(entry->nodePort));
hash = hash_combine(hash, hash_uint32(entry->colocationGroupId));
hash = hash_combine(hash, hash_uint32(entry->representativeValue));
return hash;
}
static int
ColocatedPlacementsHashCompare(const void *a, const void *b, Size keysize)
{
ColocatedPlacementsHashKey *ca = (ColocatedPlacementsHashKey *) a;
ColocatedPlacementsHashKey *cb = (ColocatedPlacementsHashKey *) b;
if (strncmp(ca->nodeName, cb->nodeName, MAX_NODE_LENGTH) != 0 ||
ca->nodePort != cb->nodePort ||
ca->colocationGroupId != cb->colocationGroupId ||
ca->representativeValue != cb->representativeValue)
{
return 1;
}
else
{
return 0;
}
}