mirror of https://github.com/citusdata/citus.git
Merge pull request #864 from citusdata/migrate_worker_transactions
Add worker transaction and transaction recovery infrastructure (pull/843/head)
commit
0bce20dd74
|
@ -8,7 +8,7 @@ EXTENSION = citus
|
||||||
EXTVERSIONS = 5.0 5.0-1 5.0-2 \
|
EXTVERSIONS = 5.0 5.0-1 5.0-2 \
|
||||||
5.1-1 5.1-2 5.1-3 5.1-4 5.1-5 5.1-6 5.1-7 5.1-8 \
|
5.1-1 5.1-2 5.1-3 5.1-4 5.1-5 5.1-6 5.1-7 5.1-8 \
|
||||||
5.2-1 5.2-2 5.2-3 5.2-4 \
|
5.2-1 5.2-2 5.2-3 5.2-4 \
|
||||||
6.0-1 6.0-2 6.0-3 6.0-4 6.0-5 6.0-6 6.0-7 6.0-8 6.0-9
|
6.0-1 6.0-2 6.0-3 6.0-4 6.0-5 6.0-6 6.0-7 6.0-8 6.0-9 6.0-10
|
||||||
|
|
||||||
# All citus--*.sql files in the source directory
|
# All citus--*.sql files in the source directory
|
||||||
DATA = $(patsubst $(citus_abs_srcdir)/%.sql,%.sql,$(wildcard $(citus_abs_srcdir)/$(EXTENSION)--*--*.sql))
|
DATA = $(patsubst $(citus_abs_srcdir)/%.sql,%.sql,$(wildcard $(citus_abs_srcdir)/$(EXTENSION)--*--*.sql))
|
||||||
|
@ -76,6 +76,8 @@ $(EXTENSION)--6.0-8.sql: $(EXTENSION)--6.0-7.sql $(EXTENSION)--6.0-7--6.0-8.sql
|
||||||
cat $^ > $@
|
cat $^ > $@
|
||||||
$(EXTENSION)--6.0-9.sql: $(EXTENSION)--6.0-8.sql $(EXTENSION)--6.0-8--6.0-9.sql
|
$(EXTENSION)--6.0-9.sql: $(EXTENSION)--6.0-8.sql $(EXTENSION)--6.0-8--6.0-9.sql
|
||||||
cat $^ > $@
|
cat $^ > $@
|
||||||
|
$(EXTENSION)--6.0-10.sql: $(EXTENSION)--6.0-9.sql $(EXTENSION)--6.0-9--6.0-10.sql
|
||||||
|
cat $^ > $@
|
||||||
|
|
||||||
NO_PGXS = 1
|
NO_PGXS = 1
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Replace oid column in pg_dist_shard_placement with an sequence column.
|
* Replace oid column in pg_dist_shard_placement with a sequence column.
|
||||||
*/
|
*/
|
||||||
CREATE SEQUENCE citus.pg_dist_shard_placement_placementid_seq
|
CREATE SEQUENCE citus.pg_dist_shard_placement_placementid_seq
|
||||||
NO CYCLE;
|
NO CYCLE;
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
/* citus--6.0-8--6.0-9.sql */
|
||||||
|
|
||||||
CREATE TABLE citus.pg_dist_local_group(
|
CREATE TABLE citus.pg_dist_local_group(
|
||||||
groupid int NOT NULL PRIMARY KEY)
|
groupid int NOT NULL PRIMARY KEY)
|
||||||
;
|
;
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
/* citus--6.0-9--6.0-10.sql */

/*
 * pg_dist_transaction holds one row per prepared transaction (gid) that was
 * logged for a worker group (groupid). The transaction recovery code reads
 * these rows to decide whether a pending prepared transaction on a worker
 * is committed or rolled back.
 *
 * NOTE(review): the table is created in the citus schema and then moved,
 * presumably because it cannot be created in pg_catalog directly -- confirm.
 */
CREATE TABLE citus.pg_dist_transaction (
    groupid int NOT NULL,
    gid text NOT NULL
);

/* lets recovery look up all records of a single worker group */
CREATE INDEX pg_dist_transaction_group_index
    ON citus.pg_dist_transaction USING btree (groupid);

ALTER TABLE citus.pg_dist_transaction SET SCHEMA pg_catalog;

/* a given gid may be recorded at most once per worker group */
ALTER TABLE pg_catalog.pg_dist_transaction
    ADD CONSTRAINT pg_dist_transaction_unique_constraint UNIQUE (groupid, gid);

GRANT SELECT ON pg_catalog.pg_dist_transaction TO PUBLIC;

/* SQL entry point of the recovery routine; returns an int count */
CREATE FUNCTION recover_prepared_transactions()
    RETURNS int
    LANGUAGE C STRICT
    AS 'MODULE_PATHNAME', $$recover_prepared_transactions$$;

COMMENT ON FUNCTION recover_prepared_transactions()
    IS 'recover prepared transactions started by this node';
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# Citus extension
|
# Citus extension
|
||||||
comment = 'Citus distributed database'
|
comment = 'Citus distributed database'
|
||||||
default_version = '6.0-9'
|
default_version = '6.0-10'
|
||||||
module_pathname = '$libdir/citus'
|
module_pathname = '$libdir/citus'
|
||||||
relocatable = false
|
relocatable = false
|
||||||
schema = pg_catalog
|
schema = pg_catalog
|
||||||
|
|
|
@ -956,6 +956,8 @@ OpenCopyTransactions(CopyStmt *copyStatement, ShardConnections *shardConnections
|
||||||
ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
|
ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
|
||||||
char *nodeName = placement->nodeName;
|
char *nodeName = placement->nodeName;
|
||||||
int nodePort = placement->nodePort;
|
int nodePort = placement->nodePort;
|
||||||
|
WorkerNode *workerNode = FindWorkerNode(nodeName, nodePort);
|
||||||
|
int workerGroupId = 0;
|
||||||
char *nodeUser = CurrentUserName();
|
char *nodeUser = CurrentUserName();
|
||||||
PGconn *connection = ConnectToNode(nodeName, nodePort, nodeUser);
|
PGconn *connection = ConnectToNode(nodeName, nodePort, nodeUser);
|
||||||
|
|
||||||
|
@ -963,6 +965,17 @@ OpenCopyTransactions(CopyStmt *copyStatement, ShardConnections *shardConnections
|
||||||
StringInfo copyCommand = NULL;
|
StringInfo copyCommand = NULL;
|
||||||
PGresult *result = NULL;
|
PGresult *result = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When a copy is initiated from a worker, the information about the connected
|
||||||
|
* worker node may not be found if pg_dist_node entries are not synced to this
|
||||||
|
* node. In that case we leave the groupId as 0. Fortunately, it is unused since
|
||||||
|
* COPY from a worker does not initiate a 2PC.
|
||||||
|
*/
|
||||||
|
if (workerNode != NULL)
|
||||||
|
{
|
||||||
|
workerGroupId = workerNode->groupId;
|
||||||
|
}
|
||||||
|
|
||||||
if (connection == NULL)
|
if (connection == NULL)
|
||||||
{
|
{
|
||||||
if (stopOnFailure)
|
if (stopOnFailure)
|
||||||
|
@ -1003,9 +1016,12 @@ OpenCopyTransactions(CopyStmt *copyStatement, ShardConnections *shardConnections
|
||||||
|
|
||||||
transactionConnection = palloc0(sizeof(TransactionConnection));
|
transactionConnection = palloc0(sizeof(TransactionConnection));
|
||||||
|
|
||||||
|
transactionConnection->groupId = workerGroupId;
|
||||||
transactionConnection->connectionId = shardConnections->shardId;
|
transactionConnection->connectionId = shardConnections->shardId;
|
||||||
transactionConnection->transactionState = TRANSACTION_STATE_COPY_STARTED;
|
transactionConnection->transactionState = TRANSACTION_STATE_COPY_STARTED;
|
||||||
transactionConnection->connection = connection;
|
transactionConnection->connection = connection;
|
||||||
|
transactionConnection->nodeName = nodeName;
|
||||||
|
transactionConnection->nodePort = nodePort;
|
||||||
|
|
||||||
connectionList = lappend(connectionList, transactionConnection);
|
connectionList = lappend(connectionList, transactionConnection);
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,7 @@
|
||||||
#include "distributed/resource_lock.h"
|
#include "distributed/resource_lock.h"
|
||||||
#include "distributed/worker_manager.h"
|
#include "distributed/worker_manager.h"
|
||||||
#include "distributed/worker_protocol.h"
|
#include "distributed/worker_protocol.h"
|
||||||
|
#include "distributed/worker_transaction.h"
|
||||||
#include "lib/stringinfo.h"
|
#include "lib/stringinfo.h"
|
||||||
#include "nodes/pg_list.h"
|
#include "nodes/pg_list.h"
|
||||||
#include "storage/lock.h"
|
#include "storage/lock.h"
|
||||||
|
@ -53,9 +54,6 @@ static List * CopyShardCommandList(ShardInterval *shardInterval, char *sourceNod
|
||||||
int32 sourceNodePort);
|
int32 sourceNodePort);
|
||||||
static char * ConstructQualifiedShardName(ShardInterval *shardInterval);
|
static char * ConstructQualifiedShardName(ShardInterval *shardInterval);
|
||||||
static List * RecreateTableDDLCommandList(Oid relationId);
|
static List * RecreateTableDDLCommandList(Oid relationId);
|
||||||
static void SendCommandListInSingleTransaction(char *nodeName, int32 nodePort,
|
|
||||||
List *commandList);
|
|
||||||
static char * CitusExtensionOwnerName(void);
|
|
||||||
|
|
||||||
/* declarations for dynamic loading */
|
/* declarations for dynamic loading */
|
||||||
PG_FUNCTION_INFO_V1(master_copy_shard_placement);
|
PG_FUNCTION_INFO_V1(master_copy_shard_placement);
|
||||||
|
@ -283,6 +281,7 @@ CopyShardPlacement(int64 shardId, char *sourceNodeName, int32 sourceNodePort,
|
||||||
List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
|
List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
|
||||||
ListCell *colocatedShardCell = NULL;
|
ListCell *colocatedShardCell = NULL;
|
||||||
List *ddlCommandList = NIL;
|
List *ddlCommandList = NIL;
|
||||||
|
char *nodeUser = CitusExtensionOwnerName();
|
||||||
|
|
||||||
foreach(colocatedShardCell, colocatedShardList)
|
foreach(colocatedShardCell, colocatedShardList)
|
||||||
{
|
{
|
||||||
|
@ -297,7 +296,8 @@ CopyShardPlacement(int64 shardId, char *sourceNodeName, int32 sourceNodePort,
|
||||||
|
|
||||||
HOLD_INTERRUPTS();
|
HOLD_INTERRUPTS();
|
||||||
|
|
||||||
SendCommandListInSingleTransaction(targetNodeName, targetNodePort, ddlCommandList);
|
SendCommandListToWorkerInSingleTransaction(targetNodeName, targetNodePort, nodeUser,
|
||||||
|
ddlCommandList);
|
||||||
|
|
||||||
foreach(colocatedShardCell, colocatedShardList)
|
foreach(colocatedShardCell, colocatedShardList)
|
||||||
{
|
{
|
||||||
|
@ -443,87 +443,3 @@ RecreateTableDDLCommandList(Oid relationId)
|
||||||
|
|
||||||
return recreateCommandList;
|
return recreateCommandList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* SendCommandListInSingleTransaction opens connection to the node with the given
|
|
||||||
* nodeName and nodePort. Then, the connection starts a transaction on the remote
|
|
||||||
* node and executes the commands in the transaction. The function raises error if
|
|
||||||
* any of the queries fails.
|
|
||||||
*
|
|
||||||
* FIXME: Copied from Citus-MX, should be removed once those changes checked-in to Citus.
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
SendCommandListInSingleTransaction(char *nodeName, int32 nodePort, List *commandList)
|
|
||||||
{
|
|
||||||
char *nodeUser = CitusExtensionOwnerName();
|
|
||||||
PGconn *workerConnection = NULL;
|
|
||||||
PGresult *queryResult = NULL;
|
|
||||||
ListCell *commandCell = NULL;
|
|
||||||
|
|
||||||
workerConnection = ConnectToNode(nodeName, nodePort, nodeUser);
|
|
||||||
if (workerConnection == NULL)
|
|
||||||
{
|
|
||||||
ereport(ERROR, (errmsg("could not open connection to %s:%d as %s",
|
|
||||||
nodeName, nodePort, nodeUser)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* start the transaction on the worker node */
|
|
||||||
queryResult = PQexec(workerConnection, "BEGIN");
|
|
||||||
if (PQresultStatus(queryResult) != PGRES_COMMAND_OK)
|
|
||||||
{
|
|
||||||
ReraiseRemoteError(workerConnection, queryResult);
|
|
||||||
}
|
|
||||||
|
|
||||||
PQclear(queryResult);
|
|
||||||
|
|
||||||
/* iterate over the commands and execute them in the same connection */
|
|
||||||
foreach(commandCell, commandList)
|
|
||||||
{
|
|
||||||
char *commandString = lfirst(commandCell);
|
|
||||||
ExecStatusType resultStatus = PGRES_EMPTY_QUERY;
|
|
||||||
|
|
||||||
queryResult = PQexec(workerConnection, commandString);
|
|
||||||
resultStatus = PQresultStatus(queryResult);
|
|
||||||
if (!(resultStatus == PGRES_SINGLE_TUPLE || resultStatus == PGRES_TUPLES_OK ||
|
|
||||||
resultStatus == PGRES_COMMAND_OK))
|
|
||||||
{
|
|
||||||
ReraiseRemoteError(workerConnection, queryResult);
|
|
||||||
}
|
|
||||||
|
|
||||||
PQclear(queryResult);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* commit the transaction on the worker node */
|
|
||||||
queryResult = PQexec(workerConnection, "COMMIT");
|
|
||||||
if (PQresultStatus(queryResult) != PGRES_COMMAND_OK)
|
|
||||||
{
|
|
||||||
ReraiseRemoteError(workerConnection, queryResult);
|
|
||||||
}
|
|
||||||
|
|
||||||
PQclear(queryResult);
|
|
||||||
|
|
||||||
/* clear NULL result */
|
|
||||||
PQgetResult(workerConnection);
|
|
||||||
|
|
||||||
/* we no longer need this connection */
|
|
||||||
PQfinish(workerConnection);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* CitusExtensionOwnerName returns the name of the owner of the extension.
|
|
||||||
*
|
|
||||||
* FIXME: Copied from Citus-MX, should be removed once those changes checked-in to Citus.
|
|
||||||
*/
|
|
||||||
static char *
|
|
||||||
CitusExtensionOwnerName(void)
|
|
||||||
{
|
|
||||||
Oid superUserId = CitusExtensionOwner();
|
|
||||||
|
|
||||||
#if (PG_VERSION_NUM < 90500)
|
|
||||||
return GetUserNameFromId(superUserId);
|
|
||||||
#else
|
|
||||||
return GetUserNameFromId(superUserId, false);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
|
@ -26,10 +26,6 @@
|
||||||
static uint32 DistributedTransactionId = 0;
|
static uint32 DistributedTransactionId = 0;
|
||||||
|
|
||||||
|
|
||||||
/* Local functions forward declarations */
|
|
||||||
static StringInfo BuildTransactionName(int connectionId);
|
|
||||||
|
|
||||||
|
|
||||||
/* the commit protocol to use for COPY commands */
|
/* the commit protocol to use for COPY commands */
|
||||||
int MultiShardCommitProtocol = COMMIT_PROTOCOL_1PC;
|
int MultiShardCommitProtocol = COMMIT_PROTOCOL_1PC;
|
||||||
|
|
||||||
|
@ -252,7 +248,7 @@ CommitRemoteTransactions(List *connectionList, bool stopOnFailure)
|
||||||
* transaction, which causes it to be rolled back. In general, the user
|
* transaction, which causes it to be rolled back. In general, the user
|
||||||
* should ensure that prepared transactions do not linger.
|
* should ensure that prepared transactions do not linger.
|
||||||
*/
|
*/
|
||||||
static StringInfo
|
StringInfo
|
||||||
BuildTransactionName(int connectionId)
|
BuildTransactionName(int connectionId)
|
||||||
{
|
{
|
||||||
StringInfo commandString = makeStringInfo();
|
StringInfo commandString = makeStringInfo();
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#include "distributed/connection_cache.h"
|
#include "distributed/connection_cache.h"
|
||||||
#include "distributed/master_metadata_utility.h"
|
#include "distributed/master_metadata_utility.h"
|
||||||
#include "distributed/multi_shard_transaction.h"
|
#include "distributed/multi_shard_transaction.h"
|
||||||
|
#include "distributed/worker_manager.h"
|
||||||
#include "nodes/pg_list.h"
|
#include "nodes/pg_list.h"
|
||||||
#include "storage/ipc.h"
|
#include "storage/ipc.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
|
@ -126,8 +127,16 @@ BeginTransactionOnShardPlacements(uint64 shardId, char *userName)
|
||||||
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(placementCell);
|
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(placementCell);
|
||||||
PGconn *connection = NULL;
|
PGconn *connection = NULL;
|
||||||
TransactionConnection *transactionConnection = NULL;
|
TransactionConnection *transactionConnection = NULL;
|
||||||
|
WorkerNode *workerNode = FindWorkerNode(shardPlacement->nodeName,
|
||||||
|
shardPlacement->nodePort);
|
||||||
PGresult *result = NULL;
|
PGresult *result = NULL;
|
||||||
|
|
||||||
|
if (workerNode == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errmsg("could not find worker node %s:%d",
|
||||||
|
shardPlacement->nodeName, shardPlacement->nodePort)));
|
||||||
|
}
|
||||||
|
|
||||||
connection = ConnectToNode(shardPlacement->nodeName, shardPlacement->nodePort,
|
connection = ConnectToNode(shardPlacement->nodeName, shardPlacement->nodePort,
|
||||||
userName);
|
userName);
|
||||||
|
|
||||||
|
@ -142,9 +151,12 @@ BeginTransactionOnShardPlacements(uint64 shardId, char *userName)
|
||||||
|
|
||||||
transactionConnection = palloc0(sizeof(TransactionConnection));
|
transactionConnection = palloc0(sizeof(TransactionConnection));
|
||||||
|
|
||||||
|
transactionConnection->groupId = workerNode->groupId;
|
||||||
transactionConnection->connectionId = shardConnections->shardId;
|
transactionConnection->connectionId = shardConnections->shardId;
|
||||||
transactionConnection->transactionState = TRANSACTION_STATE_INVALID;
|
transactionConnection->transactionState = TRANSACTION_STATE_INVALID;
|
||||||
transactionConnection->connection = connection;
|
transactionConnection->connection = connection;
|
||||||
|
transactionConnection->nodeName = shardPlacement->nodeName;
|
||||||
|
transactionConnection->nodePort = shardPlacement->nodePort;
|
||||||
|
|
||||||
shardConnections->connectionList = lappend(shardConnections->connectionList,
|
shardConnections->connectionList = lappend(shardConnections->connectionList,
|
||||||
transactionConnection);
|
transactionConnection);
|
||||||
|
|
|
@ -0,0 +1,545 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* transaction_recovery.c
|
||||||
|
*
|
||||||
|
* Routines for recovering two-phase commits started by this node if a
|
||||||
|
* failure occurs between prepare and commit/abort.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2016, Citus Data, Inc.
|
||||||
|
*
|
||||||
|
* $Id$
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "postgres.h"
|
||||||
|
#include "miscadmin.h"
|
||||||
|
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "access/heapam.h"
|
||||||
|
#include "access/htup_details.h"
|
||||||
|
#include "access/relscan.h"
|
||||||
|
#include "access/xact.h"
|
||||||
|
#include "catalog/indexing.h"
|
||||||
|
#include "distributed/commit_protocol.h"
|
||||||
|
#include "distributed/connection_cache.h"
|
||||||
|
#include "distributed/listutils.h"
|
||||||
|
#include "distributed/metadata_cache.h"
|
||||||
|
#include "distributed/pg_dist_transaction.h"
|
||||||
|
#include "distributed/transaction_recovery.h"
|
||||||
|
#include "distributed/worker_manager.h"
|
||||||
|
#include "distributed/worker_transaction.h"
|
||||||
|
#include "lib/stringinfo.h"
|
||||||
|
#include "storage/lmgr.h"
|
||||||
|
#include "storage/lock.h"
|
||||||
|
#include "utils/builtins.h"
|
||||||
|
#include "utils/fmgroids.h"
|
||||||
|
#include "utils/memutils.h"
|
||||||
|
#include "utils/rel.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* exports for SQL callable functions */
|
||||||
|
PG_FUNCTION_INFO_V1(recover_prepared_transactions);
|
||||||
|
|
||||||
|
|
||||||
|
/* Local functions forward declarations */
|
||||||
|
static void LogTransactionRecord(int groupId, char *transactionName);
|
||||||
|
static int RecoverPreparedTransactions(void);
|
||||||
|
static int RecoverWorkerTransactions(WorkerNode *workerNode);
|
||||||
|
static List * NameListDifference(List *nameList, List *subtractList);
|
||||||
|
static int CompareNames(const void *leftPointer, const void *rightPointer);
|
||||||
|
static bool FindMatchingName(char **nameArray, int nameCount, char *needle,
|
||||||
|
int *matchIndex);
|
||||||
|
static List * PendingWorkerTransactionList(PGconn *connection);
|
||||||
|
static List * UnconfirmedWorkerTransactionsList(int groupId);
|
||||||
|
static void DeleteTransactionRecord(int32 groupId, char *transactionName);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* recover_prepared_transactions recovers any pending prepared
|
||||||
|
* transactions started by this node on other nodes.
|
||||||
|
*/
|
||||||
|
Datum
|
||||||
|
recover_prepared_transactions(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
int recoveredTransactionCount = 0;
|
||||||
|
|
||||||
|
recoveredTransactionCount = RecoverPreparedTransactions();
|
||||||
|
|
||||||
|
PG_RETURN_INT32(recoveredTransactionCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LogPreparedTransactions logs a commit record for all prepared transactions
|
||||||
|
* on connections in connectionList. The remote transaction is safe to commit
|
||||||
|
* once the record has been durably stored (i.e. the local transaction is
|
||||||
|
* committed).
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
LogPreparedTransactions(List *connectionList)
|
||||||
|
{
|
||||||
|
ListCell *connectionCell = NULL;
|
||||||
|
|
||||||
|
foreach(connectionCell, connectionList)
|
||||||
|
{
|
||||||
|
TransactionConnection *transactionConnection =
|
||||||
|
(TransactionConnection *) lfirst(connectionCell);
|
||||||
|
|
||||||
|
char transactionState PG_USED_FOR_ASSERTS_ONLY =
|
||||||
|
transactionConnection->transactionState;
|
||||||
|
int groupId = transactionConnection->groupId;
|
||||||
|
int64 connectionId = transactionConnection->connectionId;
|
||||||
|
StringInfo transactionName = BuildTransactionName(connectionId);
|
||||||
|
|
||||||
|
Assert(transactionState == TRANSACTION_STATE_PREPARED);
|
||||||
|
|
||||||
|
LogTransactionRecord(groupId, transactionName->data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LogTransactionRecord registers the fact that a transaction has been
|
||||||
|
* prepared on a worker. The presence of this record indicates that the
|
||||||
|
* prepared transaction should be committed.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
LogTransactionRecord(int groupId, char *transactionName)
|
||||||
|
{
|
||||||
|
Relation pgDistTransaction = NULL;
|
||||||
|
TupleDesc tupleDescriptor = NULL;
|
||||||
|
HeapTuple heapTuple = NULL;
|
||||||
|
Datum values[Natts_pg_dist_transaction];
|
||||||
|
bool isNulls[Natts_pg_dist_transaction];
|
||||||
|
|
||||||
|
/* form new transaction tuple */
|
||||||
|
memset(values, 0, sizeof(values));
|
||||||
|
memset(isNulls, false, sizeof(isNulls));
|
||||||
|
|
||||||
|
values[Anum_pg_dist_transaction_groupid - 1] = Int32GetDatum(groupId);
|
||||||
|
values[Anum_pg_dist_transaction_gid - 1] = CStringGetTextDatum(transactionName);
|
||||||
|
|
||||||
|
/* open transaction relation and insert new tuple */
|
||||||
|
pgDistTransaction = heap_open(DistTransactionRelationId(), RowExclusiveLock);
|
||||||
|
|
||||||
|
tupleDescriptor = RelationGetDescr(pgDistTransaction);
|
||||||
|
heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
|
||||||
|
|
||||||
|
simple_heap_insert(pgDistTransaction, heapTuple);
|
||||||
|
CatalogUpdateIndexes(pgDistTransaction, heapTuple);
|
||||||
|
CommandCounterIncrement();
|
||||||
|
|
||||||
|
/* close relation and invalidate previous cache entry */
|
||||||
|
heap_close(pgDistTransaction, RowExclusiveLock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RecoverPreparedTransactions recovers any pending prepared
|
||||||
|
* transactions started by this node on other nodes.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
RecoverPreparedTransactions(void)
|
||||||
|
{
|
||||||
|
List *workerList = NIL;
|
||||||
|
ListCell *workerNodeCell = NULL;
|
||||||
|
int recoveredTransactionCount = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We block here if metadata transactions are ongoing, since we
|
||||||
|
* mustn't commit/abort their prepared transactions under their
|
||||||
|
* feet. We also prevent concurrent recovery.
|
||||||
|
*/
|
||||||
|
LockRelationOid(DistTransactionRelationId(), ExclusiveLock);
|
||||||
|
|
||||||
|
workerList = WorkerNodeList();
|
||||||
|
|
||||||
|
foreach(workerNodeCell, workerList)
|
||||||
|
{
|
||||||
|
WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);
|
||||||
|
|
||||||
|
recoveredTransactionCount += RecoverWorkerTransactions(workerNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
return recoveredTransactionCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RecoverWorkerTransactions recovers any pending prepared transactions
|
||||||
|
* started by this node on the specified worker.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
RecoverWorkerTransactions(WorkerNode *workerNode)
|
||||||
|
{
|
||||||
|
int recoveredTransactionCount = 0;
|
||||||
|
|
||||||
|
int groupId = workerNode->groupId;
|
||||||
|
char *nodeName = workerNode->workerName;
|
||||||
|
int nodePort = workerNode->workerPort;
|
||||||
|
|
||||||
|
List *pendingTransactionList = NIL;
|
||||||
|
ListCell *pendingTransactionCell = NULL;
|
||||||
|
|
||||||
|
List *unconfirmedTransactionList = NIL;
|
||||||
|
char **unconfirmedTransactionArray = NULL;
|
||||||
|
int unconfirmedTransactionCount = 0;
|
||||||
|
int unconfirmedTransactionIndex = 0;
|
||||||
|
|
||||||
|
List *committedTransactionList = NIL;
|
||||||
|
ListCell *committedTransactionCell = NULL;
|
||||||
|
|
||||||
|
MemoryContext localContext = NULL;
|
||||||
|
MemoryContext oldContext = NULL;
|
||||||
|
|
||||||
|
PGconn *connection = GetOrEstablishConnection(nodeName, nodePort);
|
||||||
|
if (connection == NULL)
|
||||||
|
{
|
||||||
|
/* cannot recover transactions on this worker right now */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
localContext = AllocSetContextCreate(CurrentMemoryContext,
|
||||||
|
"RecoverWorkerTransactions",
|
||||||
|
ALLOCSET_DEFAULT_MINSIZE,
|
||||||
|
ALLOCSET_DEFAULT_INITSIZE,
|
||||||
|
ALLOCSET_DEFAULT_MAXSIZE);
|
||||||
|
oldContext = MemoryContextSwitchTo(localContext);
|
||||||
|
|
||||||
|
/* find transactions that were committed, but not yet confirmed */
|
||||||
|
unconfirmedTransactionList = UnconfirmedWorkerTransactionsList(groupId);
|
||||||
|
unconfirmedTransactionList = SortList(unconfirmedTransactionList, CompareNames);
|
||||||
|
|
||||||
|
/* convert list to an array to use with FindMatchingNames */
|
||||||
|
unconfirmedTransactionCount = list_length(unconfirmedTransactionList);
|
||||||
|
unconfirmedTransactionArray =
|
||||||
|
(char **) PointerArrayFromList(unconfirmedTransactionList);
|
||||||
|
|
||||||
|
/* find stale prepared transactions on the remote node */
|
||||||
|
pendingTransactionList = PendingWorkerTransactionList(connection);
|
||||||
|
pendingTransactionList = SortList(pendingTransactionList, CompareNames);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Transactions that have no pending prepared transaction are assumed to
|
||||||
|
* have been committed. Any records in unconfirmedTransactionList that
|
||||||
|
* don't have a transaction in pendingTransactionList can be removed.
|
||||||
|
*/
|
||||||
|
committedTransactionList = NameListDifference(unconfirmedTransactionList,
|
||||||
|
pendingTransactionList);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For each pending prepared transaction, check whether there is a transaction
|
||||||
|
* record. If so, commit. If not, the transaction that started the transaction
|
||||||
|
* must have rolled back and thus the prepared transaction should be aborted.
|
||||||
|
*/
|
||||||
|
foreach(pendingTransactionCell, pendingTransactionList)
|
||||||
|
{
|
||||||
|
char *transactionName = (char *) lfirst(pendingTransactionCell);
|
||||||
|
StringInfo command = makeStringInfo();
|
||||||
|
PGresult *result = NULL;
|
||||||
|
|
||||||
|
bool shouldCommit = FindMatchingName(unconfirmedTransactionArray,
|
||||||
|
unconfirmedTransactionCount,
|
||||||
|
transactionName,
|
||||||
|
&unconfirmedTransactionIndex);
|
||||||
|
|
||||||
|
if (shouldCommit)
|
||||||
|
{
|
||||||
|
/* should have committed this prepared transaction */
|
||||||
|
appendStringInfo(command, "COMMIT PREPARED '%s'", transactionName);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* no record of this prepared transaction, abort */
|
||||||
|
appendStringInfo(command, "ROLLBACK PREPARED '%s'", transactionName);
|
||||||
|
}
|
||||||
|
|
||||||
|
result = PQexec(connection, command->data);
|
||||||
|
if (PQresultStatus(result) != PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
WarnRemoteError(connection, result);
|
||||||
|
PQclear(result);
|
||||||
|
|
||||||
|
/* cannot recover this transaction right now */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(result);
|
||||||
|
|
||||||
|
ereport(NOTICE, (errmsg("recovered a prepared transaction on %s:%d",
|
||||||
|
nodeName, nodePort),
|
||||||
|
errcontext("%s", command->data)));
|
||||||
|
|
||||||
|
if (shouldCommit)
|
||||||
|
{
|
||||||
|
committedTransactionList = lappend(committedTransactionList,
|
||||||
|
transactionName);
|
||||||
|
}
|
||||||
|
|
||||||
|
recoveredTransactionCount += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* we can remove the transaction records of confirmed transactions */
|
||||||
|
foreach(committedTransactionCell, committedTransactionList)
|
||||||
|
{
|
||||||
|
char *transactionName = (char *) lfirst(committedTransactionCell);
|
||||||
|
|
||||||
|
DeleteTransactionRecord(groupId, transactionName);
|
||||||
|
}
|
||||||
|
|
||||||
|
MemoryContextReset(localContext);
|
||||||
|
MemoryContextSwitchTo(oldContext);
|
||||||
|
|
||||||
|
return recoveredTransactionCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* NameListDifference returns the difference between the bag of
|
||||||
|
* names in nameList and subtractList. Both are assumed to be
|
||||||
|
* sorted. We cannot use list_difference_ptr here since we need
|
||||||
|
* to compare the actual strings.
|
||||||
|
*/
|
||||||
|
static List *
|
||||||
|
NameListDifference(List *nameList, List *subtractList)
|
||||||
|
{
|
||||||
|
List *differenceList = NIL;
|
||||||
|
ListCell *nameCell = NULL;
|
||||||
|
|
||||||
|
int subtractIndex = 0;
|
||||||
|
int subtractCount = list_length(subtractList);
|
||||||
|
char **subtractArray = (char **) PointerArrayFromList(subtractList);
|
||||||
|
|
||||||
|
foreach(nameCell, nameList)
|
||||||
|
{
|
||||||
|
char *baseName = (char *) lfirst(nameCell);
|
||||||
|
|
||||||
|
bool nameFound = FindMatchingName(subtractArray, subtractCount,
|
||||||
|
baseName, &subtractIndex);
|
||||||
|
|
||||||
|
if (!nameFound)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* baseName is not in subtractArray and thus included
|
||||||
|
* in the difference.
|
||||||
|
*/
|
||||||
|
differenceList = lappend(differenceList, baseName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pfree(subtractArray);
|
||||||
|
|
||||||
|
return differenceList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CompareNames compares names using strncmp. Its signature allows it to
|
||||||
|
* be used in qsort.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
CompareNames(const void *leftPointer, const void *rightPointer)
|
||||||
|
{
|
||||||
|
const char *leftString = *((char **) leftPointer);
|
||||||
|
const char *rightString = *((char **) rightPointer);
|
||||||
|
|
||||||
|
int nameCompare = strncmp(leftString, rightString, NAMEDATALEN);
|
||||||
|
|
||||||
|
return nameCompare;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FindMatchingName searches for name in nameArray, starting at the
|
||||||
|
* value pointed to by matchIndex and stopping at the first index of
|
||||||
|
* name which is greater or equal to needle. nameArray is assumed
|
||||||
|
* to be sorted.
|
||||||
|
*
|
||||||
|
* The function sets matchIndex to the index of the name and returns
|
||||||
|
* true if the name is equal to needle. If matchIndex >= nameCount,
|
||||||
|
* then the function always returns false.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
FindMatchingName(char **nameArray, int nameCount, char *needle,
|
||||||
|
int *matchIndex)
|
||||||
|
{
|
||||||
|
bool foundMatchingName = false;
|
||||||
|
int searchIndex = *matchIndex;
|
||||||
|
int compareResult = -1;
|
||||||
|
|
||||||
|
while (searchIndex < nameCount)
|
||||||
|
{
|
||||||
|
char *testName = nameArray[searchIndex];
|
||||||
|
compareResult = strncmp(needle, testName, NAMEDATALEN);
|
||||||
|
|
||||||
|
if (compareResult <= 0)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
searchIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
*matchIndex = searchIndex;
|
||||||
|
|
||||||
|
if (compareResult == 0)
|
||||||
|
{
|
||||||
|
foundMatchingName = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return foundMatchingName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* PendingWorkerTransactionList returns a list of pending prepared
|
||||||
|
* transactions on a remote node that were started by this node.
|
||||||
|
*/
|
||||||
|
static List *
|
||||||
|
PendingWorkerTransactionList(PGconn *connection)
|
||||||
|
{
|
||||||
|
StringInfo command = makeStringInfo();
|
||||||
|
PGresult *result = NULL;
|
||||||
|
int rowCount = 0;
|
||||||
|
int rowIndex = 0;
|
||||||
|
List *transactionNames = NIL;
|
||||||
|
int coordinatorId = 0;
|
||||||
|
|
||||||
|
appendStringInfo(command, "SELECT gid FROM pg_prepared_xacts "
|
||||||
|
"WHERE gid LIKE 'citus_%d_%%'",
|
||||||
|
coordinatorId);
|
||||||
|
|
||||||
|
result = PQexec(connection, command->data);
|
||||||
|
if (PQresultStatus(result) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(connection, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
rowCount = PQntuples(result);
|
||||||
|
|
||||||
|
for (rowIndex = 0; rowIndex < rowCount; rowIndex++)
|
||||||
|
{
|
||||||
|
const int columnIndex = 0;
|
||||||
|
char *transactionName = PQgetvalue(result, rowIndex, columnIndex);
|
||||||
|
|
||||||
|
transactionNames = lappend(transactionNames, pstrdup(transactionName));
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(result);
|
||||||
|
|
||||||
|
return transactionNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* UnconfirmedWorkerTransactionList returns a list of unconfirmed transactions
|
||||||
|
* for a group of workers from pg_dist_transaction. A transaction is confirmed
|
||||||
|
* once we have verified that it does not exist in pg_prepared_xacts on the
|
||||||
|
* remote node and the entry in pg_dist_transaction is removed.
|
||||||
|
*/
|
||||||
|
static List *
|
||||||
|
UnconfirmedWorkerTransactionsList(int groupId)
|
||||||
|
{
|
||||||
|
List *transactionNameList = NIL;
|
||||||
|
Relation pgDistTransaction = NULL;
|
||||||
|
SysScanDesc scanDescriptor = NULL;
|
||||||
|
ScanKeyData scanKey[1];
|
||||||
|
int scanKeyCount = 1;
|
||||||
|
bool indexOK = true;
|
||||||
|
HeapTuple heapTuple = NULL;
|
||||||
|
|
||||||
|
pgDistTransaction = heap_open(DistTransactionRelationId(), AccessShareLock);
|
||||||
|
|
||||||
|
ScanKeyInit(&scanKey[0], Anum_pg_dist_transaction_groupid,
|
||||||
|
BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(groupId));
|
||||||
|
|
||||||
|
scanDescriptor = systable_beginscan(pgDistTransaction,
|
||||||
|
DistTransactionGroupIndexId(), indexOK,
|
||||||
|
NULL, scanKeyCount, scanKey);
|
||||||
|
|
||||||
|
heapTuple = systable_getnext(scanDescriptor);
|
||||||
|
while (HeapTupleIsValid(heapTuple))
|
||||||
|
{
|
||||||
|
TupleDesc tupleDescriptor = RelationGetDescr(pgDistTransaction);
|
||||||
|
bool isNull = false;
|
||||||
|
|
||||||
|
Datum transactionNameDatum = heap_getattr(heapTuple,
|
||||||
|
Anum_pg_dist_transaction_gid,
|
||||||
|
tupleDescriptor, &isNull);
|
||||||
|
|
||||||
|
char *transactionName = TextDatumGetCString(transactionNameDatum);
|
||||||
|
transactionNameList = lappend(transactionNameList, transactionName);
|
||||||
|
|
||||||
|
heapTuple = systable_getnext(scanDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
|
systable_endscan(scanDescriptor);
|
||||||
|
heap_close(pgDistTransaction, AccessShareLock);
|
||||||
|
|
||||||
|
return transactionNameList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* DeleteTransactionRecord opens the pg_dist_transaction system catalog, finds the
|
||||||
|
* first (unique) row that corresponds to the given transactionName and worker node,
|
||||||
|
* and deletes this row.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
DeleteTransactionRecord(int32 groupId, char *transactionName)
|
||||||
|
{
|
||||||
|
Relation pgDistTransaction = NULL;
|
||||||
|
SysScanDesc scanDescriptor = NULL;
|
||||||
|
ScanKeyData scanKey[1];
|
||||||
|
int scanKeyCount = 1;
|
||||||
|
bool indexOK = true;
|
||||||
|
HeapTuple heapTuple = NULL;
|
||||||
|
bool heapTupleFound = false;
|
||||||
|
|
||||||
|
pgDistTransaction = heap_open(DistTransactionRelationId(), RowExclusiveLock);
|
||||||
|
|
||||||
|
ScanKeyInit(&scanKey[0], Anum_pg_dist_transaction_groupid,
|
||||||
|
BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(groupId));
|
||||||
|
|
||||||
|
scanDescriptor = systable_beginscan(pgDistTransaction,
|
||||||
|
DistTransactionGroupIndexId(), indexOK,
|
||||||
|
NULL, scanKeyCount, scanKey);
|
||||||
|
|
||||||
|
heapTuple = systable_getnext(scanDescriptor);
|
||||||
|
while (HeapTupleIsValid(heapTuple))
|
||||||
|
{
|
||||||
|
TupleDesc tupleDescriptor = RelationGetDescr(pgDistTransaction);
|
||||||
|
bool isNull = false;
|
||||||
|
|
||||||
|
Datum gidDatum = heap_getattr(heapTuple,
|
||||||
|
Anum_pg_dist_transaction_gid,
|
||||||
|
tupleDescriptor, &isNull);
|
||||||
|
|
||||||
|
char *gid = TextDatumGetCString(gidDatum);
|
||||||
|
|
||||||
|
if (strncmp(transactionName, gid, NAMEDATALEN) == 0)
|
||||||
|
{
|
||||||
|
heapTupleFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
heapTuple = systable_getnext(scanDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if we couldn't find the transaction record to delete, error out */
|
||||||
|
if (!heapTupleFound)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errmsg("could not find valid entry for transaction record "
|
||||||
|
"'%s' in group %d",
|
||||||
|
transactionName, groupId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
simple_heap_delete(pgDistTransaction, &heapTuple->t_self);
|
||||||
|
CommandCounterIncrement();
|
||||||
|
|
||||||
|
systable_endscan(scanDescriptor);
|
||||||
|
heap_close(pgDistTransaction, RowExclusiveLock);
|
||||||
|
}
|
|
@ -0,0 +1,518 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* worker_transaction.c
|
||||||
|
*
|
||||||
|
* Routines for performing transactions across all workers.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2016, Citus Data, Inc.
|
||||||
|
*
|
||||||
|
* $Id$
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "postgres.h"
|
||||||
|
#include "miscadmin.h"
|
||||||
|
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "access/xact.h"
|
||||||
|
#include "distributed/commit_protocol.h"
|
||||||
|
#include "distributed/connection_cache.h"
|
||||||
|
#include "distributed/metadata_cache.h"
|
||||||
|
#include "distributed/multi_shard_transaction.h"
|
||||||
|
#include "distributed/resource_lock.h"
|
||||||
|
#include "distributed/pg_dist_node.h"
|
||||||
|
#include "distributed/pg_dist_transaction.h"
|
||||||
|
#include "distributed/transaction_recovery.h"
|
||||||
|
#include "distributed/worker_manager.h"
|
||||||
|
#include "distributed/worker_transaction.h"
|
||||||
|
#include "utils/memutils.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* Local functions forward declarations */
|
||||||
|
static void EnableXactCallback(void);
|
||||||
|
static void CompleteWorkerTransactions(XactEvent event, void *arg);
|
||||||
|
static List * OpenWorkerTransactions(void);
|
||||||
|
static TransactionConnection * GetWorkerTransaction(char *nodeName, int32 nodePort);
|
||||||
|
static List * GetTargetWorkerTransactions(TargetWorkerSet targetWorkerSet);
|
||||||
|
static bool IsResponseOK(ExecStatusType resultStatus);
|
||||||
|
|
||||||
|
|
||||||
|
/* Global worker connection list */
|
||||||
|
static List *workerConnectionList = NIL;
|
||||||
|
static bool isXactCallbackRegistered = false;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GetWorkerTransactions opens connections to all workers and starts
|
||||||
|
* a transaction block that is committed or aborted when the local
|
||||||
|
* transaction commits or aborts. Multiple invocations of
|
||||||
|
* GetWorkerTransactions within the same transaction will return
|
||||||
|
* the same list of connections.
|
||||||
|
*/
|
||||||
|
List *
|
||||||
|
GetWorkerTransactions(void)
|
||||||
|
{
|
||||||
|
if (workerConnectionList == NIL)
|
||||||
|
{
|
||||||
|
InitializeDistributedTransaction();
|
||||||
|
EnableXactCallback();
|
||||||
|
|
||||||
|
workerConnectionList = OpenWorkerTransactions();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ensure that number of workers has not change */
|
||||||
|
Assert(list_length(WorkerNodeList()) == list_length(workerConnectionList));
|
||||||
|
|
||||||
|
return workerConnectionList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SendCommandToWorker sends a command to a particular worker as part of the
|
||||||
|
* 2PC.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
SendCommandToWorker(char *nodeName, int32 nodePort, char *command)
|
||||||
|
{
|
||||||
|
TransactionConnection *transactionConnection = NULL;
|
||||||
|
PGresult *queryResult = NULL;
|
||||||
|
ExecStatusType resultStatus = PGRES_EMPTY_QUERY;
|
||||||
|
|
||||||
|
transactionConnection = GetWorkerTransaction(nodeName, nodePort);
|
||||||
|
if (transactionConnection == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errmsg("worker %s:%d is not part of current transaction",
|
||||||
|
nodeName, nodePort)));
|
||||||
|
}
|
||||||
|
|
||||||
|
queryResult = PQexec(transactionConnection->connection, command);
|
||||||
|
resultStatus = PQresultStatus(queryResult);
|
||||||
|
if (resultStatus != PGRES_COMMAND_OK && resultStatus != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(transactionConnection->connection, queryResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(queryResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SendCommandToWorkers sends a command to all workers in
|
||||||
|
* parallel. Commands are committed on the workers when the local
|
||||||
|
* transaction commits. The connection are made as the extension
|
||||||
|
* owner to ensure write access to the Citus metadata tables.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
SendCommandToWorkers(TargetWorkerSet targetWorkerSet, char *command)
|
||||||
|
{
|
||||||
|
SendCommandToWorkersParams(targetWorkerSet, command, 0, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SendCommandToWorkersParams sends a command to all workers in parallel.
|
||||||
|
* Commands are committed on the workers when the local transaction commits. The
|
||||||
|
* connection are made as the extension owner to ensure write access to the Citus
|
||||||
|
* metadata tables. Parameters can be specified as for PQexecParams, except that
|
||||||
|
* paramLengths, paramFormats and resultFormat are hard-coded to NULL, NULL and 0
|
||||||
|
* respectively.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
SendCommandToWorkersParams(TargetWorkerSet targetWorkerSet, char *command,
|
||||||
|
int parameterCount, const Oid *parameterTypes,
|
||||||
|
const char *const *parameterValues)
|
||||||
|
{
|
||||||
|
ListCell *connectionCell = NULL;
|
||||||
|
List *targetConnectionList = GetTargetWorkerTransactions(targetWorkerSet);
|
||||||
|
|
||||||
|
foreach(connectionCell, targetConnectionList)
|
||||||
|
{
|
||||||
|
TransactionConnection *transactionConnection =
|
||||||
|
(TransactionConnection *) lfirst(connectionCell);
|
||||||
|
|
||||||
|
PGconn *connection = transactionConnection->connection;
|
||||||
|
|
||||||
|
int querySent = PQsendQueryParams(connection, command, parameterCount,
|
||||||
|
parameterTypes, parameterValues, NULL, NULL, 0);
|
||||||
|
if (querySent == 0)
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(connection, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach(connectionCell, targetConnectionList)
|
||||||
|
{
|
||||||
|
TransactionConnection *transactionConnection =
|
||||||
|
(TransactionConnection *) lfirst(connectionCell);
|
||||||
|
|
||||||
|
PGconn *connection = transactionConnection->connection;
|
||||||
|
PGresult *result = PQgetResult(connection);
|
||||||
|
ExecStatusType resultStatus = PQresultStatus(result);
|
||||||
|
|
||||||
|
if (!IsResponseOK(resultStatus))
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(connection, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(result);
|
||||||
|
|
||||||
|
/* clear NULL result */
|
||||||
|
PQgetResult(connection);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* SendCommandListInSingleTransaction opens connection to the node with the given
|
||||||
|
* nodeName and nodePort. Then, the connection starts a transaction on the remote
|
||||||
|
* node and executes the commands in the transaction. The function raises error if
|
||||||
|
* any of the queries fails.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
SendCommandListToWorkerInSingleTransaction(char *nodeName, int32 nodePort, char *nodeUser,
|
||||||
|
List *commandList)
|
||||||
|
{
|
||||||
|
PGconn *workerConnection = NULL;
|
||||||
|
PGresult *queryResult = NULL;
|
||||||
|
ListCell *commandCell = NULL;
|
||||||
|
|
||||||
|
workerConnection = ConnectToNode(nodeName, nodePort, nodeUser);
|
||||||
|
if (workerConnection == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errmsg("could not open connection to %s:%d as %s",
|
||||||
|
nodeName, nodePort, nodeUser)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* start the transaction on the worker node */
|
||||||
|
queryResult = PQexec(workerConnection, "BEGIN");
|
||||||
|
if (PQresultStatus(queryResult) != PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(workerConnection, queryResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(queryResult);
|
||||||
|
|
||||||
|
/* iterate over the commands and execute them in the same connection */
|
||||||
|
foreach(commandCell, commandList)
|
||||||
|
{
|
||||||
|
char *commandString = lfirst(commandCell);
|
||||||
|
ExecStatusType resultStatus = PGRES_EMPTY_QUERY;
|
||||||
|
|
||||||
|
queryResult = PQexec(workerConnection, commandString);
|
||||||
|
resultStatus = PQresultStatus(queryResult);
|
||||||
|
if (!(resultStatus == PGRES_SINGLE_TUPLE || resultStatus == PGRES_TUPLES_OK ||
|
||||||
|
resultStatus == PGRES_COMMAND_OK))
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(workerConnection, queryResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(queryResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* commit the transaction on the worker node */
|
||||||
|
queryResult = PQexec(workerConnection, "COMMIT");
|
||||||
|
if (PQresultStatus(queryResult) != PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(workerConnection, queryResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(queryResult);
|
||||||
|
|
||||||
|
/* clear NULL result */
|
||||||
|
PQgetResult(workerConnection);
|
||||||
|
|
||||||
|
/* we no longer need this connection */
|
||||||
|
PQfinish(workerConnection);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* IsWorkerTransactionActive returns true if there exists any on going
|
||||||
|
* worker transactions.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
IsWorkerTransactionActive(void)
|
||||||
|
{
|
||||||
|
bool isWorkerTransactionActive = false;
|
||||||
|
|
||||||
|
if (workerConnectionList != NIL)
|
||||||
|
{
|
||||||
|
isWorkerTransactionActive = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return isWorkerTransactionActive;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* EnableXactCallback registers the CompleteWorkerTransactions function as the callback
|
||||||
|
* of the worker transactions.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
EnableXactCallback(void)
|
||||||
|
{
|
||||||
|
if (!isXactCallbackRegistered)
|
||||||
|
{
|
||||||
|
RegisterXactCallback(CompleteWorkerTransactions, NULL);
|
||||||
|
isXactCallbackRegistered = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CompleteWorkerTransaction commits or aborts pending worker transactions
|
||||||
|
* when the local transaction commits or aborts.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
CompleteWorkerTransactions(XactEvent event, void *arg)
|
||||||
|
{
|
||||||
|
if (workerConnectionList == NIL)
|
||||||
|
{
|
||||||
|
/* nothing to do */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (event == XACT_EVENT_PRE_COMMIT)
|
||||||
|
{
|
||||||
|
if (MultiShardCommitProtocol == COMMIT_PROTOCOL_2PC)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Any failure here will cause local changes to be rolled back,
|
||||||
|
* and may leave a prepared transaction on the remote node.
|
||||||
|
*/
|
||||||
|
|
||||||
|
PrepareRemoteTransactions(workerConnectionList);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We are now ready to commit the local transaction, followed
|
||||||
|
* by the remote transaction. As a final step, write commit
|
||||||
|
* records to a table. If there is a last-minute crash
|
||||||
|
* on the local machine, then the absence of these records
|
||||||
|
* will indicate that the remote transactions should be rolled
|
||||||
|
* back. Otherwise, the presence of these records indicates
|
||||||
|
* that the remote transactions should be committed.
|
||||||
|
*/
|
||||||
|
|
||||||
|
LogPreparedTransactions(workerConnectionList);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (event == XACT_EVENT_COMMIT)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* A failure here may cause some prepared transactions to be
|
||||||
|
* left pending. However, the local change have already been
|
||||||
|
* committed and a commit record exists to indicate that the
|
||||||
|
* remote transaction should be committed as well.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CommitRemoteTransactions(workerConnectionList, false);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* At this point, it is safe to remove the transaction records
|
||||||
|
* for all commits that have succeeded. However, we are no
|
||||||
|
* longer in a transaction and therefore cannot make changes
|
||||||
|
* to the metadata.
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
else if (event == XACT_EVENT_ABORT)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* A failure here may cause some prepared transactions to be
|
||||||
|
* left pending. The local changes have already been rolled
|
||||||
|
* back and the absence of a commit record indicates that
|
||||||
|
* the remote transaction should be rolled back as well.
|
||||||
|
*/
|
||||||
|
|
||||||
|
AbortRemoteTransactions(workerConnectionList);
|
||||||
|
}
|
||||||
|
else if (event == XACT_EVENT_PREPARE || event == XACT_EVENT_PRE_PREPARE)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If we allow a prepare we might not get to the commit handler
|
||||||
|
* in this session. We could resolve that if we intercept
|
||||||
|
* COMMIT/ABORT PREPARED commands. For now, we just error out.
|
||||||
|
*/
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||||
|
errmsg("cannot prepare a transaction that modified "
|
||||||
|
"distributed tables")));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
CloseConnections(workerConnectionList);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Memory allocated in workerConnectionList will be reclaimed when
|
||||||
|
* TopTransactionContext is released.
|
||||||
|
*/
|
||||||
|
|
||||||
|
workerConnectionList = NIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* OpenWorkerTransactions opens connections to all primary workers and sends
|
||||||
|
* BEGIN commands. The returned TransactionConnection's are allocated in the
|
||||||
|
* top transaction context, such that they can still be used in the commit
|
||||||
|
* handler. The connections are made as the extension owner, such that they
|
||||||
|
* have write access to the Citus metadata tables.
|
||||||
|
*/
|
||||||
|
static List *
|
||||||
|
OpenWorkerTransactions(void)
|
||||||
|
{
|
||||||
|
ListCell *workerNodeCell = NULL;
|
||||||
|
List *connectionList = NIL;
|
||||||
|
MemoryContext oldContext = NULL;
|
||||||
|
List *workerList = NIL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A new node addition might be in progress which will invalidate the
|
||||||
|
* worker list. The following statement blocks until the node addition and
|
||||||
|
* metadata syncing finishes after which we reload the worker list.
|
||||||
|
* It also ensures that no new node addition and metadata synchronization
|
||||||
|
* will run until this transaction finishes.
|
||||||
|
*/
|
||||||
|
LockMetadataSnapshot(AccessShareLock);
|
||||||
|
|
||||||
|
workerList = WorkerNodeList();
|
||||||
|
|
||||||
|
oldContext = MemoryContextSwitchTo(TopTransactionContext);
|
||||||
|
|
||||||
|
foreach(workerNodeCell, workerList)
|
||||||
|
{
|
||||||
|
WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);
|
||||||
|
char *nodeUser = CitusExtensionOwnerName();
|
||||||
|
char *nodeName = workerNode->workerName;
|
||||||
|
int nodePort = workerNode->workerPort;
|
||||||
|
PGconn *connection = NULL;
|
||||||
|
|
||||||
|
TransactionConnection *transactionConnection = NULL;
|
||||||
|
PGresult *result = NULL;
|
||||||
|
|
||||||
|
connection = ConnectToNode(nodeName, nodePort, nodeUser);
|
||||||
|
if (connection == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errmsg("could not open connection to %s:%d as %s",
|
||||||
|
nodeName, nodePort, nodeUser)));
|
||||||
|
}
|
||||||
|
|
||||||
|
result = PQexec(connection, "BEGIN");
|
||||||
|
if (PQresultStatus(result) != PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
ReraiseRemoteError(connection, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(result);
|
||||||
|
|
||||||
|
transactionConnection = palloc0(sizeof(TransactionConnection));
|
||||||
|
|
||||||
|
transactionConnection->groupId = workerNode->groupId;
|
||||||
|
transactionConnection->connectionId = 0;
|
||||||
|
transactionConnection->transactionState = TRANSACTION_STATE_OPEN;
|
||||||
|
transactionConnection->connection = connection;
|
||||||
|
transactionConnection->nodeName = pstrdup(nodeName);
|
||||||
|
transactionConnection->nodePort = nodePort;
|
||||||
|
|
||||||
|
connectionList = lappend(connectionList, transactionConnection);
|
||||||
|
}
|
||||||
|
|
||||||
|
MemoryContextSwitchTo(oldContext);
|
||||||
|
|
||||||
|
return connectionList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GetNodeTransactionConnection finds the opened connection for the specified
|
||||||
|
* node. Note that it opens transaction connections to all workers, by
|
||||||
|
* calling GetWorkerTransactions therefore, it is suggested to use this
|
||||||
|
* function in operations that sends commands to all workers inside a
|
||||||
|
* distributed transaction.
|
||||||
|
*
|
||||||
|
* GetNodeTransactionConnection returns NULL, if the node with the specified
|
||||||
|
* nodeName and nodePort is not found. Note that this worker may open
|
||||||
|
* connections to all workers if there were not open already.
|
||||||
|
*/
|
||||||
|
static TransactionConnection *
|
||||||
|
GetWorkerTransaction(char *nodeName, int32 nodePort)
|
||||||
|
{
|
||||||
|
List *connectionList = NIL;
|
||||||
|
ListCell *connectionCell = NULL;
|
||||||
|
TransactionConnection *workerTransaction = NULL;
|
||||||
|
|
||||||
|
connectionList = GetWorkerTransactions();
|
||||||
|
|
||||||
|
foreach(connectionCell, connectionList)
|
||||||
|
{
|
||||||
|
TransactionConnection *transactionConnection =
|
||||||
|
(TransactionConnection *) lfirst(connectionCell);
|
||||||
|
|
||||||
|
if (strcmp(transactionConnection->nodeName, nodeName) == 0 &&
|
||||||
|
transactionConnection->nodePort == nodePort)
|
||||||
|
{
|
||||||
|
workerTransaction = transactionConnection;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return workerTransaction;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GetTargetWorkerTransactions returns a subset of all worker transactions
|
||||||
|
* matching the given target worker set.
|
||||||
|
*/
|
||||||
|
static List *
|
||||||
|
GetTargetWorkerTransactions(TargetWorkerSet targetWorkerSet)
|
||||||
|
{
|
||||||
|
List *allWorkerConnectionsList = GetWorkerTransactions();
|
||||||
|
List *targetConnectionList = NIL;
|
||||||
|
ListCell *connectionCell = NULL;
|
||||||
|
|
||||||
|
if (targetWorkerSet == WORKERS_WITH_METADATA)
|
||||||
|
{
|
||||||
|
foreach(connectionCell, allWorkerConnectionsList)
|
||||||
|
{
|
||||||
|
TransactionConnection *transactionConnection =
|
||||||
|
(TransactionConnection *) lfirst(connectionCell);
|
||||||
|
char *nodeName = pstrdup(transactionConnection->nodeName);
|
||||||
|
int nodePort = transactionConnection->nodePort;
|
||||||
|
WorkerNode *workerNode = FindWorkerNode(nodeName, nodePort);
|
||||||
|
|
||||||
|
if (workerNode->hasMetadata)
|
||||||
|
{
|
||||||
|
targetConnectionList = lappend(targetConnectionList,
|
||||||
|
transactionConnection);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
targetConnectionList = allWorkerConnectionsList;
|
||||||
|
}
|
||||||
|
|
||||||
|
return targetConnectionList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* IsResponseOK checks the resultStatus and returns true if the status is OK.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
IsResponseOK(ExecStatusType resultStatus)
|
||||||
|
{
|
||||||
|
if (resultStatus == PGRES_SINGLE_TUPLE || resultStatus == PGRES_TUPLES_OK ||
|
||||||
|
resultStatus == PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
|
@ -60,3 +60,24 @@ SortList(List *pointerList, int (*comparisonFunction)(const void *, const void *
|
||||||
|
|
||||||
return sortedList;
|
return sortedList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* PointerArrayFromList converts a list of pointers to an array of pointers.
|
||||||
|
*/
|
||||||
|
void **
|
||||||
|
PointerArrayFromList(List *pointerList)
|
||||||
|
{
|
||||||
|
int pointerCount = list_length(pointerList);
|
||||||
|
void **pointerArray = (void **) palloc0(pointerCount * sizeof(void *));
|
||||||
|
ListCell *pointerCell = NULL;
|
||||||
|
int pointerIndex = 0;
|
||||||
|
|
||||||
|
foreach(pointerCell, pointerList)
|
||||||
|
{
|
||||||
|
pointerArray[pointerIndex] = (void *) lfirst(pointerCell);
|
||||||
|
pointerIndex += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return pointerArray;
|
||||||
|
}
|
||||||
|
|
|
@ -62,6 +62,8 @@ static Oid distShardShardidIndexId = InvalidOid;
|
||||||
static Oid distShardPlacementShardidIndexId = InvalidOid;
|
static Oid distShardPlacementShardidIndexId = InvalidOid;
|
||||||
static Oid distShardPlacementPlacementidIndexId = InvalidOid;
|
static Oid distShardPlacementPlacementidIndexId = InvalidOid;
|
||||||
static Oid distShardPlacementNodeidIndexId = InvalidOid;
|
static Oid distShardPlacementNodeidIndexId = InvalidOid;
|
||||||
|
static Oid distTransactionRelationId = InvalidOid;
|
||||||
|
static Oid distTransactionGroupIndexId = InvalidOid;
|
||||||
static Oid extraDataContainerFuncId = InvalidOid;
|
static Oid extraDataContainerFuncId = InvalidOid;
|
||||||
|
|
||||||
/* Hash table for informations about each partition */
|
/* Hash table for informations about each partition */
|
||||||
|
@ -762,6 +764,27 @@ DistShardPlacementPlacementidIndexId(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* return oid of pg_dist_transaction relation */
|
||||||
|
Oid
|
||||||
|
DistTransactionRelationId(void)
|
||||||
|
{
|
||||||
|
CachedRelationLookup("pg_dist_transaction", &distTransactionRelationId);
|
||||||
|
|
||||||
|
return distTransactionRelationId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* return oid of pg_dist_transaction_group_index */
|
||||||
|
Oid
|
||||||
|
DistTransactionGroupIndexId(void)
|
||||||
|
{
|
||||||
|
CachedRelationLookup("pg_dist_transaction_group_index",
|
||||||
|
&distTransactionGroupIndexId);
|
||||||
|
|
||||||
|
return distTransactionGroupIndexId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* return oid of pg_dist_shard_placement_nodeid_index */
|
/* return oid of pg_dist_shard_placement_nodeid_index */
|
||||||
Oid
|
Oid
|
||||||
DistShardPlacementNodeidIndexId(void)
|
DistShardPlacementNodeidIndexId(void)
|
||||||
|
@ -857,6 +880,18 @@ CitusExtensionOwner(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CitusExtensionOwnerName returns the name of the owner of the extension.
|
||||||
|
*/
|
||||||
|
char *
|
||||||
|
CitusExtensionOwnerName(void)
|
||||||
|
{
|
||||||
|
Oid superUserId = CitusExtensionOwner();
|
||||||
|
|
||||||
|
return GetUserNameFromId(superUserId, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* return the username of the currently active role */
|
/* return the username of the currently active role */
|
||||||
char *
|
char *
|
||||||
CurrentUserName(void)
|
CurrentUserName(void)
|
||||||
|
@ -1382,6 +1417,7 @@ InvalidateDistRelationCacheCallback(Datum argument, Oid relationId)
|
||||||
distShardRelationId = InvalidOid;
|
distShardRelationId = InvalidOid;
|
||||||
distShardPlacementRelationId = InvalidOid;
|
distShardPlacementRelationId = InvalidOid;
|
||||||
distLocalGroupRelationId = InvalidOid;
|
distLocalGroupRelationId = InvalidOid;
|
||||||
|
distNodeRelationId = InvalidOid;
|
||||||
distPartitionRelationId = InvalidOid;
|
distPartitionRelationId = InvalidOid;
|
||||||
distPartitionLogicalRelidIndexId = InvalidOid;
|
distPartitionLogicalRelidIndexId = InvalidOid;
|
||||||
distPartitionColocationidIndexId = InvalidOid;
|
distPartitionColocationidIndexId = InvalidOid;
|
||||||
|
@ -1389,7 +1425,8 @@ InvalidateDistRelationCacheCallback(Datum argument, Oid relationId)
|
||||||
distShardShardidIndexId = InvalidOid;
|
distShardShardidIndexId = InvalidOid;
|
||||||
distShardPlacementShardidIndexId = InvalidOid;
|
distShardPlacementShardidIndexId = InvalidOid;
|
||||||
distShardPlacementPlacementidIndexId = InvalidOid;
|
distShardPlacementPlacementidIndexId = InvalidOid;
|
||||||
distNodeRelationId = InvalidOid;
|
distTransactionRelationId = InvalidOid;
|
||||||
|
distTransactionGroupIndexId = InvalidOid;
|
||||||
extraDataContainerFuncId = InvalidOid;
|
extraDataContainerFuncId = InvalidOid;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
#include "distributed/listutils.h"
|
#include "distributed/listutils.h"
|
||||||
#include "distributed/master_metadata_utility.h"
|
#include "distributed/master_metadata_utility.h"
|
||||||
|
#include "distributed/metadata_cache.h"
|
||||||
#include "distributed/relay_utility.h"
|
#include "distributed/relay_utility.h"
|
||||||
#include "distributed/resource_lock.h"
|
#include "distributed/resource_lock.h"
|
||||||
#include "distributed/shardinterval_utils.h"
|
#include "distributed/shardinterval_utils.h"
|
||||||
|
@ -149,3 +150,18 @@ LockShards(List *shardIntervalList, LOCKMODE lockMode)
|
||||||
LockShardResource(shardId, lockMode);
|
LockShardResource(shardId, lockMode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LockMetadataSnapshot acquires a lock needed to serialize changes to pg_dist_node
|
||||||
|
* and all other metadata changes. Operations that modify pg_dist_node should acquire
|
||||||
|
* AccessExclusiveLock. All other metadata changes should acquire AccessShareLock. Any locks
|
||||||
|
* acquired using this method are released at transaction end.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
LockMetadataSnapshot(LOCKMODE lockMode)
|
||||||
|
{
|
||||||
|
Assert(lockMode == AccessExclusiveLock || lockMode == AccessShareLock);
|
||||||
|
|
||||||
|
(void) LockRelationOid(DistNodeRelationId(), lockMode);
|
||||||
|
}
|
||||||
|
|
|
@ -42,9 +42,12 @@ typedef enum
|
||||||
*/
|
*/
|
||||||
typedef struct TransactionConnection
|
typedef struct TransactionConnection
|
||||||
{
|
{
|
||||||
|
int groupId;
|
||||||
int64 connectionId;
|
int64 connectionId;
|
||||||
TransactionState transactionState;
|
TransactionState transactionState;
|
||||||
PGconn *connection;
|
PGconn *connection;
|
||||||
|
const char *nodeName;
|
||||||
|
int nodePort;
|
||||||
} TransactionConnection;
|
} TransactionConnection;
|
||||||
|
|
||||||
|
|
||||||
|
@ -57,5 +60,6 @@ extern void InitializeDistributedTransaction(void);
|
||||||
extern void PrepareRemoteTransactions(List *connectionList);
|
extern void PrepareRemoteTransactions(List *connectionList);
|
||||||
extern void AbortRemoteTransactions(List *connectionList);
|
extern void AbortRemoteTransactions(List *connectionList);
|
||||||
extern void CommitRemoteTransactions(List *connectionList, bool stopOnFailure);
|
extern void CommitRemoteTransactions(List *connectionList, bool stopOnFailure);
|
||||||
|
extern StringInfo BuildTransactionName(int connectionId);
|
||||||
|
|
||||||
#endif /* COMMIT_PROTOCOL_H */
|
#endif /* COMMIT_PROTOCOL_H */
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
/* utility functions declaration shared within this module */
|
/* utility functions declaration shared within this module */
|
||||||
extern List * SortList(List *pointerList,
|
extern List * SortList(List *pointerList,
|
||||||
int (*ComparisonFunction)(const void *, const void *));
|
int (*ComparisonFunction)(const void *, const void *));
|
||||||
|
extern void ** PointerArrayFromList(List *pointerList);
|
||||||
|
|
||||||
|
|
||||||
#endif /* CITUS_LISTUTILS_H */
|
#endif /* CITUS_LISTUTILS_H */
|
||||||
|
|
|
@ -79,6 +79,8 @@ extern Oid DistShardLogicalRelidIndexId(void);
|
||||||
extern Oid DistShardShardidIndexId(void);
|
extern Oid DistShardShardidIndexId(void);
|
||||||
extern Oid DistShardPlacementShardidIndexId(void);
|
extern Oid DistShardPlacementShardidIndexId(void);
|
||||||
extern Oid DistShardPlacementPlacementidIndexId(void);
|
extern Oid DistShardPlacementPlacementidIndexId(void);
|
||||||
|
extern Oid DistTransactionRelationId(void);
|
||||||
|
extern Oid DistTransactionGroupIndexId(void);
|
||||||
extern Oid DistShardPlacementNodeidIndexId(void);
|
extern Oid DistShardPlacementNodeidIndexId(void);
|
||||||
|
|
||||||
/* function oids */
|
/* function oids */
|
||||||
|
@ -86,5 +88,6 @@ extern Oid CitusExtraDataContainerFuncId(void);
|
||||||
|
|
||||||
/* user related functions */
|
/* user related functions */
|
||||||
extern Oid CitusExtensionOwner(void);
|
extern Oid CitusExtensionOwner(void);
|
||||||
|
extern char * CitusExtensionOwnerName(void);
|
||||||
extern char * CurrentUserName(void);
|
extern char * CurrentUserName(void);
|
||||||
#endif /* METADATA_CACHE_H */
|
#endif /* METADATA_CACHE_H */
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* pg_dist_transaction.h
|
||||||
|
* definition of the "transaction" relation (pg_dist_transaction).
|
||||||
|
*
|
||||||
|
* Copyright (c) 2016, Citus Data, Inc.
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PG_DIST_TRANSACTION_H
|
||||||
|
#define PG_DIST_TRANSACTION_H
|
||||||
|
|
||||||
|
|
||||||
|
/* ----------------
|
||||||
|
* pg_dist_transaction definition.
|
||||||
|
* ----------------
|
||||||
|
*/
|
||||||
|
typedef struct FormData_pg_dist_transaction
|
||||||
|
{
|
||||||
|
int32 groupid; /* id of the replication group */
|
||||||
|
text gid; /* global transaction identifier */
|
||||||
|
} FormData_pg_dist_transaction;
|
||||||
|
|
||||||
|
|
||||||
|
/* ----------------
|
||||||
|
* Form_pg_dist_transaction corresponds to a pointer to a tuple with
|
||||||
|
* the format of the pg_dist_transaction relation.
|
||||||
|
* ----------------
|
||||||
|
*/
|
||||||
|
typedef FormData_pg_dist_transaction *Form_pg_dist_transaction;
|
||||||
|
|
||||||
|
|
||||||
|
/* ----------------
|
||||||
|
* compiler constants for pg_dist_transaction
|
||||||
|
* ----------------
|
||||||
|
*/
|
||||||
|
#define Natts_pg_dist_transaction 2
|
||||||
|
#define Anum_pg_dist_transaction_groupid 1
|
||||||
|
#define Anum_pg_dist_transaction_gid 2
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* PG_DIST_TRANSACTION_H */
|
|
@ -76,5 +76,6 @@ extern void LockJobResource(uint64 jobId, LOCKMODE lockmode);
|
||||||
extern void UnlockJobResource(uint64 jobId, LOCKMODE lockmode);
|
extern void UnlockJobResource(uint64 jobId, LOCKMODE lockmode);
|
||||||
|
|
||||||
extern void LockShards(List *shardIntervalList, LOCKMODE lockMode);
|
extern void LockShards(List *shardIntervalList, LOCKMODE lockMode);
|
||||||
|
extern void LockMetadataSnapshot(LOCKMODE lockMode);
|
||||||
|
|
||||||
#endif /* RESOURCE_LOCK_H */
|
#endif /* RESOURCE_LOCK_H */
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* transaction_recovery.h
|
||||||
|
* Type and function declarations used in recovering 2PC transactions.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2016, Citus Data, Inc.
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef TRANSACTION_RECOVERY_H
|
||||||
|
#define TRANSACTION_RECOVERY_H
|
||||||
|
|
||||||
|
|
||||||
|
#include "nodes/pg_list.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* Function declarations for transaction recovery */
|
||||||
|
extern void LogPreparedTransactions(List *connectionList);
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* TRANSACTION_RECOVERY_H */
|
|
@ -0,0 +1,42 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* worker_transaction.h
|
||||||
|
* Type and function declarations used in performing transactions across
|
||||||
|
* workers.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2016, Citus Data, Inc.
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef WORKER_TRANSACTION_H
|
||||||
|
#define WORKER_TRANSACTION_H
|
||||||
|
|
||||||
|
#include "distributed/worker_manager.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* TargetWorkerSet is used for determining the type of workers that a command
|
||||||
|
* is targeted to.
|
||||||
|
*/
|
||||||
|
typedef enum TargetWorkerSet
|
||||||
|
{
|
||||||
|
WORKERS_WITH_METADATA,
|
||||||
|
ALL_WORKERS
|
||||||
|
} TargetWorkerSet;
|
||||||
|
|
||||||
|
|
||||||
|
/* Function declarations for worker transactions */
|
||||||
|
extern List * GetWorkerTransactions(void);
|
||||||
|
extern void SendCommandToWorker(char *nodeName, int32 nodePort, char *command);
|
||||||
|
extern void SendCommandToWorkers(TargetWorkerSet targetWorkerSet, char *command);
|
||||||
|
extern void SendCommandToWorkersParams(TargetWorkerSet targetWorkerSet, char *command,
|
||||||
|
int parameterCount, const Oid *parameterTypes,
|
||||||
|
const char *const *parameterValues);
|
||||||
|
extern void SendCommandListToWorkerInSingleTransaction(char *nodeName, int32 nodePort,
|
||||||
|
char *nodeUser, List *commandList);
|
||||||
|
|
||||||
|
/* helper functions for worker transactions */
|
||||||
|
extern bool IsWorkerTransactionActive(void);
|
||||||
|
|
||||||
|
#endif /* WORKER_TRANSACTION_H */
|
|
@ -35,6 +35,7 @@ ALTER EXTENSION citus UPDATE TO '6.0-6';
|
||||||
ALTER EXTENSION citus UPDATE TO '6.0-7';
|
ALTER EXTENSION citus UPDATE TO '6.0-7';
|
||||||
ALTER EXTENSION citus UPDATE TO '6.0-8';
|
ALTER EXTENSION citus UPDATE TO '6.0-8';
|
||||||
ALTER EXTENSION citus UPDATE TO '6.0-9';
|
ALTER EXTENSION citus UPDATE TO '6.0-9';
|
||||||
|
ALTER EXTENSION citus UPDATE TO '6.0-10';
|
||||||
-- drop extension an re-create in newest version
|
-- drop extension an re-create in newest version
|
||||||
DROP EXTENSION citus;
|
DROP EXTENSION citus;
|
||||||
\c
|
\c
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1220000;
|
||||||
|
ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1220000;
|
||||||
|
-- Tests for prepared transaction recovery
|
||||||
|
-- Ensure pg_dist_transaction is empty for test
|
||||||
|
SELECT recover_prepared_transactions();
|
||||||
|
recover_prepared_transactions
|
||||||
|
-------------------------------
|
||||||
|
0
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT * FROM pg_dist_transaction;
|
||||||
|
groupid | gid
|
||||||
|
---------+-----
|
||||||
|
(0 rows)
|
||||||
|
|
||||||
|
-- Create some "fake" prepared transactions to recover
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
BEGIN;
|
||||||
|
CREATE TABLE should_abort (value int);
|
||||||
|
PREPARE TRANSACTION 'citus_0_should_abort';
|
||||||
|
BEGIN;
|
||||||
|
CREATE TABLE should_commit (value int);
|
||||||
|
PREPARE TRANSACTION 'citus_0_should_commit';
|
||||||
|
BEGIN;
|
||||||
|
CREATE TABLE should_be_sorted_into_middle (value int);
|
||||||
|
PREPARE TRANSACTION 'citus_0_should_be_sorted_into_middle';
|
||||||
|
\c - - - :master_port
|
||||||
|
-- Add "fake" pg_dist_transaction records and run recovery
|
||||||
|
INSERT INTO pg_dist_transaction VALUES (1, 'citus_0_should_commit');
|
||||||
|
INSERT INTO pg_dist_transaction VALUES (1, 'citus_0_should_be_forgotten');
|
||||||
|
SELECT recover_prepared_transactions();
|
||||||
|
NOTICE: recovered a prepared transaction on localhost:57637
|
||||||
|
CONTEXT: ROLLBACK PREPARED 'citus_0_should_abort'
|
||||||
|
NOTICE: recovered a prepared transaction on localhost:57637
|
||||||
|
CONTEXT: ROLLBACK PREPARED 'citus_0_should_be_sorted_into_middle'
|
||||||
|
NOTICE: recovered a prepared transaction on localhost:57637
|
||||||
|
CONTEXT: COMMIT PREPARED 'citus_0_should_commit'
|
||||||
|
recover_prepared_transactions
|
||||||
|
-------------------------------
|
||||||
|
3
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT count(*) FROM pg_dist_transaction;
|
||||||
|
count
|
||||||
|
-------
|
||||||
|
0
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Confirm that transactions were correctly rolled forward
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
SELECT count(*) FROM pg_tables WHERE tablename = 'should_abort';
|
||||||
|
count
|
||||||
|
-------
|
||||||
|
0
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT count(*) FROM pg_tables WHERE tablename = 'should_commit';
|
||||||
|
count
|
||||||
|
-------
|
||||||
|
1
|
||||||
|
(1 row)
|
||||||
|
|
|
@ -135,6 +135,7 @@ test: multi_repartition_udt
|
||||||
test: multi_repartitioned_subquery_udf
|
test: multi_repartitioned_subquery_udf
|
||||||
test: multi_modifying_xacts
|
test: multi_modifying_xacts
|
||||||
test: multi_metadata_snapshot
|
test: multi_metadata_snapshot
|
||||||
|
test: multi_transaction_recovery
|
||||||
|
|
||||||
# ---------
|
# ---------
|
||||||
# multi_copy creates hash and range-partitioned tables and performs COPY
|
# multi_copy creates hash and range-partitioned tables and performs COPY
|
||||||
|
|
|
@ -40,6 +40,7 @@ ALTER EXTENSION citus UPDATE TO '6.0-6';
|
||||||
ALTER EXTENSION citus UPDATE TO '6.0-7';
|
ALTER EXTENSION citus UPDATE TO '6.0-7';
|
||||||
ALTER EXTENSION citus UPDATE TO '6.0-8';
|
ALTER EXTENSION citus UPDATE TO '6.0-8';
|
||||||
ALTER EXTENSION citus UPDATE TO '6.0-9';
|
ALTER EXTENSION citus UPDATE TO '6.0-9';
|
||||||
|
ALTER EXTENSION citus UPDATE TO '6.0-10';
|
||||||
|
|
||||||
-- drop extension an re-create in newest version
|
-- drop extension an re-create in newest version
|
||||||
DROP EXTENSION citus;
|
DROP EXTENSION citus;
|
||||||
|
|
|
@ -0,0 +1,37 @@
|
||||||
|
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1220000;
|
||||||
|
ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1220000;
|
||||||
|
|
||||||
|
-- Tests for prepared transaction recovery
|
||||||
|
|
||||||
|
-- Ensure pg_dist_transaction is empty for test
|
||||||
|
SELECT recover_prepared_transactions();
|
||||||
|
|
||||||
|
SELECT * FROM pg_dist_transaction;
|
||||||
|
|
||||||
|
-- Create some "fake" prepared transactions to recover
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
|
||||||
|
BEGIN;
|
||||||
|
CREATE TABLE should_abort (value int);
|
||||||
|
PREPARE TRANSACTION 'citus_0_should_abort';
|
||||||
|
|
||||||
|
BEGIN;
|
||||||
|
CREATE TABLE should_commit (value int);
|
||||||
|
PREPARE TRANSACTION 'citus_0_should_commit';
|
||||||
|
|
||||||
|
BEGIN;
|
||||||
|
CREATE TABLE should_be_sorted_into_middle (value int);
|
||||||
|
PREPARE TRANSACTION 'citus_0_should_be_sorted_into_middle';
|
||||||
|
|
||||||
|
\c - - - :master_port
|
||||||
|
-- Add "fake" pg_dist_transaction records and run recovery
|
||||||
|
INSERT INTO pg_dist_transaction VALUES (1, 'citus_0_should_commit');
|
||||||
|
INSERT INTO pg_dist_transaction VALUES (1, 'citus_0_should_be_forgotten');
|
||||||
|
|
||||||
|
SELECT recover_prepared_transactions();
|
||||||
|
SELECT count(*) FROM pg_dist_transaction;
|
||||||
|
|
||||||
|
-- Confirm that transactions were correctly rolled forward
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
SELECT count(*) FROM pg_tables WHERE tablename = 'should_abort';
|
||||||
|
SELECT count(*) FROM pg_tables WHERE tablename = 'should_commit';
|
Loading…
Reference in New Issue