mirror of https://github.com/citusdata/citus.git
Merge 0d32050145 into 79cabe7eca
commit 21204ea91b
@@ -31,11 +31,14 @@
 #define CREATE_RESTORE_POINT_COMMAND "SELECT pg_catalog.pg_create_restore_point($1::text)"
+#define BLOCK_TRANSACTIONS_COMMAND \
+	"LOCK TABLE pg_catalog.pg_dist_transaction IN EXCLUSIVE MODE"
 
 
 /* local functions forward declarations */
 static List * OpenConnectionsToAllWorkerNodes(LOCKMODE lockMode);
 static void BlockDistributedTransactions(void);
 static void CreateRemoteRestorePoints(char *restoreName, List *connectionList);
+static void BlockDistributedTransactionsOnAllMetadataNodes(List *connectionList);
 
 
 /* exports for SQL callable functions */
 PG_FUNCTION_INFO_V1(citus_create_restore_point);
@@ -43,10 +46,28 @@ PG_FUNCTION_INFO_V1(citus_create_restore_point);
 
 
 /*
- * citus_create_restore_point blocks writes to distributed tables and then
- * runs pg_create_restore_point on all nodes. This creates a consistent
- * restore point under the assumption that there are no other writers
- * than the coordinator.
+ * citus_create_restore_point creates a cluster-consistent restore point
+ * across all nodes in the Citus cluster.
+ *
+ * In coordinator-only mode, this function blocks new distributed writes
+ * at the coordinator and creates restore points on all worker nodes.
+ *
+ * In MX mode (multi-writer), this function blocks the 2PC commit decision
+ * point on all MX-enabled nodes by acquiring ExclusiveLock on the
+ * pg_dist_transaction catalog table across the cluster. This prevents new
+ * distributed transactions from recording commit decisions, ensuring that
+ * all restore points represent the same consistent cluster state.
+ *
+ * The function returns the LSN of the restore point on the coordinator,
+ * maintaining backward compatibility with the original implementation.
+ *
+ * Key insight: We do NOT need to drain in-flight transactions. The commit
+ * decision in Citus 2PC happens when LogTransactionRecord() writes to
+ * pg_dist_transaction, which occurs BEFORE the writer's local commit.
+ * By blocking writes to pg_dist_transaction, we prevent commit decisions
+ * from being made. Transactions that have already recorded their commit
+ * decision will complete normally, while those that haven't will
+ * be blocked. This creates a clean cut point for consistency.
  */
 Datum
 citus_create_restore_point(PG_FUNCTION_ARGS)
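To make the ordering argument in the comment above concrete, here is a minimal standalone sketch. This is a toy model, not the actual Citus code path: `pg_dist_transaction_locked` stands in for the ExclusiveLock on the catalog table, and `record_commit_decision()` stands in for `LogTransactionRecord()`. It only illustrates why transactions that recorded their decision before the lock complete, while later ones block.

```c
#include <stdbool.h>
#include <stdio.h>

/* stands in for the ExclusiveLock taken by citus_create_restore_point */
static bool pg_dist_transaction_locked = false;

/* stands in for LogTransactionRecord(): the 2PC commit decision */
static bool
record_commit_decision(int transactionId)
{
	if (pg_dist_transaction_locked)
	{
		/* the restore point holds the lock: the decision blocks here */
		return false;
	}
	printf("txn %d: commit decision recorded\n", transactionId);
	return true;
}

static void
commit_distributed_transaction(int transactionId)
{
	/* the decision is recorded BEFORE the writer's local commit ... */
	if (!record_commit_decision(transactionId))
	{
		printf("txn %d: blocked before local commit\n", transactionId);
		return;
	}

	/* ... so any transaction that got past the decision completes normally */
	printf("txn %d: local commit\n", transactionId);
}

int
main(void)
{
	commit_distributed_transaction(1);   /* before the cut point: completes */
	pg_dist_transaction_locked = true;   /* restore point acquires the lock */
	commit_distributed_transaction(2);   /* after the cut point: blocked */
	return 0;
}
```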
@@ -88,22 +109,56 @@ citus_create_restore_point(PG_FUNCTION_ARGS)
 	 * ShareLock prevents new nodes being added, rendering connectionList incomplete
 	 */
 	List *connectionList = OpenConnectionsToAllWorkerNodes(ShareLock);
+	XLogRecPtr localRestorePoint = InvalidXLogRecPtr;
 
-	/*
-	 * Send a BEGIN to bust through pgbouncer. We won't actually commit since
-	 * that takes time. Instead we just close the connections and roll back,
-	 * which doesn't undo pg_create_restore_point.
-	 */
-	RemoteTransactionListBegin(connectionList);
+	PG_TRY();
+	{
+		/*
+		 * Send a BEGIN to bust through pgbouncer. We won't actually commit since
+		 * that takes time. Instead we just close the connections and roll back,
+		 * which doesn't undo pg_create_restore_point.
+		 */
+		RemoteTransactionListBegin(connectionList);
 
-	/* DANGER: finish as quickly as possible after this */
-	BlockDistributedTransactions();
+		/* DANGER: finish as quickly as possible after this */
+		BlockDistributedTransactions();
+		BlockDistributedTransactionsOnAllMetadataNodes(connectionList);
 
-	/* do local restore point first to bail out early if something goes wrong */
-	XLogRecPtr localRestorePoint = XLogRestorePoint(restoreNameString);
+		/* do local restore point first to bail out early if something goes wrong */
+		localRestorePoint = XLogRestorePoint(restoreNameString);
 
-	/* run pg_create_restore_point on all nodes */
-	CreateRemoteRestorePoints(restoreNameString, connectionList);
+		/* run pg_create_restore_point on all nodes */
+		CreateRemoteRestorePoints(restoreNameString, connectionList);
+
+		/*
+		 * Close connections to all nodes; all locks get released as part of
+		 * the transaction rollback.
+		 */
+		MultiConnection *conn = NULL;
+		foreach_declared_ptr(conn, connectionList)
+		{
+			ForgetResults(conn);
+			CloseConnection(conn);
+		}
+		connectionList = NIL;
+	}
+	PG_CATCH();
+	{
+		/*
+		 * On error, ensure we clean up connections and release locks.
+		 * Rolling back the metadata node transactions releases the
+		 * ExclusiveLocks on pg_dist_transaction cluster-wide.
+		 */
+		MultiConnection *conn = NULL;
+		foreach_declared_ptr(conn, connectionList)
+		{
+			ForgetResults(conn);
+			CloseConnection(conn);
+		}
+		connectionList = NIL;
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
 
 	PG_RETURN_LSN(localRestorePoint);
 }
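The new control flow relies on PostgreSQL's PG_TRY()/PG_CATCH()/PG_END_TRY() macros so that connections are closed and locks released on both the success and error paths. A minimal sketch of the idiom, assuming a PostgreSQL extension build (`postgres.h` provides the macros); `do_remote_work()` and `cleanup_connections()` are hypothetical stand-ins:

```c
#include "postgres.h"

extern void do_remote_work(void);
extern void cleanup_connections(void);

void
run_with_cleanup(void)
{
	PG_TRY();
	{
		do_remote_work();

		/* success path: release resources before returning */
		cleanup_connections();
	}
	PG_CATCH();
	{
		/* error path: release the same resources, then propagate the error */
		cleanup_connections();
		PG_RE_THROW();
	}
	PG_END_TRY();
}
```

Duplicating the cleanup in both arms (rather than relying on transaction abort callbacks) keeps the remote locks held for the shortest possible time on error.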
@@ -152,6 +207,90 @@ BlockDistributedTransactions(void)
 }
 
 
+/*
+ * BlockDistributedTransactionsOnAllMetadataNodes blocks distributed transactions
+ * on all metadata nodes by executing LOCK TABLE remotely.
+ *
+ * This is the MX-mode equivalent of BlockDistributedTransactions(), extended
+ * to all nodes capable of initiating distributed transactions. We must hold
+ * these locks across the cluster to prevent commit decisions from being made
+ * on any node.
+ *
+ * The function expects that connections are already in a transaction block
+ * (BEGIN has been sent). The locks will be held until the transaction is
+ * rolled back or committed.
+ */
+static void
+BlockDistributedTransactionsOnAllMetadataNodes(List *connectionList)
+{
+	/*
+	 * Send LOCK TABLE commands to all metadata nodes in parallel. We use
+	 * standard SQL LOCK TABLE syntax to acquire ExclusiveLock on catalog
+	 * tables, mirroring what BlockDistributedTransactions() does on the
+	 * coordinator via LockRelationOid().
+	 *
+	 * BLOCK_TRANSACTIONS_COMMAND acquires ExclusiveLock on
+	 * pg_dist_transaction, which blocks 2PC commit decisions.
+	 *
+	 * Note: Unlike the local coordinator lock, which also locks pg_dist_node
+	 * and pg_dist_partition, we only lock pg_dist_transaction on remote nodes
+	 * because DDL and node management operations are coordinator-only even in
+	 * MX mode. This is sufficient to block distributed writes while allowing
+	 * the restore point operation to complete quickly.
+	 *
+	 * These locks naturally serialize concurrent restore point operations
+	 * cluster-wide, so no additional advisory lock is needed.
+	 */
+
+	/* build list of remote metadata node connections */
+	List *metadataConnectionList = NIL;
+	MultiConnection *connection = NULL;
+	foreach_declared_ptr(connection, connectionList)
+	{
+		WorkerNode *workerNode = FindWorkerNode(connection->hostname, connection->port);
+		bool isRemoteMetadataNode = workerNode != NULL &&
+									NodeIsPrimaryAndRemote(workerNode);
+
+		if (isRemoteMetadataNode)
+		{
+			metadataConnectionList = lappend(metadataConnectionList, connection);
+		}
+	}
+
+	/* send lock commands in parallel to all remote metadata nodes */
+	foreach_declared_ptr(connection, metadataConnectionList)
+	{
+		/*
+		 * We could use ExecuteCriticalRemoteCommand instead, but it would
+		 * not allow us to execute the commands in parallel. So for the sake
+		 * of performance, we use SendRemoteCommand to send the lock commands
+		 * in parallel to all metadata nodes, and later wait for all lock
+		 * acquisitions to complete.
+		 */
+		int querySent = SendRemoteCommand(connection, BLOCK_TRANSACTIONS_COMMAND);
+		if (querySent == 0)
+		{
+			ReportConnectionError(connection, ERROR);
+		}
+	}
+
+	/*
+	 * Wait for all lock acquisitions to complete. If any node fails to
+	 * acquire the lock (e.g., due to a conflicting lock), this errors out.
+	 */
+	foreach_declared_ptr(connection, metadataConnectionList)
+	{
+		PGresult *result = GetRemoteCommandResult(connection, true);
+		if (!IsResponseOK(result))
+		{
+			ReportResultError(connection, result, ERROR);
+		}
+
+		PQclear(result);
+		ForgetResults(connection);
+	}
+}
+
+
 /*
  * CreateRemoteRestorePoints creates a restore point via each of the
  * connections in the list in parallel.
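SendRemoteCommand() and GetRemoteCommandResult() are Citus wrappers around libpq's asynchronous API. A plain-libpq sketch of the same two-phase send-then-wait pattern, assuming the connections are already open and inside a transaction block (so the acquired locks are held after the function returns); `lock_all_nodes` and `LOCK_COMMAND` are illustrative names:

```c
#include <stdio.h>
#include <libpq-fe.h>

#define LOCK_COMMAND "LOCK TABLE pg_catalog.pg_dist_transaction IN EXCLUSIVE MODE"

static int
lock_all_nodes(PGconn **conns, int connCount)
{
	/* phase 1: dispatch the lock command to every node without waiting */
	for (int i = 0; i < connCount; i++)
	{
		if (PQsendQuery(conns[i], LOCK_COMMAND) == 0)
		{
			fprintf(stderr, "send failed: %s", PQerrorMessage(conns[i]));
			return -1;
		}
	}

	/* phase 2: wait for every lock acquisition to complete */
	for (int i = 0; i < connCount; i++)
	{
		PGresult *result = PQgetResult(conns[i]);
		if (result == NULL || PQresultStatus(result) != PGRES_COMMAND_OK)
		{
			fprintf(stderr, "lock failed: %s", PQerrorMessage(conns[i]));
			PQclear(result);
			return -1;
		}
		PQclear(result);

		/* drain the NULL that terminates this command's result stream */
		while ((result = PQgetResult(conns[i])) != NULL)
		{
			PQclear(result);
		}
	}
	return 0;
}
```

Splitting dispatch from collection means the total wait is bounded by the slowest node rather than the sum of all round trips, which matters here because writes are blocked for the whole duration.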
@@ -186,6 +325,5 @@ CreateRemoteRestorePoints(char *restoreName, List *connectionList)
 		PQclear(result);
 
 		ForgetResults(connection);
-		CloseConnection(connection);
 	}
 }
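For reference, the function is invoked like any SQL-callable UDF and returns the coordinator's restore point LSN. A usage sketch from a libpq client against the coordinator; the conninfo string and the restore point name 'before-upgrade' are placeholders:

```c
#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
	PGconn *conn = PQconnectdb("host=coordinator dbname=postgres");
	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
		PQfinish(conn);
		return 1;
	}

	/* create a cluster-wide restore point and print the coordinator LSN */
	PGresult *result = PQexec(conn,
							  "SELECT citus_create_restore_point('before-upgrade')");
	if (PQresultStatus(result) == PGRES_TUPLES_OK)
	{
		printf("restore point LSN: %s\n", PQgetvalue(result, 0, 0));
	}
	else
	{
		fprintf(stderr, "failed: %s", PQerrorMessage(conn));
	}

	PQclear(result);
	PQfinish(conn);
	return 0;
}
```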