/*-------------------------------------------------------------------------
 *
 * transaction_management.c
 *
 *   Transaction management for Citus.  Most of the work is delegated to other
 *   subsystems, this files, and especially CoordinatedTransactionCallback,
 *   coordinates the work between them.
 *
 * Copyright (c) Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "libpq-fe.h"

#include "miscadmin.h"

#include "access/twophase.h"
#include "access/xact.h"
#include "distributed/backend_data.h"
#include "distributed/connection_management.h"
#include "distributed/distributed_planner.h"
#include "distributed/hash_helpers.h"
#include "distributed/intermediate_results.h"
#include "distributed/local_executor.h"
#include "distributed/multi_executor.h"
#include "distributed/transaction_management.h"
#include "distributed/placement_connection.h"
#include "distributed/subplan_execution.h"
#include "distributed/version_compat.h"
#include "utils/hsearch.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "storage/fd.h"


CoordinatedTransactionState CurrentCoordinatedTransactionState = COORD_TRANS_NONE;

/* GUC, the commit protocol to use for commands affecting more than one connection */
int MultiShardCommitProtocol = COMMIT_PROTOCOL_2PC;
int SingleShardCommitProtocol = COMMIT_PROTOCOL_2PC;
int SavedMultiShardCommitProtocol = COMMIT_PROTOCOL_BARE;

/*
 * GUC that determines whether a SELECT in a transaction block should also run in
 * a transaction block on the worker even if no writes have occurred yet.
 */
bool SelectOpensTransactionBlock = true;

/* controls use of locks to enforce safe commutativity */
bool AllModificationsCommutative = false;

/* we've deprecated this flag, keeping here for some time not to break existing users */
bool EnableDeadlockPrevention = true;

/* number of nested stored procedure call levels we are currently in */
int StoredProcedureLevel = 0;

/* number of nested DO block levels we are currently in */
int DoBlockLevel = 0;

/* state needed to keep track of operations used during a transaction */
XactModificationType XactModificationLevel = XACT_MODIFICATION_NONE;

/* list of connections that are part of the current coordinated transaction */
dlist_head InProgressTransactions = DLIST_STATIC_INIT(InProgressTransactions);

/*
 * activeSetStmts keeps track of SET LOCAL statements executed within the current
 * subxact and will be set to NULL when pushing into new subxact or ending top xact.
 */
StringInfo activeSetStmts;

/*
 * Though a list, we treat this as a stack, pushing on subxact contexts whenever
 * e.g. a SAVEPOINT is executed (though this is actually performed by providing
 * PostgreSQL with a sub-xact callback). At present, the context of a subxact
 * includes a subxact identifier as well as any SET LOCAL statements propagated
 * to workers during the sub-transaction.
 */
static List *activeSubXactContexts = NIL;

/* some pre-allocated memory so we don't need to call malloc() during callbacks */
MemoryContext CommitContext = NULL;

/*
 * Should this coordinated transaction use 2PC? Set by
 * CoordinatedTransactionUse2PC(), e.g. if DDL was issued and
 * MultiShardCommitProtocol was set to 2PC.
 */
bool CoordinatedTransactionUses2PC = false;

/* if disabled, distributed statements in a function may run as separate transactions */
bool FunctionOpensTransactionBlock = true;


/* transaction management functions */
static void BeginCoordinatedTransaction(void);
static void CoordinatedTransactionCallback(XactEvent event, void *arg);
static void CoordinatedSubTransactionCallback(SubXactEvent event, SubTransactionId subId,
											  SubTransactionId parentSubid, void *arg);

/* remaining functions */
static void ResetShardPlacementTransactionState(void);
static void AdjustMaxPreparedTransactions(void);
static void PushSubXact(SubTransactionId subId);
static void PopSubXact(SubTransactionId subId);
static void SwallowErrors(void (*func)());
static bool MaybeExecutingUDF(void);


/*
 * BeginOrContinueCoordinatedTransaction starts a coordinated transaction,
 * unless one already is in progress.
 */
void
BeginOrContinueCoordinatedTransaction(void)
{
	if (CurrentCoordinatedTransactionState == COORD_TRANS_STARTED)
	{
		return;
	}

	BeginCoordinatedTransaction();
}


/*
 * InCoordinatedTransaction returns whether a coordinated transaction has been
 * started.
 */
bool
InCoordinatedTransaction(void)
{
	return CurrentCoordinatedTransactionState != COORD_TRANS_NONE &&
		   CurrentCoordinatedTransactionState != COORD_TRANS_IDLE;
}


/*
 * CoordinatedTransactionUse2PC() signals that the current coordinated
 * transaction should use 2PC to commit.
 */
void
CoordinatedTransactionUse2PC(void)
{
	Assert(InCoordinatedTransaction());

	CoordinatedTransactionUses2PC = true;
}


void
InitializeTransactionManagement(void)
{
	/* hook into transaction machinery */
	RegisterXactCallback(CoordinatedTransactionCallback, NULL);
	RegisterSubXactCallback(CoordinatedSubTransactionCallback, NULL);

	AdjustMaxPreparedTransactions();

	/* set aside 8kb of memory for use in CoordinatedTransactionCallback */
	CommitContext = AllocSetContextCreateExtended(TopMemoryContext,
												  "CommitContext",
												  8 * 1024,
												  8 * 1024,
												  8 * 1024);
}


/*
 * BeginCoordinatedTransaction begins a coordinated transaction. No
 * pre-existing coordinated transaction may be in progress./
 */
static void
BeginCoordinatedTransaction(void)
{
	if (CurrentCoordinatedTransactionState != COORD_TRANS_NONE &&
		CurrentCoordinatedTransactionState != COORD_TRANS_IDLE)
	{
		ereport(ERROR, (errmsg("starting transaction in wrong state")));
	}

	CurrentCoordinatedTransactionState = COORD_TRANS_STARTED;

	AssignDistributedTransactionId();
}


/*
 * Transaction management callback, handling coordinated transaction, and
 * transaction independent connection management.
 *
 * NB: There should only ever be a single transaction callback in citus, the
 * ordering between the callbacks and thee actions within those callbacks
 * otherwise becomes too undeterministic / hard to reason about.
 */
static void
CoordinatedTransactionCallback(XactEvent event, void *arg)
{
	switch (event)
	{
		case XACT_EVENT_COMMIT:
		{
			/*
			 * ERRORs thrown during XACT_EVENT_COMMIT will cause postgres to abort, at
			 * this point enough work has been done that it's not possible to rollback.
			 *
			 * One possible source of errors is memory allocation failures. To minimize
			 * the chance of those happening we've pre-allocated some memory in the
			 * CommitContext, it has 8kb of memory that we're allowed to use.
			 *
			 * We only do this in the COMMIT callback because:
			 * - Errors thrown in other callbacks (such as PRE_COMMIT) won't cause
			 *   crashes, they will simply cause the ABORT handler to be called.
			 * - The exception is ABORT, errors thrown there could also cause crashes, but
			 *   postgres already creates a TransactionAbortContext which performs this
			 *   trick, so there's no need for us to do it again.
			 */
			MemoryContext previousContext = CurrentMemoryContext;
			MemoryContextSwitchTo(CommitContext);

			/*
			 * Call other parts of citus that need to integrate into
			 * transaction management. Do so before doing other work, so the
			 * callbacks still can perform work if needed.
			 */
			ResetShardPlacementTransactionState();

			if (CurrentCoordinatedTransactionState == COORD_TRANS_PREPARED)
			{
				/* handles both already prepared and open transactions */
				CoordinatedRemoteTransactionsCommit();
			}

			/* close connections etc. */
			if (CurrentCoordinatedTransactionState != COORD_TRANS_NONE)
			{
				ResetPlacementConnectionManagement();
				AfterXactConnectionHandling(true);
			}

			CurrentCoordinatedTransactionState = COORD_TRANS_NONE;
			XactModificationLevel = XACT_MODIFICATION_NONE;
			LocalExecutionHappened = false;
			dlist_init(&InProgressTransactions);
			activeSetStmts = NULL;
			CoordinatedTransactionUses2PC = false;

			UnSetDistributedTransactionId();

			/* empty the CommitContext to ensure we're not leaking memory */
			MemoryContextSwitchTo(previousContext);
			MemoryContextReset(CommitContext);
			break;
		}

		case XACT_EVENT_ABORT:
		{
			/*
			 * FIXME: Add warning for the COORD_TRANS_COMMITTED case. That
			 * can be reached if this backend fails after the
			 * XACT_EVENT_PRE_COMMIT state.
			 */

			/*
			 * Call other parts of citus that need to integrate into
			 * transaction management. Do so before doing other work, so the
			 * callbacks still can perform work if needed.
			 */
			{
				/*
				 * On Windows it's not possible to delete a file before you've closed all
				 * handles to it (rmdir will return success but not take effect). Since
				 * we're in an ABORT handler it's very likely that not all handles have
				 * been closed; force them closed here before running
				 * RemoveIntermediateResultsDirectory.
				 */
				AtEOXact_Files(false);
				SwallowErrors(RemoveIntermediateResultsDirectory);
			}
			ResetShardPlacementTransactionState();

			/* handles both already prepared and open transactions */
			if (CurrentCoordinatedTransactionState > COORD_TRANS_IDLE)
			{
				CoordinatedRemoteTransactionsAbort();
			}

			/* close connections etc. */
			if (CurrentCoordinatedTransactionState != COORD_TRANS_NONE)
			{
				ResetPlacementConnectionManagement();
				AfterXactConnectionHandling(false);
			}

			CurrentCoordinatedTransactionState = COORD_TRANS_NONE;
			XactModificationLevel = XACT_MODIFICATION_NONE;
			LocalExecutionHappened = false;
			dlist_init(&InProgressTransactions);
			activeSetStmts = NULL;
			CoordinatedTransactionUses2PC = false;
			ExecutorLevel = 0;

			/*
			 * Getting here without PlannerLevel 0 is a bug, however it is such a big
			 * problem that will persist between reuse of the backend we still assign 0 in
			 * production deploys, but during development and tests we want to crash.
			 */
			Assert(PlannerLevel == 0);
			PlannerLevel = 0;

			/*
			 * We should reset SubPlanLevel in case a transaction is aborted,
			 * otherwise this variable would stay +ve if the transaction is
			 * aborted in the middle of a CTE/complex subquery execution
			 * which would cause the subsequent queries to error out in
			 * case the copy size is greater than
			 * citus.max_intermediate_result_size
			 */
			SubPlanLevel = 0;
			UnSetDistributedTransactionId();
			UnsetCitusNoticeLevel();
			break;
		}

		case XACT_EVENT_PARALLEL_COMMIT:
		case XACT_EVENT_PARALLEL_ABORT:
		{
			break;
		}

		case XACT_EVENT_PREPARE:
		{
			/*
			 * This callback is only relevant for worker queries since
			 * distributed queries cannot be executed with 2PC, see
			 * XACT_EVENT_PRE_PREPARE.
			 *
			 * We should remove the intermediate results before unsetting the
			 * distributed transaction id. That is necessary, otherwise Citus
			 * would try to remove a non-existing folder and leak some of the
			 * existing folders that are associated with distributed transaction
			 * ids on the worker nodes.
			 */
			RemoveIntermediateResultsDirectory();

			UnSetDistributedTransactionId();
			break;
		}

		case XACT_EVENT_PRE_COMMIT:
		{
			/*
			 * If the distributed query involves 2PC, we already removed
			 * the intermediate result directory on XACT_EVENT_PREPARE. However,
			 * if not, we should remove it here on the COMMIT. Since
			 * RemoveIntermediateResultsDirectory() is idempotent, we're safe
			 * to call it here again even if the transaction involves 2PC.
			 */
			RemoveIntermediateResultsDirectory();

			/* nothing further to do if there's no managed remote xacts */
			if (CurrentCoordinatedTransactionState == COORD_TRANS_NONE)
			{
				break;
			}

			/*
			 * TODO: It'd probably be a good idea to force constraints and
			 * such to 'immediate' here. Deferred triggers might try to send
			 * stuff to the remote side, which'd not be good.  Doing so
			 * remotely would also catch a class of errors where committing
			 * fails, which can lead to divergence when not using 2PC.
			 */

			/*
			 * Check whether the coordinated transaction is in a state we want
			 * to persist, or whether we want to error out.  This handles the
			 * case where iteratively executed commands marked all placements
			 * as invalid.
			 */
			MarkFailedShardPlacements();

			if (CoordinatedTransactionUses2PC)
			{
				CoordinatedRemoteTransactionsPrepare();
				CurrentCoordinatedTransactionState = COORD_TRANS_PREPARED;

				/*
				 * Make sure we did not have any failures on connections marked as
				 * critical before committing.
				 */
				CheckRemoteTransactionsHealth();
			}
			else
			{
				CheckRemoteTransactionsHealth();

				/*
				 * Have to commit remote transactions in PRE_COMMIT, to allow
				 * us to mark failed placements as invalid.  Better don't use
				 * this for anything important (i.e. DDL/metadata).
				 */
				CoordinatedRemoteTransactionsCommit();
				CurrentCoordinatedTransactionState = COORD_TRANS_COMMITTED;
			}

			/*
			 * Check again whether shards/placement successfully
			 * committed. This handles failure at COMMIT/PREPARE time.
			 */
			PostCommitMarkFailedShardPlacements(CoordinatedTransactionUses2PC);
			break;
		}

		case XACT_EVENT_PARALLEL_PRE_COMMIT:
		case XACT_EVENT_PRE_PREPARE:
		{
			if (CurrentCoordinatedTransactionState > COORD_TRANS_NONE)
			{
				ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
								errmsg("cannot use 2PC in transactions involving "
									   "multiple servers")));
			}
			break;
		}
	}
}


/*
 * ResetShardPlacementTransactionState performs cleanup after the end of a
 * transaction.
 */
static void
ResetShardPlacementTransactionState(void)
{
	if (MultiShardCommitProtocol == COMMIT_PROTOCOL_BARE)
	{
		MultiShardCommitProtocol = SavedMultiShardCommitProtocol;
		SavedMultiShardCommitProtocol = COMMIT_PROTOCOL_BARE;
	}
}


/*
 * Subtransaction callback - currently only used to remember whether a
 * savepoint has been rolled back, as we don't support that.
 */
static void
CoordinatedSubTransactionCallback(SubXactEvent event, SubTransactionId subId,
								  SubTransactionId parentSubid, void *arg)
{
	switch (event)
	{
		/*
		 * Our subtransaction stack should be consistent with postgres' internal
		 * transaction stack. In case of subxact begin, postgres calls our
		 * callback after it has pushed the transaction into stack, so we have to
		 * do the same even if worker commands fail, so we PushSubXact() first.
		 * In case of subxact commit, callback is called before pushing subxact to
		 * the postgres transaction stack, so we call PopSubXact() after making sure
		 * worker commands didn't fail. Otherwise, Postgres would roll back that
		 * would cause us to call PopSubXact again.
		 */
		case SUBXACT_EVENT_START_SUB:
		{
			PushSubXact(subId);
			if (InCoordinatedTransaction())
			{
				CoordinatedRemoteTransactionsSavepointBegin(subId);
			}
			break;
		}

		case SUBXACT_EVENT_COMMIT_SUB:
		{
			if (InCoordinatedTransaction())
			{
				CoordinatedRemoteTransactionsSavepointRelease(subId);
			}
			PopSubXact(subId);
			break;
		}

		case SUBXACT_EVENT_ABORT_SUB:
		{
			if (InCoordinatedTransaction())
			{
				CoordinatedRemoteTransactionsSavepointRollback(subId);
			}
			PopSubXact(subId);

			UnsetCitusNoticeLevel();
			break;
		}

		case SUBXACT_EVENT_PRE_COMMIT_SUB:
		{
			/* nothing to do */
			break;
		}
	}
}


/*
 * AdjustMaxPreparedTransactions configures the number of available prepared
 * transaction slots at startup.
 */
static void
AdjustMaxPreparedTransactions(void)
{
	/*
	 * As Citus uses 2PC internally, there always should be some available. As
	 * the default is 0, we increase it to something appropriate
	 * (connections * 2 currently).  If the user explicitly configured 2PC, we
	 * leave the configuration alone - there might have been intent behind the
	 * decision.
	 */
	if (max_prepared_xacts == 0)
	{
		char newvalue[12];

		snprintf(newvalue, sizeof(newvalue), "%d", MaxConnections * 2);

		SetConfigOption("max_prepared_transactions", newvalue, PGC_POSTMASTER,
						PGC_S_OVERRIDE);

		ereport(LOG, (errmsg("number of prepared transactions has not been "
							 "configured, overriding"),
					  errdetail("max_prepared_transactions is now set to %s",
								newvalue)));
	}
}


/* PushSubXact pushes subId to the stack of active sub-transactions. */
static void
PushSubXact(SubTransactionId subId)
{
	MemoryContext old_context = MemoryContextSwitchTo(CurTransactionContext);

	/* save provided subId as well as propagated SET LOCAL stmts */
	SubXactContext *state = palloc(sizeof(SubXactContext));
	state->subId = subId;
	state->setLocalCmds = activeSetStmts;

	/* append to list and reset active set stmts for upcoming sub-xact */
	activeSubXactContexts = lcons(state, activeSubXactContexts);
	activeSetStmts = makeStringInfo();

	MemoryContextSwitchTo(old_context);
}


/* PopSubXact pops subId from the stack of active sub-transactions. */
static void
PopSubXact(SubTransactionId subId)
{
	MemoryContext old_context = MemoryContextSwitchTo(CurTransactionContext);
	SubXactContext *state = linitial(activeSubXactContexts);

	/*
	 * the previous activeSetStmts is already invalid because it's in the now-
	 * aborted subxact (what we're popping), so no need to free before assign-
	 * ing with the setLocalCmds of the popped context
	 */
	Assert(state->subId == subId);
	activeSetStmts = state->setLocalCmds;
	activeSubXactContexts = list_delete_first(activeSubXactContexts);

	MemoryContextSwitchTo(old_context);
}


/* ActiveSubXacts returns list of active sub-transactions in temporal order. */
List *
ActiveSubXacts(void)
{
	ListCell *subXactCell = NULL;
	List *activeSubXactsReversed = NIL;

	/*
	 * activeSubXactContexts is in reversed temporal order, so we reverse it to get it
	 * in temporal order.
	 */
	foreach(subXactCell, activeSubXactContexts)
	{
		SubXactContext *state = lfirst(subXactCell);
		activeSubXactsReversed = lcons_int(state->subId, activeSubXactsReversed);
	}

	return activeSubXactsReversed;
}


/* ActiveSubXactContexts returns the list of active sub-xact context in temporal order. */
List *
ActiveSubXactContexts(void)
{
	ListCell *subXactCell = NULL;
	List *reversedSubXactStates = NIL;

	/*
	 * activeSubXactContexts is in reversed temporal order, so we reverse it to get it
	 * in temporal order.
	 */
	foreach(subXactCell, activeSubXactContexts)
	{
		SubXactContext *state = lfirst(subXactCell);
		reversedSubXactStates = lcons(state, reversedSubXactStates);
	}

	return reversedSubXactStates;
}


/*
 * If an ERROR is thrown while processing a transaction the ABORT handler is called.
 * ERRORS thrown during ABORT are not treated any differently, the ABORT handler is also
 * called during processing of those. If an ERROR was raised the first time through it's
 * unlikely that the second try will succeed; more likely that an ERROR will be thrown
 * again. This loop continues until Postgres notices and PANICs, complaining about a stack
 * overflow.
 *
 * Instead of looping and crashing, SwallowErrors lets us attempt to continue running the
 * ABORT logic. This wouldn't be safe in most other parts of the codebase, in
 * approximately none of the places where we emit ERROR do we first clean up after
 * ourselves! It's fine inside the ABORT handler though; Postgres is going to clean
 * everything up before control passes back to us.
 */
static void
SwallowErrors(void (*func)())
{
	MemoryContext savedContext = CurrentMemoryContext;

	PG_TRY();
	{
		func();
	}
	PG_CATCH();
	{
		ErrorData *edata = CopyErrorData();

		/* don't try to intercept PANIC or FATAL, let those breeze past us */
		if (edata->elevel != ERROR)
		{
			PG_RE_THROW();
		}

		/* turn the ERROR into a WARNING and emit it */
		edata->elevel = WARNING;
		ThrowErrorData(edata);

		/* leave the error handling system */
		FlushErrorState();
		MemoryContextSwitchTo(savedContext);
	}
	PG_END_TRY();
}


/*
 * IsMultiStatementTransaction determines whether the current statement is
 * part of a bigger multi-statement transaction. This is the case when the
 * statement is wrapped in a transaction block (comes after BEGIN), or it
 * is called from a stored procedure or function.
 */
bool
IsMultiStatementTransaction(void)
{
	if (IsTransactionBlock())
	{
		/* in a BEGIN...END block */
		return true;
	}
	else if (DoBlockLevel > 0)
	{
		/* in (a transaction within) a do block */
		return true;
	}
	else if (StoredProcedureLevel > 0)
	{
		/* in (a transaction within) a stored procedure */
		return true;
	}
	else if (MaybeExecutingUDF() && FunctionOpensTransactionBlock)
	{
		/* in a language-handler function call, open a transaction if configured to do so */
		return true;
	}
	else
	{
		return false;
	}
}


/*
 * MaybeExecutingUDF returns true if we are possibly executing a function call.
 * We use nested level of executor to check this, so this can return true for
 * CTEs, etc. which also start nested executors.
 *
 * If the planner is being called from the executor, then we may also be in
 * a UDF.
 */
static bool
MaybeExecutingUDF(void)
{
	return ExecutorLevel > 1 || (ExecutorLevel == 1 && PlannerLevel > 0);
}