citus/src/backend/distributed/executor/insert_select_executor.c

/*-------------------------------------------------------------------------
 *
 * insert_select_executor.c
 *
 * Executor logic for INSERT..SELECT.
 *
 * Copyright (c) Citus Data, Inc.
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "miscadmin.h"

#include "executor/executor.h"
#include "nodes/execnodes.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/parsenodes.h"
#include "nodes/plannodes.h"
#include "parser/parse_coerce.h"
#include "parser/parse_relation.h"
#include "parser/parsetree.h"
#include "tcop/pquery.h"
#include "tcop/tcopprot.h"
#include "utils/lsyscache.h"
#include "utils/portal.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

#include "distributed/adaptive_executor.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/commands/multi_copy.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/distributed_execution_locks.h"
#include "distributed/distributed_planner.h"
#include "distributed/insert_select_executor.h"
#include "distributed/insert_select_planner.h"
#include "distributed/intermediate_results.h"
#include "distributed/listutils.h"
#include "distributed/local_executor.h"
#include "distributed/merge_planner.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_router_planner.h"
#include "distributed/recursive_planning.h"
#include "distributed/relation_access_tracking.h"
#include "distributed/repartition_executor.h"
#include "distributed/resource_lock.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/subplan_execution.h"
#include "distributed/transaction_management.h"
#include "distributed/version_compat.h"

/* Config variables managed via guc.c */
bool EnableRepartitionedInsertSelect = true;


static void ExecutePlanIntoRelation(Oid targetRelationId, List *insertTargetList,
									PlannedStmt *selectPlan, EState *executorState);
static HTAB * ExecutePlanIntoColocatedIntermediateResults(Oid targetRelationId,
														  List *insertTargetList,
														  PlannedStmt *selectPlan,
														  EState *executorState,
														  char *intermediateResultIdPrefix);
static int PartitionColumnIndexFromColumnList(Oid relationId, List *columnNameList);
static void WrapTaskListForProjection(List *taskList, List *projectedTargetEntries);


/*
 * NonPushableInsertSelectExecScan executes an INSERT INTO distributed_table
 * SELECT .. query either by routing via coordinator or by repartitioning
 * task results and moving data directly between nodes.
 */
TupleTableSlot *
NonPushableInsertSelectExecScan(CustomScanState *node)
{
	CitusScanState *scanState = (CitusScanState *) node;

	if (!scanState->finishedRemoteScan)
	{
		EState *executorState = ScanStateGetExecutorState(scanState);
		DistributedPlan *distributedPlan = scanState->distributedPlan;
		Query *insertSelectQuery =
			copyObject(distributedPlan->modifyQueryViaCoordinatorOrRepartition);
		List *insertTargetList = insertSelectQuery->targetList;
		RangeTblEntry *selectRte = ExtractSelectRangeTableEntry(insertSelectQuery);
		RangeTblEntry *insertRte = ExtractResultRelationRTE(insertSelectQuery);
		Oid targetRelationId = insertRte->relid;
		char *intermediateResultIdPrefix = distributedPlan->intermediateResultIdPrefix;
		bool hasReturning = distributedPlan->expectResults;
		HTAB *shardStateHash = NULL;

		Query *selectQuery = selectRte->subquery;
		PlannedStmt *selectPlan =
			copyObject(distributedPlan->selectPlanForModifyViaCoordinatorOrRepartition);

		/*
		 * If we are dealing with partitioned table, we also need to lock its
		 * partitions. Here we only lock targetRelation, we acquire necessary
		 * locks on selected tables during execution of those select queries.
		 */
		if (PartitionedTable(targetRelationId))
		{
			LockPartitionRelations(targetRelationId, RowExclusiveLock);
		}

		if (distributedPlan->modifyWithSelectMethod == MODIFY_WITH_SELECT_REPARTITION)
		{
			ereport(DEBUG1, (errmsg("performing repartitioned INSERT ... SELECT")));

			DistributedPlan *distSelectPlan =
				GetDistributedPlan((CustomScan *) selectPlan->planTree);
			Job *distSelectJob = distSelectPlan->workerJob;
			List *distSelectTaskList = distSelectJob->taskList;
			bool randomAccess = true;
			bool interTransactions = false;
			bool binaryFormat =
				CanUseBinaryCopyFormatForTargetList(selectQuery->targetList);

			ExecuteSubPlans(distSelectPlan);

			/*
			 * We have a separate directory for each transaction, so choosing
			 * the same result prefix won't cause filename conflicts. Results
			 * directory name also includes node id and database id, so we don't
			 * need to include them in the filename. We include job id here for
			 * the case "INSERT/SELECTs" are executed recursively.
			 */
			StringInfo distResultPrefixString = makeStringInfo();
			appendStringInfo(distResultPrefixString,
							 "repartitioned_results_" UINT64_FORMAT,
							 distSelectJob->jobId);
			char *distResultPrefix = distResultPrefixString->data;

			CitusTableCacheEntry *targetRelation =
				GetCitusTableCacheEntry(targetRelationId);

			int distributionColumnIndex =
				DistributionColumnIndex(insertTargetList,
										targetRelation->partitionColumn);
			if (distributionColumnIndex == -1)
			{
				ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
								errmsg(
									"the partition column of table %s should have a value",
									generate_qualified_relation_name(targetRelationId))));
			}

			TargetEntry *selectPartitionTE = list_nth(selectQuery->targetList,
													  distributionColumnIndex);
			const char *partitionColumnName = selectPartitionTE->resname ?
											  selectPartitionTE->resname : "(none)";

			ereport(DEBUG2, (errmsg(
								 "partitioning SELECT query by column index %d with name %s",
								 distributionColumnIndex, quote_literal_cstr(
									 partitionColumnName))));

			/*
			 * ExpandWorkerTargetEntry() can add additional columns to the worker
			 * query. Modify the task queries to only select columns we need.
			 */
			int requiredColumnCount = list_length(insertTargetList);
			List *jobTargetList = distSelectJob->jobQuery->targetList;
			if (list_length(jobTargetList) > requiredColumnCount)
			{
				List *projectedTargetEntries = ListTake(jobTargetList,
														requiredColumnCount);
				WrapTaskListForProjection(distSelectTaskList, projectedTargetEntries);
			}

			List **redistributedResults = RedistributeTaskListResults(distResultPrefix,
																	  distSelectTaskList,
																	  distributionColumnIndex,
																	  targetRelation,
																	  binaryFormat);

			/*
			 * At this point select query has been executed on workers and results
			 * have been fetched in such a way that they are colocated with corresponding
			 * target shard. Create and execute a list of tasks of form
			 * INSERT INTO ... SELECT * FROM read_intermediate_results(...);
			 */
			List *taskList = GenerateTaskListWithRedistributedResults(insertSelectQuery,
																	  targetRelation,
																	  redistributedResults,
																	  binaryFormat);

			scanState->tuplestorestate =
				tuplestore_begin_heap(randomAccess, interTransactions, work_mem);
			TupleDesc tupleDescriptor = ScanStateGetTupleDescriptor(scanState);
			TupleDestination *tupleDest = CreateTupleStoreTupleDest(
				scanState->tuplestorestate, tupleDescriptor);
			uint64 rowsInserted = ExecuteTaskListIntoTupleDest(ROW_MODIFY_COMMUTATIVE,
															   taskList, tupleDest,
															   hasReturning);

			executorState->es_processed = rowsInserted;

			if (SortReturning && hasReturning)
			{
				SortTupleStore(scanState);
			}
		}
		else if (insertSelectQuery->onConflict || hasReturning)
		{
			ereport(DEBUG1, (errmsg(
								 "Collecting INSERT ... SELECT results on coordinator")));

			/*
			 * If we also have a workerJob that means there is a second step
			 * to the INSERT...SELECT. This happens when there is a RETURNING
			 * or ON CONFLICT clause which is implemented as a separate
			 * distributed INSERT...SELECT from a set of intermediate results
			 * to the target relation.
			 */
			List *prunedTaskList = NIL;

			shardStateHash = ExecutePlanIntoColocatedIntermediateResults(
				targetRelationId,
				insertTargetList,
				selectPlan,
				executorState,
				intermediateResultIdPrefix);

			/* generate tasks for the INSERT..SELECT phase */
			List *taskList =
				GenerateTaskListWithColocatedIntermediateResults(
					targetRelationId, insertSelectQuery,
					intermediateResultIdPrefix);

			/*
			 * We cannot actually execute INSERT...SELECT tasks that read from
			 * intermediate results that weren't created because no rows were
			 * written to them. Prune those tasks out by only including tasks
			 * on shards with connections.
			 */
			Task *task = NULL;
			foreach_declared_ptr(task, taskList)
			{
				uint64 shardId = task->anchorShardId;
				bool shardModified = false;

				hash_search(shardStateHash, &shardId, HASH_FIND, &shardModified);
				if (shardModified)
				{
					prunedTaskList = lappend(prunedTaskList, task);
				}
			}

			if (prunedTaskList != NIL)
			{
				bool randomAccess = true;
				bool interTransactions = false;

				Assert(scanState->tuplestorestate == NULL);
				scanState->tuplestorestate =
					tuplestore_begin_heap(randomAccess, interTransactions, work_mem);

				TupleDesc tupleDescriptor = ScanStateGetTupleDescriptor(scanState);
				TupleDestination *tupleDest = CreateTupleStoreTupleDest(
					scanState->tuplestorestate, tupleDescriptor);

				ExecuteTaskListIntoTupleDest(ROW_MODIFY_COMMUTATIVE, prunedTaskList,
											 tupleDest, hasReturning);

				if (SortReturning && hasReturning)
				{
					SortTupleStore(scanState);
				}
			}
		}
		else
		{
			ereport(DEBUG1, (errmsg(
								 "Collecting INSERT ... SELECT results on coordinator")));

			ExecutePlanIntoRelation(targetRelationId, insertTargetList, selectPlan,
									executorState);
		}

		scanState->finishedRemoteScan = true;
	}

	TupleTableSlot *resultSlot = ReturnTupleFromTuplestore(scanState);

	return resultSlot;
}


/*
 * ExecutePlanIntoColocatedIntermediateResults executes the given PlannedStmt
 * and inserts tuples into a set of intermediate results that are colocated with
 * the target table for further processing of ON CONFLICT or RETURNING. It also
 * returns the hash of shard states that were used to insert tuplesinto the target
 * relation.
 */
static HTAB *
ExecutePlanIntoColocatedIntermediateResults(Oid targetRelationId,
											List *insertTargetList,
											PlannedStmt *selectPlan,
											EState *executorState,
											char *intermediateResultIdPrefix)
{
	ParamListInfo paramListInfo = executorState->es_param_list_info;

	/* Get column name list and partition column index for the target table */
	List *columnNameList = BuildColumnNameListFromTargetList(targetRelationId,
															 insertTargetList);
	int partitionColumnIndex = PartitionColumnIndexFromColumnList(targetRelationId,
																  columnNameList);

	/* set up a DestReceiver that copies into the intermediate table */
	const bool publishableData = true;
	CitusCopyDestReceiver *copyDest = CreateCitusCopyDestReceiver(targetRelationId,
																  columnNameList,
																  partitionColumnIndex,
																  executorState,
																  intermediateResultIdPrefix,
																  publishableData);

	ExecutePlanIntoDestReceiver(selectPlan, paramListInfo, (DestReceiver *) copyDest);

	executorState->es_processed = copyDest->tuplesSent;

	XactModificationLevel = XACT_MODIFICATION_DATA;

	return copyDest->shardStateHash;
}


/*
 * ExecutePlanIntoRelation executes the given plan and inserts the
 * results into the target relation, which is assumed to be a distributed
 * table.
 */
static void
ExecutePlanIntoRelation(Oid targetRelationId, List *insertTargetList,
						PlannedStmt *selectPlan, EState *executorState)
{
	ParamListInfo paramListInfo = executorState->es_param_list_info;

	/* Get column name list and partition column index for the target table */
	List *columnNameList = BuildColumnNameListFromTargetList(targetRelationId,
															 insertTargetList);
	int partitionColumnIndex = PartitionColumnIndexFromColumnList(targetRelationId,
																  columnNameList);

	/* set up a DestReceiver that copies into the distributed table */
	const bool publishableData = true;
	CitusCopyDestReceiver *copyDest = CreateCitusCopyDestReceiver(targetRelationId,
																  columnNameList,
																  partitionColumnIndex,
																  executorState, NULL,
																  publishableData);

	ExecutePlanIntoDestReceiver(selectPlan, paramListInfo, (DestReceiver *) copyDest);

	executorState->es_processed = copyDest->tuplesSent;

	XactModificationLevel = XACT_MODIFICATION_DATA;
}


/*
 * BuildColumnNameListForCopyStatement build the column name list given the insert
 * target list.
 */
List *
BuildColumnNameListFromTargetList(Oid targetRelationId, List *insertTargetList)
{
	List *columnNameList = NIL;

	/* build the list of column names for the COPY statement */
	TargetEntry *insertTargetEntry = NULL;
	foreach_declared_ptr(insertTargetEntry, insertTargetList)
	{
		columnNameList = lappend(columnNameList, insertTargetEntry->resname);
	}

	return columnNameList;
}


/*
 * PartitionColumnIndexFromColumnList returns the index of partition column from given
 * column name list and relation ID. If given list doesn't contain the partition
 * column, it returns -1.
 */
static int
PartitionColumnIndexFromColumnList(Oid relationId, List *columnNameList)
{
	Var *partitionColumn = PartitionColumn(relationId, 0);
	int partitionColumnIndex = 0;

	const char *columnName = NULL;
	foreach_declared_ptr(columnName, columnNameList)
	{
		AttrNumber attrNumber = get_attnum(relationId, columnName);

		/* check whether this is the partition column */
		if (partitionColumn != NULL && attrNumber == partitionColumn->varattno)
		{
			return partitionColumnIndex;
		}

		partitionColumnIndex++;
	}

	return -1;
}


/*
 * DistributionColumnIndex finds the index of given distribution column in the
 * given target list.
 */
int
DistributionColumnIndex(List *insertTargetList, Var *distributionColumn)
{
	TargetEntry *insertTargetEntry = NULL;
	int targetEntryIndex = 0;
	foreach_declared_ptr(insertTargetEntry, insertTargetList)
	{
		if (insertTargetEntry->resno == distributionColumn->varattno)
		{
			return targetEntryIndex;
		}

		targetEntryIndex++;
	}

	return -1;
}


/*
 * WrapTaskListForProjection wraps task query string to only select given
 * projected columns. It modifies the taskList.
 */
static void
WrapTaskListForProjection(List *taskList, List *projectedTargetEntries)
{
	StringInfo projectedColumnsString = makeStringInfo();
	int entryIndex = 0;
	TargetEntry *targetEntry = NULL;
	foreach_declared_ptr(targetEntry, projectedTargetEntries)
	{
		if (entryIndex != 0)
		{
			appendStringInfoChar(projectedColumnsString, ',');
		}

		char *columnName = targetEntry->resname;
		Assert(columnName != NULL);
		appendStringInfoString(projectedColumnsString, quote_identifier(columnName));

		entryIndex++;
	}

	Task *task = NULL;
	foreach_declared_ptr(task, taskList)
	{
		StringInfo wrappedQuery = makeStringInfo();
		appendStringInfo(wrappedQuery, "SELECT %s FROM (%s) subquery",
						 projectedColumnsString->data,
						 TaskQueryString(task));
		SetTaskQueryString(task, wrappedQuery->data);
	}
}