citus/src/backend/distributed/planner/multi_logical_optimizer.c

/*-------------------------------------------------------------------------
 *
 * multi_logical_optimizer.c
 *	  Routines for optimizing logical plan trees based on multi-relational
 *	  algebra.
 *
 * Copyright (c) Citus Data, Inc.
 *
 * $Id$
 *
 *-------------------------------------------------------------------------
 */

#include <math.h>

#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/indexing.h"
#include "catalog/pg_aggregate.h"
#include "catalog/pg_am.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "commands/extension.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
#include "optimizer/tlist.h"
#include "parser/parse_agg.h"
#include "parser/parse_coerce.h"
#include "parser/parse_oper.h"
#include "parser/parsetree.h"
#include "rewrite/rewriteManip.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"

#include "pg_version_constants.h"

#include "distributed/citus_nodes.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/colocation_utils.h"
#include "distributed/errormessage.h"
#include "distributed/extended_op_node_utils.h"
#include "distributed/function_utils.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/query_pushdown_planning.h"
#include "distributed/string_utils.h"
#include "distributed/tdigest_extension.h"
#include "distributed/version_compat.h"
#include "distributed/worker_protocol.h"

/* Config variable managed via guc.c */
int LimitClauseRowFetchCount = -1; /* number of rows to fetch from each task */
double CountDistinctErrorRate = 0.0; /* precision of count(distinct) approximate */
int CoordinatorAggregationStrategy = COORDINATOR_AGGREGATION_ROW_GATHER;

/* Constant used throughout file */
static const uint32 masterTableId = 1; /* first range table reference on the master node */

typedef struct MasterAggregateWalkerContext
{
	const ExtendedOpNodeProperties *extendedOpNodeProperties;
	AttrNumber columnId;
} MasterAggregateWalkerContext;

typedef struct WorkerAggregateWalkerContext
{
	const ExtendedOpNodeProperties *extendedOpNodeProperties;
	List *expressionList;
	bool createGroupByClause;
} WorkerAggregateWalkerContext;


/*
 * QueryTargetList encapsulates the necessary fields to form
 * worker query's target list.
 */
typedef struct QueryTargetList
{
	List *targetEntryList; /* the list of target entries */
	AttrNumber targetProjectionNumber; /* the index of the last entry */
} QueryTargetList;


/*
 * QueryGroupClause encapsulates the necessary fields to form
 * worker query's group by clause.
 */
typedef struct QueryGroupClause
{
	List *groupClauseList; /* the list of group clause entries */
	Index *nextSortGroupRefIndex; /* pointer to the index of the largest sort group reference index */
} QueryGroupClause;


/*
 * QueryDistinctClause encapsulates the necessary fields to form
 * worker query's DISTINCT/DISTINCT ON parts.
 */
typedef struct QueryDistinctClause
{
	List *workerDistinctClause; /* the list of distinct clause entries */
	bool workerHasDistinctOn;
} QueryDistinctClause;


/*
 * QueryWindowClause encapsulates the necessary fields to form
 * worker query's window clause.
 */
typedef struct QueryWindowClause
{
	List *workerWindowClauseList; /* the list of window clause entries */
	bool hasWindowFunctions;
	Index *nextSortGroupRefIndex; /* see QueryGroupClause */
} QueryWindowClause;


/*
 * QueryOrderByLimit encapsulates the necessary fields to form
 * worker query's order by and limit clauses. Note that we don't
 * keep track of limit offset clause since it is incorporated
 * into the limit clause during the processing.
 */
typedef struct QueryOrderByLimit
{
	Node *workerLimitCount;
	List *workerSortClauseList;
	Index *nextSortGroupRefIndex; /* see QueryGroupClause */
} QueryOrderByLimit;


/*
 * LimitPushdownable tells us how a limit can be pushed down.
 * See WorkerLimitCount for details.
 */
typedef enum LimitPushdownable
{
	LIMIT_CANNOT_PUSHDOWN,
	LIMIT_CAN_PUSHDOWN,
	LIMIT_CAN_APPROXIMATE,
} LimitPushdownable;


/*
 * OrderByLimitReference a structure that is used commonly while
 * processing sort and limit clauses.
 */
typedef struct OrderByLimitReference
{
	bool groupedByDisjointPartitionColumn;
	bool onlyPushableWindowFunctions;
	bool groupClauseIsEmpty;
	bool sortClauseIsEmpty;
	bool hasOrderByAggregate;
	bool canApproximate;
	bool hasDistinctOn;
} OrderByLimitReference;


/* Local functions forward declarations */
static MultiSelect * AndSelectNode(MultiSelect *selectNode);
static MultiSelect * OrSelectNode(MultiSelect *selectNode);
static List * OrSelectClauseList(List *selectClauseList);
static void PushDownNodeLoop(MultiUnaryNode *currentNode);
static void PullUpCollectLoop(MultiCollect *collectNode);
static void AddressProjectSpecialConditions(MultiProject *projectNode);
static PushDownStatus CanPushDown(MultiUnaryNode *parentNode);
static PullUpStatus CanPullUp(MultiUnaryNode *childNode);
static PushDownStatus Commutative(MultiUnaryNode *parentNode,
								  MultiUnaryNode *childNode);
static PushDownStatus Distributive(MultiUnaryNode *parentNode,
								   MultiBinaryNode *childNode);
static PullUpStatus Factorizable(MultiBinaryNode *parentNode,
								 MultiUnaryNode *childNode);
static List * SelectClauseTableIdList(List *selectClauseList);
static MultiUnaryNode * GenerateLeftNode(MultiUnaryNode *currentNode,
										 MultiBinaryNode *binaryNode);
static MultiUnaryNode * GenerateRightNode(MultiUnaryNode *currentNode,
										  MultiBinaryNode *binaryNode);
static MultiUnaryNode * GenerateNode(MultiUnaryNode *currentNode, MultiNode *childNode);
static List * TableIdListColumns(List *tableIdList, List *columnList);
static List * TableIdListSelectClauses(List *tableIdList, List *selectClauseList);
static void PushDownBelowUnaryChild(MultiUnaryNode *currentNode,
									MultiUnaryNode *childNode);
static void PlaceUnaryNodeChild(MultiUnaryNode *unaryNode, MultiUnaryNode *childNode);
static void PlaceBinaryNodeLeftChild(MultiBinaryNode *binaryNode,
									 MultiUnaryNode *newLeftChildNode);
static void PlaceBinaryNodeRightChild(MultiBinaryNode *binaryNode,
									  MultiUnaryNode *newRightChildNode);
static void RemoveUnaryNode(MultiUnaryNode *unaryNode);
static void PullUpUnaryNode(MultiUnaryNode *unaryNode);
static void ParentSetNewChild(MultiNode *parentNode, MultiNode *oldChildNode,
							  MultiNode *newChildNode);

/* Local functions forward declarations for aggregate expressions */
static void ApplyExtendedOpNodes(MultiExtendedOp *originalNode,
								 MultiExtendedOp *masterNode,
								 MultiExtendedOp *workerNode);
static void TransformSubqueryNode(MultiTable *subqueryNode,
								  bool subqueryHasNonDistributableAggregates);
static MultiExtendedOp * MasterExtendedOpNode(MultiExtendedOp *originalOpNode,
											  ExtendedOpNodeProperties *
											  extendedOpNodeProperties);
static Node * MasterAggregateMutator(Node *originalNode,
									 MasterAggregateWalkerContext *walkerContext);
static Expr * MasterAggregateExpression(Aggref *originalAggregate,
										MasterAggregateWalkerContext *walkerContext);
static Expr * MasterAverageExpression(Oid sumAggregateType, Oid countAggregateType,
									  AttrNumber *columnId);
static Expr * AddTypeConversion(Node *originalAggregate, Node *newExpression);
static MultiExtendedOp * WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
											  ExtendedOpNodeProperties *
											  extendedOpNodeProperties);
static void ProcessTargetListForWorkerQuery(List *targetEntryList,
											ExtendedOpNodeProperties *
											extendedOpNodeProperties,
											QueryTargetList *queryTargetList,
											QueryGroupClause *queryGroupClause);
static void ProcessHavingClauseForWorkerQuery(Node *havingQual,
											  ExtendedOpNodeProperties *
											  extendedOpNodeProperties,
											  Node **workerHavingQual,
											  QueryTargetList *queryTargetList,
											  QueryGroupClause *queryGroupClause);
static void ProcessDistinctClauseForWorkerQuery(List *distinctClause, bool hasDistinctOn,
												List *groupClauseList,
												bool queryHasAggregates,
												QueryDistinctClause *queryDistinctClause,
												bool *distinctPreventsLimitPushdown);
static void ProcessWindowFunctionsForWorkerQuery(List *windowClauseList,
												 List *originalTargetEntryList,
												 QueryWindowClause *queryWindowClause,
												 QueryTargetList *queryTargetList);
static void ProcessWindowFunctionPullUpForWorkerQuery(List *windowClause,
													  QueryTargetList *queryTargetList);
static void ProcessLimitOrderByForWorkerQuery(OrderByLimitReference orderByLimitReference,
											  Node *originalLimitCount, Node *limitOffset,
											  List *sortClauseList, List *groupClauseList,
											  List *originalTargetList,
											  QueryOrderByLimit *queryOrderByLimit,
											  QueryTargetList *queryTargetList);
static OrderByLimitReference BuildOrderByLimitReference(bool hasDistinctOn, bool
														groupedByDisjointPartitionColumn,
														bool onlyPushableWindowFunctions,
														List *groupClause,
														List *sortClauseList,
														List *targetList);
static void ExpandWorkerTargetEntry(List *expressionList,
									TargetEntry *originalTargetEntry,
									bool addToGroupByClause,
									QueryTargetList *queryTargetList,
									QueryGroupClause *queryGroupClause);
static Index GetNextSortGroupRef(List *targetEntryList);
static TargetEntry * GenerateWorkerTargetEntry(TargetEntry *targetEntry,
											   Expr *workerExpression,
											   AttrNumber targetProjectionNumber);
static void AppendTargetEntryToGroupClause(TargetEntry *targetEntry,
										   QueryGroupClause *queryGroupClause);
static bool WorkerAggregateWalker(Node *node,
								  WorkerAggregateWalkerContext *walkerContext);
static List * WorkerAggregateExpressionList(Aggref *originalAggregate,
											WorkerAggregateWalkerContext *walkerContextry);
static AggregateType GetAggregateType(Aggref *aggregatExpression);
static Oid AggregateArgumentType(Aggref *aggregate);
static Expr * FirstAggregateArgument(Aggref *aggregate);
static bool AggregateEnabledCustom(Aggref *aggregateExpression);
static Oid CitusFunctionOidWithSignature(char *functionName, int numargs, Oid *argtypes);
static Oid WorkerPartialAggOid(void);
static Oid CoordCombineAggOid(void);
static Oid AggregateFunctionOid(const char *functionName, Oid inputType);
static Oid TypeOid(Oid schemaId, const char *typeName);
static SortGroupClause * CreateSortGroupClause(Var *column);

/* Local functions forward declarations for count(distinct) approximations */
static const char * CountDistinctHashFunctionName(Oid argumentType);
static int CountDistinctStorageSize(double approximationErrorRate);
static Const * MakeIntegerConstInt64(int64 integerValue);
static Const * MakeIntegerConst(int32 integerValue);


/* Local functions forward declarations for aggregate expression checks */
static bool HasNonDistributableAggregates(MultiNode *logicalPlanNode);
static bool CanPushDownExpression(Node *expression,
								  const ExtendedOpNodeProperties *extendedOpNodeProperties);
static DeferredErrorMessage * DeferErrorIfHasNonDistributableAggregates(
	MultiNode *logicalPlanNode);
static DeferredErrorMessage * DeferErrorIfUnsupportedArrayAggregate(
	Aggref *arrayAggregateExpression);
static DeferredErrorMessage * DeferErrorIfUnsupportedJsonAggregate(AggregateType type,
																   Aggref *
																   aggregateExpression);
static DeferredErrorMessage * DeferErrorIfUnsupportedAggregateDistinct(
	Aggref *aggregateExpression,
	MultiNode *
	logicalPlanNode);
static Var * AggregateDistinctColumn(Aggref *aggregateExpression);
static bool TablePartitioningSupportsDistinct(List *tableNodeList,
											  MultiExtendedOp *opNode,
											  Var *distinctColumn,
											  AggregateType aggregateType);

/* Local functions forward declarations for limit clauses */
static Node * WorkerLimitCount(Node *limitCount, Node *limitOffset, OrderByLimitReference
							   orderByLimitReference);
static List * WorkerSortClauseList(Node *limitCount,
								   List *groupClauseList, List *sortClauseList,
								   OrderByLimitReference orderByLimitReference);
static bool CanPushDownLimitApproximate(List *sortClauseList, List *targetList);
static bool HasOrderByAggregate(List *sortClauseList, List *targetList);
static bool HasOrderByNonCommutativeAggregate(List *sortClauseList, List *targetList);
static bool HasOrderByComplexExpression(List *sortClauseList, List *targetList);
static bool HasOrderByHllType(List *sortClauseList, List *targetList);
static bool ShouldProcessDistinctOrderAndLimitForWorker(
	ExtendedOpNodeProperties *extendedOpNodeProperties,
	bool pushingDownOriginalGrouping,
	Node *havingQual);
static bool IsIndexInRange(const List *list, int index);

/*
 * MultiLogicalPlanOptimize applies multi-relational algebra optimizations on
 * the given logical plan tree. Specifically, the function applies four set of
 * optimizations in a particular order.
 *
 * First, the function splits the search node into two nodes that contain And
 * and Or clauses, and pushes down the node that contains And clauses. Second,
 * the function pushes down the project node; this node either contains columns
 * to return to the user, or aggregate expressions used by the aggregate node.
 * Third, the function pulls up the collect operators in the tree. Fourth, the
 * function finds the extended operator node, and splits this node into master
 * and worker extended operator nodes.
 */
void
MultiLogicalPlanOptimize(MultiTreeRoot *multiLogicalPlan)
{
	MultiNode *logicalPlanNode = (MultiNode *) multiLogicalPlan;
	bool hasNonDistributableAggregates = HasNonDistributableAggregates(
		logicalPlanNode);
	List *extendedOpNodeList = FindNodesOfType(logicalPlanNode, T_MultiExtendedOp);
	MultiExtendedOp *extendedOpNode = (MultiExtendedOp *) linitial(extendedOpNodeList);
	ExtendedOpNodeProperties extendedOpNodeProperties = BuildExtendedOpNodeProperties(
		extendedOpNode, hasNonDistributableAggregates);

	if (!extendedOpNodeProperties.groupedByDisjointPartitionColumn &&
		!extendedOpNodeProperties.pullUpIntermediateRows)
	{
		DeferredErrorMessage *aggregatePushdownError =
			DeferErrorIfHasNonDistributableAggregates(logicalPlanNode);

		if (aggregatePushdownError != NULL)
		{
			if (CoordinatorAggregationStrategy == COORDINATOR_AGGREGATION_DISABLED)
			{
				RaiseDeferredError(aggregatePushdownError, ERROR);
			}
			else
			{
				extendedOpNodeProperties.pullUpIntermediateRows = true;
				extendedOpNodeProperties.pushDownGroupingAndHaving = false;
			}
		}
	}

	/*
	 * If a select node exists, we use the idempower property to split the node
	 * into two nodes that contain And and Or clauses. If both And and Or nodes
	 * exist, we modify the tree in place to swap the original select node with
	 * And and Or nodes. We then push down the And select node if it exists.
	 */
	List *selectNodeList = FindNodesOfType(logicalPlanNode, T_MultiSelect);
	if (selectNodeList != NIL)
	{
		MultiSelect *selectNode = (MultiSelect *) linitial(selectNodeList);
		MultiSelect *andSelectNode = AndSelectNode(selectNode);
		MultiSelect *orSelectNode = OrSelectNode(selectNode);

		if (andSelectNode != NULL && orSelectNode != NULL)
		{
			MultiNode *parentNode = ParentNode((MultiNode *) selectNode);
			MultiNode *childNode = ChildNode((MultiUnaryNode *) selectNode);
			Assert(UnaryOperator(parentNode));

			SetChild((MultiUnaryNode *) parentNode, (MultiNode *) orSelectNode);
			SetChild((MultiUnaryNode *) orSelectNode, (MultiNode *) andSelectNode);
			SetChild((MultiUnaryNode *) andSelectNode, (MultiNode *) childNode);
		}
		else if (andSelectNode != NULL && orSelectNode == NULL)
		{
			andSelectNode = selectNode; /* no need to modify the tree */
		}

		if (andSelectNode != NULL)
		{
			PushDownNodeLoop((MultiUnaryNode *) andSelectNode);
		}
	}

	/* push down the multi project node */
	List *projectNodeList = FindNodesOfType(logicalPlanNode, T_MultiProject);
	MultiProject *projectNode = (MultiProject *) linitial(projectNodeList);
	PushDownNodeLoop((MultiUnaryNode *) projectNode);

	/* pull up collect nodes and merge duplicate collects */
	List *collectNodeList = FindNodesOfType(logicalPlanNode, T_MultiCollect);
	MultiCollect *collectNode = NULL;
	foreach_ptr(collectNode, collectNodeList)
	{
		PullUpCollectLoop(collectNode);
	}

	/*
	 * We split the extended operator node into its equivalent master and worker
	 * operator nodes; and if the extended operator has aggregates, we transform
	 * aggregate functions accordingly for the master and worker operator nodes.
	 * If we can push down the limit clause, we also add limit count and sort
	 * clause list to the worker operator node. We then push the worker operator
	 * node below the collect node.
	 */
	MultiExtendedOp *masterExtendedOpNode =
		MasterExtendedOpNode(extendedOpNode, &extendedOpNodeProperties);
	MultiExtendedOp *workerExtendedOpNode =
		WorkerExtendedOpNode(extendedOpNode, &extendedOpNodeProperties);

	ApplyExtendedOpNodes(extendedOpNode, masterExtendedOpNode, workerExtendedOpNode);

	List *tableNodeList = FindNodesOfType(logicalPlanNode, T_MultiTable);
	MultiTable *tableNode = NULL;
	foreach_ptr(tableNode, tableNodeList)
	{
		if (tableNode->relationId == SUBQUERY_RELATION_ID)
		{
			DeferredErrorMessage *error =
				DeferErrorIfHasNonDistributableAggregates((MultiNode *) tableNode);
			bool subqueryHasNonDistributableAggregates = false;

			if (error != NULL)
			{
				if (CoordinatorAggregationStrategy == COORDINATOR_AGGREGATION_DISABLED)
				{
					RaiseDeferredError(error, ERROR);
				}
				else
				{
					subqueryHasNonDistributableAggregates = true;
				}
			}

			TransformSubqueryNode(tableNode, subqueryHasNonDistributableAggregates);
		}
	}

	/*
	 * When enabled, count(distinct) approximation uses hll as the intermediate
	 * data type. We currently have a mismatch between hll target entry and sort
	 * clause's sortop oid, so we can't push an order by on the hll data type to
	 * the worker node. We check that here and error out if necessary.
	 */
	bool hasOrderByHllType = HasOrderByHllType(workerExtendedOpNode->sortClauseList,
											   workerExtendedOpNode->targetList);
	if (hasOrderByHllType)
	{
		ereport(ERROR, (errmsg("cannot approximate count(distinct) and order by it"),
						errhint("You might need to disable approximations for either "
								"count(distinct) or limit through configuration.")));
	}

	if (TargetListContainsSubquery(masterExtendedOpNode->targetList))
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("cannot push down subquery on the target list"),
						errdetail("Subqueries in the SELECT part of the query can only "
								  "be pushed down if they happen before aggregates and "
								  "window functions")));
	}
}


/*
 * AndSelectNode looks for AND clauses in the given select node. If they exist,
 * the function returns these clauses in a new node. Otherwise, the function
 * returns null.
 */
static MultiSelect *
AndSelectNode(MultiSelect *selectNode)
{
	MultiSelect *andSelectNode = NULL;
	List *selectClauseList = selectNode->selectClauseList;
	List *orSelectClauseList = OrSelectClauseList(selectClauseList);

	/* AND clauses are select clauses that are not OR clauses */
	List *andSelectClauseList = list_difference(selectClauseList, orSelectClauseList);
	if (andSelectClauseList != NIL)
	{
		andSelectNode = CitusMakeNode(MultiSelect);
		andSelectNode->selectClauseList = andSelectClauseList;
	}

	return andSelectNode;
}


/*
 * OrSelectNode looks for OR clauses in the given select node. If they exist,
 * the function returns these clauses in a new node. Otherwise, the function
 * returns null.
 */
static MultiSelect *
OrSelectNode(MultiSelect *selectNode)
{
	MultiSelect *orSelectNode = NULL;
	List *selectClauseList = selectNode->selectClauseList;
	List *orSelectClauseList = OrSelectClauseList(selectClauseList);

	if (orSelectClauseList != NIL)
	{
		orSelectNode = CitusMakeNode(MultiSelect);
		orSelectNode->selectClauseList = orSelectClauseList;
	}

	return orSelectNode;
}


/*
 * OrSelectClauseList walks over the select clause list, and returns all clauses
 * that have OR expressions in them.
 */
static List *
OrSelectClauseList(List *selectClauseList)
{
	List *orSelectClauseList = NIL;

	Node *selectClause = NULL;
	foreach_ptr(selectClause, selectClauseList)
	{
		bool orClause = is_orclause(selectClause);
		if (orClause)
		{
			orSelectClauseList = lappend(orSelectClauseList, selectClause);
		}
	}

	return orSelectClauseList;
}


/*
 * PushDownNodeLoop pushes down the current node as far down the plan tree as
 * possible. For this, the function first addresses any special conditions that
 * may apply on the current node. Then, the function pushes down the current
 * node if its child node is unary. If the child is binary, the function splits
 * the current node into two nodes by applying generation rules, and recurses
 * into itself to push down these two nodes.
 */
static void
PushDownNodeLoop(MultiUnaryNode *currentNode)
{
	MultiUnaryNode *projectNodeGenerated = NULL;
	MultiUnaryNode *leftNodeGenerated = NULL;
	MultiUnaryNode *rightNodeGenerated = NULL;

	PushDownStatus pushDownStatus = CanPushDown(currentNode);
	while (pushDownStatus == PUSH_DOWN_VALID ||
		   pushDownStatus == PUSH_DOWN_SPECIAL_CONDITIONS)
	{
		MultiNode *childNode = currentNode->childNode;
		bool unaryChild = UnaryOperator(childNode);
		bool binaryChild = BinaryOperator(childNode);

		/*
		 * We first check if we can use the idempower property to split the
		 * project node. We split at a partition node as it captures the
		 * minimal set of columns needed from a partition job. After the split
		 * we break from the loop and recursively call pushdown for the
		 * generated project node.
		 */
		MultiNode *parentNode = ParentNode((MultiNode *) currentNode);
		CitusNodeTag currentNodeType = CitusNodeTag(currentNode);
		CitusNodeTag parentNodeType = CitusNodeTag(parentNode);

		if (currentNodeType == T_MultiProject && parentNodeType == T_MultiPartition)
		{
			projectNodeGenerated = GenerateNode(currentNode, childNode);
			PlaceUnaryNodeChild(currentNode, projectNodeGenerated);

			break;
		}

		/* address any special conditions before we can perform the pushdown */
		if (pushDownStatus == PUSH_DOWN_SPECIAL_CONDITIONS)
		{
			MultiProject *projectNode = (MultiProject *) currentNode;
			Assert(currentNodeType == T_MultiProject);

			AddressProjectSpecialConditions(projectNode);
		}

		if (unaryChild)
		{
			MultiUnaryNode *unaryChildNode = (MultiUnaryNode *) childNode;
			PushDownBelowUnaryChild(currentNode, unaryChildNode);
		}
		else if (binaryChild)
		{
			MultiBinaryNode *binaryChildNode = (MultiBinaryNode *) childNode;
			leftNodeGenerated = GenerateLeftNode(currentNode, binaryChildNode);
			rightNodeGenerated = GenerateRightNode(currentNode, binaryChildNode);

			/* push down the generated nodes below the binary child node */
			PlaceBinaryNodeLeftChild(binaryChildNode, leftNodeGenerated);
			PlaceBinaryNodeRightChild(binaryChildNode, rightNodeGenerated);

			/*
			 * Remove the current node, and break out of the push down loop for
			 * the current node. Then, recurse into the push down function for
			 * the newly generated nodes.
			 */
			RemoveUnaryNode(currentNode);
			break;
		}

		pushDownStatus = CanPushDown(currentNode);
	}

	/* recursively perform pushdown of any nodes generated in the loop */
	if (projectNodeGenerated != NULL)
	{
		PushDownNodeLoop(projectNodeGenerated);
	}
	if (leftNodeGenerated != NULL)
	{
		PushDownNodeLoop(leftNodeGenerated);
	}
	if (rightNodeGenerated != NULL)
	{
		PushDownNodeLoop(rightNodeGenerated);
	}
}


/*
 * PullUpCollectLoop pulls up the collect node as far up as possible in the plan
 * tree. The function also merges two collect nodes that are direct descendants
 * of each other by removing the given collect node from the tree.
 */
static void
PullUpCollectLoop(MultiCollect *collectNode)
{
	MultiUnaryNode *currentNode = (MultiUnaryNode *) collectNode;

	PullUpStatus pullUpStatus = CanPullUp(currentNode);
	while (pullUpStatus == PULL_UP_VALID)
	{
		PullUpUnaryNode(currentNode);
		pullUpStatus = CanPullUp(currentNode);
	}

	/*
	 * After pulling up the collect node, if we find that our child node is also
	 * a collect, we merge the two collect nodes together by removing this node.
	 */
	MultiNode *childNode = currentNode->childNode;
	if (CitusIsA(childNode, MultiCollect))
	{
		RemoveUnaryNode(currentNode);
	}
}


/*
 * AddressProjectSpecialConditions adds columns to the project node if necessary
 * to make the node commutative and distributive with its child node. For this,
 * the function checks for any special conditions between the project and child
 * node, and determines the child node columns to add for the special conditions
 * to apply. The function then adds these columns to the project node.
 */
static void
AddressProjectSpecialConditions(MultiProject *projectNode)
{
	MultiNode *childNode = ChildNode((MultiUnaryNode *) projectNode);
	CitusNodeTag childNodeTag = CitusNodeTag(childNode);
	List *childColumnList = NIL;

	/*
	 * We check if we need to include any child columns in the project node to
	 * address the following special conditions.
	 *
	 * SNC1: project node must include child node's projected columns, or
	 * SNC2: project node must include child node's partition column,  or
	 * SNC3: project node must include child node's selection columns, or
	 * NSC1: project node must include child node's join columns.
	 */
	if (childNodeTag == T_MultiProject)
	{
		MultiProject *projectChildNode = (MultiProject *) childNode;
		List *projectColumnList = projectChildNode->columnList;

		childColumnList = copyObject(projectColumnList);
	}
	else if (childNodeTag == T_MultiPartition)
	{
		MultiPartition *partitionNode = (MultiPartition *) childNode;
		Var *partitionColumn = partitionNode->partitionColumn;
		List *partitionColumnList = list_make1(partitionColumn);

		childColumnList = copyObject(partitionColumnList);
	}
	else if (childNodeTag == T_MultiSelect)
	{
		MultiSelect *selectNode = (MultiSelect *) childNode;
		Node *selectClauseList = (Node *) selectNode->selectClauseList;
		List *selectList = pull_var_clause_default(selectClauseList);

		childColumnList = copyObject(selectList);
	}
	else if (childNodeTag == T_MultiJoin)
	{
		MultiJoin *joinNode = (MultiJoin *) childNode;
		Node *joinClauseList = (Node *) joinNode->joinClauseList;
		List *joinList = pull_var_clause_default(joinClauseList);

		childColumnList = copyObject(joinList);
	}

	/*
	 * If we need to include any child columns, then find the columns that are
	 * not already in the project column list, and add them.
	 */
	if (childColumnList != NIL)
	{
		List *projectColumnList = projectNode->columnList;
		List *newColumnList = list_concat_unique(projectColumnList, childColumnList);

		projectNode->columnList = newColumnList;
	}
}


/*
 * CanPushDown determines if a particular node can be moved below its child. The
 * criteria for pushing down a node is determined by multi-relational algebra's
 * rules for commutativity and distributivity.
 */
static PushDownStatus
CanPushDown(MultiUnaryNode *parentNode)
{
	PushDownStatus pushDownStatus = PUSH_DOWN_INVALID_FIRST;
	MultiNode *childNode = parentNode->childNode;
	bool unaryChild = UnaryOperator(childNode);
	bool binaryChild = BinaryOperator(childNode);

	if (unaryChild)
	{
		pushDownStatus = Commutative(parentNode, (MultiUnaryNode *) childNode);
	}
	else if (binaryChild)
	{
		pushDownStatus = Distributive(parentNode, (MultiBinaryNode *) childNode);
	}

	Assert(pushDownStatus != PUSH_DOWN_INVALID_FIRST);
	return pushDownStatus;
}


/*
 * CanPullUp determines if a particular node can be moved above its parent. The
 * criteria for pulling up a node is determined by multi-relational algebra's
 * rules for commutativity and factorizability.
 */
static PullUpStatus
CanPullUp(MultiUnaryNode *childNode)
{
	PullUpStatus pullUpStatus = PULL_UP_INVALID_FIRST;
	MultiNode *parentNode = ParentNode((MultiNode *) childNode);
	bool unaryParent = UnaryOperator(parentNode);
	bool binaryParent = BinaryOperator(parentNode);

	if (unaryParent)
	{
		/*
		 * Evaluate if parent can be pushed down below the child node, since it
		 * is equivalent to pulling up the child above its parent.
		 */
		PushDownStatus parentPushDownStatus = Commutative((MultiUnaryNode *) parentNode,
														  childNode);

		if (parentPushDownStatus == PUSH_DOWN_VALID)
		{
			pullUpStatus = PULL_UP_VALID;
		}
		else
		{
			pullUpStatus = PULL_UP_NOT_VALID;
		}
	}
	else if (binaryParent)
	{
		pullUpStatus = Factorizable((MultiBinaryNode *) parentNode, childNode);
	}

	Assert(pullUpStatus != PULL_UP_INVALID_FIRST);
	return pullUpStatus;
}


/*
 * Commutative returns a status which denotes whether the given parent node can
 * be pushed down below its child node using the commutative property.
 */
static PushDownStatus
Commutative(MultiUnaryNode *parentNode, MultiUnaryNode *childNode)
{
	PushDownStatus pushDownStatus = PUSH_DOWN_NOT_VALID;
	CitusNodeTag parentNodeTag = CitusNodeTag(parentNode);
	CitusNodeTag childNodeTag = CitusNodeTag(childNode);

	/* we cannot be commutative with non-query operators */
	if (childNodeTag == T_MultiTreeRoot || childNodeTag == T_MultiTable)
	{
		return PUSH_DOWN_NOT_VALID;
	}

	/* first check for commutative operators and no special conditions */
	if ((parentNodeTag == T_MultiPartition && childNodeTag == T_MultiProject) ||
		(parentNodeTag == T_MultiPartition && childNodeTag == T_MultiPartition) ||
		(parentNodeTag == T_MultiPartition && childNodeTag == T_MultiSelect))
	{
		pushDownStatus = PUSH_DOWN_VALID;
	}
	if ((parentNodeTag == T_MultiCollect && childNodeTag == T_MultiProject) ||
		(parentNodeTag == T_MultiCollect && childNodeTag == T_MultiCollect) ||
		(parentNodeTag == T_MultiCollect && childNodeTag == T_MultiSelect))
	{
		pushDownStatus = PUSH_DOWN_VALID;
	}
	if (parentNodeTag == T_MultiSelect)
	{
		pushDownStatus = PUSH_DOWN_VALID;
	}
	if (parentNodeTag == T_MultiProject && childNodeTag == T_MultiCollect)
	{
		pushDownStatus = PUSH_DOWN_VALID;
	}

	/*
	 * The project node is commutative with the below operators given that
	 * its special conditions apply.
	 */
	if ((parentNodeTag == T_MultiProject && childNodeTag == T_MultiProject) ||
		(parentNodeTag == T_MultiProject && childNodeTag == T_MultiPartition) ||
		(parentNodeTag == T_MultiProject && childNodeTag == T_MultiSelect) ||
		(parentNodeTag == T_MultiProject && childNodeTag == T_MultiJoin))
	{
		pushDownStatus = PUSH_DOWN_SPECIAL_CONDITIONS;
	}

	return pushDownStatus;
}


/*
 * Distributive returns a status which denotes whether the given parent node can
 * be pushed down below its binary child node using the distributive property.
 */
static PushDownStatus
Distributive(MultiUnaryNode *parentNode, MultiBinaryNode *childNode)
{
	PushDownStatus pushDownStatus = PUSH_DOWN_NOT_VALID;
	CitusNodeTag parentNodeTag = CitusNodeTag(parentNode);
	CitusNodeTag childNodeTag = CitusNodeTag(childNode);

	/* special condition checks for partition operator are not implemented */
	Assert(parentNodeTag != T_MultiPartition);

	/*
	 * The project node is distributive with the join operator given that its
	 * special conditions apply.
	 */
	if (parentNodeTag == T_MultiProject)
	{
		pushDownStatus = PUSH_DOWN_SPECIAL_CONDITIONS;
	}

	/* collect node is distributive without special conditions */
	if ((parentNodeTag == T_MultiCollect && childNodeTag == T_MultiJoin) ||
		(parentNodeTag == T_MultiCollect && childNodeTag == T_MultiCartesianProduct))
	{
		pushDownStatus = PUSH_DOWN_VALID;
	}

	/*
	 * The select node is distributive with a binary operator if all tables in
	 * the select clauses are output by the binary child. The select clauses are
	 * individually AND'd; and therefore this check is sufficient to implement
	 * the NSC3 special condition in multi-relational algebra.
	 */
	if ((parentNodeTag == T_MultiSelect && childNodeTag == T_MultiJoin) ||
		(parentNodeTag == T_MultiSelect && childNodeTag == T_MultiCartesianProduct))
	{
		MultiSelect *selectNode = (MultiSelect *) parentNode;
		List *selectClauseList = selectNode->selectClauseList;

		List *selectTableIdList = SelectClauseTableIdList(selectClauseList);
		List *childTableIdList = OutputTableIdList((MultiNode *) childNode);

		/* find tables that are in select clause list, but not in child list */
		List *diffList = list_difference_int(selectTableIdList, childTableIdList);
		if (diffList == NIL)
		{
			pushDownStatus = PUSH_DOWN_VALID;
		}
	}

	return pushDownStatus;
}


/*
 * Factorizable returns a status which denotes whether the given unary child
 * node can be pulled up above its binary parent node using the factorizability
 * property. The function currently performs this check only for collect node
 * types; other node types have generation rules that are not yet implemented.
 */
static PullUpStatus
Factorizable(MultiBinaryNode *parentNode, MultiUnaryNode *childNode)
{
	PullUpStatus pullUpStatus = PULL_UP_NOT_VALID;
	CitusNodeTag parentNodeTag = CitusNodeTag(parentNode);
	CitusNodeTag childNodeTag = CitusNodeTag(childNode);

	/*
	 * The following nodes are factorizable with their parents, but we don't
	 * have their generation rules implemented. We therefore assert here.
	 */
	Assert(childNodeTag != T_MultiProject);
	Assert(childNodeTag != T_MultiPartition);
	Assert(childNodeTag != T_MultiSelect);

	if ((childNodeTag == T_MultiCollect && parentNodeTag == T_MultiJoin) ||
		(childNodeTag == T_MultiCollect && parentNodeTag == T_MultiCartesianProduct))
	{
		pullUpStatus = PULL_UP_VALID;
	}

	return pullUpStatus;
}


/*
 * SelectClauseTableIdList finds the (range) table identifier for each select
 * clause in the given list, and returns these identifiers in a new list.
 */
static List *
SelectClauseTableIdList(List *selectClauseList)
{
	List *tableIdList = NIL;

	Node *selectClause = NULL;
	foreach_ptr(selectClause, selectClauseList)
	{
		List *selectColumnList = pull_var_clause_default(selectClause);

		if (list_length(selectColumnList) == 0)
		{
			/* filter is a constant, e.g. false or 1=0 */
			continue;
		}

		Var *selectColumn = (Var *) linitial(selectColumnList);
		int selectColumnTableId = (int) selectColumn->varno;

		tableIdList = lappend_int(tableIdList, selectColumnTableId);
	}

	return tableIdList;
}


/*
 * GenerateLeftNode splits the current node over the binary node by applying the
 * generation rule for distributivity in multi-relational algebra. After the
 * split, the function returns the left node.
 */
static MultiUnaryNode *
GenerateLeftNode(MultiUnaryNode *currentNode, MultiBinaryNode *binaryNode)
{
	MultiNode *leftChildNode = binaryNode->leftChildNode;
	MultiUnaryNode *leftNodeGenerated = GenerateNode(currentNode, leftChildNode);

	return leftNodeGenerated;
}


/*
 * GenerateRightNode splits the current node over the binary node by applying
 * the generation rule for distributivity in multi-relational algebra. After the
 * split, the function returns the right node.
 */
static MultiUnaryNode *
GenerateRightNode(MultiUnaryNode *currentNode, MultiBinaryNode *binaryNode)
{
	MultiNode *rightChildNode = binaryNode->rightChildNode;
	MultiUnaryNode *rightNodeGenerated = GenerateNode(currentNode, rightChildNode);

	return rightNodeGenerated;
}


/*
 * GenerateNode determines the current node's type, and applies the relevant
 * generation node for that node type. If the current node is a project node,
 * the function creates a new project node with attributes that only have the
 * child subtree's tables. Else if the current node is a select node, the
 * function creates a new select node with select clauses that only belong to
 * the tables output by the child node's subtree.
 */
static MultiUnaryNode *
GenerateNode(MultiUnaryNode *currentNode, MultiNode *childNode)
{
	MultiUnaryNode *generatedNode = NULL;
	CitusNodeTag currentNodeType = CitusNodeTag(currentNode);
	List *tableIdList = OutputTableIdList(childNode);

	if (currentNodeType == T_MultiProject)
	{
		MultiProject *projectNode = (MultiProject *) currentNode;
		List *columnList = copyObject(projectNode->columnList);

		List *newColumnList = TableIdListColumns(tableIdList, columnList);
		if (newColumnList != NIL)
		{
			MultiProject *newProjectNode = CitusMakeNode(MultiProject);
			newProjectNode->columnList = newColumnList;

			generatedNode = (MultiUnaryNode *) newProjectNode;
		}
	}
	else if (currentNodeType == T_MultiSelect)
	{
		MultiSelect *selectNode = (MultiSelect *) currentNode;
		List *selectClauseList = copyObject(selectNode->selectClauseList);

		List *newSelectClauseList = TableIdListSelectClauses(tableIdList,
															 selectClauseList);
		if (newSelectClauseList != NIL)
		{
			MultiSelect *newSelectNode = CitusMakeNode(MultiSelect);
			newSelectNode->selectClauseList = newSelectClauseList;

			generatedNode = (MultiUnaryNode *) newSelectNode;
		}
	}

	return generatedNode;
}


/*
 * TableIdListColumns walks over the given column list, finds columns belonging
 * to the given table id list, and returns the found columns in a new list.
 */
static List *
TableIdListColumns(List *tableIdList, List *columnList)
{
	List *tableColumnList = NIL;

	Var *column = NULL;
	foreach_ptr(column, columnList)
	{
		int columnTableId = (int) column->varno;

		bool tableListMember = list_member_int(tableIdList, columnTableId);
		if (tableListMember)
		{
			tableColumnList = lappend(tableColumnList, column);
		}
	}

	return tableColumnList;
}


/*
 * TableIdListSelectClauses walks over the given select clause list, finds the
 * select clauses whose column references belong to the given table list, and
 * returns the found clauses in a new list.
 */
static List *
TableIdListSelectClauses(List *tableIdList, List *selectClauseList)
{
	List *tableSelectClauseList = NIL;

	Node *selectClause = NULL;
	foreach_ptr(selectClause, selectClauseList)
	{
		List *selectColumnList = pull_var_clause_default(selectClause);
		if (list_length(selectColumnList) == 0)
		{
			/* filter is a constant, e.g. false or 1=0, always include it */
			tableSelectClauseList = lappend(tableSelectClauseList, selectClause);
		}
		else
		{
			Var *selectColumn = (Var *) linitial(selectColumnList);
			int selectClauseTableId = (int) selectColumn->varno;

			bool tableIdListMember = list_member_int(tableIdList, selectClauseTableId);
			if (tableIdListMember)
			{
				tableSelectClauseList = lappend(tableSelectClauseList, selectClause);
			}
		}
	}

	return tableSelectClauseList;
}


/* Pushes down the current node below its unary child node. */
static void
PushDownBelowUnaryChild(MultiUnaryNode *currentNode, MultiUnaryNode *childNode)
{
	MultiNode *parentNode = ParentNode((MultiNode *) currentNode);
	MultiNode *childChildNode = ChildNode(childNode);

	/* current node's parent now points to the child node */
	ParentSetNewChild(parentNode, (MultiNode *) currentNode, (MultiNode *) childNode);

	/* current node's child becomes its parent */
	SetChild(childNode, (MultiNode *) currentNode);

	/* current node points to the child node's child */
	SetChild(currentNode, childChildNode);
}


/*
 * PlaceUnaryNodeChild inserts the new node as a child node under the given
 * unary node. The function also places the previous child node under the new
 * child node.
 */
static void
PlaceUnaryNodeChild(MultiUnaryNode *unaryNode, MultiUnaryNode *newChildNode)
{
	MultiNode *oldChildNode = ChildNode(unaryNode);

	SetChild(unaryNode, (MultiNode *) newChildNode);
	SetChild(newChildNode, oldChildNode);
}


/*
 * PlaceBinaryNodeLeftChild inserts the new left child as the binary node's left
 * child. The function also places the previous left child below the new child
 * node.
 */
static void
PlaceBinaryNodeLeftChild(MultiBinaryNode *binaryNode, MultiUnaryNode *newLeftChildNode)
{
	if (newLeftChildNode == NULL)
	{
		return;
	}

	SetChild(newLeftChildNode, binaryNode->leftChildNode);
	SetLeftChild(binaryNode, (MultiNode *) newLeftChildNode);
}


/*
 * PlaceBinaryNodeRightChild inserts the new right child as the binary node's
 * right child. The function also places the previous right child below the new
 * child node.
 */
static void
PlaceBinaryNodeRightChild(MultiBinaryNode *binaryNode, MultiUnaryNode *newRightChildNode)
{
	if (newRightChildNode == NULL)
	{
		return;
	}

	SetChild(newRightChildNode, binaryNode->rightChildNode);
	SetRightChild(binaryNode, (MultiNode *) newRightChildNode);
}


/* Removes the given unary node from the logical plan, and frees the node. */
static void
RemoveUnaryNode(MultiUnaryNode *unaryNode)
{
	MultiNode *parentNode = ParentNode((MultiNode *) unaryNode);
	MultiNode *childNode = ChildNode(unaryNode);

	/* set parent to directly point to unary node's child */
	ParentSetNewChild(parentNode, (MultiNode *) unaryNode, childNode);

	pfree(unaryNode);
}


/* Pulls up the given current node above its parent node. */
static void
PullUpUnaryNode(MultiUnaryNode *unaryNode)
{
	MultiNode *parentNode = ParentNode((MultiNode *) unaryNode);
	bool unaryParent = UnaryOperator(parentNode);
	bool binaryParent = BinaryOperator(parentNode);

	if (unaryParent)
	{
		/* pulling up a node is the same as pushing down the node's unary parent */
		MultiUnaryNode *unaryParentNode = (MultiUnaryNode *) parentNode;
		PushDownBelowUnaryChild(unaryParentNode, unaryNode);
	}
	else if (binaryParent)
	{
		MultiBinaryNode *binaryParentNode = (MultiBinaryNode *) parentNode;
		MultiNode *parentParentNode = ParentNode((MultiNode *) binaryParentNode);
		MultiNode *childNode = unaryNode->childNode;

		/* make the parent node point to the unary node's child node */
		if (binaryParentNode->leftChildNode == ((MultiNode *) unaryNode))
		{
			SetLeftChild(binaryParentNode, childNode);
		}
		else
		{
			SetRightChild(binaryParentNode, childNode);
		}

		/* make the parent parent node point to the unary node */
		ParentSetNewChild(parentParentNode, parentNode, (MultiNode *) unaryNode);

		/* make the unary node point to the (old) parent node */
		SetChild(unaryNode, parentNode);
	}
}


/*
 * ParentSetNewChild takes in the given parent node, and replaces the parent's
 * old child node with the new child node. The function needs the old child node
 * in case the parent is a binary node and the function needs to determine which
 * side of the parent node the new child node needs to go to.
 */
static void
ParentSetNewChild(MultiNode *parentNode, MultiNode *oldChildNode,
				  MultiNode *newChildNode)
{
	bool unaryParent = UnaryOperator(parentNode);
	bool binaryParent = BinaryOperator(parentNode);

	if (unaryParent)
	{
		MultiUnaryNode *unaryParentNode = (MultiUnaryNode *) parentNode;
		SetChild(unaryParentNode, newChildNode);
	}
	else if (binaryParent)
	{
		MultiBinaryNode *binaryParentNode = (MultiBinaryNode *) parentNode;

		/* determine which side of the parent the old child is on */
		if (binaryParentNode->leftChildNode == oldChildNode)
		{
			SetLeftChild(binaryParentNode, newChildNode);
		}
		else
		{
			SetRightChild(binaryParentNode, newChildNode);
		}
	}
}


/*
 * ApplyExtendedOpNodes replaces the original extended operator node with the
 * master and worker extended operator nodes. The function then pushes down the
 * worker node below the original node's child node. Note that for the push down
 * to apply, the original node's child must be a collect node.
 */
static void
ApplyExtendedOpNodes(MultiExtendedOp *originalNode, MultiExtendedOp *masterNode,
					 MultiExtendedOp *workerNode)
{
	MultiNode *parentNode = ParentNode((MultiNode *) originalNode);
	MultiNode *collectNode = ChildNode((MultiUnaryNode *) originalNode);
	MultiNode *collectChildNode = ChildNode((MultiUnaryNode *) collectNode);

	/* original node's child must be a collect node */
	Assert(CitusIsA(collectNode, MultiCollect));
	Assert(UnaryOperator(parentNode));

	/* swap the original aggregate node with the master extended node */
	SetChild((MultiUnaryNode *) parentNode, (MultiNode *) masterNode);
	SetChild((MultiUnaryNode *) masterNode, (MultiNode *) collectNode);

	/* add the worker extended node below the collect node */
	SetChild((MultiUnaryNode *) collectNode, (MultiNode *) workerNode);
	SetChild((MultiUnaryNode *) workerNode, (MultiNode *) collectChildNode);

	/* clean up the original extended operator node */
	pfree(originalNode);
}


/*
 * TransformSubqueryNode splits the extended operator node under subquery
 * multi table node into its equivalent master and worker operator nodes, and
 * we transform aggregate functions accordingly for the master and worker
 * operator nodes. We create a partition node based on the first group by
 * column of the extended operator node and set it as the child of the master
 * operator node.
 */
static void
TransformSubqueryNode(MultiTable *subqueryNode,
					  bool subqueryHasNonDistributableAggregates)
{
	if (CoordinatorAggregationStrategy != COORDINATOR_AGGREGATION_DISABLED &&
		HasNonDistributableAggregates((MultiNode *) subqueryNode))
	{
		subqueryHasNonDistributableAggregates = true;
	}

	MultiExtendedOp *extendedOpNode =
		(MultiExtendedOp *) ChildNode((MultiUnaryNode *) subqueryNode);
	MultiNode *collectNode = ChildNode((MultiUnaryNode *) extendedOpNode);
	MultiNode *collectChildNode = ChildNode((MultiUnaryNode *) collectNode);

	ExtendedOpNodeProperties extendedOpNodeProperties =
		BuildExtendedOpNodeProperties(extendedOpNode,
									  subqueryHasNonDistributableAggregates);

	MultiExtendedOp *masterExtendedOpNode =
		MasterExtendedOpNode(extendedOpNode, &extendedOpNodeProperties);
	MultiExtendedOp *workerExtendedOpNode =
		WorkerExtendedOpNode(extendedOpNode, &extendedOpNodeProperties);

	List *groupClauseList = extendedOpNode->groupClauseList;
	List *targetEntryList = extendedOpNode->targetList;
	List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, targetEntryList);
	TargetEntry *groupByTargetEntry = (TargetEntry *) linitial(groupTargetEntryList);
	Expr *groupByExpression = groupByTargetEntry->expr;

	MultiPartition *partitionNode = CitusMakeNode(MultiPartition);

	/*
	 * If group by is on a function expression, then we create a new column from
	 * function expression result type. Because later while creating partition
	 * tasks, we expect a column type to partition intermediate results.
	 * Note that we will only need partition type. So we set column type to
	 * result type of the function expression, and set other fields of column to
	 * default values.
	 */
	if (IsA(groupByExpression, Var))
	{
		partitionNode->partitionColumn = (Var *) groupByExpression;
	}
	else if (IsA(groupByExpression, FuncExpr))
	{
		FuncExpr *functionExpression = (FuncExpr *) groupByExpression;
		Index tableId = 0;
		AttrNumber columnAttributeNumber = InvalidAttrNumber;
		Oid columnType = functionExpression->funcresulttype;
		int32 columnTypeMod = -1;
		Oid columnCollationOid = InvalidOid;
		Index columnLevelSup = 0;

		Var *partitionColumn = makeVar(tableId, columnAttributeNumber, columnType,
									   columnTypeMod, columnCollationOid, columnLevelSup);
		partitionNode->partitionColumn = partitionColumn;
	}
	else
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("cannot run this subquery"),
						errdetail("Currently only columns and function expressions "
								  "are allowed in group by expression of subqueries")));
	}

	SetChild((MultiUnaryNode *) subqueryNode, (MultiNode *) masterExtendedOpNode);
	SetChild((MultiUnaryNode *) masterExtendedOpNode, (MultiNode *) partitionNode);
	SetChild((MultiUnaryNode *) partitionNode, (MultiNode *) collectNode);
	SetChild((MultiUnaryNode *) collectNode, (MultiNode *) workerExtendedOpNode);
	SetChild((MultiUnaryNode *) workerExtendedOpNode, (MultiNode *) collectChildNode);
}


/*
 * MasterExtendedOpNode creates the master extended operator node from the given
 * target entries. The function walks over these target entries; and for entries
 * with aggregates in them, this function calls the aggregate expression mutator
 * function.
 *
 * Note that the function logically depends on the worker extended operator node
 * function. If the target entry does not contain aggregate functions, we assume
 * all work is done on the worker side, and create a column that references the
 * worker nodes' results.
 */
static MultiExtendedOp *
MasterExtendedOpNode(MultiExtendedOp *originalOpNode,
					 ExtendedOpNodeProperties *extendedOpNodeProperties)
{
	List *targetEntryList = originalOpNode->targetList;
	List *newTargetEntryList = NIL;
	List *newGroupClauseList = NIL;
	Node *originalHavingQual = originalOpNode->havingQual;
	Node *newHavingQual = NULL;
	MasterAggregateWalkerContext walkerContext = {
		.extendedOpNodeProperties = extendedOpNodeProperties,
		.columnId = 1,
	};

	/* iterate over original target entries */
	TargetEntry *originalTargetEntry = NULL;
	foreach_ptr(originalTargetEntry, targetEntryList)
	{
		TargetEntry *newTargetEntry = flatCopyTargetEntry(originalTargetEntry);
		Expr *originalExpression = originalTargetEntry->expr;
		Expr *newExpression = NULL;

		if (CanPushDownExpression((Node *) originalExpression, extendedOpNodeProperties))
		{
			/*
			 * The expression was entirely pushed down to worker.
			 * We simply make it reference the output generated by worker nodes.
			 */
			Var *column = makeVarFromTargetEntry(masterTableId, originalTargetEntry);
			column->varattno = walkerContext.columnId;
			column->varattnosyn = walkerContext.columnId;
			walkerContext.columnId++;

			if (column->vartype == RECORDOID || column->vartype == RECORDARRAYOID)
			{
				column->vartypmod = BlessRecordExpression(originalTargetEntry->expr);
			}

			newExpression = (Expr *) column;
		}
		else
		{
			Node *newNode = MasterAggregateMutator((Node *) originalExpression,
												   &walkerContext);
			newExpression = (Expr *) newNode;
		}

		newTargetEntry->expr = newExpression;
		newTargetEntryList = lappend(newTargetEntryList, newTargetEntry);
	}

	if (!extendedOpNodeProperties->pushDownGroupingAndHaving)
	{
		/*
		 * Not pushing down GROUP BY, need to regroup on coordinator
		 * and apply having on the coordinator.
		 */
		newGroupClauseList = originalOpNode->groupClauseList;

		if (originalHavingQual != NULL)
		{
			newHavingQual = MasterAggregateMutator(originalHavingQual, &walkerContext);
			if (IsA(newHavingQual, List))
			{
				/*
				 * unflatten having qual to allow standard planner to work when transforming
				 * the master query to a plan
				 */
				newHavingQual = (Node *) make_ands_explicit(
					castNode(List, newHavingQual));
			}
		}
	}

	MultiExtendedOp *masterExtendedOpNode = CitusMakeNode(MultiExtendedOp);
	masterExtendedOpNode->targetList = newTargetEntryList;
	masterExtendedOpNode->groupClauseList = newGroupClauseList;
	masterExtendedOpNode->sortClauseList = originalOpNode->sortClauseList;
	masterExtendedOpNode->distinctClause = originalOpNode->distinctClause;
	masterExtendedOpNode->hasDistinctOn = originalOpNode->hasDistinctOn;
	masterExtendedOpNode->limitCount = originalOpNode->limitCount;
	masterExtendedOpNode->limitOffset = originalOpNode->limitOffset;
	masterExtendedOpNode->limitOption = originalOpNode->limitOption;
	masterExtendedOpNode->havingQual = newHavingQual;

	if (!extendedOpNodeProperties->onlyPushableWindowFunctions)
	{
		masterExtendedOpNode->hasWindowFuncs = originalOpNode->hasWindowFuncs;
		masterExtendedOpNode->windowClause = originalOpNode->windowClause;
		masterExtendedOpNode->onlyPushableWindowFunctions = false;
	}

	return masterExtendedOpNode;
}


/*
 * MasterAggregateMutator walks over the original target entry expression, and
 * creates the new expression tree to execute on the master node. The function
 * transforms aggregates, and copies columns; and recurses into the expression
 * mutator function for all other expression types.
 *
 * Please note that the recursive mutator function traverses the expression tree
 * in depth first order. For this function to set attribute numbers correctly,
 * WorkerAggregateWalker() *must* walk over the expression tree in the same
 * depth first order.
 */
static Node *
MasterAggregateMutator(Node *originalNode, MasterAggregateWalkerContext *walkerContext)
{
	Node *newNode = NULL;
	if (originalNode == NULL)
	{
		return NULL;
	}

	if (IsA(originalNode, Aggref))
	{
		Aggref *originalAggregate = (Aggref *) originalNode;
		if (CanPushDownExpression(originalNode,
								  walkerContext->extendedOpNodeProperties))
		{
			/*
			 * The expression was entirely pushed down to worker.
			 * We simply make it reference the output generated by worker nodes.
			 */
			Var *column = makeVar(masterTableId, walkerContext->columnId,
								  originalAggregate->aggtype,
								  -1, originalAggregate->aggcollid, 0);
			walkerContext->columnId++;

			if (column->vartype == RECORDOID || column->vartype == RECORDARRAYOID)
			{
				column->vartypmod = BlessRecordExpression((Expr *) originalNode);
			}

			newNode = (Node *) column;
		}
		else
		{
			Expr *newExpression = MasterAggregateExpression(originalAggregate,
															walkerContext);

			newNode = (Node *) newExpression;
		}
	}
	else if (IsA(originalNode, Var))
	{
		Var *origColumn = (Var *) originalNode;
		Var *newColumn = makeVar(masterTableId, walkerContext->columnId,
								 origColumn->vartype, origColumn->vartypmod,
								 origColumn->varcollid, origColumn->varlevelsup);
		walkerContext->columnId++;

		newNode = (Node *) newColumn;
	}
	else
	{
		newNode = expression_tree_mutator(originalNode, MasterAggregateMutator,
										  (void *) walkerContext);
	}

	return newNode;
}


/*
 * MasterAggregateExpression creates the master aggregate expression using the
 * original aggregate and aggregate's type information. This function handles
 * the average, count, array_agg, hll and topn aggregates separately due to
 * differences in these aggregate functions' transformations.
 *
 * Note that this function has implicit knowledge of the transformations applied
 * for worker nodes on the original aggregate. The function uses this implicit
 * knowledge to create the appropriate master function with correct data types.
 */
static Expr *
MasterAggregateExpression(Aggref *originalAggregate,
						  MasterAggregateWalkerContext *walkerContext)
{
	const Index columnLevelsUp = 0;  /* normal column */
	const AttrNumber argumentId = 1; /* our aggregates have single arguments */
	AggregateType aggregateType = GetAggregateType(originalAggregate);
	Expr *newMasterExpression = NULL;

	if (walkerContext->extendedOpNodeProperties->pullUpIntermediateRows)
	{
		Aggref *aggregate = (Aggref *) copyObject(originalAggregate);

		TargetEntry *targetEntry;
		foreach_ptr(targetEntry, aggregate->args)
		{
			targetEntry->expr = (Expr *)
								makeVar(masterTableId, walkerContext->columnId,
										exprType((Node *) targetEntry->expr),
										exprTypmod((Node *) targetEntry->expr),
										exprCollation((Node *) targetEntry->expr),
										columnLevelsUp);
			walkerContext->columnId++;
		}

		aggregate->aggdirectargs = NIL;
		Expr *directarg;
		foreach_ptr(directarg, originalAggregate->aggdirectargs)
		{
			/*
			 * Need to replace nodes that contain any Vars with Vars referring
			 * to the related column of the result set returned for the worker
			 * aggregation.
			 *
			 * When there are no Vars, then the expression can be fully evaluated
			 * on the coordinator, so we skip it here. This is not just an
			 * optimization, but the result of the expression might require
			 * calling the final function of the aggregate, and doing so when
			 * there are no input rows (i.e.: with an empty tuple slot) is not
			 * desirable for the node-executor methods.
			 */
			if (pull_var_clause_default((Node *) directarg) != NIL)
			{
				Var *var = makeVar(masterTableId, walkerContext->columnId,
								   exprType((Node *) directarg),
								   exprTypmod((Node *) directarg),
								   exprCollation((Node *) directarg),
								   columnLevelsUp);
				aggregate->aggdirectargs = lappend(aggregate->aggdirectargs, var);
				walkerContext->columnId++;
			}
			else
			{
				aggregate->aggdirectargs = lappend(aggregate->aggdirectargs, directarg);
			}
		}

		if (aggregate->aggfilter)
		{
			aggregate->aggfilter = (Expr *)
								   makeVar(masterTableId, walkerContext->columnId,
										   BOOLOID, -1, InvalidOid, columnLevelsUp);
			walkerContext->columnId++;
		}

		newMasterExpression = (Expr *) aggregate;
	}
	else if (aggregateType == AGGREGATE_COUNT && originalAggregate->aggdistinct &&
			 CountDistinctErrorRate == DISABLE_DISTINCT_APPROXIMATION &&
			 walkerContext->extendedOpNodeProperties->pullDistinctColumns)
	{
		Aggref *aggregate = (Aggref *) copyObject(originalAggregate);
		List *varList = pull_var_clause_default((Node *) aggregate);
		List *uniqueVarList = NIL;
		int startColumnCount = walkerContext->columnId;

		/* determine unique vars that were placed in target list by worker */
		Var *column = NULL;
		foreach_ptr(column, varList)
		{
			uniqueVarList = list_append_unique(uniqueVarList, copyObject(column));
		}

		/*
		 * Go over each var inside aggregate and update their varattno's according to
		 * worker query target entry column index.
		 */
		Var *columnToUpdate = NULL;
		foreach_ptr(columnToUpdate, varList)
		{
			int columnIndex = 0;

			Var *currentVar = NULL;
			foreach_ptr(currentVar, uniqueVarList)
			{
				if (equal(columnToUpdate, currentVar))
				{
					break;
				}
				columnIndex++;
			}

			columnToUpdate->varno = masterTableId;
			columnToUpdate->varnosyn = masterTableId;
			columnToUpdate->varattno = startColumnCount + columnIndex;
			columnToUpdate->varattnosyn = startColumnCount + columnIndex;
		}

		/* we added that many columns */
		walkerContext->columnId += list_length(uniqueVarList);

		newMasterExpression = (Expr *) aggregate;
	}
	else if (aggregateType == AGGREGATE_COUNT && originalAggregate->aggdistinct &&
			 CountDistinctErrorRate != DISABLE_DISTINCT_APPROXIMATION)
	{
		/*
		 * If enabled, we check for count(distinct) approximations before count
		 * distincts. For this, we first compute hll_add_agg(hll_hash(column)) on
		 * worker nodes, and get hll values. We then gather hlls on the master
		 * node, and compute hll_cardinality(hll_union_agg(hll)).
		 */
		const int argCount = 1;
		const int defaultTypeMod = -1;


		/* extract schema name of hll */
		Oid hllId = get_extension_oid(HLL_EXTENSION_NAME, false);
		Oid hllSchemaOid = get_extension_schema(hllId);
		const char *hllSchemaName = get_namespace_name(hllSchemaOid);

		Oid unionFunctionId = FunctionOid(hllSchemaName, HLL_UNION_AGGREGATE_NAME,
										  argCount);
		Oid cardinalityFunctionId = FunctionOid(hllSchemaName, HLL_CARDINALITY_FUNC_NAME,
												argCount);
		Oid cardinalityReturnType = get_func_rettype(cardinalityFunctionId);

		Oid hllType = TypeOid(hllSchemaOid, HLL_TYPE_NAME);
		Oid hllTypeCollationId = get_typcollation(hllType);
		Var *hllColumn = makeVar(masterTableId, walkerContext->columnId, hllType,
								 defaultTypeMod,
								 hllTypeCollationId, columnLevelsUp);
		walkerContext->columnId++;

		TargetEntry *hllTargetEntry = makeTargetEntry((Expr *) hllColumn, argumentId,
													  NULL, false);

		Aggref *unionAggregate = makeNode(Aggref);
		unionAggregate->aggfnoid = unionFunctionId;
		unionAggregate->aggtype = hllType;
		unionAggregate->args = list_make1(hllTargetEntry);
		unionAggregate->aggkind = AGGKIND_NORMAL;
		unionAggregate->aggfilter = NULL;
		unionAggregate->aggtranstype = InvalidOid;
		unionAggregate->aggargtypes = list_make1_oid(unionAggregate->aggtype);
		unionAggregate->aggsplit = AGGSPLIT_SIMPLE;

		FuncExpr *cardinalityExpression = makeNode(FuncExpr);
		cardinalityExpression->funcid = cardinalityFunctionId;
		cardinalityExpression->funcresulttype = cardinalityReturnType;
		cardinalityExpression->args = list_make1(unionAggregate);

		newMasterExpression = (Expr *) cardinalityExpression;
	}
	else if (aggregateType == AGGREGATE_AVERAGE)
	{
		/*
		 * If the original aggregate is an average, we first compute sum(colum)
		 * and count(column) on worker nodes. Then, we compute (sum(sum(column))
		 * / sum(count(column))) on the master node.
		 */
		const char *sumAggregateName = AggregateNames[AGGREGATE_SUM];
		const char *countAggregateName = AggregateNames[AGGREGATE_COUNT];

		Oid argumentType = AggregateArgumentType(originalAggregate);

		Oid sumFunctionId = AggregateFunctionOid(sumAggregateName, argumentType);
		Oid countFunctionId = AggregateFunctionOid(countAggregateName, ANYOID);

		/* calculate the aggregate types that worker nodes are going to return */
		Oid workerSumReturnType = get_func_rettype(sumFunctionId);
		Oid workerCountReturnType = get_func_rettype(countFunctionId);

		/* create the expression sum(sum(column) / sum(count(column))) */
		newMasterExpression = MasterAverageExpression(workerSumReturnType,
													  workerCountReturnType,
													  &(walkerContext->columnId));
	}
	else if (aggregateType == AGGREGATE_COUNT)
	{
		/*
		 * Count aggregates are handled in two steps. First, worker nodes report
		 * their count results. Then, the master node sums up these results.
		 */

		/* worker aggregate and original aggregate have the same return type */
		Oid workerReturnType = exprType((Node *) originalAggregate);
		int32 workerReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid workerCollationId = exprCollation((Node *) originalAggregate);

		const char *sumAggregateName = AggregateNames[AGGREGATE_SUM];
		Oid sumFunctionId = AggregateFunctionOid(sumAggregateName, workerReturnType);
		Oid masterReturnType = get_func_rettype(sumFunctionId);

		Aggref *newMasterAggregate = copyObject(originalAggregate);
		newMasterAggregate->aggstar = false;
		newMasterAggregate->aggdistinct = NULL;
		newMasterAggregate->aggfnoid = sumFunctionId;
		newMasterAggregate->aggtype = masterReturnType;
		newMasterAggregate->aggfilter = NULL;
		newMasterAggregate->aggtranstype = InvalidOid;
		newMasterAggregate->aggargtypes = list_make1_oid(newMasterAggregate->aggtype);
		newMasterAggregate->aggsplit = AGGSPLIT_SIMPLE;

		Var *column = makeVar(masterTableId, walkerContext->columnId, workerReturnType,
							  workerReturnTypeMod, workerCollationId, columnLevelsUp);
		walkerContext->columnId++;

		/* aggref expects its arguments to be wrapped in target entries */
		TargetEntry *columnTargetEntry = makeTargetEntry((Expr *) column, argumentId,
														 NULL, false);
		newMasterAggregate->args = list_make1(columnTargetEntry);

		/* cast numeric sum result to bigint (count's return type) */
		CoerceViaIO *coerceExpr = makeNode(CoerceViaIO);
		coerceExpr->arg = (Expr *) newMasterAggregate;
		coerceExpr->resulttype = INT8OID;
		coerceExpr->resultcollid = InvalidOid;
		coerceExpr->coerceformat = COERCE_IMPLICIT_CAST;
		coerceExpr->location = -1;

		/* convert NULL to 0 in case of no rows */
		Const *zeroConst = MakeIntegerConstInt64(0);
		List *coalesceArgs = list_make2(coerceExpr, zeroConst);

		CoalesceExpr *coalesceExpr = makeNode(CoalesceExpr);
		coalesceExpr->coalescetype = INT8OID;
		coalesceExpr->coalescecollid = InvalidOid;
		coalesceExpr->args = coalesceArgs;
		coalesceExpr->location = -1;

		newMasterExpression = (Expr *) coalesceExpr;
	}
	else if (aggregateType == AGGREGATE_ARRAY_AGG ||
			 aggregateType == AGGREGATE_JSONB_AGG ||
			 aggregateType == AGGREGATE_JSONB_OBJECT_AGG ||
			 aggregateType == AGGREGATE_JSON_AGG ||
			 aggregateType == AGGREGATE_JSON_OBJECT_AGG)
	{
		/*
		 * Array and json aggregates are handled in two steps. First, we compute
		 * array_agg() or json aggregate on the worker nodes. Then, we gather
		 * the arrays or jsons on the master and compute the array_cat_agg()
		 * or jsonb_cat_agg() aggregate on them to get the final array or json.
		 */
		const char *catAggregateName = NULL;
		Oid catInputType = InvalidOid;

		/* worker aggregate and original aggregate have same return type */
		Oid workerReturnType = exprType((Node *) originalAggregate);
		int32 workerReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid workerCollationId = exprCollation((Node *) originalAggregate);

		/* assert that we do not support array or json aggregation with
		 * distinct or order by */
		Assert(!originalAggregate->aggorder);
		Assert(!originalAggregate->aggdistinct);

		if (aggregateType == AGGREGATE_ARRAY_AGG)
		{
			/* array_cat_agg() takes anyarray as input */
			catAggregateName = ARRAY_CAT_AGGREGATE_NAME;
			catInputType = ANYCOMPATIBLEARRAYOID;
		}
		else if (aggregateType == AGGREGATE_JSONB_AGG ||
				 aggregateType == AGGREGATE_JSONB_OBJECT_AGG)
		{
			/* jsonb_cat_agg() takes jsonb as input */
			catAggregateName = JSONB_CAT_AGGREGATE_NAME;
			catInputType = JSONBOID;
		}
		else
		{
			/* json_cat_agg() takes json as input */
			catAggregateName = JSON_CAT_AGGREGATE_NAME;
			catInputType = JSONOID;
		}

		Assert(catAggregateName != NULL);
		Assert(catInputType != InvalidOid);

		Oid aggregateFunctionId = AggregateFunctionOid(catAggregateName,
													   catInputType);

		/* create argument for the array_cat_agg() or jsonb_cat_agg() aggregate */
		Var *column = makeVar(masterTableId, walkerContext->columnId, workerReturnType,
							  workerReturnTypeMod, workerCollationId, columnLevelsUp);
		TargetEntry *catAggArgument = makeTargetEntry((Expr *) column, argumentId, NULL,
													  false);
		walkerContext->columnId++;

		/* construct the master array_cat_agg() or jsonb_cat_agg() expression */
		Aggref *newMasterAggregate = copyObject(originalAggregate);
		newMasterAggregate->aggfnoid = aggregateFunctionId;
		newMasterAggregate->args = list_make1(catAggArgument);
		newMasterAggregate->aggfilter = NULL;
		newMasterAggregate->aggtranstype = InvalidOid;

		if (aggregateType == AGGREGATE_ARRAY_AGG)
		{
			/*
			 * Postgres expects the type of the array here such as INT4ARRAYOID.
			 * Hence we set it to workerReturnType. If we set this to
			 * ANYCOMPATIBLEARRAYOID then we will get the following error:
			 * "argument declared anycompatiblearray is not an array but type anycompatiblearray"
			 */
			newMasterAggregate->aggargtypes = list_make1_oid(workerReturnType);
		}
		else
		{
			newMasterAggregate->aggargtypes = list_make1_oid(ANYARRAYOID);
		}
		newMasterAggregate->aggsplit = AGGSPLIT_SIMPLE;

		newMasterExpression = (Expr *) newMasterAggregate;
	}
	else if (aggregateType == AGGREGATE_HLL_ADD ||
			 aggregateType == AGGREGATE_HLL_UNION)
	{
		/*
		 * If hll aggregates are called, we simply create the hll_union_aggregate
		 * to apply in the master after running the original aggregate in
		 * workers.
		 */

		Oid hllType = exprType((Node *) originalAggregate);
		Oid unionFunctionId = AggregateFunctionOid(HLL_UNION_AGGREGATE_NAME, hllType);
		int32 hllReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid hllTypeCollationId = exprCollation((Node *) originalAggregate);

		Var *hllColumn = makeVar(masterTableId, walkerContext->columnId, hllType,
								 hllReturnTypeMod, hllTypeCollationId, columnLevelsUp);
		walkerContext->columnId++;

		TargetEntry *hllTargetEntry = makeTargetEntry((Expr *) hllColumn, argumentId,
													  NULL, false);

		Aggref *unionAggregate = makeNode(Aggref);
		unionAggregate->aggfnoid = unionFunctionId;
		unionAggregate->aggtype = hllType;
		unionAggregate->args = list_make1(hllTargetEntry);
		unionAggregate->aggkind = AGGKIND_NORMAL;
		unionAggregate->aggfilter = NULL;
		unionAggregate->aggtranstype = InvalidOid;
		unionAggregate->aggargtypes = list_make1_oid(hllType);
		unionAggregate->aggsplit = AGGSPLIT_SIMPLE;

		newMasterExpression = (Expr *) unionAggregate;
	}
	else if (aggregateType == AGGREGATE_TOPN_UNION_AGG ||
			 aggregateType == AGGREGATE_TOPN_ADD_AGG)
	{
		/*
		 * Top-N aggregates are handled in two steps. First, we compute
		 * topn_add_agg() or topn_union_agg() aggregates on the worker nodes.
		 * Then, we gather the Top-Ns on the master and take the union of all
		 * to get the final topn.
		 */

		/* worker aggregate and original aggregate have same return type */
		Oid topnType = exprType((Node *) originalAggregate);
		Oid unionFunctionId = AggregateFunctionOid(TOPN_UNION_AGGREGATE_NAME,
												   topnType);
		int32 topnReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid topnTypeCollationId = exprCollation((Node *) originalAggregate);

		/* create argument for the topn_union_agg() aggregate */
		Var *topnColumn = makeVar(masterTableId, walkerContext->columnId, topnType,
								  topnReturnTypeMod, topnTypeCollationId, columnLevelsUp);
		walkerContext->columnId++;

		TargetEntry *topNTargetEntry = makeTargetEntry((Expr *) topnColumn, argumentId,
													   NULL, false);

		/* construct the master topn_union_agg() expression */
		Aggref *unionAggregate = makeNode(Aggref);
		unionAggregate->aggfnoid = unionFunctionId;
		unionAggregate->aggtype = topnType;
		unionAggregate->args = list_make1(topNTargetEntry);
		unionAggregate->aggkind = AGGKIND_NORMAL;
		unionAggregate->aggfilter = NULL;
		unionAggregate->aggtranstype = InvalidOid;
		unionAggregate->aggargtypes = list_make1_oid(topnType);
		unionAggregate->aggsplit = AGGSPLIT_SIMPLE;

		newMasterExpression = (Expr *) unionAggregate;
	}
	else if (aggregateType == AGGREGATE_TDIGEST_COMBINE ||
			 aggregateType == AGGREGATE_TDIGEST_ADD_DOUBLE)
	{
		/* tdigest of column */
		Oid tdigestType = TDigestExtensionTypeOid(); /* tdigest type */
		Oid unionFunctionId = TDigestExtensionAggTDigest1();

		int32 tdigestReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid tdigestTypeCollationId = exprCollation((Node *) originalAggregate);

		/* create first argument for tdigest_precentile(tdigest, double) */
		Var *tdigestColumn = makeVar(masterTableId, walkerContext->columnId, tdigestType,
									 tdigestReturnTypeMod, tdigestTypeCollationId,
									 columnLevelsUp);
		TargetEntry *tdigestTargetEntry = makeTargetEntry((Expr *) tdigestColumn,
														  argumentId,
														  NULL, false);
		walkerContext->columnId++;

		/* construct the master tdigest(tdigest) expression */
		Aggref *unionAggregate = makeNode(Aggref);
		unionAggregate->aggfnoid = unionFunctionId;
		unionAggregate->aggtype = originalAggregate->aggtype;
		unionAggregate->args = list_make1(tdigestTargetEntry);
		unionAggregate->aggkind = AGGKIND_NORMAL;
		unionAggregate->aggfilter = NULL;
		unionAggregate->aggtranstype = InvalidOid;
		unionAggregate->aggargtypes = list_make1_oid(tdigestType);
		unionAggregate->aggsplit = AGGSPLIT_SIMPLE;

		newMasterExpression = (Expr *) unionAggregate;
	}
	else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLEARRAY ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLEARRAY)
	{
		/* tdigest of column */
		Oid tdigestType = TDigestExtensionTypeOid();
		Oid unionFunctionId = InvalidOid;
		if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLE)
		{
			unionFunctionId = TDigestExtensionAggTDigestPercentile2();
		}
		else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLEARRAY)
		{
			unionFunctionId = TDigestExtensionAggTDigestPercentile2a();
		}
		else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLE)
		{
			unionFunctionId = TDigestExtensionAggTDigestPercentileOf2();
		}
		else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLEARRAY)
		{
			unionFunctionId = TDigestExtensionAggTDigestPercentileOf2a();
		}
		Assert(OidIsValid(unionFunctionId));

		int32 tdigestReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid tdigestTypeCollationId = exprCollation((Node *) originalAggregate);

		/* create first argument for tdigest_precentile(tdigest, double) */
		Var *tdigestColumn = makeVar(masterTableId, walkerContext->columnId, tdigestType,
									 tdigestReturnTypeMod, tdigestTypeCollationId,
									 columnLevelsUp);
		TargetEntry *tdigestTargetEntry = makeTargetEntry((Expr *) tdigestColumn,
														  argumentId, NULL, false);
		walkerContext->columnId++;

		/* construct the master tdigest_precentile(tdigest, double) expression */
		Aggref *unionAggregate = makeNode(Aggref);
		unionAggregate->aggfnoid = unionFunctionId;
		unionAggregate->aggtype = originalAggregate->aggtype;
		unionAggregate->args = list_make2(
			tdigestTargetEntry,
			list_nth(originalAggregate->args, 2));
		unionAggregate->aggkind = AGGKIND_NORMAL;
		unionAggregate->aggfilter = NULL;
		unionAggregate->aggtranstype = InvalidOid;
		unionAggregate->aggargtypes = list_make2_oid(
			tdigestType,
			list_nth_oid(originalAggregate->aggargtypes, 2));
		unionAggregate->aggsplit = AGGSPLIT_SIMPLE;

		newMasterExpression = (Expr *) unionAggregate;
	}
	else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_TDIGEST_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_TDIGEST_DOUBLEARRAY ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_TDIGEST_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_TDIGEST_DOUBLEARRAY)
	{
		/* tdigest of column */
		Oid tdigestType = TDigestExtensionTypeOid();

		/* These functions already will combine the tdigest arguments returned */
		Oid unionFunctionId = originalAggregate->aggfnoid;

		int32 tdigestReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid tdigestTypeCollationId = exprCollation((Node *) originalAggregate);

		/* create first argument for tdigest_precentile(tdigest, double) */
		Var *tdigestColumn = makeVar(masterTableId, walkerContext->columnId, tdigestType,
									 tdigestReturnTypeMod, tdigestTypeCollationId,
									 columnLevelsUp);
		TargetEntry *tdigestTargetEntry = makeTargetEntry((Expr *) tdigestColumn,
														  argumentId, NULL, false);
		walkerContext->columnId++;

		/* construct the master tdigest_precentile(tdigest, double) expression */
		Aggref *unionAggregate = makeNode(Aggref);
		unionAggregate->aggfnoid = unionFunctionId;
		unionAggregate->aggtype = originalAggregate->aggtype;
		unionAggregate->args = list_make2(
			tdigestTargetEntry,
			list_nth(originalAggregate->args, 1));
		unionAggregate->aggkind = AGGKIND_NORMAL;
		unionAggregate->aggfilter = NULL;
		unionAggregate->aggtranstype = InvalidOid;
		unionAggregate->aggargtypes = list_make2_oid(
			tdigestType,
			list_nth_oid(originalAggregate->aggargtypes, 1));
		unionAggregate->aggsplit = AGGSPLIT_SIMPLE;

		newMasterExpression = (Expr *) unionAggregate;
	}
	else if (aggregateType == AGGREGATE_CUSTOM_COMBINE)
	{
		HeapTuple aggTuple =
			SearchSysCache1(AGGFNOID, ObjectIdGetDatum(originalAggregate->aggfnoid));
		Form_pg_aggregate aggform;
		Oid combine;

		if (!HeapTupleIsValid(aggTuple))
		{
			elog(ERROR, "citus cache lookup failed for aggregate %u",
				 originalAggregate->aggfnoid);
			return NULL;
		}
		else
		{
			aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
			combine = aggform->aggcombinefn;
			ReleaseSysCache(aggTuple);
		}

		if (combine != InvalidOid)
		{
			Oid coordCombineId = CoordCombineAggOid();
			Oid workerReturnType = CSTRINGOID;
			int32 workerReturnTypeMod = -1;
			Oid workerCollationId = InvalidOid;
			Oid resultType = exprType((Node *) originalAggregate);

			Const *aggOidParam = makeConst(OIDOID, -1, InvalidOid, sizeof(Oid),
										   ObjectIdGetDatum(originalAggregate->aggfnoid),
										   false, true);
			Var *column = makeVar(masterTableId, walkerContext->columnId,
								  workerReturnType,
								  workerReturnTypeMod, workerCollationId, columnLevelsUp);
			walkerContext->columnId++;
			Const *nullTag = makeNullConst(resultType, -1, InvalidOid);

			List *aggArguments =
				list_make3(makeTargetEntry((Expr *) aggOidParam, 1, NULL, false),
						   makeTargetEntry((Expr *) column, 2, NULL, false),
						   makeTargetEntry((Expr *) nullTag, 3, NULL, false));

			/* coord_combine_agg(agg, workercol) */
			Aggref *newMasterAggregate = makeNode(Aggref);
			newMasterAggregate->aggfnoid = coordCombineId;
			newMasterAggregate->aggtype = originalAggregate->aggtype;
			newMasterAggregate->args = aggArguments;
			newMasterAggregate->aggkind = AGGKIND_NORMAL;
			newMasterAggregate->aggfilter = NULL;
			newMasterAggregate->aggtranstype = INTERNALOID;
			newMasterAggregate->aggargtypes = list_make3_oid(OIDOID, CSTRINGOID,
															 resultType);
			newMasterAggregate->aggsplit = AGGSPLIT_SIMPLE;

			newMasterExpression = (Expr *) newMasterAggregate;
		}
		else
		{
			elog(ERROR, "Aggregate lacks COMBINEFUNC");
		}
	}
	else
	{
		/*
		 * All other aggregates are handled as they are. These include sum, min,
		 * and max.
		 */

		/* worker aggregate and original aggregate have the same return type */
		Oid workerReturnType = exprType((Node *) originalAggregate);
		int32 workerReturnTypeMod = exprTypmod((Node *) originalAggregate);
		Oid workerCollationId = exprCollation((Node *) originalAggregate);

		const char *aggregateName = AggregateNames[aggregateType];
		Oid aggregateFunctionId = AggregateFunctionOid(aggregateName, workerReturnType);
		Oid masterReturnType = get_func_rettype(aggregateFunctionId);

		Aggref *newMasterAggregate = copyObject(originalAggregate);
		newMasterAggregate->aggdistinct = NULL;
		newMasterAggregate->aggfnoid = aggregateFunctionId;
		newMasterAggregate->aggtype = masterReturnType;
		newMasterAggregate->aggfilter = NULL;

		/*
		 * If return type aggregate is anyelement, its actual return type is
		 * determined on the type of its argument. So we replace it with the
		 * argument type in that case.
		 */
		if (masterReturnType == ANYELEMENTOID)
		{
			newMasterAggregate->aggtype = workerReturnType;

			Expr *firstArg = FirstAggregateArgument(originalAggregate);
			newMasterAggregate->aggcollid = exprCollation((Node *) firstArg);
		}

		Var *column = makeVar(masterTableId, walkerContext->columnId, workerReturnType,
							  workerReturnTypeMod, workerCollationId, columnLevelsUp);
		walkerContext->columnId++;

		/* aggref expects its arguments to be wrapped in target entries */
		TargetEntry *columnTargetEntry = makeTargetEntry((Expr *) column, argumentId,
														 NULL, false);
		newMasterAggregate->args = list_make1(columnTargetEntry);

		newMasterExpression = (Expr *) newMasterAggregate;
	}


	/*
	 * Aggregate functions could have changed the return type. If so, we wrap
	 * the new expression with a conversion function to make it have the same
	 * type as the original aggregate. We need this since functions like sorting
	 * and grouping have already been chosen based on the original type.
	 */
	Expr *typeConvertedExpression = AddTypeConversion((Node *) originalAggregate,
													  (Node *) newMasterExpression);
	if (typeConvertedExpression != NULL)
	{
		newMasterExpression = typeConvertedExpression;
	}

	return newMasterExpression;
}


/*
 * MasterAverageExpression creates an expression of the form (sum(column1) /
 * sum(column2)), where column1 is the sum of the original value, and column2 is
 * the count of that value. This expression allows us to evaluate the average
 * function over distributed data.
 */
static Expr *
MasterAverageExpression(Oid sumAggregateType, Oid countAggregateType,
						AttrNumber *columnId)
{
	const char *sumAggregateName = AggregateNames[AGGREGATE_SUM];
	const int32 defaultTypeMod = -1;
	const Index defaultLevelsUp = 0;
	const AttrNumber argumentId = 1;

	Oid sumTypeCollationId = get_typcollation(sumAggregateType);
	Oid countTypeCollationId = get_typcollation(countAggregateType);

	/* create the first argument for sum(column1) */
	Var *firstColumn = makeVar(masterTableId, (*columnId), sumAggregateType,
							   defaultTypeMod, sumTypeCollationId, defaultLevelsUp);
	TargetEntry *firstTargetEntry = makeTargetEntry((Expr *) firstColumn, argumentId,
													NULL, false);
	(*columnId)++;

	Aggref *firstSum = makeNode(Aggref);
	firstSum->aggfnoid = AggregateFunctionOid(sumAggregateName, sumAggregateType);
	firstSum->aggtype = get_func_rettype(firstSum->aggfnoid);
	firstSum->args = list_make1(firstTargetEntry);
	firstSum->aggkind = AGGKIND_NORMAL;
	firstSum->aggtranstype = InvalidOid;
	firstSum->aggargtypes = list_make1_oid(firstSum->aggtype);
	firstSum->aggsplit = AGGSPLIT_SIMPLE;

	/* create the second argument for sum(column2) */
	Var *secondColumn = makeVar(masterTableId, (*columnId), countAggregateType,
								defaultTypeMod, countTypeCollationId, defaultLevelsUp);
	TargetEntry *secondTargetEntry = makeTargetEntry((Expr *) secondColumn, argumentId,
													 NULL, false);
	(*columnId)++;

	Aggref *secondSum = makeNode(Aggref);
	secondSum->aggfnoid = AggregateFunctionOid(sumAggregateName, countAggregateType);
	secondSum->aggtype = get_func_rettype(secondSum->aggfnoid);
	secondSum->args = list_make1(secondTargetEntry);
	secondSum->aggkind = AGGKIND_NORMAL;
	secondSum->aggtranstype = InvalidOid;
	secondSum->aggargtypes = list_make1_oid(firstSum->aggtype);
	secondSum->aggsplit = AGGSPLIT_SIMPLE;

	/*
	 * Build the division operator between these two aggregates. This function
	 * will convert the types of the aggregates if necessary.
	 */
	List *operatorNameList = list_make1(makeString(DIVISION_OPER_NAME));
	Expr *opExpr = make_op(NULL, operatorNameList, (Node *) firstSum, (Node *) secondSum,
						   NULL,
						   -1);

	return opExpr;
}


/*
 * AddTypeConversion checks if the given expressions generate the same types. If
 * they don't, the function adds a type conversion function on top of the new
 * expression to have it generate the same type as the original aggregate.
 */
static Expr *
AddTypeConversion(Node *originalAggregate, Node *newExpression)
{
	Oid newTypeId = exprType(newExpression);
	Oid originalTypeId = exprType(originalAggregate);
	int32 originalTypeMod = exprTypmod(originalAggregate);

	/* nothing to do if the two types are the same */
	if (originalTypeId == newTypeId)
	{
		return NULL;
	}

	/* otherwise, add a type conversion function */
	Node *typeConvertedExpression = coerce_to_target_type(NULL, newExpression, newTypeId,
														  originalTypeId, originalTypeMod,
														  COERCION_EXPLICIT,
														  COERCE_EXPLICIT_CAST, -1);
	Assert(typeConvertedExpression != NULL);
	return (Expr *) typeConvertedExpression;
}


/*
 * WorkerExtendedOpNode creates the worker extended operator node from the given
 * originalOpNode and extendedOpNodeProperties.
 *
 * For the details of the processing see the comments of the functions that
 * are called from this function.
 */
static MultiExtendedOp *
WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
					 ExtendedOpNodeProperties *extendedOpNodeProperties)
{
	bool distinctPreventsLimitPushdown = false;

	QueryTargetList queryTargetList;
	QueryGroupClause queryGroupClause;
	QueryDistinctClause queryDistinctClause;
	QueryWindowClause queryWindowClause;
	QueryOrderByLimit queryOrderByLimit;
	Node *queryHavingQual = NULL;

	List *originalTargetEntryList = originalOpNode->targetList;
	List *originalGroupClauseList = originalOpNode->groupClauseList;
	List *originalSortClauseList = originalOpNode->sortClauseList;
	Node *originalHavingQual = originalOpNode->havingQual;
	Node *originalLimitCount = originalOpNode->limitCount;
	Node *originalLimitOffset = originalOpNode->limitOffset;
	List *originalWindowClause = originalOpNode->windowClause;
	List *originalDistinctClause = originalOpNode->distinctClause;
	bool hasDistinctOn = originalOpNode->hasDistinctOn;

	int originalGroupClauseLength = list_length(originalGroupClauseList);

	/* initialize to default values */
	memset(&queryTargetList, 0, sizeof(queryTargetList));
	memset(&queryGroupClause, 0, sizeof(queryGroupClause));
	memset(&queryDistinctClause, 0, sizeof(queryDistinctClause));
	memset(&queryWindowClause, 0, sizeof(queryWindowClause));
	memset(&queryOrderByLimit, 0, sizeof(queryOrderByLimit));

	/* calculate the next sort group index based on the original target list */
	Index nextSortGroupRefIndex = GetNextSortGroupRef(originalTargetEntryList);

	/* targetProjectionNumber starts from 1 */
	queryTargetList.targetProjectionNumber = 1;

	if (!extendedOpNodeProperties->pullUpIntermediateRows)
	{
		queryGroupClause.groupClauseList = copyObject(originalGroupClauseList);
	}
	else
	{
		queryGroupClause.groupClauseList = NIL;
	}

	/*
	 * For the purpose of this variable, not pushing down when there are no groups
	 * is pushing down the original grouping, ie the worker's GROUP BY matches
	 * the master's GROUP BY.
	 */
	bool pushingDownOriginalGrouping =
		list_length(queryGroupClause.groupClauseList) == originalGroupClauseLength;

	/*
	 * nextSortGroupRefIndex is used by group by, window and order by clauses.
	 * Thus, we pass a reference to a single nextSortGroupRefIndex and expect
	 * it modified separately while processing those parts of the query.
	 */
	queryGroupClause.nextSortGroupRefIndex = &nextSortGroupRefIndex;
	queryWindowClause.nextSortGroupRefIndex = &nextSortGroupRefIndex;
	queryOrderByLimit.nextSortGroupRefIndex = &nextSortGroupRefIndex;

	/* process each part of the query in order to generate the worker query's parts */
	ProcessTargetListForWorkerQuery(originalTargetEntryList, extendedOpNodeProperties,
									&queryTargetList, &queryGroupClause);

	ProcessHavingClauseForWorkerQuery(originalHavingQual, extendedOpNodeProperties,
									  &queryHavingQual, &queryTargetList,
									  &queryGroupClause);

	/*
	 * Planner optimizations may leave window clauses with hasWindowFuncs as false.
	 * Ignore window clauses in that case.
	 */
	if (extendedOpNodeProperties->hasWindowFuncs)
	{
		if (extendedOpNodeProperties->onlyPushableWindowFunctions)
		{
			ProcessWindowFunctionsForWorkerQuery(originalWindowClause,
												 originalTargetEntryList,
												 &queryWindowClause, &queryTargetList);
		}
		else
		{
			ProcessWindowFunctionPullUpForWorkerQuery(originalWindowClause,
													  &queryTargetList);
		}
	}

	if (ShouldProcessDistinctOrderAndLimitForWorker(extendedOpNodeProperties,
													pushingDownOriginalGrouping,
													originalHavingQual))
	{
		bool queryHasAggregates = TargetListHasAggregates(originalTargetEntryList);

		ProcessDistinctClauseForWorkerQuery(originalDistinctClause, hasDistinctOn,
											queryGroupClause.groupClauseList,
											queryHasAggregates, &queryDistinctClause,
											&distinctPreventsLimitPushdown);

		/*
		 * Order by and limit clauses are relevant to each other, and processing
		 * them together makes it handy for us.
		 *
		 * The other parts of the query might have already prohibited pushing down
		 * LIMIT and ORDER BY clauses as described below:
		 *      (1) Creating a new group by clause during aggregate mutation, or
		 *      (2) Distinct clause is not pushed down
		 */
		bool groupByExtended =
			list_length(queryGroupClause.groupClauseList) > originalGroupClauseLength;
		if (pushingDownOriginalGrouping && !groupByExtended &&
			!distinctPreventsLimitPushdown)
		{
			/* both sort and limit clauses rely on similar information */
			OrderByLimitReference limitOrderByReference =
				BuildOrderByLimitReference(hasDistinctOn,
										   extendedOpNodeProperties->
										   groupedByDisjointPartitionColumn,
										   extendedOpNodeProperties->
										   onlyPushableWindowFunctions,
										   originalGroupClauseList,
										   originalSortClauseList,
										   originalTargetEntryList);

			ProcessLimitOrderByForWorkerQuery(limitOrderByReference, originalLimitCount,
											  originalLimitOffset,
											  originalSortClauseList,
											  originalGroupClauseList,
											  originalTargetEntryList,
											  &queryOrderByLimit,
											  &queryTargetList);
		}
	}

	/* finally, fill the extended op node with the data we gathered */
	MultiExtendedOp *workerExtendedOpNode = CitusMakeNode(MultiExtendedOp);

	workerExtendedOpNode->targetList = queryTargetList.targetEntryList;
	workerExtendedOpNode->groupClauseList = queryGroupClause.groupClauseList;
	workerExtendedOpNode->havingQual = queryHavingQual;
	workerExtendedOpNode->hasDistinctOn = queryDistinctClause.workerHasDistinctOn;
	workerExtendedOpNode->distinctClause = queryDistinctClause.workerDistinctClause;
	workerExtendedOpNode->hasWindowFuncs = queryWindowClause.hasWindowFunctions;
	workerExtendedOpNode->windowClause = queryWindowClause.workerWindowClauseList;
	workerExtendedOpNode->sortClauseList = queryOrderByLimit.workerSortClauseList;
	workerExtendedOpNode->limitCount = queryOrderByLimit.workerLimitCount;

	/*
	 * If the limitCount cannot be pushed down it will be NULL, so the deparser will
	 * ignore the limitOption.
	 */
	workerExtendedOpNode->limitOption = originalOpNode->limitOption;

	return workerExtendedOpNode;
}


/*
 * ProcessTargetListForWorkerQuery gets the inputs and modifies the outputs
 * such that the worker query's target list and group by clauses are extended
 * for the given inputs.
 *
 * The function walks over the input targetEntryList. For the entries
 * with aggregates in them, it calls the recursive aggregate walker function to
 * create aggregates for the worker nodes. For example, the avg() is sent to
 * the worker with two expressions count() and sum(). Thus, a single target entry
 * might end up with multiple expressions in the worker query.
 *
 * The function doesn't change the aggregates in the window functions and sends them
 * as-is. The reason is that Citus only supports pushing down window functions when
 * this is safe to do.
 *
 * The function also handles count distinct operator if it is used in repartition
 * subqueries or on non-partition columns (e.g., cannot be pushed down). Each
 * column in count distinct aggregate is added to target list, and group by
 * list of worker extended operator. This approach guarantees the distinctness
 * in the worker queries.
 *
 *     inputs: targetEntryList, extendedOpNodeProperties
 *     outputs: queryTargetList, queryGroupClause
 */
static void
ProcessTargetListForWorkerQuery(List *targetEntryList,
								ExtendedOpNodeProperties *extendedOpNodeProperties,
								QueryTargetList *queryTargetList,
								QueryGroupClause *queryGroupClause)
{
	WorkerAggregateWalkerContext workerAggContext = {
		.extendedOpNodeProperties = extendedOpNodeProperties,
	};

	/* iterate over original target entries */
	TargetEntry *originalTargetEntry = NULL;
	foreach_ptr(originalTargetEntry, targetEntryList)
	{
		Expr *originalExpression = originalTargetEntry->expr;
		List *newExpressionList = NIL;

		/* reset walker context */
		workerAggContext.expressionList = NIL;
		workerAggContext.createGroupByClause = false;

		/*
		 * If we can push down the expression we copy the expression to the targetlist of the worker query.
		 * Otherwise the expression is processed to be combined on the coordinator.
		 */
		if (CanPushDownExpression((Node *) originalExpression, extendedOpNodeProperties))
		{
			newExpressionList = list_make1(originalExpression);
		}
		else
		{
			WorkerAggregateWalker((Node *) originalExpression, &workerAggContext);

			newExpressionList = workerAggContext.expressionList;
		}

		ExpandWorkerTargetEntry(newExpressionList, originalTargetEntry,
								workerAggContext.createGroupByClause,
								queryTargetList, queryGroupClause);
	}
}


/*
 * ProcessHavingClauseForWorkerQuery gets the inputs and modifies the outputs
 * such that the worker query's target list and group by clauses are extended
 * based on the inputs.
 *
 * The rule is that Citus always applies the HAVING clause on the
 * coordinator. Thus, it pulls the necessary data from the workers. Also, when the
 * having clause is safe to pushdown to the workers, workerHavingQual is set to
 * be the original having clause.
 *
 *     inputs: originalHavingQual, extendedOpNodeProperties
 *     outputs: workerHavingQual, queryTargetList, queryGroupClause
 */
static void
ProcessHavingClauseForWorkerQuery(Node *originalHavingQual,
								  ExtendedOpNodeProperties *extendedOpNodeProperties,
								  Node **workerHavingQual,
								  QueryTargetList *queryTargetList,
								  QueryGroupClause *queryGroupClause)
{
	*workerHavingQual = NULL;

	if (originalHavingQual == NULL)
	{
		return;
	}

	if (extendedOpNodeProperties->pushDownGroupingAndHaving)
	{
		/*
		 * We converted the having expression to a list in subquery pushdown
		 * planner. However, this query cannot be parsed as it is in the worker.
		 * We should convert this back to being explicit for worker query
		 * so that it can be parsed when it hits the standard planner in worker.
		 */
		if (IsA(originalHavingQual, List))
		{
			*workerHavingQual =
				(Node *) make_ands_explicit((List *) originalHavingQual);
		}
		else
		{
			*workerHavingQual = originalHavingQual;
		}
	}
	else
	{
		/*
		 * If the GROUP BY or PARTITION BY is not on the distribution column
		 * then we need to combine the aggregates in the HAVING across shards.
		 */
		WorkerAggregateWalkerContext workerAggContext = {
			.extendedOpNodeProperties = extendedOpNodeProperties,
		};

		WorkerAggregateWalker(originalHavingQual, &workerAggContext);
		List *newExpressionList = workerAggContext.expressionList;
		TargetEntry *targetEntry = NULL;

		ExpandWorkerTargetEntry(newExpressionList, targetEntry,
								workerAggContext.createGroupByClause,
								queryTargetList, queryGroupClause);
	}
}


/*
 * ProcessDistinctClauseForWorkerQuery gets the inputs and modifies the outputs
 * such that worker query's DISTINCT and DISTINCT ON clauses are set accordingly.
 * Note the function may or may not decide to pushdown the DISTINCT and DISTINCT
 * on clauses based on the inputs.
 *
 * See the detailed comments in the function for the rules of pushing down DISTINCT
 * and DISTINCT ON clauses to the worker queries.
 *
 * The function also sets distinctPreventsLimitPushdown. As the name reveals,
 * distinct could prevent pushing down LIMIT clauses later in the planning.
 * For the details, see the comments in the function.
 *
 *     inputs: distinctClause, hasDistinctOn, groupClauseList, queryHasAggregates
 *     outputs: queryDistinctClause, distinctPreventsLimitPushdown
 *
 */
static void
ProcessDistinctClauseForWorkerQuery(List *distinctClause, bool hasDistinctOn,
									List *groupClauseList,
									bool queryHasAggregates,
									QueryDistinctClause *queryDistinctClause,
									bool *distinctPreventsLimitPushdown)
{
	*distinctPreventsLimitPushdown = false;

	if (distinctClause == NIL)
	{
		return;
	}

	bool distinctClauseSupersetofGroupClause = false;

	if (groupClauseList == NIL ||
		IsGroupBySubsetOfDistinct(groupClauseList, distinctClause))
	{
		distinctClauseSupersetofGroupClause = true;
	}
	else
	{
		distinctClauseSupersetofGroupClause = false;

		/*
		 * GROUP BY being a subset of DISTINCT guarantees the
		 * distinctness on the workers. Otherwise, pushing down
		 * LIMIT might cause missing the necessary data from
		 * the worker query
		 */
		*distinctPreventsLimitPushdown = true;
	}

	/*
	 * Distinct is pushed down to worker query only if the query does not
	 * contain an aggregate in which master processing might be required to
	 * complete the final result before distinct operation. We also prevent
	 * distinct pushdown if distinct clause is missing some entries that
	 * group by clause has.
	 */
	bool shouldPushdownDistinct = !queryHasAggregates &&
								  distinctClauseSupersetofGroupClause;
	if (shouldPushdownDistinct)
	{
		queryDistinctClause->workerDistinctClause = distinctClause;
		queryDistinctClause->workerHasDistinctOn = hasDistinctOn;
	}
}


/*
 * ProcessWindowFunctionsForWorkerQuery gets the inputs and modifies the outputs such
 * that worker query's workerWindowClauseList is set when the window clauses are safe to
 * pushdown.
 *
 * Note that even though Citus only pushes down the window functions, it may need to
 * modify the target list of the worker query when the window function refers to
 * an avg(). The reason is that any aggregate which is also referred by other
 * target entries would be mutated by Citus. Thus, we add a copy of the same aggregate
 * to the worker target list to make sure that the window function refers to the
 * non-mutated aggregate.
 *
 *     inputs: windowClauseList, originalTargetEntryList
 *     outputs: queryWindowClause, queryTargetList
 *
 */
static void
ProcessWindowFunctionsForWorkerQuery(List *windowClauseList,
									 List *originalTargetEntryList,
									 QueryWindowClause *queryWindowClause,
									 QueryTargetList *queryTargetList)
{
	if (windowClauseList == NIL)
	{
		return;
	}

	queryWindowClause->workerWindowClauseList = windowClauseList;
	queryWindowClause->hasWindowFunctions = true;
}


/* ProcessWindowFunctionPullUpForWorkerQuery pulls up inputs for window functions */
static void
ProcessWindowFunctionPullUpForWorkerQuery(List *windowClause,
										  QueryTargetList *queryTargetList)
{
	if (windowClause != NIL)
	{
		List *columnList = pull_var_clause_default((Node *) windowClause);

		Expr *newExpression = NULL;
		foreach_ptr(newExpression, columnList)
		{
			TargetEntry *newTargetEntry = makeNode(TargetEntry);

			newTargetEntry->expr = newExpression;

			newTargetEntry->resname =
				WorkerColumnName(queryTargetList->targetProjectionNumber);

			/* force resjunk to false as we may need this on the master */
			newTargetEntry->resjunk = false;
			newTargetEntry->resno = queryTargetList->targetProjectionNumber;

			queryTargetList->targetEntryList =
				lappend(queryTargetList->targetEntryList, newTargetEntry);
			queryTargetList->targetProjectionNumber++;
		}
	}
}


/*
 * ProcessLimitOrderByForWorkerQuery gets the inputs and modifies the outputs
 * such that worker query's LIMIT and ORDER BY clauses are set accordingly.
 * Adding entries to ORDER BY might trigger adding new entries to newTargetEntryList.
 * See GenerateNewTargetEntriesForSortClauses() for the details.
 *
 * For the decisions on whether and how to pushdown LIMIT and ORDER BY are documented
 * in the functions that are called from this function.
 *
 *     inputs: sortLimitReference, originalLimitCount, limitOffset,
 *             sortClauseList, groupClauseList, originalTargetList
 *     outputs: queryOrderByLimit, queryTargetList
 */
static void
ProcessLimitOrderByForWorkerQuery(OrderByLimitReference orderByLimitReference,
								  Node *originalLimitCount, Node *limitOffset,
								  List *sortClauseList, List *groupClauseList,
								  List *originalTargetList,
								  QueryOrderByLimit *queryOrderByLimit,
								  QueryTargetList *queryTargetList)
{
	queryOrderByLimit->workerLimitCount =
		WorkerLimitCount(originalLimitCount, limitOffset, orderByLimitReference);

	queryOrderByLimit->workerSortClauseList =
		WorkerSortClauseList(originalLimitCount,
							 groupClauseList,
							 sortClauseList,
							 orderByLimitReference);
}


/*
 * BuildOrderByLimitReference is a helper function that simply builds
 * the necessary information for processing the limit and order by.
 * The return value should be used in a read-only manner.
 */
static OrderByLimitReference
BuildOrderByLimitReference(bool hasDistinctOn, bool groupedByDisjointPartitionColumn,
						   bool onlyPushableWindowFunctions,
						   List *groupClause, List *sortClauseList, List *targetList)
{
	OrderByLimitReference limitOrderByReference;

	limitOrderByReference.groupedByDisjointPartitionColumn =
		groupedByDisjointPartitionColumn;
	limitOrderByReference.onlyPushableWindowFunctions =
		onlyPushableWindowFunctions;
	limitOrderByReference.hasDistinctOn = hasDistinctOn;
	limitOrderByReference.groupClauseIsEmpty = (groupClause == NIL);
	limitOrderByReference.sortClauseIsEmpty = (sortClauseList == NIL);
	limitOrderByReference.canApproximate =
		CanPushDownLimitApproximate(sortClauseList, targetList);
	limitOrderByReference.hasOrderByAggregate =
		HasOrderByAggregate(sortClauseList, targetList);

	return limitOrderByReference;
}


/*
 * TargetListHasAggregates returns true if any of the elements in the
 * target list contain aggregates that are not inside the window functions.
 * This function should not be called if window functions are being pulled up.
 */
bool
TargetListHasAggregates(List *targetEntryList)
{
	TargetEntry *targetEntry = NULL;
	foreach_ptr(targetEntry, targetEntryList)
	{
		Expr *targetExpr = targetEntry->expr;
		bool hasAggregates = contain_aggs_of_level((Node *) targetExpr, 0);
		bool hasWindowFunction = contain_window_function((Node *) targetExpr);

		/*
		 * If the expression uses aggregates inside window function contain agg
		 * clause still returns true. We want to make sure it is not a part of
		 * window function before we proceed.
		 */
		if (hasAggregates && !hasWindowFunction)
		{
			return true;
		}
	}

	return false;
}


/*
 * ExpandWorkerTargetEntry is a utility function which processes the
 * expressions that are intended to be added to the worker target list.
 *
 * In summary, the function gets a list of expressions, converts them to target
 * entries and updates all the necessary fields such that the expression is correctly
 * added to the worker query's target list.
 *
 * Inputs:
 *  - expressionList: The list of expressions that should be added to the worker query's
 *                    target list.
 *  - originalTargetEntry: Target entry that the expressionList generated for. NULL
 *                         if the expressionList is not generated from any target entry.
 *  - addToGroupByClause: True if the expressionList should also be added to the
 *                        worker query's GROUP BY clause.
 */
static void
ExpandWorkerTargetEntry(List *expressionList, TargetEntry *originalTargetEntry,
						bool addToGroupByClause, QueryTargetList *queryTargetList,
						QueryGroupClause *queryGroupClause)
{
	/* now create target entries for each new expression */
	Expr *newExpression = NULL;
	foreach_ptr(newExpression, expressionList)
	{
		/* generate and add the new target entry to the target list */
		TargetEntry *newTargetEntry =
			GenerateWorkerTargetEntry(originalTargetEntry, newExpression,
									  queryTargetList->targetProjectionNumber);
		queryTargetList->targetProjectionNumber++;
		queryTargetList->targetEntryList =
			lappend(queryTargetList->targetEntryList, newTargetEntry);

		/*
		 * Detect new targets of type Var and add it to group clause list.
		 * This case is expected only if the target entry has aggregates and
		 * it is inside a repartitioned subquery. We create group by entry
		 * for each Var in target list. This code does not check if this
		 * Var was already in the target list or in group by clauses.
		 */
		if (IsA(newExpression, Var) && addToGroupByClause)
		{
			AppendTargetEntryToGroupClause(newTargetEntry, queryGroupClause);
		}
	}
}


/*
 * GetNextSortGroupRef gets a target list entry and returns
 * the next ressortgroupref that should be used based on the
 * input target list.
 */
static Index
GetNextSortGroupRef(List *targetEntryList)
{
	Index nextSortGroupRefIndex = 0;

	/* find max of sort group ref index */
	TargetEntry *targetEntry = NULL;
	foreach_ptr(targetEntry, targetEntryList)
	{
		if (targetEntry->ressortgroupref > nextSortGroupRefIndex)
		{
			nextSortGroupRefIndex = targetEntry->ressortgroupref;
		}
	}

	/* next group ref index starts from max group ref index + 1 */
	nextSortGroupRefIndex++;

	return nextSortGroupRefIndex;
}


/*
 * GenerateWorkerTargetEntry is a simple utility function which gets a
 * target entry, an expression and a targetProjectionNumber.
 *
 * The function returns a newly allocated target entry which can be added
 * to the worker's target list.
 */
static TargetEntry *
GenerateWorkerTargetEntry(TargetEntry *targetEntry, Expr *workerExpression,
						  AttrNumber targetProjectionNumber)
{
	TargetEntry *newTargetEntry = NULL;

	/*
	 * If a target entry is already provided, use a copy of
	 * it because some of the callers rely on resorigtbl and
	 * resorigcol.
	 */
	if (targetEntry)
	{
		newTargetEntry = flatCopyTargetEntry(targetEntry);
	}
	else
	{
		newTargetEntry = makeNode(TargetEntry);
	}

	if (newTargetEntry->resname == NULL)
	{
		newTargetEntry->resname = WorkerColumnName(targetProjectionNumber);
	}

	/* we can't generate a target entry without an expression */
	Assert(workerExpression != NULL);

	/* force resjunk to false as we may need this on the master */
	newTargetEntry->expr = workerExpression;
	newTargetEntry->resjunk = false;
	newTargetEntry->resno = targetProjectionNumber;

	return newTargetEntry;
}


/*
 * AppendTargetEntryToGroupClause gets a target entry, pointer to group list
 * and the ressortgroupref index.
 *
 * The function modifies all of the three input such that the target entry is
 * appended to the group clause and the index is incremented by one.
 */
static void
AppendTargetEntryToGroupClause(TargetEntry *targetEntry,
							   QueryGroupClause *queryGroupClause)
{
	Expr *targetExpr PG_USED_FOR_ASSERTS_ONLY = targetEntry->expr;

	/* we currently only support appending Var target entries */
	Assert(IsA(targetExpr, Var));

	Var *targetColumn = (Var *) targetEntry->expr;
	SortGroupClause *groupByClause = CreateSortGroupClause(targetColumn);

	/* the target entry should have an index */
	targetEntry->ressortgroupref = *queryGroupClause->nextSortGroupRefIndex;

	/* the group by clause entry should point to the correct index in the target list */
	groupByClause->tleSortGroupRef = *queryGroupClause->nextSortGroupRefIndex;

	/* update the group by list and the index's value */
	queryGroupClause->groupClauseList =
		lappend(queryGroupClause->groupClauseList, groupByClause);
	(*queryGroupClause->nextSortGroupRefIndex)++;
}


/*
 * WorkerAggregateWalker walks over the original target entry expression, and
 * creates the list of expression trees (potentially more than one) to execute
 * on the worker nodes. The function creates new expressions for aggregates and
 * columns; and recurses into expression_tree_walker() for all other expression
 * types.
 */
static bool
WorkerAggregateWalker(Node *node, WorkerAggregateWalkerContext *walkerContext)
{
	bool walkerResult = false;
	if (node == NULL)
	{
		return false;
	}

	if (IsA(node, Aggref))
	{
		if (CanPushDownExpression(node, walkerContext->extendedOpNodeProperties))
		{
			walkerContext->expressionList = lappend(walkerContext->expressionList,
													node);
		}
		else
		{
			Aggref *originalAggregate = (Aggref *) node;
			List *workerAggregateList = WorkerAggregateExpressionList(originalAggregate,
																	  walkerContext);

			walkerContext->expressionList = list_concat(walkerContext->expressionList,
														workerAggregateList);
		}
	}
	else if (IsA(node, Var))
	{
		Var *originalColumn = (Var *) node;
		walkerContext->expressionList = lappend(walkerContext->expressionList,
												originalColumn);
	}
	else
	{
		walkerResult = expression_tree_walker(node, WorkerAggregateWalker,
											  (void *) walkerContext);
	}

	return walkerResult;
}


/*
 * WorkerAggregateExpressionList takes in the original aggregate function, and
 * determines the transformed aggregate functions to execute on worker nodes.
 * The function then returns these aggregates in a list. It also creates
 * group by clauses for newly added targets to be placed in the extended operator
 * node.
 */
static List *
WorkerAggregateExpressionList(Aggref *originalAggregate,
							  WorkerAggregateWalkerContext *walkerContext)
{
	List *workerAggregateList = NIL;

	if (walkerContext->extendedOpNodeProperties->pullUpIntermediateRows)
	{
		TargetEntry *targetEntry;
		foreach_ptr(targetEntry, originalAggregate->args)
		{
			workerAggregateList = lappend(workerAggregateList, targetEntry->expr);
		}

		Expr *directarg;
		foreach_ptr(directarg, originalAggregate->aggdirectargs)
		{
			/*
			 * The worker aggregation should execute any node that contains any
			 * Var nodes and return the result in the targetlist, so that the
			 * combine query can then fetch the result via remote scan; see
			 * MasterAggregateExpression.
			 */
			if (pull_var_clause_default((Node *) directarg) != NIL)
			{
				workerAggregateList = lappend(workerAggregateList, directarg);
			}
		}

		if (originalAggregate->aggfilter)
		{
			workerAggregateList = lappend(workerAggregateList,
										  originalAggregate->aggfilter);
		}

		return workerAggregateList;
	}

	AggregateType aggregateType = GetAggregateType(originalAggregate);

	if (aggregateType == AGGREGATE_COUNT && originalAggregate->aggdistinct &&
		CountDistinctErrorRate == DISABLE_DISTINCT_APPROXIMATION &&
		walkerContext->extendedOpNodeProperties->pullDistinctColumns)
	{
		Aggref *aggregate = (Aggref *) copyObject(originalAggregate);
		List *columnList = pull_var_clause_default((Node *) aggregate);

		Var *column = NULL;
		foreach_ptr(column, columnList)
		{
			workerAggregateList = list_append_unique(workerAggregateList, column);
		}

		walkerContext->createGroupByClause = true;
	}
	else if (aggregateType == AGGREGATE_COUNT && originalAggregate->aggdistinct &&
			 CountDistinctErrorRate != DISABLE_DISTINCT_APPROXIMATION)
	{
		/*
		 * If the original aggregate is a count(distinct) approximation, we want
		 * to compute hll_add_agg(hll_hash(var), storageSize) on worker nodes.
		 */
		const AttrNumber firstArgumentId = 1;
		const AttrNumber secondArgumentId = 2;
		const int hashArgumentCount = 2;
		const int addArgumentCount = 2;


		/* init hll_hash() related variables */
		Oid argumentType = AggregateArgumentType(originalAggregate);
		TargetEntry *argument = (TargetEntry *) linitial(originalAggregate->args);
		Expr *argumentExpression = copyObject(argument->expr);

		/* extract schema name of hll */
		Oid hllId = get_extension_oid(HLL_EXTENSION_NAME, false);
		Oid hllSchemaOid = get_extension_schema(hllId);
		const char *hllSchemaName = get_namespace_name(hllSchemaOid);

		const char *hashFunctionName = CountDistinctHashFunctionName(argumentType);
		Oid hashFunctionId = FunctionOid(hllSchemaName, hashFunctionName,
										 hashArgumentCount);
		Oid hashFunctionReturnType = get_func_rettype(hashFunctionId);

		/* init hll_add_agg() related variables */
		Oid addFunctionId = FunctionOid(hllSchemaName, HLL_ADD_AGGREGATE_NAME,
										addArgumentCount);
		Oid hllType = TypeOid(hllSchemaOid, HLL_TYPE_NAME);
		int logOfStorageSize = CountDistinctStorageSize(CountDistinctErrorRate);
		Const *logOfStorageSizeConst = MakeIntegerConst(logOfStorageSize);

		/* construct hll_hash() expression */
		FuncExpr *hashFunction = makeNode(FuncExpr);
		hashFunction->funcid = hashFunctionId;
		hashFunction->funcresulttype = hashFunctionReturnType;
		hashFunction->args = list_make1(argumentExpression);

		/* construct hll_add_agg() expression */
		TargetEntry *hashedColumnArgument = makeTargetEntry((Expr *) hashFunction,
															firstArgumentId, NULL, false);
		TargetEntry *storageSizeArgument = makeTargetEntry((Expr *) logOfStorageSizeConst,
														   secondArgumentId, NULL, false);
		List *addAggregateArgumentList = list_make2(hashedColumnArgument,
													storageSizeArgument);

		Aggref *addAggregateFunction = makeNode(Aggref);
		addAggregateFunction->aggfnoid = addFunctionId;
		addAggregateFunction->aggtype = hllType;
		addAggregateFunction->args = addAggregateArgumentList;
		addAggregateFunction->aggkind = AGGKIND_NORMAL;
		addAggregateFunction->aggfilter = (Expr *) copyObject(
			originalAggregate->aggfilter);

		workerAggregateList = lappend(workerAggregateList, addAggregateFunction);
	}
	else if (aggregateType == AGGREGATE_AVERAGE)
	{
		/*
		 * If the original aggregate is an average, we want to compute sum(var)
		 * and count(var) on worker nodes.
		 */
		Aggref *sumAggregate = copyObject(originalAggregate);
		Aggref *countAggregate = copyObject(originalAggregate);

		/* extract function names for sum and count */
		const char *sumAggregateName = AggregateNames[AGGREGATE_SUM];
		const char *countAggregateName = AggregateNames[AGGREGATE_COUNT];

		/*
		 * Find the type of the expression over which we execute the aggregate.
		 * We then need to find the right sum function for that type.
		 */
		Oid argumentType = AggregateArgumentType(originalAggregate);

		/* find function implementing sum over the original type */
		sumAggregate->aggfnoid = AggregateFunctionOid(sumAggregateName, argumentType);
		sumAggregate->aggtype = get_func_rettype(sumAggregate->aggfnoid);

		sumAggregate->aggtranstype = InvalidOid;
		sumAggregate->aggargtypes = list_make1_oid(argumentType);
		sumAggregate->aggsplit = AGGSPLIT_SIMPLE;

		/* count has any input type */
		countAggregate->aggfnoid = AggregateFunctionOid(countAggregateName, ANYOID);
		countAggregate->aggtype = get_func_rettype(countAggregate->aggfnoid);
		countAggregate->aggtranstype = InvalidOid;
		countAggregate->aggargtypes = list_make1_oid(argumentType);
		countAggregate->aggsplit = AGGSPLIT_SIMPLE;

		workerAggregateList = lappend(workerAggregateList, sumAggregate);
		workerAggregateList = lappend(workerAggregateList, countAggregate);
	}
	else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLEARRAY ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLEARRAY)
	{
		/*
		 * The original query has an aggregate in the form of either
		 *  - tdigest_percentile(column, compression, quantile)
		 *  - tdigest_percentile(column, compression, quantile[])
		 *  - tdigest_percentile_of(column, compression, value)
		 *  - tdigest_percentile_of(column, compression, value[])
		 *
		 * We are creating the worker part of this query by creating a
		 *  - tdigest(column, compression)
		 *
		 * One could see we are passing argument 0 and argument 1 from the original query
		 * in here. This corresponds with the list_nth calls in the args and aggargstypes
		 * list construction. The tdigest function and type are read from the catalog.
		 */
		Aggref *newWorkerAggregate = copyObject(originalAggregate);
		newWorkerAggregate->aggfnoid = TDigestExtensionAggTDigest2();
		newWorkerAggregate->aggtype = TDigestExtensionTypeOid();
		newWorkerAggregate->args = list_make2(
			list_nth(newWorkerAggregate->args, 0),
			list_nth(newWorkerAggregate->args, 1));
		newWorkerAggregate->aggkind = AGGKIND_NORMAL;
		newWorkerAggregate->aggtranstype = InvalidOid;
		newWorkerAggregate->aggargtypes = list_make2_oid(
			list_nth_oid(newWorkerAggregate->aggargtypes, 0),
			list_nth_oid(newWorkerAggregate->aggargtypes, 1));
		newWorkerAggregate->aggsplit = AGGSPLIT_SIMPLE;

		workerAggregateList = lappend(workerAggregateList, newWorkerAggregate);
	}
	else if (aggregateType == AGGREGATE_TDIGEST_PERCENTILE_TDIGEST_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_TDIGEST_DOUBLEARRAY ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_TDIGEST_DOUBLE ||
			 aggregateType == AGGREGATE_TDIGEST_PERCENTILE_OF_TDIGEST_DOUBLEARRAY)
	{
		/*
		 * The original query has an aggregate in the form of either
		 *  - tdigest_percentile(tdigest, quantile)
		 *  - tdigest_percentile(tdigest, quantile[])
		 *  - tdigest_percentile_of(tdigest, value)
		 *  - tdigest_percentile_of(tdigest, value[])
		 *
		 * We are creating the worker part of this query by creating a
		 *  - tdigest(tdigest)
		 *
		 * One could see we are passing argument 0 from the original query in here. This
		 * corresponds with the list_nth calls in the args and aggargstypes list
		 * construction. The tdigest function and type are read from the catalog.
		 */
		Aggref *newWorkerAggregate = copyObject(originalAggregate);
		newWorkerAggregate->aggfnoid = TDigestExtensionAggTDigest1();
		newWorkerAggregate->aggtype = TDigestExtensionTypeOid();
		newWorkerAggregate->args = list_make1(list_nth(newWorkerAggregate->args, 0));
		newWorkerAggregate->aggkind = AGGKIND_NORMAL;
		newWorkerAggregate->aggtranstype = InvalidOid;
		newWorkerAggregate->aggargtypes = list_make1_oid(
			list_nth_oid(newWorkerAggregate->aggargtypes, 0));
		newWorkerAggregate->aggsplit = AGGSPLIT_SIMPLE;

		workerAggregateList = lappend(workerAggregateList, newWorkerAggregate);
	}
	else if (aggregateType == AGGREGATE_CUSTOM_COMBINE)
	{
		HeapTuple aggTuple =
			SearchSysCache1(AGGFNOID, ObjectIdGetDatum(originalAggregate->aggfnoid));
		Form_pg_aggregate aggform;
		Oid combine;

		if (!HeapTupleIsValid(aggTuple))
		{
			elog(ERROR, "citus cache lookup failed for aggregate %u",
				 originalAggregate->aggfnoid);
			return NULL;
		}
		else
		{
			aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
			combine = aggform->aggcombinefn;
			ReleaseSysCache(aggTuple);
		}

		if (combine != InvalidOid)
		{
			Oid workerPartialId = WorkerPartialAggOid();

			Const *aggOidParam = makeConst(REGPROCEDUREOID, -1, InvalidOid, sizeof(Oid),
										   ObjectIdGetDatum(originalAggregate->aggfnoid),
										   false, true);

			List *newWorkerAggregateArgs =
				list_make1(makeTargetEntry((Expr *) aggOidParam, 1, NULL, false));

			if (list_length(originalAggregate->args) == 1)
			{
				/*
				 * Single argument case, append 'arg' to worker_partial_agg(agg, arg).
				 * We don't wrap single argument in a row expression because
				 * it has performance implications to unwrap arguments on each
				 * SFUNC invocation.
				 */
				TargetEntry *newArg =
					copyObject((TargetEntry *) linitial(originalAggregate->args));
				newArg->resno++;
				newWorkerAggregateArgs = lappend(newWorkerAggregateArgs, newArg);
			}
			else
			{
				/*
				 * Aggregation on workers assumes a single aggregation parameter.
				 * To still be able to handle multiple parameters, we combine
				 * parameters into a single row expression, i.e., append 'ROW(...args)'
				 * to worker_partial_agg(agg, ROW(...args)).
				 */
				RowExpr *rowExpr = makeNode(RowExpr);
				rowExpr->row_typeid = RECORDOID;
				rowExpr->row_format = COERCE_EXPLICIT_CALL;
				rowExpr->location = -1;
				rowExpr->colnames = NIL;

				TargetEntry *arg = NULL;
				foreach_ptr(arg, originalAggregate->args)
				{
					rowExpr->args = lappend(rowExpr->args, copyObject(arg->expr));
				}

				newWorkerAggregateArgs =
					lappend(newWorkerAggregateArgs,
							makeTargetEntry((Expr *) rowExpr, 2, NULL, false));
			}

			/* worker_partial_agg(agg, arg) or worker_partial_agg(agg, ROW(...args)) */
			Aggref *newWorkerAggregate = copyObject(originalAggregate);
			newWorkerAggregate->aggfnoid = workerPartialId;
			newWorkerAggregate->aggtype = CSTRINGOID;
			newWorkerAggregate->args = newWorkerAggregateArgs;
			newWorkerAggregate->aggkind = AGGKIND_NORMAL;
			newWorkerAggregate->aggtranstype = INTERNALOID;
			newWorkerAggregate->aggargtypes = lcons_oid(OIDOID,
														newWorkerAggregate->aggargtypes);
			newWorkerAggregate->aggsplit = AGGSPLIT_SIMPLE;

			workerAggregateList = list_make1(newWorkerAggregate);
		}
		else
		{
			elog(ERROR, "Aggregate lacks COMBINEFUNC");
		}
	}
	else
	{
		/*
		 * All other aggregates are sent as they are to the worker nodes.
		 */
		Aggref *workerAggregate = copyObject(originalAggregate);
		workerAggregateList = lappend(workerAggregateList, workerAggregate);
	}

	return workerAggregateList;
}


/*
 * GetAggregateType scans pg_catalog.pg_proc for the given aggregate oid, and
 * finds the aggregate's name. The function then matches the aggregate's name to
 * previously stored strings, and returns the appropriate aggregate type.
 */
static AggregateType
GetAggregateType(Aggref *aggregateExpression)
{
	Oid aggFunctionId = aggregateExpression->aggfnoid;

	/* custom aggregates with combine func take precedence over name-based logic */
	if (aggFunctionId >= FirstNormalObjectId &&
		AggregateEnabledCustom(aggregateExpression))
	{
		return AGGREGATE_CUSTOM_COMBINE;
	}

	/* look up the function name */
	char *aggregateProcName = get_func_name(aggFunctionId);
	if (aggregateProcName == NULL)
	{
		ereport(ERROR, (errmsg("citus cache lookup failed for function %u",
							   aggFunctionId)));
	}

	uint32 aggregateCount = lengthof(AggregateNames);

	for (uint32 aggregateIndex = 1; aggregateIndex < aggregateCount; aggregateIndex++)
	{
		const char *aggregateName = AggregateNames[aggregateIndex];
		if (strncmp(aggregateName, aggregateProcName, NAMEDATALEN) == 0)
		{
			return aggregateIndex;
		}
	}

	/*
	 * All functions from github.com/tvondra/tdigest start with the "tdigest" prefix.
	 * Since it requires lookups of function names in a schema we would like to only
	 * perform these checks if there is some chance it will actually result in a positive
	 * hit.
	 */
	if (StringStartsWith(aggregateProcName, "tdigest"))
	{
		if (aggFunctionId == TDigestExtensionAggTDigest1())
		{
			return AGGREGATE_TDIGEST_COMBINE;
		}

		if (aggFunctionId == TDigestExtensionAggTDigest2())
		{
			return AGGREGATE_TDIGEST_ADD_DOUBLE;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentile3())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLE;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentile3a())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_ADD_DOUBLEARRAY;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentile2())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_TDIGEST_DOUBLE;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentile2a())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_TDIGEST_DOUBLEARRAY;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentileOf3())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLE;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentileOf3a())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_OF_ADD_DOUBLEARRAY;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentileOf2())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_OF_TDIGEST_DOUBLE;
		}

		if (aggFunctionId == TDigestExtensionAggTDigestPercentileOf2a())
		{
			return AGGREGATE_TDIGEST_PERCENTILE_OF_TDIGEST_DOUBLEARRAY;
		}
	}

	/* handle any remaining built-in aggregates with a suitable combinefn */
	if (AggregateEnabledCustom(aggregateExpression))
	{
		return AGGREGATE_CUSTOM_COMBINE;
	}

	if (CoordinatorAggregationStrategy == COORDINATOR_AGGREGATION_DISABLED)
	{
		ereport(ERROR, (errmsg("unsupported aggregate function %s", aggregateProcName)));
	}
	else
	{
		return AGGREGATE_CUSTOM_ROW_GATHER;
	}
}


/* Extracts the type of the argument over which the aggregate is operating. */
static Oid
AggregateArgumentType(Aggref *aggregate)
{
	List *argumentList = aggregate->args;
	TargetEntry *argument = (TargetEntry *) linitial(argumentList);
	Oid returnTypeId = exprType((Node *) argument->expr);

	/* Here we currently support aggregates with only one argument; assert that. */
	Assert(list_length(argumentList) == 1);

	return returnTypeId;
}


/*
 * FirstAggregateArgument returns the first argument of the aggregate.
 */
static Expr *
FirstAggregateArgument(Aggref *aggregate)
{
	List *argumentList = aggregate->args;

	Assert(list_length(argumentList) >= 1);

	TargetEntry *argument = (TargetEntry *) linitial(argumentList);

	return argument->expr;
}


/*
 * AggregateEnabledCustom returns whether given aggregate can be
 * distributed across workers using worker_partial_agg & coord_combine_agg.
 */
static bool
AggregateEnabledCustom(Aggref *aggregateExpression)
{
	if (aggregateExpression->aggorder != NIL ||
		list_length(aggregateExpression->args) == 0)
	{
		return false;
	}

	Oid aggregateOid = aggregateExpression->aggfnoid;
	HeapTuple aggTuple = SearchSysCache1(AGGFNOID, aggregateOid);
	if (!HeapTupleIsValid(aggTuple))
	{
		elog(ERROR, "citus cache lookup failed.");
	}
	Form_pg_aggregate aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);

	if (aggform->aggcombinefn == InvalidOid)
	{
		ReleaseSysCache(aggTuple);
		return false;
	}

	HeapTuple typeTuple = SearchSysCache1(TYPEOID, aggform->aggtranstype);
	if (!HeapTupleIsValid(typeTuple))
	{
		elog(ERROR, "citus cache lookup failed.");
	}
	Form_pg_type typeform = (Form_pg_type) GETSTRUCT(typeTuple);

	bool supportsSafeCombine = typeform->typtype != TYPTYPE_PSEUDO;

	ReleaseSysCache(aggTuple);
	ReleaseSysCache(typeTuple);

	return supportsSafeCombine;
}


/*
 * AggregateFunctionOid performs a reverse lookup on aggregate function name,
 * and returns the corresponding aggregate function oid for the given function
 * name and input type.
 */
static Oid
AggregateFunctionOid(const char *functionName, Oid inputType)
{
	Oid functionOid = InvalidOid;
	ScanKeyData scanKey[1];
	int scanKeyCount = 1;

	Relation procRelation = table_open(ProcedureRelationId, AccessShareLock);

	ScanKeyInit(&scanKey[0], Anum_pg_proc_proname,
				BTEqualStrategyNumber, F_NAMEEQ, CStringGetDatum(functionName));

	SysScanDesc scanDescriptor = systable_beginscan(procRelation,
													ProcedureNameArgsNspIndexId, true,
													NULL, scanKeyCount, scanKey);

	/* loop until we find the right function */
	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	while (HeapTupleIsValid(heapTuple))
	{
		Form_pg_proc procForm = (Form_pg_proc) GETSTRUCT(heapTuple);
		int argumentCount = procForm->pronargs;

		if (argumentCount == 1)
		{
			/* check if input type and found value type match */
			if (procForm->proargtypes.values[0] == inputType ||
				procForm->proargtypes.values[0] == ANYELEMENTOID)
			{
				functionOid = procForm->oid;
				break;
			}
		}
		Assert(argumentCount <= 1);

		heapTuple = systable_getnext(scanDescriptor);
	}

	if (functionOid == InvalidOid)
	{
		ereport(ERROR, (errmsg("no matching oid for function: %s", functionName)));
	}

	systable_endscan(scanDescriptor);
	table_close(procRelation, AccessShareLock);

	return functionOid;
}


/*
 * CitusFunctionOidWithSignature looks up a function with given input types.
 * Looks in pg_catalog schema, as this function's sole purpose is
 * support aggregate lookup.
 */
static Oid
CitusFunctionOidWithSignature(char *functionName, int numargs, Oid *argtypes)
{
	List *aggregateName = list_make2(makeString("pg_catalog"), makeString(functionName));
	FuncCandidateList clist = FuncnameGetCandidates(aggregateName, numargs, NIL,
													false, false, false, true);

	for (; clist; clist = clist->next)
	{
		if (memcmp(clist->args, argtypes, numargs * sizeof(Oid)) == 0)
		{
			return clist->oid;
		}
	}

	ereport(ERROR, (errmsg("no matching oid for function: %s", functionName)));
	return InvalidOid;
}


/*
 * WorkerPartialAggOid looks up oid of pg_catalog.worker_partial_agg
 */
static Oid
WorkerPartialAggOid()
{
	Oid argtypes[] = {
		OIDOID,
		ANYELEMENTOID,
	};

	return CitusFunctionOidWithSignature(WORKER_PARTIAL_AGGREGATE_NAME, 2, argtypes);
}


/*
 * CoordCombineAggOid looks up oid of pg_catalog.coord_combine_agg
 */
static Oid
CoordCombineAggOid()
{
	Oid argtypes[] = {
		OIDOID,
		CSTRINGOID,
		ANYELEMENTOID,
	};

	return CitusFunctionOidWithSignature(COORD_COMBINE_AGGREGATE_NAME, 3, argtypes);
}


/*
 * TypeOid looks for a type that has the given name and schema, and returns the
 * corresponding type's oid.
 */
static Oid
TypeOid(Oid schemaId, const char *typeName)
{
	Oid typeOid = GetSysCacheOid2(TYPENAMENSP, Anum_pg_type_oid,
								  PointerGetDatum(typeName),
								  ObjectIdGetDatum(schemaId));

	return typeOid;
}


/*
 * CreateSortGroupClause creates SortGroupClause for a given column Var.
 * The caller should set tleSortGroupRef field and respective
 * TargetEntry->ressortgroupref fields to appropriate SortGroupRefIndex.
 */
static SortGroupClause *
CreateSortGroupClause(Var *column)
{
	Oid lessThanOperator = InvalidOid;
	Oid equalsOperator = InvalidOid;
	bool hashable = false;
	SortGroupClause *groupByClause = makeNode(SortGroupClause);

	get_sort_group_operators(column->vartype, true, true, true,
							 &lessThanOperator, &equalsOperator, NULL,
							 &hashable);
	groupByClause->eqop = equalsOperator;
	groupByClause->hashable = hashable;
	groupByClause->nulls_first = false;
	groupByClause->sortop = lessThanOperator;

	return groupByClause;
}


/*
 * CountDistinctHashFunctionName resolves the hll_hash function name to use for
 * the given input type, and returns this function name.
 */
static const char *
CountDistinctHashFunctionName(Oid argumentType)
{
	/* resolve hash function name based on input argument type */
	switch (argumentType)
	{
		case INT4OID:
		{
			return HLL_HASH_INTEGER_FUNC_NAME;
		}

		case INT8OID:
		{
			return HLL_HASH_BIGINT_FUNC_NAME;
		}

		case TEXTOID:
		case BPCHAROID:
		case VARCHAROID:
		{
			return HLL_HASH_TEXT_FUNC_NAME;
		}

		default:
		{
			return HLL_HASH_ANY_FUNC_NAME;
		}
	}
}


/*
 * CountDistinctStorageSize takes in the desired precision for count distinct
 * approximations, and returns the log-base-2 of storage space needed for the
 * HyperLogLog algorithm.
 */
static int
CountDistinctStorageSize(double approximationErrorRate)
{
	double desiredStorageSize = pow((1.04 / approximationErrorRate), 2);
	double logOfDesiredStorageSize = log(desiredStorageSize) / log(2);

	/* keep log2(storage size) inside allowed range */
	int logOfStorageSize = (int) rint(logOfDesiredStorageSize);
	if (logOfStorageSize < 4)
	{
		logOfStorageSize = 4;
	}
	else if (logOfStorageSize > 17)
	{
		logOfStorageSize = 17;
	}

	return logOfStorageSize;
}


/* Makes an integer constant node from the given value, and returns that node. */
static Const *
MakeIntegerConst(int32 integerValue)
{
	const int typeCollationId = get_typcollation(INT4OID);
	const int16 typeLength = get_typlen(INT4OID);
	const int32 typeModifier = -1;
	const bool typeIsNull = false;
	const bool typePassByValue = true;

	Datum integerDatum = Int32GetDatum(integerValue);
	Const *integerConst = makeConst(INT4OID, typeModifier, typeCollationId, typeLength,
									integerDatum, typeIsNull, typePassByValue);

	return integerConst;
}


/* Makes a 64-bit integer constant node from the given value, and returns that node. */
static Const *
MakeIntegerConstInt64(int64 integerValue)
{
	const int typeCollationId = get_typcollation(INT8OID);
	const int16 typeLength = get_typlen(INT8OID);
	const int32 typeModifier = -1;
	const bool typeIsNull = false;
	const bool typePassByValue = true;

	Datum integer64Datum = Int64GetDatum(integerValue);
	Const *integer64Const = makeConst(INT8OID, typeModifier, typeCollationId, typeLength,
									  integer64Datum, typeIsNull, typePassByValue);

	return integer64Const;
}


/*
 * HasNonDistributableAggregates checks for if any aggregates cannot be pushed down.
 * This only checks with GetAggregateType. DeferErrorIfHasNonDistributableAggregates
 * performs further checks which should be done if aggregates are not being pushed down.
 */
static bool
HasNonDistributableAggregates(MultiNode *logicalPlanNode)
{
	if (CoordinatorAggregationStrategy == COORDINATOR_AGGREGATION_DISABLED)
	{
		return false;
	}

	List *opNodeList = FindNodesOfType(logicalPlanNode, T_MultiExtendedOp);
	MultiExtendedOp *extendedOpNode = (MultiExtendedOp *) linitial(opNodeList);

	List *targetList = extendedOpNode->targetList;
	Node *havingQual = extendedOpNode->havingQual;

	/*
	 * PVC_REJECT_PLACEHOLDERS is implicit if PVC_INCLUDE_PLACEHOLDERS isn't
	 * specified.
	 */
	List *expressionList = pull_var_clause((Node *) targetList, PVC_INCLUDE_AGGREGATES |
										   PVC_INCLUDE_WINDOWFUNCS);
	expressionList = list_concat(expressionList,
								 pull_var_clause(havingQual, PVC_INCLUDE_AGGREGATES));

	Node *expression = NULL;
	foreach_ptr(expression, expressionList)
	{
		/* only consider aggregate expressions */
		if (!IsA(expression, Aggref))
		{
			continue;
		}

		AggregateType aggregateType = GetAggregateType((Aggref *) expression);
		Assert(aggregateType != AGGREGATE_INVALID_FIRST);

		if (aggregateType == AGGREGATE_CUSTOM_ROW_GATHER)
		{
			return true;
		}
	}

	return false;
}


/*
 * CanPushDownExpression returns whether the expression can be pushed down to workers.
 */
static bool
CanPushDownExpression(Node *expression,
					  const ExtendedOpNodeProperties *extendedOpNodeProperties)
{
	if (contain_nextval_expression_walker(expression, NULL))
	{
		/* nextval can only be evaluated on the coordinator */
		return false;
	}

	bool hasAggregate = contain_aggs_of_level(expression, 0);
	bool hasWindowFunction = contain_window_function(expression);
	if (!hasAggregate && !hasWindowFunction)
	{
		/*
		 * If the query has the form SELECT expression, agg(..) FROM table;
		 * then expression should be evaluated on the coordinator.
		 *
		 * Other than the efficiency part of this, we could also crash if
		 * we pushed down the expression to the workers. When pushing down
		 * expressions to workers we create a Var reference to the worker
		 * tuples. If the result from worker is empty, but we need to have
		 * at least a row in coordinator result, postgres will crash when
		 * trying to evaluate the Var.
		 *
		 * For details, see https://github.com/citusdata/citus/pull/3961
		 */
		if (!extendedOpNodeProperties->hasAggregate ||
			extendedOpNodeProperties->hasGroupBy)
		{
			return true;
		}
	}

	/* aggregates inside pushed down window functions can be pushed down */
	bool hasPushableWindowFunction =
		hasWindowFunction && extendedOpNodeProperties->onlyPushableWindowFunctions;
	if (hasPushableWindowFunction)
	{
		return true;
	}

	if (extendedOpNodeProperties->pushDownGroupingAndHaving && !hasWindowFunction)
	{
		return true;
	}

	if (hasAggregate && !hasWindowFunction &&
		extendedOpNodeProperties->groupedByDisjointPartitionColumn)
	{
		return true;
	}

	return false;
}


/*
 * DeferErrorIfHasNonDistributableAggregates extracts aggregate expressions from
 * the logical plan, walks over them and uses helper functions to check if we
 * can transform these aggregate expressions and push them down to worker nodes.
 */
static DeferredErrorMessage *
DeferErrorIfHasNonDistributableAggregates(MultiNode *logicalPlanNode)
{
	DeferredErrorMessage *error = NULL;
	List *opNodeList = FindNodesOfType(logicalPlanNode, T_MultiExtendedOp);
	MultiExtendedOp *extendedOpNode = (MultiExtendedOp *) linitial(opNodeList);

	List *targetList = extendedOpNode->targetList;
	Node *havingQual = extendedOpNode->havingQual;

	/*
	 * PVC_REJECT_PLACEHOLDERS is implicit if PVC_INCLUDE_PLACEHOLDERS isn't
	 * specified.
	 */
	List *expressionList = pull_var_clause((Node *) targetList, PVC_INCLUDE_AGGREGATES |
										   PVC_INCLUDE_WINDOWFUNCS);
	expressionList = list_concat(expressionList,
								 pull_var_clause(havingQual, PVC_INCLUDE_AGGREGATES));

	Node *expression = NULL;
	foreach_ptr(expression, expressionList)
	{
		/* only consider aggregate expressions */
		if (!IsA(expression, Aggref))
		{
			continue;
		}

		/* GetAggregateType errors out on unsupported aggregate types */
		Aggref *aggregateExpression = (Aggref *) expression;
		AggregateType aggregateType = GetAggregateType(aggregateExpression);
		Assert(aggregateType != AGGREGATE_INVALID_FIRST);

		/*
		 * Check that we can transform the current aggregate expression. These
		 * functions error out on unsupported array_agg and aggregate (distinct)
		 * clauses.
		 */
		if (aggregateType == AGGREGATE_ARRAY_AGG)
		{
			error = DeferErrorIfUnsupportedArrayAggregate(aggregateExpression);
		}
		else if (aggregateType == AGGREGATE_JSONB_AGG ||
				 aggregateType == AGGREGATE_JSON_AGG)
		{
			error = DeferErrorIfUnsupportedJsonAggregate(aggregateType,
														 aggregateExpression);
		}
		else if (aggregateType == AGGREGATE_JSONB_OBJECT_AGG ||
				 aggregateType == AGGREGATE_JSON_OBJECT_AGG)
		{
			error = DeferErrorIfUnsupportedJsonAggregate(aggregateType,
														 aggregateExpression);
		}
		else if (aggregateExpression->aggdistinct)
		{
			error = DeferErrorIfUnsupportedAggregateDistinct(aggregateExpression,
															 logicalPlanNode);
		}

		if (error != NULL)
		{
			return error;
		}
	}

	return NULL;
}


/*
 * DeferErrorIfUnsupportedArrayAggregate checks if we can transform the array aggregate
 * expression and push it down to the worker node. If we cannot transform the
 * aggregate, this function errors.
 */
static DeferredErrorMessage *
DeferErrorIfUnsupportedArrayAggregate(Aggref *arrayAggregateExpression)
{
	/* if array_agg has order by, we error out */
	if (arrayAggregateExpression->aggorder)
	{
		return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
							 "array_agg with order by is unsupported",
							 NULL, NULL);
	}

	/* if array_agg has distinct, we error out */
	if (arrayAggregateExpression->aggdistinct)
	{
		return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
							 "array_agg (distinct) is unsupported",
							 NULL, NULL);
	}

	return NULL;
}


/*
 * DeferErrorIfUnsupportedJsonAggregate checks if we can transform the json
 * aggregate expression and push it down to the worker node. If we cannot
 * transform the aggregate, this function errors.
 */
static DeferredErrorMessage *
DeferErrorIfUnsupportedJsonAggregate(AggregateType type,
									 Aggref *aggregateExpression)
{
	/* if json aggregate has order by, we error out */
	if (aggregateExpression->aggdistinct || aggregateExpression->aggorder)
	{
		StringInfoData errorDetail;
		initStringInfo(&errorDetail);
		const char *name = AggregateNames[type];

		appendStringInfoString(&errorDetail, name);
		if (aggregateExpression->aggorder)
		{
			appendStringInfoString(&errorDetail, " with order by is unsupported");
		}
		else
		{
			appendStringInfoString(&errorDetail, " (distinct) is unsupported");
		}

		return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, errorDetail.data,
							 NULL, NULL);
	}

	return NULL;
}


/*
 * DeferErrorIfUnsupportedAggregateDistinct checks if we can transform the aggregate
 * (distinct expression) and push it down to the worker node. It handles count
 * (distinct) separately to check if we can use distinct approximations. If we
 * cannot transform the aggregate, this function errors.
 */
static DeferredErrorMessage *
DeferErrorIfUnsupportedAggregateDistinct(Aggref *aggregateExpression,
										 MultiNode *logicalPlanNode)
{
	const char *errorDetail = NULL;
	bool distinctSupported = true;

	AggregateType aggregateType = GetAggregateType(aggregateExpression);

	/* If we're aggregating on coordinator, this becomes simple. */
	if (aggregateType == AGGREGATE_CUSTOM_ROW_GATHER)
	{
		return NULL;
	}

	/*
	 * We partially support count(distinct) in subqueries, other distinct aggregates in
	 * subqueries are not supported yet.
	 */
	if (aggregateType == AGGREGATE_COUNT)
	{
		Node *aggregateArgument = (Node *) linitial(aggregateExpression->args);
		List *columnList = pull_var_clause_default(aggregateArgument);

		Var *column = NULL;
		foreach_ptr(column, columnList)
		{
			if (column->varattno <= 0)
			{
				return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
									 "cannot compute count (distinct)",
									 "Non-column references are not supported yet",
									 NULL);
			}
		}
	}
	else
	{
		List *multiTableNodeList = FindNodesOfType(logicalPlanNode, T_MultiTable);

		MultiTable *multiTable = NULL;
		foreach_ptr(multiTable, multiTableNodeList)
		{
			if (multiTable->relationId == SUBQUERY_RELATION_ID ||
				multiTable->relationId == SUBQUERY_PUSHDOWN_RELATION_ID)
			{
				return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
									 "cannot compute aggregate (distinct)",
									 "Only count(distinct) aggregate is "
									 "supported in subqueries", NULL);
			}
		}
	}

	/* if we have a count(distinct), and distinct approximation is enabled */
	if (aggregateType == AGGREGATE_COUNT &&
		CountDistinctErrorRate != DISABLE_DISTINCT_APPROXIMATION)
	{
		bool missingOK = true;
		Oid distinctExtensionId = get_extension_oid(HLL_EXTENSION_NAME, missingOK);

		/* if extension for distinct approximation is loaded, we are good */
		if (distinctExtensionId != InvalidOid)
		{
			return NULL;
		}
		else
		{
			return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
								 "cannot compute count (distinct) approximation",
								 NULL,
								 "You need to have the hll extension loaded.");
		}
	}

	if (aggregateType == AGGREGATE_COUNT)
	{
		List *aggregateVarList = pull_var_clause_default((Node *) aggregateExpression);
		if (aggregateVarList == NIL)
		{
			distinctSupported = false;
			errorDetail = "aggregate (distinct) with no columns is unsupported";
		}
	}

	List *repartitionNodeList = FindNodesOfType(logicalPlanNode, T_MultiPartition);
	if (repartitionNodeList != NIL)
	{
		distinctSupported = false;
		errorDetail = "aggregate (distinct) with table repartitioning is unsupported";
	}

	List *tableNodeList = FindNodesOfType(logicalPlanNode, T_MultiTable);
	List *extendedOpNodeList = FindNodesOfType(logicalPlanNode, T_MultiExtendedOp);
	MultiExtendedOp *extendedOpNode = (MultiExtendedOp *) linitial(extendedOpNodeList);

	Var *distinctColumn = AggregateDistinctColumn(aggregateExpression);
	if (distinctSupported)
	{
		if (distinctColumn == NULL)
		{
			/*
			 * If the query has a single table, and table is grouped by partition
			 * column, then we support count distincts even distinct column can
			 * not be identified.
			 */
			distinctSupported = TablePartitioningSupportsDistinct(tableNodeList,
																  extendedOpNode,
																  distinctColumn,
																  aggregateType);
			if (!distinctSupported)
			{
				errorDetail = "aggregate (distinct) on complex expressions is"
							  " unsupported";
			}
		}
		else if (aggregateType != AGGREGATE_COUNT)
		{
			bool supports = TablePartitioningSupportsDistinct(tableNodeList,
															  extendedOpNode,
															  distinctColumn,
															  aggregateType);
			if (!supports)
			{
				distinctSupported = false;
				errorDetail = "table partitioning is unsuitable for aggregate (distinct)";
			}
		}
	}

	/* if current aggregate expression isn't supported, error out */
	if (!distinctSupported)
	{
		const char *errorHint = NULL;
		if (aggregateType == AGGREGATE_COUNT)
		{
			errorHint = "You can load the hll extension from contrib "
						"packages and enable distinct approximations.";
		}

		return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
							 "cannot compute aggregate (distinct)",
							 errorDetail, errorHint);
	}

	return NULL;
}


/*
 * AggregateDistinctColumn checks if the given aggregate expression's distinct
 * clause is on a single column. If it is, the function finds and returns that
 * column. Otherwise, the function returns null.
 * The function expects to find a single column here, no FieldSelect or other
 * expressions are accepted as a column.
 */
static Var *
AggregateDistinctColumn(Aggref *aggregateExpression)
{
	/* only consider aggregates with distincts */
	if (!aggregateExpression->aggdistinct)
	{
		return NULL;
	}

	int aggregateArgumentCount = list_length(aggregateExpression->args);
	if (aggregateArgumentCount != 1)
	{
		return NULL;
	}

	TargetEntry *aggregateTargetEntry = (TargetEntry *) linitial(
		aggregateExpression->args);
	if (!IsA(aggregateTargetEntry->expr, Var))
	{
		return NULL;
	}

	Var *aggregateColumn = (Var *) aggregateTargetEntry->expr;
	return aggregateColumn;
}


/*
 * TablePartitioningSupportsDistinct walks over all tables in the given list and
 * checks that each table's partitioning method is suitable for pushing down an
 * aggregate (distinct) expression to worker nodes. For this, the function needs
 * to check that task results do not overlap with one another on the distinct
 * column.
 */
static bool
TablePartitioningSupportsDistinct(List *tableNodeList, MultiExtendedOp *opNode,
								  Var *distinctColumn, AggregateType aggregateType)
{
	bool distinctSupported = true;

	MultiTable *tableNode = NULL;
	foreach_ptr(tableNode, tableNodeList)
	{
		Oid relationId = tableNode->relationId;
		bool tableDistinctSupported = false;

		if (relationId == SUBQUERY_RELATION_ID ||
			relationId == SUBQUERY_PUSHDOWN_RELATION_ID)
		{
			return true;
		}

		/* if table has one shard, task results don't overlap */
		List *shardList = LoadShardList(relationId);
		if (list_length(shardList) == 1)
		{
			continue;
		}

		/*
		 * We need to check that task results don't overlap. We can only do this
		 * if table is range partitioned.
		 */
		if (IsCitusTableType(relationId, RANGE_DISTRIBUTED) ||
			IsCitusTableType(relationId, HASH_DISTRIBUTED))
		{
			Var *tablePartitionColumn = tableNode->partitionColumn;

			if (aggregateType == AGGREGATE_COUNT)
			{
				tableDistinctSupported = true;
			}

			/* if distinct is on table partition column, we can push it down */
			if (distinctColumn != NULL &&
				tablePartitionColumn->varno == distinctColumn->varno &&
				tablePartitionColumn->varattno == distinctColumn->varattno)
			{
				tableDistinctSupported = true;
			}

			/* if results are grouped by partition column, we can push down */
			bool groupedByPartitionColumn = GroupedByColumn(opNode->groupClauseList,
															opNode->targetList,
															tablePartitionColumn);
			if (groupedByPartitionColumn)
			{
				tableDistinctSupported = true;
			}
		}

		if (!tableDistinctSupported)
		{
			distinctSupported = false;
			break;
		}
	}

	return distinctSupported;
}


/*
 * GroupedByColumn walks over group clauses in the given list, and checks if any
 * of the group clauses is on the given column.
 */
bool
GroupedByColumn(List *groupClauseList, List *targetList, Var *column)
{
	bool groupedByColumn = false;

	if (column == NULL)
	{
		return false;
	}

	SortGroupClause *groupClause = NULL;
	foreach_ptr(groupClause, groupClauseList)
	{
		TargetEntry *groupTargetEntry = get_sortgroupclause_tle(groupClause, targetList);

		Expr *groupExpression = (Expr *) groupTargetEntry->expr;
		if (IsA(groupExpression, Var))
		{
			Var *groupColumn = (Var *) groupExpression;
			if (groupColumn->varno == column->varno &&
				groupColumn->varattno == column->varattno)
			{
				groupedByColumn = true;
				break;
			}
		}
	}

	return groupedByColumn;
}


/*
 * SubqueryMultiTableList extracts multi tables in the given logical plan tree
 * and returns subquery multi tables in a new list.
 */
List *
SubqueryMultiTableList(MultiNode *multiNode)
{
	List *subqueryMultiTableList = NIL;
	List *multiTableNodeList = FindNodesOfType(multiNode, T_MultiTable);

	MultiTable *multiTable = NULL;
	foreach_ptr(multiTable, multiTableNodeList)
	{
		Query *subquery = multiTable->subquery;

		if (subquery != NULL)
		{
			subqueryMultiTableList = lappend(subqueryMultiTableList, multiTable);
		}
	}

	return subqueryMultiTableList;
}


/*
 * GroupTargetEntryList walks over group clauses in the given list, finds
 * matching target entries and return them in a new list.
 */
List *
GroupTargetEntryList(List *groupClauseList, List *targetEntryList)
{
	List *groupTargetEntryList = NIL;

	SortGroupClause *groupClause = NULL;
	foreach_ptr(groupClause, groupClauseList)
	{
		TargetEntry *groupTargetEntry =
			get_sortgroupclause_tle(groupClause, targetEntryList);
		groupTargetEntryList = lappend(groupTargetEntryList, groupTargetEntry);
	}

	return groupTargetEntryList;
}


/*
 * IsPartitionColumn returns true if the given column is a partition column.
 * The function uses FindReferencedTableColumn to find the original relation
 * id and column that the column expression refers to. It then checks whether
 * that column is a partition column of the relation.
 *
 * Also, the function returns always false for reference tables given that
 * reference tables do not have partition column. The function does not
 * support queries with CTEs, it would return false if columnExpression
 * refers to a column returned by a CTE.
 *
 * If skipOuterVars is true, then it doesn't process the outervars.
 */
bool
IsPartitionColumn(Expr *columnExpression, Query *query, bool skipOuterVars)
{
	bool isPartitionColumn = false;
	Var *column = NULL;
	RangeTblEntry *relationRTE = NULL;

	FindReferencedTableColumn(columnExpression, NIL, query, &column, &relationRTE,
							  skipOuterVars);
	Oid relationId = relationRTE ? relationRTE->relid : InvalidOid;
	if (relationId != InvalidOid && column != NULL)
	{
		Var *partitionColumn = DistPartitionKey(relationId);

		/* not all distributed tables have partition column */
		if (partitionColumn != NULL && column->varattno == partitionColumn->varattno)
		{
			isPartitionColumn = true;
		}
	}

	return isPartitionColumn;
}


/*
 * FindReferencedTableColumn recursively traverses query tree to find actual relation
 * id, and column that columnExpression refers to. If columnExpression is a
 * non-relational or computed/derived expression, the function returns NULL for
 * rte and NULL for column. The caller should provide parent query list from
 * top of the tree to this particular Query's parent. This argument is used to look
 * into CTEs that may be present in the query.
 *
 * If skipOuterVars is true, then it doesn't check vars coming from outer queries.
 * We probably don't need this skipOuterVars check but we wanted to be on the safe side
 * and used it only in UNION path, we can separately work on verifying that it doesn't break
 * anything existing.
 */
void
FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query *query,
						  Var **column, RangeTblEntry **rteContainingReferencedColumn,
						  bool skipOuterVars)
{
	Var *candidateColumn = NULL;
	Expr *strippedColumnExpression = (Expr *) strip_implicit_coercions(
		(Node *) columnExpression);

	*rteContainingReferencedColumn = NULL;
	*column = NULL;

	if (IsA(strippedColumnExpression, Var))
	{
		candidateColumn = (Var *) strippedColumnExpression;
	}
	else if (IsA(strippedColumnExpression, FieldSelect))
	{
		FieldSelect *compositeField = (FieldSelect *) strippedColumnExpression;
		Expr *fieldExpression = compositeField->arg;

		if (IsA(fieldExpression, Var))
		{
			candidateColumn = (Var *) fieldExpression;
		}
	}

	if (candidateColumn == NULL)
	{
		return;
	}


	if (candidateColumn->varlevelsup > 0)
	{
		if (skipOuterVars)
		{
			/*
			 * we don't want to process outer vars, so we return early.
			 */
			return;
		}

		/*
		 * We currently don't support finding partition keys in the subqueries
		 * that reference outer subqueries. For example, in correlated
		 * subqueries in WHERE clause, we don't support use of partition keys
		 * in the subquery that is referred from the outer query.
		 */

		int parentQueryIndex = list_length(parentQueryList) -
							   candidateColumn->varlevelsup;
		if (!(IsIndexInRange(parentQueryList, parentQueryIndex)))
		{
			return;
		}

		/*
		 * Before we recurse into the query tree, we should update the candidateColumn and we use copy of it.
		 * As we get the query from varlevelsup up, we reset the varlevelsup.
		 */
		candidateColumn = copyObject(candidateColumn);
		candidateColumn->varlevelsup = 0;

		/*
		 * We should be careful about these fields because they need to
		 * be updated correctly based on ctelevelsup and varlevelsup.
		 */
		query = list_nth(parentQueryList, parentQueryIndex);
		parentQueryList = list_truncate(parentQueryList, parentQueryIndex);
	}

	if (candidateColumn->varattno == InvalidAttrNumber)
	{
		/*
		 * varattno can be 0 in case of SELECT table FROM table, but that Var
		 * definitely does not correspond to a specific column.
		 */
		return;
	}

	List *rangetableList = query->rtable;
	int rangeTableEntryIndex = candidateColumn->varno - 1;
	RangeTblEntry *rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex);

	if (rangeTableEntry->rtekind == RTE_RELATION)
	{
		*rteContainingReferencedColumn = rangeTableEntry;
		*column = candidateColumn;
	}
	else if (rangeTableEntry->rtekind == RTE_SUBQUERY)
	{
		Query *subquery = rangeTableEntry->subquery;
		List *targetEntryList = subquery->targetList;
		AttrNumber targetEntryIndex = candidateColumn->varattno - 1;
		TargetEntry *subqueryTargetEntry = list_nth(targetEntryList, targetEntryIndex);
		Expr *subColumnExpression = subqueryTargetEntry->expr;

		/* append current query to parent query list */
		parentQueryList = lappend(parentQueryList, query);
		FindReferencedTableColumn(subColumnExpression, parentQueryList,
								  subquery, column, rteContainingReferencedColumn,
								  skipOuterVars);
	}
	else if (rangeTableEntry->rtekind == RTE_JOIN)
	{
		List *joinColumnList = rangeTableEntry->joinaliasvars;
		AttrNumber joinColumnIndex = candidateColumn->varattno - 1;
		Expr *joinColumn = list_nth(joinColumnList, joinColumnIndex);

		/* parent query list stays the same since still in the same query boundary */
		FindReferencedTableColumn(joinColumn, parentQueryList, query, column,
								  rteContainingReferencedColumn, skipOuterVars);
	}
	else if (rangeTableEntry->rtekind == RTE_CTE)
	{
		/*
		 * When outerVars are considered, we modify parentQueryList, so this
		 * logic might need to change when we support outervars in CTEs.
		 */
		Assert(skipOuterVars);

		int cteParentListIndex = list_length(parentQueryList) -
								 rangeTableEntry->ctelevelsup - 1;
		Query *cteParentQuery = NULL;
		List *cteList = NIL;
		CommonTableExpr *cte = NULL;

		/*
		 * This should have been an error case, not marking it as error at the
		 * moment due to usage from IsPartitionColumn. Callers of that function
		 * do not have access to parent query list.
		 */
		if (IsIndexInRange(parentQueryList, cteParentListIndex))
		{
			cteParentQuery = list_nth(parentQueryList, cteParentListIndex);
			cteList = cteParentQuery->cteList;
		}

		CommonTableExpr *candidateCte = NULL;
		foreach_ptr(candidateCte, cteList)
		{
			if (strcmp(candidateCte->ctename, rangeTableEntry->ctename) == 0)
			{
				cte = candidateCte;
				break;
			}
		}

		if (cte != NULL)
		{
			Query *cteQuery = (Query *) cte->ctequery;
			List *targetEntryList = cteQuery->targetList;
			AttrNumber targetEntryIndex = candidateColumn->varattno - 1;
			TargetEntry *targetEntry = list_nth(targetEntryList, targetEntryIndex);

			parentQueryList = lappend(parentQueryList, query);
			FindReferencedTableColumn(targetEntry->expr, parentQueryList,
									  cteQuery, column, rteContainingReferencedColumn,
									  skipOuterVars);
		}
	}
}


/*
 * IsIndexInRange returns true if the given index is within the
 * range of the given list.
 */
static bool
IsIndexInRange(const List *list, int index)
{
	return index >= 0 && index < list_length(list);
}


/*
 * ExtractQueryWalker walks over a query, and finds all queries in the query
 * tree and returns these queries. Note that the function also recurses into
 * the subqueries in WHERE clause.
 */
bool
ExtractQueryWalker(Node *node, List **queryList)
{
	if (node == NULL)
	{
		return false;
	}

	if (IsA(node, Query))
	{
		Query *query = (Query *) node;

		(*queryList) = lappend(*queryList, query);
		return query_tree_walker(query, ExtractQueryWalker, queryList, 0);
	}

	return expression_tree_walker(node, ExtractQueryWalker, queryList);
}


/*
 * WorkerLimitCount checks if the given input contains a valid limit node, and
 * if that node can be pushed down. For this, the function checks if this limit
 * count or a meaningful approximation of it can be pushed down to worker nodes.
 * If they can, the function returns the limit count.
 *
 * The limit push-down decision tree is as follows:
 *                                         group by?
 *                                       1/         \0
 *                       group by partition column?   (exact pd)
 *                              0/         \1
 *                          order by?        (exact pd)
 *                       1/           \0
 *           has order by agg?          (no pd)
 *            1/           \0
 *     can approximate?    (exact pd)
 *      1/       \0
 * (approx pd)   (no pd)
 *
 * When an offset is present, the offset value is added to limit because for a query
 * with LIMIT x OFFSET y, (x+y) records should be pulled from the workers.
 *
 * If no limit is present or can be pushed down, then WorkerLimitCount
 * returns null.
 */
static Node *
WorkerLimitCount(Node *limitCount, Node *limitOffset, OrderByLimitReference
				 orderByLimitReference)
{
	Node *workerLimitNode = NULL;
	LimitPushdownable canPushDownLimit = LIMIT_CANNOT_PUSHDOWN;

	if (limitCount == NULL)
	{
		/* no limit node to push down */
		return NULL;
	}

	if (!IsA(limitCount, Const))
	{
		/*
		 * We only push down constant LIMIT clauses to make sure we get back
		 * the minimum number of rows.
		 */
		return NULL;
	}

	if (limitOffset != NULL && !IsA(limitOffset, Const))
	{
		/*
		 * If OFFSET is not a constant then we cannot calculate the LIMIT to
		 * push down.
		 */
		return NULL;
	}


	/*
	 * If window functions are computed on coordinator, we cannot push down LIMIT.
	 * If we don't have group by clauses, or we have group by partition column,
	 * or if we have order by clauses without aggregates, we can push down the
	 * original limit. Else if we have order by clauses with commutative aggregates,
	 * we can push down approximate limits.
	 */
	if (!orderByLimitReference.onlyPushableWindowFunctions)
	{
		canPushDownLimit = LIMIT_CANNOT_PUSHDOWN;
	}
	else if (orderByLimitReference.groupClauseIsEmpty ||
			 orderByLimitReference.groupedByDisjointPartitionColumn)
	{
		canPushDownLimit = LIMIT_CAN_PUSHDOWN;
	}
	else if (orderByLimitReference.sortClauseIsEmpty)
	{
		canPushDownLimit = LIMIT_CANNOT_PUSHDOWN;
	}
	else if (!orderByLimitReference.hasOrderByAggregate)
	{
		canPushDownLimit = LIMIT_CAN_PUSHDOWN;
	}
	else if (orderByLimitReference.canApproximate)
	{
		canPushDownLimit = LIMIT_CAN_APPROXIMATE;
	}

	/* create the workerLimitNode according to the decisions above */
	if (canPushDownLimit == LIMIT_CAN_PUSHDOWN)
	{
		workerLimitNode = (Node *) copyObject(limitCount);
	}
	else if (canPushDownLimit == LIMIT_CAN_APPROXIMATE)
	{
		Const *workerLimitConst = (Const *) copyObject(limitCount);
		int64 workerLimitCount = (int64) LimitClauseRowFetchCount;
		workerLimitConst->constvalue = Int64GetDatum(workerLimitCount);

		workerLimitNode = (Node *) workerLimitConst;
	}

	/*
	 * If offset clause is present and limit can be pushed down (whether exactly or
	 * approximately), add the offset value to limit on workers
	 */
	if (workerLimitNode != NULL && limitOffset != NULL)
	{
		Const *workerLimitConst = (Const *) workerLimitNode;

		/* Only update the worker limit if the const is not null.*/
		if (!workerLimitConst->constisnull)
		{
			Const *workerOffsetConst = (Const *) limitOffset;
			int64 workerLimitCount = DatumGetInt64(workerLimitConst->constvalue);

			/* If the offset is null, it defaults to 0 when cast to int64. */
			int64 workerOffsetCount = DatumGetInt64(workerOffsetConst->constvalue);
			workerLimitCount = workerLimitCount + workerOffsetCount;
			workerLimitNode = (Node *) MakeIntegerConstInt64(workerLimitCount);
		}
	}

	/* display debug message on limit push down */
	if (workerLimitNode != NULL)
	{
		Const *workerLimitConst = (Const *) workerLimitNode;
		if (!workerLimitConst->constisnull)
		{
			int64 workerLimitCount = DatumGetInt64(workerLimitConst->constvalue);

			ereport(DEBUG1, (errmsg("push down of limit count: " INT64_FORMAT,
									workerLimitCount)));
		}
		else
		{
			ereport(DEBUG1, (errmsg("push down of limit count: ALL")));
		}
	}

	return workerLimitNode;
}


/*
 * WorkerSortClauseList first checks if the given input contains a limit
 * or hasDistinctOn that can be pushed down. If it does, the function then
 * checks if we need to add any sorting and grouping clauses to the sort list we
 * push down for the limit. If we do, the function adds these clauses and
 * returns them. Otherwise, the function returns null.
 */
static List *
WorkerSortClauseList(Node *limitCount, List *groupClauseList, List *sortClauseList,
					 OrderByLimitReference orderByLimitReference)
{
	List *workerSortClauseList = NIL;

	/* if no limit node and no hasDistinctOn, no need to push down sort clauses */
	if (limitCount == NULL && !orderByLimitReference.hasDistinctOn)
	{
		return NIL;
	}

	/* If window functions are computed on coordinator, we cannot push down sorting. */
	if (!orderByLimitReference.onlyPushableWindowFunctions)
	{
		return NIL;
	}

	sortClauseList = copyObject(sortClauseList);

	/*
	 * If we are pushing down the limit, push down any order by clauses. Also if
	 * we are pushing down the limit because the order by clauses don't have any
	 * aggregates, add group by clauses to the order by list. We do this because
	 * rows that belong to the same grouping may appear in different "offsets"
	 * in different task results. By ordering on the group by clause, we ensure
	 * that query results are consistent.
	 */
	if (orderByLimitReference.groupClauseIsEmpty ||
		orderByLimitReference.groupedByDisjointPartitionColumn)
	{
		workerSortClauseList = sortClauseList;
	}
	else if (sortClauseList != NIL)
	{
		bool orderByNonAggregates = !orderByLimitReference.hasOrderByAggregate;
		bool canApproximate = orderByLimitReference.canApproximate;

		if (orderByNonAggregates)
		{
			workerSortClauseList = sortClauseList;
			workerSortClauseList = list_concat(workerSortClauseList, groupClauseList);
		}
		else if (canApproximate)
		{
			workerSortClauseList = sortClauseList;
		}
	}

	return workerSortClauseList;
}


/*
 * CanPushDownLimitApproximate checks if we can push down the limit clause to
 * the worker nodes, and get approximate and meaningful results. We can do this
 * only when: (1) the user has enabled the limit approximation and (2) the query
 * has order by clauses that are commutative.
 */
static bool
CanPushDownLimitApproximate(List *sortClauseList, List *targetList)
{
	bool canApproximate = false;

	/* user hasn't enabled the limit approximation */
	if (LimitClauseRowFetchCount == DISABLE_LIMIT_APPROXIMATION)
	{
		return false;
	}

	if (sortClauseList != NIL)
	{
		bool orderByNonCommutativeAggregate =
			HasOrderByNonCommutativeAggregate(sortClauseList, targetList);
		bool orderByComplex = HasOrderByComplexExpression(sortClauseList, targetList);

		if (!orderByNonCommutativeAggregate && !orderByComplex)
		{
			canApproximate = true;
		}
	}

	return canApproximate;
}


/*
 * HasOrderByAggregate walks over the given order by clauses, and checks if we
 * have an order by an aggregate function. If we do, the function returns true.
 */
static bool
HasOrderByAggregate(List *sortClauseList, List *targetList)
{
	bool hasOrderByAggregate = false;

	SortGroupClause *sortClause = NULL;
	foreach_ptr(sortClause, sortClauseList)
	{
		Node *sortExpression = get_sortgroupclause_expr(sortClause, targetList);

		bool containsAggregate = contain_aggs_of_level(sortExpression, 0);
		if (containsAggregate)
		{
			hasOrderByAggregate = true;
			break;
		}
	}

	return hasOrderByAggregate;
}


/*
 * HasOrderByNonCommutativeAggregate walks over the given order by clauses,
 * and checks if we have an order by an aggregate which is not commutative.
 */
static bool
HasOrderByNonCommutativeAggregate(List *sortClauseList, List *targetList)
{
	bool hasOrderByNonCommutativeAggregate = false;

	SortGroupClause *sortClause = NULL;
	foreach_ptr(sortClause, sortClauseList)
	{
		Node *sortExpression = get_sortgroupclause_expr(sortClause, targetList);

		/* if sort expression is an aggregate, check its type */
		if (IsA(sortExpression, Aggref))
		{
			Aggref *aggregate = (Aggref *) sortExpression;

			AggregateType aggregateType = GetAggregateType(aggregate);
			if (aggregateType != AGGREGATE_MIN &&
				aggregateType != AGGREGATE_MAX &&
				aggregateType != AGGREGATE_SUM &&
				aggregateType != AGGREGATE_COUNT &&
				aggregateType != AGGREGATE_BIT_AND &&
				aggregateType != AGGREGATE_BIT_OR &&
				aggregateType != AGGREGATE_EVERY &&
				aggregateType != AGGREGATE_ANY_VALUE)
			{
				hasOrderByNonCommutativeAggregate = true;
				break;
			}
		}
	}

	return hasOrderByNonCommutativeAggregate;
}


/*
 * HasOrderByComplexExpression walks over the given order by clauses, and checks
 * if we have a nested expression that contains an aggregate function within it.
 * If we do, the function returns true.
 */
static bool
HasOrderByComplexExpression(List *sortClauseList, List *targetList)
{
	bool hasOrderByComplexExpression = false;

	SortGroupClause *sortClause = NULL;
	foreach_ptr(sortClause, sortClauseList)
	{
		Node *sortExpression = get_sortgroupclause_expr(sortClause, targetList);

		/* simple aggregate functions are ok */
		if (IsA(sortExpression, Aggref))
		{
			continue;
		}

		bool nestedAggregate = contain_aggs_of_level(sortExpression, 0);
		if (nestedAggregate)
		{
			hasOrderByComplexExpression = true;
			break;
		}
	}

	return hasOrderByComplexExpression;
}


/*
 * HasOrderByHllType walks over the given order by clauses, and checks if any of
 * those clauses operate on hll data type. If they do, the function returns true.
 */
static bool
HasOrderByHllType(List *sortClauseList, List *targetList)
{
	bool hasOrderByHllType = false;

	/* check whether HLL is loaded */
	Oid hllId = get_extension_oid(HLL_EXTENSION_NAME, true);
	if (!OidIsValid(hllId))
	{
		return hasOrderByHllType;
	}

	Oid hllSchemaOid = get_extension_schema(hllId);
	Oid hllTypeId = TypeOid(hllSchemaOid, HLL_TYPE_NAME);

	SortGroupClause *sortClause = NULL;
	foreach_ptr(sortClause, sortClauseList)
	{
		Node *sortExpression = get_sortgroupclause_expr(sortClause, targetList);

		Oid sortColumnTypeId = exprType(sortExpression);
		if (sortColumnTypeId == hllTypeId)
		{
			hasOrderByHllType = true;
			break;
		}
	}

	return hasOrderByHllType;
}


/*
 * ShouldProcessDistinctOrderAndLimitForWorker returns whether
 * ProcessDistinctClauseForWorkerQuery should be called. If not,
 * neither should ProcessLimitOrderByForWorkerQuery.
 */
static bool
ShouldProcessDistinctOrderAndLimitForWorker(
	ExtendedOpNodeProperties *extendedOpNodeProperties,
	bool pushingDownOriginalGrouping,
	Node *havingQual)
{
	if (extendedOpNodeProperties->pullUpIntermediateRows)
	{
		return false;
	}

	/* window functions must be evaluated beforehand */
	if (!extendedOpNodeProperties->onlyPushableWindowFunctions)
	{
		return false;
	}

	if (extendedOpNodeProperties->pushDownGroupingAndHaving)
	{
		return true;
	}

	/* If the same GROUP BY is being pushed down and there's no HAVING,
	 * then the push down logic will be able to handle this scenario.
	 */
	if (pushingDownOriginalGrouping && havingQual == NULL)
	{
		return true;
	}

	return false;
}


/*
 * WorkerColumnName returns a palloc'd string for being the resname of a TargetEntry.
 */
char *
WorkerColumnName(AttrNumber resno)
{
	StringInfoData name = { 0 };
	initStringInfo(&name);
	appendStringInfo(&name, WORKER_COLUMN_FORMAT, resno);

	return name.data;
}


/*
 * IsGroupBySubsetOfDistinct checks whether each clause in group clauses also
 * exists in the distinct clauses. Note that, empty group clause is not a subset
 * of distinct clause.
 */
bool
IsGroupBySubsetOfDistinct(List *groupClauses, List *distinctClauses)
{
	/* There must be a group clause */
	if (list_length(groupClauses) == 0)
	{
		return false;
	}

	SortGroupClause *groupClause = NULL;
	foreach_ptr(groupClause, groupClauses)
	{
		bool isFound = false;

		SortGroupClause *distinctClause = NULL;
		foreach_ptr(distinctClause, distinctClauses)
		{
			if (groupClause->tleSortGroupRef == distinctClause->tleSortGroupRef)
			{
				isFound = true;
				break;
			}
		}

		/*
		 * If we can't find any member of group clause in the distinct clause,
		 * that means group clause is not a subset of distinct clause.
		 */
		if (!isFound)
		{
			return false;
		}
	}

	return true;
}