citus/src/backend/distributed/planner/fast_path_router_planner.c

/*-------------------------------------------------------------------------
 *
 * fast_path_router_planner.c
 *
 * Planning logic for fast path router planner queries. In this context,
 * we define "Fast Path Planning" as trivial queries where Citus
 * can skip relying on the standard_planner() and handle all the planning.
 *
 * For router planner, standard_planner() is mostly important to generate
 * the necessary restriction information. Later, the restriction information
 * generated by the standard_planner is used to decide whether all the shards
 * that a distributed query touches reside on a single worker node. However,
 * standard_planner() does a lot of extra things such as cost estimation and
 * execution path generations which are completely unnecessary in the context
 * of distributed planning.
 *
 * There are certain types of queries where Citus could skip relying on
 * standard_planner() to generate the restriction information. For queries
 * in the following format, Citus does not need any information that the
 * standard_planner() generates:
 *   SELECT ... FROM single_table WHERE distribution_key = X;  or
 *   DELETE FROM single_table WHERE distribution_key = X; or
 *   UPDATE single_table SET value_1 = value_2 + 1 WHERE distribution_key = X;
 *
 * Note that the queries might not be as simple as the above such that
 * GROUP BY, WINDOW FUNCTIONS, ORDER BY or HAVING etc. are all acceptable. The
 * only rule is that the query is on a single distributed (or reference) table
 * and, for a distributed table, there is a "distribution_key = X;" in the
 * WHERE clause. With that filter, we can determine the single shard that the
 * query touches, and hence the worker node on which that shard resides.
 *
 * Copyright (c) 2019, Citus Data, Inc.
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "distributed/distributed_planner.h"
#include "distributed/multi_physical_planner.h" /* only to use some utility functions */
#include "distributed/metadata_cache.h"
#include "distributed/multi_router_planner.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/shard_pruning.h"
#if PG_VERSION_NUM >= 120000
#include "nodes/makefuncs.h"
#endif
#include "nodes/nodeFuncs.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#if PG_VERSION_NUM >= 120000
#include "optimizer/optimizer.h"
#else
#include "optimizer/clauses.h"
#endif

/*
 * Switch for the fast path router planner: while this is false,
 * FastPathRouterQuery() returns false for every query.
 * NOTE(review): not static, so presumably assigned from outside this file
 * (e.g. by a configuration setting) -- confirm.
 */
bool EnableFastPathRouterPlanner = true;

/* forward declarations of the file-local helpers that inspect WHERE clauses */
static bool ColumnAppearsMultipleTimes(Node *quals, Var *distributionKey);
static bool ConjunctionContainsColumnFilter(Node *node, Var *column);
static bool DistKeyInSimpleOpExpression(Expr *clause, Var *distColumn);


/*
 * FastPathPlanner stands in for standard_planner() when the query has one of
 * the trivial shapes accepted by FastPathRouterQuery().
 *
 * Such queries need only a small part of what standard_planner() does, so we
 * do just that part here and save the CPU cycles of a full planning pass:
 *   - resolve the external parameters in the WHERE clause of originalQuery,
 *   - run eval_const_expressions() over the target list and the WHERE clause
 *     of parse,
 *   - build a placeholder PlannedStmt from originalQuery.
 *
 * Both originalQuery and parse are modified in place. The return value is the
 * statement produced by GeneratePlaceHolderPlannedStmt().
 */
PlannedStmt *
FastPathPlanner(Query *originalQuery, Query *parse, ParamListInfo boundParams)
{
	FromExpr *originalJoinTree = originalQuery->jointree;
	FromExpr *parseJoinTree = parse->jointree;

	/*
	 * Prepared statements: substitute the external parameters now. The
	 * standard planner would normally do this as part of its constant
	 * evaluation when the bound parameters are available. When they are not
	 * available, Citus does what it does for every other query type: it
	 * increases the cost of the plan so that PostgreSQL is forced to pick
	 * custom plans.
	 *
	 * Only the quals are resolved, because shard pruning is driven by the
	 * filter on the distribution column and nothing else.
	 */
	ParamListInfo paramsCopy = copyParamList(boundParams);
	originalJoinTree->quals =
		ResolveExternalParams((Node *) originalJoinTree->quals, paramsCopy);

	/*
	 * The rest of the Citus planner depends on some of the transformations
	 * that constant evaluation applies to the parse tree.
	 */
	Node *evaluatedTargetList =
		eval_const_expressions(NULL, (Node *) parse->targetList);
	parse->targetList = (List *) evaluatedTargetList;

	Node *evaluatedQuals =
		eval_const_expressions(NULL, (Node *) parseJoinTree->quals);
	parseJoinTree->quals = (Node *) evaluatedQuals;

	return GeneratePlaceHolderPlannedStmt(originalQuery);
}


/*
 * GeneratePlaceHolderPlannedStmt builds a PlannedStmt whose plan tree is a
 * single sequential scan over the relation accessed by the input query.
 *
 * The result is deliberately incomplete: set_plan_references() is never
 * called on it and the scan carries no quals, so it must not be handed to
 * the executor as is. Its purpose is to serve as a placeholder PlannedStmt
 * with a correctly populated target list, which is what the router executor
 * relies on.
 *
 * The input query is assumed (and asserted) to satisfy FastPathRouterQuery().
 */
PlannedStmt *
GeneratePlaceHolderPlannedStmt(Query *parse)
{
	AssertArg(FastPathRouterQuery(parse));

	/* the query has exactly one relation rte, hence range table index 1 */
	SeqScan *placeholderScan = makeNode(SeqScan);
	placeholderScan->scanrelid = 1;

	/* a leaf node without quals that only carries the query's target list */
	Plan *scanPlan = &placeholderScan->plan;
	scanPlan->plan_node_id = 1;
	scanPlan->targetlist = copyObject(parse->targetList);
	scanPlan->qual = NULL;
	scanPlan->lefttree = NULL;
	scanPlan->righttree = NULL;

	/* statement-level fields are taken over from the query */
	PlannedStmt *placeholderStmt = makeNode(PlannedStmt);
	placeholderStmt->commandType = parse->commandType;
	placeholderStmt->queryId = parse->queryId;
	placeholderStmt->stmt_len = parse->stmt_len;

	/* rtable is used for access permission checks */
	placeholderStmt->rtable = copyObject(parse->rtable);
	placeholderStmt->planTree = scanPlan;

	/* record the accessed table as the only relation the plan depends on */
	Oid accessedRelationId = ExtractFirstDistributedTableId(parse);
	placeholderStmt->relationOids = list_make1_oid(accessedRelationId);

	return placeholderStmt;
}


/*
 * FastPathRouterQuery returns true if the given query is eligible for fast
 * path router planning. A query qualifies when all of the following hold:
 *
 *   - The fast path router planner is enabled
 *   - It is a SELECT, UPDATE or DELETE without CTEs, sublinks, set operations
 *     or set returning functions in the target list
 *   - It has no RETURNING list
 *   - Its range table consists of a single relation, which is either a hash
 *     distributed table or a reference table
 *   - For a hash distributed table, the WHERE clause contains an equality
 *     filter on the distribution key that is ANDed with all other filters,
 *     and the distribution key appears nowhere else in the WHERE clause:
 *          SELECT ... FROM dist_table WHERE dist_key = X
 */
bool
FastPathRouterQuery(Query *query)
{
	if (!EnableFastPathRouterPlanner)
	{
		return false;
	}

	/* only SELECT, UPDATE and DELETE are candidates */
	if (query->commandType != CMD_SELECT &&
		query->commandType != CMD_UPDATE &&
		query->commandType != CMD_DELETE)
	{
		return false;
	}

	/*
	 * Stick to very simple queries. Some of these checks may be stricter
	 * than necessary, but we prefer to stay on the safe side.
	 */
	bool usesCte = (query->cteList != NIL || query->hasModifyingCTE);
	bool usesSubqueryOrSetOperation =
		(query->hasSubLinks || query->setOperations != NULL);
	bool hasReturning = (query->returningList != NIL);
	if (usesCte || usesSubqueryOrSetOperation || hasReturning ||
		query->hasTargetSRFs)
	{
		return false;
	}

	/* the range table must hold exactly one entry and it must be a relation */
	List *rangeTableList = query->rtable;
	if (list_length(rangeTableList) != 1)
	{
		return false;
	}

	RangeTblEntry *onlyRangeTableEntry =
		(RangeTblEntry *) linitial(rangeTableList);
	if (onlyRangeTableEntry->rtekind != RTE_RELATION)
	{
		return false;
	}

	/* append/range distributed tables are not handled here */
	Oid relationId = onlyRangeTableEntry->relid;
	DistTableCacheEntry *tableEntry = DistributedTableCacheEntry(relationId);
	bool isHashDistributed =
		(tableEntry->partitionMethod == DISTRIBUTE_BY_HASH);
	bool isReferenceTable =
		(tableEntry->partitionMethod == DISTRIBUTE_BY_NONE);
	if (!isHashDistributed && !isReferenceTable)
	{
		return false;
	}

	/* distributed tables need a non-empty WHERE clause */
	FromExpr *fromExpr = query->jointree;
	if (fromExpr == NULL)
	{
		return false;
	}

	if (!isReferenceTable && fromExpr->quals == NULL)
	{
		return false;
	}

	/* no partition column means reference table: nothing more to verify */
	Var *partitionColumn = PartitionColumn(relationId, 1);
	if (partitionColumn == NULL)
	{
		return true;
	}

	/* an implicit-AND list is turned into an explicit expression tree */
	Node *whereClause = fromExpr->quals;
	if (whereClause != NULL && IsA(whereClause, List))
	{
		whereClause = (Node *) make_ands_explicit((List *) whereClause);
	}

	/*
	 * The distribution column has to take part in a simple equality
	 * comparison that sits in the top level conjunction, that is
	 *	    WHERE dist_key = VALUE [AND  ....];
	 *
	 * and it must not show up anywhere else in the quals. This is verified
	 * with two independent checks:
	 *	    (a) the top level AND has "dist_key = const" as one of its arguments
	 *	    (b) "dist_key" is referenced at most once in the quals
	 *
	 * Keeping the checks separate keeps each of them simple and sidesteps
	 * the edge cases that multiple references to the distribution key
	 * would bring.
	 */
	return ConjunctionContainsColumnFilter(whereClause, partitionColumn) &&
		   !ColumnAppearsMultipleTimes(whereClause, partitionColumn);
}


/*
 * ColumnAppearsMultipleTimes returns true if the given column is referenced
 * more than once among the Var clauses that pull_var_clause_default()
 * extracts from the quals, and false otherwise.
 */
static bool
ColumnAppearsMultipleTimes(Node *quals, Var *distributionKey)
{
	bool seenOnce = false;
	ListCell *referenceCell = NULL;
	List *referencedColumnList = pull_var_clause_default(quals);

	foreach(referenceCell, referencedColumnList)
	{
		Var *referencedColumn = (Var *) lfirst(referenceCell);

		if (!equal(referencedColumn, distributionKey))
		{
			continue;
		}

		/* a second match is all we need to know */
		if (seenOnce)
		{
			return true;
		}

		seenOnce = true;
	}

	return false;
}


/*
 * ConjunctionContainsColumnFilter returns true if the expression tree holds
 * an equality comparison on the given column that is connected to the rest
 * of the tree purely through AND operators.
 *
 * The node qualifies when it is either such a comparison itself (see
 * DistKeyInSimpleOpExpression()) or an AND expression with at least one
 * qualifying argument. Any other node, including NULL, yields false.
 */
static bool
ConjunctionContainsColumnFilter(Node *node, Var *column)
{
	if (node == NULL)
	{
		return false;
	}

	if (IsA(node, OpExpr))
	{
		OpExpr *operatorExpression = (OpExpr *) node;

		/* check the operand shape first, the operator itself second */
		return DistKeyInSimpleOpExpression((Expr *) operatorExpression, column) &&
			   OperatorImplementsEquality(operatorExpression->opno);
	}

	if (!IsA(node, BoolExpr))
	{
		return false;
	}

	/*
	 * AND is the only boolean expression we descend into. A column filter
	 * inside an OR clause is ignored even when it is logically equivalent
	 * to a single value comparison (e.g. `<column> = <Const> OR false`).
	 */
	BoolExpr *booleanExpression = (BoolExpr *) node;
	if (booleanExpression->boolop != AND_EXPR)
	{
		return false;
	}

	List *conjunctList = booleanExpression->args;
	ListCell *conjunctCell = NULL;
	foreach(conjunctCell, conjunctList)
	{
		Node *conjunct = (Node *) lfirst(conjunctCell);

		if (ConjunctionContainsColumnFilter(conjunct, column))
		{
			return true;
		}
	}

	return false;
}


/*
 * DistKeyInSimpleOpExpression returns true if the given clause is a binary
 * operator expression that compares the distribution column with either an
 * external parameter or a non-null constant, i.e. (dist_key OP param) or
 * (dist_key OP const). The operands may appear in either order, and implicit
 * coercions around them are ignored.
 *
 * The operator itself is not inspected here; checking that it is an equality
 * operator is left to the caller.
 */
static bool
DistKeyInSimpleOpExpression(Expr *clause, Var *distColumn)
{
	/* anything other than a binary opclause is rejected right away */
	if (!is_opclause(clause) || list_length(((OpExpr *) clause)->args) != 2)
	{
		return false;
	}

	/* strip coercions before looking at the operand types */
	Node *leftOperand = strip_implicit_coercions(get_leftop(clause));
	Node *rightOperand = strip_implicit_coercions(get_rightop(clause));

	/* one side has to be a column, the opposite side is the compared value */
	Var *columnInExpr = NULL;
	Node *comparedValue = NULL;

	if (IsA(leftOperand, Var))
	{
		columnInExpr = (Var *) leftOperand;
		comparedValue = rightOperand;
	}
	else if (IsA(rightOperand, Var))
	{
		columnInExpr = (Var *) rightOperand;
		comparedValue = leftOperand;
	}
	else
	{
		return false;
	}

	if (IsA(comparedValue, Param))
	{
		/* we can only handle param_externs */
		Param *paramClause = (Param *) comparedValue;
		if (paramClause->paramkind != PARAM_EXTERN)
		{
			return false;
		}
	}
	else if (IsA(comparedValue, Const))
	{
		/* we can only handle non-null constants */
		Const *constantClause = (Const *) comparedValue;
		if (constantClause->constisnull)
		{
			return false;
		}
	}
	else
	{
		/* column compared with something that is neither param nor const */
		return false;
	}

	/* at this point we should have the columnInExpr */
	Assert(columnInExpr);

	return equal(distColumn, columnInExpr);
}