/*-------------------------------------------------------------------------
 *
 * fast_path_router_planner.c
 *
 * Planning logic for fast path router planner queries. In this context,
 * we define "Fast Path Planning" as trivial queries where Citus
 * can skip relying on the standard_planner() and handle all the planning.
 *
 * For router planner, standard_planner() is mostly important to generate
 * the necessary restriction information. Later, the restriction information
 * generated by the standard_planner is used to decide whether all the shards
 * that a distributed query touches reside on a single worker node. However,
 * standard_planner() does a lot of extra things such as cost estimation and
 * execution path generations which are completely unnecessary in the context
 * of distributed planning.
 *
 * There are certain types of queries where Citus could skip relying on
 * standard_planner() to generate the restriction information. For queries
 * in the following format, Citus does not need any information that the
 * standard_planner() generates:
 *   SELECT ... FROM single_table WHERE distribution_key = X;  or
 *   DELETE FROM single_table WHERE distribution_key = X; or
 *   UPDATE single_table SET value_1 = value_2 + 1 WHERE distribution_key = X;
 *
 * Note that the queries might not be as simple as the above such that
 * GROUP BY, WINDOW FUNCTIONS, ORDER BY or HAVING etc. are all acceptable. The
 * only rule is that the query is on a single distributed (or reference) table
 * and there is a "distribution_key = X;" in the WHERE clause. With that
 * filter, we can decide which shard the distributed query touches.
 *
 * Copyright (c) 2019, Citus Data, Inc.
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "distributed/distributed_planner.h"
#include "distributed/multi_physical_planner.h" /* only to use some utility functions */
#include "distributed/metadata_cache.h"
#include "distributed/multi_router_planner.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/shard_pruning.h"
#if PG_VERSION_NUM >= 120000
#include "nodes/makefuncs.h"
#endif
#include "nodes/nodeFuncs.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#if PG_VERSION_NUM >= 120000
#include "optimizer/optimizer.h"
#else
#include "optimizer/clauses.h"
#endif

/* when false, FastPathRouterQuery() rejects every query */
bool EnableFastPathRouterPlanner = true;

static bool ColumnAppearsMultipleTimes(Node *quals, Var *distributionKey);
static bool ConjunctionContainsColumnFilter(Node *node, Var *column);
static bool DistKeyInSimpleOpExpression(Expr *clause, Var *distColumn);


/*
 * FastPathPlanner stands in for standard_planner() when the query is one of
 * the trivial queries accepted by FastPathRouterQuery().
 *
 * Such queries need only a small fraction of the work standard_planner()
 * performs, so we skip it entirely to save CPU cycles.
 *
 * The external parameters in the WHERE clause of originalQuery are resolved
 * in place, the target list and WHERE clause of parse are constant-folded in
 * place, and a placeholder PlannedStmt built from originalQuery is returned.
 */
PlannedStmt *
FastPathPlanner(Query *originalQuery, Query *parse, ParamListInfo boundParams)
{
	/*
	 * Prepared statements are supported on the fast path by resolving the
	 * external parameters right here. The standard planner normally does
	 * this via eval_const_expr() when boundParams are available. When they
	 * are not available, fast-path queries behave like every other query
	 * type: Citus increases the cost of the plan and thereby forces
	 * PostgreSQL to pick custom plans.
	 *
	 * Only the quals are resolved, because shard pruning depends solely on
	 * the filter on the distribution column.
	 */
	Node *unresolvedQuals = (Node *) originalQuery->jointree->quals;
	ParamListInfo paramsCopy = copyParamList(boundParams);

	originalQuery->jointree->quals = ResolveExternalParams(unresolvedQuals,
														   paramsCopy);

	/*
	 * The Citus planner relies on some of the transformations that constant
	 * evaluation applies to the parse tree.
	 */
	Node *foldedTargetList = eval_const_expressions(NULL,
													(Node *) parse->targetList);
	parse->targetList = (List *) foldedTargetList;

	Node *foldedQuals = eval_const_expressions(NULL,
											   (Node *) parse->jointree->quals);
	parse->jointree->quals = (Node *) foldedQuals;

	return GeneratePlaceHolderPlannedStmt(originalQuery);
}


/*
 * GeneratePlaceHolderPlannedStmt builds a planned statement whose plan tree
 * is a lone sequential scan over the relation accessed by the input query.
 *
 * The result is not a proper PlannedStmt (set_plan_references() is never
 * called on it and the quals are left unset), so it must not be handed to
 * the executor directly. Its purpose is to provide a placeholder whose
 * target list is properly set, which is what the router executor relies on.
 *
 * The function assumes (and asserts) that the input query has the form
 * defined by FastPathRouterQuery().
 */
PlannedStmt *
GeneratePlaceHolderPlannedStmt(Query *parse)
{
	PlannedStmt *placeholderStmt = makeNode(PlannedStmt);
	SeqScan *scanNode = makeNode(SeqScan);
	Plan *scanPlan = &scanNode->plan;

	AssertArg(FastPathRouterQuery(parse));

	/* a fast path query has exactly one relation RTE, so it is at index 1 */
	scanNode->scanrelid = 1;

	scanPlan->plan_node_id = 1;
	scanPlan->lefttree = NULL;
	scanPlan->righttree = NULL;
	scanPlan->qual = NULL;
	scanPlan->targetlist = copyObject(parse->targetList);

	placeholderStmt->planTree = (Plan *) scanPlan;
	placeholderStmt->stmt_len = parse->stmt_len;
	placeholderStmt->queryId = parse->queryId;
	placeholderStmt->commandType = parse->commandType;

	/* the range table is needed for access permission checks */
	placeholderStmt->rtable = copyObject(parse->rtable);

	Oid distributedRelationId = ExtractFirstDistributedTableId(parse);
	placeholderStmt->relationOids = list_make1_oid(distributedRelationId);

	return placeholderStmt;
}


/*
 * FastPathRouterQuery returns true when the given query is eligible for the
 * fast path router planner. The requirements are:
 *
 *   - The fast path planner is enabled (EnableFastPathRouterPlanner)
 *   - The command is a SELECT, UPDATE or DELETE
 *   - No CTEs, RETURNING list, sublinks, set operations or set returning
 *     functions in the target list
 *   - The range table holds a single entry, which is a plain relation that
 *     is either hash distributed or a reference table
 *   - For distributed tables, the WHERE clause contains an equality filter
 *     on the distribution key that is ANDed with every other filter, and
 *     the distribution key appears only once in the WHERE clause. So
 *     basically:
 *         SELECT ... FROM dist_table WHERE dist_key = X
 */
bool
FastPathRouterQuery(Query *query)
{
	FromExpr *fromExpr = query->jointree;

	if (!EnableFastPathRouterPlanner)
	{
		return false;
	}

	if (query->commandType != CMD_SELECT &&
		query->commandType != CMD_UPDATE &&
		query->commandType != CMD_DELETE)
	{
		return false;
	}

	/*
	 * Only very simple queries are handled here. Some of these checks might
	 * be stricter than necessary, but we prefer to stay on the safe side.
	 */
	if (query->cteList != NIL ||
		query->returningList != NIL ||
		query->hasSubLinks ||
		query->setOperations != NULL ||
		query->hasTargetSRFs ||
		query->hasModifyingCTE)
	{
		return false;
	}

	/* the range table must consist of exactly one entry ... */
	if (list_length(query->rtable) != 1)
	{
		return false;
	}

	/* ... and that entry must be a plain relation */
	RangeTblEntry *onlyRte = (RangeTblEntry *) linitial(query->rtable);
	if (onlyRte->rtekind != RTE_RELATION)
	{
		return false;
	}

	/* append/range distributed tables are not handled on the fast path */
	Oid relationId = onlyRte->relid;
	DistTableCacheEntry *tableEntry = DistributedTableCacheEntry(relationId);
	if (tableEntry->partitionMethod != DISTRIBUTE_BY_HASH &&
		tableEntry->partitionMethod != DISTRIBUTE_BY_NONE)
	{
		return false;
	}

	if (fromExpr == NULL)
	{
		return false;
	}

	/* distributed tables must come with a non-empty WHERE clause */
	if (tableEntry->partitionMethod != DISTRIBUTE_BY_NONE &&
		fromExpr->quals == NULL)
	{
		return false;
	}

	/* reference tables have no distribution key; nothing else to verify */
	Var *distributionKey = PartitionColumn(relationId, 1);
	if (distributionKey == NULL)
	{
		return true;
	}

	/* turn an implicit-AND list of expressions into a single expression tree */
	Node *whereClause = fromExpr->quals;
	if (whereClause != NULL && IsA(whereClause, List))
	{
		whereClause = (Node *) make_ands_explicit((List *) whereClause);
	}

	/*
	 * The distribution column must take part in a simple equality check that
	 * is placed under the top level conjunction operator. In simple words,
	 * we should have
	 *     WHERE dist_key = VALUE [AND ....];
	 *
	 * No other appearance of the distribution key is allowed in the quals.
	 *
	 * The logic might sound fuzzy since it consists of two separate checks:
	 *   (a) a top level AND operator has "dist_key = const" as one argument
	 *   (b) "dist_key" appears only once in the quals
	 *
	 * Splitting it this way keeps each check simple and avoids the various
	 * edge cases that arise with multiple distribution keys in the quals.
	 */
	if (!ConjunctionContainsColumnFilter(whereClause, distributionKey))
	{
		return false;
	}

	return !ColumnAppearsMultipleTimes(whereClause, distributionKey);
}


/*
 * ColumnAppearsMultipleTimes returns true if the given column is referenced
 * more than once in the quals.
 */
static bool
ColumnAppearsMultipleTimes(Node *quals, Var *distributionKey)
{
	bool seenOnce = false;
	ListCell *varCell = NULL;

	/* collect the column references in the quals and look for a repeat */
	List *referencedVars = pull_var_clause_default(quals);
	foreach(varCell, referencedVars)
	{
		Var *referencedVar = (Var *) lfirst(varCell);

		if (!equal(referencedVar, distributionKey))
		{
			continue;
		}

		if (seenOnce)
		{
			/* this is the second reference to the column */
			return true;
		}

		seenOnce = true;
	}

	return false;
}


/*
 * ConjunctionContainsColumnFilter returns true if the expression tree holds
 * an exact match (equality) filter on the provided column, and that filter
 * is connected to the rest of the tree only through AND operators.
 */
static bool
ConjunctionContainsColumnFilter(Node *node, Var *column)
{
	if (node == NULL)
	{
		return false;
	}

	if (IsA(node, OpExpr))
	{
		OpExpr *operatorExpr = (OpExpr *) node;

		/* the operator is inspected only for a simple dist key expression */
		return DistKeyInSimpleOpExpression((Expr *) operatorExpr, column) &&
			   OperatorImplementsEquality(operatorExpr->opno);
	}

	if (!IsA(node, BoolExpr))
	{
		return false;
	}

	BoolExpr *booleanExpr = (BoolExpr *) node;

	/*
	 * We only descend into AND expressions. A column filter that appears
	 * inside an OR clause is not considered, even when it is logically the
	 * same as a single value comparison (e.g. `dist_key = const OR false`).
	 */
	if (booleanExpr->boolop != AND_EXPR)
	{
		return false;
	}

	ListCell *argCell = NULL;
	foreach(argCell, booleanExpr->args)
	{
		Node *argNode = (Node *) lfirst(argCell);

		if (ConjunctionContainsColumnFilter(argNode, column))
		{
			return true;
		}
	}

	return false;
}


/*
 * DistKeyInSimpleOpExpression checks whether the given expression is a simple
 * binary operator expression of the form (dist_key OP param) or
 * (dist_key OP const), with the operands in either order. Only external
 * parameters and non-null constants are accepted. The operator itself is not
 * inspected here.
 */
static bool
DistKeyInSimpleOpExpression(Expr *clause, Var *distColumn)
{
	if (!is_opclause(clause) || list_length(((OpExpr *) clause)->args) != 2)
	{
		/* not a binary opclause */
		return false;
	}

	/* strip coercions before looking at the operand types */
	Node *leftOperand = strip_implicit_coercions(get_leftop(clause));
	Node *rightOperand = strip_implicit_coercions(get_rightop(clause));

	Var *columnInExpr = NULL;
	Node *valueOperand = NULL;

	if (IsA(leftOperand, Var) &&
		(IsA(rightOperand, Param) || IsA(rightOperand, Const)))
	{
		columnInExpr = (Var *) leftOperand;
		valueOperand = rightOperand;
	}
	else if (IsA(rightOperand, Var) &&
			 (IsA(leftOperand, Param) || IsA(leftOperand, Const)))
	{
		columnInExpr = (Var *) rightOperand;
		valueOperand = leftOperand;
	}
	else
	{
		return false;
	}

	if (IsA(valueOperand, Param))
	{
		/* we can only handle param_externs */
		if (((Param *) valueOperand)->paramkind != PARAM_EXTERN)
		{
			return false;
		}
	}
	else if (((Const *) valueOperand)->constisnull)
	{
		/* we can only handle non-null constants */
		return false;
	}

	/* at this point we should have the columnInExpr */
	Assert(columnInExpr);

	return equal(distColumn, columnInExpr);
}