From 6ce2803777736f459995cc88fb7957077aaf0b83 Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Thu, 30 Apr 2020 10:41:33 +0200 Subject: [PATCH] Make sure we don't wrap GROUP BY expressions in any_value --- .../planner/multi_physical_planner.c | 163 ++++++++++++++---- .../planner/query_pushdown_planning.c | 16 +- .../distributed/multi_physical_planner.h | 5 +- .../expected/subquery_complex_target_list.out | 56 ++++++ .../sql/subquery_complex_target_list.sql | 37 ++++ 5 files changed, 238 insertions(+), 39 deletions(-) diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index d8861da18..528b2e812 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -95,6 +95,24 @@ bool EnableUniqueJobIds = true; static List *OperatorCache = NIL; +/* context passed down in AddAnyValueAggregates mutator */ +typedef struct AddAnyValueAggregatesContext +{ + /* SortGroupClauses corresponding to the GROUP BY clause */ + List *groupClauseList; + + /* TargetEntry's to which the GROUP BY clauses refer */ + List *groupByTargetEntryList; + + /* + * haveNonVarGrouping is true if there are expressions in the + * GROUP BY target entries. We use this as an optimisation to + * skip expensive checks when possible. + */ + bool haveNonVarGrouping; +} AddAnyValueAggregatesContext; + + /* Local functions forward declarations for job creation */ static Job * BuildJobTree(MultiTreeRoot *multiTree); static MultiNode * LeftMostNode(MultiTreeRoot *multiTree); @@ -105,6 +123,7 @@ static Query * BuildReduceQuery(MultiExtendedOp *extendedOpNode, List *dependent static List * BaseRangeTableList(MultiNode *multiNode); static List * QueryTargetList(MultiNode *multiNode); static List * TargetEntryList(List *expressionList); +static Node * AddAnyValueAggregates(Node *node, AddAnyValueAggregatesContext *context); static List * QueryGroupClauseList(MultiNode *multiNode); static List * QuerySelectClauseList(MultiNode *multiNode); static List * QueryJoinClauseList(MultiNode *multiNode); @@ -696,13 +715,11 @@ BuildJobQuery(MultiNode *multiNode, List *dependentJobList) */ if (groupClauseList != NIL && isRepartitionJoin) { - targetList = (List *) expression_tree_mutator((Node *) targetList, - AddAnyValueAggregates, - groupClauseList); + targetList = (List *) WrapUngroupedVarsInAnyValueAggregate( + (Node *) targetList, groupClauseList, targetList, true); - havingQual = expression_tree_mutator((Node *) havingQual, - AddAnyValueAggregates, - groupClauseList); + havingQual = WrapUngroupedVarsInAnyValueAggregate( + (Node *) havingQual, groupClauseList, targetList, false); } /* @@ -974,23 +991,116 @@ TargetEntryList(List *expressionList) /* - * AddAnyValueAggregates wraps all vars that do not appear in the GROUP BY - * clause or are inside an aggregate function in an any_value aggregate - * function. This is needed for repartition joins because primary keys are not - * present on intermediate tables. + * WrapUngroupedVarsInAnyValueAggregate finds Var nodes in the expression + * that do not refer to any GROUP BY column and wraps them in an any_value + * aggregate. These columns are allowed when the GROUP BY is on a primary + * key of a relation, but not if we wrap the relation in a subquery. + * However, since we still know the value is unique, any_value gives the + * right result. */ Node * -AddAnyValueAggregates(Node *node, void *context) +WrapUngroupedVarsInAnyValueAggregate(Node *expression, List *groupClauseList, + List *targetList, bool checkExpressionEquality) +{ + if (expression == NULL) + { + return NULL; + } + + AddAnyValueAggregatesContext context; + context.groupClauseList = groupClauseList; + context.groupByTargetEntryList = GroupTargetEntryList(groupClauseList, targetList); + context.haveNonVarGrouping = false; + + if (checkExpressionEquality) + { + /* + * If the GROUP BY contains non-Var expressions, we need to do an expensive + * subexpression equality check. + */ + TargetEntry *targetEntry = NULL; + foreach_ptr(targetEntry, context.groupByTargetEntryList) + { + if (!IsA(targetEntry->expr, Var)) + { + context.haveNonVarGrouping = true; + break; + } + } + } + + /* put the result in the same memory context */ + MemoryContext nodeContext = GetMemoryChunkContext(expression); + MemoryContext oldContext = MemoryContextSwitchTo(nodeContext); + + Node *result = expression_tree_mutator(expression, AddAnyValueAggregates, + &context); + + MemoryContextSwitchTo(oldContext); + + return result; +} + + +/* + * AddAnyValueAggregates wraps all vars that do not appear in the GROUP BY + * clause or are inside an aggregate function in an any_value aggregate + * function. This is needed because postgres allows columns that are not + * in the GROUP BY to appear on the target list as long as the primary key + * of the table is in the GROUP BY, but we sometimes wrap the join tree + * in a subquery in which case the primary key information is lost. + * + * This function copies parts of the node tree, but may contain references + * to the original node tree. + * + * The implementation is derived from / inspired by + * check_ungrouped_columns_walker. + */ +static Node * +AddAnyValueAggregates(Node *node, AddAnyValueAggregatesContext *context) { - List *groupClauseList = context; if (node == NULL) { return node; } - if (IsA(node, Var)) + if (IsA(node, Aggref) || IsA(node, GroupingFunc)) + { + /* any column is allowed to appear in an aggregate or grouping */ + return node; + } + else if (IsA(node, Var)) { Var *var = (Var *) node; + + /* + * Check whether this Var appears in the GROUP BY. + */ + TargetEntry *groupByTargetEntry = NULL; + foreach_ptr(groupByTargetEntry, context->groupByTargetEntryList) + { + if (!IsA(groupByTargetEntry->expr, Var)) + { + continue; + } + + Var *groupByVar = (Var *) groupByTargetEntry->expr; + + /* we should only be doing this at the top level of the query */ + Assert(groupByVar->varlevelsup == 0); + + if (var->varno == groupByVar->varno && + var->varattno == groupByVar->varattno) + { + /* this Var is in the GROUP BY, do not wrap it */ + return node; + } + } + + /* + * We have found a Var that does not appear in the GROUP BY. + * Wrap it in an any_value aggregate. + */ Aggref *agg = makeNode(Aggref); agg->aggfnoid = CitusAnyValueFunctionId(); agg->aggtype = var->vartype; @@ -1002,31 +1112,24 @@ AddAnyValueAggregates(Node *node, void *context) agg->aggcollid = exprCollation((Node *) var); return (Node *) agg; } - if (IsA(node, TargetEntry)) + else if (context->haveNonVarGrouping) { - TargetEntry *targetEntry = (TargetEntry *) node; - - /* - * Stop searching this part of the tree if the targetEntry is part of - * the group by clause. + * The GROUP BY contains at least one expression. Check whether the + * current expression is equal to one of the GROUP BY expressions. + * Otherwise, continue to descend into subexpressions. */ - if (targetEntry->ressortgroupref != 0) + TargetEntry *groupByTargetEntry = NULL; + foreach_ptr(groupByTargetEntry, context->groupByTargetEntryList) { - SortGroupClause *sortGroupClause = NULL; - foreach_ptr(sortGroupClause, groupClauseList) + if (equal(node, groupByTargetEntry->expr)) { - if (sortGroupClause->tleSortGroupRef == targetEntry->ressortgroupref) - { - return node; - } + /* do not descend into mutator, all Vars are safe */ + return node; } } } - if (IsA(node, Aggref) || IsA(node, GroupingFunc)) - { - return node; - } + return expression_tree_mutator(node, AddAnyValueAggregates, context); } diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index cd21654d7..175a42bbf 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -1648,15 +1648,15 @@ SubqueryPushdownMultiNodeTree(Query *originalQuery) */ if (extendedOpNode->groupClauseList != NIL) { - extendedOpNode->targetList = - (List *) expression_tree_mutator((Node *) extendedOpNode->targetList, - AddAnyValueAggregates, - extendedOpNode->groupClauseList); + extendedOpNode->targetList = (List *) WrapUngroupedVarsInAnyValueAggregate( + (Node *) extendedOpNode->targetList, + extendedOpNode->groupClauseList, + extendedOpNode->targetList, true); - extendedOpNode->havingQual = - expression_tree_mutator((Node *) extendedOpNode->havingQual, - AddAnyValueAggregates, - extendedOpNode->groupClauseList); + extendedOpNode->havingQual = WrapUngroupedVarsInAnyValueAggregate( + (Node *) extendedOpNode->havingQual, + extendedOpNode->groupClauseList, + extendedOpNode->targetList, false); } /* diff --git a/src/include/distributed/multi_physical_planner.h b/src/include/distributed/multi_physical_planner.h index 2da3f7758..dca343e27 100644 --- a/src/include/distributed/multi_physical_planner.h +++ b/src/include/distributed/multi_physical_planner.h @@ -497,7 +497,10 @@ extern Task * CreateBasicTask(uint64 jobId, uint32 taskId, TaskType taskType, char *queryString); extern OpExpr * MakeOpExpression(Var *variable, int16 strategyNumber); -extern Node * AddAnyValueAggregates(Node *node, void *context); +extern Node * WrapUngroupedVarsInAnyValueAggregate(Node *expression, + List *groupClauseList, + List *targetList, + bool checkExpressionEquality); /* * Function declarations for building, updating constraints and simple operator diff --git a/src/test/regress/expected/subquery_complex_target_list.out b/src/test/regress/expected/subquery_complex_target_list.out index 25210ed57..f1009d06d 100644 --- a/src/test/regress/expected/subquery_complex_target_list.out +++ b/src/test/regress/expected/subquery_complex_target_list.out @@ -509,6 +509,62 @@ GROUP BY a.key ORDER BY 3, 2, 1; key-1 | (key-1,value-2,"Wed Jan 01 00:00:00 2020") | 1 (2 rows) +-- Of the target list entries, v1-v3 should be wrapped in any_value as they do +-- not appear in GROUP BY. The append happens on the coordinator in that case. +-- Vars in the HAVING that do not appear in the GROUP BY are also wrapped. +SELECT + a.key as k1, + a.key as k2, + a.key || '_append' as k3, + a.value as v1, + a.value as v2, + a.value || '_notgrouped' as v3, + a.value || '_append' as va1, + a.value || '_append' as va2, + a.value || '_append' || '_more' as va2, + count(*) +FROM items a LEFT JOIN other_items b ON (a.key = b.key) +GROUP BY a.key, a.value ||'_append' +HAVING length(a.key) + length(a.value) < length(a.value || '_append') +ORDER BY 1; + k1 | k2 | k3 | v1 | v2 | v3 | va1 | va2 | va2 | count +--------------------------------------------------------------------- + key-1 | key-1 | key-1_append | value-2 | value-2 | value-2_notgrouped | value-2_append | value-2_append | value-2_append_more | 1 + key-2 | key-2 | key-2_append | value-1 | value-1 | value-1_notgrouped | value-1_append | value-1_append | value-1_append_more | 1 +(2 rows) + +SELECT coordinator_plan($$ +EXPLAIN (VERBOSE ON, COSTS OFF) +SELECT + a.key as k1, + a.key as k2, + a.key || '_append' as k3, + a.value as v1, + a.value as v2, + a.value || '_notgrouped' as v3, + a.value || '_append' as va1, + a.value || '_append' as va2, + a.value || '_append' || '_more' as va3, + count(*) +FROM items a LEFT JOIN other_items b ON (a.key = b.key) +GROUP BY a.key, a.value ||'_append' +HAVING length(a.key) + length(a.value) < length(a.value || '_append') +ORDER BY 1 +$$); + coordinator_plan +--------------------------------------------------------------------- + Sort + Output: remote_scan.k1, remote_scan.k2, remote_scan.k3, (any_value(remote_scan.v1)), (any_value(remote_scan.v2)), ((any_value(remote_scan.v3) || '_notgrouped'::text)), remote_scan.va1, remote_scan.va2, remote_scan.va3, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Key: remote_scan.k1 + -> HashAggregate + Output: remote_scan.k1, remote_scan.k2, remote_scan.k3, any_value(remote_scan.v1), any_value(remote_scan.v2), (any_value(remote_scan.v3) || '_notgrouped'::text), remote_scan.va1, remote_scan.va2, remote_scan.va3, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Group Key: remote_scan.k1, remote_scan.va1 + Filter: ((length(remote_scan.worker_column_11) + length(any_value(remote_scan.worker_column_12))) < length((any_value(remote_scan.worker_column_13) || '_append'::text))) + -> Custom Scan (Citus Adaptive) + Output: remote_scan.k1, remote_scan.k2, remote_scan.k3, remote_scan.v1, remote_scan.v2, remote_scan.v3, remote_scan.va1, remote_scan.va2, remote_scan.va3, remote_scan.count, remote_scan.worker_column_11, remote_scan.worker_column_12, remote_scan.worker_column_13 + Task Count: 4 +(10 rows) + SELECT a FROM items a ORDER BY key; a --------------------------------------------------------------------- diff --git a/src/test/regress/sql/subquery_complex_target_list.sql b/src/test/regress/sql/subquery_complex_target_list.sql index 13bf96671..d2d3a2c5e 100644 --- a/src/test/regress/sql/subquery_complex_target_list.sql +++ b/src/test/regress/sql/subquery_complex_target_list.sql @@ -351,6 +351,43 @@ SELECT a.key, a, count(b) FROM items a LEFT JOIN other_items b ON (a.key = b.key) GROUP BY a.key ORDER BY 3, 2, 1; +-- Of the target list entries, v1-v3 should be wrapped in any_value as they do +-- not appear in GROUP BY. The append happens on the coordinator in that case. +-- Vars in the HAVING that do not appear in the GROUP BY are also wrapped. +SELECT + a.key as k1, + a.key as k2, + a.key || '_append' as k3, + a.value as v1, + a.value as v2, + a.value || '_notgrouped' as v3, + a.value || '_append' as va1, + a.value || '_append' as va2, + a.value || '_append' || '_more' as va2, + count(*) +FROM items a LEFT JOIN other_items b ON (a.key = b.key) +GROUP BY a.key, a.value ||'_append' +HAVING length(a.key) + length(a.value) < length(a.value || '_append') +ORDER BY 1; + +SELECT coordinator_plan($$ +EXPLAIN (VERBOSE ON, COSTS OFF) +SELECT + a.key as k1, + a.key as k2, + a.key || '_append' as k3, + a.value as v1, + a.value as v2, + a.value || '_notgrouped' as v3, + a.value || '_append' as va1, + a.value || '_append' as va2, + a.value || '_append' || '_more' as va3, + count(*) +FROM items a LEFT JOIN other_items b ON (a.key = b.key) +GROUP BY a.key, a.value ||'_append' +HAVING length(a.key) + length(a.value) < length(a.value || '_append') +ORDER BY 1 +$$); SELECT a FROM items a ORDER BY key; SELECT a FROM items a WHERE key = 'key-1'; SELECT a FROM (SELECT a, random() FROM items a) b ORDER BY a;