Fix error when using LEFT JOIN with GROUP BY on primary key

2020-03-26 20:15:24 +01:00 · 2020-03-26 20:15:24 +01:00 · 331b45348c
parent e1802c5c00
commit 331b45348c
6 changed files with 184 additions and 10 deletions
--- a/src/backend/distributed/planner/multi_logical_optimizer.c
+++ b/src/backend/distributed/planner/multi_logical_optimizer.c
@ -274,6 +274,7 @@ static List * WorkerAggregateExpressionList(Aggref *originalAggregate,
 											WorkerAggregateWalkerContext *walkerContextry);
 static AggregateType GetAggregateType(Aggref *aggregatExpression);
 static Oid AggregateArgumentType(Aggref *aggregate);
 static Expr * FirstAggregateArgument(Aggref *aggregate);
 static bool AggregateEnabledCustom(Aggref *aggregateExpression);
 static Oid CitusFunctionOidWithSignature(char *functionName, int numargs, Oid *argtypes);
 static Oid WorkerPartialAggOid(void);
@ -2028,6 +2029,12 @@ MasterAggregateExpression(Aggref *originalAggregate,
 		Oid aggregateFunctionId = AggregateFunctionOid(aggregateName, workerReturnType);
 		Oid masterReturnType = get_func_rettype(aggregateFunctionId);
+		Aggref *newMasterAggregate = copyObject(originalAggregate);
+		newMasterAggregate->aggdistinct = NULL;
+		newMasterAggregate->aggfnoid = aggregateFunctionId;
+		newMasterAggregate->aggtype = masterReturnType;
+		newMasterAggregate->aggfilter = NULL;
 		/*
 		 * If return type aggregate is anyelement, its actual return type is
 		 * determined on the type of its argument. So we replace it with the
@ -2035,13 +2042,11 @@ MasterAggregateExpression(Aggref *originalAggregate,
 		 */
 		if (masterReturnType == ANYELEMENTOID)
 		{
-			masterReturnType = workerReturnType;
+			newMasterAggregate->aggtype = workerReturnType;
 			Expr *firstArg = FirstAggregateArgument(originalAggregate);
 			newMasterAggregate->aggcollid = exprCollation((Node *) firstArg);
 		}
-		Aggref *newMasterAggregate = copyObject(originalAggregate);
-		newMasterAggregate->aggdistinct = NULL;
-		newMasterAggregate->aggfnoid = aggregateFunctionId;
-		newMasterAggregate->aggtype = masterReturnType;
-		newMasterAggregate->aggfilter = NULL;
 		Var *column = makeVar(masterTableId, walkerContext->columnId, workerReturnType,
 							  workerReturnTypeMod, workerCollationId, columnLevelsUp);
@ -3213,6 +3218,22 @@ AggregateArgumentType(Aggref *aggregate)
 }
 /*
 * FirstAggregateArgument hands back the expression carried by the first
 * entry in the aggregate's argument list. The list is expected to hold at
 * least one entry, which is checked with an Assert.
 */
 static Expr *
 FirstAggregateArgument(Aggref *aggregate)
 {
 	Assert(list_length(aggregate->args) >= 1);
 	TargetEntry *firstEntry = (TargetEntry *) linitial(aggregate->args);
 	return firstEntry->expr;
 }
 /*
 * AggregateEnabledCustom returns whether given aggregate can be
 * distributed across workers using worker_partial_agg & coord_combine_agg.
--- a/src/backend/distributed/planner/multi_physical_planner.c
+++ b/src/backend/distributed/planner/multi_physical_planner.c
@ -215,7 +215,6 @@ static StringInfo IntermediateTableQueryString(uint64 jobId, uint32 taskIdIndex,
 static uint32 FinalTargetEntryCount(List *targetEntryList);
 static bool CoPlacedShardIntervals(ShardInterval *firstInterval,
 								   ShardInterval *secondInterval);
 static Node * AddAnyValueAggregates(Node *node, void *context);
 /*
@ -974,7 +973,7 @@ TargetEntryList(List *expressionList)
 * function. This is needed for repartition joins because primary keys are not
 * present on intermediate tables.
 */
-static Node *
+Node *
 AddAnyValueAggregates(Node *node, void *context)
 {
 	List *groupClauseList = context;
@ -994,6 +993,7 @@ AddAnyValueAggregates(Node *node, void *context)
 		agg->aggtranstype = InvalidOid;
 		agg->aggargtypes = list_make1_oid(var->vartype);
 		agg->aggsplit = AGGSPLIT_SIMPLE;
 		agg->aggcollid = exprCollation((Node *) var);
 		return (Node *) agg;
 	}
 	if (IsA(node, TargetEntry))
--- a/src/backend/distributed/planner/query_pushdown_planning.c
+++ b/src/backend/distributed/planner/query_pushdown_planning.c
@ -1636,6 +1636,27 @@ SubqueryPushdownMultiNodeTree(Query *originalQuery)
 			(Node *) make_ands_implicit((Expr *) extendedOpNode->havingQual);
 	}
 	/*
 	 * Group by on primary key allows all columns to appear in the target
 	 * list, but once we wrap the join tree into a subquery the GROUP BY
 	 * will no longer directly refer to the primary key and referencing
 	 * columns that are not in the GROUP BY would result in an error. To
 	 * prevent that we wrap all the columns that do not appear in the
 	 * GROUP BY in an any_value aggregate.
 	 */
 	if (extendedOpNode->groupClauseList != NIL)
 	{
 		extendedOpNode->targetList =
 			(List *) expression_tree_mutator((Node *) extendedOpNode->targetList,
 											 AddAnyValueAggregates,
 											 extendedOpNode->groupClauseList);
 		extendedOpNode->havingQual =
 			expression_tree_mutator((Node *) extendedOpNode->havingQual,
 									AddAnyValueAggregates,
 									extendedOpNode->groupClauseList);
 	}
 	/*
 	 * Postgres standard planner evaluates expressions in the LIMIT/OFFSET clauses.
 	 * Since we're using original query here, we should manually evaluate the
--- a/src/include/distributed/multi_physical_planner.h
+++ b/src/include/distributed/multi_physical_planner.h
@ -453,6 +453,7 @@ extern Task * CreateBasicTask(uint64 jobId, uint32 taskId, TaskType taskType,
 							  char *queryString);
 extern OpExpr * MakeOpExpression(Var *variable, int16 strategyNumber);
 extern Node * AddAnyValueAggregates(Node *node, void *context);
 /*
 * Function declarations for building, updating constraints and simple operator
--- a/src/test/regress/expected/subquery_complex_target_list.out
+++ b/src/test/regress/expected/subquery_complex_target_list.out
@ -415,5 +415,91 @@ DEBUG:  Plan XXX query after replacing subqueries and CTEs: SELECT event_type, c
 COMMIT;
 SET client_min_messages TO DEFAULT;
 CREATE TABLE items (key text primary key, value text not null, t timestamp);
 SELECT create_distributed_table('items','key');
 create_distributed_table
 ---------------------------------------------------------------------
 (1 row)
 INSERT INTO items VALUES ('key-1','value-2', '2020-01-01 00:00');
 INSERT INTO items VALUES ('key-2','value-1', '2020-02-02 00:00');
 CREATE TABLE other_items (key text primary key, value text not null);
 SELECT create_distributed_table('other_items','key');
 create_distributed_table
 ---------------------------------------------------------------------
 (1 row)
 INSERT INTO other_items VALUES ('key-1','value-2');
 -- LEFT JOINs are wrapped into a subquery under the covers, which causes GROUP BY
 -- to be separated from the LEFT JOIN. If the GROUP BY is on a primary key we can
 -- normally use any column even ones that are not in the GROUP BY, but not when
 -- it is in the outer query. In that case, we use the any_value aggregate.
 SELECT key, a.value, count(b.value), t
 FROM items a LEFT JOIN other_items b USING (key)
 GROUP BY key HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
  key  |  value  | count |            t
 ---------------------------------------------------------------------
 key-2 | value-1 |     0 | Sun Feb 02 00:00:00 2020
 (1 row)
 SELECT key, a.value, count(b.value), t
 FROM items a LEFT JOIN other_items b USING (key)
 GROUP BY key, t HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
  key  |  value  | count |            t
 ---------------------------------------------------------------------
 key-2 | value-1 |     0 | Sun Feb 02 00:00:00 2020
 (1 row)
 -- make sure the same logic works for regular joins
 SELECT key, a.value, count(b.value), t
 FROM items a JOIN other_items b USING (key)
 GROUP BY key HAVING a.value = 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
  key  |  value  | count |            t
 ---------------------------------------------------------------------
 key-1 | value-2 |     1 | Wed Jan 01 00:00:00 2020
 (1 row)
 -- subqueries also trigger wrapping
 SELECT key, a.value, count(b.value), t
 FROM items a JOIN (SELECT key, value, random() FROM other_items) b USING (key)
 GROUP BY key ORDER BY 3, 2, 1;
  key  |  value  | count |            t
 ---------------------------------------------------------------------
 key-1 | value-2 |     1 | Wed Jan 01 00:00:00 2020
 (1 row)
 -- pushdownable window functions also trigger wrapping
 SELECT a.key, a.value, count(a.value) OVER (PARTITION BY a.key)
 FROM items a JOIN other_items b ON (a.key = b.key)
 GROUP BY a.key ORDER BY 3, 2, 1;
  key  |  value  | count
 ---------------------------------------------------------------------
 key-1 | value-2 |     1
 (1 row)
 -- left join with non-pushdownable window functions
 SELECT a.key, a.value, count(a.value) OVER ()
 FROM items a LEFT JOIN other_items b ON (a.key = b.key)
 GROUP BY a.key ORDER BY 3, 2, 1;
  key  |  value  | count
 ---------------------------------------------------------------------
 key-2 | value-1 |     2
 key-1 | value-2 |     2
 (2 rows)
 -- function joins (actually with read_intermediate_results) also trigger wrapping
 SELECT key, a.value, sum(b)
 FROM items a JOIN generate_series(1,10) b ON (a.key = 'key-'||b)
 GROUP BY key ORDER BY 3, 2, 1;
  key  |  value  | sum
 ---------------------------------------------------------------------
 key-1 | value-2 |   1
 key-2 | value-1 |   2
 (2 rows)
 DROP SCHEMA subquery_complex CASCADE;
-SET search_path TO public;
+NOTICE:  drop cascades to 2 other objects
 DETAIL:  drop cascades to table items
 drop cascades to table other_items
--- a/src/test/regress/sql/subquery_complex_target_list.sql
+++ b/src/test/regress/sql/subquery_complex_target_list.sql
@ -300,5 +300,50 @@ COMMIT;
 SET client_min_messages TO DEFAULT;
 CREATE TABLE items (key text primary key, value text not null, t timestamp);
 SELECT create_distributed_table('items','key');
 INSERT INTO items VALUES ('key-1','value-2', '2020-01-01 00:00');
 INSERT INTO items VALUES ('key-2','value-1', '2020-02-02 00:00');
 CREATE TABLE other_items (key text primary key, value text not null);
 SELECT create_distributed_table('other_items','key');
 INSERT INTO other_items VALUES ('key-1','value-2');
 -- LEFT JOINs are wrapped into a subquery under the covers, which causes GROUP BY
 -- to be separated from the LEFT JOIN. If the GROUP BY is on a primary key we can
 -- normally use any column even ones that are not in the GROUP BY, but not when
 -- it is in the outer query. In that case, we use the any_value aggregate.
 SELECT key, a.value, count(b.value), t
 FROM items a LEFT JOIN other_items b USING (key)
 GROUP BY key HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
 SELECT key, a.value, count(b.value), t
 FROM items a LEFT JOIN other_items b USING (key)
 GROUP BY key, t HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
 -- make sure the same logic works for regular joins
 SELECT key, a.value, count(b.value), t
 FROM items a JOIN other_items b USING (key)
 GROUP BY key HAVING a.value = 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
 -- subqueries also trigger wrapping
 SELECT key, a.value, count(b.value), t
 FROM items a JOIN (SELECT key, value, random() FROM other_items) b USING (key)
 GROUP BY key ORDER BY 3, 2, 1;
 -- pushdownable window functions also trigger wrapping
 SELECT a.key, a.value, count(a.value) OVER (PARTITION BY a.key)
 FROM items a JOIN other_items b ON (a.key = b.key)
 GROUP BY a.key ORDER BY 3, 2, 1;
 -- left join with non-pushdownable window functions
 SELECT a.key, a.value, count(a.value) OVER ()
 FROM items a LEFT JOIN other_items b ON (a.key = b.key)
 GROUP BY a.key ORDER BY 3, 2, 1;
 -- function joins (actually with read_intermediate_results) also trigger wrapping
 SELECT key, a.value, sum(b)
 FROM items a JOIN generate_series(1,10) b ON (a.key = 'key-'||b)
 GROUP BY key ORDER BY 3, 2, 1;
 DROP SCHEMA subquery_complex CASCADE;
 SET search_path TO public;