Fix error when using LEFT JOIN with GROUP BY on primary key

pull/3668/head
Marco Slot 2020-03-26 20:15:24 +01:00
parent e1802c5c00
commit 331b45348c
6 changed files with 184 additions and 10 deletions

View File

@ -274,6 +274,7 @@ static List * WorkerAggregateExpressionList(Aggref *originalAggregate,
WorkerAggregateWalkerContext *walkerContextry);
static AggregateType GetAggregateType(Aggref *aggregatExpression);
static Oid AggregateArgumentType(Aggref *aggregate);
static Expr * FirstAggregateArgument(Aggref *aggregate);
static bool AggregateEnabledCustom(Aggref *aggregateExpression);
static Oid CitusFunctionOidWithSignature(char *functionName, int numargs, Oid *argtypes);
static Oid WorkerPartialAggOid(void);
@ -2028,6 +2029,12 @@ MasterAggregateExpression(Aggref *originalAggregate,
Oid aggregateFunctionId = AggregateFunctionOid(aggregateName, workerReturnType);
Oid masterReturnType = get_func_rettype(aggregateFunctionId);
Aggref *newMasterAggregate = copyObject(originalAggregate);
newMasterAggregate->aggdistinct = NULL;
newMasterAggregate->aggfnoid = aggregateFunctionId;
newMasterAggregate->aggtype = masterReturnType;
newMasterAggregate->aggfilter = NULL;
/*
* If return type aggregate is anyelement, its actual return type is
* determined on the type of its argument. So we replace it with the
@ -2035,13 +2042,11 @@ MasterAggregateExpression(Aggref *originalAggregate,
*/
if (masterReturnType == ANYELEMENTOID)
{
masterReturnType = workerReturnType;
newMasterAggregate->aggtype = workerReturnType;
Expr *firstArg = FirstAggregateArgument(originalAggregate);
newMasterAggregate->aggcollid = exprCollation((Node *) firstArg);
}
Aggref *newMasterAggregate = copyObject(originalAggregate);
newMasterAggregate->aggdistinct = NULL;
newMasterAggregate->aggfnoid = aggregateFunctionId;
newMasterAggregate->aggtype = masterReturnType;
newMasterAggregate->aggfilter = NULL;
Var *column = makeVar(masterTableId, walkerContext->columnId, workerReturnType,
workerReturnTypeMod, workerCollationId, columnLevelsUp);
@ -3213,6 +3218,22 @@ AggregateArgumentType(Aggref *aggregate)
}
/*
* FirstAggregateArgument returns the first argument of the aggregate.
*/
static Expr *
FirstAggregateArgument(Aggref *aggregate)
{
List *argumentList = aggregate->args;
Assert(list_length(argumentList) >= 1);
TargetEntry *argument = (TargetEntry *) linitial(argumentList);
return argument->expr;
}
/*
* AggregateEnabledCustom returns whether given aggregate can be
* distributed across workers using worker_partial_agg & coord_combine_agg.

View File

@ -215,7 +215,6 @@ static StringInfo IntermediateTableQueryString(uint64 jobId, uint32 taskIdIndex,
static uint32 FinalTargetEntryCount(List *targetEntryList);
static bool CoPlacedShardIntervals(ShardInterval *firstInterval,
ShardInterval *secondInterval);
static Node * AddAnyValueAggregates(Node *node, void *context);
/*
@ -974,7 +973,7 @@ TargetEntryList(List *expressionList)
* function. This is needed for repartition joins because primary keys are not
* present on intermediate tables.
*/
static Node *
Node *
AddAnyValueAggregates(Node *node, void *context)
{
List *groupClauseList = context;
@ -994,6 +993,7 @@ AddAnyValueAggregates(Node *node, void *context)
agg->aggtranstype = InvalidOid;
agg->aggargtypes = list_make1_oid(var->vartype);
agg->aggsplit = AGGSPLIT_SIMPLE;
agg->aggcollid = exprCollation((Node *) var);
return (Node *) agg;
}
if (IsA(node, TargetEntry))

View File

@ -1636,6 +1636,27 @@ SubqueryPushdownMultiNodeTree(Query *originalQuery)
(Node *) make_ands_implicit((Expr *) extendedOpNode->havingQual);
}
/*
* Group by on primary key allows all columns to appear in the target
* list, but once we wrap the join tree into a subquery the GROUP BY
* will no longer directly refer to the primary key and referencing
* columns that are not in the GROUP BY would result in an error. To
* prevent that we wrap all the columns that do not appear in the
* GROUP BY in an any_value aggregate.
*/
if (extendedOpNode->groupClauseList != NIL)
{
extendedOpNode->targetList =
(List *) expression_tree_mutator((Node *) extendedOpNode->targetList,
AddAnyValueAggregates,
extendedOpNode->groupClauseList);
extendedOpNode->havingQual =
expression_tree_mutator((Node *) extendedOpNode->havingQual,
AddAnyValueAggregates,
extendedOpNode->groupClauseList);
}
/*
* Postgres standard planner evaluates expressions in the LIMIT/OFFSET clauses.
* Since we're using original query here, we should manually evaluate the

View File

@ -453,6 +453,7 @@ extern Task * CreateBasicTask(uint64 jobId, uint32 taskId, TaskType taskType,
char *queryString);
extern OpExpr * MakeOpExpression(Var *variable, int16 strategyNumber);
extern Node * AddAnyValueAggregates(Node *node, void *context);
/*
* Function declarations for building, updating constraints and simple operator

View File

@ -415,5 +415,91 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT event_type, c
COMMIT;
SET client_min_messages TO DEFAULT;
CREATE TABLE items (key text primary key, value text not null, t timestamp);
SELECT create_distributed_table('items','key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
INSERT INTO items VALUES ('key-1','value-2', '2020-01-01 00:00');
INSERT INTO items VALUES ('key-2','value-1', '2020-02-02 00:00');
CREATE TABLE other_items (key text primary key, value text not null);
SELECT create_distributed_table('other_items','key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
INSERT INTO other_items VALUES ('key-1','value-2');
-- LEFT JOINs are wrapped into a subquery under the covers, which causes GROUP BY
-- to be separated from the LEFT JOIN. If the GROUP BY is on a primary key we can
-- normally use any column even ones that are not in the GROUP BY, but not when
-- it is in the outer query. In that case, we use the any_value aggregate.
SELECT key, a.value, count(b.value), t
FROM items a LEFT JOIN other_items b USING (key)
GROUP BY key HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
key | value | count | t
---------------------------------------------------------------------
key-2 | value-1 | 0 | Sun Feb 02 00:00:00 2020
(1 row)
SELECT key, a.value, count(b.value), t
FROM items a LEFT JOIN other_items b USING (key)
GROUP BY key, t HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
key | value | count | t
---------------------------------------------------------------------
key-2 | value-1 | 0 | Sun Feb 02 00:00:00 2020
(1 row)
-- make sure the same logic works for regular joins
SELECT key, a.value, count(b.value), t
FROM items a JOIN other_items b USING (key)
GROUP BY key HAVING a.value = 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
key | value | count | t
---------------------------------------------------------------------
key-1 | value-2 | 1 | Wed Jan 01 00:00:00 2020
(1 row)
-- subqueries also trigger wrapping
SELECT key, a.value, count(b.value), t
FROM items a JOIN (SELECT key, value, random() FROM other_items) b USING (key)
GROUP BY key ORDER BY 3, 2, 1;
key | value | count | t
---------------------------------------------------------------------
key-1 | value-2 | 1 | Wed Jan 01 00:00:00 2020
(1 row)
-- pushdownable window functions also trigger wrapping
SELECT a.key, a.value, count(a.value) OVER (PARTITION BY a.key)
FROM items a JOIN other_items b ON (a.key = b.key)
GROUP BY a.key ORDER BY 3, 2, 1;
key | value | count
---------------------------------------------------------------------
key-1 | value-2 | 1
(1 row)
-- left join with non-pushdownable window functions
SELECT a.key, a.value, count(a.value) OVER ()
FROM items a LEFT JOIN other_items b ON (a.key = b.key)
GROUP BY a.key ORDER BY 3, 2, 1;
key | value | count
---------------------------------------------------------------------
key-2 | value-1 | 2
key-1 | value-2 | 2
(2 rows)
-- function joins (actually with read_intermediate_results) also trigger wrapping
SELECT key, a.value, sum(b)
FROM items a JOIN generate_series(1,10) b ON (a.key = 'key-'||b)
GROUP BY key ORDER BY 3, 2, 1;
key | value | sum
---------------------------------------------------------------------
key-1 | value-2 | 1
key-2 | value-1 | 2
(2 rows)
DROP SCHEMA subquery_complex CASCADE;
SET search_path TO public;
NOTICE: drop cascades to 2 other objects
DETAIL: drop cascades to table items
drop cascades to table other_items

View File

@ -300,5 +300,50 @@ COMMIT;
SET client_min_messages TO DEFAULT;
CREATE TABLE items (key text primary key, value text not null, t timestamp);
SELECT create_distributed_table('items','key');
INSERT INTO items VALUES ('key-1','value-2', '2020-01-01 00:00');
INSERT INTO items VALUES ('key-2','value-1', '2020-02-02 00:00');
CREATE TABLE other_items (key text primary key, value text not null);
SELECT create_distributed_table('other_items','key');
INSERT INTO other_items VALUES ('key-1','value-2');
-- LEFT JOINs are wrapped into a subquery under the covers, which causes GROUP BY
-- to be separated from the LEFT JOIN. If the GROUP BY is on a primary key we can
-- normally use any column even ones that are not in the GROUP BY, but not when
-- it is in the outer query. In that case, we use the any_value aggregate.
SELECT key, a.value, count(b.value), t
FROM items a LEFT JOIN other_items b USING (key)
GROUP BY key HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
SELECT key, a.value, count(b.value), t
FROM items a LEFT JOIN other_items b USING (key)
GROUP BY key, t HAVING a.value != 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
-- make sure the same logic works for regular joins
SELECT key, a.value, count(b.value), t
FROM items a JOIN other_items b USING (key)
GROUP BY key HAVING a.value = 'value-2' ORDER BY count(b.value), a.value LIMIT 5;
-- subqueries also trigger wrapping
SELECT key, a.value, count(b.value), t
FROM items a JOIN (SELECT key, value, random() FROM other_items) b USING (key)
GROUP BY key ORDER BY 3, 2, 1;
-- pushdownable window functions also trigger wrapping
SELECT a.key, a.value, count(a.value) OVER (PARTITION BY a.key)
FROM items a JOIN other_items b ON (a.key = b.key)
GROUP BY a.key ORDER BY 3, 2, 1;
-- left join with non-pushdownable window functions
SELECT a.key, a.value, count(a.value) OVER ()
FROM items a LEFT JOIN other_items b ON (a.key = b.key)
GROUP BY a.key ORDER BY 3, 2, 1;
-- function joins (actually with read_intermediate_results) also trigger wrapping
SELECT key, a.value, sum(b)
FROM items a JOIN generate_series(1,10) b ON (a.key = 'key-'||b)
GROUP BY key ORDER BY 3, 2, 1;
DROP SCHEMA subquery_complex CASCADE;
SET search_path TO public;