Fix incorrect limit pushdown when distinct clause is not superset of group by (#2035)

Pushing down limit and order by into workers may produce
wrong output when distinct on() clause has expressions,
aggregates, or window functions.

This checking allows pushing down of limits only if
distinct clause is a superset of group by clause. i.e. it contains all clauses in group by.
pull/2043/merge
Murat Tuncer 2018-03-07 13:24:56 +03:00 committed by GitHub
parent 27d159d6f6
commit 1440caeef2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 460 additions and 100 deletions

View File

@ -1860,7 +1860,9 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
palloc0(sizeof(WorkerAggregateWalkerContext));
Index nextSortGroupRefIndex = 0;
bool queryHasAggregates = false;
bool enableLimitPushdown = true;
bool distinctClauseSupersetofGroupClause = false;
bool distinctPreventsLimitPushdown = false;
bool createdNewGroupByClause = false;
bool hasNonPartitionColumnDistinctAgg = false;
bool repartitionSubquery = false;
@ -1949,11 +1951,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
groupClauseList = lappend(groupClauseList, groupByClause);
nextSortGroupRefIndex++;
/*
* If we introduce new columns accompanied by a new group by clause,
* than pushing down limits will cause incorrect results.
*/
enableLimitPushdown = false;
createdNewGroupByClause = true;
}
if (newTargetEntry->resname == NULL)
@ -2013,11 +2011,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
groupClauseList = lappend(groupClauseList, groupByClause);
nextSortGroupRefIndex++;
/*
* If we introduce new columns accompanied by a new group by clause,
* than pushing down limits will cause incorrect results.
*/
enableLimitPushdown = false;
createdNewGroupByClause = true;
}
newTargetEntryList = lappend(newTargetEntryList, newTargetEntry);
@ -2030,16 +2024,52 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
workerExtendedOpNode->hasDistinctOn = false;
workerExtendedOpNode->hasWindowFuncs = originalOpNode->hasWindowFuncs;
workerExtendedOpNode->windowClause = originalOpNode->windowClause;
workerExtendedOpNode->groupClauseList = groupClauseList;
if (!queryHasAggregates)
if (originalOpNode->distinctClause)
{
bool shouldPushdownDistinct = false;
if (groupClauseList == NIL ||
IsGroupBySubsetOfDistinct(groupClauseList,
originalOpNode->distinctClause))
{
distinctClauseSupersetofGroupClause = true;
}
else
{
distinctClauseSupersetofGroupClause = false;
/*
* GROUP BY being a subset of DISTINCT guarantees the
* distinctness on the workers. Otherwise, pushing down
* LIMIT might cause missing the necessary data from
* the worker query
*/
distinctPreventsLimitPushdown = true;
}
/*
* Distinct is pushed down to worker query only if the query does not
* contain an aggregate in which master processing might be required to
* complete the final result before distinct operation. We also prevent
* distinct pushdown if distinct clause is missing some entries that
* group by clause has.
*/
shouldPushdownDistinct = !queryHasAggregates &&
distinctClauseSupersetofGroupClause;
if (shouldPushdownDistinct)
{
workerExtendedOpNode->distinctClause = originalOpNode->distinctClause;
workerExtendedOpNode->hasDistinctOn = originalOpNode->hasDistinctOn;
}
}
workerExtendedOpNode->groupClauseList = groupClauseList;
if (enableLimitPushdown)
/*
* Order by and limit clauses are pushed down only if
* (1) We do not create a new group by clause during aggregate mutation, and
* (2) There distinct clause does not prevent limit pushdown
*/
if (!createdNewGroupByClause && !distinctPreventsLimitPushdown)
{
List *newTargetEntryListForSortClauses = NIL;
@ -3858,3 +3888,50 @@ HasOrderByHllType(List *sortClauseList, List *targetList)
return hasOrderByHllType;
}
/*
* IsGroupBySubsetOfDistinct checks whether each clause in group clauses also
* exists in the distinct clauses. Note that, empty group clause is not a subset
* of distinct clause.
*/
bool
IsGroupBySubsetOfDistinct(List *groupClause, List *distinctClause)
{
ListCell *distinctCell = NULL;
ListCell *groupCell = NULL;
/* There must be a group clause */
if (list_length(groupClause) == 0)
{
return false;
}
foreach(groupCell, groupClause)
{
SortGroupClause *groupClause = (SortGroupClause *) lfirst(groupCell);
bool isFound = false;
foreach(distinctCell, distinctClause)
{
SortGroupClause *distinctClause = (SortGroupClause *) lfirst(distinctCell);
if (groupClause->tleSortGroupRef == distinctClause->tleSortGroupRef)
{
isFound = true;
break;
}
}
/*
* If we can't find any member of group clause in the distinct clause,
* that means group clause is not a subset of distinct clause.
*/
if (!isFound)
{
return false;
}
}
return true;
}

View File

@ -13,6 +13,7 @@
#include "postgres.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_master_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/distributed_planner.h"
@ -38,7 +39,6 @@ static Agg * BuildAggregatePlan(Query *masterQuery, Plan *subPlan);
static bool HasDistinctAggregate(Query *masterQuery);
static Plan * BuildDistinctPlan(Query *masterQuery, Plan *subPlan);
static List * PrepareTargetListForNextPlan(List *targetList);
static bool IsGroupBySubsetOfDistinct(Query *masterQuery);
/*
@ -414,7 +414,7 @@ BuildDistinctPlan(Query *masterQuery, Plan *subPlan)
* clause also used in distinct clause, since group by clause guarantees the
* uniqueness of the target list for every row.
*/
if (IsGroupBySubsetOfDistinct(masterQuery))
if (IsGroupBySubsetOfDistinct(masterQuery->groupClause, masterQuery->distinctClause))
{
return subPlan;
}
@ -494,52 +494,3 @@ PrepareTargetListForNextPlan(List *targetList)
return newtargetList;
}
/*
* IsGroupBySubsetOfDistinct checks whether each clause in group clauses also
* exists in the distinct clauses. Note that, empty group clause is not a subset
* of distinct clause.
*/
static bool
IsGroupBySubsetOfDistinct(Query *masterQuery)
{
List *distinctClauses = masterQuery->distinctClause;
List *groupClauses = masterQuery->groupClause;
ListCell *distinctCell = NULL;
ListCell *groupCell = NULL;
/* There must be a group clause */
if (list_length(groupClauses) == 0)
{
return false;
}
foreach(groupCell, groupClauses)
{
SortGroupClause *groupClause = (SortGroupClause *) lfirst(groupCell);
bool isFound = false;
foreach(distinctCell, distinctClauses)
{
SortGroupClause *distinctClause = (SortGroupClause *) lfirst(distinctCell);
if (groupClause->tleSortGroupRef == distinctClause->tleSortGroupRef)
{
isFound = true;
break;
}
}
/*
* If we can't find any member of group clause in the distinct clause,
* that means group clause is not a subset of distinct clause.
*/
if (!isFound)
{
return false;
}
}
return true;
}

View File

@ -138,5 +138,6 @@ extern bool IsPartitionColumn(Expr *columnExpression, Query *query);
extern void FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList,
Query *query, Oid *relationId, Var **column);
extern bool IsGroupBySubsetOfDistinct(List *groupClause, List *distinctClause);
#endif /* MULTI_LOGICAL_OPTIMIZER_H */

View File

@ -297,5 +297,227 @@ SELECT
O | TRUCK | 11-17-1998
(5 rows)
-- Push down limit even if there is distinct on
SELECT
DISTINCT ON (l_orderkey, l_linenumber) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_orderkey, l_linenumber
LIMIT 5;
DEBUG: push down of limit count: 5
l_orderkey | l_linenumber
------------+--------------
1 | 1
1 | 2
1 | 3
1 | 4
1 | 5
(5 rows)
-- Don't push down limit when group by clause not included in distinct on
SELECT
DISTINCT ON (l_linenumber) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_linenumber, l_orderkey
LIMIT 5;
l_orderkey | l_linenumber
------------+--------------
1 | 1
1 | 2
1 | 3
1 | 4
1 | 5
(5 rows)
-- Push down limit when there is const in distinct on
-- referring to a column such that group by clause
-- list is contained in distinct on
SELECT
DISTINCT ON (l_linenumber, 1) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_linenumber, l_orderkey
LIMIT 5;
DEBUG: push down of limit count: 5
l_orderkey | l_linenumber
------------+--------------
1 | 1
2 | 1
3 | 1
4 | 1
5 | 1
(5 rows)
-- Don't push down limit when there is const expression in distinct on
-- even if there is a group by on the expression
-- This is due to fact that postgres removes (1+1) from distinct on
-- clause but keeps it in group by list.
SELECT
DISTINCT ON (l_linenumber, 1+1, l_linenumber) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, (1+1), l_linenumber
ORDER BY l_linenumber, (1+1), l_orderkey
LIMIT 5;
l_orderkey | l_linenumber
------------+--------------
1 | 1
1 | 2
1 | 3
1 | 4
1 | 5
(5 rows)
-- Don't push down limit when there is const reference
-- does not point to a column to make distinct clause superset
-- of group by
SELECT
DISTINCT ON (l_linenumber, 2) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_linenumber, l_orderkey
LIMIT 5;
l_orderkey | l_linenumber
------------+--------------
1 | 1
1 | 2
1 | 3
1 | 4
1 | 5
(5 rows)
-- Push down limit even when there is a column expression
-- in distinct clause provided that distinct clause covers
-- group by expression, and there is no aggregates in the query.
SELECT
DISTINCT ON (l_orderkey + 1) l_orderkey + 1
FROM lineitem_hash
GROUP BY l_orderkey + 1
ORDER BY l_orderkey + 1
LIMIT 5;
DEBUG: push down of limit count: 5
?column?
----------
2
3
4
5
6
(5 rows)
-- Limit is not pushed down when there are aggregates in the query
-- This is because group by is not on distribution column itself
-- but on an expression on distribution column
SELECT
DISTINCT ON (l_orderkey + 1, count(*)) l_orderkey + 1, count(*)
FROM lineitem_hash
GROUP BY l_orderkey + 1
ORDER BY l_orderkey + 1 , 2
LIMIT 5;
?column? | count
----------+-------
2 | 6
3 | 1
4 | 6
5 | 1
6 | 3
(5 rows)
-- same query with column instead of column expression, limit is pushed down
-- because group by is on distribution column
SELECT
DISTINCT ON (l_orderkey, count(*)) l_orderkey, count(*)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY l_orderkey , 2
LIMIT 5;
DEBUG: push down of limit count: 5
l_orderkey | count
------------+-------
1 | 6
2 | 1
3 | 6
4 | 1
5 | 3
(5 rows)
-- limit is not pushed down because distinct clause
-- does not cover group by clause
SELECT
DISTINCT ON (count(*)) l_orderkey, count(*)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY 2 DESC, 1
LIMIT 2;
l_orderkey | count
------------+-------
7 | 7
1 | 6
(2 rows)
-- push down limit if there is a window function in distinct on
SELECT
DISTINCT ON (l_orderkey, RANK() OVER (partition by l_orderkey)) l_orderkey, RANK() OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY l_orderkey , 2
LIMIT 5;
DEBUG: push down of limit count: 5
l_orderkey | rank
------------+------
1 | 1
2 | 1
3 | 1
4 | 1
5 | 1
(5 rows)
-- do not push down limit if there is an aggragete in distinct on
-- we should be able to push this down, but query goes to subquery
-- planner and we can't safely determine it is grouped by partition
-- column.
SELECT
DISTINCT ON (l_orderkey, RANK() OVER (partition by l_orderkey)) l_orderkey, count(*), RANK() OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY l_orderkey , 3, 2
LIMIT 5;
l_orderkey | count | rank
------------+-------+------
1 | 6 | 1
2 | 1 | 1
3 | 6 | 1
4 | 1 | 1
5 | 3 | 1
(5 rows)
-- limit is not pushed down due to same reason
SELECT
DISTINCT ON (l_orderkey, count(*) OVER (partition by l_orderkey)) l_orderkey, l_linenumber, count(*), count(*) OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_orderkey , count(*) OVER (partition by l_orderkey), count(*), l_linenumber
LIMIT 5;
l_orderkey | l_linenumber | count | count
------------+--------------+-------+-------
1 | 1 | 1 | 6
2 | 1 | 1 | 1
3 | 1 | 1 | 6
4 | 1 | 1 | 1
5 | 1 | 1 | 3
(5 rows)
-- limit is not pushed down since distinct clause is not superset of group clause
SELECT
DISTINCT ON (RANK() OVER (partition by l_orderkey)) l_orderkey, RANK() OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY 2 DESC, 1
LIMIT 5;
l_orderkey | rank
------------+------
1 | 1
(1 row)
SET client_min_messages TO NOTICE;
DROP TABLE lineitem_hash;

View File

@ -3,6 +3,7 @@
--
-- Tests select distinct, and select distinct on features.
--
ANALYZE lineitem_hash_part;
-- function calls are supported
SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0;
l_orderkey | now
@ -360,13 +361,10 @@ EXPLAIN (COSTS FALSE)
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: l_suppkey, l_linenumber
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(18 rows)
(15 rows)
-- check the plan if the hash aggreate is disabled. Similar to the explain of
-- the query above.
@ -394,13 +392,10 @@ EXPLAIN (COSTS FALSE)
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: l_suppkey, l_linenumber
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(21 rows)
(18 rows)
SET enable_hashagg TO on;
-- Similar to the above query, not with count but avg. Only difference with the
@ -930,15 +925,12 @@ EXPLAIN (COSTS FALSE)
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: (array_length(array_agg(l_linenumber), 1))
-> GroupAggregate
Group Key: l_orderkey
-> Sort
Sort Key: l_orderkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(20 rows)
(17 rows)
-- check the plan if the hash aggreate is disabled.
SET enable_hashagg TO off;
@ -965,15 +957,12 @@ EXPLAIN (COSTS FALSE)
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: (array_length(array_agg(l_linenumber), 1))
-> GroupAggregate
Group Key: l_orderkey
-> Sort
Sort Key: l_orderkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(23 rows)
(20 rows)
SET enable_hashagg TO on;

View File

@ -102,6 +102,125 @@ SELECT
GROUP BY l_linestatus, l_shipmode
ORDER BY 3 DESC, 1, 2 LIMIT 5;
-- Push down limit even if there is distinct on
SELECT
DISTINCT ON (l_orderkey, l_linenumber) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_orderkey, l_linenumber
LIMIT 5;
-- Don't push down limit when group by clause not included in distinct on
SELECT
DISTINCT ON (l_linenumber) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_linenumber, l_orderkey
LIMIT 5;
-- Push down limit when there is const in distinct on
-- referring to a column such that group by clause
-- list is contained in distinct on
SELECT
DISTINCT ON (l_linenumber, 1) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_linenumber, l_orderkey
LIMIT 5;
-- Don't push down limit when there is const expression in distinct on
-- even if there is a group by on the expression
-- This is due to fact that postgres removes (1+1) from distinct on
-- clause but keeps it in group by list.
SELECT
DISTINCT ON (l_linenumber, 1+1, l_linenumber) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, (1+1), l_linenumber
ORDER BY l_linenumber, (1+1), l_orderkey
LIMIT 5;
-- Don't push down limit when there is const reference
-- does not point to a column to make distinct clause superset
-- of group by
SELECT
DISTINCT ON (l_linenumber, 2) l_orderkey, l_linenumber
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_linenumber, l_orderkey
LIMIT 5;
-- Push down limit even when there is a column expression
-- in distinct clause provided that distinct clause covers
-- group by expression, and there is no aggregates in the query.
SELECT
DISTINCT ON (l_orderkey + 1) l_orderkey + 1
FROM lineitem_hash
GROUP BY l_orderkey + 1
ORDER BY l_orderkey + 1
LIMIT 5;
-- Limit is not pushed down when there are aggregates in the query
-- This is because group by is not on distribution column itself
-- but on an expression on distribution column
SELECT
DISTINCT ON (l_orderkey + 1, count(*)) l_orderkey + 1, count(*)
FROM lineitem_hash
GROUP BY l_orderkey + 1
ORDER BY l_orderkey + 1 , 2
LIMIT 5;
-- same query with column instead of column expression, limit is pushed down
-- because group by is on distribution column
SELECT
DISTINCT ON (l_orderkey, count(*)) l_orderkey, count(*)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY l_orderkey , 2
LIMIT 5;
-- limit is not pushed down because distinct clause
-- does not cover group by clause
SELECT
DISTINCT ON (count(*)) l_orderkey, count(*)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY 2 DESC, 1
LIMIT 2;
-- push down limit if there is a window function in distinct on
SELECT
DISTINCT ON (l_orderkey, RANK() OVER (partition by l_orderkey)) l_orderkey, RANK() OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY l_orderkey , 2
LIMIT 5;
-- do not push down limit if there is an aggragete in distinct on
-- we should be able to push this down, but query goes to subquery
-- planner and we can't safely determine it is grouped by partition
-- column.
SELECT
DISTINCT ON (l_orderkey, RANK() OVER (partition by l_orderkey)) l_orderkey, count(*), RANK() OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY l_orderkey , 3, 2
LIMIT 5;
-- limit is not pushed down due to same reason
SELECT
DISTINCT ON (l_orderkey, count(*) OVER (partition by l_orderkey)) l_orderkey, l_linenumber, count(*), count(*) OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey, l_linenumber
ORDER BY l_orderkey , count(*) OVER (partition by l_orderkey), count(*), l_linenumber
LIMIT 5;
-- limit is not pushed down since distinct clause is not superset of group clause
SELECT
DISTINCT ON (RANK() OVER (partition by l_orderkey)) l_orderkey, RANK() OVER (partition by l_orderkey)
FROM lineitem_hash
GROUP BY l_orderkey
ORDER BY 2 DESC, 1
LIMIT 5;
SET client_min_messages TO NOTICE;
DROP TABLE lineitem_hash;

View File

@ -4,6 +4,7 @@
-- Tests select distinct, and select distinct on features.
--
ANALYZE lineitem_hash_part;
-- function calls are supported
SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0;