Merge pull request #2018 from citusdata/distinct_without_groupby_column

Add distinct plan after aggregation plan on master planner
pull/2028/head
Burak Velioglu 2018-02-26 15:51:19 +03:00 committed by GitHub
commit ee67ce892f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1094 additions and 13 deletions

View File

@ -37,6 +37,8 @@ static PlannedStmt * BuildSelectStatement(Query *masterQuery, List *masterTarget
static Agg * BuildAggregatePlan(Query *masterQuery, Plan *subPlan);
static bool HasDistinctAggregate(Query *masterQuery);
static Plan * BuildDistinctPlan(Query *masterQuery, Plan *subPlan);
static List * PrepareTargetListForNextPlan(List *targetList);
static bool IsGroupBySubsetOfDistinct(Query *masterQuery);
/*
@ -397,28 +399,29 @@ BuildDistinctPlan(Query *masterQuery, Plan *subPlan)
bool distinctClausesHashable = true;
List *distinctClauseList = masterQuery->distinctClause;
List *targetList = copyObject(masterQuery->targetList);
List *columnList = pull_var_clause_default((Node *) targetList);
ListCell *columnCell = NULL;
bool hasDistinctAggregate = false;
if (IsA(subPlan, Agg))
/*
* We don't need to add distinct plan if all of the columns used in group by
* clause also used in distinct clause, since group by clause guarantees the
* uniqueness of the target list for every row.
*/
if (IsGroupBySubsetOfDistinct(masterQuery))
{
return subPlan;
}
/*
* We need to adjust varno to OUTER_VAR, since planner expects that for upper
* level plans above the sequential scan. We also need to convert aggregations
* (if exists) to regular Vars since the aggregation would be applied by the
* previous aggregation plan and we don't want them to be applied again.
*/
targetList = PrepareTargetListForNextPlan(targetList);
Assert(masterQuery->distinctClause);
Assert(!masterQuery->hasDistinctOn);
/*
* For upper level plans above the sequential scan, the planner expects the
* table id (varno) to be set to OUTER_VAR.
*/
foreach(columnCell, columnList)
{
Var *column = (Var *) lfirst(columnCell);
column->varno = OUTER_VAR;
}
/*
* Create group by plan with HashAggregate if all distinct
* members are hashable, and not containing distinct aggregate.
@ -451,3 +454,84 @@ BuildDistinctPlan(Query *masterQuery, Plan *subPlan)
return distinctPlan;
}
/*
* PrepareTargetListForNextPlan handles both regular columns to have right varno
* and convert aggregates to regular Vars in the target list.
*/
static List *
PrepareTargetListForNextPlan(List *targetList)
{
List *newtargetList = NIL;
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, targetList)
{
TargetEntry *targetEntry = lfirst(targetEntryCell);
TargetEntry *newTargetEntry = NULL;
Var *newVar = NULL;
Assert(IsA(targetEntry, TargetEntry));
/*
* For upper level plans above the sequential scan, the planner expects the
* table id (varno) to be set to OUTER_VAR.
*/
newVar = makeVarFromTargetEntry(OUTER_VAR, targetEntry);
newTargetEntry = flatCopyTargetEntry(targetEntry);
newTargetEntry->expr = (Expr *) newVar;
newtargetList = lappend(newtargetList, newTargetEntry);
}
return newtargetList;
}
/*
* IsGroupBySubsetOfDistinct checks whether each clause in group clauses also
* exists in the distinct clauses. Note that, empty group clause is not a subset
* of distinct clause.
*/
static bool
IsGroupBySubsetOfDistinct(Query *masterQuery)
{
List *distinctClauses = masterQuery->distinctClause;
List *groupClauses = masterQuery->groupClause;
ListCell *distinctCell = NULL;
ListCell *groupCell = NULL;
/* There must be a group clause */
if (list_length(groupClauses) == 0)
{
return false;
}
foreach(groupCell, groupClauses)
{
SortGroupClause *groupClause = (SortGroupClause *) lfirst(groupCell);
bool isFound = false;
foreach(distinctCell, distinctClauses)
{
SortGroupClause *distinctClause = (SortGroupClause *) lfirst(distinctCell);
if (groupClause->tleSortGroupRef == distinctClause->tleSortGroupRef)
{
isFound = true;
break;
}
}
/*
* If we can't find any member of group clause in the distinct clause,
* that means group clause is not a subset of distinct clause.
*/
if (!isFound)
{
return false;
}
}
return true;
}

View File

@ -245,6 +245,739 @@ EXPLAIN (COSTS FALSE)
Filter: (l_orderkey < 200)
(17 rows)
SET enable_hashagg TO on;
-- distinct on aggregate of group by columns, we try to check whether we handle
-- queries which does not have any group by column in distinct columns properly.
SELECT DISTINCT count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1;
count
-------
1
2
3
4
(4 rows)
-- explain the query to see actual plan. We expect to see Aggregate node having
-- group by key on count(*) column, since columns in the Group By doesn't guarantee
-- the uniqueness of the result.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1;
QUERY PLAN
----------------------------------------------------------------------------------------------
Sort
Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))
-> HashAggregate
Group Key: COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)
-> HashAggregate
Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(14 rows)
-- check the plan if the hash aggreate is disabled. We expect to see sort+unique
-- instead of aggregate plan node to handle distinct.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort
Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint)
-> Unique
-> Sort
Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint)
-> GroupAggregate
Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3
-> Sort
Sort Key: remote_scan.worker_column_2, remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(17 rows)
SET enable_hashagg TO on;
-- Now we have only part of group clause columns in distinct, yet it is still not
-- enough to use Group By columns to guarantee uniqueness of result list.
SELECT DISTINCT l_suppkey, count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
l_suppkey | count
-----------+-------
1 | 1
2 | 1
3 | 1
4 | 1
5 | 1
7 | 1
10 | 1
12 | 1
13 | 1
14 | 1
(10 rows)
-- explain the query to see actual plan. Similar to the explain of the query above.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: remote_scan.l_suppkey
-> HashAggregate
Group Key: remote_scan.l_suppkey, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)
-> HashAggregate
Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: l_suppkey, l_linenumber
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(18 rows)
-- check the plan if the hash aggreate is disabled. Similar to the explain of
-- the query above.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: remote_scan.l_suppkey
-> Unique
-> Sort
Sort Key: remote_scan.l_suppkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint)
-> GroupAggregate
Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3
-> Sort
Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: l_suppkey, l_linenumber
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(21 rows)
SET enable_hashagg TO on;
-- Similar to the above query, not with count but avg. Only difference with the
-- above query is that, we create run two aggregate functions in workers.
SELECT DISTINCT l_suppkey, avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1,2
LIMIT 10;
l_suppkey | avg
-----------+------------------------
1 | 190000.000000000000
2 | 172450.000000000000
3 | 112469.000000000000
3 | 134976.000000000000
4 | 112470.000000000000
4 | 142461.000000000000
5 | 182450.000000000000
7 | 137493.000000000000
10 | 150009.000000000000
12 | 17510.0000000000000000
(10 rows)
-- explain the query to see actual plan. Similar to the explain of the query above.
-- Only aggregate functions will be changed.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1,2
LIMIT 10;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))
-> HashAggregate
Group Key: remote_scan.l_suppkey, (pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))
-> HashAggregate
Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(15 rows)
-- check the plan if the hash aggreate is disabled. This explain errors out due
-- to a bug right now, expectation must be corrected after fixing it.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1,2
LIMIT 10;
ERROR: bogus varattno for OUTER_VAR var: 4
SET enable_hashagg TO on;
-- Similar to the above query but with distinct on
SELECT DISTINCT ON (l_suppkey) avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY l_suppkey,1
LIMIT 10;
avg
------------------------
190000.000000000000
172450.000000000000
112469.000000000000
112470.000000000000
182450.000000000000
137493.000000000000
150009.000000000000
17510.0000000000000000
87504.000000000000
77506.000000000000
(10 rows)
-- explain the query to see actual plan. We expect to see sort+unique to handle
-- distinct on.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ON (l_suppkey) avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY l_suppkey,1
LIMIT 10;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Unique
-> Sort
Sort Key: remote_scan.worker_column_4, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.worker_column_3))
-> HashAggregate
Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(14 rows)
-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to
-- handle distinct on.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ON (l_suppkey) avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY l_suppkey,1
LIMIT 10;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Unique
-> Sort
Sort Key: remote_scan.worker_column_4, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.worker_column_3))
-> GroupAggregate
Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4
-> Sort
Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(16 rows)
SET enable_hashagg TO on;
-- distinct with expression and aggregation
SELECT DISTINCT avg(ceil(l_partkey / 2))
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
avg
-----
9
39
74
87
89
91
97
102
111
122
(10 rows)
-- explain the query to see actual plan
EXPLAIN (COSTS FALSE)
SELECT DISTINCT avg(ceil(l_partkey / 2))
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
QUERY PLAN
---------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision))
-> HashAggregate
Group Key: (sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)
-> HashAggregate
Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(15 rows)
-- check the plan if the hash aggreate is disabled. This explain errors out due
-- to a bug right now, expectation must be corrected after fixing it.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT avg(ceil(l_partkey / 2))
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
ERROR: bogus varattno for OUTER_VAR var: 4
SET enable_hashagg TO on;
-- expression among aggregations.
SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
dis
-----
2
3
4
5
6
8
11
13
14
15
(10 rows)
-- explain the query to see actual plan
EXPLAIN (COSTS FALSE)
SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint)))
-> HashAggregate
Group Key: ((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))
-> HashAggregate
Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey, l_linenumber
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(15 rows)
-- check the plan if the hash aggreate is disabled. This explain errors out due
-- to a bug right now, expectation must be corrected after fixing it.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
ERROR: bogus varattno for OUTER_VAR var: 4
SET enable_hashagg TO on;
-- distinct on all columns, note Group By columns guarantees uniqueness of the
-- result list.
SELECT DISTINCT *
FROM lineitem_hash_part
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
ORDER BY 1,2
LIMIT 10;
l_orderkey | l_partkey | l_suppkey | l_linenumber | l_quantity | l_extendedprice | l_discount | l_tax | l_returnflag | l_linestatus | l_shipdate | l_commitdate | l_receiptdate | l_shipinstruct | l_shipmode | l_comment
------------+-----------+-----------+--------------+------------+-----------------+------------+-------+--------------+--------------+------------+--------------+---------------+---------------------------+------------+--------------------------------------------
1 | 2132 | 4633 | 4 | 28.00 | 28955.64 | 0.09 | 0.06 | N | O | 04-21-1996 | 03-30-1996 | 05-16-1996 | NONE | AIR | lites. fluffily even de
1 | 15635 | 638 | 6 | 32.00 | 49620.16 | 0.07 | 0.02 | N | O | 01-30-1996 | 02-07-1996 | 02-03-1996 | DELIVER IN PERSON | MAIL | arefully slyly ex
1 | 24027 | 1534 | 5 | 24.00 | 22824.48 | 0.10 | 0.04 | N | O | 03-30-1996 | 03-14-1996 | 04-01-1996 | NONE | FOB | pending foxes. slyly re
1 | 63700 | 3701 | 3 | 8.00 | 13309.60 | 0.10 | 0.02 | N | O | 01-29-1996 | 03-05-1996 | 01-31-1996 | TAKE BACK RETURN | REG AIR | riously. regular, express dep
1 | 67310 | 7311 | 2 | 36.00 | 45983.16 | 0.09 | 0.06 | N | O | 04-12-1996 | 02-28-1996 | 04-20-1996 | TAKE BACK RETURN | MAIL | ly final dependencies: slyly bold
1 | 155190 | 7706 | 1 | 17.00 | 21168.23 | 0.04 | 0.02 | N | O | 03-13-1996 | 02-12-1996 | 03-22-1996 | DELIVER IN PERSON | TRUCK | egular courts above the
2 | 106170 | 1191 | 1 | 38.00 | 44694.46 | 0.00 | 0.05 | N | O | 01-28-1997 | 01-14-1997 | 02-02-1997 | TAKE BACK RETURN | RAIL | ven requests. deposits breach a
3 | 4297 | 1798 | 1 | 45.00 | 54058.05 | 0.06 | 0.00 | R | F | 02-02-1994 | 01-04-1994 | 02-23-1994 | NONE | AIR | ongside of the furiously brave acco
3 | 19036 | 6540 | 2 | 49.00 | 46796.47 | 0.10 | 0.00 | R | F | 11-09-1993 | 12-20-1993 | 11-24-1993 | TAKE BACK RETURN | RAIL | unusual accounts. eve
3 | 29380 | 1883 | 4 | 2.00 | 2618.76 | 0.01 | 0.06 | A | F | 12-04-1993 | 01-07-1994 | 01-01-1994 | NONE | TRUCK | y. fluffily pending d
(10 rows)
-- explain the query to see actual plan. We expect to see only one aggregation
-- node since group by columns guarantees the uniqueness.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT *
FROM lineitem_hash_part
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
ORDER BY 1,2
LIMIT 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey
-> HashAggregate
Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Unique
-> Group
Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment
-> Sort
Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(17 rows)
-- check the plan if the hash aggreate is disabled. We expect to see only one
-- aggregation node since group by columns guarantees the uniqueness.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT *
FROM lineitem_hash_part
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
ORDER BY 1,2
LIMIT 10;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey
-> GroupAggregate
Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment
-> Sort
Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Unique
-> Group
Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment
-> Sort
Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(19 rows)
SET enable_hashagg TO on;
-- distinct on count distinct
SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 1,2;
count | count
-------+-------
1 | 1
2 | 1
2 | 2
3 | 1
3 | 2
3 | 3
4 | 1
4 | 2
4 | 3
4 | 4
5 | 2
5 | 3
5 | 4
5 | 5
6 | 2
6 | 3
6 | 4
6 | 5
6 | 6
7 | 2
7 | 3
7 | 4
7 | 5
7 | 6
7 | 7
(25 rows)
-- explain the query to see actual plan. We expect to see aggregation plan for
-- the outer distinct.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 1,2;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort
Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)), (COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))
-> HashAggregate
Group Key: COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint), COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint)
-> HashAggregate
Group Key: remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> GroupAggregate
Group Key: l_orderkey
-> Sort
Sort Key: l_orderkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(16 rows)
-- check the plan if the hash aggreate is disabled. We expect to see sort + unique
-- plans for the outer distinct.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 1,2;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort
Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint)
-> Unique
-> Sort
Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint)
-> GroupAggregate
Group Key: remote_scan.worker_column_3
-> Sort
Sort Key: remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> GroupAggregate
Group Key: l_orderkey
-> Sort
Sort Key: l_orderkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(19 rows)
SET enable_hashagg TO on;
-- distinct on aggregation with filter and expression
SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count
FROM lineitem_hash_part
GROUP BY l_suppkey
ORDER BY 1;
count
-------
0
1
2
3
4
(5 rows)
-- explain the query to see actual plan
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count
FROM lineitem_hash_part
GROUP BY l_suppkey
ORDER BY 1;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
Sort
Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision))
-> HashAggregate
Group Key: ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)
-> HashAggregate
Group Key: remote_scan.worker_column_2
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(14 rows)
-- check the plan if the hash aggreate is disabled
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count
FROM lineitem_hash_part
GROUP BY l_suppkey
ORDER BY 1;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort
Sort Key: ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision)
-> Unique
-> Sort
Sort Key: ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision)
-> GroupAggregate
Group Key: remote_scan.worker_column_2
-> Sort
Sort Key: remote_scan.worker_column_2
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> HashAggregate
Group Key: l_suppkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(17 rows)
SET enable_hashagg TO on;
-- explain the query to see actual plan with array_agg aggregation. Note that,
-- worker query created for this query is not correct. It will be fixed soon.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 2
LIMIT 15;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: (array_length(array_cat_agg(remote_scan.array_length), 1))
-> HashAggregate
Group Key: array_length(array_cat_agg(remote_scan.array_length), 1), array_cat_agg(remote_scan.array_agg)
-> HashAggregate
Group Key: remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: (array_agg(l_linenumber))
-> GroupAggregate
Group Key: l_orderkey
-> Sort
Sort Key: l_orderkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(20 rows)
-- check the plan if the hash aggreate is disabled. Note that,
-- worker query created for this query is not correct. It will be fixed soon.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 2
LIMIT 15;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Limit
-> Sort
Sort Key: array_length(array_cat_agg((array_length(array_cat_agg((array_length(array_cat_agg((array_length(array_cat_agg(remote_scan.array_length), 1))), 1))), 1))), 1)
-> Unique
-> Sort
Sort Key: array_length(array_cat_agg((array_length(array_cat_agg(remote_scan.array_length), 1))), 1), array_cat_agg((array_cat_agg(remote_scan.array_agg)))
-> GroupAggregate
Group Key: remote_scan.worker_column_3
-> Sort
Sort Key: remote_scan.worker_column_3
-> Custom Scan (Citus Real-Time)
Task Count: 4
Tasks Shown: One of 4
-> Task
Node: host=localhost port=57637 dbname=regression
-> Limit
-> Sort
Sort Key: (array_agg(l_linenumber))
-> GroupAggregate
Group Key: l_orderkey
-> Sort
Sort Key: l_orderkey
-> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part
(23 rows)
SET enable_hashagg TO on;
-- distinct on non-partition column with aggregate
-- this is the same as non-distinct version due to group by

View File

@ -78,6 +78,270 @@ EXPLAIN (COSTS FALSE)
SET enable_hashagg TO on;
-- distinct on aggregate of group by columns, we try to check whether we handle
-- queries which does not have any group by column in distinct columns properly.
SELECT DISTINCT count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1;
-- explain the query to see actual plan. We expect to see Aggregate node having
-- group by key on count(*) column, since columns in the Group By doesn't guarantee
-- the uniqueness of the result.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1;
-- check the plan if the hash aggreate is disabled. We expect to see sort+unique
-- instead of aggregate plan node to handle distinct.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1;
SET enable_hashagg TO on;
-- Now we have only part of group clause columns in distinct, yet it is still not
-- enough to use Group By columns to guarantee uniqueness of result list.
SELECT DISTINCT l_suppkey, count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
-- explain the query to see actual plan. Similar to the explain of the query above.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
-- check the plan if the hash aggreate is disabled. Similar to the explain of
-- the query above.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, count(*)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
SET enable_hashagg TO on;
-- Similar to the above query, not with count but avg. Only difference with the
-- above query is that, we create run two aggregate functions in workers.
SELECT DISTINCT l_suppkey, avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1,2
LIMIT 10;
-- explain the query to see actual plan. Similar to the explain of the query above.
-- Only aggregate functions will be changed.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1,2
LIMIT 10;
-- check the plan if the hash aggreate is disabled. This explain errors out due
-- to a bug right now, expectation must be corrected after fixing it.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT l_suppkey, avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1,2
LIMIT 10;
SET enable_hashagg TO on;
-- Similar to the above query but with distinct on
SELECT DISTINCT ON (l_suppkey) avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY l_suppkey,1
LIMIT 10;
-- explain the query to see actual plan. We expect to see sort+unique to handle
-- distinct on.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ON (l_suppkey) avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY l_suppkey,1
LIMIT 10;
-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to
-- handle distinct on.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ON (l_suppkey) avg(l_partkey)
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY l_suppkey,1
LIMIT 10;
SET enable_hashagg TO on;
-- distinct with expression and aggregation
SELECT DISTINCT avg(ceil(l_partkey / 2))
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
-- explain the query to see actual plan
EXPLAIN (COSTS FALSE)
SELECT DISTINCT avg(ceil(l_partkey / 2))
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
-- check the plan if the hash aggreate is disabled. This explain errors out due
-- to a bug right now, expectation must be corrected after fixing it.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT avg(ceil(l_partkey / 2))
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
SET enable_hashagg TO on;
-- expression among aggregations.
SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
-- explain the query to see actual plan
EXPLAIN (COSTS FALSE)
SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
-- check the plan if the hash aggreate is disabled. This explain errors out due
-- to a bug right now, expectation must be corrected after fixing it.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis
FROM lineitem_hash_part
GROUP BY l_suppkey, l_linenumber
ORDER BY 1
LIMIT 10;
SET enable_hashagg TO on;
-- distinct on all columns, note Group By columns guarantees uniqueness of the
-- result list.
SELECT DISTINCT *
FROM lineitem_hash_part
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
ORDER BY 1,2
LIMIT 10;
-- explain the query to see actual plan. We expect to see only one aggregation
-- node since group by columns guarantees the uniqueness.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT *
FROM lineitem_hash_part
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
ORDER BY 1,2
LIMIT 10;
-- check the plan if the hash aggreate is disabled. We expect to see only one
-- aggregation node since group by columns guarantees the uniqueness.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT *
FROM lineitem_hash_part
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
ORDER BY 1,2
LIMIT 10;
SET enable_hashagg TO on;
-- distinct on count distinct
SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 1,2;
-- explain the query to see actual plan. We expect to see aggregation plan for
-- the outer distinct.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 1,2;
-- check the plan if the hash aggreate is disabled. We expect to see sort + unique
-- plans for the outer distinct.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 1,2;
SET enable_hashagg TO on;
-- distinct on aggregation with filter and expression
SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count
FROM lineitem_hash_part
GROUP BY l_suppkey
ORDER BY 1;
-- explain the query to see actual plan
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count
FROM lineitem_hash_part
GROUP BY l_suppkey
ORDER BY 1;
-- check the plan if the hash aggreate is disabled
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count
FROM lineitem_hash_part
GROUP BY l_suppkey
ORDER BY 1;
SET enable_hashagg TO on;
-- explain the query to see actual plan with array_agg aggregation. Note that,
-- worker query created for this query is not correct. It will be fixed soon.
EXPLAIN (COSTS FALSE)
SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 2
LIMIT 15;
-- check the plan if the hash aggreate is disabled. Note that,
-- worker query created for this query is not correct. It will be fixed soon.
SET enable_hashagg TO off;
EXPLAIN (COSTS FALSE)
SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1)
FROM lineitem_hash_part
GROUP BY l_orderkey
ORDER BY 2
LIMIT 15;
SET enable_hashagg TO on;
-- distinct on non-partition column with aggregate
-- this is the same as non-distinct version due to group by
SELECT DISTINCT l_partkey, count(*)