diff --git a/src/backend/distributed/planner/multi_master_planner.c b/src/backend/distributed/planner/multi_master_planner.c index 0d63a44ba..52961f9f7 100644 --- a/src/backend/distributed/planner/multi_master_planner.c +++ b/src/backend/distributed/planner/multi_master_planner.c @@ -37,6 +37,8 @@ static PlannedStmt * BuildSelectStatement(Query *masterQuery, List *masterTarget static Agg * BuildAggregatePlan(Query *masterQuery, Plan *subPlan); static bool HasDistinctAggregate(Query *masterQuery); static Plan * BuildDistinctPlan(Query *masterQuery, Plan *subPlan); +static List * PrepareTargetListForNextPlan(List *targetList); +static bool IsGroupBySubsetOfDistinct(Query *masterQuery); /* @@ -397,28 +399,29 @@ BuildDistinctPlan(Query *masterQuery, Plan *subPlan) bool distinctClausesHashable = true; List *distinctClauseList = masterQuery->distinctClause; List *targetList = copyObject(masterQuery->targetList); - List *columnList = pull_var_clause_default((Node *) targetList); - ListCell *columnCell = NULL; bool hasDistinctAggregate = false; - if (IsA(subPlan, Agg)) + /* + * We don't need to add distinct plan if all of the columns used in group by + * clause also used in distinct clause, since group by clause guarantees the + * uniqueness of the target list for every row. + */ + if (IsGroupBySubsetOfDistinct(masterQuery)) { return subPlan; } + /* + * We need to adjust varno to OUTER_VAR, since planner expects that for upper + * level plans above the sequential scan. We also need to convert aggregations + * (if exists) to regular Vars since the aggregation would be applied by the + * previous aggregation plan and we don't want them to be applied again. + */ + targetList = PrepareTargetListForNextPlan(targetList); + Assert(masterQuery->distinctClause); Assert(!masterQuery->hasDistinctOn); - /* - * For upper level plans above the sequential scan, the planner expects the - * table id (varno) to be set to OUTER_VAR. - */ - foreach(columnCell, columnList) - { - Var *column = (Var *) lfirst(columnCell); - column->varno = OUTER_VAR; - } - /* * Create group by plan with HashAggregate if all distinct * members are hashable, and not containing distinct aggregate. @@ -451,3 +454,84 @@ BuildDistinctPlan(Query *masterQuery, Plan *subPlan) return distinctPlan; } + + +/* + * PrepareTargetListForNextPlan handles both regular columns to have right varno + * and convert aggregates to regular Vars in the target list. + */ +static List * +PrepareTargetListForNextPlan(List *targetList) +{ + List *newtargetList = NIL; + ListCell *targetEntryCell = NULL; + + foreach(targetEntryCell, targetList) + { + TargetEntry *targetEntry = lfirst(targetEntryCell); + TargetEntry *newTargetEntry = NULL; + Var *newVar = NULL; + + Assert(IsA(targetEntry, TargetEntry)); + + /* + * For upper level plans above the sequential scan, the planner expects the + * table id (varno) to be set to OUTER_VAR. + */ + newVar = makeVarFromTargetEntry(OUTER_VAR, targetEntry); + newTargetEntry = flatCopyTargetEntry(targetEntry); + newTargetEntry->expr = (Expr *) newVar; + newtargetList = lappend(newtargetList, newTargetEntry); + } + + return newtargetList; +} + + +/* + * IsGroupBySubsetOfDistinct checks whether each clause in group clauses also + * exists in the distinct clauses. Note that, empty group clause is not a subset + * of distinct clause. + */ +static bool +IsGroupBySubsetOfDistinct(Query *masterQuery) +{ + List *distinctClauses = masterQuery->distinctClause; + List *groupClauses = masterQuery->groupClause; + ListCell *distinctCell = NULL; + ListCell *groupCell = NULL; + + /* There must be a group clause */ + if (list_length(groupClauses) == 0) + { + return false; + } + + foreach(groupCell, groupClauses) + { + SortGroupClause *groupClause = (SortGroupClause *) lfirst(groupCell); + bool isFound = false; + + foreach(distinctCell, distinctClauses) + { + SortGroupClause *distinctClause = (SortGroupClause *) lfirst(distinctCell); + + if (groupClause->tleSortGroupRef == distinctClause->tleSortGroupRef) + { + isFound = true; + break; + } + } + + /* + * If we can't find any member of group clause in the distinct clause, + * that means group clause is not a subset of distinct clause. + */ + if (!isFound) + { + return false; + } + } + + return true; +} diff --git a/src/test/regress/expected/multi_select_distinct.out b/src/test/regress/expected/multi_select_distinct.out index 51dda9108..2bd872043 100644 --- a/src/test/regress/expected/multi_select_distinct.out +++ b/src/test/regress/expected/multi_select_distinct.out @@ -245,6 +245,739 @@ EXPLAIN (COSTS FALSE) Filter: (l_orderkey < 200) (17 rows) +SET enable_hashagg TO on; +-- distinct on aggregate of group by columns, we try to check whether we handle +-- queries which does not have any group by column in distinct columns properly. +SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + count +------- + 1 + 2 + 3 + 4 +(4 rows) + + +-- explain the query to see actual plan. We expect to see Aggregate node having +-- group by key on count(*) column, since columns in the Group By doesn't guarantee +-- the uniqueness of the result. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + -> HashAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique +-- instead of aggregate plan node to handle distinct. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint) + -> Unique + -> Sort + Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) + -> GroupAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(17 rows) + +SET enable_hashagg TO on; +-- Now we have only part of group clause columns in distinct, yet it is still not +-- enough to use Group By columns to guarantee uniqueness of result list. +SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + l_suppkey | count +-----------+------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 7 | 1 + 10 | 1 + 12 | 1 + 13 | 1 + 14 | 1 +(10 rows) + + +-- explain the query to see actual plan. Similar to the explain of the query above. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_suppkey + -> HashAggregate + Group Key: remote_scan.l_suppkey, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Sort + Sort Key: l_suppkey, l_linenumber + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(18 rows) + +-- check the plan if the hash aggreate is disabled. Similar to the explain of +-- the query above. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_suppkey + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Sort + Sort Key: l_suppkey, l_linenumber + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(21 rows) + +SET enable_hashagg TO on; +-- Similar to the above query, not with count but avg. Only difference with the +-- above query is that, we create run two aggregate functions in workers. +SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + l_suppkey | avg +-----------+------------------------ + 1 | 190000.000000000000 + 2 | 172450.000000000000 + 3 | 112469.000000000000 + 3 | 134976.000000000000 + 4 | 112470.000000000000 + 4 | 142461.000000000000 + 5 | 182450.000000000000 + 7 | 137493.000000000000 + 10 | 150009.000000000000 + 12 | 17510.0000000000000000 +(10 rows) + + +-- explain the query to see actual plan. Similar to the explain of the query above. +-- Only aggregate functions will be changed. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.l_suppkey, (pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(15 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; +ERROR: bogus varattno for OUTER_VAR var: 4 +SET enable_hashagg TO on; +-- Similar to the above query but with distinct on +SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + avg +------------------------ + 190000.000000000000 + 172450.000000000000 + 112469.000000000000 + 112470.000000000000 + 182450.000000000000 + 137493.000000000000 + 150009.000000000000 + 17510.0000000000000000 + 87504.000000000000 + 77506.000000000000 +(10 rows) + + +-- explain the query to see actual plan. We expect to see sort+unique to handle +-- distinct on. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_4, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.worker_column_3)) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to +-- handle distinct on. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_4, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.worker_column_3)) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct with expression and aggregation +SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + avg +----- + 9 + 39 + 74 + 87 + 89 + 91 + 97 + 102 + 111 + 122 +(10 rows) + + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> HashAggregate + Group Key: (sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(15 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; +ERROR: bogus varattno for OUTER_VAR var: 4 + +SET enable_hashagg TO on; +-- expression among aggregations. +SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + dis +----- + 2 + 3 + 4 + 5 + 6 + 8 + 11 + 13 + 14 + 15 +(10 rows) + + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> HashAggregate + Group Key: ((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(15 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; +ERROR: bogus varattno for OUTER_VAR var: 4 + +SET enable_hashagg TO on; + +-- distinct on all columns, note Group By columns guarantees uniqueness of the +-- result list. +SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey | l_suppkey | l_linenumber | l_quantity | l_extendedprice | l_discount | l_tax | l_returnflag | l_linestatus | l_shipdate | l_commitdate | l_receiptdate | l_shipinstruct | l_shipmode | l_comment +------------+-----------+-----------+--------------+------------+-----------------+------------+-------+--------------+--------------+------------+--------------+---------------+---------------------------+------------+-------------------------------------------- + 1 | 2132 | 4633 | 4 | 28.00 | 28955.64 | 0.09 | 0.06 | N | O | 04-21-1996 | 03-30-1996 | 05-16-1996 | NONE | AIR | lites. fluffily even de + 1 | 15635 | 638 | 6 | 32.00 | 49620.16 | 0.07 | 0.02 | N | O | 01-30-1996 | 02-07-1996 | 02-03-1996 | DELIVER IN PERSON | MAIL | arefully slyly ex + 1 | 24027 | 1534 | 5 | 24.00 | 22824.48 | 0.10 | 0.04 | N | O | 03-30-1996 | 03-14-1996 | 04-01-1996 | NONE | FOB | pending foxes. slyly re + 1 | 63700 | 3701 | 3 | 8.00 | 13309.60 | 0.10 | 0.02 | N | O | 01-29-1996 | 03-05-1996 | 01-31-1996 | TAKE BACK RETURN | REG AIR | riously. regular, express dep + 1 | 67310 | 7311 | 2 | 36.00 | 45983.16 | 0.09 | 0.06 | N | O | 04-12-1996 | 02-28-1996 | 04-20-1996 | TAKE BACK RETURN | MAIL | ly final dependencies: slyly bold + 1 | 155190 | 7706 | 1 | 17.00 | 21168.23 | 0.04 | 0.02 | N | O | 03-13-1996 | 02-12-1996 | 03-22-1996 | DELIVER IN PERSON | TRUCK | egular courts above the + 2 | 106170 | 1191 | 1 | 38.00 | 44694.46 | 0.00 | 0.05 | N | O | 01-28-1997 | 01-14-1997 | 02-02-1997 | TAKE BACK RETURN | RAIL | ven requests. deposits breach a + 3 | 4297 | 1798 | 1 | 45.00 | 54058.05 | 0.06 | 0.00 | R | F | 02-02-1994 | 01-04-1994 | 02-23-1994 | NONE | AIR | ongside of the furiously brave acco + 3 | 19036 | 6540 | 2 | 49.00 | 46796.47 | 0.10 | 0.00 | R | F | 11-09-1993 | 12-20-1993 | 11-24-1993 | TAKE BACK RETURN | RAIL | unusual accounts. eve + 3 | 29380 | 1883 | 4 | 2.00 | 2618.76 | 0.01 | 0.06 | A | F | 12-04-1993 | 01-07-1994 | 01-01-1994 | NONE | TRUCK | y. fluffily pending d +(10 rows) + + +-- explain the query to see actual plan. We expect to see only one aggregation +-- node since group by columns guarantees the uniqueness. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Unique + -> Group + Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(17 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see only one +-- aggregation node since group by columns guarantees the uniqueness. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> GroupAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Unique + -> Group + Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(19 rows) + + +SET enable_hashagg TO on; +-- distinct on count distinct +SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + count | count +-------+------- + 1 | 1 + 2 | 1 + 2 | 2 + 3 | 1 + 3 | 2 + 3 | 3 + 4 | 1 + 4 | 2 + 4 | 3 + 4 | 4 + 5 | 2 + 5 | 3 + 5 | 4 + 5 | 5 + 6 | 2 + 6 | 3 + 6 | 4 + 6 | 5 + 6 | 6 + 7 | 2 + 7 | 3 + 7 | 4 + 7 | 5 + 7 | 6 + 7 | 7 +(25 rows) + + +-- explain the query to see actual plan. We expect to see aggregation plan for +-- the outer distinct. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)), (COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint), COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint) + -> HashAggregate + Group Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(16 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort + unique +-- plans for the outer distinct. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint) + -> Unique + -> Sort + Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint) + -> GroupAggregate + Group Key: remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(19 rows) + + +SET enable_hashagg TO on; +-- distinct on aggregation with filter and expression +SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + count +------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> HashAggregate + Group Key: ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision) + -> HashAggregate + Group Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision) + -> Unique + -> Sort + Sort Key: ceil(((COALESCE((pg_catalog.sum((ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision))))::bigint, '0'::bigint) / 2))::double precision) + -> GroupAggregate + Group Key: remote_scan.worker_column_2 + -> Sort + Sort Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(17 rows) + + +SET enable_hashagg TO on; + +-- explain the query to see actual plan with array_agg aggregation. Note that, +-- worker query created for this query is not correct. It will be fixed soon. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: (array_length(array_cat_agg(remote_scan.array_length), 1)) + -> HashAggregate + Group Key: array_length(array_cat_agg(remote_scan.array_length), 1), array_cat_agg(remote_scan.array_agg) + -> HashAggregate + Group Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Sort + Sort Key: (array_agg(l_linenumber)) + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(20 rows) + +-- check the plan if the hash aggreate is disabled. Note that, +-- worker query created for this query is not correct. It will be fixed soon. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: array_length(array_cat_agg((array_length(array_cat_agg((array_length(array_cat_agg((array_length(array_cat_agg(remote_scan.array_length), 1))), 1))), 1))), 1) + -> Unique + -> Sort + Sort Key: array_length(array_cat_agg((array_length(array_cat_agg(remote_scan.array_length), 1))), 1), array_cat_agg((array_cat_agg(remote_scan.array_agg))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Sort + Sort Key: (array_agg(l_linenumber)) + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360038 lineitem_hash_part +(23 rows) + + SET enable_hashagg TO on; -- distinct on non-partition column with aggregate -- this is the same as non-distinct version due to group by diff --git a/src/test/regress/sql/multi_select_distinct.sql b/src/test/regress/sql/multi_select_distinct.sql index 65b88c0ec..3bf66f3a0 100644 --- a/src/test/regress/sql/multi_select_distinct.sql +++ b/src/test/regress/sql/multi_select_distinct.sql @@ -78,6 +78,270 @@ EXPLAIN (COSTS FALSE) SET enable_hashagg TO on; +-- distinct on aggregate of group by columns, we try to check whether we handle +-- queries which does not have any group by column in distinct columns properly. +SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + +-- explain the query to see actual plan. We expect to see Aggregate node having +-- group by key on count(*) column, since columns in the Group By doesn't guarantee +-- the uniqueness of the result. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique +-- instead of aggregate plan node to handle distinct. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + +SET enable_hashagg TO on; + +-- Now we have only part of group clause columns in distinct, yet it is still not +-- enough to use Group By columns to guarantee uniqueness of result list. +SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +-- explain the query to see actual plan. Similar to the explain of the query above. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +-- check the plan if the hash aggreate is disabled. Similar to the explain of +-- the query above. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +SET enable_hashagg TO on; + +-- Similar to the above query, not with count but avg. Only difference with the +-- above query is that, we create run two aggregate functions in workers. +SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + +-- explain the query to see actual plan. Similar to the explain of the query above. +-- Only aggregate functions will be changed. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + +SET enable_hashagg TO on; + +-- Similar to the above query but with distinct on +SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + +-- explain the query to see actual plan. We expect to see sort+unique to handle +-- distinct on. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to +-- handle distinct on. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + +SET enable_hashagg TO on; + +-- distinct with expression and aggregation +SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +SET enable_hashagg TO on; + +-- expression among aggregations. +SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + +SET enable_hashagg TO on; + +-- distinct on all columns, note Group By columns guarantees uniqueness of the +-- result list. +SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + +-- explain the query to see actual plan. We expect to see only one aggregation +-- node since group by columns guarantees the uniqueness. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + +-- check the plan if the hash aggreate is disabled. We expect to see only one +-- aggregation node since group by columns guarantees the uniqueness. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + +SET enable_hashagg TO on; + +-- distinct on count distinct +SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + +-- explain the query to see actual plan. We expect to see aggregation plan for +-- the outer distinct. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + +-- check the plan if the hash aggreate is disabled. We expect to see sort + unique +-- plans for the outer distinct. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + +SET enable_hashagg TO on; + +-- distinct on aggregation with filter and expression +SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + +SET enable_hashagg TO on; + +-- explain the query to see actual plan with array_agg aggregation. Note that, +-- worker query created for this query is not correct. It will be fixed soon. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; + +-- check the plan if the hash aggreate is disabled. Note that, +-- worker query created for this query is not correct. It will be fixed soon. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; + +SET enable_hashagg TO on; + -- distinct on non-partition column with aggregate -- this is the same as non-distinct version due to group by SELECT DISTINCT l_partkey, count(*)