diff --git a/src/test/regress/expected/multi_select_distinct.out b/src/test/regress/expected/multi_select_distinct.out index 7f639727c..7e79858b3 100644 --- a/src/test/regress/expected/multi_select_distinct.out +++ b/src/test/regress/expected/multi_select_distinct.out @@ -3,6 +3,9 @@ -- -- Tests select distinct, and select distinct on features. -- +-- This test file has an alternative output because some HashAggregate +-- leverages in PG15 compared to PG13/PG14 +-- ANALYZE lineitem_hash_part; -- function calls are supported SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; @@ -764,13 +767,14 @@ EXPLAIN (COSTS FALSE) -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit - -> Unique - -> Group + -> Sort + Sort Key: l_orderkey, l_partkey + -> HashAggregate Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment - -> Sort - Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> HashAggregate + Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(17 rows) +(18 rows) -- check the plan if the hash aggreate is disabled. We expect to see only one -- aggregation node since group by columns guarantees the uniqueness. @@ -793,13 +797,14 @@ EXPLAIN (COSTS FALSE) -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit - -> Unique - -> Group + -> Sort + Sort Key: l_orderkey, l_partkey + -> HashAggregate Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment - -> Sort - Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> HashAggregate + Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(16 rows) +(17 rows) SET enable_hashagg TO on; -- distinct on count distinct @@ -972,12 +977,10 @@ EXPLAIN (COSTS FALSE) Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression - -> GroupAggregate + -> HashAggregate Group Key: l_orderkey - -> Sort - Sort Key: l_orderkey - -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(15 rows) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) -- check the plan if the hash aggreate is disabled. SET enable_hashagg TO off; @@ -998,12 +1001,10 @@ EXPLAIN (COSTS FALSE) Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression - -> GroupAggregate + -> HashAggregate Group Key: l_orderkey - -> Sort - Sort Key: l_orderkey - -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(12 rows) SET enable_hashagg TO on; -- distinct on non-partition column with aggregate diff --git a/src/test/regress/expected/multi_select_distinct_0.out b/src/test/regress/expected/multi_select_distinct_0.out new file mode 100644 index 000000000..eba4c55c5 --- /dev/null +++ b/src/test/regress/expected/multi_select_distinct_0.out @@ -0,0 +1,1586 @@ +-- +-- MULTI_SELECT_DISTINCT +-- +-- Tests select distinct, and select distinct on features. +-- +-- This test file has an alternative output because some HashAggregate +-- leverages in PG15 compared to PG13/PG14 +-- +ANALYZE lineitem_hash_part; +-- function calls are supported +SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; + l_orderkey | now +--------------------------------------------------------------------- +(0 rows) + +SELECT DISTINCT l_orderkey, avg(l_linenumber) +FROM lineitem_hash_part +GROUP BY l_orderkey +HAVING avg(l_linenumber) = (select avg(distinct l_linenumber)) +LIMIT 10; +ERROR: Subqueries in HAVING cannot refer to outer query +SELECT DISTINCT l_orderkey +FROM lineitem_hash_part +GROUP BY l_orderkey +HAVING (select avg(distinct l_linenumber) = l_orderkey) +LIMIT 10; +ERROR: Subqueries in HAVING cannot refer to outer query +SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3; + l_partkey | ?column? +--------------------------------------------------------------------- + 199973 | 1 + 199946 | 1 + 199943 | 1 +(3 rows) + +-- const expressions are supported +SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +-- non const expressions are also supported +SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 1 | 2133 + 1 | 15636 + 1 | 24028 + 1 | 63701 + 1 | 67311 +(5 rows) + +-- column expressions are supported +SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 32 | COLLECT CODAIR + 39 | COLLECT CODAIR + 66 | COLLECT CODAIR + 70 | COLLECT CODAIR + 98 | COLLECT CODAIR +(5 rows) + +-- function calls with const input are supported +SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5; + l_orderkey | strpos +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- function calls with non-const input are supported +SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I') + FROM lineitem_hash_part + WHERE strpos(l_shipmode, 'I') > 1 + ORDER BY 2, 1 + LIMIT 5; + l_orderkey | strpos +--------------------------------------------------------------------- + 1 | 2 + 3 | 2 + 5 | 2 + 32 | 2 + 33 | 2 +(5 rows) + +-- row types are supported +SELECT DISTINCT (l_orderkey, l_partkey) AS pair FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + pair +--------------------------------------------------------------------- + (1,2132) + (1,15635) + (1,24027) + (1,63700) + (1,67310) +(5 rows) + +-- distinct on partition column +-- verify counts match with respect to count(distinct) +CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part; +SELECT COUNT(*) FROM temp_orderkeys; + count +--------------------------------------------------------------------- + 2985 +(1 row) + +SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part; + count +--------------------------------------------------------------------- + 2985 +(1 row) + +SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1; + l_orderkey +--------------------------------------------------------------------- + 1 + 3 + 32 + 35 + 39 + 65 + 129 + 130 + 134 + 164 + 194 + 228 + 261 + 290 + 320 + 321 + 354 + 418 +(18 rows) + +-- distinct on non-partition column +SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1; + l_partkey +--------------------------------------------------------------------- + 79251 + 94780 + 139636 + 145243 + 151894 + 157238 + 163073 + 182052 +(8 rows) + +SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC; + l_shipmode +--------------------------------------------------------------------- + TRUCK + SHIP + REG AIR + RAIL + MAIL + FOB + AIR +(7 rows) + +-- distinct with multiple columns +SELECT DISTINCT l_orderkey, o_orderdate + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE l_orderkey < 10 + ORDER BY l_orderkey; + l_orderkey | o_orderdate +--------------------------------------------------------------------- + 1 | 01-02-1996 + 2 | 12-01-1996 + 3 | 10-14-1993 + 4 | 10-11-1995 + 5 | 07-30-1994 + 6 | 02-21-1992 + 7 | 01-10-1996 +(7 rows) + +-- distinct on partition column with aggregate +-- this is the same as the one without distinct due to group by +SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + l_orderkey | count +--------------------------------------------------------------------- + 7 | 7 + 68 | 7 + 129 | 7 + 164 | 7 + 194 | 7 + 1 | 6 + 3 | 6 + 32 | 6 + 35 | 6 + 39 | 6 + 67 | 6 + 69 | 6 + 70 | 6 + 71 | 6 + 134 | 6 + 135 | 6 + 163 | 6 + 192 | 6 + 197 | 6 +(19 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count DESC, remote_scan.l_orderkey + -> HashAggregate + Group Key: remote_scan.count, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 200) +(14 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.count DESC, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 200) +(13 rows) + +SET enable_hashagg TO on; +-- distinct on aggregate of group by columns, we try to check whether we handle +-- queries which does not have any group by column in distinct columns properly. +SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + count +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 +(4 rows) + +-- explain the query to see actual plan. We expect to see Aggregate node having +-- group by key on count(*) column, since columns in the Group By doesn't guarantee +-- the uniqueness of the result. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique +-- instead of aggregate plan node to handle distinct. +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> GroupAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +SET enable_hashagg TO on; +-- Now we have only part of group clause columns in distinct, yet it is still not +-- enough to use Group By columns to guarantee uniqueness of result list. +SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + l_suppkey | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 7 | 1 + 10 | 1 + 12 | 1 + 13 | 1 + 14 | 1 +(10 rows) + +-- explain the query to see actual plan. Similar to the explain of the query above. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. Similar to the explain of +-- the query above. +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- Similar to the above query, not with count but avg. Only difference with the +-- above query is that, we create run two aggregate functions in workers. +SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + l_suppkey | avg +--------------------------------------------------------------------- + 1 | 190000.000000000000 + 2 | 172450.000000000000 + 3 | 112469.000000000000 + 3 | 134976.000000000000 + 4 | 112470.000000000000 + 4 | 142461.000000000000 + 5 | 182450.000000000000 + 7 | 137493.000000000000 + 10 | 150009.000000000000 + 12 | 17510.0000000000000000 +(10 rows) + +-- explain the query to see actual plan. Similar to the explain of the query above. +-- Only aggregate functions will be changed. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- Similar to the above query but with distinct on +SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + avg +--------------------------------------------------------------------- + 190000.000000000000 + 172450.000000000000 + 112469.000000000000 + 112470.000000000000 + 182450.000000000000 + 137493.000000000000 + 150009.000000000000 + 17510.0000000000000000 + 87504.000000000000 + 77506.000000000000 +(10 rows) + +-- explain the query to see actual plan. We expect to see sort+unique to handle +-- distinct on. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_3, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to +-- handle distinct on. +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_3, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct with expression and aggregation +SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + avg +--------------------------------------------------------------------- + 9 + 39 + 74 + 87 + 89 + 91 + 97 + 102 + 111 + 122 +(10 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- expression among aggregations. +SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + dis +--------------------------------------------------------------------- + 2 + 3 + 4 + 5 + 6 + 8 + 11 + 13 + 14 + 15 +(10 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct on all columns, note Group By columns guarantees uniqueness of the +-- result list. +SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey | l_suppkey | l_linenumber | l_quantity | l_extendedprice | l_discount | l_tax | l_returnflag | l_linestatus | l_shipdate | l_commitdate | l_receiptdate | l_shipinstruct | l_shipmode | l_comment +--------------------------------------------------------------------- + 1 | 2132 | 4633 | 4 | 28.00 | 28955.64 | 0.09 | 0.06 | N | O | 04-21-1996 | 03-30-1996 | 05-16-1996 | NONE | AIR | lites. fluffily even de + 1 | 15635 | 638 | 6 | 32.00 | 49620.16 | 0.07 | 0.02 | N | O | 01-30-1996 | 02-07-1996 | 02-03-1996 | DELIVER IN PERSON | MAIL | arefully slyly ex + 1 | 24027 | 1534 | 5 | 24.00 | 22824.48 | 0.10 | 0.04 | N | O | 03-30-1996 | 03-14-1996 | 04-01-1996 | NONE | FOB | pending foxes. slyly re + 1 | 63700 | 3701 | 3 | 8.00 | 13309.60 | 0.10 | 0.02 | N | O | 01-29-1996 | 03-05-1996 | 01-31-1996 | TAKE BACK RETURN | REG AIR | riously. regular, express dep + 1 | 67310 | 7311 | 2 | 36.00 | 45983.16 | 0.09 | 0.06 | N | O | 04-12-1996 | 02-28-1996 | 04-20-1996 | TAKE BACK RETURN | MAIL | ly final dependencies: slyly bold + 1 | 155190 | 7706 | 1 | 17.00 | 21168.23 | 0.04 | 0.02 | N | O | 03-13-1996 | 02-12-1996 | 03-22-1996 | DELIVER IN PERSON | TRUCK | egular courts above the + 2 | 106170 | 1191 | 1 | 38.00 | 44694.46 | 0.00 | 0.05 | N | O | 01-28-1997 | 01-14-1997 | 02-02-1997 | TAKE BACK RETURN | RAIL | ven requests. deposits breach a + 3 | 4297 | 1798 | 1 | 45.00 | 54058.05 | 0.06 | 0.00 | R | F | 02-02-1994 | 01-04-1994 | 02-23-1994 | NONE | AIR | ongside of the furiously brave acco + 3 | 19036 | 6540 | 2 | 49.00 | 46796.47 | 0.10 | 0.00 | R | F | 11-09-1993 | 12-20-1993 | 11-24-1993 | TAKE BACK RETURN | RAIL | unusual accounts. eve + 3 | 29380 | 1883 | 4 | 2.00 | 2618.76 | 0.01 | 0.06 | A | F | 12-04-1993 | 01-07-1994 | 01-01-1994 | NONE | TRUCK | y. fluffily pending d +(10 rows) + +-- explain the query to see actual plan. We expect to see only one aggregation +-- node since group by columns guarantees the uniqueness. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Group + Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(17 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see only one +-- aggregation node since group by columns guarantees the uniqueness. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Group + Group Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct on count distinct +SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + count | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 2 | 2 + 3 | 1 + 3 | 2 + 3 | 3 + 4 | 1 + 4 | 2 + 4 | 3 + 4 | 4 + 5 | 2 + 5 | 3 + 5 | 4 + 5 | 5 + 6 | 2 + 6 | 3 + 6 | 4 + 6 | 5 + 6 | 6 + 7 | 2 + 7 | 3 + 7 | 4 + 7 | 5 + 7 | 6 + 7 | 7 +(25 rows) + +-- explain the query to see actual plan. We expect to see aggregation plan for +-- the outer distinct. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count, remote_scan.count_1 + -> HashAggregate + Group Key: remote_scan.count, remote_scan.count_1 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort + unique +-- plans for the outer distinct. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.count, remote_scan.count_1 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +SET enable_hashagg TO on; +-- distinct on aggregation with filter and expression +SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + count +--------------------------------------------------------------------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> HashAggregate + Group Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +SELECT public.plan_without_result_lines($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; +$Q$); + plan_without_result_lines +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> GroupAggregate + Group Key: remote_scan.worker_column_2 + -> Sort + Sort Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +SET enable_hashagg TO on; +-- explain the query to see actual plan with array_agg aggregation. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.array_length + -> HashAggregate + Group Key: remote_scan.array_length, remote_scan.array_agg + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +-- check the plan if the hash aggreate is disabled. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.array_length, remote_scan.array_agg + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +SET enable_hashagg TO on; +-- distinct on non-partition column with aggregate +-- this is the same as non-distinct version due to group by +SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + l_partkey | count +--------------------------------------------------------------------- + 1051 | 3 + 1927 | 3 + 6983 | 3 + 15283 | 3 + 87761 | 3 + 136884 | 3 + 149926 | 3 + 160895 | 3 + 177771 | 3 + 188804 | 3 + 199146 | 3 +(11 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_partkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.l_partkey + Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 2) + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_partkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- distinct on non-partition column and avg +SELECT DISTINCT l_partkey, avg(l_linenumber) + FROM lineitem_hash_part + WHERE l_partkey < 500 + GROUP BY 1 + HAVING avg(l_linenumber) > 2 + ORDER BY 1; + l_partkey | avg +--------------------------------------------------------------------- + 18 | 7.0000000000000000 + 79 | 6.0000000000000000 + 149 | 4.5000000000000000 + 175 | 5.0000000000000000 + 179 | 6.0000000000000000 + 182 | 3.0000000000000000 + 222 | 4.0000000000000000 + 278 | 3.0000000000000000 + 299 | 7.0000000000000000 + 308 | 7.0000000000000000 + 309 | 5.0000000000000000 + 321 | 3.0000000000000000 + 337 | 6.0000000000000000 + 364 | 3.0000000000000000 + 403 | 4.0000000000000000 +(15 rows) + +-- distinct on multiple non-partition columns +SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + l_partkey | l_suppkey +--------------------------------------------------------------------- + 2132 | 4633 + 4297 | 1798 + 37531 | 35 + 44161 | 6666 + 44706 | 4707 + 67831 | 5350 + 85811 | 8320 + 94368 | 6878 + 108338 | 849 + 108570 | 8571 + 137267 | 4807 + 137469 | 9983 + 173489 | 3490 + 196156 | 1195 + 197921 | 441 +(15 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> HashAggregate + Group Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Unique + -> Sort + Sort Key: l_partkey, l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: ((l_orderkey < 100) AND (l_shipmode = 'AIR'::bpchar)) +(14 rows) + +-- distinct on partition column +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1, 2, 3; + l_orderkey | l_partkey | l_suppkey +--------------------------------------------------------------------- + 1 | 2132 | 4633 + 2 | 106170 | 1191 + 3 | 4297 | 1798 + 4 | 88035 | 5560 + 5 | 37531 | 35 + 6 | 139636 | 2150 + 7 | 79251 | 1759 + 32 | 2743 | 7744 + 33 | 33918 | 3919 + 34 | 88362 | 871 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Unique + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 35) +(13 rows) + +-- distinct on non-partition column +-- note order by is required here +-- otherwise query results will be different since +-- distinct on clause is on non-partition column +SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + l_partkey | l_orderkey +--------------------------------------------------------------------- + 18 | 12005 + 79 | 5121 + 91 | 2883 + 149 | 807 + 175 | 4102 + 179 | 2117 + 182 | 548 + 195 | 2528 + 204 | 10048 + 222 | 9413 + 245 | 9446 + 278 | 1287 + 299 | 1122 + 308 | 11137 + 309 | 2374 + 318 | 321 + 321 | 5984 + 337 | 10403 + 350 | 13698 + 358 | 4323 +(20 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: l_partkey, l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- distinct on with joins +-- each customer's first order key +SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + o_custkey | l_orderkey +--------------------------------------------------------------------- + 1 | 9154 + 2 | 10563 + 4 | 320 + 5 | 11682 + 7 | 10402 + 8 | 102 + 10 | 1602 + 11 | 12800 + 13 | 994 + 14 | 11011 +(10 rows) + +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- explain without order by +-- notice master plan has order by on distinct on column +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- each customer's each order's first l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20 + ORDER BY 1,2,3; + o_custkey | l_orderkey | l_linenumber | l_partkey +--------------------------------------------------------------------- + 1 | 9154 | 1 | 86513 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 1 | 147459 + 4 | 320 | 1 | 4415 + 4 | 739 | 1 | 84489 + 4 | 10688 | 1 | 45037 + 4 | 10788 | 1 | 50814 + 4 | 13728 | 1 | 86216 + 5 | 11682 | 1 | 31634 + 5 | 11746 | 1 | 180724 + 5 | 14308 | 1 | 157430 + 7 | 10402 | 1 | 53661 + 7 | 13031 | 1 | 112161 + 7 | 14145 | 1 | 138729 + 7 | 14404 | 1 | 143034 + 8 | 102 | 1 | 88914 + 8 | 164 | 1 | 91309 + 8 | 13601 | 1 | 40504 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 1 | 86241 + 10 | 11431 | 1 | 62112 + 10 | 13124 | 1 | 29414 + 11 | 12800 | 1 | 152806 + 13 | 994 | 1 | 64486 + 13 | 1603 | 1 | 38191 + 13 | 4704 | 1 | 77934 + 13 | 9927 | 1 | 875 + 14 | 11011 | 1 | 172485 + 17 | 896 | 1 | 38675 + 17 | 5507 | 1 | 9600 + 19 | 353 | 1 | 119305 + 19 | 1504 | 1 | 81389 + 19 | 1669 | 1 | 78373 + 19 | 5893 | 1 | 133707 + 19 | 9954 | 1 | 92138 + 19 | 14885 | 1 | 36154 +(36 rows) + +-- explain without order by +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- each customer's each order's last l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2,3 DESC; + o_custkey | l_orderkey | l_linenumber | l_partkey +--------------------------------------------------------------------- + 1 | 9154 | 7 | 173448 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 4 | 110741 + 4 | 320 | 2 | 192158 + 4 | 739 | 5 | 187523 + 4 | 10688 | 2 | 132574 + 4 | 10788 | 4 | 196473 + 4 | 13728 | 3 | 12450 + 5 | 11682 | 3 | 177152 + 5 | 11746 | 7 | 193807 + 5 | 14308 | 3 | 140916 + 7 | 10402 | 2 | 64514 + 7 | 13031 | 6 | 7761 + 7 | 14145 | 6 | 130723 + 7 | 14404 | 7 | 35349 + 8 | 102 | 4 | 61158 + 8 | 164 | 7 | 3037 + 8 | 13601 | 5 | 12470 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 5 | 135675 + 10 | 11431 | 7 | 8563 + 10 | 13124 | 3 | 67055 + 11 | 12800 | 5 | 179110 + 13 | 994 | 4 | 130471 + 13 | 1603 | 2 | 65209 + 13 | 4704 | 3 | 63081 + 13 | 9927 | 6 | 119356 + 14 | 11011 | 7 | 95939 +(28 rows) + +-- subqueries +SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 1 | 2132 + 1 | 15635 + 1 | 24027 + 1 | 63700 + 1 | 67310 + 1 | 155190 + 2 | 106170 + 3 | 4297 + 3 | 19036 + 3 | 29380 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: l_orderkey, l_partkey + -> HashAggregate + Group Key: l_orderkey, l_partkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | cnt +--------------------------------------------------------------------- + 1 | 6 + 2 | 1 + 3 | 6 + 4 | 1 + 5 | 3 + 6 | 1 + 7 | 7 + 32 | 6 + 33 | 4 + 34 | 3 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.cnt + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.cnt + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: lineitem_hash_part.l_orderkey, (count(*)) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey, count(*) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(18 rows) + +-- distinct on partition column +-- random() is added to inner query to prevent flattening +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 1 | 2132 + 2 | 106170 + 3 | 4297 + 4 | 88035 + 5 | 37531 + 6 | 139636 + 7 | 79251 + 32 | 2743 + 33 | 33918 + 34 | 88362 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_orderkey, q.l_partkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +-- distinct on non-partition column +SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 12005 | 18 + 5121 | 79 + 2883 | 91 + 807 | 149 + 4102 | 175 + 2117 | 179 + 548 | 182 + 2528 | 195 + 10048 | 204 + 9413 | 222 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_partkey, q.l_orderkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + diff --git a/src/test/regress/sql/multi_select_distinct.sql b/src/test/regress/sql/multi_select_distinct.sql index 39f49b746..f94fa35f6 100644 --- a/src/test/regress/sql/multi_select_distinct.sql +++ b/src/test/regress/sql/multi_select_distinct.sql @@ -3,6 +3,9 @@ -- -- Tests select distinct, and select distinct on features. -- +-- This test file has an alternative output because some HashAggregate +-- leverages in PG15 compared to PG13/PG14 +-- ANALYZE lineitem_hash_part;