-- -- COMPLEX_COUNT_DISTINCT -- SET citus.next_shard_id TO 240000; CREATE TABLE lineitem_hash ( l_orderkey bigint not null, l_partkey integer not null, l_suppkey integer not null, l_linenumber integer not null, l_quantity decimal(15, 2) not null, l_extendedprice decimal(15, 2) not null, l_discount decimal(15, 2) not null, l_tax decimal(15, 2) not null, l_returnflag char(1) not null, l_linestatus char(1) not null, l_shipdate date not null, l_commitdate date not null, l_receiptdate date not null, l_shipinstruct char(25) not null, l_shipmode char(10) not null, l_comment varchar(44) not null, PRIMARY KEY(l_orderkey, l_linenumber) ); SELECT master_create_distributed_table('lineitem_hash', 'l_orderkey', 'hash'); SELECT master_create_worker_shards('lineitem_hash', 8, 1); \copy lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|' \copy lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|' ANALYZE lineitem_hash; SET citus.task_executor_type to "task-tracker"; -- count(distinct) is supported on top level query if there -- is a grouping on the partition key SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 1 DESC LIMIT 10; EXPLAIN (COSTS false, VERBOSE true) SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 1 DESC LIMIT 10; -- it is also supported if there is no grouping or grouping is on non-partition field SELECT count(DISTINCT l_partkey) FROM lineitem_hash ORDER BY 1 DESC LIMIT 10; EXPLAIN (COSTS false, VERBOSE true) SELECT count(DISTINCT l_partkey) FROM lineitem_hash ORDER BY 1 DESC LIMIT 10; SELECT l_shipmode, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 2 DESC, 1 DESC LIMIT 10; EXPLAIN (COSTS false, VERBOSE true) SELECT l_shipmode, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 2 DESC, 1 DESC LIMIT 10; -- mixed mode count distinct, grouped by partition column SELECT l_orderkey, count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 3 DESC, 2 DESC, 1 LIMIT 10; EXPLAIN (COSTS false, VERBOSE true) SELECT l_orderkey, count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 3 DESC, 2 DESC, 1 LIMIT 10; -- partition/non-partition column count distinct no grouping SELECT count(distinct l_orderkey), count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash; EXPLAIN (COSTS false, VERBOSE true) SELECT count(distinct l_orderkey), count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash; -- distinct/non-distinct on partition and non-partition columns SELECT count(distinct l_orderkey), count(l_orderkey), count(distinct l_partkey), count(l_partkey), count(distinct l_shipmode), count(l_shipmode) FROM lineitem_hash; -- mixed mode count distinct, grouped by non-partition column SELECT l_shipmode, count(distinct l_partkey), count(distinct l_orderkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 1, 2 DESC, 3 DESC; -- mixed mode count distinct, grouped by non-partition column -- having on partition column SELECT l_shipmode, count(distinct l_partkey), count(distinct l_orderkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_orderkey) > 1300 ORDER BY 1, 2 DESC; -- same but having clause is not on target list SELECT l_shipmode, count(distinct l_partkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_orderkey) > 1300 ORDER BY 1, 2 DESC; -- mixed mode count distinct, grouped by non-partition column -- having on non-partition column SELECT l_shipmode, count(distinct l_partkey), count(distinct l_suppkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_suppkey) > 1550 ORDER BY 1, 2 DESC; -- same but having clause is not on target list SELECT l_shipmode, count(distinct l_partkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_suppkey) > 1550 ORDER BY 1, 2 DESC; EXPLAIN (COSTS false, VERBOSE true) SELECT l_shipmode, count(distinct l_partkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_suppkey) > 1550 ORDER BY 1, 2 DESC; -- count distinct is supported on single table subqueries SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; SELECT * FROM ( SELECT l_partkey, count(DISTINCT l_orderkey) FROM lineitem_hash GROUP BY l_partkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; EXPLAIN (COSTS false, VERBOSE true) SELECT * FROM ( SELECT l_partkey, count(DISTINCT l_orderkey) FROM lineitem_hash GROUP BY l_partkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count distinct with filters SELECT l_orderkey, count(DISTINCT l_suppkey) FILTER (WHERE l_shipmode = 'AIR'), count(DISTINCT l_suppkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 3 DESC, 1 LIMIT 10; EXPLAIN (COSTS false, VERBOSE true) SELECT l_orderkey, count(DISTINCT l_suppkey) FILTER (WHERE l_shipmode = 'AIR'), count(DISTINCT l_suppkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 3 DESC, 1 LIMIT 10; -- group by on non-partition column SELECT l_suppkey, count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash GROUP BY l_suppkey ORDER BY 2 DESC, 1 DESC LIMIT 10; -- explaining the same query fails EXPLAIN (COSTS false, VERBOSE true) SELECT l_suppkey, count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash GROUP BY l_suppkey ORDER BY 2 DESC, 1 DESC LIMIT 10; -- without group by, on partition column SELECT count(DISTINCT l_orderkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash; -- without group by, on non-partition column SELECT count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash; SELECT count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR'), count(DISTINCT l_partkey), count(DISTINCT l_shipdate) FROM lineitem_hash; -- filter column already exists in target list SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FILTER (WHERE l_orderkey > 100) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- filter column does not exist in target list SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- case expr in count distinct is supported. -- count orders partkeys if l_shipmode is air SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode = 'AIR' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; -- text like operator is also supported SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode like '%A%' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_linenumber, count(DISTINCT 1) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_linenumber, count(DISTINCT (random() * 5)::int) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- even non-const function calls are supported within count distinct SELECT * FROM ( SELECT l_orderkey, count(DISTINCT (random() * 5)::int = l_linenumber) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 0; -- multiple nested subquery SELECT total, avg(avg_count) as total_avg_count FROM ( SELECT number_sum, count(DISTINCT l_suppkey) as total, avg(total_count) avg_count FROM ( SELECT l_suppkey, sum(l_linenumber) as number_sum, count(DISTINCT l_shipmode) as total_count FROM lineitem_hash WHERE l_partkey > 100 and l_quantity > 2 and l_orderkey < 10000 GROUP BY l_suppkey) as distributed_table WHERE number_sum >= 10 GROUP BY number_sum) as distributed_table_2 GROUP BY total ORDER BY total_avg_count DESC; -- multiple cases query SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey WHEN l_shipmode = 'AIR' THEN l_quantity WHEN l_shipmode = 'SHIP' THEN l_discount ELSE l_suppkey END) as count, l_shipdate FROM lineitem_hash GROUP BY l_shipdate) sub WHERE count > 0 ORDER BY 1 DESC, 2 DESC LIMIT 10; -- count DISTINCT expression SELECT * FROM ( SELECT l_quantity, count(DISTINCT ((l_orderkey / 1000) * 1000 )) as count FROM lineitem_hash GROUP BY l_quantity) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count DISTINCT is part of an expression which inclues another aggregate SELECT * FROM ( SELECT sum(((l_partkey * l_tax) / 100)) / count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE l_suppkey END) as avg, l_shipmode FROM lineitem_hash GROUP BY l_shipmode) sub ORDER BY 1 DESC, 2 DESC LIMIT 10; --- count DISTINCT CASE WHEN expression SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_linenumber WHEN l_shipmode = 'AIR' THEN l_linenumber + 10 ELSE 2 END) as avg FROM lineitem_hash GROUP BY l_shipdate) sub ORDER BY 1 DESC LIMIT 10; -- COUNT DISTINCT (c1, c2) SELECT * FROM (SELECT l_shipmode, count(DISTINCT (l_shipdate, l_tax)) FROM lineitem_hash GROUP BY l_shipmode) t ORDER BY 2 DESC,1 DESC LIMIT 10; -- other distinct aggregate are not supported SELECT * FROM ( SELECT l_linenumber, sum(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; SELECT * FROM ( SELECT l_linenumber, avg(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- whole row references, oid, and ctid are not supported in count distinct -- test table does not have oid or ctid enabled, so tests for them are skipped SELECT * FROM ( SELECT l_linenumber, count(DISTINCT lineitem_hash) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; SELECT * FROM ( SELECT l_linenumber, count(DISTINCT lineitem_hash.*) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; DROP TABLE lineitem_hash;