-- -- COMPLEX_COUNT_DISTINCT -- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 240000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 240000; CREATE TABLE lineitem_hash ( l_orderkey bigint not null, l_partkey integer not null, l_suppkey integer not null, l_linenumber integer not null, l_quantity decimal(15, 2) not null, l_extendedprice decimal(15, 2) not null, l_discount decimal(15, 2) not null, l_tax decimal(15, 2) not null, l_returnflag char(1) not null, l_linestatus char(1) not null, l_shipdate date not null, l_commitdate date not null, l_receiptdate date not null, l_shipinstruct char(25) not null, l_shipmode char(10) not null, l_comment varchar(44) not null, PRIMARY KEY(l_orderkey, l_linenumber) ); SELECT master_create_distributed_table('lineitem_hash', 'l_orderkey', 'hash'); SELECT master_create_worker_shards('lineitem_hash', 8, 1); \COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|' \COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|' SET citus.task_executor_type to "task-tracker"; -- count(distinct) is supported on top level query if there -- is a grouping on the partition key SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 1 DESC LIMIT 10; -- it is not supported if there is no grouping or grouping is on non-partition field SELECT count(DISTINCT l_partkey) FROM lineitem_hash ORDER BY 1 DESC LIMIT 10; SELECT l_shipmode, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count distinct is supported on single table subqueries SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; SELECT * FROM ( SELECT l_partkey, count(DISTINCT l_orderkey) FROM lineitem_hash GROUP BY l_partkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- case expr in count distinct is supported. -- count orders partkeys if l_shipmode is air SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode = 'AIR' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; -- text like operator is also supported SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode like '%A%' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_orderkey, count(DISTINCT 1) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_orderkey, count(DISTINCT (random() * 5)::int) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- even non-const function calls are supported within count distinct SELECT * FROM ( SELECT l_orderkey, count(DISTINCT (random() * 5)::int = l_linenumber) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 0; -- multiple nested subquery SELECT total, avg(avg_count) as total_avg_count FROM ( SELECT number_sum, count(DISTINCT l_suppkey) as total, avg(total_count) avg_count FROM ( SELECT l_suppkey, sum(l_linenumber) as number_sum, count(DISTINCT l_shipmode) as total_count FROM lineitem_hash WHERE l_partkey > 100 and l_quantity > 2 and l_orderkey < 10000 GROUP BY l_suppkey) as distributed_table WHERE number_sum >= 10 GROUP BY number_sum) as distributed_table_2 GROUP BY total ORDER BY total_avg_count DESC; -- multiple cases query SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey WHEN l_shipmode = 'AIR' THEN l_quantity WHEN l_shipmode = 'SHIP' THEN l_discount ELSE l_suppkey END) as count, l_shipdate FROM lineitem_hash GROUP BY l_shipdate) sub WHERE count > 0 ORDER BY 1 DESC, 2 DESC LIMIT 10; -- count DISTINCT expression SELECT * FROM ( SELECT l_quantity, count(DISTINCT ((l_orderkey / 1000) * 1000 )) as count FROM lineitem_hash GROUP BY l_quantity) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; -- count DISTINCT is part of an expression which inclues another aggregate SELECT * FROM ( SELECT sum(((l_partkey * l_tax) / 100)) / count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE l_suppkey END) as avg, l_shipmode FROM lineitem_hash GROUP BY l_shipmode) sub ORDER BY 1 DESC, 2 DESC LIMIT 10; --- count DISTINCT CASE WHEN expression SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_linenumber WHEN l_shipmode = 'AIR' THEN l_linenumber + 10 ELSE 2 END) as avg FROM lineitem_hash GROUP BY l_shipdate) sub ORDER BY 1 DESC LIMIT 10; -- COUNT DISTINCT (c1, c2) SELECT * FROM (SELECT l_shipmode, count(DISTINCT (l_shipdate, l_tax)) FROM lineitem_hash GROUP BY l_shipmode) t ORDER BY 2 DESC,1 DESC LIMIT 10; -- other distinct aggregate are not supported SELECT * FROM ( SELECT l_orderkey, sum(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; SELECT * FROM ( SELECT l_orderkey, avg(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; -- whole row references, oid, and ctid are not supported in count distinct -- test table does not have oid or ctid enabled, so tests for them are skipped SELECT * FROM ( SELECT l_orderkey, count(DISTINCT lineitem_hash) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; SELECT * FROM ( SELECT l_orderkey, count(DISTINCT lineitem_hash.*) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; DROP TABLE lineitem_hash;