-- -- COMPLEX_COUNT_DISTINCT -- CREATE TABLE lineitem_hash ( l_orderkey bigint not null, l_partkey integer not null, l_suppkey integer not null, l_linenumber integer not null, l_quantity decimal(15, 2) not null, l_extendedprice decimal(15, 2) not null, l_discount decimal(15, 2) not null, l_tax decimal(15, 2) not null, l_returnflag char(1) not null, l_linestatus char(1) not null, l_shipdate date not null, l_commitdate date not null, l_receiptdate date not null, l_shipinstruct char(25) not null, l_shipmode char(10) not null, l_comment varchar(44) not null, PRIMARY KEY(l_orderkey, l_linenumber) ); SELECT master_create_distributed_table('lineitem_hash', 'l_orderkey', 'hash'); master_create_distributed_table --------------------------------- (1 row) SELECT master_create_worker_shards('lineitem_hash', 8, 1); master_create_worker_shards ----------------------------- (1 row) \COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|' \COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|' SET citus.task_executor_type to "task-tracker"; -- count(distinct) is supported on top level query if there -- is a grouping on the partition key SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14885 | 7 14884 | 7 14821 | 7 14790 | 7 14785 | 7 14755 | 7 14725 | 7 14694 | 7 14627 | 7 14624 | 7 (10 rows) -- it is not supported if there is no grouping or grouping is on non-partition field SELECT count(DISTINCT l_partkey) FROM lineitem_hash ORDER BY 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: table partitioning is unsuitable for aggregate (distinct) HINT: You can load the hll extension from contrib packages and enable distinct approximations. SELECT l_shipmode, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: table partitioning is unsuitable for aggregate (distinct) HINT: You can load the hll extension from contrib packages and enable distinct approximations. -- count distinct is supported on single table subqueries SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14885 | 7 14884 | 7 14821 | 7 14790 | 7 14785 | 7 14755 | 7 14725 | 7 14694 | 7 14627 | 7 14624 | 7 (10 rows) SELECT * FROM ( SELECT l_partkey, count(DISTINCT l_orderkey) FROM lineitem_hash GROUP BY l_partkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; l_partkey | count -----------+------- 199146 | 3 188804 | 3 177771 | 3 160895 | 3 149926 | 3 136884 | 3 87761 | 3 15283 | 3 6983 | 3 1927 | 3 (10 rows) -- case expr in count distinct is supported. -- count orders partkeys if l_shipmode is air SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode = 'AIR' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 12005 | 4 5409 | 4 4964 | 4 14848 | 3 14496 | 3 13473 | 3 13122 | 3 12929 | 3 12645 | 3 12417 | 3 (10 rows) -- text like operator is also supported SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode like '%A%' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14275 | 7 14181 | 7 13605 | 7 12707 | 7 12384 | 7 11746 | 7 10727 | 7 10467 | 7 5636 | 7 4614 | 7 (10 rows) -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_orderkey, count(DISTINCT 1) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: aggregate (distinct) with no columns is unsupported HINT: You can load the hll extension from contrib packages and enable distinct approximations. -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_orderkey, count(DISTINCT (random() * 5)::int) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: aggregate (distinct) with no columns is unsupported HINT: You can load the hll extension from contrib packages and enable distinct approximations. -- even non-const function calls are supported within count distinct SELECT * FROM ( SELECT l_orderkey, count(DISTINCT (random() * 5)::int = l_linenumber) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 0; l_orderkey | count ------------+------- (0 rows) -- multiple nested subquery SELECT total, avg(avg_count) as total_avg_count FROM ( SELECT number_sum, count(DISTINCT l_suppkey) as total, avg(total_count) avg_count FROM ( SELECT l_suppkey, sum(l_linenumber) as number_sum, count(DISTINCT l_shipmode) as total_count FROM lineitem_hash WHERE l_partkey > 100 and l_quantity > 2 and l_orderkey < 10000 GROUP BY l_suppkey) as distributed_table WHERE number_sum >= 10 GROUP BY number_sum) as distributed_table_2 GROUP BY total ORDER BY total_avg_count DESC; total | total_avg_count -------+-------------------- 1 | 3.6000000000000000 6 | 2.8333333333333333 10 | 2.6000000000000000 27 | 2.5555555555555556 32 | 2.4687500000000000 77 | 2.1948051948051948 57 | 2.1754385964912281 (7 rows) -- multiple cases query SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey WHEN l_shipmode = 'AIR' THEN l_quantity WHEN l_shipmode = 'SHIP' THEN l_discount ELSE l_suppkey END) as count, l_shipdate FROM lineitem_hash GROUP BY l_shipdate) sub WHERE count > 0 ORDER BY 1 DESC, 2 DESC LIMIT 10; count | l_shipdate -------+------------ 14 | 07-30-1997 13 | 05-26-1998 13 | 08-08-1997 13 | 11-17-1995 13 | 01-09-1993 12 | 01-15-1998 12 | 10-15-1997 12 | 09-07-1997 12 | 06-02-1997 12 | 03-14-1997 (10 rows) -- count DISTINCT expression SELECT * FROM ( SELECT l_quantity, count(DISTINCT ((l_orderkey / 1000) * 1000 )) as count FROM lineitem_hash GROUP BY l_quantity) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; l_quantity | count ------------+------- 48.00 | 13 47.00 | 13 37.00 | 13 33.00 | 13 26.00 | 13 25.00 | 13 23.00 | 13 21.00 | 13 15.00 | 13 12.00 | 13 (10 rows) -- count DISTINCT is part of an expression which inclues another aggregate SELECT * FROM ( SELECT sum(((l_partkey * l_tax) / 100)) / count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE l_suppkey END) as avg, l_shipmode FROM lineitem_hash GROUP BY l_shipmode) sub ORDER BY 1 DESC, 2 DESC LIMIT 10; avg | l_shipmode -------------------------+------------ 44.82904609027336300064 | MAIL 44.80704536679536679537 | SHIP 44.68891732736572890026 | AIR 44.34106724470134874759 | REG AIR 43.12739987269255251432 | FOB 43.07299253636938646426 | RAIL 40.50298377916903813318 | TRUCK (7 rows) --- count DISTINCT CASE WHEN expression SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_linenumber WHEN l_shipmode = 'AIR' THEN l_linenumber + 10 ELSE 2 END) as avg FROM lineitem_hash GROUP BY l_shipdate) sub ORDER BY 1 DESC LIMIT 10; avg ----- 7 6 6 6 6 6 6 6 5 5 (10 rows) -- COUNT DISTINCT (c1, c2) SELECT * FROM (SELECT l_shipmode, count(DISTINCT (l_shipdate, l_tax)) FROM lineitem_hash GROUP BY l_shipmode) t ORDER BY 2 DESC,1 DESC LIMIT 10; l_shipmode | count ------------+------- TRUCK | 1689 MAIL | 1683 FOB | 1655 AIR | 1650 SHIP | 1644 RAIL | 1636 REG AIR | 1607 (7 rows) -- other distinct aggregate are not supported SELECT * FROM ( SELECT l_orderkey, sum(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: Only count(distinct) aggregate is supported in subqueries SELECT * FROM ( SELECT l_orderkey, avg(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: Only count(distinct) aggregate is supported in subqueries -- whole row references, oid, and ctid are not supported in count distinct -- test table does not have oid or ctid enabled, so tests for them are skipped SELECT * FROM ( SELECT l_orderkey, count(DISTINCT lineitem_hash) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute count (distinct) DETAIL: Non-column references are not supported yet SELECT * FROM ( SELECT l_orderkey, count(DISTINCT lineitem_hash.*) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute count (distinct) DETAIL: Non-column references are not supported yet DROP TABLE lineitem_hash;