--
-- MULTI_AGG_APPROXIMATE_DISTINCT
--
-- Recorded psql session: every statement is followed by the output it is
-- expected to produce. The queries compare exact count(distinct) results with
-- approximate ones controlled by citus.count_distinct_error_rate.
-- NOTE(review): this looks like a regression-test expected-output file; if so,
-- any comment added here must also be added to the matching input script
-- -- confirm before merging.
-- Create HLL extension if present, print false result otherwise
-- \gset stores the single result column in the psql variable create_cmd, and
-- the following :create_cmd line executes whichever statement it holds.
-- No result table follows here; the ELSE branch would have printed an
-- hll_present column, so presumably the CREATE EXTENSION branch was taken.
SELECT CASE WHEN COUNT(*) > 0 THEN
    'CREATE EXTENSION HLL'
ELSE 'SELECT false AS hll_present' END
AS create_cmd FROM pg_available_extensions()
WHERE name = 'hll'
\gset
:create_cmd;
-- Try to execute count(distinct) when approximate distincts aren't enabled
-- The value returned here (2985) is the baseline: the same query returns it
-- again at the end of the file once the error rate is reset to 0.0.
SELECT count(distinct l_orderkey) FROM lineitem;
 count 
-------
  2985
(1 row)

-- Check approximate count(distinct) at different precisions / error rates
-- Both results below differ from the baseline above: 2612 at an error rate
-- of 0.1 and 2967 at an error rate of 0.01.
SET citus.count_distinct_error_rate = 0.1;
SELECT count(distinct l_orderkey) FROM lineitem;
 count 
-------
  2612
(1 row)

SET citus.count_distinct_error_rate = 0.01;
SELECT count(distinct l_orderkey) FROM lineitem;
 count 
-------
  2967
(1 row)

-- Check approximate count(distinct) for different data types
-- The error rate is still the 0.01 set above; nothing changes it until the
-- final section of this file.
SELECT count(distinct l_partkey) FROM lineitem;
 count 
-------
 11654
(1 row)

SELECT count(distinct l_extendedprice) FROM lineitem;
 count 
-------
 11691
(1 row)

SELECT count(distinct l_shipdate) FROM lineitem;
 count 
-------
  2483
(1 row)

SELECT count(distinct l_comment) FROM lineitem;
 count 
-------
 11788
(1 row)

-- Check that we can execute approximate count(distinct) on complex expressions
SELECT count(distinct (l_orderkey * 2 + 1)) FROM lineitem;
 count 
-------
  2980
(1 row)

SELECT count(distinct extract(month from l_shipdate)) AS my_month FROM lineitem;
 my_month 
----------
       12
(1 row)

-- Both operands are count() results, so this is an integer division: the
-- counts recorded above (11654 and 2967) yield 3 rather than a fraction.
SELECT count(distinct l_partkey) / count(distinct l_orderkey) FROM lineitem;
 ?column? 
----------
        3
(1 row)

-- Check that we can execute approximate count(distinct) on select queries that
-- contain different filter, join, sort and limit clauses
SELECT count(distinct l_orderkey) FROM lineitem
    WHERE octet_length(l_comment) + octet_length('randomtext'::text) > 40;
 count 
-------
  2355
(1 row)

SELECT count(DISTINCT l_orderkey) FROM lineitem, orders
    WHERE l_orderkey = o_orderkey AND l_quantity < 5;
 count 
-------
   835
(1 row)

-- Ties on distinct_order_count (four groups at 223) are broken by the
-- secondary l_quantity sort key, which keeps the row order stable.
SELECT count(DISTINCT l_orderkey) as distinct_order_count, l_quantity FROM lineitem
    WHERE l_quantity < 32.0
    GROUP BY l_quantity
    ORDER BY distinct_order_count ASC, l_quantity ASC
    LIMIT 10;
 distinct_order_count | l_quantity 
----------------------+------------
                  210 |      29.00
                  216 |      13.00
                  217 |      16.00
                  219 |       3.00
                  220 |      18.00
                  222 |      14.00
                  223 |       7.00
                  223 |      17.00
                  223 |      26.00
                  223 |      31.00
(10 rows)

-- Check that approximate count(distinct) works at a table in a schema other than public
-- create necessary objects
SET citus.next_shard_id TO 20000000;
SET citus.next_placement_id TO 20000000;
CREATE SCHEMA test_count_distinct_schema;
CREATE TABLE test_count_distinct_schema.nation_hash(
    n_nationkey integer not null,
    n_name char(25) not null,
    n_regionkey integer not null,
    n_comment varchar(152)
);
-- Hash-distribute the new table on n_nationkey.
SELECT create_distributed_table('test_count_distinct_schema.nation_hash', 'n_nationkey', 'hash');
 create_distributed_table 
--------------------------
 
(1 row)

-- The rows loaded from STDIN are not shown in this transcript.
\copy test_count_distinct_schema.nation_hash FROM STDIN with delimiter '|';
-- First reference the table by its schema-qualified name while search_path
-- points at public.
SET search_path TO public;
SET citus.count_distinct_error_rate TO 0.01;
SELECT COUNT (DISTINCT n_regionkey) FROM test_count_distinct_schema.nation_hash;
 count 
-------
     3
(1 row)

-- test with search_path is set
-- Same query with an unqualified table name, resolved through search_path.
SET search_path TO test_count_distinct_schema;
SELECT COUNT (DISTINCT n_regionkey) FROM nation_hash;
 count 
-------
     3
(1 row)

SET search_path TO public;
-- If we have an order by on count(distinct) that we intend to push down to
-- worker nodes, we need to error out. Otherwise, we are fine.
-- NOTE(review): the HINT below refers to approximations "for limit", so this
-- setting presumably turns on approximate LIMIT handling -- confirm.
SET citus.limit_clause_row_fetch_count = 1000;
-- Ordering by the approximated count_distinct column is rejected.
SELECT l_returnflag, count(DISTINCT l_shipdate) as count_distinct, count(*) as total
    FROM lineitem
    GROUP BY l_returnflag
    ORDER BY count_distinct
    LIMIT 10;
ERROR:  cannot approximate count(distinct) and order by it
HINT:  You might need to disable approximations for either count(distinct) or limit through configuration.
-- The same query ordered by the plain count(*) column succeeds.
SELECT l_returnflag, count(DISTINCT l_shipdate) as count_distinct, count(*) as total
    FROM lineitem
    GROUP BY l_returnflag
    ORDER BY total
    LIMIT 10;
 l_returnflag | count_distinct | total 
--------------+----------------+-------
 R            |           1103 |  2901
 A            |           1108 |  2944
 N            |           1265 |  6155
(3 rows)

-- Three spellings of a conditional count of l_partkey for 'AIR' shipments:
-- count with FILTER, count(DISTINCT) with FILTER, and count(DISTINCT) over a
-- CASE expression. In the ten rows shown all three columns agree.
-- The positional ORDER BY sorts on the second column, then on l_orderkey.
SELECT
    l_orderkey,
    count(l_partkey) FILTER (WHERE l_shipmode = 'AIR'),
    count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR'),
    count(DISTINCT CASE WHEN l_shipmode = 'AIR' THEN l_partkey ELSE NULL END)
    FROM lineitem
    GROUP BY l_orderkey
    ORDER BY 2 DESC, 1 DESC
    LIMIT 10;
 l_orderkey | count | count | count 
------------+-------+-------+-------
      12005 |     4 |     4 |     4
       5409 |     4 |     4 |     4
       4964 |     4 |     4 |     4
      14848 |     3 |     3 |     3
      14496 |     3 |     3 |     3
      13473 |     3 |     3 |     3
      13122 |     3 |     3 |     3
      12929 |     3 |     3 |     3
      12645 |     3 |     3 |     3
      12417 |     3 |     3 |     3
(10 rows)

-- Check that we can revert config and disable count(distinct) approximations
-- With the error rate back at 0.0 the result matches the 2985 returned by the
-- first query in this file.
SET citus.count_distinct_error_rate = 0.0;
SELECT count(distinct l_orderkey) FROM lineitem;
 count 
-------
  2985
(1 row)
