--
-- MULTI_AGG_DISTINCT
--
-- Restart the shard id and job id sequences at 200000.
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 200000;
ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 200000;
-- Build a range partitioned copy of lineitem, distributed on l_orderkey, and
-- stage the two lineitem data files into it.
CREATE TABLE lineitem_range (
    l_orderkey      BIGINT         NOT NULL,
    l_partkey       INTEGER        NOT NULL,
    l_suppkey       INTEGER        NOT NULL,
    l_linenumber    INTEGER        NOT NULL,
    l_quantity      DECIMAL(15, 2) NOT NULL,
    l_extendedprice DECIMAL(15, 2) NOT NULL,
    l_discount      DECIMAL(15, 2) NOT NULL,
    l_tax           DECIMAL(15, 2) NOT NULL,
    l_returnflag    CHAR(1)        NOT NULL,
    l_linestatus    CHAR(1)        NOT NULL,
    l_shipdate      DATE           NOT NULL,
    l_commitdate    DATE           NOT NULL,
    l_receiptdate   DATE           NOT NULL,
    l_shipinstruct  CHAR(25)       NOT NULL,
    l_shipmode      CHAR(10)       NOT NULL,
    l_comment       VARCHAR(44)    NOT NULL
);
SELECT master_create_distributed_table('lineitem_range', 'l_orderkey', 'range');
 master_create_distributed_table 
---------------------------------
 
(1 row)

-- citus.shard_max_size is set to 500MB only for the duration of the two loads.
SET citus.shard_max_size TO "500MB";
\STAGE lineitem_range FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|'
\STAGE lineitem_range FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|'
RESET citus.shard_max_size;
-- aggregate(distinct) on the partition column of a range partitioned table
SELECT count(DISTINCT l_orderkey)
    FROM lineitem_range;
 count 
-------
  2985
(1 row)

SELECT avg(DISTINCT l_orderkey)
    FROM lineitem_range;
          avg          
-----------------------
 7463.9474036850921273
(1 row)

-- Next: count(distinct) on a join between a range partitioned table and a
-- single sharded table. A config setting is changed for that query so that
-- none of the tables get repartitioned while it runs.
SET citus.large_table_shard_count TO 2;
-- The comma join is kept as written; only the layout differs.
SELECT p_partkey, count(DISTINCT l_orderkey)
    FROM lineitem_range, part
    WHERE l_partkey = p_partkey
    GROUP BY p_partkey
    ORDER BY p_partkey
    LIMIT 10;
 p_partkey | count 
-----------+-------
        18 |     1
        79 |     1
        91 |     1
       149 |     2
       175 |     1
       179 |     1
       182 |     1
       195 |     1
       204 |     1
       222 |     1
(10 rows)

RESET citus.large_table_shard_count;
-- count(distinct) must be rejected on a non-partition column and on a complex
-- expression over the partition column.
SELECT count(DISTINCT l_partkey)
    FROM lineitem_range;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  table partitioning is unsuitable for aggregate (distinct)
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
SELECT count(DISTINCT (l_orderkey + 1))
    FROM lineitem_range;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  aggregate (distinct) on complex expressions is unsupported
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
-- Append partitioned tables. Start with count(distinct) on a single sharded
-- table.
SELECT count(DISTINCT p_mfgr)
    FROM part;
 count 
-------
     5
(1 row)

SELECT p_mfgr, count(DISTINCT p_partkey)
    FROM part
    GROUP BY p_mfgr
    ORDER BY p_mfgr;
          p_mfgr           | count 
---------------------------+-------
 Manufacturer#1            |   193
 Manufacturer#2            |   190
 Manufacturer#3            |   228
 Manufacturer#4            |   204
 Manufacturer#5            |   185
(5 rows)

-- count(distinct) is rejected when the table is append partitioned and has
-- multiple shards.
SELECT count(DISTINCT o_orderkey)
    FROM orders;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  table partitioning is unsuitable for aggregate (distinct)
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
-- Hash partitioned tables: same column layout as lineitem_range, distributed
-- by hash on l_orderkey.
CREATE TABLE lineitem_hash (
    l_orderkey      BIGINT         NOT NULL,
    l_partkey       INTEGER        NOT NULL,
    l_suppkey       INTEGER        NOT NULL,
    l_linenumber    INTEGER        NOT NULL,
    l_quantity      DECIMAL(15, 2) NOT NULL,
    l_extendedprice DECIMAL(15, 2) NOT NULL,
    l_discount      DECIMAL(15, 2) NOT NULL,
    l_tax           DECIMAL(15, 2) NOT NULL,
    l_returnflag    CHAR(1)        NOT NULL,
    l_linestatus    CHAR(1)        NOT NULL,
    l_shipdate      DATE           NOT NULL,
    l_commitdate    DATE           NOT NULL,
    l_receiptdate   DATE           NOT NULL,
    l_shipinstruct  CHAR(25)       NOT NULL,
    l_shipmode      CHAR(10)       NOT NULL,
    l_comment       VARCHAR(44)    NOT NULL
);
SELECT master_create_distributed_table('lineitem_hash', 'l_orderkey', 'hash');
 master_create_distributed_table 
---------------------------------
 
(1 row)

SELECT master_create_worker_shards('lineitem_hash', 4, 1);
 master_create_worker_shards 
-----------------------------
 
(1 row)

\COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|'
\COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|'
-- aggregate(distinct) on the partition column is allowed
SELECT count(DISTINCT l_orderkey)
    FROM lineitem_hash;
 count 
-------
  2985
(1 row)

SELECT avg(DISTINCT l_orderkey)
    FROM lineitem_hash;
          avg          
-----------------------
 7463.9474036850921273
(1 row)

-- count(distinct) on a non-partition column or on an expression is rejected
SELECT count(DISTINCT l_partkey)
    FROM lineitem_hash;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  table partitioning is unsuitable for aggregate (distinct)
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
SELECT count(DISTINCT (l_orderkey + 1))
    FROM lineitem_hash;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  aggregate (distinct) on complex expressions is unsupported
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
-- agg(distinct) is allowed if we group by partition column. Store the
-- per-order distinct part counts from the hash and the range partitioned
-- tables so the two can be compared.
SELECT l_orderkey, count(DISTINCT l_partkey)
    INTO hash_results
    FROM lineitem_hash
    GROUP BY l_orderkey;
SELECT l_orderkey, count(DISTINCT l_partkey)
    INTO range_results
    FROM lineitem_range
    GROUP BY l_orderkey;
-- They should return the same results. A full outer join is used so that an
-- order key present in only one of the two tables is reported as well: its
-- count on the missing side is NULL, which IS DISTINCT FROM any real count.
-- An inner join would silently skip such keys and still print zero rows.
SELECT h.l_orderkey, h.count, r.l_orderkey, r.count
    FROM hash_results h
    FULL OUTER JOIN range_results r
        ON h.l_orderkey = r.l_orderkey
    WHERE h.count IS DISTINCT FROM r.count;
 l_orderkey | count | l_orderkey | count 
------------+-------+------------+-------
(0 rows)

-- NOTE(review): hash_results and range_results are left in place here, as
-- before; confirm nothing later relies on them before adding DROP TABLEs.
DROP TABLE lineitem_hash;