--
-- MULTI_AGG_DISTINCT
--
-- Restart the shard id and job id sequences at 200000 before any table in
-- this test is created.
ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART WITH 200000;
ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART WITH 200000;
-- Build lineitem_range, a lineitem table that is range partitioned on
-- l_orderkey, and stage both lineitem data files into it.
CREATE TABLE lineitem_range (
    l_orderkey      bigint         NOT NULL,
    l_partkey       integer        NOT NULL,
    l_suppkey       integer        NOT NULL,
    l_linenumber    integer        NOT NULL,
    l_quantity      decimal(15, 2) NOT NULL,
    l_extendedprice decimal(15, 2) NOT NULL,
    l_discount      decimal(15, 2) NOT NULL,
    l_tax           decimal(15, 2) NOT NULL,
    l_returnflag    char(1)        NOT NULL,
    l_linestatus    char(1)        NOT NULL,
    l_shipdate      date           NOT NULL,
    l_commitdate    date           NOT NULL,
    l_receiptdate   date           NOT NULL,
    l_shipinstruct  char(25)       NOT NULL,
    l_shipmode      char(10)       NOT NULL,
    l_comment       varchar(44)    NOT NULL
);
SELECT master_create_distributed_table('lineitem_range', 'l_orderkey', 'range');
 master_create_distributed_table 
---------------------------------
 
(1 row)

-- Stage with citus.shard_max_size set to 500MB; the setting is reset right
-- after both files are loaded.
SET citus.shard_max_size = '500MB';
\STAGE lineitem_range FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|'
\STAGE lineitem_range FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|'
RESET citus.shard_max_size;
-- aggregate(distinct) on the partition column (l_orderkey) of the range
-- partitioned table: both count and avg are expected to succeed.
SELECT count(DISTINCT l_orderkey)
FROM lineitem_range;
 count 
-------
  2985
(1 row)

SELECT avg(DISTINCT l_orderkey)
FROM lineitem_range;
          avg          
-----------------------
 7463.9474036850921273
(1 row)

-- count(distinct) over a join of the range partitioned table with the single
-- sharded part table. citus.large_table_shard_count is set to 2 for this
-- query to ensure that none of the tables is repartitioned while it runs; the
-- setting is reset afterwards.
SET citus.large_table_shard_count = 2;
SELECT
    p_partkey,
    count(DISTINCT l_orderkey)
FROM lineitem_range, part
WHERE l_partkey = p_partkey
GROUP BY p_partkey
ORDER BY p_partkey
LIMIT 10;
 p_partkey | count 
-----------+-------
        18 |     1
        79 |     1
        91 |     1
       149 |     2
       175 |     1
       179 |     1
       182 |     1
       195 |     1
       204 |     1
       222 |     1
(10 rows)

RESET citus.large_table_shard_count;
-- count(distinct) is rejected on the range partitioned table when its
-- argument is a non-partition column or a complex expression; each query
-- below is expected to fail with the error shown.
SELECT count(DISTINCT l_partkey)
FROM lineitem_range;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  table partitioning is unsuitable for aggregate (distinct)
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
SELECT count(DISTINCT (l_orderkey + 1))
FROM lineitem_range;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  aggregate (distinct) on complex expressions is unsupported
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
-- Append partitioned tables. Start with count(distinct) on part, which has a
-- single shard.
SELECT count(DISTINCT p_mfgr)
FROM part;
 count 
-------
     5
(1 row)

SELECT
    p_mfgr,
    count(DISTINCT p_partkey)
FROM part
GROUP BY p_mfgr
ORDER BY p_mfgr;
          p_mfgr           | count 
---------------------------+-------
 Manufacturer#1            |   193
 Manufacturer#2            |   190
 Manufacturer#3            |   228
 Manufacturer#4            |   204
 Manufacturer#5            |   185
(5 rows)

-- count(distinct) is not supported when the append partitioned table has
-- multiple shards, so this query is expected to fail.
SELECT count(DISTINCT o_orderkey)
FROM orders;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  table partitioning is unsuitable for aggregate (distinct)
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
-- Hash partitioned tables: build lineitem_hash with the same columns as
-- lineitem_range, hash partition it on l_orderkey, create its worker shards
-- and copy both lineitem data files into it.
CREATE TABLE lineitem_hash (
    l_orderkey      bigint         NOT NULL,
    l_partkey       integer        NOT NULL,
    l_suppkey       integer        NOT NULL,
    l_linenumber    integer        NOT NULL,
    l_quantity      decimal(15, 2) NOT NULL,
    l_extendedprice decimal(15, 2) NOT NULL,
    l_discount      decimal(15, 2) NOT NULL,
    l_tax           decimal(15, 2) NOT NULL,
    l_returnflag    char(1)        NOT NULL,
    l_linestatus    char(1)        NOT NULL,
    l_shipdate      date           NOT NULL,
    l_commitdate    date           NOT NULL,
    l_receiptdate   date           NOT NULL,
    l_shipinstruct  char(25)       NOT NULL,
    l_shipmode      char(10)       NOT NULL,
    l_comment       varchar(44)    NOT NULL
);
SELECT master_create_distributed_table('lineitem_hash', 'l_orderkey', 'hash');
 master_create_distributed_table 
---------------------------------
 
(1 row)

SELECT master_create_worker_shards('lineitem_hash', 4, 1);
 master_create_worker_shards 
-----------------------------
 
(1 row)

\COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|'
\COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|'
-- aggregate(distinct) on the partition column is allowed for the hash
-- partitioned table; the results match those of lineitem_range.
SELECT count(DISTINCT l_orderkey)
FROM lineitem_hash;
 count 
-------
  2985
(1 row)

SELECT avg(DISTINCT l_orderkey)
FROM lineitem_hash;
          avg          
-----------------------
 7463.9474036850921273
(1 row)

-- count(distinct) on a non-partition column or on an expression is not
-- allowed; both queries below are expected to fail.
SELECT count(DISTINCT l_partkey)
FROM lineitem_hash;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  table partitioning is unsuitable for aggregate (distinct)
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
SELECT count(DISTINCT (l_orderkey + 1))
FROM lineitem_hash;
ERROR:  cannot compute aggregate (distinct)
DETAIL:  aggregate (distinct) on complex expressions is unsupported
HINT:  You can load the hll extension from contrib packages and enable distinct approximations.
-- agg(distinct) is allowed if we group by partition column. Store the
-- per-l_orderkey counts from the hash and the range partitioned table in
-- hash_results and range_results so they can be compared.
SELECT l_orderkey, count(distinct l_partkey) INTO hash_results FROM lineitem_hash GROUP BY l_orderkey;
SELECT l_orderkey, count(distinct l_partkey) INTO range_results FROM lineitem_range GROUP BY l_orderkey;
-- They should return the same results, so the query below must return no rows.
-- A full outer join is used so that an l_orderkey present in only one of the
-- two result tables is reported as well: its count on the missing side is
-- NULL, and IS DISTINCT FROM treats NULL versus a value as a mismatch. An
-- inner join with != would silently skip such rows.
SELECT
    h.l_orderkey,
    h.count,
    r.l_orderkey,
    r.count
FROM hash_results AS h
FULL OUTER JOIN range_results AS r
    ON h.l_orderkey = r.l_orderkey
WHERE h.count IS DISTINCT FROM r.count;
 l_orderkey | count | l_orderkey | count 
------------+-------+------------+-------
(0 rows)

DROP TABLE lineitem_hash;