mirror of https://github.com/citusdata/citus.git
439 lines
9.9 KiB
Plaintext
439 lines
9.9 KiB
Plaintext
--
|
|
-- COMPLEX_COUNT_DISTINCT
|
|
--
|
|
-- Test table for the count(DISTINCT ...) queries in this file.
-- The primary key starts with l_orderkey, which is also the column the
-- table is hash-distributed on by the master_create_distributed_table call
-- that follows.
CREATE TABLE lineitem_hash (
|
|
l_orderkey bigint not null,
|
|
l_partkey integer not null,
|
|
l_suppkey integer not null,
|
|
l_linenumber integer not null,
|
|
l_quantity decimal(15, 2) not null,
|
|
l_extendedprice decimal(15, 2) not null,
|
|
l_discount decimal(15, 2) not null,
|
|
l_tax decimal(15, 2) not null,
|
|
l_returnflag char(1) not null,
|
|
l_linestatus char(1) not null,
|
|
l_shipdate date not null,
|
|
l_commitdate date not null,
|
|
l_receiptdate date not null,
|
|
l_shipinstruct char(25) not null,
|
|
l_shipmode char(10) not null,
|
|
l_comment varchar(44) not null,
|
|
PRIMARY KEY(l_orderkey, l_linenumber) );
|
|
|
|
-- hash-distribute lineitem_hash on l_orderkey; this is the partition key
-- that the comments and queries in this file refer to
SELECT master_create_distributed_table('lineitem_hash', 'l_orderkey', 'hash');
|
|
master_create_distributed_table
|
|
---------------------------------
|
|
|
|
(1 row)
|
|
|
|
-- create the worker shards for lineitem_hash
-- NOTE(review): 8 and 1 are presumably the shard count and the replication
-- factor -- confirm against the Citus documentation
SELECT master_create_worker_shards('lineitem_hash', 8, 1);
|
|
master_create_worker_shards
|
|
-----------------------------
|
|
|
|
(1 row)
|
|
|
|
-- load the two lineitem data files; fields are separated by '|'
\COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|'
|
|
\COPY lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|'
|
|
-- select the task-tracker executor for the statements that follow
SET citus.task_executor_type to "task-tracker";
|
|
-- count(distinct) is supported in a top-level query when the query
|
|
-- groups by the partition key (l_orderkey)
|
|
SELECT
|
|
l_orderkey, count(DISTINCT l_partkey)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
l_orderkey | count
|
|
------------+-------
|
|
14885 | 7
|
|
14884 | 7
|
|
14821 | 7
|
|
14790 | 7
|
|
14785 | 7
|
|
14755 | 7
|
|
14725 | 7
|
|
14694 | 7
|
|
14627 | 7
|
|
14624 | 7
|
|
(10 rows)
|
|
|
|
-- at the top level it is rejected when there is no GROUP BY at all (this
-- query) or when the grouping is on a non-partition column (the query after it)
|
|
SELECT
|
|
count(DISTINCT l_partkey)
|
|
FROM lineitem_hash
|
|
ORDER BY 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute aggregate (distinct)
|
|
DETAIL: table partitioning is unsuitable for aggregate (distinct)
|
|
HINT: You can load the hll extension from contrib packages and enable distinct approximations.
|
|
-- grouping on a non-partition column (l_shipmode) is rejected at the top level
SELECT
|
|
l_shipmode, count(DISTINCT l_partkey)
|
|
FROM lineitem_hash
|
|
GROUP BY l_shipmode
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute aggregate (distinct)
|
|
DETAIL: table partitioning is unsuitable for aggregate (distinct)
|
|
HINT: You can load the hll extension from contrib packages and enable distinct approximations.
|
|
-- count(distinct) is supported in single-table subqueries; here the
-- subquery groups by the partition key (l_orderkey)
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT l_partkey)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
l_orderkey | count
|
|
------------+-------
|
|
14885 | 7
|
|
14884 | 7
|
|
14821 | 7
|
|
14790 | 7
|
|
14785 | 7
|
|
14755 | 7
|
|
14725 | 7
|
|
14694 | 7
|
|
14627 | 7
|
|
14624 | 7
|
|
(10 rows)
|
|
|
|
-- inside a subquery, grouping on a non-partition column (l_partkey) while
-- counting distinct partition-key values is accepted as well
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_partkey, count(DISTINCT l_orderkey)
|
|
FROM lineitem_hash
|
|
GROUP BY l_partkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
l_partkey | count
|
|
-----------+-------
|
|
199146 | 3
|
|
188804 | 3
|
|
177771 | 3
|
|
160895 | 3
|
|
149926 | 3
|
|
136884 | 3
|
|
87761 | 3
|
|
15283 | 3
|
|
6983 | 3
|
|
1927 | 3
|
|
(10 rows)
|
|
|
|
-- a CASE expression inside count(distinct) is supported.
|
|
-- per order, count the distinct l_partkey values of the rows whose
-- l_shipmode is 'AIR'; orders with no such row are filtered out by count > 0
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT CASE WHEN l_shipmode = 'AIR' THEN l_partkey ELSE NULL END) as count
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
WHERE count > 0
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
l_orderkey | count
|
|
------------+-------
|
|
12005 | 4
|
|
5409 | 4
|
|
4964 | 4
|
|
14848 | 3
|
|
14496 | 3
|
|
13473 | 3
|
|
13122 | 3
|
|
12929 | 3
|
|
12645 | 3
|
|
12417 | 3
|
|
(10 rows)
|
|
|
|
-- the text LIKE operator is also supported inside the CASE condition
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT CASE WHEN l_shipmode like '%A%' THEN l_partkey ELSE NULL END) as count
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
WHERE count > 0
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
l_orderkey | count
|
|
------------+-------
|
|
14275 | 7
|
|
14181 | 7
|
|
13605 | 7
|
|
12707 | 7
|
|
12384 | 7
|
|
11746 | 7
|
|
10727 | 7
|
|
10467 | 7
|
|
5636 | 7
|
|
4614 | 7
|
|
(10 rows)
|
|
|
|
-- count distinct is rejected if its argument does not reference any columns
-- (here the argument is the constant 1)
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT 1)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute aggregate (distinct)
|
|
DETAIL: aggregate (distinct) with no columns is unsupported
|
|
HINT: You can load the hll extension from contrib packages and enable distinct approximations.
|
|
-- it is likewise rejected when the argument is a non-constant function call
-- (random()) that still references no columns
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT (random() * 5)::int)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute aggregate (distinct)
|
|
DETAIL: aggregate (distinct) with no columns is unsupported
|
|
HINT: You can load the hll extension from contrib packages and enable distinct approximations.
|
|
-- even non-const function calls are supported within count distinct when the
-- expression also references a column (l_linenumber here).
-- LIMIT 0 returns no rows, so the recorded output does not depend on random()
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT (random() * 5)::int = l_linenumber)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 0;
|
|
l_orderkey | count
|
|
------------+-------
|
|
(0 rows)
|
|
|
|
-- multiple levels of nested subqueries: count(DISTINCT ...) appears both in
-- the innermost subquery (grouped by l_suppkey) and in the middle one
-- (grouped by number_sum); the outer query averages the results per total
|
|
SELECT
|
|
total,
|
|
avg(avg_count) as total_avg_count
|
|
FROM (
|
|
SELECT
|
|
number_sum,
|
|
count(DISTINCT l_suppkey) as total,
|
|
avg(total_count) avg_count
|
|
FROM (
|
|
SELECT
|
|
l_suppkey,
|
|
sum(l_linenumber) as number_sum,
|
|
count(DISTINCT l_shipmode) as total_count
|
|
FROM
|
|
lineitem_hash
|
|
WHERE
|
|
l_partkey > 100 and
|
|
l_quantity > 2 and
|
|
l_orderkey < 10000
|
|
GROUP BY
|
|
l_suppkey) as distributed_table
|
|
WHERE
|
|
number_sum >= 10
|
|
GROUP BY
|
|
number_sum) as distributed_table_2
|
|
GROUP BY
|
|
total
|
|
ORDER BY
|
|
total_avg_count DESC;
|
|
total | total_avg_count
|
|
-------+--------------------
|
|
1 | 3.6000000000000000
|
|
6 | 2.8333333333333333
|
|
10 | 2.6000000000000000
|
|
27 | 2.5555555555555556
|
|
32 | 2.4687500000000000
|
|
77 | 2.1948051948051948
|
|
57 | 2.1754385964912281
|
|
(7 rows)
|
|
|
|
-- CASE expression with multiple WHEN branches inside count(DISTINCT ...);
-- each branch selects a different column, grouped by l_shipdate
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
count(DISTINCT
|
|
CASE
|
|
WHEN l_shipmode = 'TRUCK' THEN l_partkey
|
|
WHEN l_shipmode = 'AIR' THEN l_quantity
|
|
WHEN l_shipmode = 'SHIP' THEN l_discount
|
|
ELSE l_suppkey
|
|
END) as count,
|
|
l_shipdate
|
|
FROM
|
|
lineitem_hash
|
|
GROUP BY
|
|
l_shipdate) sub
|
|
WHERE
|
|
count > 0
|
|
ORDER BY
|
|
1 DESC, 2 DESC
|
|
LIMIT 10;
|
|
count | l_shipdate
|
|
-------+------------
|
|
14 | 07-30-1997
|
|
13 | 05-26-1998
|
|
13 | 08-08-1997
|
|
13 | 11-17-1995
|
|
13 | 01-09-1993
|
|
12 | 01-15-1998
|
|
12 | 10-15-1997
|
|
12 | 09-07-1997
|
|
12 | 06-02-1997
|
|
12 | 03-14-1997
|
|
(10 rows)
|
|
|
|
-- count(DISTINCT ...) over an arithmetic expression on the partition key
-- (l_orderkey), grouped by the non-partition column l_quantity
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_quantity, count(DISTINCT ((l_orderkey / 1000) * 1000 )) as count
|
|
FROM
|
|
lineitem_hash
|
|
GROUP BY
|
|
l_quantity) sub
|
|
WHERE
|
|
count > 0
|
|
ORDER BY
|
|
2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
l_quantity | count
|
|
------------+-------
|
|
48.00 | 13
|
|
47.00 | 13
|
|
37.00 | 13
|
|
33.00 | 13
|
|
26.00 | 13
|
|
25.00 | 13
|
|
23.00 | 13
|
|
21.00 | 13
|
|
15.00 | 13
|
|
12.00 | 13
|
|
(10 rows)
|
|
|
|
-- count DISTINCT is part of an expression which includes another aggregate:
-- a sum(...) is divided by the count(DISTINCT ...) and aliased avg
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
sum(((l_partkey * l_tax) / 100)) /
|
|
count(DISTINCT
|
|
CASE
|
|
WHEN l_shipmode = 'TRUCK' THEN l_partkey
|
|
ELSE l_suppkey
|
|
END) as avg,
|
|
l_shipmode
|
|
FROM
|
|
lineitem_hash
|
|
GROUP BY
|
|
l_shipmode) sub
|
|
ORDER BY
|
|
1 DESC, 2 DESC
|
|
LIMIT 10;
|
|
avg | l_shipmode
|
|
-------------------------+------------
|
|
44.82904609027336300064 | MAIL
|
|
44.80704536679536679537 | SHIP
|
|
44.68891732736572890026 | AIR
|
|
44.34106724470134874759 | REG AIR
|
|
43.12739987269255251432 | FOB
|
|
43.07299253636938646426 | RAIL
|
|
40.50298377916903813318 | TRUCK
|
|
(7 rows)
|
|
|
|
-- count DISTINCT over a CASE WHEN expression whose branches yield a column,
-- a column expression, and a constant
-- NOTE(review): the result column is aliased avg although it holds a
-- count(DISTINCT ...); renaming it would also change the recorded header
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
count(DISTINCT
|
|
CASE
|
|
WHEN l_shipmode = 'TRUCK' THEN l_linenumber
|
|
WHEN l_shipmode = 'AIR' THEN l_linenumber + 10
|
|
ELSE 2
|
|
END) as avg
|
|
FROM
|
|
lineitem_hash
|
|
GROUP BY l_shipdate) sub
|
|
ORDER BY 1 DESC
|
|
LIMIT 10;
|
|
avg
|
|
-----
|
|
7
|
|
6
|
|
6
|
|
6
|
|
6
|
|
6
|
|
6
|
|
6
|
|
5
|
|
5
|
|
(10 rows)
|
|
|
|
-- COUNT DISTINCT (c1, c2): the argument is a row value built from two
-- columns (l_shipdate, l_tax)
|
|
SELECT *
|
|
FROM
|
|
(SELECT
|
|
l_shipmode,
|
|
count(DISTINCT (l_shipdate, l_tax))
|
|
FROM
|
|
lineitem_hash
|
|
GROUP BY
|
|
l_shipmode) t
|
|
ORDER BY
|
|
2 DESC,1 DESC
|
|
LIMIT 10;
|
|
l_shipmode | count
|
|
------------+-------
|
|
TRUCK | 1689
|
|
MAIL | 1683
|
|
FOB | 1655
|
|
AIR | 1650
|
|
SHIP | 1644
|
|
RAIL | 1636
|
|
REG AIR | 1607
|
|
(7 rows)
|
|
|
|
-- other distinct aggregates (sum here, avg in the query after it) are not
-- supported in subqueries
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, sum(DISTINCT l_partkey)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute aggregate (distinct)
|
|
DETAIL: Only count(distinct) aggregate is supported in subqueries
|
|
-- avg(DISTINCT ...) in a subquery is rejected as well
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, avg(DISTINCT l_partkey)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute aggregate (distinct)
|
|
DETAIL: Only count(distinct) aggregate is supported in subqueries
|
|
-- whole-row references, oid, and ctid are not supported in count distinct;
-- this query passes the table name itself as the whole-row reference
|
|
-- test table does not have oid or ctid enabled, so tests for them are skipped
|
|
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT lineitem_hash)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute count (distinct)
|
|
DETAIL: Non-column references are not supported yet
|
|
-- the explicit lineitem_hash.* form of a whole-row reference is rejected too
SELECT *
|
|
FROM (
|
|
SELECT
|
|
l_orderkey, count(DISTINCT lineitem_hash.*)
|
|
FROM lineitem_hash
|
|
GROUP BY l_orderkey) sub
|
|
ORDER BY 2 DESC, 1 DESC
|
|
LIMIT 10;
|
|
ERROR: cannot compute count (distinct)
|
|
DETAIL: Non-column references are not supported yet
|
|
-- clean up: remove the test table created at the top of this file
DROP TABLE lineitem_hash;
|