mirror of https://github.com/citusdata/citus.git
hll aggregates are tested
parent
06217be326
commit
df11dda750
|
@ -0,0 +1,110 @@
|
||||||
|
--
|
||||||
|
-- CUSTOM_AGGREGATE_SUPPORT
|
||||||
|
--
|
||||||
|
-- Create HLL extension if present, print false result otherwise
|
||||||
|
SELECT CASE WHEN COUNT(*) > 0 THEN
|
||||||
|
'CREATE EXTENSION HLL'
|
||||||
|
ELSE 'SELECT false AS hll_present' END
|
||||||
|
AS create_cmd FROM pg_available_extensions()
|
||||||
|
WHERE name = 'hll'
|
||||||
|
\gset
|
||||||
|
:create_cmd;
|
||||||
|
ERROR: extension "hll" already exists
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
:create_cmd;
|
||||||
|
ERROR: extension "hll" already exists
|
||||||
|
\c - - - :worker_2_port
|
||||||
|
:create_cmd;
|
||||||
|
ERROR: extension "hll" already exists
|
||||||
|
\c - - - :master_port
|
||||||
|
SET citus.shard_count TO 4;
|
||||||
|
CREATE TABLE raw_table (day date, user_id int);
|
||||||
|
CREATE TABLE daily_uniques(day date, unique_users hll);
|
||||||
|
SELECT create_distributed_table('raw_table', 'user_id');
|
||||||
|
create_distributed_table
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT create_distributed_table('daily_uniques', 'day');
|
||||||
|
create_distributed_table
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
INSERT INTO raw_table
|
||||||
|
SELECT day, user_id % 19
|
||||||
|
FROM generate_series('2018-05-24'::timestamp, '2018-06-24'::timestamp, '1 day'::interval) as f(day),
|
||||||
|
generate_series(1,100) as g(user_id);
|
||||||
|
INSERT INTO raw_table
|
||||||
|
SELECT day, user_id % 13
|
||||||
|
FROM generate_series('2018-06-10'::timestamp, '2018-07-10'::timestamp, '1 day'::interval) as f(day),
|
||||||
|
generate_series(1,100) as g(user_id);
|
||||||
|
-- Run hll on raw data
|
||||||
|
SELECT hll_cardinality(hll_union_agg(agg))
|
||||||
|
FROM (
|
||||||
|
SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
|
||||||
|
FROM raw_table)a;
|
||||||
|
hll_cardinality
|
||||||
|
-----------------
|
||||||
|
19
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Aggregate the data into daily_uniques
|
||||||
|
INSERT INTO daily_uniques
|
||||||
|
SELECT day, hll_add_agg(hll_hash_integer(user_id))
|
||||||
|
FROM raw_table
|
||||||
|
GROUP BY 1;
|
||||||
|
-- Basic hll_cardinality check on aggregated data
|
||||||
|
SELECT day, hll_cardinality(unique_users)
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-06-20' and day <= '2018-06-30'
|
||||||
|
ORDER BY 2 DESC,1
|
||||||
|
LIMIT 10;
|
||||||
|
day | hll_cardinality
|
||||||
|
------------+-----------------
|
||||||
|
06-20-2018 | 19
|
||||||
|
06-21-2018 | 19
|
||||||
|
06-22-2018 | 19
|
||||||
|
06-23-2018 | 19
|
||||||
|
06-24-2018 | 19
|
||||||
|
06-25-2018 | 13
|
||||||
|
06-26-2018 | 13
|
||||||
|
06-27-2018 | 13
|
||||||
|
06-28-2018 | 13
|
||||||
|
06-29-2018 | 13
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
-- Union aggregated data for one week
|
||||||
|
SELECT hll_cardinality(hll_union_agg(unique_users))
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-05-24'::date AND day <= '2018-05-31'::date;
|
||||||
|
hll_cardinality
|
||||||
|
-----------------
|
||||||
|
19
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT EXTRACT(MONTH FROM day) AS month, hll_cardinality(hll_union_agg(unique_users))
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-06-23' AND day <= '2018-07-01'
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY 1;
|
||||||
|
month | hll_cardinality
|
||||||
|
-------+-----------------
|
||||||
|
6 | 19
|
||||||
|
7 | 13
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- These are going to be supported after window function support
|
||||||
|
SELECT day, hll_cardinality(hll_union_agg(unique_users) OVER seven_days)
|
||||||
|
FROM daily_uniques
|
||||||
|
WINDOW seven_days AS (ORDER BY day ASC ROWS 6 PRECEDING);
|
||||||
|
ERROR: could not run distributed query because the window function that is used cannot be pushed down
|
||||||
|
HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column
|
||||||
|
SELECT day, (hll_cardinality(hll_union_agg(unique_users) OVER two_days)) - hll_cardinality(unique_users) AS lost_uniques
|
||||||
|
FROM daily_uniques
|
||||||
|
WINDOW two_days AS (ORDER BY day ASC ROWS 1 PRECEDING);
|
||||||
|
ERROR: could not run distributed query because the window function that is used cannot be pushed down
|
||||||
|
HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column
|
||||||
|
DROP TABLE raw_table;
|
||||||
|
DROP TABLE daily_uniques;
|
|
@ -0,0 +1,112 @@
|
||||||
|
--
|
||||||
|
-- CUSTOM_AGGREGATE_SUPPORT
|
||||||
|
--
|
||||||
|
-- Create HLL extension if present, print false result otherwise
|
||||||
|
SELECT CASE WHEN COUNT(*) > 0 THEN
|
||||||
|
'CREATE EXTENSION HLL'
|
||||||
|
ELSE 'SELECT false AS hll_present' END
|
||||||
|
AS create_cmd FROM pg_available_extensions()
|
||||||
|
WHERE name = 'hll'
|
||||||
|
\gset
|
||||||
|
:create_cmd;
|
||||||
|
hll_present
|
||||||
|
-------------
|
||||||
|
f
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
:create_cmd;
|
||||||
|
hll_present
|
||||||
|
-------------
|
||||||
|
f
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
\c - - - :worker_2_port
|
||||||
|
:create_cmd;
|
||||||
|
hll_present
|
||||||
|
-------------
|
||||||
|
f
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
\c - - - :master_port
|
||||||
|
SET citus.shard_count TO 4;
|
||||||
|
CREATE TABLE raw_table (day date, user_id int);
|
||||||
|
CREATE TABLE daily_uniques(day date, unique_users hll);
|
||||||
|
ERROR: type "hll" does not exist
|
||||||
|
LINE 1: CREATE TABLE daily_uniques(day date, unique_users hll);
|
||||||
|
^
|
||||||
|
SELECT create_distributed_table('raw_table', 'user_id');
|
||||||
|
create_distributed_table
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT create_distributed_table('daily_uniques', 'day');
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 1: SELECT create_distributed_table('daily_uniques', 'day');
|
||||||
|
^
|
||||||
|
INSERT INTO raw_table
|
||||||
|
SELECT day, user_id % 19
|
||||||
|
FROM generate_series('2018-05-24'::timestamp, '2018-06-24'::timestamp, '1 day'::interval) as f(day),
|
||||||
|
generate_series(1,100) as g(user_id);
|
||||||
|
INSERT INTO raw_table
|
||||||
|
SELECT day, user_id % 13
|
||||||
|
FROM generate_series('2018-06-10'::timestamp, '2018-07-10'::timestamp, '1 day'::interval) as f(day),
|
||||||
|
generate_series(1,100) as g(user_id);
|
||||||
|
-- Run hll on raw data
|
||||||
|
SELECT hll_cardinality(hll_union_agg(agg))
|
||||||
|
FROM (
|
||||||
|
SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
|
||||||
|
FROM raw_table)a;
|
||||||
|
ERROR: function hll_hash_integer(integer) does not exist
|
||||||
|
LINE 3: SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
|
||||||
|
^
|
||||||
|
HINT: No function matches the given name and argument types. You might need to add explicit type casts.
|
||||||
|
-- Aggregate the data into daily_uniques
|
||||||
|
INSERT INTO daily_uniques
|
||||||
|
SELECT day, hll_add_agg(hll_hash_integer(user_id))
|
||||||
|
FROM raw_table
|
||||||
|
GROUP BY 1;
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 1: INSERT INTO daily_uniques
|
||||||
|
^
|
||||||
|
-- Basic hll_cardinality check on aggregated data
|
||||||
|
SELECT day, hll_cardinality(unique_users)
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-06-20' and day <= '2018-06-30'
|
||||||
|
ORDER BY 2 DESC,1
|
||||||
|
LIMIT 10;
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 2: FROM daily_uniques
|
||||||
|
^
|
||||||
|
-- Union aggregated data for one week
|
||||||
|
SELECT hll_cardinality(hll_union_agg(unique_users))
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-05-24'::date AND day <= '2018-05-31'::date;
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 2: FROM daily_uniques
|
||||||
|
^
|
||||||
|
SELECT EXTRACT(MONTH FROM day) AS month, hll_cardinality(hll_union_agg(unique_users))
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-06-23' AND day <= '2018-07-01'
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY 1;
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 2: FROM daily_uniques
|
||||||
|
^
|
||||||
|
-- These are going to be supported after window function support
|
||||||
|
SELECT day, hll_cardinality(hll_union_agg(unique_users) OVER seven_days)
|
||||||
|
FROM daily_uniques
|
||||||
|
WINDOW seven_days AS (ORDER BY day ASC ROWS 6 PRECEDING);
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 2: FROM daily_uniques
|
||||||
|
^
|
||||||
|
SELECT day, (hll_cardinality(hll_union_agg(unique_users) OVER two_days)) - hll_cardinality(unique_users) AS lost_uniques
|
||||||
|
FROM daily_uniques
|
||||||
|
WINDOW two_days AS (ORDER BY day ASC ROWS 1 PRECEDING);
|
||||||
|
ERROR: relation "daily_uniques" does not exist
|
||||||
|
LINE 2: FROM daily_uniques
|
||||||
|
^
|
||||||
|
DROP TABLE raw_table;
|
||||||
|
DROP TABLE daily_uniques;
|
||||||
|
ERROR: table "daily_uniques" does not exist
|
|
@ -58,7 +58,7 @@ test: multi_subquery_complex_reference_clause multi_subquery_window_functions mu
|
||||||
test: multi_subquery_in_where_reference_clause
|
test: multi_subquery_in_where_reference_clause
|
||||||
test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc
|
test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc
|
||||||
test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql
|
test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql
|
||||||
test: multi_reference_table multi_select_for_update relation_access_tracking
|
test: multi_reference_table multi_select_for_update relation_access_tracking custom_aggregate_support
|
||||||
test: multi_average_expression multi_working_columns multi_having_pushdown
|
test: multi_average_expression multi_working_columns multi_having_pushdown
|
||||||
test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown
|
test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown
|
||||||
test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg
|
test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
--
|
||||||
|
-- CUSTOM_AGGREGATE_SUPPORT
|
||||||
|
--
|
||||||
|
-- Create HLL extension if present, print false result otherwise
|
||||||
|
SELECT CASE WHEN COUNT(*) > 0 THEN
|
||||||
|
'CREATE EXTENSION HLL'
|
||||||
|
ELSE 'SELECT false AS hll_present' END
|
||||||
|
AS create_cmd FROM pg_available_extensions()
|
||||||
|
WHERE name = 'hll'
|
||||||
|
\gset
|
||||||
|
|
||||||
|
:create_cmd;
|
||||||
|
|
||||||
|
\c - - - :worker_1_port
|
||||||
|
:create_cmd;
|
||||||
|
|
||||||
|
\c - - - :worker_2_port
|
||||||
|
:create_cmd;
|
||||||
|
|
||||||
|
\c - - - :master_port
|
||||||
|
|
||||||
|
SET citus.shard_count TO 4;
|
||||||
|
|
||||||
|
CREATE TABLE raw_table (day date, user_id int);
|
||||||
|
CREATE TABLE daily_uniques(day date, unique_users hll);
|
||||||
|
|
||||||
|
SELECT create_distributed_table('raw_table', 'user_id');
|
||||||
|
SELECT create_distributed_table('daily_uniques', 'day');
|
||||||
|
|
||||||
|
INSERT INTO raw_table
|
||||||
|
SELECT day, user_id % 19
|
||||||
|
FROM generate_series('2018-05-24'::timestamp, '2018-06-24'::timestamp, '1 day'::interval) as f(day),
|
||||||
|
generate_series(1,100) as g(user_id);
|
||||||
|
INSERT INTO raw_table
|
||||||
|
SELECT day, user_id % 13
|
||||||
|
FROM generate_series('2018-06-10'::timestamp, '2018-07-10'::timestamp, '1 day'::interval) as f(day),
|
||||||
|
generate_series(1,100) as g(user_id);
|
||||||
|
|
||||||
|
-- Run hll on raw data
|
||||||
|
SELECT hll_cardinality(hll_union_agg(agg))
|
||||||
|
FROM (
|
||||||
|
SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
|
||||||
|
FROM raw_table)a;
|
||||||
|
|
||||||
|
-- Aggregate the data into daily_uniques
|
||||||
|
INSERT INTO daily_uniques
|
||||||
|
SELECT day, hll_add_agg(hll_hash_integer(user_id))
|
||||||
|
FROM raw_table
|
||||||
|
GROUP BY 1;
|
||||||
|
|
||||||
|
-- Basic hll_cardinality check on aggregated data
|
||||||
|
SELECT day, hll_cardinality(unique_users)
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-06-20' and day <= '2018-06-30'
|
||||||
|
ORDER BY 2 DESC,1
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
-- Union aggregated data for one week
|
||||||
|
SELECT hll_cardinality(hll_union_agg(unique_users))
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-05-24'::date AND day <= '2018-05-31'::date;
|
||||||
|
|
||||||
|
|
||||||
|
SELECT EXTRACT(MONTH FROM day) AS month, hll_cardinality(hll_union_agg(unique_users))
|
||||||
|
FROM daily_uniques
|
||||||
|
WHERE day >= '2018-06-23' AND day <= '2018-07-01'
|
||||||
|
GROUP BY 1
|
||||||
|
ORDER BY 1;
|
||||||
|
|
||||||
|
-- These are going to be supported after window function support
|
||||||
|
SELECT day, hll_cardinality(hll_union_agg(unique_users) OVER seven_days)
|
||||||
|
FROM daily_uniques
|
||||||
|
WINDOW seven_days AS (ORDER BY day ASC ROWS 6 PRECEDING);
|
||||||
|
|
||||||
|
SELECT day, (hll_cardinality(hll_union_agg(unique_users) OVER two_days)) - hll_cardinality(unique_users) AS lost_uniques
|
||||||
|
FROM daily_uniques
|
||||||
|
WINDOW two_days AS (ORDER BY day ASC ROWS 1 PRECEDING);
|
||||||
|
|
||||||
|
DROP TABLE raw_table;
|
||||||
|
DROP TABLE daily_uniques;
|
Loading…
Reference in New Issue