hll aggregates are tested

pull/2186/head
mehmet furkan şahin 2018-05-24 14:02:21 +03:00
parent 06217be326
commit df11dda750
4 changed files with 303 additions and 1 deletions

View File

@ -0,0 +1,110 @@
--
-- CUSTOM_AGGREGATE_SUPPORT
--
-- Create HLL extension if present, print false result otherwise
SELECT CASE WHEN COUNT(*) > 0 THEN
'CREATE EXTENSION HLL'
ELSE 'SELECT false AS hll_present' END
AS create_cmd FROM pg_available_extensions()
WHERE name = 'hll'
\gset
:create_cmd;
ERROR: extension "hll" already exists
\c - - - :worker_1_port
:create_cmd;
ERROR: extension "hll" already exists
\c - - - :worker_2_port
:create_cmd;
ERROR: extension "hll" already exists
\c - - - :master_port
SET citus.shard_count TO 4;
CREATE TABLE raw_table (day date, user_id int);
CREATE TABLE daily_uniques(day date, unique_users hll);
SELECT create_distributed_table('raw_table', 'user_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('daily_uniques', 'day');
create_distributed_table
--------------------------
(1 row)
INSERT INTO raw_table
SELECT day, user_id % 19
FROM generate_series('2018-05-24'::timestamp, '2018-06-24'::timestamp, '1 day'::interval) as f(day),
generate_series(1,100) as g(user_id);
INSERT INTO raw_table
SELECT day, user_id % 13
FROM generate_series('2018-06-10'::timestamp, '2018-07-10'::timestamp, '1 day'::interval) as f(day),
generate_series(1,100) as g(user_id);
-- Run hll on raw data
SELECT hll_cardinality(hll_union_agg(agg))
FROM (
SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
FROM raw_table)a;
hll_cardinality
-----------------
19
(1 row)
-- Aggregate the data into daily_uniques
INSERT INTO daily_uniques
SELECT day, hll_add_agg(hll_hash_integer(user_id))
FROM raw_table
GROUP BY 1;
-- Basic hll_cardinality check on aggregated data
SELECT day, hll_cardinality(unique_users)
FROM daily_uniques
WHERE day >= '2018-06-20' and day <= '2018-06-30'
ORDER BY 2 DESC,1
LIMIT 10;
day | hll_cardinality
------------+-----------------
06-20-2018 | 19
06-21-2018 | 19
06-22-2018 | 19
06-23-2018 | 19
06-24-2018 | 19
06-25-2018 | 13
06-26-2018 | 13
06-27-2018 | 13
06-28-2018 | 13
06-29-2018 | 13
(10 rows)
-- Union aggregated data for one week
SELECT hll_cardinality(hll_union_agg(unique_users))
FROM daily_uniques
WHERE day >= '2018-05-24'::date AND day <= '2018-05-31'::date;
hll_cardinality
-----------------
19
(1 row)
SELECT EXTRACT(MONTH FROM day) AS month, hll_cardinality(hll_union_agg(unique_users))
FROM daily_uniques
WHERE day >= '2018-06-23' AND day <= '2018-07-01'
GROUP BY 1
ORDER BY 1;
month | hll_cardinality
-------+-----------------
6 | 19
7 | 13
(2 rows)
-- These are going to be supported after window function support
SELECT day, hll_cardinality(hll_union_agg(unique_users) OVER seven_days)
FROM daily_uniques
WINDOW seven_days AS (ORDER BY day ASC ROWS 6 PRECEDING);
ERROR: could not run distributed query because the window function that is used cannot be pushed down
HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column
SELECT day, (hll_cardinality(hll_union_agg(unique_users) OVER two_days)) - hll_cardinality(unique_users) AS lost_uniques
FROM daily_uniques
WINDOW two_days AS (ORDER BY day ASC ROWS 1 PRECEDING);
ERROR: could not run distributed query because the window function that is used cannot be pushed down
HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column
DROP TABLE raw_table;
DROP TABLE daily_uniques;

View File

@ -0,0 +1,112 @@
--
-- CUSTOM_AGGREGATE_SUPPORT
--
-- Create HLL extension if present, print false result otherwise
SELECT CASE WHEN COUNT(*) > 0 THEN
'CREATE EXTENSION HLL'
ELSE 'SELECT false AS hll_present' END
AS create_cmd FROM pg_available_extensions()
WHERE name = 'hll'
\gset
:create_cmd;
hll_present
-------------
f
(1 row)
\c - - - :worker_1_port
:create_cmd;
hll_present
-------------
f
(1 row)
\c - - - :worker_2_port
:create_cmd;
hll_present
-------------
f
(1 row)
\c - - - :master_port
SET citus.shard_count TO 4;
CREATE TABLE raw_table (day date, user_id int);
CREATE TABLE daily_uniques(day date, unique_users hll);
ERROR: type "hll" does not exist
LINE 1: CREATE TABLE daily_uniques(day date, unique_users hll);
^
SELECT create_distributed_table('raw_table', 'user_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('daily_uniques', 'day');
ERROR: relation "daily_uniques" does not exist
LINE 1: SELECT create_distributed_table('daily_uniques', 'day');
^
INSERT INTO raw_table
SELECT day, user_id % 19
FROM generate_series('2018-05-24'::timestamp, '2018-06-24'::timestamp, '1 day'::interval) as f(day),
generate_series(1,100) as g(user_id);
INSERT INTO raw_table
SELECT day, user_id % 13
FROM generate_series('2018-06-10'::timestamp, '2018-07-10'::timestamp, '1 day'::interval) as f(day),
generate_series(1,100) as g(user_id);
-- Run hll on raw data
SELECT hll_cardinality(hll_union_agg(agg))
FROM (
SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
FROM raw_table)a;
ERROR: function hll_hash_integer(integer) does not exist
LINE 3: SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
^
HINT: No function matches the given name and argument types. You might need to add explicit type casts.
-- Aggregate the data into daily_uniques
INSERT INTO daily_uniques
SELECT day, hll_add_agg(hll_hash_integer(user_id))
FROM raw_table
GROUP BY 1;
ERROR: relation "daily_uniques" does not exist
LINE 1: INSERT INTO daily_uniques
^
-- Basic hll_cardinality check on aggregated data
SELECT day, hll_cardinality(unique_users)
FROM daily_uniques
WHERE day >= '2018-06-20' and day <= '2018-06-30'
ORDER BY 2 DESC,1
LIMIT 10;
ERROR: relation "daily_uniques" does not exist
LINE 2: FROM daily_uniques
^
-- Union aggregated data for one week
SELECT hll_cardinality(hll_union_agg(unique_users))
FROM daily_uniques
WHERE day >= '2018-05-24'::date AND day <= '2018-05-31'::date;
ERROR: relation "daily_uniques" does not exist
LINE 2: FROM daily_uniques
^
SELECT EXTRACT(MONTH FROM day) AS month, hll_cardinality(hll_union_agg(unique_users))
FROM daily_uniques
WHERE day >= '2018-06-23' AND day <= '2018-07-01'
GROUP BY 1
ORDER BY 1;
ERROR: relation "daily_uniques" does not exist
LINE 2: FROM daily_uniques
^
-- These are going to be supported after window function support
SELECT day, hll_cardinality(hll_union_agg(unique_users) OVER seven_days)
FROM daily_uniques
WINDOW seven_days AS (ORDER BY day ASC ROWS 6 PRECEDING);
ERROR: relation "daily_uniques" does not exist
LINE 2: FROM daily_uniques
^
SELECT day, (hll_cardinality(hll_union_agg(unique_users) OVER two_days)) - hll_cardinality(unique_users) AS lost_uniques
FROM daily_uniques
WINDOW two_days AS (ORDER BY day ASC ROWS 1 PRECEDING);
ERROR: relation "daily_uniques" does not exist
LINE 2: FROM daily_uniques
^
DROP TABLE raw_table;
DROP TABLE daily_uniques;
ERROR: table "daily_uniques" does not exist

View File

@ -58,7 +58,7 @@ test: multi_subquery_complex_reference_clause multi_subquery_window_functions mu
test: multi_subquery_in_where_reference_clause test: multi_subquery_in_where_reference_clause
test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc
test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql
test: multi_reference_table multi_select_for_update relation_access_tracking test: multi_reference_table multi_select_for_update relation_access_tracking custom_aggregate_support
test: multi_average_expression multi_working_columns multi_having_pushdown test: multi_average_expression multi_working_columns multi_having_pushdown
test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown
test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg

View File

@ -0,0 +1,80 @@
--
-- CUSTOM_AGGREGATE_SUPPORT
--
-- Create HLL extension if present, print false result otherwise
SELECT CASE WHEN COUNT(*) > 0 THEN
'CREATE EXTENSION HLL'
ELSE 'SELECT false AS hll_present' END
AS create_cmd FROM pg_available_extensions()
WHERE name = 'hll'
\gset
:create_cmd;
\c - - - :worker_1_port
:create_cmd;
\c - - - :worker_2_port
:create_cmd;
\c - - - :master_port
SET citus.shard_count TO 4;
CREATE TABLE raw_table (day date, user_id int);
CREATE TABLE daily_uniques(day date, unique_users hll);
SELECT create_distributed_table('raw_table', 'user_id');
SELECT create_distributed_table('daily_uniques', 'day');
INSERT INTO raw_table
SELECT day, user_id % 19
FROM generate_series('2018-05-24'::timestamp, '2018-06-24'::timestamp, '1 day'::interval) as f(day),
generate_series(1,100) as g(user_id);
INSERT INTO raw_table
SELECT day, user_id % 13
FROM generate_series('2018-06-10'::timestamp, '2018-07-10'::timestamp, '1 day'::interval) as f(day),
generate_series(1,100) as g(user_id);
-- Run hll on raw data
SELECT hll_cardinality(hll_union_agg(agg))
FROM (
SELECT hll_add_agg(hll_hash_integer(user_id)) AS agg
FROM raw_table)a;
-- Aggregate the data into daily_uniques
INSERT INTO daily_uniques
SELECT day, hll_add_agg(hll_hash_integer(user_id))
FROM raw_table
GROUP BY 1;
-- Basic hll_cardinality check on aggregated data
SELECT day, hll_cardinality(unique_users)
FROM daily_uniques
WHERE day >= '2018-06-20' and day <= '2018-06-30'
ORDER BY 2 DESC,1
LIMIT 10;
-- Union aggregated data for one week
SELECT hll_cardinality(hll_union_agg(unique_users))
FROM daily_uniques
WHERE day >= '2018-05-24'::date AND day <= '2018-05-31'::date;
SELECT EXTRACT(MONTH FROM day) AS month, hll_cardinality(hll_union_agg(unique_users))
FROM daily_uniques
WHERE day >= '2018-06-23' AND day <= '2018-07-01'
GROUP BY 1
ORDER BY 1;
-- These are going to be supported after window function support
SELECT day, hll_cardinality(hll_union_agg(unique_users) OVER seven_days)
FROM daily_uniques
WINDOW seven_days AS (ORDER BY day ASC ROWS 6 PRECEDING);
SELECT day, (hll_cardinality(hll_union_agg(unique_users) OVER two_days)) - hll_cardinality(unique_users) AS lost_uniques
FROM daily_uniques
WINDOW two_days AS (ORDER BY day ASC ROWS 1 PRECEDING);
DROP TABLE raw_table;
DROP TABLE daily_uniques;