From 498ac80d8b49716087ae6807277a57c399cdea90 Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Sat, 16 Sep 2017 02:37:15 +0300 Subject: [PATCH] Add window function support for SUBQUERY PUSHDOWN and INSERT INTO SELECT This commit provides the support for window functions in subquery and insert into select queries. Note that our support for window functions is still limited because it must have a partition by clause on the distribution key. This commit makes changes in the files insert_select_planner and multi_logical_planner. The required tests are also added with files multi_subquery_window_functions.out and multi_insert_select_window.out. --- .../planner/insert_select_planner.c | 40 +- .../planner/multi_logical_planner.c | 109 +- .../distributed/multi_logical_planner.h | 2 + .../regress/expected/multi_insert_select.out | 13 +- .../expected/multi_insert_select_window.out | 855 ++++++++++++++ .../multi_subquery_complex_queries.out | 13 - ...ulti_subquery_complex_reference_clause.out | 546 +++++---- .../multi_subquery_window_functions.out | 1050 +++++++++++++++++ ...i_behavioral_analytics_create_table.source | 27 +- src/test/regress/multi_schedule | 4 +- ...i_behavioral_analytics_create_table.source | 29 +- .../sql/multi_insert_select_window.sql | 769 ++++++++++++ .../sql/multi_subquery_complex_queries.sql | 12 - ...ulti_subquery_complex_reference_clause.sql | 543 +++++---- .../sql/multi_subquery_window_functions.sql | 706 +++++++++++ 15 files changed, 4091 insertions(+), 627 deletions(-) create mode 100644 src/test/regress/expected/multi_insert_select_window.out create mode 100644 src/test/regress/expected/multi_subquery_window_functions.out create mode 100644 src/test/regress/sql/multi_insert_select_window.sql create mode 100644 src/test/regress/sql/multi_subquery_window_functions.sql diff --git a/src/backend/distributed/planner/insert_select_planner.c b/src/backend/distributed/planner/insert_select_planner.c index 7f69922ec..a3f7b107a 100644 --- a/src/backend/distributed/planner/insert_select_planner.c +++ b/src/backend/distributed/planner/insert_select_planner.c @@ -776,6 +776,7 @@ MultiTaskRouterSelectQuerySupported(Query *query) { List *queryList = NIL; ListCell *queryCell = NULL; + StringInfo errorDetail = NULL; ExtractQueryWalker((Node *) query, &queryList); foreach(queryCell, queryList) @@ -797,7 +798,7 @@ MultiTaskRouterSelectQuerySupported(Query *query) if (subquery->limitCount != NULL) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, - "LIMIT clauses are not allowed in distirbuted INSERT " + "LIMIT clauses are not allowed in distributed INSERT " "... SELECT queries", NULL, NULL); } @@ -811,17 +812,34 @@ MultiTaskRouterSelectQuerySupported(Query *query) NULL, NULL); } - /* - * We could potentially support window clauses where the data is partitioned - * over distribution column. For simplicity, we currently do not support window - * clauses at all. - */ - if (subquery->windowClause != NULL) + /* group clause list must include partition column */ + if (subquery->groupClause) { - return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, - "window functions are not allowed in distributed " - "INSERT ... SELECT queries", - NULL, NULL); + List *groupClauseList = subquery->groupClause; + List *targetEntryList = subquery->targetList; + List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, + targetEntryList); + bool groupOnPartitionColumn = TargetListOnPartitionColumn(subquery, + groupTargetEntryList); + if (!groupOnPartitionColumn) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "Group by list without distribution column is " + "not allowed in distributed INSERT ... " + "SELECT queries", + NULL, NULL); + } + } + + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + if (subquery->windowClause && !SafeToPushdownWindowFunction(subquery, + &errorDetail)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, errorDetail->data, NULL, + NULL); } if (subquery->setOperations != NULL) diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index b414ddc3f..af7e4bb24 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -83,7 +83,7 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedUnionQuery(Query *queryTree outerMostQueryHasLimit); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); -static bool TargetListOnPartitionColumn(Query *query, List *targetEntryList); +static bool WindowPartitionOnDistributionColumn(Query *query); static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query); static bool FullCompositeFieldList(List *compositeFieldList); static MultiNode * MultiPlanTree(Query *queryTree); @@ -447,6 +447,7 @@ MultiSubqueryPlanTree(Query *originalQuery, Query *queryTree, * - Only a single RTE_RELATION exists, which means only a single table * name is specified on the whole query * - No sublinks exists in the subquery + * - No window functions in the subquery * * Note that the caller should still call DeferErrorIfUnsupportedSubqueryRepartition() * to ensure that Citus supports the subquery. Also, this function is designed to run @@ -466,6 +467,12 @@ SingleRelationRepartitionSubquery(Query *queryTree) return false; } + /* we don't support window functions */ + if (queryTree->hasWindowFuncs) + { + return false; + } + /* * Don't allow joins and set operations. If join appears in the queryTree, the * length would be greater than 1. If only set operations exists, the length @@ -782,6 +789,7 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi { bool preconditionsSatisfied = true; char *errorDetail = NULL; + StringInfo errorInfo = NULL; DeferredErrorMessage *deferredError = NULL; deferredError = DeferErrorIfUnsupportedTableCombination(subqueryTree); @@ -796,12 +804,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi errorDetail = "Subqueries without relations are unsupported"; } - if (subqueryTree->hasWindowFuncs) - { - preconditionsSatisfied = false; - errorDetail = "Window functions are currently unsupported"; - } - if (subqueryTree->limitOffset) { preconditionsSatisfied = false; @@ -871,6 +873,17 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } } + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + if (subqueryTree->windowClause && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) + { + errorDetail = (char *) errorInfo->data; + preconditionsSatisfied = false; + } + /* we don't support aggregates without group by */ if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) { @@ -1083,11 +1096,91 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) } +/* + * SafeToPushdownWindowFunction checks if the query with window function is supported. + * It returns the result accordingly and modifies the error detail. + */ +bool +SafeToPushdownWindowFunction(Query *query, StringInfo *errorDetail) +{ + ListCell *windowClauseCell = NULL; + List *windowClauseList = query->windowClause; + + /* + * We need to check each window clause separately if there is a partition by clause + * and if it is partitioned on the distribution column. + */ + foreach(windowClauseCell, windowClauseList) + { + WindowClause *windowClause = lfirst(windowClauseCell); + + if (!windowClause->partitionClause) + { + *errorDetail = makeStringInfo(); + appendStringInfoString(*errorDetail, + "Window functions without PARTITION BY on distribution " + "column is currently unsupported"); + return false; + } + } + + if (!WindowPartitionOnDistributionColumn(query)) + { + *errorDetail = makeStringInfo(); + appendStringInfoString(*errorDetail, + "Window functions with PARTITION BY list missing distribution " + "column is currently unsupported"); + return false; + } + + return true; +} + + +/* + * WindowPartitionOnDistributionColumn checks if the given subquery has one + * or more window functions and at least one of them is not partitioned by + * distribution column. The function returns false if your window function does not + * have a partition by clause or it does not include the distribution column. + * + * Please note that if the query does not have a window function, the function + * returns true. + */ +static bool +WindowPartitionOnDistributionColumn(Query *query) +{ + List *windowClauseList = query->windowClause; + ListCell *windowClauseCell = NULL; + + foreach(windowClauseCell, windowClauseList) + { + WindowClause *windowClause = lfirst(windowClauseCell); + List *groupTargetEntryList = NIL; + bool partitionOnDistributionColumn = false; + List *partitionClauseList = windowClause->partitionClause; + List *targetEntryList = query->targetList; + + groupTargetEntryList = + GroupTargetEntryList(partitionClauseList, targetEntryList); + + partitionOnDistributionColumn = + TargetListOnPartitionColumn(query, groupTargetEntryList); + + if (!partitionOnDistributionColumn) + { + return false; + } + } + + return true; +} + + /* * TargetListOnPartitionColumn checks if at least one target list entry is on * partition column. */ -static bool +bool TargetListOnPartitionColumn(Query *query, List *targetEntryList) { bool targetListOnPartitionColumn = false; diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index 6ebebc80f..b3ff12aac 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -185,6 +185,8 @@ extern MultiTreeRoot * MultiLogicalPlanCreate(Query *originalQuery, Query *query PlannerRestrictionContext * plannerRestrictionContext, ParamListInfo boundParams); +extern bool SafeToPushdownWindowFunction(Query *query, StringInfo *errorDetail); +extern bool TargetListOnPartitionColumn(Query *query, List *targetEntryList); extern bool NeedsDistributedPlanning(Query *queryTree); extern MultiNode * ParentNode(MultiNode *multiNode); extern MultiNode * ChildNode(MultiUnaryNode *multiNode); diff --git a/src/test/regress/expected/multi_insert_select.out b/src/test/regress/expected/multi_insert_select.out index 6a95b5e47..fb2c20853 100644 --- a/src/test/regress/expected/multi_insert_select.out +++ b/src/test/regress/expected/multi_insert_select.out @@ -1015,8 +1015,7 @@ FROM (SELECT SUM(raw_events_second.value_4) AS v4, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.value_3) AS foo; -DEBUG: cannot perform distributed INSERT INTO ... SELECT because the partition columns in the source table and subquery do not match -DETAIL: The data type of the target table's partition column should exactly match the data type of the corresponding simple column reference in the subquery. +DEBUG: Group by list without distribution column is not allowed in distributed INSERT ... SELECT queries DEBUG: Collecting INSERT ... SELECT results on coordinator ERROR: cannot push down this subquery DETAIL: Group by list without partition column is currently unsupported @@ -1133,9 +1132,7 @@ FROM (SELECT SUM(raw_events_second.value_4) AS v4, GROUP BY raw_events_second.value_1 HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -DEBUG: cannot perform distributed INSERT INTO ... SELECT because the partition columns in the source table and subquery do not match -DETAIL: Subquery contains an expression that is not a simple column reference in the same position as the target table's partition column. -HINT: Ensure the target table's partition column has a corresponding simple column reference to a distributed table's partition column in the subquery. +DEBUG: Group by list without distribution column is not allowed in distributed INSERT ... SELECT queries DEBUG: Collecting INSERT ... SELECT results on coordinator ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. @@ -1164,8 +1161,10 @@ FROM (SELECT SUM(raw_events_second.value_4) AS v4, GROUP BY raw_events_second.value_1 HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -ERROR: cannot perform distributed planning for the given modification -DETAIL: Select query cannot be pushed down to the worker. +DEBUG: Group by list without distribution column is not allowed in distributed INSERT ... SELECT queries +DEBUG: Collecting INSERT ... SELECT results on coordinator +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. -- cannot pushdown the query since the JOIN is not equi JOIN INSERT INTO agg_events (user_id, value_4_agg) diff --git a/src/test/regress/expected/multi_insert_select_window.out b/src/test/regress/expected/multi_insert_select_window.out new file mode 100644 index 000000000..845e0b710 --- /dev/null +++ b/src/test/regress/expected/multi_insert_select_window.out @@ -0,0 +1,855 @@ +-- =================================================================== +-- test insert select functionality for window functions +-- =================================================================== +TRUNCATE agg_results; +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10001 | 101 | 49.5810418958104190 +(1 row) + +TRUNCATE agg_results; +-- the same test with different syntax +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10001 | 101 | 49.5810418958104190 +(1 row) + +TRUNCATE agg_results; +-- similar test with lag +INSERT INTO agg_results (user_id, agg_time, value_2_agg, value_3_agg) +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10001 | 101 | 49.5810418958104190 +(1 row) + +TRUNCATE agg_results; +-- simple window function, partitioned and grouped by on the distribution key +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT + user_id, rnk, tme +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as tme + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 1188 | 101 | 49.7895622895622896 +(1 row) + +TRUNCATE agg_results; +-- top level query has a group by on the result of the window function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + min(user_id), min(time), lag_event_type +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+-------------------- + 1002 | 50 | 9.7844311377245509 +(1 row) + +TRUNCATE agg_results; +-- window functions should work along with joins as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 195 | 91 | 51.0205128205128205 +(1 row) + +TRUNCATE agg_results; +-- two window functions in a single subquery should work fine as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 202 | 91 | 50.2970297029702970 +(1 row) + +TRUNCATE agg_results; +-- window functions should be fine within subquery joins +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg, value_3_agg) +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 91 | 91 | 50.2637362637362637 +(1 row) + +TRUNCATE agg_results; +-- GROUP BYs and PARTITION BYs should work fine together +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank; + -- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 7 | 6 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- aggregates in the PARTITION BY is also allows +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 8 | 7 | 48.8750000000000000 +(1 row) + +TRUNCATE agg_results; +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +INSERT INTO agg_results (user_id, value_1_agg) +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 1 | 1 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Group by has more columns than partition by which uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10 | 10 | 49.1000000000000000 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_2_agg) +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; + -- Subquery in where with window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 4 | 4 | 35.2500000000000000 +(1 row) + +TRUNCATE agg_results; +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Some more nested queries +INSERT INTO agg_results(user_id, value_2_agg, value_3_agg, value_4_agg) +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 9501 | 101 | 49.8461214608988528 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- test with reference table partitioned on columns from both +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM +( + SELECT + DISTINCT user_id, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Window functions with HAVING clause +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 437 | 100 | 49.9496567505720824 +(1 row) + +TRUNCATE agg_results; +-- Window functions with HAVING clause which uses coordinator insert ... select +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10 | 5 | 32.4000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Window function in View works +CREATE VIEW view_with_window_func AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 437 | 100 | 49.9496567505720824 +(1 row) + +TRUNCATE agg_results; +-- Window function in View works and the query uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func +LIMIT + 10; +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot run avg(user_id) +SELECT count(*) FROM agg_results; + count +------- + 10 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id +LIMIT + 5; +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; + count +------- + 5 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 94 | 94 | 50.4787234042553191 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop +LIMIT + 5; +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; + count +------- + 5 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- lets have some queries that Citus shouldn't push down +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- w2 should not be pushed down +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- w2 should not be allowed +INSERT INTO agg_results (user_id, value_2_agg, value_3_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- unsupported window function with an override +INSERT INTO agg_results(user_id, agg_time, value_2_agg) +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported + -- Subquery in where with unsupported window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER () + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- Aggregate function on distribution column should error out +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +DROP VIEW view_with_window_func; diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index 2414ee2d2..d303bf671 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -2381,19 +2381,6 @@ ORDER BY types; ERROR: cannot push down this subquery DETAIL: Offset clause is currently unsupported --- not supported due to window functions -SELECT user_id, - some_vals -FROM ( - SELECT * , - Row_number() over (PARTITION BY "user_id" ORDER BY "user_id") AS "some_vals", - Random() - FROM users_table - ) user_id -ORDER BY 1, - 2 limit 10; -ERROR: cannot perform distributed planning on this query -DETAIL: Subqueries without group by clause are not supported yet -- not supported due to non relation rte SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 30b5a8053..3ca038a5a 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -6,7 +6,6 @@ -- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests -- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; - SET citus.enable_router_execution TO FALSE; CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); SELECT create_distributed_table('user_buy_test_table', 'user_id'); @@ -29,19 +28,6 @@ SELECT create_distributed_table('users_return_test_table', 'user_id'); INSERT INTO users_return_test_table VALUES(4,1,1); INSERT INTO users_return_test_table VALUES(1,3,1); INSERT INTO users_return_test_table VALUES(3,2,2); -CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); -SELECT create_reference_table('users_ref_test_table'); - create_reference_table ------------------------- - -(1 row) - -INSERT INTO users_ref_test_table VALUES(1,'User_1',45); -INSERT INTO users_ref_test_table VALUES(2,'User_2',46); -INSERT INTO users_ref_test_table VALUES(3,'User_3',47); -INSERT INTO users_ref_test_table VALUES(4,'User_4',48); -INSERT INTO users_ref_test_table VALUES(5,'User_5',49); -INSERT INTO users_ref_test_table VALUES(6,'User_6',50); -- Simple Join test with reference table SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table @@ -165,7 +151,7 @@ WHERE subquery_1.user_id = subquery_2.user_id ; 4 (1 row) - -- Should be able to push down since reference tables are inner joined + -- Should be able to push down since reference tables are inner joined -- with hash distributed tables, the results of those joins are the parts of -- an outer join SELECT subquery_2.id FROM @@ -193,15 +179,15 @@ ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum ---------+---------- @@ -218,15 +204,15 @@ FROM (10 rows) -- same query as above, reference table is wrapped into a subquery -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum ---------+---------- @@ -244,15 +230,15 @@ FROM -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum ---------+---------- @@ -270,16 +256,16 @@ FROM -- should not be able to pushdown since reference table is in the -- direct outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should not be able to pushdown since reference table is in the @@ -287,22 +273,22 @@ DETAIL: There exist a reference table in the outer part of the outer join SELECT * FROM - (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table ON (users_table.user_id = ref_all.value_2); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should not be able to pushdown since reference table is in the -- outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should be able to pushdown since reference table is in the @@ -311,18 +297,18 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; user_id @@ -335,24 +321,24 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" LEFT JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join --- we could even suuport the following where the subquery --- on the outer part of the left join contains a reference table +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT users_table.user_id as usr_id, @@ -372,7 +358,7 @@ LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 -- but, we fail to pushdown the following query where join that reference table appears -- wrapped into a subquery SELECT max(events_all.cnt), - events_all.usr_id + events_all.usr_id FROM( SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt @@ -390,59 +376,59 @@ DETAIL: There exist a reference table in the outer part of the outer join SET citus.subquery_pushdown to ON; SELECT user_id, lastseen FROM - (SELECT + (SELECT "some_users_data".user_id, lastseen FROM - (SELECT + (SELECT filter_users_1.user_id, time AS lastseen FROM - (SELECT + (SELECT user_where_1_1.user_id FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 - ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) - filter_users_1 + WHERE + user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) + filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" WHERE - user_id > 12 and user_id < 16 AND + user_id > 12 and user_id < 16 AND user_id = filter_users_1.user_id - ORDER BY + ORDER BY time DESC - LIMIT 1) "last_events_1" + LIMIT 1) "last_events_1" ON TRUE - ORDER BY + ORDER BY time DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 70 - LIMIT 1) "some_users_data" + LIMIT 1) "some_users_data" ON TRUE - ORDER BY + ORDER BY lastseen DESC LIMIT 10) "some_users" -ORDER BY +ORDER BY user_id DESC LIMIT 10; user_id | lastseen @@ -461,46 +447,46 @@ LIMIT 10; SET citus.subquery_pushdown to OFF; -- NESTED INNER JOINs with reference tables -SELECT - count(*) AS value, "generated_group_field" +SELECT + count(*) AS value, "generated_group_field" FROM - (SELECT + (SELECT DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id", "events"."value_2" - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN - (SELECT + (SELECT user_where_1_1.real_user_id FROM - (SELECT + (SELECT "users"."user_id" as real_user_id - FROM + FROM users_reference_table as "users" WHERE user_id > 10 and user_id < 40 and value_2 > 50 ) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 + WHERE + user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" -GROUP BY - "generated_group_field" -ORDER BY +GROUP BY + "generated_group_field" +ORDER BY generated_group_field DESC, value DESC; value | generated_group_field -------+----------------------- @@ -523,40 +509,40 @@ ORDER BY (16 rows) -- single level inner joins with reference tables -SELECT - "value_3", count(*) AS cnt +SELECT + "value_3", count(*) AS cnt FROM - (SELECT + (SELECT "value_3", "user_id", random() FROM - (SELECT - users_in_segment_1.user_id, value_3 + (SELECT + users_in_segment_1.user_id, value_3 FROM - (SELECT + (SELECT user_id, value_3 * 2 as value_3 FROM - (SELECT - user_id, value_3 + (SELECT + user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id", value_3 - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 30 ) simple_user_where_1 ) all_buckets_1 ) users_in_segment_1 JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 60 ) some_users_data ON ("users_in_segment_1".user_id = "some_users_data".user_id) - ) segmentalias_1) "tempQuery" + ) segmentalias_1) "tempQuery" GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; value_3 | cnt @@ -579,42 +565,42 @@ SELECT * FROM (SELECT "some_users_data".user_id, "some_recent_users".value_3 FROM - (SELECT + (SELECT filter_users_1.user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200) filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, value_3 - FROM + FROM events_reference_table as "events" WHERE - user_id > 20 and user_id < 70 AND + user_id > 20 and user_id < 70 AND ("events".user_id = "filter_users_1".user_id) - ORDER BY + ORDER BY value_3 DESC LIMIT 1) "last_events_1" ON true ORDER BY value_3 DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 200 LIMIT 1) "some_users_data" ON true - ORDER BY + ORDER BY value_3 DESC LIMIT 10) "some_users" -ORDER BY - value_3 DESC +ORDER BY + value_3 DESC LIMIT 10; user_id | value_3 ---------+--------- @@ -632,37 +618,37 @@ SET citus.subquery_pushdown to OFF; SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" LEFT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; cnt | generated_group_field @@ -684,43 +670,43 @@ count(*) AS cnt, "generated_group_field" SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_reference_table as "users") "right_group_by_1" - ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- right join where the inner part of the join includes a reference table --- joined with hash partitioned table using non-equi join +-- joined with hash partitioned table using non-equi join SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT @@ -794,42 +780,42 @@ ORDER BY user_id; (4 rows) -- LEFT JOINs used with INNER JOINs --- events_table and users_reference_table joined +-- events_table and users_reference_table joined -- with event_table.non_part_key < reference_table.any_key SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; cnt | generated_group_field @@ -853,32 +839,32 @@ FROM FROM (SELECT user_id, time FROM - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40) "events_1" ORDER BY time DESC) "recent_events_1" - GROUP BY + GROUP BY user_id - ORDER BY + ORDER BY max(TIME) DESC) "some_recent_users" FULL JOIN - (SELECT + (SELECT "users".user_id - FROM + FROM users_table as "users" - WHERE - users.value_2 > 50 and users.value_2 < 55) "some_users_data" + WHERE + users.value_2 > 50 and users.value_2 < 55) "some_users_data" ON "some_users_data"."user_id" = "some_recent_users"."user_id" -ORDER BY +ORDER BY user_id limit 50; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join - -- + -- -- UNIONs and JOINs with reference tables, should error out -- SELECT ("final_query"."event_types") as types @@ -889,132 +875,132 @@ FROM FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION - (SELECT + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -ORDER BY +ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator - -- reference table exist in the subquery of union, should error out + -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM - ( SELECT + ( SELECT *, random() FROM - (SELECT + (SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM - ( SELECT + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", 0 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION (SELECT * FROM ( SELECT * FROM ( - SELECT + SELECT max("users"."time"), 0 AS event, "users"."user_id" - FROM + FROM events_reference_table as "events", users_table as "users" - WHERE + WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) - UNION + UNION (SELECT * FROM - (SELECT + (SELECT "events"."time", 2 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION + UNION (SELECT * FROM (SELECT "events"."time", 3 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4) ) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY +GROUP BY types -ORDER BY +ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator --- +-- -- Should error out with UNION ALL Queries on reference tables -- SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType @@ -1027,40 +1013,40 @@ FROM FROM ( (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" @@ -1069,7 +1055,7 @@ GROUP BY types ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator --- just a sanity check that we don't allow this if the reference table is on the +-- just a sanity check that we don't allow this if the reference table is on the -- left part of the left join SELECT count(*) FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table @@ -1090,37 +1076,37 @@ DETAIL: Each relation should be joined with at least one another relation using SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys @@ -1128,21 +1114,21 @@ DETAIL: Each relation should be joined with at least one another relation using -- two hash partitioned relations are not joined -- on partiton keys although reference table is fine -- to push down -SELECT +SELECT u1.user_id, count(*) -FROM +FROM events_table as e1, users_table as u1 WHERE event_type IN - (SELECT + (SELECT event_type - FROM + FROM events_reference_table as e2 WHERE value_2 = 15 AND value_3 > 25 AND e1.value_2 > e2.value_2 - ) + ) AND u1.user_id > e1.user_id GROUP BY 1 ORDER BY 2 DESC, 1 DESC diff --git a/src/test/regress/expected/multi_subquery_window_functions.out b/src/test/regress/expected/multi_subquery_window_functions.out new file mode 100644 index 000000000..4788a6a63 --- /dev/null +++ b/src/test/regress/expected/multi_subquery_window_functions.out @@ -0,0 +1,1050 @@ +-- =================================================================== +-- test multi subquery functionality for window functions +-- =================================================================== +CREATE VIEW subq AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + user_id | time | rnk +---------+---------------------------------+----- + 23 | Fri Jan 10 20:11:40.439606 2014 | 127 + 23 | Fri Jan 10 20:15:35.594738 2014 | 126 + 23 | Fri Jan 10 23:14:59.348548 2014 | 125 + 23 | Fri Jan 10 23:38:35.800498 2014 | 124 + 25 | Fri Jan 10 21:50:55.465393 2014 | 123 + 23 | Sat Jan 11 00:40:59.383928 2014 | 123 + 25 | Fri Jan 10 22:43:09.881855 2014 | 122 + 23 | Sat Jan 11 00:42:46.148 2014 | 122 + 25 | Fri Jan 10 23:08:28.963923 2014 | 121 + 23 | Sat Jan 11 01:23:01.126017 2014 | 121 +(10 rows) + +-- the same test with different syntax +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + user_id | time | rnk +---------+---------------------------------+----- + 23 | Fri Jan 10 20:11:40.439606 2014 | 127 + 23 | Fri Jan 10 20:15:35.594738 2014 | 126 + 23 | Fri Jan 10 23:14:59.348548 2014 | 125 + 23 | Fri Jan 10 23:38:35.800498 2014 | 124 + 25 | Fri Jan 10 21:50:55.465393 2014 | 123 + 23 | Sat Jan 11 00:40:59.383928 2014 | 123 + 25 | Fri Jan 10 22:43:09.881855 2014 | 122 + 23 | Sat Jan 11 00:42:46.148 2014 | 122 + 25 | Fri Jan 10 23:08:28.963923 2014 | 121 + 23 | Sat Jan 11 01:23:01.126017 2014 | 121 +(10 rows) + +-- similar test with lag +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 4 DESC, 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + user_id | time | lag_event_type | row_no +---------+---------------------------------+----------------+-------- + 23 | Fri Jan 10 20:11:40.439606 2014 | 338 | 127 + 23 | Fri Jan 10 20:15:35.594738 2014 | 999 | 126 + 23 | Fri Jan 10 23:14:59.348548 2014 | 783 | 125 + 23 | Fri Jan 10 23:38:35.800498 2014 | 802 | 124 + 25 | Fri Jan 10 21:50:55.465393 2014 | 517 | 123 + 23 | Sat Jan 11 00:40:59.383928 2014 | 359 | 123 + 25 | Fri Jan 10 22:43:09.881855 2014 | 918 | 122 + 23 | Sat Jan 11 00:42:46.148 2014 | 68 | 122 + 25 | Fri Jan 10 23:08:28.963923 2014 | 757 | 121 + 23 | Sat Jan 11 01:23:01.126017 2014 | 251 | 121 +(10 rows) + +-- simple window function, partitioned and grouped by on the distribution key +SELECT + user_id, rnk, avg_val_2 +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as avg_val_2 + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo +ORDER BY + 2 DESC, 1 DESC, 3 DESC +LIMIT + 10; + user_id | rnk | avg_val_2 +---------+-----+---------------------- + 98 | 12 | 647.5000000000000000 + 95 | 12 | 428.5000000000000000 + 94 | 12 | 608.6666666666666667 + 92 | 12 | 724.0000000000000000 + 91 | 12 | 549.0000000000000000 + 90 | 12 | 525.1000000000000000 + 89 | 12 | 531.0000000000000000 + 87 | 12 | 740.0000000000000000 + 84 | 12 | 487.7500000000000000 + 83 | 12 | 629.5000000000000000 +(10 rows) + +-- top level query has a group by on the result of the window function +SELECT + min(user_id), min(time), lag_event_type, count(*) +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type +ORDER BY + 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + min | min | lag_event_type | count +-----+---------------------------------+----------------+------- + 45 | Sat Jan 11 12:47:09.502744 2014 | 1000 | 2 + 18 | Fri Jan 10 20:15:35.594738 2014 | 999 | 9 + 1 | Sat Jan 11 21:08:41.737933 2014 | 998 | 10 + 0 | Sat Jan 11 16:32:40.662168 2014 | 997 | 9 + 3 | Fri Jan 10 23:30:18.011423 2014 | 996 | 13 + 17 | Sun Jan 12 03:54:06.464758 2014 | 995 | 9 + 23 | Tue Jan 14 22:04:23.44321 2014 | 994 | 7 + 7 | Sat Jan 11 04:59:48.119353 2014 | 993 | 9 + 8 | Sat Jan 11 05:14:45.845071 2014 | 992 | 14 + 0 | Sun Jan 12 03:24:01.449152 2014 | 991 | 10 +(10 rows) + +-- window functions should work along with joins as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + user_id | lag | rank +---------+-----+------ + 90 | 90 | 114 + 72 | 72 | 109 + 26 | 26 | 109 + 91 | 91 | 108 + 55 | 55 | 107 + 27 | 27 | 106 + 60 | 60 | 101 + 98 | 98 | 97 + 39 | 39 | 95 + 61 | 61 | 93 +(10 rows) + +-- two window functions in a single subquery should work fine as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + user_id | lag | rank +---------+-----+------ + 73 | 73 | 112 + 73 | | 112 + 48 | 48 | 111 + 48 | | 111 + 43 | 43 | 105 + 43 | | 105 + 77 | 77 | 104 + 77 | | 104 + 30 | 30 | 104 + 30 | | 104 +(10 rows) + +-- window functions should be fine within subquery joins +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id + ORDER BY 3 DESC, 4 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + user_id | max | max | max +---------+-----+-----+----- + 73 | 73 | 112 | 112 + 48 | 48 | 111 | 111 + 43 | 43 | 105 | 1 + 77 | 77 | 104 | 104 + 30 | 30 | 104 | 104 + 50 | 50 | 101 | 1 + 79 | 79 | 97 | 1 + 49 | 49 | 96 | 96 + 44 | 44 | 93 | 1 + 13 | 13 | 87 | 1 +(10 rows) + +-- GROUP BYs and PARTITION BYs should work fine together +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + avg | max | my_rank +---------------------+--------------------------+--------- + 48.6250000000000000 | Tue Jan 21 00:00:00 2014 | 12 + 48.4786324786324786 | Tue Jan 21 00:00:00 2014 | 11 + 50.2083333333333333 | Tue Jan 21 00:00:00 2014 | 10 + 51.8247422680412371 | Tue Jan 21 00:00:00 2014 | 9 + 46.2061855670103093 | Mon Jan 20 00:00:00 2014 | 8 + 52.5945945945945946 | Mon Jan 20 00:00:00 2014 | 7 + 52.2589285714285714 | Tue Jan 21 00:00:00 2014 | 6 +(7 rows) + +-- aggregates in the PARTITION BY is also allows +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + avg | max | my_rank +---------------------+--------------------------+--------- + 22.0000000000000000 | Fri Jan 10 00:00:00 2014 | 8 + 60.4000000000000000 | Mon Jan 20 00:00:00 2014 | 7 + 55.5500000000000000 | Tue Jan 21 00:00:00 2014 | 6 + 50.6142857142857143 | Tue Jan 21 00:00:00 2014 | 5 + 53.6697247706422018 | Tue Jan 21 00:00:00 2014 | 4 + 49.7604166666666667 | Tue Jan 21 00:00:00 2014 | 3 + 47.9569892473118280 | Tue Jan 21 00:00:00 2014 | 2 + 49.5859375000000000 | Tue Jan 21 00:00:00 2014 | 1 +(8 rows) + +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank +ORDER BY + 2 DESC, 1 DESC +LIMIT + 10; + avg | my_rank +---------------------+--------- + 50.0000000000000000 | 1 +(1 row) + + -- Using previously defined supported window function on distribution key +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM users_table + WINDOW + w AS (PARTITION BY user_id), + w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY + 1, 2, 3 DESC +LIMIT + 10; + user_id | time | sum +---------+--------------------------+----- + 0 | Fri Jan 10 00:00:00 2014 | 32 + 0 | Sat Jan 11 00:00:00 2014 | 40 + 0 | Sat Jan 11 00:00:00 2014 | 38 + 0 | Sat Jan 11 00:00:00 2014 | 30 + 0 | Sun Jan 12 00:00:00 2014 | 49 + 0 | Sun Jan 12 00:00:00 2014 | 47 + 0 | Sun Jan 12 00:00:00 2014 | 34 + 0 | Sun Jan 12 00:00:00 2014 | 29 + 0 | Sun Jan 12 00:00:00 2014 | 24 + 0 | Sun Jan 12 00:00:00 2014 | 20 +(10 rows) + +-- test with reference table partitioned on columns from both +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table + WHERE users_table.value_2=users_ref_test_table.k_no +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; + user_id | it_name | count +---------+---------+------- + 6 | User_4 | 1 + 8 | User_4 | 1 + 9 | User_3 | 1 + 11 | User_3 | 1 + 15 | User_6 | 1 + 16 | User_2 | 1 + 16 | User_3 | 1 + 17 | User_3 | 1 + 17 | User_4 | 1 + 20 | User_3 | 1 + 21 | User_6 | 1 + 22 | User_1 | 1 + 24 | User_4 | 1 + 26 | User_2 | 2 + 27 | User_4 | 1 + 28 | User_2 | 1 + 28 | User_3 | 1 + 31 | User_1 | 1 + 31 | User_4 | 1 + 34 | User_3 | 1 +(20 rows) + +-- Group by has more columns than partition by +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | sum +---------+------- + 46 | 63666 + 23 | 62524 + 56 | 61350 + 12 | 61317 + 48 | 60144 + 71 | 60095 + 45 | 59904 + 94 | 59773 + 3 | 59141 + 93 | 58365 +(10 rows) + +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id ORDER BY + 2 DESC,1 +LIMIT + 10; + user_id | max +---------+------ + 1 | 2469 + 87 | 2089 + 81 | 1952 + 23 | 1891 + 58 | 1888 + 97 | 1868 + 94 | 1849 + 17 | 1844 + 22 | 1844 + 43 | 1843 +(10 rows) + +-- Window functions with HAVING clause +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | rank +---------+------ + 12 | 10 + 55 | 10 + 12 | 9 + 25 | 9 + 46 | 9 + 55 | 9 + 12 | 8 + 25 | 8 + 36 | 8 + 46 | 8 +(10 rows) + +-- Window function in View works +SELECT * +FROM + subq +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | rank +---------+------ + 12 | 10 + 55 | 10 + 12 | 9 + 25 | 9 + 46 | 9 + 55 | 9 + 12 | 8 + 25 | 8 + 36 | 8 + 46 | 8 +(10 rows) + +-- Window functions with UNION/UNION ALL works +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; + max +----- + 996 + 995 + 987 + 978 + 974 +(5 rows) + +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + user_id | sum +---------+-------- + 23 | 126017 + 45 | 117323 + 25 | 116595 + 17 | 116520 + 90 | 115843 +(5 rows) + +-- Subquery in where with window function +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id +ORDER BY + user_id DESC +LIMIT + 3; + user_id +--------- + 69 + 52 + 12 +(3 rows) + +-- Some more nested queries +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank +ORDER BY + difference DESC, rank DESC +LIMIT 20; + user_id | rank | difference | distinct_users +---------+------+------------+---------------- + 2 | 101 | 3696 | 2 + 73 | 98 | 3526 | 2 + 73 | 103 | 3522 | 3 + 97 | 73 | 3440 | 2 + 43 | 92 | 3418 | 2 + 57 | 75 | 3286 | 2 + 66 | 64 | 3249 | 3 + 42 | 97 | 3218 | 2 + 19 | 101 | 3110 | 2 + 91 | 94 | 3064 | 2 + 71 | 100 | 3026 | 2 + 59 | 69 | 3016 | 2 + 46 | 83 | 2858 | 2 + 16 | 86 | 2848 | 2 + 23 | 63 | 2734 | 2 + 62 | 96 | 2668 | 2 + 81 | 84 | 2666 | 2 + 7 | 74 | 2648 | 2 + 27 | 97 | 2640 | 2 + 55 | 76 | 2630 | 2 +(20 rows) + +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a +ORDER BY + abs DESC +LIMIT 10; + user_id | abs +---------+------- + 64 | 10669 + 74 | 10037 + 26 | 9571 + 76 | 9376 + 81 | 8330 + 16 | 7746 + 9 | 7100 + 98 | 6922 + 94 | 6895 + 93 | 6653 +(10 rows) + +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a +ORDER BY + 1 DESC +LIMIT + 5; + user_id | count +---------+------- + 100 | 1 + 99 | 1 + 98 | 1 + 97 | 1 + 96 | 1 +(5 rows) + +EXPLAIN (COSTS FALSE, VERBOSE TRUE) + SELECT * + FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop + ORDER BY 2 DESC, 1 DESC + LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Limit + Output: remote_scan.user_id, remote_scan.sum + -> Sort + Output: remote_scan.user_id, remote_scan.sum + Sort Key: remote_scan.sum DESC, remote_scan.user_id DESC + -> Custom Scan (Citus Real-Time) + Output: remote_scan.user_id, remote_scan.sum + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + Output: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + -> Sort + Output: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + Sort Key: (sum((sum(users_table.value_2) OVER (?)))) DESC, users_table.user_id DESC + -> HashAggregate + Output: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + Group Key: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + -> Append + -> HashAggregate + Output: users_table.user_id, sum((sum(users_table.value_2) OVER (?))) + Group Key: users_table.user_id + -> HashAggregate + Output: users_table.user_id, (sum(users_table.value_2) OVER (?)) + Group Key: users_table.user_id, (sum(users_table.value_2) OVER (?)) + -> Append + -> WindowAgg + Output: users_table.user_id, sum(users_table.value_2) OVER (?) + -> Index Scan using is_index1_1400000 on public.users_table_1400000 users_table + Output: users_table.user_id, users_table."time", users_table.value_1, users_table.value_2, users_table.value_3, users_table.value_4 + -> WindowAgg + Output: events_table.user_id, sum(events_table.value_2) OVER (?) + -> Index Scan using is_index2_1400004 on public.events_table_1400004 events_table + Output: events_table.user_id, events_table."time", events_table.event_type, events_table.value_2, events_table.value_3, events_table.value_4 + -> HashAggregate + Output: users_table_1.user_id, sum((sum(users_table_1.value_2) OVER (?))) + Group Key: users_table_1.user_id + -> HashAggregate + Output: users_table_1.user_id, (sum(users_table_1.value_2) OVER (?)) + Group Key: users_table_1.user_id, (sum(users_table_1.value_2) OVER (?)) + -> Append + -> WindowAgg + Output: users_table_1.user_id, sum(users_table_1.value_2) OVER (?) + -> Index Scan using is_index1_1400000 on public.users_table_1400000 users_table_1 + Output: users_table_1.user_id, users_table_1."time", users_table_1.value_1, users_table_1.value_2, users_table_1.value_3, users_table_1.value_4 + -> WindowAgg + Output: events_table_1.user_id, sum(events_table_1.value_2) OVER (?) + -> Index Scan using is_index2_1400004 on public.events_table_1400004 events_table_1 + Output: events_table_1.user_id, events_table_1."time", events_table_1.event_type, events_table_1.value_2, events_table_1.value_3, events_table_1.value_4 +(50 rows) + +-- lets have some queries that Citus shouldn't push down +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- Overriding window function but not supported +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY +1,2,3; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- Aggregate function on distribution column should error out +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a +ORDER BY + 1 DESC, 2 DESC; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- test with reference table partitioned on only a column from reference table +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY id) + FROM + users_table, users_ref_test_table +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- UNION ALL with only one of them is not partitioned over distribution column which +-- should not be allowed. +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by event_type), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +DROP VIEW subq; diff --git a/src/test/regress/input/multi_behavioral_analytics_create_table.source b/src/test/regress/input/multi_behavioral_analytics_create_table.source index b81082fe2..838bfbf23 100644 --- a/src/test/regress/input/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/input/multi_behavioral_analytics_create_table.source @@ -2,10 +2,8 @@ -- multi behavioral analytics -- this file is intended to create the table requires for the tests -- - ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; - SET citus.shard_replication_factor = 1; SET citus.shard_count = 4; @@ -30,10 +28,19 @@ SELECT create_distributed_table('agg_results_third', 'user_id'); CREATE TABLE agg_results_fourth (user_id int, value_1_agg int, value_2_agg int, value_3_agg float, value_4_agg bigint, agg_time timestamp); SELECT create_distributed_table('agg_results_fourth', 'user_id'); +CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('users_ref_test_table'); +INSERT INTO users_ref_test_table VALUES(1,'User_1',45); +INSERT INTO users_ref_test_table VALUES(2,'User_2',46); +INSERT INTO users_ref_test_table VALUES(3,'User_3',47); +INSERT INTO users_ref_test_table VALUES(4,'User_4',48); +INSERT INTO users_ref_test_table VALUES(5,'User_5',49); +INSERT INTO users_ref_test_table VALUES(6,'User_6',50); + COPY users_table FROM '@abs_srcdir@/data/users_table.data' WITH CSV; COPY events_table FROM '@abs_srcdir@/data/events_table.data' WITH CSV; --- create indexes for +-- create indexes for CREATE INDEX is_index1 ON users_table(user_id); CREATE INDEX is_index2 ON events_table(user_id); CREATE INDEX is_index3 ON users_table(value_1); @@ -130,10 +137,10 @@ SELECT run_command_on_master_and_workers($f$ LEFTARG = user_composite_type, RIGHTARG = user_composite_type, PROCEDURE = equal_user_composite_type_function, - commutator = =, - RESTRICT = eqsel, - JOIN = eqjoinsel, - merges, + commutator = =, + RESTRICT = eqsel, + JOIN = eqjoinsel, + merges, hashes ); $f$); @@ -187,12 +194,12 @@ SELECT run_command_on_master_and_workers($f$ OPERATOR 3 = (user_composite_type, user_composite_type), OPERATOR 4 >= (user_composite_type, user_composite_type), OPERATOR 5 > (user_composite_type, user_composite_type), - + FUNCTION 1 cmp_user_composite_type_function(user_composite_type, user_composite_type); $f$); SELECT run_command_on_master_and_workers($f$ - + CREATE OPERATOR CLASS cats_2_op_fam_class DEFAULT FOR TYPE user_composite_type USING HASH AS OPERATOR 1 = (user_composite_type, user_composite_type), @@ -356,7 +363,7 @@ SET citus.shard_max_size TO "1MB"; CREATE TABLE events_reference_table (like events_table including all); SELECT create_reference_table('events_reference_table'); -CREATE INDEX events_ref_val2 on events_reference_table(value_2); +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index b24532b17..96a991fff 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -36,7 +36,7 @@ test: multi_load_data test: multi_behavioral_analytics_create_table test: multi_behavioral_analytics_basics multi_behavioral_analytics_single_shard_queries multi_insert_select_non_pushable_queries -test: multi_insert_select +test: multi_insert_select multi_insert_select_window # --- # Tests for partitioning support @@ -51,7 +51,7 @@ test: multi_deparse_shard_query multi_distributed_transaction_id test: multi_basic_queries multi_complex_expressions test: multi_explain test: multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics -test: multi_subquery_complex_reference_clause +test: multi_subquery_complex_reference_clause multi_subquery_window_functions test: multi_subquery_in_where_reference_clause test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc test: multi_reference_table diff --git a/src/test/regress/output/multi_behavioral_analytics_create_table.source b/src/test/regress/output/multi_behavioral_analytics_create_table.source index 026634fe6..d0ad1547d 100644 --- a/src/test/regress/output/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/output/multi_behavioral_analytics_create_table.source @@ -51,9 +51,22 @@ SELECT create_distributed_table('agg_results_fourth', 'user_id'); (1 row) +CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('users_ref_test_table'); + create_reference_table +------------------------ + +(1 row) + +INSERT INTO users_ref_test_table VALUES(1,'User_1',45); +INSERT INTO users_ref_test_table VALUES(2,'User_2',46); +INSERT INTO users_ref_test_table VALUES(3,'User_3',47); +INSERT INTO users_ref_test_table VALUES(4,'User_4',48); +INSERT INTO users_ref_test_table VALUES(5,'User_5',49); +INSERT INTO users_ref_test_table VALUES(6,'User_6',50); COPY users_table FROM '@abs_srcdir@/data/users_table.data' WITH CSV; COPY events_table FROM '@abs_srcdir@/data/events_table.data' WITH CSV; --- create indexes for +-- create indexes for CREATE INDEX is_index1 ON users_table(user_id); NOTICE: using one-phase commit for distributed DDL commands HINT: You can enable two-phase commit for extra safety with: SET citus.multi_shard_commit_protocol TO '2pc' @@ -187,10 +200,10 @@ SELECT run_command_on_master_and_workers($f$ LEFTARG = user_composite_type, RIGHTARG = user_composite_type, PROCEDURE = equal_user_composite_type_function, - commutator = =, - RESTRICT = eqsel, - JOIN = eqjoinsel, - merges, + commutator = =, + RESTRICT = eqsel, + JOIN = eqjoinsel, + merges, hashes ); $f$); @@ -261,7 +274,7 @@ SELECT run_command_on_master_and_workers($f$ OPERATOR 3 = (user_composite_type, user_composite_type), OPERATOR 4 >= (user_composite_type, user_composite_type), OPERATOR 5 > (user_composite_type, user_composite_type), - + FUNCTION 1 cmp_user_composite_type_function(user_composite_type, user_composite_type); $f$); run_command_on_master_and_workers @@ -270,7 +283,7 @@ $f$); (1 row) SELECT run_command_on_master_and_workers($f$ - + CREATE OPERATOR CLASS cats_2_op_fam_class DEFAULT FOR TYPE user_composite_type USING HASH AS OPERATOR 1 = (user_composite_type, user_composite_type), @@ -428,7 +441,7 @@ SELECT create_reference_table('events_reference_table'); (1 row) -CREATE INDEX events_ref_val2 on events_reference_table(value_2); +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); SELECT create_reference_table('users_reference_table'); diff --git a/src/test/regress/sql/multi_insert_select_window.sql b/src/test/regress/sql/multi_insert_select_window.sql new file mode 100644 index 000000000..97814c15b --- /dev/null +++ b/src/test/regress/sql/multi_insert_select_window.sql @@ -0,0 +1,769 @@ +-- =================================================================== +-- test insert select functionality for window functions +-- =================================================================== + +TRUNCATE agg_results; + +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- the same test with different syntax +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- similar test with lag +INSERT INTO agg_results (user_id, agg_time, value_2_agg, value_3_agg) +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- simple window function, partitioned and grouped by on the distribution key +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT + user_id, rnk, tme +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as tme + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- top level query has a group by on the result of the window function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + min(user_id), min(time), lag_event_type +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- window functions should work along with joins as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- two window functions in a single subquery should work fine as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- window functions should be fine within subquery joins +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg, value_3_agg) +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- GROUP BYs and PARTITION BYs should work fine together +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank; + + -- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- aggregates in the PARTITION BY is also allows +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +INSERT INTO agg_results (user_id, value_1_agg) +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Group by has more columns than partition by which uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_2_agg) +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + + -- Subquery in where with window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Some more nested queries +INSERT INTO agg_results(user_id, value_2_agg, value_3_agg, value_4_agg) +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- test with reference table partitioned on columns from both +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM +( + SELECT + DISTINCT user_id, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window functions with HAVING clause +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window functions with HAVING clause which uses coordinator insert ... select +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window function in View works +CREATE VIEW view_with_window_func AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window function in View works and the query uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func +LIMIT + 10; + +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot run avg(user_id) +SELECT count(*) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id +LIMIT + 5; + +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop +LIMIT + 5; + +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- lets have some queries that Citus shouldn't push down +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- w2 should not be pushed down +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +LIMIT + 10; + +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; + +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; + +-- w2 should not be allowed +INSERT INTO agg_results (user_id, value_2_agg, value_3_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo; + +-- unsupported window function with an override +INSERT INTO agg_results(user_id, agg_time, value_2_agg) +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a; + + -- Subquery in where with unsupported window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER () + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; + +-- Aggregate function on distribution column should error out +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a; + +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; + +DROP VIEW view_with_window_func; diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 300a79a94..787110511 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -2160,18 +2160,6 @@ GROUP BY ORDER BY types; --- not supported due to window functions -SELECT user_id, - some_vals -FROM ( - SELECT * , - Row_number() over (PARTITION BY "user_id" ORDER BY "user_id") AS "some_vals", - Random() - FROM users_table - ) user_id -ORDER BY 1, - 2 limit 10; - -- not supported due to non relation rte SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index be2aa896e..c76237c1d 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -7,7 +7,7 @@ -- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests -- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; - + SET citus.enable_router_execution TO FALSE; CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); @@ -23,15 +23,6 @@ INSERT INTO users_return_test_table VALUES(4,1,1); INSERT INTO users_return_test_table VALUES(1,3,1); INSERT INTO users_return_test_table VALUES(3,2,2); -CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); -SELECT create_reference_table('users_ref_test_table'); -INSERT INTO users_ref_test_table VALUES(1,'User_1',45); -INSERT INTO users_ref_test_table VALUES(2,'User_2',46); -INSERT INTO users_ref_test_table VALUES(3,'User_3',47); -INSERT INTO users_ref_test_table VALUES(4,'User_4',48); -INSERT INTO users_ref_test_table VALUES(5,'User_5',49); -INSERT INTO users_ref_test_table VALUES(6,'User_6',50); - -- Simple Join test with reference table SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table @@ -101,7 +92,7 @@ SELECT count(*) FROM ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 WHERE subquery_1.user_id = subquery_2.user_id ; - -- Should be able to push down since reference tables are inner joined + -- Should be able to push down since reference tables are inner joined -- with hash distributed tables, the results of those joins are the parts of -- an outer join SELECT subquery_2.id FROM @@ -122,75 +113,75 @@ SELECT * FROM -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- same query as above, reference table is wrapped into a subquery -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should not be able to pushdown since reference table is in the -- direct outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should not be able to pushdown since reference table is in the -- direct outer part of the left join wrapped into a subquery SELECT * FROM - (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table ON (users_table.user_id = ref_all.value_2); -- should not be able to pushdown since reference table is in the -- outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should be able to pushdown since reference table is in the -- inner part of the left join @@ -198,18 +189,18 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; @@ -218,23 +209,23 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" LEFT JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; --- we could even suuport the following where the subquery --- on the outer part of the left join contains a reference table +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT users_table.user_id as usr_id, @@ -246,7 +237,7 @@ LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 -- but, we fail to pushdown the following query where join that reference table appears -- wrapped into a subquery SELECT max(events_all.cnt), - events_all.usr_id + events_all.usr_id FROM( SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt @@ -263,141 +254,141 @@ LIMIT 5; SET citus.subquery_pushdown to ON; SELECT user_id, lastseen FROM - (SELECT + (SELECT "some_users_data".user_id, lastseen FROM - (SELECT + (SELECT filter_users_1.user_id, time AS lastseen FROM - (SELECT + (SELECT user_where_1_1.user_id FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 - ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) - filter_users_1 + WHERE + user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) + filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" WHERE - user_id > 12 and user_id < 16 AND + user_id > 12 and user_id < 16 AND user_id = filter_users_1.user_id - ORDER BY + ORDER BY time DESC - LIMIT 1) "last_events_1" + LIMIT 1) "last_events_1" ON TRUE - ORDER BY + ORDER BY time DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 70 - LIMIT 1) "some_users_data" + LIMIT 1) "some_users_data" ON TRUE - ORDER BY + ORDER BY lastseen DESC LIMIT 10) "some_users" -ORDER BY +ORDER BY user_id DESC LIMIT 10; SET citus.subquery_pushdown to OFF; -- NESTED INNER JOINs with reference tables -SELECT - count(*) AS value, "generated_group_field" +SELECT + count(*) AS value, "generated_group_field" FROM - (SELECT + (SELECT DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id", "events"."value_2" - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN - (SELECT + (SELECT user_where_1_1.real_user_id FROM - (SELECT + (SELECT "users"."user_id" as real_user_id - FROM + FROM users_reference_table as "users" WHERE user_id > 10 and user_id < 40 and value_2 > 50 ) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 + WHERE + user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" -GROUP BY - "generated_group_field" -ORDER BY +GROUP BY + "generated_group_field" +ORDER BY generated_group_field DESC, value DESC; -- single level inner joins with reference tables -SELECT - "value_3", count(*) AS cnt +SELECT + "value_3", count(*) AS cnt FROM - (SELECT + (SELECT "value_3", "user_id", random() FROM - (SELECT - users_in_segment_1.user_id, value_3 + (SELECT + users_in_segment_1.user_id, value_3 FROM - (SELECT + (SELECT user_id, value_3 * 2 as value_3 FROM - (SELECT - user_id, value_3 + (SELECT + user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id", value_3 - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 30 ) simple_user_where_1 ) all_buckets_1 ) users_in_segment_1 JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 60 ) some_users_data ON ("users_in_segment_1".user_id = "some_users_data".user_id) - ) segmentalias_1) "tempQuery" + ) segmentalias_1) "tempQuery" GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; @@ -407,42 +398,42 @@ SELECT * FROM (SELECT "some_users_data".user_id, "some_recent_users".value_3 FROM - (SELECT + (SELECT filter_users_1.user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200) filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, value_3 - FROM + FROM events_reference_table as "events" WHERE - user_id > 20 and user_id < 70 AND + user_id > 20 and user_id < 70 AND ("events".user_id = "filter_users_1".user_id) - ORDER BY + ORDER BY value_3 DESC LIMIT 1) "last_events_1" ON true ORDER BY value_3 DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 200 LIMIT 1) "some_users_data" ON true - ORDER BY + ORDER BY value_3 DESC LIMIT 10) "some_users" -ORDER BY - value_3 DESC +ORDER BY + value_3 DESC LIMIT 10; SET citus.subquery_pushdown to OFF; @@ -451,37 +442,37 @@ SET citus.subquery_pushdown to OFF; SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" LEFT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; @@ -490,42 +481,42 @@ count(*) AS cnt, "generated_group_field" SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_reference_table as "users") "right_group_by_1" - ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -- right join where the inner part of the join includes a reference table --- joined with hash partitioned table using non-equi join +-- joined with hash partitioned table using non-equi join SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT @@ -585,42 +576,42 @@ ORDER BY user_id; -- LEFT JOINs used with INNER JOINs --- events_table and users_reference_table joined +-- events_table and users_reference_table joined -- with event_table.non_part_key < reference_table.any_key SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; @@ -631,31 +622,31 @@ FROM FROM (SELECT user_id, time FROM - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40) "events_1" ORDER BY time DESC) "recent_events_1" - GROUP BY + GROUP BY user_id - ORDER BY + ORDER BY max(TIME) DESC) "some_recent_users" FULL JOIN - (SELECT + (SELECT "users".user_id - FROM + FROM users_table as "users" - WHERE - users.value_2 > 50 and users.value_2 < 55) "some_users_data" + WHERE + users.value_2 > 50 and users.value_2 < 55) "some_users_data" ON "some_users_data"."user_id" = "some_recent_users"."user_id" -ORDER BY +ORDER BY user_id limit 50; - -- + -- -- UNIONs and JOINs with reference tables, should error out -- SELECT ("final_query"."event_types") as types @@ -666,130 +657,130 @@ FROM FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION - (SELECT + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -ORDER BY +ORDER BY types; - -- reference table exist in the subquery of union, should error out + -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM - ( SELECT + ( SELECT *, random() FROM - (SELECT + (SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM - ( SELECT + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", 0 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION (SELECT * FROM ( SELECT * FROM ( - SELECT + SELECT max("users"."time"), 0 AS event, "users"."user_id" - FROM + FROM events_reference_table as "events", users_table as "users" - WHERE + WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) - UNION + UNION (SELECT * FROM - (SELECT + (SELECT "events"."time", 2 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION + UNION (SELECT * FROM (SELECT "events"."time", 3 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4) ) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY +GROUP BY types -ORDER BY +ORDER BY types; --- +-- -- Should error out with UNION ALL Queries on reference tables -- SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType @@ -802,40 +793,40 @@ FROM FROM ( (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" @@ -843,7 +834,7 @@ INNER JOIN GROUP BY types ORDER BY types; --- just a sanity check that we don't allow this if the reference table is on the +-- just a sanity check that we don't allow this if the reference table is on the -- left part of the left join SELECT count(*) FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table @@ -862,58 +853,58 @@ WHERE subquery_1.user_id != subquery_2.user_id ; SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -- two hash partitioned relations are not joined -- on partiton keys although reference table is fine -- to push down -SELECT +SELECT u1.user_id, count(*) -FROM +FROM events_table as e1, users_table as u1 WHERE event_type IN - (SELECT + (SELECT event_type - FROM + FROM events_reference_table as e2 WHERE value_2 = 15 AND value_3 > 25 AND e1.value_2 > e2.value_2 - ) + ) AND u1.user_id > e1.user_id GROUP BY 1 ORDER BY 2 DESC, 1 DESC diff --git a/src/test/regress/sql/multi_subquery_window_functions.sql b/src/test/regress/sql/multi_subquery_window_functions.sql new file mode 100644 index 000000000..634ce790c --- /dev/null +++ b/src/test/regress/sql/multi_subquery_window_functions.sql @@ -0,0 +1,706 @@ +-- =================================================================== +-- test multi subquery functionality for window functions +-- =================================================================== + +CREATE VIEW subq AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; + +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- the same test with different syntax +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- similar test with lag +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 4 DESC, 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + +-- simple window function, partitioned and grouped by on the distribution key +SELECT + user_id, rnk, avg_val_2 +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as avg_val_2 + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo +ORDER BY + 2 DESC, 1 DESC, 3 DESC +LIMIT + 10; + +-- top level query has a group by on the result of the window function +SELECT + min(user_id), min(time), lag_event_type, count(*) +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type +ORDER BY + 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + +-- window functions should work along with joins as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- two window functions in a single subquery should work fine as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- window functions should be fine within subquery joins +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id + ORDER BY 3 DESC, 4 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- GROUP BYs and PARTITION BYs should work fine together +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- aggregates in the PARTITION BY is also allows +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank +ORDER BY + 2 DESC, 1 DESC +LIMIT + 10; + + -- Using previously defined supported window function on distribution key +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM users_table + WINDOW + w AS (PARTITION BY user_id), + w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY + 1, 2, 3 DESC +LIMIT + 10; + +-- test with reference table partitioned on columns from both +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table + WHERE users_table.value_2=users_ref_test_table.k_no +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; + +-- Group by has more columns than partition by +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id ORDER BY + 2 DESC,1 +LIMIT + 10; + +-- Window functions with HAVING clause +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- Window function in View works +SELECT * +FROM + subq +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- Window functions with UNION/UNION ALL works +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; + +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +-- Subquery in where with window function +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id +ORDER BY + user_id DESC +LIMIT + 3; + +-- Some more nested queries +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank +ORDER BY + difference DESC, rank DESC +LIMIT 20; + +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a +ORDER BY + abs DESC +LIMIT 10; + + +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a +ORDER BY + 1 DESC +LIMIT + 5; + +EXPLAIN (COSTS FALSE, VERBOSE TRUE) + SELECT * + FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop + ORDER BY 2 DESC, 1 DESC + LIMIT 5; + +-- lets have some queries that Citus shouldn't push down +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT + 10; + +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- Overriding window function but not supported +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY +1,2,3; + + +-- Aggregate function on distribution column should error out +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a +ORDER BY + 1 DESC, 2 DESC; + +-- test with reference table partitioned on only a column from reference table +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY id) + FROM + users_table, users_ref_test_table +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; + +-- UNION ALL with only one of them is not partitioned over distribution column which +-- should not be allowed. +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by event_type), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; + +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +DROP VIEW subq;