diff --git a/src/backend/distributed/planner/insert_select_planner.c b/src/backend/distributed/planner/insert_select_planner.c index 7f69922ec..a3f7b107a 100644 --- a/src/backend/distributed/planner/insert_select_planner.c +++ b/src/backend/distributed/planner/insert_select_planner.c @@ -776,6 +776,7 @@ MultiTaskRouterSelectQuerySupported(Query *query) { List *queryList = NIL; ListCell *queryCell = NULL; + StringInfo errorDetail = NULL; ExtractQueryWalker((Node *) query, &queryList); foreach(queryCell, queryList) @@ -797,7 +798,7 @@ MultiTaskRouterSelectQuerySupported(Query *query) if (subquery->limitCount != NULL) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, - "LIMIT clauses are not allowed in distirbuted INSERT " + "LIMIT clauses are not allowed in distributed INSERT " "... SELECT queries", NULL, NULL); } @@ -811,17 +812,34 @@ MultiTaskRouterSelectQuerySupported(Query *query) NULL, NULL); } - /* - * We could potentially support window clauses where the data is partitioned - * over distribution column. For simplicity, we currently do not support window - * clauses at all. - */ - if (subquery->windowClause != NULL) + /* group clause list must include partition column */ + if (subquery->groupClause) { - return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, - "window functions are not allowed in distributed " - "INSERT ... SELECT queries", - NULL, NULL); + List *groupClauseList = subquery->groupClause; + List *targetEntryList = subquery->targetList; + List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, + targetEntryList); + bool groupOnPartitionColumn = TargetListOnPartitionColumn(subquery, + groupTargetEntryList); + if (!groupOnPartitionColumn) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "Group by list without distribution column is " + "not allowed in distributed INSERT ... " + "SELECT queries", + NULL, NULL); + } + } + + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + if (subquery->windowClause && !SafeToPushdownWindowFunction(subquery, + &errorDetail)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, errorDetail->data, NULL, + NULL); } if (subquery->setOperations != NULL) diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index b414ddc3f..af7e4bb24 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -83,7 +83,7 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedUnionQuery(Query *queryTree outerMostQueryHasLimit); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); -static bool TargetListOnPartitionColumn(Query *query, List *targetEntryList); +static bool WindowPartitionOnDistributionColumn(Query *query); static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query); static bool FullCompositeFieldList(List *compositeFieldList); static MultiNode * MultiPlanTree(Query *queryTree); @@ -447,6 +447,7 @@ MultiSubqueryPlanTree(Query *originalQuery, Query *queryTree, * - Only a single RTE_RELATION exists, which means only a single table * name is specified on the whole query * - No sublinks exists in the subquery + * - No window functions in the subquery * * Note that the caller should still call DeferErrorIfUnsupportedSubqueryRepartition() * to ensure that Citus supports the subquery. Also, this function is designed to run @@ -466,6 +467,12 @@ SingleRelationRepartitionSubquery(Query *queryTree) return false; } + /* we don't support window functions */ + if (queryTree->hasWindowFuncs) + { + return false; + } + /* * Don't allow joins and set operations. If join appears in the queryTree, the * length would be greater than 1. If only set operations exists, the length @@ -782,6 +789,7 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi { bool preconditionsSatisfied = true; char *errorDetail = NULL; + StringInfo errorInfo = NULL; DeferredErrorMessage *deferredError = NULL; deferredError = DeferErrorIfUnsupportedTableCombination(subqueryTree); @@ -796,12 +804,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi errorDetail = "Subqueries without relations are unsupported"; } - if (subqueryTree->hasWindowFuncs) - { - preconditionsSatisfied = false; - errorDetail = "Window functions are currently unsupported"; - } - if (subqueryTree->limitOffset) { preconditionsSatisfied = false; @@ -871,6 +873,17 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } } + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + if (subqueryTree->windowClause && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) + { + errorDetail = (char *) errorInfo->data; + preconditionsSatisfied = false; + } + /* we don't support aggregates without group by */ if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) { @@ -1083,11 +1096,91 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) } +/* + * SafeToPushdownWindowFunction checks if the query with window function is supported. + * It returns the result accordingly and modifies the error detail. + */ +bool +SafeToPushdownWindowFunction(Query *query, StringInfo *errorDetail) +{ + ListCell *windowClauseCell = NULL; + List *windowClauseList = query->windowClause; + + /* + * We need to check each window clause separately if there is a partition by clause + * and if it is partitioned on the distribution column. + */ + foreach(windowClauseCell, windowClauseList) + { + WindowClause *windowClause = lfirst(windowClauseCell); + + if (!windowClause->partitionClause) + { + *errorDetail = makeStringInfo(); + appendStringInfoString(*errorDetail, + "Window functions without PARTITION BY on distribution " + "column is currently unsupported"); + return false; + } + } + + if (!WindowPartitionOnDistributionColumn(query)) + { + *errorDetail = makeStringInfo(); + appendStringInfoString(*errorDetail, + "Window functions with PARTITION BY list missing distribution " + "column is currently unsupported"); + return false; + } + + return true; +} + + +/* + * WindowPartitionOnDistributionColumn checks if the given subquery has one + * or more window functions and at least one of them is not partitioned by + * distribution column. The function returns false if your window function does not + * have a partition by clause or it does not include the distribution column. + * + * Please note that if the query does not have a window function, the function + * returns true. + */ +static bool +WindowPartitionOnDistributionColumn(Query *query) +{ + List *windowClauseList = query->windowClause; + ListCell *windowClauseCell = NULL; + + foreach(windowClauseCell, windowClauseList) + { + WindowClause *windowClause = lfirst(windowClauseCell); + List *groupTargetEntryList = NIL; + bool partitionOnDistributionColumn = false; + List *partitionClauseList = windowClause->partitionClause; + List *targetEntryList = query->targetList; + + groupTargetEntryList = + GroupTargetEntryList(partitionClauseList, targetEntryList); + + partitionOnDistributionColumn = + TargetListOnPartitionColumn(query, groupTargetEntryList); + + if (!partitionOnDistributionColumn) + { + return false; + } + } + + return true; +} + + /* * TargetListOnPartitionColumn checks if at least one target list entry is on * partition column. */ -static bool +bool TargetListOnPartitionColumn(Query *query, List *targetEntryList) { bool targetListOnPartitionColumn = false; diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index 6ebebc80f..b3ff12aac 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -185,6 +185,8 @@ extern MultiTreeRoot * MultiLogicalPlanCreate(Query *originalQuery, Query *query PlannerRestrictionContext * plannerRestrictionContext, ParamListInfo boundParams); +extern bool SafeToPushdownWindowFunction(Query *query, StringInfo *errorDetail); +extern bool TargetListOnPartitionColumn(Query *query, List *targetEntryList); extern bool NeedsDistributedPlanning(Query *queryTree); extern MultiNode * ParentNode(MultiNode *multiNode); extern MultiNode * ChildNode(MultiUnaryNode *multiNode); diff --git a/src/test/regress/expected/multi_insert_select.out b/src/test/regress/expected/multi_insert_select.out index 6a95b5e47..fb2c20853 100644 --- a/src/test/regress/expected/multi_insert_select.out +++ b/src/test/regress/expected/multi_insert_select.out @@ -1015,8 +1015,7 @@ FROM (SELECT SUM(raw_events_second.value_4) AS v4, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.value_3) AS foo; -DEBUG: cannot perform distributed INSERT INTO ... SELECT because the partition columns in the source table and subquery do not match -DETAIL: The data type of the target table's partition column should exactly match the data type of the corresponding simple column reference in the subquery. +DEBUG: Group by list without distribution column is not allowed in distributed INSERT ... SELECT queries DEBUG: Collecting INSERT ... SELECT results on coordinator ERROR: cannot push down this subquery DETAIL: Group by list without partition column is currently unsupported @@ -1133,9 +1132,7 @@ FROM (SELECT SUM(raw_events_second.value_4) AS v4, GROUP BY raw_events_second.value_1 HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -DEBUG: cannot perform distributed INSERT INTO ... SELECT because the partition columns in the source table and subquery do not match -DETAIL: Subquery contains an expression that is not a simple column reference in the same position as the target table's partition column. -HINT: Ensure the target table's partition column has a corresponding simple column reference to a distributed table's partition column in the subquery. +DEBUG: Group by list without distribution column is not allowed in distributed INSERT ... SELECT queries DEBUG: Collecting INSERT ... SELECT results on coordinator ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. @@ -1164,8 +1161,10 @@ FROM (SELECT SUM(raw_events_second.value_4) AS v4, GROUP BY raw_events_second.value_1 HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -ERROR: cannot perform distributed planning for the given modification -DETAIL: Select query cannot be pushed down to the worker. +DEBUG: Group by list without distribution column is not allowed in distributed INSERT ... SELECT queries +DEBUG: Collecting INSERT ... SELECT results on coordinator +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. -- cannot pushdown the query since the JOIN is not equi JOIN INSERT INTO agg_events (user_id, value_4_agg) diff --git a/src/test/regress/expected/multi_insert_select_window.out b/src/test/regress/expected/multi_insert_select_window.out new file mode 100644 index 000000000..845e0b710 --- /dev/null +++ b/src/test/regress/expected/multi_insert_select_window.out @@ -0,0 +1,855 @@ +-- =================================================================== +-- test insert select functionality for window functions +-- =================================================================== +TRUNCATE agg_results; +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10001 | 101 | 49.5810418958104190 +(1 row) + +TRUNCATE agg_results; +-- the same test with different syntax +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10001 | 101 | 49.5810418958104190 +(1 row) + +TRUNCATE agg_results; +-- similar test with lag +INSERT INTO agg_results (user_id, agg_time, value_2_agg, value_3_agg) +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10001 | 101 | 49.5810418958104190 +(1 row) + +TRUNCATE agg_results; +-- simple window function, partitioned and grouped by on the distribution key +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT + user_id, rnk, tme +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as tme + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 1188 | 101 | 49.7895622895622896 +(1 row) + +TRUNCATE agg_results; +-- top level query has a group by on the result of the window function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + min(user_id), min(time), lag_event_type +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+-------------------- + 1002 | 50 | 9.7844311377245509 +(1 row) + +TRUNCATE agg_results; +-- window functions should work along with joins as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 195 | 91 | 51.0205128205128205 +(1 row) + +TRUNCATE agg_results; +-- two window functions in a single subquery should work fine as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 202 | 91 | 50.2970297029702970 +(1 row) + +TRUNCATE agg_results; +-- window functions should be fine within subquery joins +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg, value_3_agg) +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 91 | 91 | 50.2637362637362637 +(1 row) + +TRUNCATE agg_results; +-- GROUP BYs and PARTITION BYs should work fine together +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank; + -- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 7 | 6 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- aggregates in the PARTITION BY is also allows +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 8 | 7 | 48.8750000000000000 +(1 row) + +TRUNCATE agg_results; +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +INSERT INTO agg_results (user_id, value_1_agg) +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 1 | 1 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Group by has more columns than partition by which uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10 | 10 | 49.1000000000000000 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_2_agg) +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; + -- Subquery in where with window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 4 | 4 | 35.2500000000000000 +(1 row) + +TRUNCATE agg_results; +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Some more nested queries +INSERT INTO agg_results(user_id, value_2_agg, value_3_agg, value_4_agg) +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 9501 | 101 | 49.8461214608988528 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- test with reference table partitioned on columns from both +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM +( + SELECT + DISTINCT user_id, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Window functions with HAVING clause +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 437 | 100 | 49.9496567505720824 +(1 row) + +TRUNCATE agg_results; +-- Window functions with HAVING clause which uses coordinator insert ... select +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 10 | 5 | 32.4000000000000000 +(1 row) + +TRUNCATE agg_results; +-- Window function in View works +CREATE VIEW view_with_window_func AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 437 | 100 | 49.9496567505720824 +(1 row) + +TRUNCATE agg_results; +-- Window function in View works and the query uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func +LIMIT + 10; +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot run avg(user_id) +SELECT count(*) FROM agg_results; + count +------- + 10 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id +LIMIT + 5; +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; + count +------- + 5 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 94 | 94 | 50.4787234042553191 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop +LIMIT + 5; +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; + count +------- + 5 +(1 row) + +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 101 | 101 | 50.0000000000000000 +(1 row) + +TRUNCATE agg_results; +-- lets have some queries that Citus shouldn't push down +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- w2 should not be pushed down +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- w2 should not be allowed +INSERT INTO agg_results (user_id, value_2_agg, value_3_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- unsupported window function with an override +INSERT INTO agg_results(user_id, agg_time, value_2_agg) +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported + -- Subquery in where with unsupported window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER () + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- Aggregate function on distribution column should error out +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +DROP VIEW view_with_window_func; diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index 2414ee2d2..d303bf671 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -2381,19 +2381,6 @@ ORDER BY types; ERROR: cannot push down this subquery DETAIL: Offset clause is currently unsupported --- not supported due to window functions -SELECT user_id, - some_vals -FROM ( - SELECT * , - Row_number() over (PARTITION BY "user_id" ORDER BY "user_id") AS "some_vals", - Random() - FROM users_table - ) user_id -ORDER BY 1, - 2 limit 10; -ERROR: cannot perform distributed planning on this query -DETAIL: Subqueries without group by clause are not supported yet -- not supported due to non relation rte SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 30b5a8053..3ca038a5a 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -6,7 +6,6 @@ -- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests -- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; - SET citus.enable_router_execution TO FALSE; CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); SELECT create_distributed_table('user_buy_test_table', 'user_id'); @@ -29,19 +28,6 @@ SELECT create_distributed_table('users_return_test_table', 'user_id'); INSERT INTO users_return_test_table VALUES(4,1,1); INSERT INTO users_return_test_table VALUES(1,3,1); INSERT INTO users_return_test_table VALUES(3,2,2); -CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); -SELECT create_reference_table('users_ref_test_table'); - create_reference_table ------------------------- - -(1 row) - -INSERT INTO users_ref_test_table VALUES(1,'User_1',45); -INSERT INTO users_ref_test_table VALUES(2,'User_2',46); -INSERT INTO users_ref_test_table VALUES(3,'User_3',47); -INSERT INTO users_ref_test_table VALUES(4,'User_4',48); -INSERT INTO users_ref_test_table VALUES(5,'User_5',49); -INSERT INTO users_ref_test_table VALUES(6,'User_6',50); -- Simple Join test with reference table SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table @@ -165,7 +151,7 @@ WHERE subquery_1.user_id = subquery_2.user_id ; 4 (1 row) - -- Should be able to push down since reference tables are inner joined + -- Should be able to push down since reference tables are inner joined -- with hash distributed tables, the results of those joins are the parts of -- an outer join SELECT subquery_2.id FROM @@ -193,15 +179,15 @@ ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum ---------+---------- @@ -218,15 +204,15 @@ FROM (10 rows) -- same query as above, reference table is wrapped into a subquery -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum ---------+---------- @@ -244,15 +230,15 @@ FROM -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum ---------+---------- @@ -270,16 +256,16 @@ FROM -- should not be able to pushdown since reference table is in the -- direct outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should not be able to pushdown since reference table is in the @@ -287,22 +273,22 @@ DETAIL: There exist a reference table in the outer part of the outer join SELECT * FROM - (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table ON (users_table.user_id = ref_all.value_2); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should not be able to pushdown since reference table is in the -- outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should be able to pushdown since reference table is in the @@ -311,18 +297,18 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; user_id @@ -335,24 +321,24 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" LEFT JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join --- we could even suuport the following where the subquery --- on the outer part of the left join contains a reference table +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT users_table.user_id as usr_id, @@ -372,7 +358,7 @@ LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 -- but, we fail to pushdown the following query where join that reference table appears -- wrapped into a subquery SELECT max(events_all.cnt), - events_all.usr_id + events_all.usr_id FROM( SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt @@ -390,59 +376,59 @@ DETAIL: There exist a reference table in the outer part of the outer join SET citus.subquery_pushdown to ON; SELECT user_id, lastseen FROM - (SELECT + (SELECT "some_users_data".user_id, lastseen FROM - (SELECT + (SELECT filter_users_1.user_id, time AS lastseen FROM - (SELECT + (SELECT user_where_1_1.user_id FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 - ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) - filter_users_1 + WHERE + user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) + filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" WHERE - user_id > 12 and user_id < 16 AND + user_id > 12 and user_id < 16 AND user_id = filter_users_1.user_id - ORDER BY + ORDER BY time DESC - LIMIT 1) "last_events_1" + LIMIT 1) "last_events_1" ON TRUE - ORDER BY + ORDER BY time DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 70 - LIMIT 1) "some_users_data" + LIMIT 1) "some_users_data" ON TRUE - ORDER BY + ORDER BY lastseen DESC LIMIT 10) "some_users" -ORDER BY +ORDER BY user_id DESC LIMIT 10; user_id | lastseen @@ -461,46 +447,46 @@ LIMIT 10; SET citus.subquery_pushdown to OFF; -- NESTED INNER JOINs with reference tables -SELECT - count(*) AS value, "generated_group_field" +SELECT + count(*) AS value, "generated_group_field" FROM - (SELECT + (SELECT DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id", "events"."value_2" - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN - (SELECT + (SELECT user_where_1_1.real_user_id FROM - (SELECT + (SELECT "users"."user_id" as real_user_id - FROM + FROM users_reference_table as "users" WHERE user_id > 10 and user_id < 40 and value_2 > 50 ) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 + WHERE + user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" -GROUP BY - "generated_group_field" -ORDER BY +GROUP BY + "generated_group_field" +ORDER BY generated_group_field DESC, value DESC; value | generated_group_field -------+----------------------- @@ -523,40 +509,40 @@ ORDER BY (16 rows) -- single level inner joins with reference tables -SELECT - "value_3", count(*) AS cnt +SELECT + "value_3", count(*) AS cnt FROM - (SELECT + (SELECT "value_3", "user_id", random() FROM - (SELECT - users_in_segment_1.user_id, value_3 + (SELECT + users_in_segment_1.user_id, value_3 FROM - (SELECT + (SELECT user_id, value_3 * 2 as value_3 FROM - (SELECT - user_id, value_3 + (SELECT + user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id", value_3 - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 30 ) simple_user_where_1 ) all_buckets_1 ) users_in_segment_1 JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 60 ) some_users_data ON ("users_in_segment_1".user_id = "some_users_data".user_id) - ) segmentalias_1) "tempQuery" + ) segmentalias_1) "tempQuery" GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; value_3 | cnt @@ -579,42 +565,42 @@ SELECT * FROM (SELECT "some_users_data".user_id, "some_recent_users".value_3 FROM - (SELECT + (SELECT filter_users_1.user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200) filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, value_3 - FROM + FROM events_reference_table as "events" WHERE - user_id > 20 and user_id < 70 AND + user_id > 20 and user_id < 70 AND ("events".user_id = "filter_users_1".user_id) - ORDER BY + ORDER BY value_3 DESC LIMIT 1) "last_events_1" ON true ORDER BY value_3 DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 200 LIMIT 1) "some_users_data" ON true - ORDER BY + ORDER BY value_3 DESC LIMIT 10) "some_users" -ORDER BY - value_3 DESC +ORDER BY + value_3 DESC LIMIT 10; user_id | value_3 ---------+--------- @@ -632,37 +618,37 @@ SET citus.subquery_pushdown to OFF; SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" LEFT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; cnt | generated_group_field @@ -684,43 +670,43 @@ count(*) AS cnt, "generated_group_field" SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_reference_table as "users") "right_group_by_1" - ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- right join where the inner part of the join includes a reference table --- joined with hash partitioned table using non-equi join +-- joined with hash partitioned table using non-equi join SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT @@ -794,42 +780,42 @@ ORDER BY user_id; (4 rows) -- LEFT JOINs used with INNER JOINs --- events_table and users_reference_table joined +-- events_table and users_reference_table joined -- with event_table.non_part_key < reference_table.any_key SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; cnt | generated_group_field @@ -853,32 +839,32 @@ FROM FROM (SELECT user_id, time FROM - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40) "events_1" ORDER BY time DESC) "recent_events_1" - GROUP BY + GROUP BY user_id - ORDER BY + ORDER BY max(TIME) DESC) "some_recent_users" FULL JOIN - (SELECT + (SELECT "users".user_id - FROM + FROM users_table as "users" - WHERE - users.value_2 > 50 and users.value_2 < 55) "some_users_data" + WHERE + users.value_2 > 50 and users.value_2 < 55) "some_users_data" ON "some_users_data"."user_id" = "some_recent_users"."user_id" -ORDER BY +ORDER BY user_id limit 50; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join - -- + -- -- UNIONs and JOINs with reference tables, should error out -- SELECT ("final_query"."event_types") as types @@ -889,132 +875,132 @@ FROM FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION - (SELECT + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -ORDER BY +ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator - -- reference table exist in the subquery of union, should error out + -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM - ( SELECT + ( SELECT *, random() FROM - (SELECT + (SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM - ( SELECT + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", 0 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION (SELECT * FROM ( SELECT * FROM ( - SELECT + SELECT max("users"."time"), 0 AS event, "users"."user_id" - FROM + FROM events_reference_table as "events", users_table as "users" - WHERE + WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) - UNION + UNION (SELECT * FROM - (SELECT + (SELECT "events"."time", 2 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION + UNION (SELECT * FROM (SELECT "events"."time", 3 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4) ) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY +GROUP BY types -ORDER BY +ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator --- +-- -- Should error out with UNION ALL Queries on reference tables -- SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType @@ -1027,40 +1013,40 @@ FROM FROM ( (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" @@ -1069,7 +1055,7 @@ GROUP BY types ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator --- just a sanity check that we don't allow this if the reference table is on the +-- just a sanity check that we don't allow this if the reference table is on the -- left part of the left join SELECT count(*) FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table @@ -1090,37 +1076,37 @@ DETAIL: Each relation should be joined with at least one another relation using SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys @@ -1128,21 +1114,21 @@ DETAIL: Each relation should be joined with at least one another relation using -- two hash partitioned relations are not joined -- on partiton keys although reference table is fine -- to push down -SELECT +SELECT u1.user_id, count(*) -FROM +FROM events_table as e1, users_table as u1 WHERE event_type IN - (SELECT + (SELECT event_type - FROM + FROM events_reference_table as e2 WHERE value_2 = 15 AND value_3 > 25 AND e1.value_2 > e2.value_2 - ) + ) AND u1.user_id > e1.user_id GROUP BY 1 ORDER BY 2 DESC, 1 DESC diff --git a/src/test/regress/expected/multi_subquery_window_functions.out b/src/test/regress/expected/multi_subquery_window_functions.out new file mode 100644 index 000000000..4788a6a63 --- /dev/null +++ b/src/test/regress/expected/multi_subquery_window_functions.out @@ -0,0 +1,1050 @@ +-- =================================================================== +-- test multi subquery functionality for window functions +-- =================================================================== +CREATE VIEW subq AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + user_id | time | rnk +---------+---------------------------------+----- + 23 | Fri Jan 10 20:11:40.439606 2014 | 127 + 23 | Fri Jan 10 20:15:35.594738 2014 | 126 + 23 | Fri Jan 10 23:14:59.348548 2014 | 125 + 23 | Fri Jan 10 23:38:35.800498 2014 | 124 + 25 | Fri Jan 10 21:50:55.465393 2014 | 123 + 23 | Sat Jan 11 00:40:59.383928 2014 | 123 + 25 | Fri Jan 10 22:43:09.881855 2014 | 122 + 23 | Sat Jan 11 00:42:46.148 2014 | 122 + 25 | Fri Jan 10 23:08:28.963923 2014 | 121 + 23 | Sat Jan 11 01:23:01.126017 2014 | 121 +(10 rows) + +-- the same test with different syntax +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + user_id | time | rnk +---------+---------------------------------+----- + 23 | Fri Jan 10 20:11:40.439606 2014 | 127 + 23 | Fri Jan 10 20:15:35.594738 2014 | 126 + 23 | Fri Jan 10 23:14:59.348548 2014 | 125 + 23 | Fri Jan 10 23:38:35.800498 2014 | 124 + 25 | Fri Jan 10 21:50:55.465393 2014 | 123 + 23 | Sat Jan 11 00:40:59.383928 2014 | 123 + 25 | Fri Jan 10 22:43:09.881855 2014 | 122 + 23 | Sat Jan 11 00:42:46.148 2014 | 122 + 25 | Fri Jan 10 23:08:28.963923 2014 | 121 + 23 | Sat Jan 11 01:23:01.126017 2014 | 121 +(10 rows) + +-- similar test with lag +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 4 DESC, 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + user_id | time | lag_event_type | row_no +---------+---------------------------------+----------------+-------- + 23 | Fri Jan 10 20:11:40.439606 2014 | 338 | 127 + 23 | Fri Jan 10 20:15:35.594738 2014 | 999 | 126 + 23 | Fri Jan 10 23:14:59.348548 2014 | 783 | 125 + 23 | Fri Jan 10 23:38:35.800498 2014 | 802 | 124 + 25 | Fri Jan 10 21:50:55.465393 2014 | 517 | 123 + 23 | Sat Jan 11 00:40:59.383928 2014 | 359 | 123 + 25 | Fri Jan 10 22:43:09.881855 2014 | 918 | 122 + 23 | Sat Jan 11 00:42:46.148 2014 | 68 | 122 + 25 | Fri Jan 10 23:08:28.963923 2014 | 757 | 121 + 23 | Sat Jan 11 01:23:01.126017 2014 | 251 | 121 +(10 rows) + +-- simple window function, partitioned and grouped by on the distribution key +SELECT + user_id, rnk, avg_val_2 +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as avg_val_2 + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo +ORDER BY + 2 DESC, 1 DESC, 3 DESC +LIMIT + 10; + user_id | rnk | avg_val_2 +---------+-----+---------------------- + 98 | 12 | 647.5000000000000000 + 95 | 12 | 428.5000000000000000 + 94 | 12 | 608.6666666666666667 + 92 | 12 | 724.0000000000000000 + 91 | 12 | 549.0000000000000000 + 90 | 12 | 525.1000000000000000 + 89 | 12 | 531.0000000000000000 + 87 | 12 | 740.0000000000000000 + 84 | 12 | 487.7500000000000000 + 83 | 12 | 629.5000000000000000 +(10 rows) + +-- top level query has a group by on the result of the window function +SELECT + min(user_id), min(time), lag_event_type, count(*) +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type +ORDER BY + 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + min | min | lag_event_type | count +-----+---------------------------------+----------------+------- + 45 | Sat Jan 11 12:47:09.502744 2014 | 1000 | 2 + 18 | Fri Jan 10 20:15:35.594738 2014 | 999 | 9 + 1 | Sat Jan 11 21:08:41.737933 2014 | 998 | 10 + 0 | Sat Jan 11 16:32:40.662168 2014 | 997 | 9 + 3 | Fri Jan 10 23:30:18.011423 2014 | 996 | 13 + 17 | Sun Jan 12 03:54:06.464758 2014 | 995 | 9 + 23 | Tue Jan 14 22:04:23.44321 2014 | 994 | 7 + 7 | Sat Jan 11 04:59:48.119353 2014 | 993 | 9 + 8 | Sat Jan 11 05:14:45.845071 2014 | 992 | 14 + 0 | Sun Jan 12 03:24:01.449152 2014 | 991 | 10 +(10 rows) + +-- window functions should work along with joins as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + user_id | lag | rank +---------+-----+------ + 90 | 90 | 114 + 72 | 72 | 109 + 26 | 26 | 109 + 91 | 91 | 108 + 55 | 55 | 107 + 27 | 27 | 106 + 60 | 60 | 101 + 98 | 98 | 97 + 39 | 39 | 95 + 61 | 61 | 93 +(10 rows) + +-- two window functions in a single subquery should work fine as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + user_id | lag | rank +---------+-----+------ + 73 | 73 | 112 + 73 | | 112 + 48 | 48 | 111 + 48 | | 111 + 43 | 43 | 105 + 43 | | 105 + 77 | 77 | 104 + 77 | | 104 + 30 | 30 | 104 + 30 | | 104 +(10 rows) + +-- window functions should be fine within subquery joins +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id + ORDER BY 3 DESC, 4 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + user_id | max | max | max +---------+-----+-----+----- + 73 | 73 | 112 | 112 + 48 | 48 | 111 | 111 + 43 | 43 | 105 | 1 + 77 | 77 | 104 | 104 + 30 | 30 | 104 | 104 + 50 | 50 | 101 | 1 + 79 | 79 | 97 | 1 + 49 | 49 | 96 | 96 + 44 | 44 | 93 | 1 + 13 | 13 | 87 | 1 +(10 rows) + +-- GROUP BYs and PARTITION BYs should work fine together +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + avg | max | my_rank +---------------------+--------------------------+--------- + 48.6250000000000000 | Tue Jan 21 00:00:00 2014 | 12 + 48.4786324786324786 | Tue Jan 21 00:00:00 2014 | 11 + 50.2083333333333333 | Tue Jan 21 00:00:00 2014 | 10 + 51.8247422680412371 | Tue Jan 21 00:00:00 2014 | 9 + 46.2061855670103093 | Mon Jan 20 00:00:00 2014 | 8 + 52.5945945945945946 | Mon Jan 20 00:00:00 2014 | 7 + 52.2589285714285714 | Tue Jan 21 00:00:00 2014 | 6 +(7 rows) + +-- aggregates in the PARTITION BY is also allows +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + avg | max | my_rank +---------------------+--------------------------+--------- + 22.0000000000000000 | Fri Jan 10 00:00:00 2014 | 8 + 60.4000000000000000 | Mon Jan 20 00:00:00 2014 | 7 + 55.5500000000000000 | Tue Jan 21 00:00:00 2014 | 6 + 50.6142857142857143 | Tue Jan 21 00:00:00 2014 | 5 + 53.6697247706422018 | Tue Jan 21 00:00:00 2014 | 4 + 49.7604166666666667 | Tue Jan 21 00:00:00 2014 | 3 + 47.9569892473118280 | Tue Jan 21 00:00:00 2014 | 2 + 49.5859375000000000 | Tue Jan 21 00:00:00 2014 | 1 +(8 rows) + +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank +ORDER BY + 2 DESC, 1 DESC +LIMIT + 10; + avg | my_rank +---------------------+--------- + 50.0000000000000000 | 1 +(1 row) + + -- Using previously defined supported window function on distribution key +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM users_table + WINDOW + w AS (PARTITION BY user_id), + w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY + 1, 2, 3 DESC +LIMIT + 10; + user_id | time | sum +---------+--------------------------+----- + 0 | Fri Jan 10 00:00:00 2014 | 32 + 0 | Sat Jan 11 00:00:00 2014 | 40 + 0 | Sat Jan 11 00:00:00 2014 | 38 + 0 | Sat Jan 11 00:00:00 2014 | 30 + 0 | Sun Jan 12 00:00:00 2014 | 49 + 0 | Sun Jan 12 00:00:00 2014 | 47 + 0 | Sun Jan 12 00:00:00 2014 | 34 + 0 | Sun Jan 12 00:00:00 2014 | 29 + 0 | Sun Jan 12 00:00:00 2014 | 24 + 0 | Sun Jan 12 00:00:00 2014 | 20 +(10 rows) + +-- test with reference table partitioned on columns from both +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table + WHERE users_table.value_2=users_ref_test_table.k_no +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; + user_id | it_name | count +---------+---------+------- + 6 | User_4 | 1 + 8 | User_4 | 1 + 9 | User_3 | 1 + 11 | User_3 | 1 + 15 | User_6 | 1 + 16 | User_2 | 1 + 16 | User_3 | 1 + 17 | User_3 | 1 + 17 | User_4 | 1 + 20 | User_3 | 1 + 21 | User_6 | 1 + 22 | User_1 | 1 + 24 | User_4 | 1 + 26 | User_2 | 2 + 27 | User_4 | 1 + 28 | User_2 | 1 + 28 | User_3 | 1 + 31 | User_1 | 1 + 31 | User_4 | 1 + 34 | User_3 | 1 +(20 rows) + +-- Group by has more columns than partition by +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | sum +---------+------- + 46 | 63666 + 23 | 62524 + 56 | 61350 + 12 | 61317 + 48 | 60144 + 71 | 60095 + 45 | 59904 + 94 | 59773 + 3 | 59141 + 93 | 58365 +(10 rows) + +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id ORDER BY + 2 DESC,1 +LIMIT + 10; + user_id | max +---------+------ + 1 | 2469 + 87 | 2089 + 81 | 1952 + 23 | 1891 + 58 | 1888 + 97 | 1868 + 94 | 1849 + 17 | 1844 + 22 | 1844 + 43 | 1843 +(10 rows) + +-- Window functions with HAVING clause +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | rank +---------+------ + 12 | 10 + 55 | 10 + 12 | 9 + 25 | 9 + 46 | 9 + 55 | 9 + 12 | 8 + 25 | 8 + 36 | 8 + 46 | 8 +(10 rows) + +-- Window function in View works +SELECT * +FROM + subq +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | rank +---------+------ + 12 | 10 + 55 | 10 + 12 | 9 + 25 | 9 + 46 | 9 + 55 | 9 + 12 | 8 + 25 | 8 + 36 | 8 + 46 | 8 +(10 rows) + +-- Window functions with UNION/UNION ALL works +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; + max +----- + 996 + 995 + 987 + 978 + 974 +(5 rows) + +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + user_id | sum +---------+-------- + 23 | 126017 + 45 | 117323 + 25 | 116595 + 17 | 116520 + 90 | 115843 +(5 rows) + +-- Subquery in where with window function +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id +ORDER BY + user_id DESC +LIMIT + 3; + user_id +--------- + 69 + 52 + 12 +(3 rows) + +-- Some more nested queries +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank +ORDER BY + difference DESC, rank DESC +LIMIT 20; + user_id | rank | difference | distinct_users +---------+------+------------+---------------- + 2 | 101 | 3696 | 2 + 73 | 98 | 3526 | 2 + 73 | 103 | 3522 | 3 + 97 | 73 | 3440 | 2 + 43 | 92 | 3418 | 2 + 57 | 75 | 3286 | 2 + 66 | 64 | 3249 | 3 + 42 | 97 | 3218 | 2 + 19 | 101 | 3110 | 2 + 91 | 94 | 3064 | 2 + 71 | 100 | 3026 | 2 + 59 | 69 | 3016 | 2 + 46 | 83 | 2858 | 2 + 16 | 86 | 2848 | 2 + 23 | 63 | 2734 | 2 + 62 | 96 | 2668 | 2 + 81 | 84 | 2666 | 2 + 7 | 74 | 2648 | 2 + 27 | 97 | 2640 | 2 + 55 | 76 | 2630 | 2 +(20 rows) + +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a +ORDER BY + abs DESC +LIMIT 10; + user_id | abs +---------+------- + 64 | 10669 + 74 | 10037 + 26 | 9571 + 76 | 9376 + 81 | 8330 + 16 | 7746 + 9 | 7100 + 98 | 6922 + 94 | 6895 + 93 | 6653 +(10 rows) + +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a +ORDER BY + 1 DESC +LIMIT + 5; + user_id | count +---------+------- + 100 | 1 + 99 | 1 + 98 | 1 + 97 | 1 + 96 | 1 +(5 rows) + +EXPLAIN (COSTS FALSE, VERBOSE TRUE) + SELECT * + FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop + ORDER BY 2 DESC, 1 DESC + LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Limit + Output: remote_scan.user_id, remote_scan.sum + -> Sort + Output: remote_scan.user_id, remote_scan.sum + Sort Key: remote_scan.sum DESC, remote_scan.user_id DESC + -> Custom Scan (Citus Real-Time) + Output: remote_scan.user_id, remote_scan.sum + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + Output: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + -> Sort + Output: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + Sort Key: (sum((sum(users_table.value_2) OVER (?)))) DESC, users_table.user_id DESC + -> HashAggregate + Output: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + Group Key: users_table.user_id, (sum((sum(users_table.value_2) OVER (?)))) + -> Append + -> HashAggregate + Output: users_table.user_id, sum((sum(users_table.value_2) OVER (?))) + Group Key: users_table.user_id + -> HashAggregate + Output: users_table.user_id, (sum(users_table.value_2) OVER (?)) + Group Key: users_table.user_id, (sum(users_table.value_2) OVER (?)) + -> Append + -> WindowAgg + Output: users_table.user_id, sum(users_table.value_2) OVER (?) + -> Index Scan using is_index1_1400000 on public.users_table_1400000 users_table + Output: users_table.user_id, users_table."time", users_table.value_1, users_table.value_2, users_table.value_3, users_table.value_4 + -> WindowAgg + Output: events_table.user_id, sum(events_table.value_2) OVER (?) + -> Index Scan using is_index2_1400004 on public.events_table_1400004 events_table + Output: events_table.user_id, events_table."time", events_table.event_type, events_table.value_2, events_table.value_3, events_table.value_4 + -> HashAggregate + Output: users_table_1.user_id, sum((sum(users_table_1.value_2) OVER (?))) + Group Key: users_table_1.user_id + -> HashAggregate + Output: users_table_1.user_id, (sum(users_table_1.value_2) OVER (?)) + Group Key: users_table_1.user_id, (sum(users_table_1.value_2) OVER (?)) + -> Append + -> WindowAgg + Output: users_table_1.user_id, sum(users_table_1.value_2) OVER (?) + -> Index Scan using is_index1_1400000 on public.users_table_1400000 users_table_1 + Output: users_table_1.user_id, users_table_1."time", users_table_1.value_1, users_table_1.value_2, users_table_1.value_3, users_table_1.value_4 + -> WindowAgg + Output: events_table_1.user_id, sum(events_table_1.value_2) OVER (?) + -> Index Scan using is_index2_1400004 on public.events_table_1400004 events_table_1 + Output: events_table_1.user_id, events_table_1."time", events_table_1.event_type, events_table_1.value_2, events_table_1.value_3, events_table_1.value_4 +(50 rows) + +-- lets have some queries that Citus shouldn't push down +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- Overriding window function but not supported +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY +1,2,3; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- Aggregate function on distribution column should error out +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a +ORDER BY + 1 DESC, 2 DESC; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- test with reference table partitioned on only a column from reference table +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY id) + FROM + users_table, users_ref_test_table +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- UNION ALL with only one of them is not partitioned over distribution column which +-- should not be allowed. +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by event_type), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Window functions with PARTITION BY list missing distribution column is currently unsupported +DROP VIEW subq; diff --git a/src/test/regress/input/multi_behavioral_analytics_create_table.source b/src/test/regress/input/multi_behavioral_analytics_create_table.source index b81082fe2..838bfbf23 100644 --- a/src/test/regress/input/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/input/multi_behavioral_analytics_create_table.source @@ -2,10 +2,8 @@ -- multi behavioral analytics -- this file is intended to create the table requires for the tests -- - ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; - SET citus.shard_replication_factor = 1; SET citus.shard_count = 4; @@ -30,10 +28,19 @@ SELECT create_distributed_table('agg_results_third', 'user_id'); CREATE TABLE agg_results_fourth (user_id int, value_1_agg int, value_2_agg int, value_3_agg float, value_4_agg bigint, agg_time timestamp); SELECT create_distributed_table('agg_results_fourth', 'user_id'); +CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('users_ref_test_table'); +INSERT INTO users_ref_test_table VALUES(1,'User_1',45); +INSERT INTO users_ref_test_table VALUES(2,'User_2',46); +INSERT INTO users_ref_test_table VALUES(3,'User_3',47); +INSERT INTO users_ref_test_table VALUES(4,'User_4',48); +INSERT INTO users_ref_test_table VALUES(5,'User_5',49); +INSERT INTO users_ref_test_table VALUES(6,'User_6',50); + COPY users_table FROM '@abs_srcdir@/data/users_table.data' WITH CSV; COPY events_table FROM '@abs_srcdir@/data/events_table.data' WITH CSV; --- create indexes for +-- create indexes for CREATE INDEX is_index1 ON users_table(user_id); CREATE INDEX is_index2 ON events_table(user_id); CREATE INDEX is_index3 ON users_table(value_1); @@ -130,10 +137,10 @@ SELECT run_command_on_master_and_workers($f$ LEFTARG = user_composite_type, RIGHTARG = user_composite_type, PROCEDURE = equal_user_composite_type_function, - commutator = =, - RESTRICT = eqsel, - JOIN = eqjoinsel, - merges, + commutator = =, + RESTRICT = eqsel, + JOIN = eqjoinsel, + merges, hashes ); $f$); @@ -187,12 +194,12 @@ SELECT run_command_on_master_and_workers($f$ OPERATOR 3 = (user_composite_type, user_composite_type), OPERATOR 4 >= (user_composite_type, user_composite_type), OPERATOR 5 > (user_composite_type, user_composite_type), - + FUNCTION 1 cmp_user_composite_type_function(user_composite_type, user_composite_type); $f$); SELECT run_command_on_master_and_workers($f$ - + CREATE OPERATOR CLASS cats_2_op_fam_class DEFAULT FOR TYPE user_composite_type USING HASH AS OPERATOR 1 = (user_composite_type, user_composite_type), @@ -356,7 +363,7 @@ SET citus.shard_max_size TO "1MB"; CREATE TABLE events_reference_table (like events_table including all); SELECT create_reference_table('events_reference_table'); -CREATE INDEX events_ref_val2 on events_reference_table(value_2); +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index b24532b17..96a991fff 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -36,7 +36,7 @@ test: multi_load_data test: multi_behavioral_analytics_create_table test: multi_behavioral_analytics_basics multi_behavioral_analytics_single_shard_queries multi_insert_select_non_pushable_queries -test: multi_insert_select +test: multi_insert_select multi_insert_select_window # --- # Tests for partitioning support @@ -51,7 +51,7 @@ test: multi_deparse_shard_query multi_distributed_transaction_id test: multi_basic_queries multi_complex_expressions test: multi_explain test: multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics -test: multi_subquery_complex_reference_clause +test: multi_subquery_complex_reference_clause multi_subquery_window_functions test: multi_subquery_in_where_reference_clause test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc test: multi_reference_table diff --git a/src/test/regress/output/multi_behavioral_analytics_create_table.source b/src/test/regress/output/multi_behavioral_analytics_create_table.source index 026634fe6..d0ad1547d 100644 --- a/src/test/regress/output/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/output/multi_behavioral_analytics_create_table.source @@ -51,9 +51,22 @@ SELECT create_distributed_table('agg_results_fourth', 'user_id'); (1 row) +CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('users_ref_test_table'); + create_reference_table +------------------------ + +(1 row) + +INSERT INTO users_ref_test_table VALUES(1,'User_1',45); +INSERT INTO users_ref_test_table VALUES(2,'User_2',46); +INSERT INTO users_ref_test_table VALUES(3,'User_3',47); +INSERT INTO users_ref_test_table VALUES(4,'User_4',48); +INSERT INTO users_ref_test_table VALUES(5,'User_5',49); +INSERT INTO users_ref_test_table VALUES(6,'User_6',50); COPY users_table FROM '@abs_srcdir@/data/users_table.data' WITH CSV; COPY events_table FROM '@abs_srcdir@/data/events_table.data' WITH CSV; --- create indexes for +-- create indexes for CREATE INDEX is_index1 ON users_table(user_id); NOTICE: using one-phase commit for distributed DDL commands HINT: You can enable two-phase commit for extra safety with: SET citus.multi_shard_commit_protocol TO '2pc' @@ -187,10 +200,10 @@ SELECT run_command_on_master_and_workers($f$ LEFTARG = user_composite_type, RIGHTARG = user_composite_type, PROCEDURE = equal_user_composite_type_function, - commutator = =, - RESTRICT = eqsel, - JOIN = eqjoinsel, - merges, + commutator = =, + RESTRICT = eqsel, + JOIN = eqjoinsel, + merges, hashes ); $f$); @@ -261,7 +274,7 @@ SELECT run_command_on_master_and_workers($f$ OPERATOR 3 = (user_composite_type, user_composite_type), OPERATOR 4 >= (user_composite_type, user_composite_type), OPERATOR 5 > (user_composite_type, user_composite_type), - + FUNCTION 1 cmp_user_composite_type_function(user_composite_type, user_composite_type); $f$); run_command_on_master_and_workers @@ -270,7 +283,7 @@ $f$); (1 row) SELECT run_command_on_master_and_workers($f$ - + CREATE OPERATOR CLASS cats_2_op_fam_class DEFAULT FOR TYPE user_composite_type USING HASH AS OPERATOR 1 = (user_composite_type, user_composite_type), @@ -428,7 +441,7 @@ SELECT create_reference_table('events_reference_table'); (1 row) -CREATE INDEX events_ref_val2 on events_reference_table(value_2); +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); SELECT create_reference_table('users_reference_table'); diff --git a/src/test/regress/sql/multi_insert_select_window.sql b/src/test/regress/sql/multi_insert_select_window.sql new file mode 100644 index 000000000..97814c15b --- /dev/null +++ b/src/test/regress/sql/multi_insert_select_window.sql @@ -0,0 +1,769 @@ +-- =================================================================== +-- test insert select functionality for window functions +-- =================================================================== + +TRUNCATE agg_results; + +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- the same test with different syntax +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- similar test with lag +INSERT INTO agg_results (user_id, agg_time, value_2_agg, value_3_agg) +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- simple window function, partitioned and grouped by on the distribution key +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT + user_id, rnk, tme +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as tme + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- top level query has a group by on the result of the window function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + min(user_id), min(time), lag_event_type +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- window functions should work along with joins as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- two window functions in a single subquery should work fine as well +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- window functions should be fine within subquery joins +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg, value_3_agg) +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- GROUP BYs and PARTITION BYs should work fine together +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank; + + -- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- aggregates in the PARTITION BY is also allows +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +INSERT INTO agg_results (user_id, value_1_agg) +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Group by has more columns than partition by which uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_2_agg) +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + + -- Subquery in where with window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Some more nested queries +INSERT INTO agg_results(user_id, value_2_agg, value_3_agg, value_4_agg) +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- test with reference table partitioned on columns from both +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM +( + SELECT + DISTINCT user_id, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window functions with HAVING clause +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window functions with HAVING clause which uses coordinator insert ... select +INSERT INTO agg_results (user_id, value_1_agg) +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window function in View works +CREATE VIEW view_with_window_func AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- Window function in View works and the query uses coordinator insert ... select +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM + view_with_window_func +LIMIT + 10; + +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot run avg(user_id) +SELECT count(*) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id +LIMIT + 5; + +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT + user_id, max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY + user_id; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop +LIMIT + 5; + +-- get some statistics from the aggregated results to ensure the results are correct +-- since there is a limit but not order, we cannot test avg or distinct count +SELECT count(*) FROM agg_results; +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; +TRUNCATE agg_results; + +-- lets have some queries that Citus shouldn't push down +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- w2 should not be pushed down +INSERT INTO agg_results (user_id, value_1_agg, value_2_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +LIMIT + 10; + +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; + +-- GROUP BY includes the partition key, but not the WINDOW function +INSERT INTO agg_results (user_id, agg_time, value_2_agg) +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125; + +-- w2 should not be allowed +INSERT INTO agg_results (user_id, value_2_agg, value_3_agg) +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo; + +-- unsupported window function with an override +INSERT INTO agg_results(user_id, agg_time, value_2_agg) +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a; + + -- Subquery in where with unsupported window function +INSERT INTO agg_results(user_id) +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER () + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id; + +-- Aggregate function on distribution column should error out +INSERT INTO agg_results(user_id, value_2_agg) +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a; + +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +INSERT INTO agg_results(user_id, value_1_agg) +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id + ) +) AS ftop; + +DROP VIEW view_with_window_func; diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 300a79a94..787110511 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -2160,18 +2160,6 @@ GROUP BY ORDER BY types; --- not supported due to window functions -SELECT user_id, - some_vals -FROM ( - SELECT * , - Row_number() over (PARTITION BY "user_id" ORDER BY "user_id") AS "some_vals", - Random() - FROM users_table - ) user_id -ORDER BY 1, - 2 limit 10; - -- not supported due to non relation rte SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index be2aa896e..c76237c1d 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -7,7 +7,7 @@ -- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests -- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; - + SET citus.enable_router_execution TO FALSE; CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); @@ -23,15 +23,6 @@ INSERT INTO users_return_test_table VALUES(4,1,1); INSERT INTO users_return_test_table VALUES(1,3,1); INSERT INTO users_return_test_table VALUES(3,2,2); -CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); -SELECT create_reference_table('users_ref_test_table'); -INSERT INTO users_ref_test_table VALUES(1,'User_1',45); -INSERT INTO users_ref_test_table VALUES(2,'User_2',46); -INSERT INTO users_ref_test_table VALUES(3,'User_3',47); -INSERT INTO users_ref_test_table VALUES(4,'User_4',48); -INSERT INTO users_ref_test_table VALUES(5,'User_5',49); -INSERT INTO users_ref_test_table VALUES(6,'User_6',50); - -- Simple Join test with reference table SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table @@ -101,7 +92,7 @@ SELECT count(*) FROM ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 WHERE subquery_1.user_id = subquery_2.user_id ; - -- Should be able to push down since reference tables are inner joined + -- Should be able to push down since reference tables are inner joined -- with hash distributed tables, the results of those joins are the parts of -- an outer join SELECT subquery_2.id FROM @@ -122,75 +113,75 @@ SELECT * FROM -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- same query as above, reference table is wrapped into a subquery -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should be able to pushdown since reference table is in the -- inner part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) - ) as foo + ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should not be able to pushdown since reference table is in the -- direct outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should not be able to pushdown since reference table is in the -- direct outer part of the left join wrapped into a subquery SELECT * FROM - (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table ON (users_table.user_id = ref_all.value_2); -- should not be able to pushdown since reference table is in the -- outer part of the left join -SELECT - user_id, sum(value_1) -FROM - (SELECT +SELECT + user_id, sum(value_1) +FROM + (SELECT users_table.user_id, users_table.value_1, random() - FROM - events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) - ) as foo - GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; -- should be able to pushdown since reference table is in the -- inner part of the left join @@ -198,18 +189,18 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; @@ -218,23 +209,23 @@ SELECT * FROM ( SELECT DISTINCT foo.user_id FROM - ((SELECT + ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type > 80) as "temp_data_queries" LEFT JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE + WHERE user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; --- we could even suuport the following where the subquery --- on the outer part of the left join contains a reference table +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT users_table.user_id as usr_id, @@ -246,7 +237,7 @@ LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 -- but, we fail to pushdown the following query where join that reference table appears -- wrapped into a subquery SELECT max(events_all.cnt), - events_all.usr_id + events_all.usr_id FROM( SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt @@ -263,141 +254,141 @@ LIMIT 5; SET citus.subquery_pushdown to ON; SELECT user_id, lastseen FROM - (SELECT + (SELECT "some_users_data".user_id, lastseen FROM - (SELECT + (SELECT filter_users_1.user_id, time AS lastseen FROM - (SELECT + (SELECT user_where_1_1.user_id FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 - ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) - filter_users_1 + WHERE + user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) + filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" WHERE - user_id > 12 and user_id < 16 AND + user_id > 12 and user_id < 16 AND user_id = filter_users_1.user_id - ORDER BY + ORDER BY time DESC - LIMIT 1) "last_events_1" + LIMIT 1) "last_events_1" ON TRUE - ORDER BY + ORDER BY time DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 70 - LIMIT 1) "some_users_data" + LIMIT 1) "some_users_data" ON TRUE - ORDER BY + ORDER BY lastseen DESC LIMIT 10) "some_users" -ORDER BY +ORDER BY user_id DESC LIMIT 10; SET citus.subquery_pushdown to OFF; -- NESTED INNER JOINs with reference tables -SELECT - count(*) AS value, "generated_group_field" +SELECT + count(*) AS value, "generated_group_field" FROM - (SELECT + (SELECT DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id", "events"."value_2" - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN - (SELECT + (SELECT user_where_1_1.real_user_id FROM - (SELECT + (SELECT "users"."user_id" as real_user_id - FROM + FROM users_reference_table as "users" WHERE user_id > 10 and user_id < 40 and value_2 > 50 ) user_where_1_1 INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 + WHERE + user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" -GROUP BY - "generated_group_field" -ORDER BY +GROUP BY + "generated_group_field" +ORDER BY generated_group_field DESC, value DESC; -- single level inner joins with reference tables -SELECT - "value_3", count(*) AS cnt +SELECT + "value_3", count(*) AS cnt FROM - (SELECT + (SELECT "value_3", "user_id", random() FROM - (SELECT - users_in_segment_1.user_id, value_3 + (SELECT + users_in_segment_1.user_id, value_3 FROM - (SELECT + (SELECT user_id, value_3 * 2 as value_3 FROM - (SELECT - user_id, value_3 + (SELECT + user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id", value_3 - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 30 ) simple_user_where_1 ) all_buckets_1 ) users_in_segment_1 JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 10 and user_id < 40 and value_2 > 60 ) some_users_data ON ("users_in_segment_1".user_id = "some_users_data".user_id) - ) segmentalias_1) "tempQuery" + ) segmentalias_1) "tempQuery" GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; @@ -407,42 +398,42 @@ SELECT * FROM (SELECT "some_users_data".user_id, "some_recent_users".value_3 FROM - (SELECT + (SELECT filter_users_1.user_id, value_3 FROM - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200) filter_users_1 JOIN LATERAL - (SELECT + (SELECT user_id, value_3 - FROM + FROM events_reference_table as "events" WHERE - user_id > 20 and user_id < 70 AND + user_id > 20 and user_id < 70 AND ("events".user_id = "filter_users_1".user_id) - ORDER BY + ORDER BY value_3 DESC LIMIT 1) "last_events_1" ON true ORDER BY value_3 DESC LIMIT 10) "some_recent_users" JOIN LATERAL - (SELECT + (SELECT "users".user_id - FROM + FROM users_reference_table as "users" - WHERE - "users"."user_id" = "some_recent_users"."user_id" AND + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 200 LIMIT 1) "some_users_data" ON true - ORDER BY + ORDER BY value_3 DESC LIMIT 10) "some_users" -ORDER BY - value_3 DESC +ORDER BY + value_3 DESC LIMIT 10; SET citus.subquery_pushdown to OFF; @@ -451,37 +442,37 @@ SET citus.subquery_pushdown to OFF; SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" LEFT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; @@ -490,42 +481,42 @@ count(*) AS cnt, "generated_group_field" SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_reference_table as "users") "right_group_by_1" - ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -- right join where the inner part of the join includes a reference table --- joined with hash partitioned table using non-equi join +-- joined with hash partitioned table using non-equi join SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT @@ -585,42 +576,42 @@ ORDER BY user_id; -- LEFT JOINs used with INNER JOINs --- events_table and users_reference_table joined +-- events_table and users_reference_table joined -- with event_table.non_part_key < reference_table.any_key SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; @@ -631,31 +622,31 @@ FROM FROM (SELECT user_id, time FROM - (SELECT + (SELECT user_id, time - FROM + FROM events_reference_table as "events" - WHERE + WHERE user_id > 10 and user_id < 40) "events_1" ORDER BY time DESC) "recent_events_1" - GROUP BY + GROUP BY user_id - ORDER BY + ORDER BY max(TIME) DESC) "some_recent_users" FULL JOIN - (SELECT + (SELECT "users".user_id - FROM + FROM users_table as "users" - WHERE - users.value_2 > 50 and users.value_2 < 55) "some_users_data" + WHERE + users.value_2 > 50 and users.value_2 < 55) "some_users_data" ON "some_users_data"."user_id" = "some_recent_users"."user_id" -ORDER BY +ORDER BY user_id limit 50; - -- + -- -- UNIONs and JOINs with reference tables, should error out -- SELECT ("final_query"."event_types") as types @@ -666,130 +657,130 @@ FROM FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION - (SELECT + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION - (SELECT + UNION + (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -ORDER BY +ORDER BY types; - -- reference table exist in the subquery of union, should error out + -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM - ( SELECT + ( SELECT *, random() FROM - (SELECT + (SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM - ( SELECT + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( - (SELECT + (SELECT * FROM - (SELECT + (SELECT "events"."time", 0 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) - UNION + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION (SELECT * FROM ( SELECT * FROM ( - SELECT + SELECT max("users"."time"), 0 AS event, "users"."user_id" - FROM + FROM events_reference_table as "events", users_table as "users" - WHERE + WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) - UNION + UNION (SELECT * FROM - (SELECT + (SELECT "events"."time", 2 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) - UNION + UNION (SELECT * FROM (SELECT "events"."time", 3 AS event, "events"."user_id" - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4) ) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_table as "users" - WHERE - value_1 > 50 and value_1 < 70) AS t + WHERE + value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY +GROUP BY types -ORDER BY +ORDER BY types; --- +-- -- Should error out with UNION ALL Queries on reference tables -- SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType @@ -802,40 +793,40 @@ FROM FROM ( (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 0 AS event - FROM + FROM events_table as "events" - WHERE - event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 1 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 2 AS event - FROM + FROM events_reference_table as "events" - WHERE + WHERE event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) UNION ALL (SELECT * FROM - (SELECT + (SELECT "events"."user_id", "events"."time", 3 AS event - FROM + FROM events_table as "events" - WHERE + WHERE event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 - GROUP BY "t1"."user_id") AS t) "q" + GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" @@ -843,7 +834,7 @@ INNER JOIN GROUP BY types ORDER BY types; --- just a sanity check that we don't allow this if the reference table is on the +-- just a sanity check that we don't allow this if the reference table is on the -- left part of the left join SELECT count(*) FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table @@ -862,58 +853,58 @@ WHERE subquery_1.user_id != subquery_2.user_id ; SELECT count(*) AS cnt, "generated_group_field" FROM - (SELECT + (SELECT "eventQuery"."user_id", random(), generated_group_field FROM - (SELECT + (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM - (SELECT + (SELECT "events"."time", "events"."user_id" as event_user_id - FROM + FROM events_table as "events" - WHERE + WHERE user_id > 80) "temp_data_queries" INNER JOIN - (SELECT + (SELECT "users"."user_id" - FROM + FROM users_reference_table as "users" - WHERE - user_id > 80 and value_2 = 5) "user_filters_1" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN - (SELECT + (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" - FROM + FROM users_table as "users") "left_group_by_1" - ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" - ORDER BY + ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -- two hash partitioned relations are not joined -- on partiton keys although reference table is fine -- to push down -SELECT +SELECT u1.user_id, count(*) -FROM +FROM events_table as e1, users_table as u1 WHERE event_type IN - (SELECT + (SELECT event_type - FROM + FROM events_reference_table as e2 WHERE value_2 = 15 AND value_3 > 25 AND e1.value_2 > e2.value_2 - ) + ) AND u1.user_id > e1.user_id GROUP BY 1 ORDER BY 2 DESC, 1 DESC diff --git a/src/test/regress/sql/multi_subquery_window_functions.sql b/src/test/regress/sql/multi_subquery_window_functions.sql new file mode 100644 index 000000000..634ce790c --- /dev/null +++ b/src/test/regress/sql/multi_subquery_window_functions.sql @@ -0,0 +1,706 @@ +-- =================================================================== +-- test multi subquery functionality for window functions +-- =================================================================== + +CREATE VIEW subq AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; + +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- the same test with different syntax +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY time DESC) as rnk + FROM + events_table +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- similar test with lag +SELECT + user_id, time, lag_event_type, row_no +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type, row_number() OVER my_win as row_no + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +ORDER BY + 4 DESC, 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + +-- simple window function, partitioned and grouped by on the distribution key +SELECT + user_id, rnk, avg_val_2 +FROM +( + SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as avg_val_2 + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +) as foo +ORDER BY + 2 DESC, 1 DESC, 3 DESC +LIMIT + 10; + +-- top level query has a group by on the result of the window function +SELECT + min(user_id), min(time), lag_event_type, count(*) +FROM +( + SELECT + *, lag(event_type) OVER my_win as lag_event_type + FROM + events_table WINDOW my_win AS (PARTITION BY user_id ORDER BY time DESC) +) as foo +GROUP BY + lag_event_type +ORDER BY + 3 DESC NULLS LAST, 1 DESC, 2 DESC +LIMIT + 10; + +-- window functions should work along with joins as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- two window functions in a single subquery should work fine as well +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- window functions should be fine within subquery joins +SELECT sub_1.user_id, max(lag_1), max(rank_1), max(rank_2) FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_1, rank() OVER w2 as rank_1 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 25) ORDER BY events_table.time) +) as sub_1 +JOIN +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1 as lag_2, rank() OVER w2 as rank_2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.value_2 ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id, (events_table.value_2 % 50) ORDER BY events_table.time) +) as sub_2 + ON(sub_1.user_id = sub_2.user_id) + GROUP BY + sub_1.user_id + ORDER BY 3 DESC, 4 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- GROUP BYs and PARTITION BYs should work fine together +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 5 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- aggregates in the PARTITION BY is also allows +SELECT + avg(user_id), max(time), my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY user_id, avg(event_type%10)::int ORDER BY count(*) DESC) +) as foo +WHERE + my_rank > 0 +GROUP BY + my_rank +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- GROUP BY should not necessarly be inclusive of partitioning +-- but this query doesn't make much sense +SELECT + avg(user_id), my_rank +FROM +( + SELECT + user_id, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +) as foo +GROUP BY + my_rank +ORDER BY + 2 DESC, 1 DESC +LIMIT + 10; + + -- Using previously defined supported window function on distribution key +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM users_table + WINDOW + w AS (PARTITION BY user_id), + w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY + 1, 2, 3 DESC +LIMIT + 10; + +-- test with reference table partitioned on columns from both +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY user_id, id) + FROM + users_table, users_ref_test_table + WHERE users_table.value_2=users_ref_test_table.k_no +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; + +-- Group by has more columns than partition by +SELECT * FROM ( + SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +SELECT user_id, max(sum) FROM ( + SELECT + user_id, SUM(value_2) OVER (PARTITION BY user_id, value_1) + FROM + users_table + GROUP BY + user_id, value_1, value_2 +) a +GROUP BY user_id ORDER BY + 2 DESC,1 +LIMIT + 10; + +-- Window functions with HAVING clause +SELECT * FROM ( + SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) + FROM + users_table + GROUP BY + user_id, value_1 HAVING count(*) > 1 +) a +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- Window function in View works +SELECT * +FROM + subq +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +-- Window functions with UNION/UNION ALL works +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; + +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +-- Subquery in where with window function +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL ( + SELECT + avg(value_3) OVER (PARTITION BY user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +GROUP BY + user_id +ORDER BY + user_id DESC +LIMIT + 3; + +-- Some more nested queries +SELECT + user_id, rank, SUM(ABS(value_2 - value_3)) AS difference, COUNT(*) AS distinct_users +FROM ( + SELECT + *, rank() OVER (PARTITION BY user_id ORDER BY value_2 DESC) + FROM ( + SELECT + user_id, value_2, sum(value_3) OVER (PARTITION BY user_id, value_2) as value_3 + FROM users_table + ) AS A +) AS A +GROUP BY + user_id, rank +ORDER BY + difference DESC, rank DESC +LIMIT 20; + +SELECT * FROM ( + SELECT DISTINCT + f3.user_id, ABS(f2.sum - f3.sum) + FROM ( + SELECT DISTINCT + user_id, sum(value_3) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_3 + ) f3, + ( + SELECT DISTINCT + user_id, sum(value_2) OVER (PARTITION BY user_id) + FROM + users_table + GROUP BY + user_id, value_2 + ) f2 +WHERE + f3.user_id=f2.user_id +) a +ORDER BY + abs DESC +LIMIT 10; + + +-- Partition by with aggregate functions. This query does not make much sense since the +-- result of aggregate function will be the same for every row in a partition and it is +-- not going to affect the group that the count function will work on. +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id, MIN(value_2)) + FROM + users_table + GROUP BY + 1 +) a +ORDER BY + 1 DESC +LIMIT + 5; + +EXPLAIN (COSTS FALSE, VERBOSE TRUE) + SELECT * + FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop + ORDER BY 2 DESC, 1 DESC + LIMIT 5; + +-- lets have some queries that Citus shouldn't push down +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (PARTITION BY event_type ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS () +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- user needs to supply partition by which should +-- include the distribution key +SELECT + user_id, time, rnk +FROM +( + SELECT + *, rank() OVER my_win as rnk + FROM + events_table + WINDOW my_win AS (ORDER BY time DESC) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC +LIMIT + 10; + +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (PARTITION BY users_table.user_id+1, (events_table.value_2 % 25) ORDER BY events_table.time) +) as foo +ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT 10; + +-- w2 should not be pushed down +SELECT * FROM +( + SELECT + DISTINCT users_table.user_id, lag(users_table.user_id) OVER w1, rank() OVER w2 + FROM + users_table, events_table + WHERE + users_table.user_id = events_table.user_id and + event_type < 25 + WINDOW w1 AS (PARTITION BY users_table.user_id, events_table.event_type ORDER BY events_table.time), + w2 AS (ORDER BY events_table.time) +) as foo +ORDER BY + 3 DESC, 1 DESC, 2 DESC NULLS LAST +LIMIT + 10; + +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- GROUP BY includes the partition key, but not the WINDOW function +SELECT + user_id, time, my_rank +FROM +( + SELECT + user_id, date_trunc('day', time) as time, rank() OVER my_win as my_rank + FROM + events_table + GROUP BY + user_id, date_trunc('day', time) + WINDOW my_win AS (PARTITION BY date_trunc('day', time) ORDER BY avg(event_type)) +) as foo +WHERE + my_rank > 125 +ORDER BY + 3 DESC, 1 DESC,2 DESC +LIMIT + 10; + +-- Overriding window function but not supported +SELECT * FROM ( + SELECT + user_id, date_trunc('day', time) as time, sum(rank) OVER w2 + FROM ( + SELECT DISTINCT + user_id as user_id, time, rank() over w1 + FROM + users_table + WINDOW + w AS (PARTITION BY time), w1 AS (w ORDER BY value_2, value_3) + ) fab + WINDOW + w2 as (PARTITION BY user_id, time) +) a +ORDER BY +1,2,3; + + +-- Aggregate function on distribution column should error out +SELECT * FROM ( + SELECT + user_id, COUNT(*) OVER (PARTITION BY sum(user_id), MIN(value_2)) + FROM + users_table + GROUP BY + user_id +) a +ORDER BY + 1 DESC, 2 DESC; + +-- test with reference table partitioned on only a column from reference table +SELECT * +FROM +( + SELECT + DISTINCT user_id, it_name, count(id) OVER (PARTITION BY id) + FROM + users_table, users_ref_test_table +) a +ORDER BY + 1, 2, 3 +LIMIT + 20; + +-- UNION ALL with only one of them is not partitioned over distribution column which +-- should not be allowed. +SELECT + max(avg) +FROM +( + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (1, 2, 3, 4, 5)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (6, 7, 8, 9, 10)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (11, 12, 13, 14, 15)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (16, 17, 18, 19, 20)) + UNION ALL + (SELECT avg(value_3) over (partition by user_id), user_id FROM events_table where event_type IN (21, 22, 23, 24, 25)) + UNION ALL + (SELECT avg(value_3) over (partition by event_type), user_id FROM events_table where event_type IN (26, 27, 28, 29, 30)) +) b +GROUP BY user_id +ORDER BY 1 DESC +LIMIT 5; + +-- UNION with only one subquery which has a partition on non-distribution column should +-- error out +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + events_table) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) over (partition by user_id) AS counter + FROM + users_table + UNION + SELECT + user_id, sum(value_2) over (partition by event_type) AS counter + FROM + events_table) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +DROP VIEW subq;