diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index 10fd0125f..b0f139d97 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -1296,7 +1296,13 @@ MasterExtendedOpNode(MultiExtendedOp *originalOpNode, Expr *newExpression = NULL; bool hasAggregates = contain_agg_clause((Node *) originalExpression); - if (hasAggregates) + bool hasWindowFunction = contain_window_function((Node *) originalExpression); + + /* + * if the aggregate belongs to a window function, it is not mutated, but pushed + * down to worker as it is. Master query should treat that as a Var. + */ + if (hasAggregates && !hasWindowFunction) { Node *newNode = MasterAggregateMutator((Node *) originalExpression, walkerContext); @@ -1858,6 +1864,9 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, bool hasNonPartitionColumnDistinctAgg = false; bool repartitionSubquery = false; + /* only window functions that can be pushed down reach here */ + bool pushDownWindowFunction = originalOpNode->hasWindowFuncs; + walkerContext->expressionList = NIL; /* find max of sort group ref index */ @@ -1893,12 +1902,18 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, List *newExpressionList = NIL; ListCell *newExpressionCell = NULL; bool hasAggregates = contain_agg_clause((Node *) originalExpression); + bool hasWindowFunction = contain_window_function((Node *) originalExpression); /* reset walker context */ walkerContext->expressionList = NIL; walkerContext->createGroupByClause = false; - if (hasAggregates) + /* + * If the expression uses aggregates inside window function contain agg + * clause still returns true. We want to make sure it is not a part of + * window function before we proceed. 
+ */ + if (hasAggregates && !hasWindowFunction) { WorkerAggregateWalker((Node *) originalExpression, walkerContext); @@ -2013,6 +2028,8 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, workerExtendedOpNode = CitusMakeNode(MultiExtendedOp); workerExtendedOpNode->distinctClause = NIL; workerExtendedOpNode->hasDistinctOn = false; + workerExtendedOpNode->hasWindowFuncs = originalOpNode->hasWindowFuncs; + workerExtendedOpNode->windowClause = originalOpNode->windowClause; if (!queryHasAggregates) { @@ -2042,13 +2059,48 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, newTargetEntryListForSortClauses); } + if (workerExtendedOpNode->windowClause) + { + List *windowClauseList = workerExtendedOpNode->windowClause; + ListCell *windowClauseCell = NULL; + + foreach(windowClauseCell, windowClauseList) + { + WindowClause *windowClause = (WindowClause *) lfirst(windowClauseCell); + + List *partitionClauseTargetList = + GenerateNewTargetEntriesForSortClauses(originalOpNode->targetList, + windowClause->partitionClause, + &targetProjectionNumber, + &nextSortGroupRefIndex); + List *orderClauseTargetList = + GenerateNewTargetEntriesForSortClauses(originalOpNode->targetList, + windowClause->orderClause, + &targetProjectionNumber, + &nextSortGroupRefIndex); + + newTargetEntryList = list_concat(newTargetEntryList, + partitionClauseTargetList); + newTargetEntryList = list_concat(newTargetEntryList, + orderClauseTargetList); + } + } + workerExtendedOpNode->targetList = newTargetEntryList; /* * If grouped by a partition column whose values are shards have disjoint sets * of partition values, we can push down the having qualifier. + * + * When a query with subquery is provided, we can't determine if + * groupedByDisjointPartitionColumn, therefore we also check if there is a + * window function too. If there is a window function we would know that it + * is safe to push down (i.e. 
it is partitioned on distribution column, and + * if there is a group by, it contains distribution column). + * */ - if (havingQual != NULL && groupedByDisjointPartitionColumn) + if (havingQual != NULL && + (groupedByDisjointPartitionColumn || pushDownWindowFunction)) { workerExtendedOpNode->havingQual = originalOpNode->havingQual; } @@ -2067,7 +2119,10 @@ HasNonPartitionColumnDistinctAgg(List *targetEntryList, Node *havingQual, List *tableNodeList) { List *targetVarList = pull_var_clause((Node *) targetEntryList, - PVC_INCLUDE_AGGREGATES); + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS); + + /* having clause can't have window functions, no need to recurse for that */ List *havingVarList = pull_var_clause((Node *) havingQual, PVC_INCLUDE_AGGREGATES); List *aggregateCheckList = list_concat(targetVarList, havingVarList); @@ -2722,7 +2777,8 @@ ErrorIfContainsUnsupportedAggregate(MultiNode *logicalPlanNode) * PVC_REJECT_PLACEHOLDERS is implicit if PVC_INCLUDE_PLACEHOLDERS isn't * specified. 
*/ - List *expressionList = pull_var_clause((Node *) targetList, PVC_INCLUDE_AGGREGATES); + List *expressionList = pull_var_clause((Node *) targetList, PVC_INCLUDE_AGGREGATES | + PVC_INCLUDE_WINDOWFUNCS); ListCell *expressionCell = NULL; foreach(expressionCell, expressionList) diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index da683914a..41c665a0c 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -161,6 +161,7 @@ static MultiNode * SubqueryMultiNodeTree(Query *originalQuery, plannerRestrictionContext); static MultiNode * SubqueryPushdownMultiNodeTree(Query *queryTree); +static void FlattenJoinVars(List *columnList, Query *queryTree); static List * CreateSubqueryTargetEntryList(List *columnList); static void UpdateVarMappingsForExtendedOpNode(List *columnList, List *subqueryTargetEntryList); @@ -212,6 +213,7 @@ static bool ShouldUseSubqueryPushDown(Query *originalQuery, Query *rewrittenQuery) { List *qualifierList = NIL; + StringInfo errorMessage = NULL; /* * We check the existence of subqueries in FROM clause on the modified query @@ -253,6 +255,13 @@ ShouldUseSubqueryPushDown(Query *originalQuery, Query *rewrittenQuery) return true; } + /* check if the query has a window function and it is safe to pushdown */ + if (originalQuery->hasWindowFuncs && + SafeToPushdownWindowFunction(originalQuery, &errorMessage)) + { + return true; + } + return false; } @@ -850,8 +859,8 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi * We support window functions when the window function * is partitioned on distribution column. 
*/ - if (subqueryTree->windowClause && !SafeToPushdownWindowFunction(subqueryTree, - &errorInfo)) + if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) { errorDetail = (char *) errorInfo->data; preconditionsSatisfied = false; @@ -2079,6 +2088,7 @@ DeferErrorIfQueryNotSupported(Query *queryTree) bool hasComplexJoinOrder = false; bool hasComplexRangeTableType = false; bool preconditionsSatisfied = true; + StringInfo errorInfo = NULL; const char *errorHint = NULL; const char *joinHint = "Consider joining tables on partition column and have " "equal filter on joining columns."; @@ -2097,15 +2107,16 @@ DeferErrorIfQueryNotSupported(Query *queryTree) errorHint = filterHint; } - if (queryTree->hasWindowFuncs) + if (queryTree->hasWindowFuncs && + !SafeToPushdownWindowFunction(queryTree, &errorInfo)) { preconditionsSatisfied = false; errorMessage = "could not run distributed query because the window " "function that is used cannot be pushed down"; errorHint = "Window functions are supported in two ways. Either add " "an equality filter on the distributed tables' partition " - "column or use the window functions inside a subquery with " - "a PARTITION BY clause containing the distribution column"; + "column or use the window functions with a PARTITION BY " + "clause containing the distribution column"; } if (queryTree->setOperations) @@ -3135,6 +3146,8 @@ MultiExtendedOpNode(Query *queryTree) extendedOpNode->havingQual = queryTree->havingQual; extendedOpNode->distinctClause = queryTree->distinctClause; extendedOpNode->hasDistinctOn = queryTree->hasDistinctOn; + extendedOpNode->hasWindowFuncs = queryTree->hasWindowFuncs; + extendedOpNode->windowClause = queryTree->windowClause; return extendedOpNode; } @@ -3396,7 +3409,8 @@ pull_var_clause_default(Node *node) * PVC_REJECT_PLACEHOLDERS is implicit if PVC_INCLUDE_PLACEHOLDERS * isn't specified. 
*/ - List *columnList = pull_var_clause(node, PVC_RECURSE_AGGREGATES); + List *columnList = pull_var_clause(node, PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS); return columnList; } @@ -3768,6 +3782,8 @@ SubqueryPushdownMultiNodeTree(Query *queryTree) havingClauseColumnList = pull_var_clause_default(queryTree->havingQual); columnList = list_concat(targetColumnList, havingClauseColumnList); + FlattenJoinVars(columnList, queryTree); + /* create a target entry for each unique column */ subqueryTargetEntryList = CreateSubqueryTargetEntryList(columnList); @@ -3835,6 +3851,73 @@ SubqueryPushdownMultiNodeTree(Query *queryTree) } +/* + * FlattenJoinVars iterates over provided columnList to identify + * Var's that are referenced from join RTE, and reverts back to their + * original RTEs. + * + * This is required because Postgres allows columns to be referenced using + * a join alias. Therefore the same column from a table could be referenced + * twice using its absolute table name (t1.a), and without table name (a). + * This is a problem when one of them is inside the group by clause and the + * other is not. Postgres is smart about it to detect that both target columns + * resolve to the same thing, and allows a single group by clause to cover + * both target entries when standard planner is called. Since we operate on + * the original query, we want to make sure we provide correct varno/varattno + * values to Postgres so that it could produce valid query. + * + * Only exception is that, if a join is given an alias name, we do not want to + * flatten those var's. If we do, deparsing fails since it expects to see a join + * alias, and cannot access the RTE in the join tree by their names. 
+ */ +static void +FlattenJoinVars(List *columnList, Query *queryTree) +{ + ListCell *columnCell = NULL; + List *rteList = queryTree->rtable; + + foreach(columnCell, columnList) + { + Var *column = (Var *) lfirst(columnCell); + RangeTblEntry *columnRte = NULL; + PlannerInfo *root = NULL; + + Assert(IsA(column, Var)); + + /* + * if join does not have an alias, it is copied over join rte. + * There is no need to find the JoinExpr to check whether it has + * an alias defined. + * + * We use the planner's flatten_join_alias_vars routine to do + * the flattening; it wants a PlannerInfo root node, which + * fortunately can be mostly dummy. + */ + columnRte = rt_fetch(column->varno, rteList); + if (columnRte->rtekind == RTE_JOIN && columnRte->alias == NULL) + { + Var *normalizedVar = NULL; + + if (root == NULL) + { + root = makeNode(PlannerInfo); + root->parse = (queryTree); + root->planner_cxt = CurrentMemoryContext; + root->hasJoinRTEs = true; + } + + normalizedVar = (Var *) flatten_join_alias_vars(root, (Node *) column); + + /* + * We need to copy values over existing one to make sure it is updated on + * respective places. + */ + memcpy(column, normalizedVar, sizeof(Var)); + } + } +} + + /* * CreateSubqueryTargetEntryList creates a target entry for each unique column * in the column list and returns the target entry list. 
diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 2ba9dceb5..c5ae6ce0f 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -1415,6 +1415,8 @@ BuildSubqueryJobQuery(MultiNode *multiNode) bool hasAggregates = false; List *distinctClause = NIL; bool hasDistinctOn = false; + bool hasWindowFuncs = false; + List *windowClause = NIL; /* we start building jobs from below the collect node */ Assert(!CitusIsA(multiNode, MultiCollect)); @@ -1462,6 +1464,8 @@ BuildSubqueryJobQuery(MultiNode *multiNode) havingQual = extendedOp->havingQual; distinctClause = extendedOp->distinctClause; hasDistinctOn = extendedOp->hasDistinctOn; + hasWindowFuncs = extendedOp->hasWindowFuncs; + windowClause = extendedOp->windowClause; } /* build group clauses */ @@ -1508,6 +1512,8 @@ BuildSubqueryJobQuery(MultiNode *multiNode) jobQuery->hasAggs = hasAggregates; jobQuery->hasDistinctOn = hasDistinctOn; jobQuery->distinctClause = distinctClause; + jobQuery->hasWindowFuncs = hasWindowFuncs; + jobQuery->windowClause = windowClause; return jobQuery; } diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index cf5cfa650..4c4448053 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -176,6 +176,8 @@ typedef struct MultiExtendedOp Node *havingQual; List *distinctClause; bool hasDistinctOn; + bool hasWindowFuncs; + List *windowClause; } MultiExtendedOp; diff --git a/src/test/regress/expected/multi_insert_select_window.out b/src/test/regress/expected/multi_insert_select_window.out index f0ba6d581..6df46b924 100644 --- a/src/test/regress/expected/multi_insert_select_window.out +++ b/src/test/regress/expected/multi_insert_select_window.out @@ -654,7 +654,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query 
because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- user needs to supply partition by which should -- include the distribution key INSERT INTO agg_results_window (user_id, agg_time, value_2_agg) @@ -673,7 +673,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- user needs to supply partition by which should -- include the distribution key INSERT INTO agg_results_window (user_id, agg_time, value_2_agg) @@ -692,7 +692,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- w2 should not be pushed down INSERT INTO agg_results_window (user_id, value_1_agg, value_2_agg) SELECT * FROM @@ -710,7 +710,7 @@ SELECT * FROM LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- GROUP BY includes the partition key, but not the WINDOW function INSERT INTO agg_results_window (user_id, agg_time, value_2_agg) SELECT @@ -728,7 +728,7 @@ FROM WHERE my_rank > 125; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- GROUP BY includes the partition key, but not the WINDOW function INSERT INTO agg_results_window (user_id, agg_time, value_2_agg) SELECT @@ -746,7 +746,7 @@ FROM WHERE my_rank > 125; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- w2 should not be allowed INSERT INTO agg_results_window (user_id, value_2_agg, value_3_agg) SELECT * FROM @@ -762,7 +762,7 @@ SELECT * FROM w2 AS (ORDER BY events_table.time) ) as foo; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- unsupported window function with an override INSERT INTO agg_results_window(user_id, agg_time, value_2_agg) SELECT * FROM ( @@ -780,7 +780,7 @@ SELECT * FROM ( w2 as (PARTITION BY user_id, time) ) a; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- Subquery in where with unsupported window function INSERT INTO agg_results_window(user_id) SELECT @@ -812,7 +812,7 @@ SELECT * FROM ( user_id ) a; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- UNION with only one subquery which has a partition on non-distribution column should -- error out INSERT INTO agg_results_window(user_id, value_1_agg) @@ -850,5 +850,5 @@ FROM ( ) ) AS ftop; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column DROP VIEW view_with_window_func; diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index 2f5a4e150..8e44c68b3 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -1129,12 +1129,12 @@ SELECT id, MIN(id) over (order by word_count) FROM articles_hash_mx WHERE author_id = 1 or author_id = 2; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column SELECT LAG(title, 1) over (ORDER BY word_count) prev, title, word_count FROM articles_hash_mx WHERE author_id = 5 or author_id = 2; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- complex query hitting a single shard SELECT count(DISTINCT CASE diff --git a/src/test/regress/expected/multi_router_planner.out b/src/test/regress/expected/multi_router_planner.out index 7e8b40d8f..9c3f7aa26 100644 --- a/src/test/regress/expected/multi_router_planner.out +++ b/src/test/regress/expected/multi_router_planner.out @@ -1306,12 +1306,12 @@ SELECT id, MIN(id) over (order by word_count) FROM articles_hash WHERE author_id = 1 or author_id = 2; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column SELECT LAG(title, 1) over (ORDER BY word_count) prev, title, word_count FROM articles_hash WHERE author_id = 5 or author_id = 2; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- where false queries are router plannable SELECT * FROM articles_hash diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index a41ca453f..03b6b7ecb 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -2581,3 +2581,112 @@ ORDER BY 3 | 1 (4 rows) +-- we've fixed a bug related to joins w/wout alias +-- while implementing top window functions +-- thus adding some tests related to that (i.e., next 3 tests) +WITH users_events AS +( + SELECT + user_id + FROM + users_table +) +SELECT + uid, + event_type, + value_2, + value_3 +FROM ( + (SELECT + user_id as uid + FROM + users_events + ) users + JOIN + events_table + ON + users.uid = events_table.event_type + ) a +ORDER BY + 1,2,3,4 +LIMIT 5; + uid | event_type | value_2 | value_3 +-----+------------+---------+--------- + 1 | 1 | 0 | 2 + 1 | 1 | 0 | 2 + 1 | 1 | 0 | 2 + 1 | 1 | 0 | 2 + 1 | 1 | 0 | 2 +(5 rows) + +-- the following queries are almost the same, +-- the only difference is the final GROUP BY +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT + user_id + FROM + users_table + WHERE + (value_1 > 2) + GROUP BY + user_id + HAVING + count(distinct value_1) > 2 + ) as a + LEFT JOIN + (SELECT + DISTINCT ON (value_2) value_2 , user_id, value_3 + FROM + users_table + WHERE + (value_1 > 3) + ORDER BY + 1,2,3 + ) AS b + USING (user_id) +GROUP BY user_id +ORDER BY 1; + user_id | subquery_avg +---------+------------------------ + 1 | 2.3333333333333333 + 3 | 5.0000000000000000 + 4 | 1.00000000000000000000 + 5 | +(4 rows) + +-- see the comment for the above query +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT + user_id + FROM + users_table + WHERE + (value_1 > 
2) + GROUP BY + user_id + HAVING + count(distinct value_1) > 2 + ) as a + LEFT JOIN + (SELECT + DISTINCT ON (value_2) value_2 , user_id, value_3 + FROM + users_table + WHERE + (value_1 > 3) + ORDER BY + 1,2,3 + ) AS b + USING (user_id) +GROUP BY a.user_id +ORDER BY 1; + user_id | subquery_avg +---------+------------------------ + 1 | 2.3333333333333333 + 3 | 5.0000000000000000 + 4 | 1.00000000000000000000 + 5 | +(4 rows) + diff --git a/src/test/regress/expected/multi_subquery_window_functions.out b/src/test/regress/expected/multi_subquery_window_functions.out index b43e97745..ee4e9eccc 100644 --- a/src/test/regress/expected/multi_subquery_window_functions.out +++ b/src/test/regress/expected/multi_subquery_window_functions.out @@ -791,7 +791,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- user needs to supply partition by which should -- include the distribution key SELECT @@ -809,7 +809,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- user needs to supply partition by which should -- include the distribution key SELECT @@ -827,7 +827,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- w2 should not be pushed down SELECT * FROM ( @@ -844,7 +844,7 @@ SELECT * FROM ORDER BY 3 DESC, 1 DESC, 2 DESC NULLS LAST LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- w2 should not be pushed down SELECT * FROM ( @@ -863,7 +863,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- GROUP BY includes the partition key, but not the WINDOW function SELECT user_id, time, my_rank @@ -884,7 +884,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- GROUP BY includes the partition key, but not the WINDOW function SELECT user_id, time, my_rank @@ -905,7 +905,7 @@ ORDER BY LIMIT 10; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- Overriding window function but not supported SELECT * FROM ( SELECT @@ -924,7 +924,7 @@ SELECT * FROM ( ORDER BY 1,2,3; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- Aggregate function on distribution column should error out SELECT * FROM ( SELECT @@ -937,7 +937,7 @@ SELECT * FROM ( ORDER BY 1 DESC, 2 DESC; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- test with reference table partitioned on only a column from reference table SELECT * FROM @@ -952,7 +952,7 @@ ORDER BY LIMIT 20; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- UNION ALL with only one of them is not partitioned over distribution column which -- should not be allowed. 
SELECT @@ -975,7 +975,7 @@ GROUP BY user_id ORDER BY 1 DESC LIMIT 5; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- UNION with only one subquery which has a partition on non-distribution column should -- error out SELECT * @@ -1012,5 +1012,5 @@ FROM ( ORDER BY 2 DESC, 1 DESC LIMIT 5; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column DROP VIEW subq; diff --git a/src/test/regress/expected/set_operations.out b/src/test/regress/expected/set_operations.out index 2ec8c9c6e..39ba33d63 100644 --- a/src/test/regress/expected/set_operations.out +++ b/src/test/regress/expected/set_operations.out @@ -587,7 +587,7 @@ SELECT x, y, rnk FROM (SELECT *, rank() OVER my_win as rnk FROM test WINDOW my_w UNION SELECT x, y, rnk FROM (SELECT *, rank() OVER my_win as rnk FROM test WINDOW my_win AS (PARTITION BY y ORDER BY x DESC)) as bar; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- other set operations in joins also cannot be pushed down SELECT * FROM ((SELECT * FROM test) EXCEPT (SELECT * FROM test ORDER BY x LIMIT 1)) u JOIN test USING (x) ORDER BY 1,2; DEBUG: push down of limit count: 1 diff --git a/src/test/regress/expected/subqueries_not_supported.out b/src/test/regress/expected/subqueries_not_supported.out index 34c9ee79a..e223d6b1f 100644 --- a/src/test/regress/expected/subqueries_not_supported.out +++ b/src/test/regress/expected/subqueries_not_supported.out @@ -100,7 +100,7 @@ ORDER BY LIMIT 10) as foo; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- OUTER JOINs where the outer part is recursively planned and not the other way -- around is not supported SELECT diff --git a/src/test/regress/expected/window_functions.out b/src/test/regress/expected/window_functions.out new file mode 100644 index 000000000..6982106f3 --- /dev/null +++ b/src/test/regress/expected/window_functions.out @@ -0,0 +1,915 @@ +-- =================================================================== +-- test top level window functions that are pushdownable +-- =================================================================== +-- a very simple window function with an aggregate and a window function +-- distribution column is on the partition by clause +SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id), + rank() OVER (PARTITION BY user_id) +FROM + users_table +ORDER BY + 1 DESC, 2 DESC, 3 DESC +LIMIT 5; + user_id | count | rank +---------+-------+------ + 6 | 10 | 1 + 6 | 10 | 1 + 6 | 10 | 1 + 6 | 10 | 1 + 6 | 10 | 1 +(5 rows) + +-- a more complicated window clause, including an aggregate +-- in both the window clause and the target entry +SELECT + user_id, avg(avg(value_3)) OVER (PARTITION BY user_id, MIN(value_2)) +FROM + users_table +GROUP BY + 1 +ORDER BY + 2 DESC NULLS LAST, 1 DESC; + user_id | avg +---------+------------------ + 2 | 3 + 4 | 2.82608695652174 + 3 | 2.70588235294118 + 6 | 2.6 + 1 | 2.57142857142857 + 5 | 2.46153846153846 +(6 rows) + +-- window clause operates on the results of a subquery +SELECT + user_id, max(value_1) OVER (PARTITION BY user_id, MIN(value_2)) +FROM ( + SELECT + DISTINCT us.user_id, us.value_2, value_1, random() as r1 + FROM + users_table as us, events_table + WHERE + us.user_id = events_table.user_id AND event_type IN (1,2) + ORDER BY + user_id, value_2 + ) s +GROUP BY + 1, value_1 +ORDER BY + 2 DESC, 1; + user_id | max +---------+----- 
+ 1 | 5 + 3 | 5 + 3 | 5 + 4 | 5 + 5 | 5 + 5 | 5 + 6 | 5 + 6 | 5 + 1 | 4 + 2 | 4 + 3 | 4 + 3 | 4 + 3 | 4 + 4 | 4 + 4 | 4 + 5 | 4 + 5 | 4 + 1 | 3 + 2 | 3 + 2 | 3 + 2 | 3 + 6 | 3 + 2 | 2 + 4 | 2 + 4 | 2 + 4 | 2 + 6 | 2 + 1 | 1 + 3 | 1 + 5 | 1 + 6 | 1 + 5 | 0 +(32 rows) + +-- window function operates on the results of +-- a join +SELECT + us.user_id, + SUM(us.value_1) OVER (PARTITION BY us.user_id) +FROM + users_table us + JOIN + events_table ev + ON (us.user_id = ev.user_id) +GROUP BY + 1, + value_1 +ORDER BY + 1, + 2 +LIMIT 5; + user_id | sum +---------+----- + 1 | 13 + 1 | 13 + 1 | 13 + 1 | 13 + 2 | 10 +(5 rows) + +-- the same query, but this time join with an alias +SELECT + user_id, value_1, SUM(j.value_1) OVER (PARTITION BY j.user_id) +FROM + (users_table us + JOIN + events_table ev + USING (user_id ) + ) j +GROUP BY + user_id, + value_1 +ORDER BY + 3 DESC, 2 DESC, 1 DESC +LIMIT 5; + user_id | value_1 | sum +---------+---------+----- + 5 | 5 | 15 + 4 | 5 | 15 + 3 | 5 | 15 + 5 | 4 | 15 + 4 | 4 | 15 +(5 rows) + +-- querying views that have window functions should be ok +CREATE VIEW window_view AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; +-- Window function in View works +SELECT * +FROM + window_view +ORDER BY + 2 DESC, 1 +LIMIT 10; + user_id | rank +---------+------ + 5 | 6 + 2 | 5 + 4 | 5 + 5 | 5 + 2 | 4 + 3 | 4 + 4 | 4 + 5 | 4 + 6 | 4 + 2 | 3 +(10 rows) + +-- the other way around also should work fine +-- query a view using window functions +CREATE VIEW users_view AS SELECT * FROM users_table; +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_view +GROUP BY + user_id, value_1 +HAVING count(*) > 4 +ORDER BY + 2 DESC, 1; + user_id | rank +---------+------ + 4 | 2 + 5 | 2 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(6 rows) + +DROP VIEW users_view, window_view; +-- window function uses columns from two different tables 
+SELECT + DISTINCT ON (events_table.user_id, rnk) events_table.user_id, rank() OVER my_win AS rnk +FROM + events_table, users_table +WHERE + users_table.user_id = events_table.user_id +WINDOW + my_win AS (PARTITION BY events_table.user_id, users_table.value_1 ORDER BY events_table.time DESC) +ORDER BY + rnk DESC, 1 DESC +LIMIT 10; + user_id | rnk +---------+----- + 3 | 121 + 5 | 118 + 2 | 116 + 3 | 115 + 4 | 113 + 2 | 111 + 5 | 109 + 3 | 109 + 4 | 106 + 2 | 106 +(10 rows) + +-- the same query with reference table column is also on the partition by clause +SELECT + DISTINCT ON (events_table.user_id, rnk) events_table.user_id, rank() OVER my_win AS rnk +FROM + events_table, users_ref_test_table uref +WHERE + uref.id = events_table.user_id +WINDOW + my_win AS (PARTITION BY events_table.user_id, uref.k_no ORDER BY events_table.time DESC) +ORDER BY + rnk DESC, 1 DESC +LIMIT 10; + user_id | rnk +---------+----- + 2 | 24 + 2 | 23 + 2 | 22 + 3 | 21 + 2 | 21 + 3 | 20 + 2 | 20 + 3 | 19 + 2 | 19 + 3 | 18 +(10 rows) + +-- similar query with no distribution column is on the partition by clause +-- is not supported +SELECT + DISTINCT ON (events_table.user_id, rnk) events_table.user_id, rank() OVER my_win AS rnk +FROM + events_table, users_ref_test_table uref +WHERE + uref.id = events_table.user_id +WINDOW + my_win AS (PARTITION BY events_table.value_2, uref.k_no ORDER BY events_table.time DESC) +ORDER BY + rnk DESC, 1 DESC +LIMIT 10; +ERROR: could not run distributed query because the window function that is used cannot be pushed down +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column +-- ORDER BY in the window function is an aggragate +SELECT + user_id, rank() OVER my_win as rnk, avg(value_2) as avg_val_2 +FROM + events_table +GROUP BY + user_id, date_trunc('day', time) +WINDOW + my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC) +ORDER BY + 3 DESC, 2 DESC, 1 DESC; + user_id | rnk | avg_val_2 +---------+-----+-------------------- + 1 | 1 | 3.3750000000000000 + 3 | 2 | 3.1666666666666667 + 5 | 1 | 2.6666666666666667 + 6 | 1 | 2.5000000000000000 + 4 | 1 | 2.5000000000000000 + 2 | 1 | 2.4736842105263158 + 4 | 2 | 2.4000000000000000 + 1 | 2 | 2.1428571428571429 + 5 | 2 | 2.0909090909090909 + 6 | 2 | 2.0000000000000000 + 2 | 2 | 2.0000000000000000 + 3 | 1 | 1.8000000000000000 +(12 rows) + +-- lets push the limits of writing complex expressions aling with the window functions +SELECT + COUNT(*) OVER (PARTITION BY user_id, user_id + 1), + rank() OVER (PARTITION BY user_id) as cnt1, + COUNT(*) OVER (PARTITION BY user_id, abs(value_1 - value_2)) as cnt2, + date_trunc('min', lag(time) OVER (PARTITION BY user_id)) as datee, + rank() OVER my_win as rnnk, + avg(CASE + WHEN user_id > 4 + THEN value_1 + ELSE value_2 + END) FILTER (WHERE user_id > 2) OVER my_win_2 as filtered_count, + sum(user_id * (5.0 / (value_1 + value_2 + 0.1)) * value_3) FILTER (WHERE value_1::text LIKE '%1%') OVER my_win_4 as cnt_with_filter_2 +FROM + users_table +WINDOW + my_win AS (PARTITION BY user_id, (value_1%3)::int ORDER BY time DESC), + my_win_2 AS (PARTITION BY user_id, (value_1)::int ORDER BY time DESC), + my_win_3 AS (PARTITION BY user_id, date_trunc('min', time)), + my_win_4 AS (my_win_3 ORDER BY value_2, value_3) +ORDER BY + cnt_with_filter_2 DESC NULLS LAST, filtered_count DESC NULLS LAST, datee DESC NULLS LAST, rnnk DESC, cnt2 DESC, cnt1 DESC, user_id DESC +LIMIT 5; + count | cnt1 | cnt2 | datee | 
rnnk | filtered_count | cnt_with_filter_2 +-------+------+------+--------------------------+------+------------------------+------------------- + 23 | 1 | 7 | Thu Nov 23 02:14:00 2017 | 6 | 0.00000000000000000000 | 72.7272727272727 + 10 | 1 | 3 | Wed Nov 22 23:01:00 2017 | 1 | 1.00000000000000000000 | 57.1428571428571 + 17 | 1 | 5 | Wed Nov 22 23:24:00 2017 | 8 | 3.0000000000000000 | 28.5714285714286 + 17 | 1 | 5 | | 10 | 2.6666666666666667 | 28.5714285714286 + 17 | 1 | 5 | Thu Nov 23 00:15:00 2017 | 7 | 3.6666666666666667 | 24.1935483870968 +(5 rows) + +-- some tests with GROUP BY along with PARTITION BY +SELECT + user_id, + rank() OVER my_win as my_rank, + avg(avg(event_type)) OVER my_win_2 as avg, + max(time) as mx_time +FROM + events_table +GROUP BY + user_id, + value_2 +WINDOW + my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC), + my_win_2 AS (PARTITION BY user_id, avg(user_id) ORDER BY count(*) DESC) +ORDER BY + avg DESC, + mx_time DESC, + my_rank DESC, + user_id DESC; + user_id | my_rank | avg | mx_time +---------+---------+------------------------+--------------------------------- + 6 | 1 | 3.0000000000000000 | Thu Nov 23 14:00:13.20013 2017 + 6 | 2 | 3.0000000000000000 | Thu Nov 23 11:16:13.106691 2017 + 6 | 1 | 3.0000000000000000 | Thu Nov 23 07:27:32.822068 2017 + 3 | 1 | 2.9857142857142857 | Thu Nov 23 16:31:56.219594 2017 + 4 | 2 | 2.9555555555555556 | Thu Nov 23 14:19:25.765876 2017 + 4 | 1 | 2.9555555555555556 | Thu Nov 23 08:36:53.871919 2017 + 1 | 4 | 2.8633333333333333 | Wed Nov 22 21:06:57.457147 2017 + 1 | 1 | 2.8250000000000000 | Thu Nov 23 21:54:46.924477 2017 + 2 | 2 | 2.7738095238095238 | Thu Nov 23 13:27:37.441959 2017 + 1 | 2 | 2.7722222222222222 | Thu Nov 23 09:23:30.994345 2017 + 3 | 1 | 2.7682539682539682 | Thu Nov 23 01:17:49.040685 2017 + 2 | 1 | 2.7142857142857143 | Thu Nov 23 15:58:49.273421 2017 + 1 | 3 | 2.5791666666666667 | Thu Nov 23 11:09:38.074595 2017 + 3 | 1 | 2.5714285714285714 | Thu Nov 23 
16:44:41.903713 2017 + 2 | 1 | 2.5158730158730159 | Thu Nov 23 14:02:47.738901 2017 + 4 | 1 | 2.47777777777777778333 | Thu Nov 23 16:20:33.264457 2017 + 4 | 3 | 2.47777777777777778333 | Thu Nov 23 08:14:18.231273 2017 + 4 | 3 | 2.47777777777777778333 | Thu Nov 23 07:32:45.521278 2017 + 1 | 1 | 2.4000000000000000 | Thu Nov 23 10:23:27.617726 2017 + 2 | 1 | 2.3869047619047619 | Thu Nov 23 17:26:14.563216 2017 + 3 | 1 | 2.3841269841269841 | Thu Nov 23 18:08:26.550729 2017 + 3 | 1 | 2.3841269841269841 | Thu Nov 23 09:38:45.338008 2017 + 3 | 2 | 2.3841269841269841 | Thu Nov 23 06:44:50.887182 2017 + 2 | 2 | 2.3095238095238095 | Thu Nov 23 04:05:16.217731 2017 + 5 | 2 | 2.3000000000000000 | Thu Nov 23 14:28:51.833214 2017 + 5 | 2 | 2.3000000000000000 | Thu Nov 23 14:23:09.889786 2017 + 4 | 1 | 2.2000000000000000 | Thu Nov 23 18:10:21.338399 2017 + 2 | 1 | 2.09126984126984126667 | Thu Nov 23 03:35:04.321504 2017 + 5 | 1 | 2.0000000000000000 | Thu Nov 23 16:11:02.929469 2017 + 5 | 1 | 2.0000000000000000 | Thu Nov 23 14:40:40.467511 2017 + 5 | 1 | 2.0000000000000000 | Thu Nov 23 13:26:45.571108 2017 +(31 rows) + +-- test for range and rows mode and different window functions +-- mostly to make sure that deparsing works fine +SELECT + user_id, + rank() OVER (PARTITION BY user_id ROWS BETWEEN + UNBOUNDED PRECEDING AND CURRENT ROW), + dense_rank() OVER (PARTITION BY user_id RANGE BETWEEN + UNBOUNDED PRECEDING AND CURRENT ROW), + CUME_DIST() OVER (PARTITION BY user_id RANGE BETWEEN + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), + PERCENT_RANK() OVER (PARTITION BY user_id ORDER BY avg(value_1) RANGE BETWEEN + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +FROM + users_table +GROUP BY + 1 +ORDER BY + 4 DESC,3 DESC,2 DESC ,1 DESC; + user_id | rank | dense_rank | cume_dist | percent_rank +---------+------+------------+-----------+-------------- + 6 | 1 | 1 | 1 | 0 + 5 | 1 | 1 | 1 | 0 + 4 | 1 | 1 | 1 | 0 + 3 | 1 | 1 | 1 | 0 + 2 | 1 | 1 | 1 | 0 + 1 | 1 | 1 | 1 | 0 +(6 rows) + +-- 
some tests with GROUP BY, HAVING and LIMIT +SELECT + user_id, sum(event_type) OVER my_win , event_type +FROM + events_table +GROUP BY + user_id, event_type +HAVING count(*) > 2 + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +ORDER BY + 2 DESC, 3 DESC, 1 DESC +LIMIT + 5; + user_id | sum | event_type +---------+-----+------------ + 4 | 4 | 4 + 3 | 4 | 4 + 2 | 4 | 4 + 1 | 4 | 4 + 5 | 3 | 3 +(5 rows) + +-- Group by has more columns than partition by +SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) +FROM + users_table +GROUP BY + user_id, value_1, value_2 +HAVING count(*) > 2 +ORDER BY + 2 DESC, 1 +LIMIT + 10; + user_id | sum +---------+----- + 5 | 3 + 4 | 2 +(2 rows) + +SELECT + DISTINCT ON (user_id) user_id, SUM(value_2) OVER (PARTITION BY user_id) +FROM + users_table +GROUP BY + user_id, value_1, value_2 +HAVING count(*) > 2 +ORDER BY + 1, 2 DESC +LIMIT + 10; + user_id | sum +---------+----- + 4 | 2 + 5 | 3 +(2 rows) + +SELECT + DISTINCT ON (SUM(value_1) OVER (PARTITION BY user_id)) user_id, SUM(value_2) OVER (PARTITION BY user_id) +FROM + users_table +GROUP BY + user_id, value_1, value_2 +HAVING count(*) > 2 +ORDER BY + (SUM(value_1) OVER (PARTITION BY user_id)) , 2 DESC, 1 +LIMIT + 10; + user_id | sum +---------+----- + 5 | 3 + 4 | 2 +(2 rows) + + +-- not a meaningful query, with interesting syntax +SELECT + user_id, + AVG(avg(value_1)) OVER (PARTITION BY user_id, max(user_id), MIN(value_2)), + AVG(avg(user_id)) OVER (PARTITION BY user_id, min(user_id), AVG(value_1)) +FROM + users_table +GROUP BY + 1 +ORDER BY + 3 DESC, 2 DESC, 1 DESC; + user_id | avg | avg +---------+--------------------+------------------------ + 6 | 2.1000000000000000 | 6.0000000000000000 + 5 | 2.6538461538461538 | 5.0000000000000000 + 4 | 2.7391304347826087 | 4.0000000000000000 + 3 | 2.3529411764705882 | 3.0000000000000000 + 2 | 2.3333333333333333 | 2.0000000000000000 + 1 | 3.2857142857142857 | 1.00000000000000000000 +(6 rows) + +EXPLAIN 
+SELECT + user_id, + AVG(avg(value_1)) OVER (PARTITION BY user_id, max(user_id), MIN(value_2)), + AVG(avg(user_id)) OVER (PARTITION BY user_id, min(user_id), AVG(value_1)) +FROM + users_table +GROUP BY + 1 +ORDER BY + 3 DESC, 2 DESC, 1 DESC; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------- + Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.avg_1 DESC, remote_scan.avg DESC, remote_scan.user_id DESC + -> HashAggregate (cost=0.00..0.00 rows=0 width=0) + Group Key: remote_scan.user_id + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=5.06..5.80 rows=33 width=128) + -> Sort (cost=5.06..5.14 rows=33 width=128) + Sort Key: users_table.user_id, (min(users_table.user_id)), (avg(users_table.value_1)) + -> WindowAgg (cost=3.48..4.22 rows=33 width=128) + -> Sort (cost=3.48..3.56 rows=33 width=96) + Sort Key: users_table.user_id, (max(users_table.user_id)), (min(users_table.value_2)) + -> HashAggregate (cost=2.07..2.65 rows=33 width=96) + Group Key: users_table.user_id + -> Seq Scan on users_table_1400000 users_table (cost=0.00..1.33 rows=33 width=12) +(18 rows) + +SELECT + user_id, + 1 + sum(value_1), + 1 + AVG(value_2) OVER (partition by user_id) +FROM + users_table +GROUP BY + user_id, value_2 +ORDER BY + user_id, value_2; + user_id | ?column? | ?column? 
+---------+----------+-------------------- + 1 | 5 | 3.2500000000000000 + 1 | 4 | 3.2500000000000000 + 1 | 6 | 3.2500000000000000 + 1 | 12 | 3.2500000000000000 + 2 | 3 | 3.5000000000000000 + 2 | 5 | 3.5000000000000000 + 2 | 13 | 3.5000000000000000 + 2 | 6 | 3.5000000000000000 + 2 | 17 | 3.5000000000000000 + 2 | 4 | 3.5000000000000000 + 3 | 3 | 4.0000000000000000 + 3 | 13 | 4.0000000000000000 + 3 | 10 | 4.0000000000000000 + 3 | 2 | 4.0000000000000000 + 3 | 17 | 4.0000000000000000 + 4 | 4 | 3.5000000000000000 + 4 | 28 | 3.5000000000000000 + 4 | 1 | 3.5000000000000000 + 4 | 11 | 3.5000000000000000 + 4 | 17 | 3.5000000000000000 + 4 | 8 | 3.5000000000000000 + 5 | 7 | 3.5000000000000000 + 5 | 17 | 3.5000000000000000 + 5 | 24 | 3.5000000000000000 + 5 | 9 | 3.5000000000000000 + 5 | 8 | 3.5000000000000000 + 5 | 10 | 3.5000000000000000 + 6 | 6 | 3.0000000000000000 + 6 | 3 | 3.0000000000000000 + 6 | 9 | 3.0000000000000000 + 6 | 3 | 3.0000000000000000 + 6 | 5 | 3.0000000000000000 +(32 rows) + +EXPLAIN +SELECT + user_id, + 1 + sum(value_1), + 1 + AVG(value_2) OVER (partition by user_id) +FROM + users_table +GROUP BY + user_id, value_2 +ORDER BY + user_id, value_2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.user_id, remote_scan.worker_column_4 + -> HashAggregate (cost=0.00..0.00 rows=0 width=0) + Group Key: remote_scan.user_id, remote_scan.worker_column_4 + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=2.16..3.40 rows=33 width=48) + -> GroupAggregate (cost=2.16..2.82 rows=33 width=16) + Group Key: users_table.user_id, users_table.value_2 + -> Sort (cost=2.16..2.24 rows=33 width=12) + Sort Key: users_table.user_id, users_table.value_2 + -> Seq Scan on users_table_1400000 users_table 
(cost=0.00..1.33 rows=33 width=12) +(15 rows) + +SELECT + user_id, + 1 + sum(value_1), + 1 + AVG(value_2) OVER (partition by user_id) +FROM + users_table +GROUP BY + user_id, value_2 +ORDER BY + 2 DESC, 1 +LIMIT 5; + user_id | ?column? | ?column? +---------+----------+-------------------- + 4 | 28 | 3.5000000000000000 + 5 | 24 | 3.5000000000000000 + 2 | 17 | 3.5000000000000000 + 3 | 17 | 4.0000000000000000 + 4 | 17 | 3.5000000000000000 +(5 rows) + +-- rank and ordering in the reverse order +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by value_2) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, value_2 DESC; + user_id | avg | rank +---------+------------------------+------ + 1 | 3.6666666666666667 | 4 + 1 | 2.5000000000000000 | 3 + 1 | 3.0000000000000000 | 2 + 1 | 4.0000000000000000 | 1 + 2 | 1.5000000000000000 | 6 + 2 | 3.2000000000000000 | 5 + 2 | 1.6666666666666667 | 4 + 2 | 3.0000000000000000 | 3 + 2 | 1.3333333333333333 | 2 + 2 | 2.0000000000000000 | 1 + 3 | 2.6666666666666667 | 5 + 3 | 1.00000000000000000000 | 4 + 3 | 3.0000000000000000 | 3 + 3 | 2.4000000000000000 | 2 + 3 | 1.00000000000000000000 | 1 + 4 | 3.5000000000000000 | 6 + 4 | 3.2000000000000000 | 5 + 4 | 3.3333333333333333 | 4 + 4 | 0.00000000000000000000 | 3 + 4 | 3.0000000000000000 | 2 + 4 | 1.00000000000000000000 | 1 + 5 | 3.0000000000000000 | 6 + 5 | 2.3333333333333333 | 5 + 5 | 1.6000000000000000 | 4 + 5 | 2.8750000000000000 | 3 + 5 | 3.2000000000000000 | 2 + 5 | 3.0000000000000000 | 1 + 6 | 1.3333333333333333 | 5 + 6 | 2.0000000000000000 | 4 + 6 | 4.0000000000000000 | 3 + 6 | 1.00000000000000000000 | 2 + 6 | 2.5000000000000000 | 1 +(32 rows) + +-- order by in the window function is same as avg(value_1) DESC +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC; + user_id | avg | rank 
+---------+------------------------+------ + 1 | 4.0000000000000000 | 1 + 1 | 3.6666666666666667 | 2 + 1 | 3.0000000000000000 | 3 + 1 | 2.5000000000000000 | 4 + 2 | 3.2000000000000000 | 1 + 2 | 3.0000000000000000 | 2 + 2 | 2.0000000000000000 | 3 + 2 | 1.6666666666666667 | 4 + 2 | 1.5000000000000000 | 5 + 2 | 1.3333333333333333 | 6 + 3 | 3.0000000000000000 | 1 + 3 | 2.6666666666666667 | 2 + 3 | 2.4000000000000000 | 3 + 3 | 1.00000000000000000000 | 4 + 3 | 1.00000000000000000000 | 4 + 4 | 3.5000000000000000 | 1 + 4 | 3.3333333333333333 | 2 + 4 | 3.2000000000000000 | 3 + 4 | 3.0000000000000000 | 4 + 4 | 1.00000000000000000000 | 5 + 4 | 0.00000000000000000000 | 6 + 5 | 3.2000000000000000 | 1 + 5 | 3.0000000000000000 | 2 + 5 | 3.0000000000000000 | 2 + 5 | 2.8750000000000000 | 4 + 5 | 2.3333333333333333 | 5 + 5 | 1.6000000000000000 | 6 + 6 | 4.0000000000000000 | 1 + 6 | 2.5000000000000000 | 2 + 6 | 2.0000000000000000 | 3 + 6 | 1.3333333333333333 | 4 + 6 | 1.00000000000000000000 | 5 +(32 rows) + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.user_id, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.rank)) DESC + -> HashAggregate (cost=0.00..0.00 rows=0 width=0) + Group Key: remote_scan.user_id, remote_scan.worker_column_5 + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=3.32..4.14 rows=33 width=80) + -> Sort (cost=3.32..3.40 rows=33 width=56) + Sort Key: users_table.user_id, 
(('1'::numeric / ('1'::numeric + avg(users_table.value_1)))) + -> HashAggregate (cost=1.91..2.49 rows=33 width=56) + Group Key: users_table.user_id, users_table.value_2 + -> Seq Scan on users_table_1400000 users_table (cost=0.00..1.33 rows=33 width=12) +(15 rows) + +-- order by in the window function is same as avg(value_1) DESC +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC; + user_id | avg | rank +---------+------------------------+------ + 1 | 4.0000000000000000 | 1 + 1 | 3.6666666666666667 | 2 + 1 | 3.0000000000000000 | 3 + 1 | 2.5000000000000000 | 4 + 2 | 3.2000000000000000 | 1 + 2 | 3.0000000000000000 | 2 + 2 | 2.0000000000000000 | 3 + 2 | 1.6666666666666667 | 4 + 2 | 1.5000000000000000 | 5 + 2 | 1.3333333333333333 | 6 + 3 | 3.0000000000000000 | 1 + 3 | 2.6666666666666667 | 2 + 3 | 2.4000000000000000 | 3 + 3 | 1.00000000000000000000 | 4 + 3 | 1.00000000000000000000 | 4 + 4 | 3.5000000000000000 | 1 + 4 | 3.3333333333333333 | 2 + 4 | 3.2000000000000000 | 3 + 4 | 3.0000000000000000 | 4 + 4 | 1.00000000000000000000 | 5 + 4 | 0.00000000000000000000 | 6 + 5 | 3.2000000000000000 | 1 + 5 | 3.0000000000000000 | 2 + 5 | 3.0000000000000000 | 2 + 5 | 2.8750000000000000 | 4 + 5 | 2.3333333333333333 | 5 + 5 | 1.6000000000000000 | 6 + 6 | 4.0000000000000000 | 1 + 6 | 2.5000000000000000 | 2 + 6 | 2.0000000000000000 | 3 + 6 | 1.3333333333333333 | 4 + 6 | 1.00000000000000000000 | 5 +(32 rows) + +-- limit is not pushed down to worker !! 
+EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.00..0.00 rows=0 width=0) + -> Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.user_id, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.rank)) DESC + -> HashAggregate (cost=0.00..0.00 rows=0 width=0) + Group Key: remote_scan.user_id, remote_scan.worker_column_5 + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=3.32..4.14 rows=33 width=80) + -> Sort (cost=3.32..3.40 rows=33 width=56) + Sort Key: users_table.user_id, (('1'::numeric / ('1'::numeric + avg(users_table.value_1)))) + -> HashAggregate (cost=1.91..2.49 rows=33 width=56) + Group Key: users_table.user_id, users_table.value_2 + -> Seq Scan on users_table_1400000 users_table (cost=0.00..1.33 rows=33 width=12) +(16 rows) + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.00..0.00 rows=0 width=0) + -> Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.user_id, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.rank)) DESC + -> HashAggregate (cost=0.00..0.00 rows=0 
width=0) + Group Key: remote_scan.user_id, remote_scan.worker_column_5 + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=3.32..4.14 rows=33 width=80) + -> Sort (cost=3.32..3.40 rows=33 width=56) + Sort Key: users_table.user_id, (('1'::numeric / ('1'::numeric + avg(users_table.value_1)))) + -> HashAggregate (cost=1.91..2.49 rows=33 width=56) + Group Key: users_table.user_id, users_table.value_2 + -> Seq Scan on users_table_1400000 users_table (cost=0.00..1.33 rows=33 width=12) +(16 rows) + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + sum(value_2))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.00..0.00 rows=0 width=0) + -> Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.user_id, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.rank)) DESC + -> HashAggregate (cost=0.00..0.00 rows=0 width=0) + Group Key: remote_scan.user_id, remote_scan.worker_column_5 + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=3.15..3.98 rows=33 width=48) + -> Sort (cost=3.15..3.23 rows=33 width=40) + Sort Key: users_table.user_id, ((1 / (1 + sum(users_table.value_2)))) + -> HashAggregate (cost=1.83..2.32 rows=33 width=40) + Group Key: users_table.user_id, users_table.value_2 + -> Seq Scan on users_table_1400000 users_table (cost=0.00..1.33 rows=33 width=12) +(16 rows) + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition 
by user_id order by sum(value_2)) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.00..0.00 rows=0 width=0) + -> Sort (cost=0.00..0.00 rows=0 width=0) + Sort Key: remote_scan.user_id, (pg_catalog.sum(((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)))) / pg_catalog.sum(remote_scan.rank)) DESC + -> HashAggregate (cost=0.00..0.00 rows=0 width=0) + Group Key: remote_scan.user_id, remote_scan.worker_column_5 + -> Custom Scan (Citus Real-Time) (cost=0.00..0.00 rows=0 width=0) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> WindowAgg (cost=2.90..3.56 rows=33 width=40) + -> Sort (cost=2.90..2.99 rows=33 width=32) + Sort Key: users_table.user_id, (sum(users_table.value_2)) + -> HashAggregate (cost=1.74..2.07 rows=33 width=32) + Group Key: users_table.user_id, users_table.value_2 + -> Seq Scan on users_table_1400000 users_table (cost=0.00..1.33 rows=33 width=12) +(16 rows) + diff --git a/src/test/regress/expected/with_basics.out b/src/test/regress/expected/with_basics.out index 53096996a..b615dedb9 100644 --- a/src/test/regress/expected/with_basics.out +++ b/src/test/regress/expected/with_basics.out @@ -471,7 +471,7 @@ ORDER BY LIMIT 5; ERROR: could not run distributed query because the window function that is used cannot be pushed down -HINT: Window functions are supported in two ways. Either add an equality filter on the distributed tables' partition column or use the window functions inside a subquery with a PARTITION BY clause containing the distribution column +HINT: Window functions are supported in two ways. 
Either add an equality filter on the distributed tables' partition column or use the window functions with a PARTITION BY clause containing the distribution column -- Window functions that partition by the distribution column in subqueries in CTEs are ok WITH top_users AS (SELECT * diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 2c4a0fdc0..7f0007ecd 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -31,7 +31,8 @@ test: multi_read_from_secondaries # ---------- test: multi_create_table test: multi_create_table_constraints multi_master_protocol multi_load_data multi_behavioral_analytics_create_table -test: multi_behavioral_analytics_basics multi_behavioral_analytics_single_shard_queries multi_insert_select_non_pushable_queries multi_insert_select multi_insert_select_window multi_shard_update_delete +test: multi_behavioral_analytics_basics multi_behavioral_analytics_single_shard_queries multi_insert_select_non_pushable_queries multi_insert_select +test: multi_insert_select_window multi_shard_update_delete window_functions # ---------- # Tests for partitioning support diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 319cc743c..ae89eb166 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -2313,3 +2313,90 @@ GROUP BY types ORDER BY types; + +-- we've fixed a bug related to joins w/wout alias +-- while implementing top window functions +-- thus adding some tests related to that (i.e., next 3 tests) +WITH users_events AS +( + SELECT + user_id + FROM + users_table +) +SELECT + uid, + event_type, + value_2, + value_3 +FROM ( + (SELECT + user_id as uid + FROM + users_events + ) users + JOIN + events_table + ON + users.uid = events_table.event_type + ) a +ORDER BY + 1,2,3,4 +LIMIT 5; + +-- the following queries are almost the same, +-- the 
only difference is the final GROUP BY +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT + user_id + FROM + users_table + WHERE + (value_1 > 2) + GROUP BY + user_id + HAVING + count(distinct value_1) > 2 + ) as a + LEFT JOIN + (SELECT + DISTINCT ON (value_2) value_2 , user_id, value_3 + FROM + users_table + WHERE + (value_1 > 3) + ORDER BY + 1,2,3 + ) AS b + USING (user_id) +GROUP BY user_id +ORDER BY 1; + +-- see the comment for the above query +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT + user_id + FROM + users_table + WHERE + (value_1 > 2) + GROUP BY + user_id + HAVING + count(distinct value_1) > 2 + ) as a + LEFT JOIN + (SELECT + DISTINCT ON (value_2) value_2 , user_id, value_3 + FROM + users_table + WHERE + (value_1 > 3) + ORDER BY + 1,2,3 + ) AS b + USING (user_id) +GROUP BY a.user_id +ORDER BY 1; diff --git a/src/test/regress/sql/window_functions.sql b/src/test/regress/sql/window_functions.sql new file mode 100644 index 000000000..9fe3ee7ba --- /dev/null +++ b/src/test/regress/sql/window_functions.sql @@ -0,0 +1,421 @@ +-- =================================================================== +-- test top level window functions that are pushdownable +-- =================================================================== + +-- a very simple window function with an aggregate and a window function +-- distribution column is on the partition by clause +SELECT + user_id, COUNT(*) OVER (PARTITION BY user_id), + rank() OVER (PARTITION BY user_id) +FROM + users_table +ORDER BY + 1 DESC, 2 DESC, 3 DESC +LIMIT 5; + +-- a more complicated window clause, including an aggregate +-- in both the window clause and the target entry +SELECT + user_id, avg(avg(value_3)) OVER (PARTITION BY user_id, MIN(value_2)) +FROM + users_table +GROUP BY + 1 +ORDER BY + 2 DESC NULLS LAST, 1 DESC; + +-- window clause operates on the results of a subquery +SELECT + user_id, max(value_1) OVER (PARTITION BY user_id, MIN(value_2)) +FROM ( + SELECT + DISTINCT 
us.user_id, us.value_2, value_1, random() as r1 + FROM + users_table as us, events_table + WHERE + us.user_id = events_table.user_id AND event_type IN (1,2) + ORDER BY + user_id, value_2 + ) s +GROUP BY + 1, value_1 +ORDER BY + 2 DESC, 1; + +-- window function operates on the results of +-- a join +SELECT + us.user_id, + SUM(us.value_1) OVER (PARTITION BY us.user_id) +FROM + users_table us + JOIN + events_table ev + ON (us.user_id = ev.user_id) +GROUP BY + 1, + value_1 +ORDER BY + 1, + 2 +LIMIT 5; + +-- the same query, but this time join with an alias +SELECT + user_id, value_1, SUM(j.value_1) OVER (PARTITION BY j.user_id) +FROM + (users_table us + JOIN + events_table ev + USING (user_id ) + ) j +GROUP BY + user_id, + value_1 +ORDER BY + 3 DESC, 2 DESC, 1 DESC +LIMIT 5; + +-- querying views that have window functions should be ok +CREATE VIEW window_view AS +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_table +GROUP BY + user_id, value_1 +HAVING count(*) > 1; + +-- Window function in View works +SELECT * +FROM + window_view +ORDER BY + 2 DESC, 1 +LIMIT 10; + +-- the other way around also should work fine +-- query a view using window functions +CREATE VIEW users_view AS SELECT * FROM users_table; + +SELECT + DISTINCT user_id, rank() OVER (PARTITION BY user_id ORDER BY value_1) +FROM + users_view +GROUP BY + user_id, value_1 +HAVING count(*) > 4 +ORDER BY + 2 DESC, 1; + +DROP VIEW users_view, window_view; + +-- window function uses columns from two different tables +SELECT + DISTINCT ON (events_table.user_id, rnk) events_table.user_id, rank() OVER my_win AS rnk +FROM + events_table, users_table +WHERE + users_table.user_id = events_table.user_id +WINDOW + my_win AS (PARTITION BY events_table.user_id, users_table.value_1 ORDER BY events_table.time DESC) +ORDER BY + rnk DESC, 1 DESC +LIMIT 10; + +-- the same query with reference table column is also on the partition by clause +SELECT + DISTINCT ON (events_table.user_id, 
rnk) events_table.user_id, rank() OVER my_win AS rnk
+FROM
+  events_table, users_ref_test_table uref
+WHERE
+  uref.id = events_table.user_id
+WINDOW
+  my_win AS (PARTITION BY events_table.user_id, uref.k_no ORDER BY events_table.time DESC)
+ORDER BY
+  rnk DESC, 1 DESC
+LIMIT 10;
+
+-- similar query with no distribution column is on the partition by clause
+-- is not supported
+SELECT
+   DISTINCT ON (events_table.user_id, rnk) events_table.user_id, rank() OVER my_win AS rnk
+FROM
+  events_table, users_ref_test_table uref
+WHERE
+  uref.id = events_table.user_id
+WINDOW
+  my_win AS (PARTITION BY events_table.value_2, uref.k_no ORDER BY events_table.time DESC)
+ORDER BY
+  rnk DESC, 1 DESC
+LIMIT 10;
+
+-- ORDER BY in the window function is an aggregate
+SELECT
+   user_id, rank() OVER my_win as rnk, avg(value_2) as avg_val_2
+FROM
+  events_table
+GROUP BY
+  user_id, date_trunc('day', time)
+WINDOW
+  my_win AS (PARTITION BY user_id ORDER BY avg(event_type) DESC)
+ORDER BY
+  3 DESC, 2 DESC, 1 DESC;
+
+-- let's push the limits of writing complex expressions along with the window functions
+SELECT
+   COUNT(*) OVER (PARTITION BY user_id, user_id + 1),
+   rank() OVER (PARTITION BY user_id) as cnt1,
+   COUNT(*) OVER (PARTITION BY user_id, abs(value_1 - value_2)) as cnt2,
+   date_trunc('min', lag(time) OVER (PARTITION BY user_id)) as datee,
+   rank() OVER my_win as rnnk,
+   avg(CASE
+       WHEN user_id > 4
+           THEN value_1
+       ELSE value_2
+        END) FILTER (WHERE user_id > 2) OVER my_win_2 as filtered_count,
+   sum(user_id * (5.0 / (value_1 + value_2 + 0.1)) * value_3) FILTER (WHERE value_1::text LIKE '%1%') OVER my_win_4 as cnt_with_filter_2
+FROM
+   users_table
+WINDOW
+   my_win AS (PARTITION BY user_id, (value_1%3)::int ORDER BY time DESC),
+   my_win_2 AS (PARTITION BY user_id, (value_1)::int ORDER BY time DESC),
+   my_win_3 AS (PARTITION BY user_id, date_trunc('min', time)),
+   my_win_4 AS (my_win_3 ORDER BY value_2, value_3)
+ORDER BY
+   cnt_with_filter_2 DESC NULLS LAST, filtered_count DESC NULLS
LAST, datee DESC NULLS LAST, rnnk DESC, cnt2 DESC, cnt1 DESC, user_id DESC +LIMIT 5; + +-- some tests with GROUP BY along with PARTITION BY +SELECT + user_id, + rank() OVER my_win as my_rank, + avg(avg(event_type)) OVER my_win_2 as avg, + max(time) as mx_time +FROM + events_table +GROUP BY + user_id, + value_2 +WINDOW + my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC), + my_win_2 AS (PARTITION BY user_id, avg(user_id) ORDER BY count(*) DESC) +ORDER BY + avg DESC, + mx_time DESC, + my_rank DESC, + user_id DESC; + +-- test for range and rows mode and different window functions +-- mostly to make sure that deparsing works fine +SELECT + user_id, + rank() OVER (PARTITION BY user_id ROWS BETWEEN + UNBOUNDED PRECEDING AND CURRENT ROW), + dense_rank() OVER (PARTITION BY user_id RANGE BETWEEN + UNBOUNDED PRECEDING AND CURRENT ROW), + CUME_DIST() OVER (PARTITION BY user_id RANGE BETWEEN + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), + PERCENT_RANK() OVER (PARTITION BY user_id ORDER BY avg(value_1) RANGE BETWEEN + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +FROM + users_table +GROUP BY + 1 +ORDER BY + 4 DESC,3 DESC,2 DESC ,1 DESC; + +-- some tests with GROUP BY, HAVING and LIMIT +SELECT + user_id, sum(event_type) OVER my_win , event_type +FROM + events_table +GROUP BY + user_id, event_type +HAVING count(*) > 2 + WINDOW my_win AS (PARTITION BY user_id, max(event_type) ORDER BY count(*) DESC) +ORDER BY + 2 DESC, 3 DESC, 1 DESC +LIMIT + 5; + +-- Group by has more columns than partition by +SELECT + DISTINCT user_id, SUM(value_2) OVER (PARTITION BY user_id) +FROM + users_table +GROUP BY + user_id, value_1, value_2 +HAVING count(*) > 2 +ORDER BY + 2 DESC, 1 +LIMIT + 10; + +SELECT + DISTINCT ON (user_id) user_id, SUM(value_2) OVER (PARTITION BY user_id) +FROM + users_table +GROUP BY + user_id, value_1, value_2 +HAVING count(*) > 2 +ORDER BY + 1, 2 DESC +LIMIT + 10; + +SELECT + DISTINCT ON (SUM(value_1) OVER (PARTITION BY user_id)) user_id, SUM(value_2) 
OVER (PARTITION BY user_id) +FROM + users_table +GROUP BY + user_id, value_1, value_2 +HAVING count(*) > 2 +ORDER BY + (SUM(value_1) OVER (PARTITION BY user_id)) , 2 DESC, 1 +LIMIT + 10; + +-- not a meaningful query, with interesting syntax +SELECT + user_id, + AVG(avg(value_1)) OVER (PARTITION BY user_id, max(user_id), MIN(value_2)), + AVG(avg(user_id)) OVER (PARTITION BY user_id, min(user_id), AVG(value_1)) +FROM + users_table +GROUP BY + 1 +ORDER BY + 3 DESC, 2 DESC, 1 DESC; + +EXPLAIN +SELECT + user_id, + AVG(avg(value_1)) OVER (PARTITION BY user_id, max(user_id), MIN(value_2)), + AVG(avg(user_id)) OVER (PARTITION BY user_id, min(user_id), AVG(value_1)) +FROM + users_table +GROUP BY + 1 +ORDER BY + 3 DESC, 2 DESC, 1 DESC; + +SELECT + user_id, + 1 + sum(value_1), + 1 + AVG(value_2) OVER (partition by user_id) +FROM + users_table +GROUP BY + user_id, value_2 +ORDER BY + user_id, value_2; + +EXPLAIN +SELECT + user_id, + 1 + sum(value_1), + 1 + AVG(value_2) OVER (partition by user_id) +FROM + users_table +GROUP BY + user_id, value_2 +ORDER BY + user_id, value_2; + +SELECT + user_id, + 1 + sum(value_1), + 1 + AVG(value_2) OVER (partition by user_id) +FROM + users_table +GROUP BY + user_id, value_2 +ORDER BY + 2 DESC, 1 +LIMIT 5; + +-- rank and ordering in the reverse order +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by value_2) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, value_2 DESC; + +-- order by in the window function is same as avg(value_1) DESC +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC; + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC; + +-- order by in the window function is same as avg(value_1) DESC +SELECT + 
user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC; + +-- limit is not pushed down to worker !! +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + avg(value_1))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by 1 / (1 + sum(value_2))) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5; + +EXPLAIN +SELECT + user_id, + avg(value_1), + RANK() OVER (partition by user_id order by sum(value_2)) +FROM + users_table +GROUP BY user_id, value_2 +ORDER BY user_id, avg(value_1) DESC +LIMIT 5;