diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index 0772a6ea4..93a6c9165 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -3051,8 +3051,7 @@ ErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerQueryHasLimit) * ErrorIfUnsupportedTableCombination checks if the given query tree contains any * unsupported range table combinations. For this, the function walks over all * range tables in the join tree, and checks if they correspond to simple relations - * or subqueries. It also checks if there is a join between a regular table and - * a subquery and if join is on more than two range table entries. + * or subqueries. */ static void ErrorIfUnsupportedTableCombination(Query *queryTree) @@ -3062,8 +3061,6 @@ ErrorIfUnsupportedTableCombination(Query *queryTree) ListCell *joinTreeTableIndexCell = NULL; bool unsupporteTableCombination = false; char *errorDetail = NULL; - uint32 relationRangeTableCount = 0; - uint32 subqueryRangeTableCount = 0; /* * Extract all range table indexes from the join tree. Note that sub-queries @@ -3086,15 +3083,8 @@ ErrorIfUnsupportedTableCombination(Query *queryTree) * Check if the range table in the join tree is a simple relation or a * subquery. 
*/ - if (rangeTableEntry->rtekind == RTE_RELATION) - { - relationRangeTableCount++; - } - else if (rangeTableEntry->rtekind == RTE_SUBQUERY) - { - subqueryRangeTableCount++; - } - else + if (rangeTableEntry->rtekind != RTE_RELATION && + rangeTableEntry->rtekind != RTE_SUBQUERY) { unsupporteTableCombination = true; errorDetail = "Table expressions other than simple relations and " @@ -3103,19 +3093,6 @@ ErrorIfUnsupportedTableCombination(Query *queryTree) } } - if ((subqueryRangeTableCount > 0) && (relationRangeTableCount > 0)) - { - unsupporteTableCombination = true; - errorDetail = "Joins between regular tables and subqueries are unsupported"; - } - - if ((relationRangeTableCount > 2) || (subqueryRangeTableCount > 2)) - { - unsupporteTableCombination = true; - errorDetail = "Joins between more than two relations and subqueries are " - "unsupported"; - } - /* finally check and error out if not satisfied */ if (unsupporteTableCombination) { diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index 51ae92940..9e590d7e6 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -105,8 +105,12 @@ static MultiNode * ApplyCartesianProduct(MultiNode *leftNode, MultiNode *rightNo * functions will be removed with upcoming subqery changes. 
*/ static MultiNode * SubqueryPushdownMultiPlanTree(Query *queryTree); + static void ErrorIfSubqueryJoin(Query *queryTree); -static MultiTable * MultiSubqueryPushdownTable(RangeTblEntry *subqueryRangeTableEntry); +static List * CreateSubqueryTargetEntryList(List *columnList); +static void UpdateVarMappingsForExtendedOpNode(List *columnList, + List *subqueryTargetEntryList); +static MultiTable * MultiSubqueryPushdownTable(Query *subquery); /* @@ -1987,67 +1991,109 @@ ApplyCartesianProduct(MultiNode *leftNode, MultiNode *rightNode, * SubqueryPushdownMultiTree creates logical plan for subquery pushdown logic. * Note that this logic will be changed in next iterations, so we decoupled it * from other parts of code although it causes some code duplication. + * + * Current subquery pushdown support in MultiTree logic requires a single range + * table entry in the top most from clause. Therefore we inject a synthetic + * query derived from the top level query and make it the only range table + * entry for the top level query. This way we can push any subquery joins + * down to workers without invoking join order planner. 
*/ static MultiNode * SubqueryPushdownMultiPlanTree(Query *queryTree) { List *targetEntryList = queryTree->targetList; List *qualifierList = NIL; - List *qualifierColumnList = NIL; - List *targetListColumnList = NIL; List *columnList = NIL; - ListCell *columnCell = NULL; + List *targetColumnList = NIL; MultiCollect *subqueryCollectNode = CitusMakeNode(MultiCollect); MultiTable *subqueryNode = NULL; - MultiSelect *selectNode = NULL; MultiProject *projectNode = NULL; MultiExtendedOp *extendedOpNode = NULL; MultiNode *currentTopNode = NULL; - RangeTblEntry *subqueryRangeTableEntry = NULL; - List *subqueryEntryList = SubqueryEntryList(queryTree); + Query *pushedDownQuery = NULL; + List *subqueryTargetEntryList = NIL; + List *havingClauseColumnList = NIL; /* verify we can perform distributed planning on this query */ ErrorIfQueryNotSupported(queryTree); - ErrorIfSubqueryJoin(queryTree); /* extract qualifiers and verify we can plan for them */ qualifierList = QualifierList(queryTree->jointree); ValidateClauseList(qualifierList); /* - * We disregard pulled subqueries. This changes order of range table list. - * We do not allow subquery joins, so we will have only one range table - * entry in range table list after dropping pulled subquery. For this reason, - * here we are updating columns in the most outer query for where clause - * list and target list accordingly. + * We would be creating a new Query and pushing down top level query's + * contents down to it. Join and filter clauses in higher level query would + * be transferred to lower query. Therefore after this function we would + * only have a single range table entry in the top level query. We need to + * create a target list entry in lower query for each column reference in + * upper level query's target list and having clauses. Any column reference + * in the upper query will be updated to have varno=1, and varattno= + * of matching target entry in pushed down query. 
+ * Consider query + * SELECT s1.a, sum(s2.c) + * FROM (some subquery) s1, (some subquery) s2 + * WHERE s1.a = s2.a + * GROUP BY s1.a + * HAVING avg(s2.b); + * + * We want to prepare a multi tree to avoid subquery joins at top level, + * therefore the above query is converted to an equivalent + * SELECT worker_column_0, sum(worker_column_1) + * FROM ( + * SELECT + * s1.a AS worker_column_0, + * s2.c AS worker_column_1, + * s2.b AS worker_column_2 + * FROM (some subquery) s1, (some subquery) s2 + * WHERE s1.a = s2.a) worker_subquery + * GROUP BY worker_column_0 + * HAVING avg(worker_column_2); + * After this conversion MultiTree is created as follows + * + * MultiExtendedOpNode( + * targetList : worker_column_0, sum(worker_column_1) + * groupBy : worker_column_0 + * having : avg(worker_column_2)) + * --->MultiProject (worker_column_0, worker_column_1, worker_column_2) + * --->---> MultiTable (subquery : worker_subquery) + * + * Master and worker queries will be created out of this MultiTree at later stages. */ - Assert(list_length(subqueryEntryList) == 1); - qualifierColumnList = pull_var_clause_default((Node *) qualifierList); - targetListColumnList = pull_var_clause_default((Node *) targetEntryList); + /* + * uniqueColumnList contains all columns returned by subquery. Subquery target + * entry list, subquery range table entry's column name list are derived from + * uniqueColumnList. Columns mentioned in multiProject node and multiExtendedOp + * node are indexed with their respective position in uniqueColumnList. 
+ */ + targetColumnList = pull_var_clause_default((Node *) targetEntryList); + havingClauseColumnList = pull_var_clause_default(queryTree->havingQual); + columnList = list_concat(targetColumnList, havingClauseColumnList); - columnList = list_concat(qualifierColumnList, targetListColumnList); - foreach(columnCell, columnList) - { - Var *column = (Var *) lfirst(columnCell); - column->varno = 1; - } + /* create a target entry for each unique column */ + subqueryTargetEntryList = CreateSubqueryTargetEntryList(columnList); - /* create multi node for the subquery */ - subqueryRangeTableEntry = (RangeTblEntry *) linitial(subqueryEntryList); - subqueryNode = MultiSubqueryPushdownTable(subqueryRangeTableEntry); + /* + * Update varno/varattno fields of columns in columnList to + * point to corresponding target entry in subquery target entry list. + */ + UpdateVarMappingsForExtendedOpNode(columnList, subqueryTargetEntryList); + + /* new query only has target entries, join tree, and rtable*/ + pushedDownQuery = makeNode(Query); + pushedDownQuery->commandType = queryTree->commandType; + pushedDownQuery->targetList = subqueryTargetEntryList; + pushedDownQuery->jointree = copyObject(queryTree->jointree); + pushedDownQuery->rtable = copyObject(queryTree->rtable); + pushedDownQuery->setOperations = copyObject(queryTree->setOperations); + pushedDownQuery->querySource = queryTree->querySource; + + subqueryNode = MultiSubqueryPushdownTable(pushedDownQuery); SetChild((MultiUnaryNode *) subqueryCollectNode, (MultiNode *) subqueryNode); currentTopNode = (MultiNode *) subqueryCollectNode; - /* build select node if the query has selection criteria */ - selectNode = MultiSelectNode(qualifierList); - if (selectNode != NULL) - { - SetChild((MultiUnaryNode *) selectNode, currentTopNode); - currentTopNode = (MultiNode *) selectNode; - } - /* build project node for the columns to project */ projectNode = MultiProjectNode(targetEntryList); SetChild((MultiUnaryNode *) projectNode, currentTopNode); 
@@ -2060,6 +2106,20 @@ SubqueryPushdownMultiPlanTree(Query *queryTree) * in the logical optimizer. */ extendedOpNode = MultiExtendedOpNode(queryTree); + + /* + * Postgres standard planner converts having qual node to a list of and + * clauses and expects havingQual to be of type List when executing the + * query later. This function is called on an original query, therefore + * havingQual has not been converted yet. Perform conversion here. + */ + if (extendedOpNode->havingQual != NULL && + !IsA(extendedOpNode->havingQual, List)) + { + extendedOpNode->havingQual = + (Node *) make_ands_implicit((Expr *) extendedOpNode->havingQual); + } + SetChild((MultiUnaryNode *) extendedOpNode, currentTopNode); currentTopNode = (MultiNode *) extendedOpNode; @@ -2094,22 +2154,105 @@ ErrorIfSubqueryJoin(Query *queryTree) /* - * MultiSubqueryPushdownTable creates a MultiTable from the given subquery range - * table entry and returns it. Note that this sets subquery field of MultiTable - * to subquery of the given range table entry. + * CreateSubqueryTargetEntryList creates a target entry for each unique column + * in the column list and returns the target entry list. 
+ */ +static List * +CreateSubqueryTargetEntryList(List *columnList) +{ + AttrNumber resNo = 1; + ListCell *columnCell = NULL; + List *uniqueColumnList = NIL; + List *subqueryTargetEntryList = NIL; + + foreach(columnCell, columnList) + { + Var *column = (Var *) lfirst(columnCell); + uniqueColumnList = list_append_unique(uniqueColumnList, copyObject(column)); + } + + foreach(columnCell, uniqueColumnList) + { + Var *column = (Var *) lfirst(columnCell); + TargetEntry *newTargetEntry = makeNode(TargetEntry); + StringInfo columnNameString = makeStringInfo(); + + newTargetEntry->expr = (Expr *) copyObject(column); + appendStringInfo(columnNameString, WORKER_COLUMN_FORMAT, resNo); + newTargetEntry->resname = columnNameString->data; + newTargetEntry->resjunk = false; + newTargetEntry->resno = resNo; + + subqueryTargetEntryList = lappend(subqueryTargetEntryList, newTargetEntry); + resNo++; + } + + return subqueryTargetEntryList; +} + + +/* + * UpdateVarMappingsForExtendedOpNode updates varno/varattno fields of columns + * in columnList to point to corresponding target in subquery target entry + * list. 
+ */ +static void +UpdateVarMappingsForExtendedOpNode(List *columnList, List *subqueryTargetEntryList) +{ + ListCell *columnCell = NULL; + foreach(columnCell, columnList) + { + Var *columnOnTheExtendedNode = (Var *) lfirst(columnCell); + ListCell *targetEntryCell = NULL; + foreach(targetEntryCell, subqueryTargetEntryList) + { + TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell); + Var *targetColumn = NULL; + + Assert(IsA(targetEntry->expr, Var)); + targetColumn = (Var *) targetEntry->expr; + if (columnOnTheExtendedNode->varno == targetColumn->varno && + columnOnTheExtendedNode->varattno == targetColumn->varattno) + { + columnOnTheExtendedNode->varno = 1; + columnOnTheExtendedNode->varattno = targetEntry->resno; + break; + } + } + } +} + + +/* + * MultiSubqueryPushdownTable creates a MultiTable from the given subquery, + * populates column list and returns the multitable. */ static MultiTable * -MultiSubqueryPushdownTable(RangeTblEntry *subqueryRangeTableEntry) +MultiSubqueryPushdownTable(Query *subquery) { - Query *subquery = subqueryRangeTableEntry->subquery; + MultiTable *subqueryTableNode = NULL; + StringInfo rteName = makeStringInfo(); + List *columnNamesList = NIL; + ListCell *targetEntryCell = NULL; - MultiTable *subqueryTableNode = CitusMakeNode(MultiTable); + appendStringInfo(rteName, "worker_subquery"); + + foreach(targetEntryCell, subquery->targetList) + { + TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell); + columnNamesList = lappend(columnNamesList, makeString(targetEntry->resname)); + } + + subqueryTableNode = CitusMakeNode(MultiTable); subqueryTableNode->subquery = subquery; - subqueryTableNode->relationId = HEAP_ANALYTICS_SUBQUERY_RELATION_ID; + subqueryTableNode->relationId = SUBQUERY_PUSHDOWN_RELATION_ID; subqueryTableNode->rangeTableId = SUBQUERY_RANGE_TABLE_ID; subqueryTableNode->partitionColumn = NULL; - subqueryTableNode->alias = subqueryRangeTableEntry->alias; - subqueryTableNode->referenceNames = 
subqueryRangeTableEntry->eref; + subqueryTableNode->alias = makeNode(Alias); + subqueryTableNode->alias->aliasname = rteName->data; + subqueryTableNode->referenceNames = makeNode(Alias); + subqueryTableNode->referenceNames->aliasname = rteName->data; + subqueryTableNode->referenceNames->colnames = columnNamesList; return subqueryTableNode; } diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 0de1bab2a..bcab6d92f 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -817,7 +817,7 @@ BaseRangeTableList(MultiNode *multiNode) */ MultiTable *multiTable = (MultiTable *) multiNode; if (multiTable->relationId != SUBQUERY_RELATION_ID && - multiTable->relationId != HEAP_ANALYTICS_SUBQUERY_RELATION_ID) + multiTable->relationId != SUBQUERY_PUSHDOWN_RELATION_ID) { RangeTblEntry *rangeTableEntry = makeNode(RangeTblEntry); rangeTableEntry->inFromCl = true; @@ -1397,6 +1397,7 @@ BuildSubqueryJobQuery(MultiNode *multiNode) List *sortClauseList = NIL; List *groupClauseList = NIL; List *whereClauseList = NIL; + Node *havingQual = NULL; Node *limitCount = NULL; Node *limitOffset = NULL; FromExpr *joinTree = NULL; @@ -1436,7 +1437,7 @@ BuildSubqueryJobQuery(MultiNode *multiNode) targetList = QueryTargetList(multiNode); } - /* extract limit count/offset and sort clauses */ + /* extract limit count/offset, sort and having clauses */ if (extendedOpNodeList != NIL) { MultiExtendedOp *extendedOp = (MultiExtendedOp *) linitial(extendedOpNodeList); @@ -1444,6 +1445,7 @@ BuildSubqueryJobQuery(MultiNode *multiNode) limitCount = extendedOp->limitCount; limitOffset = extendedOp->limitOffset; sortClauseList = extendedOp->sortClauseList; + havingQual = extendedOp->havingQual; } /* build group clauses */ @@ -1473,7 +1475,9 @@ BuildSubqueryJobQuery(MultiNode *multiNode) jobQuery->groupClause = groupClauseList; jobQuery->limitOffset = 
limitOffset; jobQuery->limitCount = limitCount; - jobQuery->hasAggs = contain_agg_clause((Node *) targetList); + jobQuery->havingQual = havingQual; + jobQuery->hasAggs = contain_agg_clause((Node *) targetList) || + contain_agg_clause((Node *) havingQual); return jobQuery; } diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index 8745f53e2..5b11a8ff3 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -24,7 +24,7 @@ #define SUBQUERY_RANGE_TABLE_ID -1 #define SUBQUERY_RELATION_ID 10000 -#define HEAP_ANALYTICS_SUBQUERY_RELATION_ID 10001 +#define SUBQUERY_PUSHDOWN_RELATION_ID 10001 /* diff --git a/src/test/regress/expected/multi_subquery_behavioral_analytics.out b/src/test/regress/expected/multi_subquery_behavioral_analytics.out new file mode 100644 index 000000000..69097972d --- /dev/null +++ b/src/test/regress/expected/multi_subquery_behavioral_analytics.out @@ -0,0 +1,1582 @@ +-- +-- multi subquery behavioral analytics queries aims to expand existing subquery pushdown +-- regression tests to cover more cases +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1430000; +-- ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1430000; +SET citus.subquery_pushdown TO TRUE; +SET citus.enable_router_execution TO FALSE; +------------------------------------ +-- Vanilla funnel query +------------------------------------ +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT u.user_id, e.event_type::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN 
(100, 101, 102) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + user_id | array_length +---------+-------------- + 13 | 172 + 12 | 121 + 23 | 115 + 10 | 114 + 20 | 90 +(5 rows) + +------------------------------------ +-- Funnel grouped by whether or not a user has done an event +-- This has multiple subqueries joinin at the top level +-- Query will be supported when we enable unions +------------------------------------ +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + UNION + ( + SELECT u.user_id, 'step=>2'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (103, 104, 105) + ) + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. 
+-- same query but multiple joins are one level below, returns count of row instead of actual rows +SELECT count(*) +FROM ( + SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event + FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + UNION + ( + SELECT u.user_id, 'step=>2'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (103, 104, 105) + ) + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event + ) t GROUP BY user_id, hasdone_event + ORDER BY user_id) u; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. 
+-- Same queries written without unions +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + SELECT + u.user_id, + CASE WHEN e.event_type IN (100, 101, 102) THEN 'step=>1'::text else 'step==>2'::text END AS event, + e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102, 103, 104, 105) + GROUP BY 1,2,3 + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + user_id | sum | length | hasdone_event +---------+-----+--------+-------------------- + 10 | 1 | 18 | Has not done event + 12 | 1 | 14 | Has done event + 13 | 2 | 18 | Has not done event + 15 | 1 | 18 | Has not done event + 17 | 1 | 18 | Has not done event + 19 | 1 | 14 | Has done event + 20 | 2 | 18 | Has not done event + 23 | 1 | 18 | Has not done event +(8 rows) + +-- same query but multiple joins are one level below, returns count of row instead of actual rows +SELECT count(*) +FROM ( + SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event + FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + SELECT + u.user_id, + CASE WHEN e.event_type in (100, 101, 102) then 'step=>1'::text else 'step==>2'::text END AS event, + e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102, 103, 104, 105) + GROUP BY 1,2,3 + ) 
t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event + ) t GROUP BY user_id, hasdone_event + ORDER BY user_id) u; + count +------- + 8 +(1 row) + +------------------------------------ +-- Funnel, grouped by the number of times a user has done an event +-- These will be supported when we add unions +------------------------------------ +SELECT + user_id, + avg(array_length(events_table, 1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each 
relation should be joined with at least one another relation using distribution keys and equality operator. +SELECT + user_id, + avg(array_length(events_table, 1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +HAVING + avg(array_length(events_table, 1)) > 0 +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. 
+-- Same queries rewritten without using unions + SELECT + user_id, + avg(array_length(events_table, 1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + SELECT + users_table.user_id, + CASE WHEN events_table.event_type > 10 AND events_table.event_type < 12 THEN 'action=>1' ELSE 'action=>2' END AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + (events_table.event_type > 10 AND events_table.event_type < 12 + OR + events_table.event_type > 12 AND events_table.event_type < 14) + GROUP BY 1, 2, 3 + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; + user_id | event_average | count_pay +---------+------------------------+----------- + 69 | 1.00000000000000000000 | 0 + 65 | 1.00000000000000000000 | 0 + 58 | 1.00000000000000000000 | 0 + 49 | 1.00000000000000000000 | 0 + 40 | 1.00000000000000000000 | 0 + 32 | 1.00000000000000000000 | 0 + 29 | 1.00000000000000000000 | 0 + 18 | 1.00000000000000000000 | 0 +(8 rows) + +SELECT + user_id, + avg(array_length(events_table, 1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + SELECT + users_table.user_id, + CASE WHEN events_table.event_type > 10 AND events_table.event_type < 12 THEN 'action=>1' ELSE 'action=>2' END 
AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + (events_table.event_type > 10 AND events_table.event_type < 12 + OR + events_table.event_type > 12 AND events_table.event_type < 14) + GROUP BY 1, 2, 3 + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +HAVING + avg(array_length(events_table, 1)) > 0 +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; + user_id | event_average | count_pay +---------+------------------------+----------- + 69 | 1.00000000000000000000 | 0 + 65 | 1.00000000000000000000 | 0 + 58 | 1.00000000000000000000 | 0 + 49 | 1.00000000000000000000 | 0 + 40 | 1.00000000000000000000 | 0 + 32 | 1.00000000000000000000 | 0 + 29 | 1.00000000000000000000 | 0 + 18 | 1.00000000000000000000 | 0 +(8 rows) + +------------------------------------ +-- Most recently seen users_table events_table +------------------------------------ +-- Note that we don't use ORDER BY/LIMIT yet +------------------------------------ +SELECT + user_id, + user_lastseen, + array_length(event_array, 1) +FROM ( + SELECT + user_id, + max(u.time) as user_lastseen, + array_agg(event_type ORDER BY u.time) AS event_array + FROM ( + SELECT user_id, time + FROM users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 10 AND users_table.value_1 < 12 + ) u LEFT JOIN LATERAL ( + SELECT event_type, time + FROM events_table + WHERE user_id = u.user_id AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) t ON true + GROUP BY 
user_id +) AS shard_union +ORDER BY user_lastseen DESC, user_id; + user_id | user_lastseen | array_length +---------+---------------------------------+-------------- + 12 | Sun Jan 19 01:49:20.372688 2014 | 1 + 20 | Sat Jan 18 14:25:31.817903 2014 | 1 + 42 | Thu Jan 16 07:08:02.651966 2014 | 1 + 56 | Tue Jan 14 12:11:47.27375 2014 | 1 + 57 | Mon Jan 13 14:53:50.494836 2014 | 1 + 65 | Sun Jan 12 03:14:26.810597 2014 | 1 +(6 rows) + +------------------------------------ +-- Count the number of distinct users_table who are in segment X and Y and Z +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT DISTINCT user_id +FROM users_table +WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); +ERROR: could not run distributed query with join types other than INNER or OUTER JOINS +HINT: Consider joining tables on partition column and have equal filter on joining columns. +------------------------------------ +-- Find customers who have done X, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type>101 AND event_type < 110 AND value_3 > 100 AND user_id=users_table.user_id); +ERROR: could not run distributed query with join types other than INNER or OUTER JOINS +HINT: Consider joining tables on partition column and have equal filter on joining columns. 
+------------------------------------ +-- Customers who haven’t done X, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_1 = 101 + AND value_2 >= 5 + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id=users_table.user_id); +ERROR: could not run distributed query with subquery outside the FROM clause +HINT: Consider using an equality filter on the distributed table's partition column. +------------------------------------ +-- Customers who have done X and Y, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 100 + AND value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type!=100 AND value_3 > 100 AND user_id=users_table.user_id) + AND EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id=users_table.user_id); +ERROR: could not run distributed query with join types other than INNER or OUTER JOINS +HINT: Consider joining tables on partition column and have equal filter on joining columns. +------------------------------------ +-- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. 
+------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id=users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id=users_table.user_id); +ERROR: could not run distributed query with join types other than INNER or OUTER JOINS +HINT: Consider joining tables on partition column and have equal filter on joining columns. +------------------------------------ +-- Customers who have done X more than 2 times, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, + value_2 + FROM users_table + WHERE value_1 > 100 + AND value_1 < 124 + AND value_2 >= 5 + AND EXISTS (SELECT user_id + FROM events_table + WHERE event_type > 100 + AND event_type < 124 + AND value_3 > 100 + AND user_id = users_table.user_id + GROUP BY user_id + HAVING Count(*) > 2); +ERROR: could not run distributed query with subquery outside the FROM clause +HINT: Consider using an equality filter on the distributed table's partition column. 
+------------------------------------ +-- Find me all users_table who logged in more than once +------------------------------------ +SELECT user_id, value_1 from +( + SELECT user_id, value_1 From users_table + WHERE value_2 > 100 and user_id = 15 GROUP BY value_1, user_id HAVING count(*) > 1 +) AS a +ORDER BY user_id ASC, value_1 ASC; + user_id | value_1 +---------+--------- + 15 | 212 + 15 | 230 + 15 | 417 + 15 | 490 + 15 | 529 + 15 | 926 +(6 rows) + +-- same query with additional filter to make it not router plannable +SELECT user_id, value_1 from +( + SELECT user_id, value_1 From users_table + WHERE value_2 > 100 and (user_id = 15 OR user_id = 16) GROUP BY value_1, user_id HAVING count(*) > 1 +) AS a +ORDER BY user_id ASC, value_1 ASC; + user_id | value_1 +---------+--------- + 15 | 212 + 15 | 230 + 15 | 417 + 15 | 490 + 15 | 529 + 15 | 926 + 16 | 339 + 16 | 485 + 16 | 717 + 16 | 903 +(10 rows) + +------------------------------------ +-- Find me all users_table who has done some event and has filters +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id +FROM events_table +WHERE + event_type = 16 AND value_2 > 50 +AND user_id IN + (SELECT user_id + FROM users_table + WHERE + value_1 = 15 AND value_2 > 25); +ERROR: could not run distributed query with join types other than INNER or OUTER JOINS +HINT: Consider joining tables on partition column and have equal filter on joining columns. +------------------------------------ +-- Which events_table did people who has done some specific events_table +-- This query will be supported when we have subqueries in where clauses. 
+------------------------------------ +SELECT user_id, event_type FROM events_table +WHERE user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) +GROUP BY user_id, event_type; +ERROR: could not run distributed query with join types other than INNER or OUTER JOINS +HINT: Consider joining tables on partition column and have equal filter on joining columns. +------------------------------------ +-- Find me all the users_table who has done some event more than three times +------------------------------------ +SELECT user_id FROM +( + SELECT + user_id + FROM + events_table + WHERE event_type = 901 + GROUP BY user_id HAVING count(*) > 3 +) AS a +ORDER BY user_id; + user_id +--------- + 57 +(1 row) + +------------------------------------ +-- Find my assets that have the highest probability and fetch their metadata +------------------------------------ +CREATE TEMP TABLE assets AS +SELECT + users_table.user_id, users_table.value_1, prob +FROM + users_table + JOIN + (SELECT + ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 50 + ) temp + ON users_table.user_id = temp.user_id + WHERE users_table.value_1 < 50; + -- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM assets; + count | count | avg +-------+-------+--------------------- + 14371 | 101 | 50.5232064574490293 +(1 row) + +DROP TABLE assets; +-- count number of distinct users who have value_1 equal to 5 or 13 but not 3 +-- original query that fails +SELECT count(*) FROM +( +SELECT user_id +FROM users_table +WHERE (value_1 = '5' + OR value_1 = '13') +AND user_id NOT IN (select user_id from users_table where value_1 = '3') +GROUP BY user_id +HAVING count(distinct value_1) = 2 +) as foo; +ERROR: cannot pushdown the 
subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- previous push down query +SELECT subquery_count FROM + (SELECT count(*) as subquery_count FROM + (SELECT + user_id + FROM + users_table + WHERE + (value_1 = '5' OR value_1 = '13') + GROUP BY user_id + HAVING count(distinct value_1) = 2) as a + LEFT JOIN + (SELECT + user_id + FROM + users_table + WHERE + (value_1 = '3') + GROUP BY user_id) as b on a.user_id = b.user_id WHERE b.user_id IS NULL + GROUP BY a.user_id) AS inner_subquery; + subquery_count +---------------- + 1 +(1 row) + +-- new pushdown query without single range table entry at top requirement +SELECT count(*) as subquery_count +FROM ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 = '5' OR value_1 = '13') + GROUP BY user_id + HAVING count(distinct value_1) = 2 + ) as a + LEFT JOIN ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 = '3') + GROUP BY user_id) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NULL +GROUP BY a.user_id; + subquery_count +---------------- + 1 +(1 row) + +-- multi-subquery-join +-- The first query has filters on partion column to make it router plannable +-- but it is processed by logical planner since we disabled router execution +SELECT + e1.user_id, + sum(view_homepage) AS viewed_homepage, + sum(use_demo) AS use_demo, + sum(enter_credit_card) AS entered_credit_card, + sum(submit_card_info) as submit_card_info, + sum(see_bought_screen) as see_bought_screen +FROM ( + -- Get the first time each user viewed the homepage. 
+ SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE user_id = 1 and + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id +) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND user_id = 1 and + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + LIMIT 1 +) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND user_id = 1 and + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + LIMIT 1 +) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND user_id = 1 and + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + LIMIT 1 +) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND user_id = 1 and + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + LIMIT 1 +) e5 ON true +where e1.user_id = 1 +group by e1.user_id +limit 1; + user_id | viewed_homepage | use_demo | entered_credit_card | submit_card_info | see_bought_screen +---------+-----------------+----------+---------------------+------------------+------------------- + 1 | 1 | | | | +(1 row) + +-- Same query without all limitations +SELECT + e1.user_id, + sum(view_homepage) AS viewed_homepage, + sum(use_demo) AS use_demo, + sum(enter_credit_card) AS entered_credit_card, + sum(submit_card_info) as submit_card_info, + sum(see_bought_screen) as see_bought_screen +FROM ( + -- Get the first time each user viewed the homepage. 
+ SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id +) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time +) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time +) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time +) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time +) e5 ON true +GROUP BY e1.user_id +ORDER BY 6 DESC NULLS LAST, 5 DESC NULLS LAST, 4 DESC NULLS LAST, 3 DESC NULLS LAST, 2 DESC NULLS LAST, 1 +LIMIT 15; + user_id | viewed_homepage | use_demo | entered_credit_card | submit_card_info | see_bought_screen +---------+-----------------+----------+---------------------+------------------+------------------- + 72 | 36 | 36 | 36 | 36 | 36 + 95 | 12 | 12 | 12 | 12 | 12 + 82 | 4 | 4 | 4 | 4 | 4 + 74 | 3 | 3 | 3 | 3 | 3 + 83 | 3 | 3 | 3 | 3 | 3 + 6 | 2 | 2 | 2 | 2 | 2 + 42 | 1 | 1 | 1 | 1 | 1 + 5 | 4 | 4 | 4 | 4 | + 93 | 4 | 4 | 4 | 4 | + 51 | 1 | 1 | 1 | 1 | + 85 | 6 | 6 | 6 | | + 73 | 4 | 4 | 4 | | + 0 | 3 | 3 | 3 | | + 10 | 2 | 2 | 2 | | + 13 | 2 | 2 | 2 | | +(15 rows) + +-- Same query without all limitations but uses having() to show only those submitted their credit card info +SELECT + e1.user_id, + sum(view_homepage) AS viewed_homepage, + sum(use_demo) AS use_demo, + 
sum(enter_credit_card) AS entered_credit_card, + sum(submit_card_info) as submit_card_info, + sum(see_bought_screen) as see_bought_screen +FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id +) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time +) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time +) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time +) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time +) e5 ON true +group by e1.user_id +HAVING sum(submit_card_info) > 0 +ORDER BY 6 DESC NULLS LAST, 5 DESC NULLS LAST, 4 DESC NULLS LAST, 3 DESC NULLS LAST, 2 DESC NULLS LAST, 1 +LIMIT 15; + user_id | viewed_homepage | use_demo | entered_credit_card | submit_card_info | see_bought_screen +---------+-----------------+----------+---------------------+------------------+------------------- + 72 | 36 | 36 | 36 | 36 | 36 + 95 | 12 | 12 | 12 | 12 | 12 + 82 | 4 | 4 | 4 | 4 | 4 + 74 | 3 | 3 | 3 | 3 | 3 + 83 | 3 | 3 | 3 | 3 | 3 + 6 | 2 | 2 | 2 | 2 | 2 + 42 | 1 | 1 | 1 | 1 | 1 + 5 | 4 | 4 | 4 | 4 | + 93 | 4 | 4 | 4 | 4 | + 51 | 1 | 1 | 1 | 1 | +(10 rows) + +-- Explain analyze on this query fails due to #756 +-- avg expression used on order by 
+SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN ( + SELECT + user_id, value_2, value_3 + FROM + users_table + WHERE + (value_1 > 3)) AS b +ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + user_id | subquery_avg +---------+---------------------- + 99 | 456.7446808510638298 + 83 | 469.6037735849056604 + 61 | 486.5869565217391304 + 78 | 434.9009009009009009 + 77 | 449.9313725490196078 +(5 rows) + +-- add having +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN ( + SELECT + user_id, value_2, value_3 + FROM + users_table + WHERE + (value_1 > 3)) AS b +ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +HAVING sum(b.value_3) > 50000 +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + user_id | subquery_avg +---------+---------------------- + 78 | 434.9009009009009009 + 29 | 505.0934579439252336 + 17 | 526.9633027522935780 + 91 | 501.4339622641509434 + 24 | 515.1714285714285714 +(5 rows) + +-- avg on the value_3 is not a resjunk +SELECT a.user_id, avg(b.value_2) as subquery_avg, avg(b.value_3) +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT user_id, value_2, value_3 + FROM users_table + WHERE (value_1 > 3) + ) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3) DESC, 2, 1 +LIMIT 5; + user_id | subquery_avg | avg +---------+----------------------+------------------ + 6 | 523.8247422680412371 | 569.226804123711 + 62 | 497.1545454545454545 | 567.681818181818 + 8 | 524.5894736842105263 | 565.2 + 10 | 502.2017543859649123 | 561.929824561404 + 
16 | 467.5145631067961165 | 561.73786407767 +(5 rows) + +-- a powerful query structure that analyzes users/events +-- using (relation JOIN subquery JOIN relation) +SELECT u.user_id, sub.value_2, sub.value_3, COUNT(e2.user_id) counts +FROM + users_table u + LEFT OUTER JOIN LATERAL + (SELECT * + FROM events_table e1 + WHERE e1.user_id = u.user_id + ORDER BY e1.value_3 DESC + LIMIT 1 + ) sub + ON true + LEFT OUTER JOIN events_table e2 + ON e2.user_id = sub.user_id +WHERE e2.value_2 > 10 and e2.value_2 < 50 AND u.value_2 > 10 and u.value_2 < 50 +GROUP BY u.user_id, sub.value_2, sub.value_3 +ORDER BY 4 DESC, 1 DESC, 2 ASC, 3 ASC +LIMIT 10; + user_id | value_2 | value_3 | counts +---------+---------+---------+-------- + 87 | 807 | 990 | 45 + 25 | 613 | 992 | 40 + 26 | 952 | 982 | 36 + 17 | 277 | 993 | 36 + 83 | 571 | 1000 | 35 + 99 | 309 | 998 | 32 + 96 | 571 | 987 | 30 + 95 | 631 | 997 | 30 + 82 | 444 | 997 | 28 + 57 | 975 | 989 | 25 +(10 rows) + +-- distinct users joined with events +SELECT + avg(events_table.event_type) as avg_type, + count(*) as users_count +FROM events_table + JOIN + (SELECT DISTINCT user_id + FROM users_table + ) as distinct_users + ON distinct_users.user_id = events_table.user_id +GROUP BY distinct_users.user_id +ORDER BY users_count desc, avg_type DESC +LIMIT 5; + avg_type | users_count +----------------------+------------- + 496.5748031496062992 | 127 + 531.1788617886178862 | 123 + 504.6806722689075630 | 119 + 503.7203389830508475 | 118 + 506.3793103448275862 | 116 +(5 rows) + +-- reduce the data set, aggregate and join +SELECT + events_table.event_type, + users_count.ct +FROM events_table + JOIN + (SELECT distinct_users.user_id, count(1) as ct + FROM + (SELECT user_id + FROM users_table + ) as distinct_users + GROUP BY distinct_users.user_id + ) as users_count + ON users_count.user_id = events_table.user_id +ORDER BY users_count.ct desc, event_type DESC +LIMIT 5; + event_type | ct +------------+----- + 996 | 121 + 986 | 121 + 979 | 121 + 975 | 
121 + 960 | 121 +(5 rows) + +--- now, test (subquery JOIN subquery) +SELECT n1.user_id, count_1, total_count +FROM + (SELECT user_id, count(1) as count_1 + FROM users_table + GROUP BY user_id + ) n1 + INNER JOIN + (SELECT user_id, count(1) as total_count + FROM events_table + GROUP BY user_id, event_type + ) n2 + ON (n2.user_id = n1.user_id) +ORDER BY total_count DESC, count_1 DESC, 1 DESC +LIMIT 10; + user_id | count_1 | total_count +---------+---------+------------- + 57 | 105 | 4 + 78 | 112 | 3 + 45 | 111 | 3 + 40 | 107 | 3 + 36 | 106 | 3 + 25 | 105 | 3 + 86 | 100 | 3 + 80 | 100 | 3 + 60 | 100 | 3 + 35 | 100 | 3 +(10 rows) + +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT DISTINCT ON (user_id) user_id, value_2, value_3 + FROM users_table + WHERE (value_1 > 3) + ORDER BY 1,2,3 + ) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + user_id | subquery_avg +---------+--------------------- + 10 | 5.0000000000000000 + 87 | 12.0000000000000000 + 77 | 28.0000000000000000 + 37 | 17.0000000000000000 + 11 | 3.0000000000000000 +(5 rows) + +-- distinct clause must include partition column +-- when used in target list +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT DISTINCT ON (value_2) value_2 , user_id, value_3 + FROM users_table + WHERE (value_1 > 3) + ORDER BY 1,2,3 + ) AS b + USING (user_id) +GROUP BY user_id; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 
+ ) as a + LEFT JOIN + (SELECT DISTINCT ON (value_2, user_id) value_2 , user_id, value_3 + FROM users_table + WHERE (value_1 > 3) + ORDER BY 1,2,3 + ) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + user_id | subquery_avg +---------+---------------------- + 99 | 459.1910112359550562 + 83 | 458.0721649484536082 + 9 | 541.5217391304347826 + 78 | 434.2336448598130841 + 77 | 443.8686868686868687 +(5 rows) + +SELECT user_id, event_type +FROM + (SELECT * + FROM + ( + (SELECT event_type, user_id as a_user_id FROM events_table) AS a + JOIN + (SELECT + ma.user_id AS user_id, ma.value_2 AS value_2, + (GREATEST(coalesce((ma.value_3 * ma.value_2 ) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM users_table AS ma + WHERE (ma.value_2 > 100) + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS ma + ON (a.a_user_id = ma.user_id) + ) AS inner_sub + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS outer_sub +ORDER BY prob DESC, event_type DESC, user_id DESC +LIMIT 10; + user_id | event_type +---------+------------ + 10 | 813 + 10 | 806 + 10 | 805 + 10 | 685 + 10 | 591 + 10 | 442 + 10 | 333 + 10 | 317 + 10 | 244 + 10 | 169 +(10 rows) + +-- very similar query but produces different result due to +-- ordering difference in the previous one's inner query +SELECT user_id, event_type +FROM + (SELECT event_type, user_id as a_user_id FROM events_table) AS a + JOIN + (SELECT + ma.user_id AS user_id, ma.value_2 AS value_2, + (GREATEST(coalesce((ma.value_3 * ma.value_2 ) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM users_table AS ma + WHERE (ma.value_2 > 100) + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS ma + ON (a.a_user_id = ma.user_id) +ORDER BY prob DESC, event_type DESC, user_id DESC +LIMIT 10; + user_id | event_type +---------+------------ + 10 | 998 + 10 | 996 + 10 | 981 + 10 | 975 + 10 | 962 + 10 | 945 + 10 | 945 + 10 | 933 + 10 | 932 + 10 | 915 +(10 rows) + +-- now they produce the same result when 
ordering fixed in 'outer_sub' +SELECT user_id, event_type +FROM + (SELECT * + FROM + ( + (SELECT event_type, user_id as a_user_id FROM events_table) AS a + JOIN + (SELECT + ma.user_id AS user_id, ma.value_2 AS value_2, + (GREATEST(coalesce((ma.value_3 * ma.value_2 ) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM users_table AS ma + WHERE (ma.value_2 > 100) + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS ma + ON (a.a_user_id = ma.user_id) + ) AS inner_sub + ORDER BY prob DESC, event_type DESC, user_id DESC + LIMIT 10 + ) AS outer_sub +ORDER BY prob DESC, event_type DESC, user_id DESC +LIMIT 10; + user_id | event_type +---------+------------ + 10 | 998 + 10 | 996 + 10 | 981 + 10 | 975 + 10 | 962 + 10 | 945 + 10 | 945 + 10 | 933 + 10 | 932 + 10 | 915 +(10 rows) + +-- this is one complex join query derived from a user's production query +-- first declare the function on workers on master +-- With array_index: +SELECT * FROM run_command_on_workers('CREATE OR REPLACE FUNCTION array_index(ANYARRAY, ANYELEMENT) + RETURNS INT AS $$ + SELECT i + FROM (SELECT generate_series(array_lower($1, 1), array_upper($1, 1))) g(i) + WHERE $1 [i] = $2 + LIMIT 1; + $$ LANGUAGE sql') +ORDER BY 1,2; + nodename | nodeport | success | result +-----------+----------+---------+----------------- + localhost | 57637 | t | CREATE FUNCTION + localhost | 57638 | t | CREATE FUNCTION +(2 rows) + +CREATE OR REPLACE FUNCTION array_index(ANYARRAY, ANYELEMENT) + RETURNS INT AS $$ + SELECT i + FROM (SELECT generate_series(array_lower($1, 1), array_upper($1, 1))) g(i) + WHERE $1 [i] = $2 + LIMIT 1; + $$ LANGUAGE sql; +SELECT * +FROM + (SELECT * + FROM + ( + (SELECT + user_id AS user_id_e, + event_type as event_type_e + FROM + events_table + ) AS ma_e + JOIN + (SELECT + value_2, + value_3, + user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id_p AS user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id AS user_id_p + FROM events_table + WHERE (event_type IN (1, 2, 3, 4, 5)) + ) AS ma_p + JOIN + 
(SELECT user_id AS user_id_a + FROM users_table + WHERE (value_2 % 5 = 1) + ) AS a + ON (a.user_id_a = ma_p.user_id_p) + ) + ) AS a_ma_p + ) AS inner_filter_q + JOIN + (SELECT + value_2, + value_3, + user_id AS user_id_ck + FROM events_table + WHERE event_type = ANY(ARRAY [10, 11, 12]) + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS ma_ck + ON (ma_ck.user_id_ck = inner_filter_q.user_id) + ) AS inner_sub_q + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS outer_sub_q + ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10) AS inner_search_q + ON (ma_e.user_id_e = inner_search_q.user_id) + ) AS outer_inner_sub_q + ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, + event_type_e DESC + LIMIT 10) AS outer_outer_sub_q +ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, + event_type_e DESC +LIMIT 10; + user_id_e | event_type_e | value_2 | value_3 | user_id +-----------+--------------+---------+---------+--------- + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 +(10 rows) + +-- top level select * is removed now there is +-- a join at top level. 
+SELECT * +FROM + ( + (SELECT + user_id AS user_id_e, + event_type as event_type_e + FROM + events_table + ) AS ma_e + JOIN + (SELECT + value_2, + value_3, + user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id_p AS user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id AS user_id_p + FROM events_table + WHERE (event_type IN (1, 2, 3, 4, 5)) + ) AS ma_p + JOIN + (SELECT user_id AS user_id_a + FROM users_table + WHERE (value_2 % 5 = 1) + ) AS a + ON (a.user_id_a = ma_p.user_id_p) + ) + ) AS a_ma_p + ) AS inner_filter_q + JOIN + (SELECT + value_2, + value_3, + user_id AS user_id_ck + FROM events_table + WHERE event_type = ANY(ARRAY [10, 11, 12]) + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS ma_ck + ON (ma_ck.user_id_ck = inner_filter_q.user_id) + ) AS inner_sub_q + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS outer_sub_q + ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10) AS inner_search_q + ON (ma_e.user_id_e = inner_search_q.user_id) + ) AS outer_inner_sub_q +ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, + event_type_e DESC +LIMIT 10; + user_id_e | event_type_e | value_2 | value_3 | user_id +-----------+--------------+---------+---------+--------- + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 + 65 | 991 | 167 | 108 | 65 +(10 rows) + +-- drop created functions +SELECT * FROM run_command_on_workers('DROP FUNCTION array_index(ANYARRAY, ANYELEMENT)') +ORDER BY 1,2; + nodename | nodeport | success | result +-----------+----------+---------+--------------- + localhost | 57637 | t | DROP FUNCTION + localhost | 57638 | t | DROP FUNCTION +(2 
rows) + +DROP FUNCTION array_index(ANYARRAY, ANYELEMENT); +SET citus.subquery_pushdown TO FALSE; +SET citus.enable_router_execution TO TRUE; diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index 70033ab9d..7939f58b6 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -208,6 +208,41 @@ FROM 34 | Tue Jan 21 04:15:03.874341 2014 (6 rows) +-- same query with subuqery joins in topmost select +SELECT "some_users_data".user_id, lastseen +FROM + (SELECT user_id, + Max(TIME) AS lastseen + FROM + (SELECT user_id, + TIME + FROM + (SELECT user_id, + TIME + FROM events_table as "events" + WHERE user_id > 10 and user_id < 40) "events_1" + ORDER BY TIME DESC + LIMIT 1000) "recent_events_1" + GROUP BY user_id + ORDER BY max(TIME) DESC) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND users.value_2 > 50 and users.value_2 < 55 + LIMIT 1) "some_users_data" ON TRUE +ORDER BY user_id +limit 50; + user_id | lastseen +---------+--------------------------------- + 19 | Tue Jan 21 05:23:09.26298 2014 + 22 | Tue Jan 21 05:22:28.223506 2014 + 25 | Tue Jan 21 01:10:29.315788 2014 + 31 | Tue Jan 21 02:43:24.591489 2014 + 33 | Tue Jan 21 04:23:35.623056 2014 + 34 | Tue Jan 21 04:15:03.874341 2014 +(6 rows) + -- not supported since JOIN is not on the partition key SELECT * FROM @@ -321,6 +356,50 @@ limit 10; 15 | Tue Jan 21 02:25:36.136461 2014 (10 rows) +-- +-- A similar query with topmost select is dropped +-- and replaced by aggregation. 
Notice the heavy use of limit +-- +SELECT "some_users_data".user_id, MAX(lastseen), count(*) + FROM + (SELECT filter_users_1.user_id, + TIME AS lastseen + FROM + (SELECT user_where_1_1.user_id + FROM + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 + INNER JOIN + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) filter_users_1 + JOIN LATERAL + (SELECT user_id, + TIME + FROM events_table as "events" + WHERE user_id > 12 and user_id < 16 and user_id = filter_users_1.user_id + ORDER BY TIME DESC + LIMIT 1) "last_events_1" ON TRUE + ORDER BY TIME DESC + LIMIT 10) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND "users"."value_2" > 70 + LIMIT 1) "some_users_data" ON TRUE +GROUP BY 1 +ORDER BY 2, 1 DESC +limit 10; + user_id | max | count +---------+---------------------------------+------- + 15 | Tue Jan 21 02:25:36.136461 2014 | 10 + 13 | Tue Jan 21 05:06:48.989766 2014 | 10 + 14 | Tue Jan 21 05:46:51.286381 2014 | 10 +(3 rows) + -- not supported since the inner JOIN is not equi join SELECT user_id, lastseen FROM @@ -586,23 +665,29 @@ DETAIL: Each relation should be joined with at least one another relation using SELECT "value_3", count(*) AS cnt FROM -(SELECT "value_3", "user_id", random() - FROM - (SELECT users_in_segment_1.user_id, value_3 - FROM - (SELECT user_id, value_3 * 2 as value_3 - FROM - (SELECT user_id, value_3 - FROM - (SELECT "users"."user_id", value_3 - FROM users_table as "users" - WHERE user_id > 10 and user_id < 40 and value_2 > 30) simple_user_where_1) all_buckets_1) users_in_segment_1 - JOIN - (SELECT "users"."user_id" - FROM users_table as "users" - WHERE user_id > 10 and user_id < 40 and value_2 > 60) some_users_data - 
ON ("users_in_segment_1".user_id = "some_users_data".user_id)) segmentalias_1) "tempQuery" - GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; + (SELECT "value_3", "user_id", random() + FROM + (SELECT users_in_segment_1.user_id, value_3 + FROM + (SELECT user_id, value_3 * 2 as value_3 + FROM + (SELECT user_id, value_3 + FROM + (SELECT "users"."user_id", value_3 + FROM users_table as "users" + WHERE user_id > 10 and user_id < 40 and value_2 > 30 + ) simple_user_where_1 + ) all_buckets_1 + ) users_in_segment_1 + JOIN + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 10 and user_id < 40 and value_2 > 60 + ) some_users_data + ON ("users_in_segment_1".user_id = "some_users_data".user_id) + ) segmentalias_1) "tempQuery" +GROUP BY "value_3" +ORDER BY cnt, value_3 DESC LIMIT 10; value_3 | cnt ---------+----- 556 | 75 @@ -683,6 +768,45 @@ FROM 21 | 985 (6 rows) +-- nested lateral join at top most level +SELECT "some_users_data".user_id, "some_recent_users".value_3 +FROM + (SELECT filter_users_1.user_id, value_3 + FROM + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200 + ) filter_users_1 + JOIN LATERAL + (SELECT user_id, value_3 + FROM events_table as "events" + WHERE user_id > 20 and user_id < 70 + AND ("events".user_id = "filter_users_1".user_id) + ORDER BY value_3 DESC + LIMIT 1 + ) "last_events_1" ON TRUE + ORDER BY value_3 DESC + LIMIT 10 + ) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND users.value_2 > 200 + LIMIT 1 + ) "some_users_data" ON TRUE +ORDER BY value_3 DESC, user_id ASC +LIMIT 10; + user_id | value_3 +---------+--------- + 44 | 998 + 65 | 996 + 66 | 996 + 37 | 995 + 57 | 989 + 21 | 985 +(6 rows) + -- longer nested lateral joins SELECT * FROM @@ -726,6 +850,45 @@ FROM 21 | 985 (6 rows) +-- longer nested lateral join wth top level join +SELECT 
"some_users_data".user_id, "some_recent_users".value_3 +FROM + (SELECT filter_users_1.user_id, value_3 + FROM + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200 + ) filter_users_1 + JOIN LATERAL + (SELECT user_id, value_3 + FROM events_table as "events" + WHERE user_id > 20 and user_id < 70 + AND ("events".user_id = "filter_users_1".user_id) + ORDER BY value_3 DESC + LIMIT 1 + ) "last_events_1" ON TRUE + ORDER BY value_3 DESC + LIMIT 10 + ) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND users.value_2 > 200 + LIMIT 1 + ) "some_users_data" ON TRUE +ORDER BY value_3 DESC +LIMIT 10; + user_id | value_3 +---------+--------- + 44 | 998 + 65 | 996 + 66 | 996 + 37 | 995 + 57 | 989 + 21 | 985 +(6 rows) + -- LEFT JOINs used with INNER JOINs SELECT count(*) AS cnt, "generated_group_field" diff --git a/src/test/regress/input/multi_subquery.source b/src/test/regress/input/multi_subquery.source index b5bd62148..3077f89ed 100644 --- a/src/test/regress/input/multi_subquery.source +++ b/src/test/regress/input/multi_subquery.source @@ -197,9 +197,7 @@ SELECT count(*) FROM -- (SELECT l_orderkey FROM lineitem_subquery) UNION -- (SELECT l_orderkey FROM lineitem_subquery) --) b; - --- Check that we error out if the outermost query has subquery join. - +-- Check that we error out if inner query has limit but outer query has not. 
SELECT avg(o_totalprice/l_quantity) FROM diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 5868c5a2c..4362c4314 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -40,7 +40,7 @@ test: multi_insert_select test: multi_deparse_shard_query test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery test: multi_explain -test: multi_subquery multi_subquery_complex_queries +test: multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics test: multi_reference_table test: multi_outer_join_reference test: multi_single_relation_subquery diff --git a/src/test/regress/output/multi_subquery.source b/src/test/regress/output/multi_subquery.source index c96274281..5c0d6dab3 100644 --- a/src/test/regress/output/multi_subquery.source +++ b/src/test/regress/output/multi_subquery.source @@ -194,7 +194,7 @@ DETAIL: Union All clauses are currently unsupported -- (SELECT l_orderkey FROM lineitem_subquery) UNION -- (SELECT l_orderkey FROM lineitem_subquery) --) b; --- Check that we error out if the outermost query has subquery join. +-- Check that we error out if inner query has limit but outer query has not. SELECT avg(o_totalprice/l_quantity) FROM @@ -214,8 +214,8 @@ FROM orders_subquery WHERE lineitem_quantities.l_orderkey = o_orderkey) orders_price ON true; -ERROR: cannot perform distributed planning on this query -DETAIL: Join in subqueries is not supported yet +ERROR: cannot push down this subquery +DETAIL: Limit in subquery without limit in the outer query is unsupported -- Check that we error out if the outermost query is a distinct clause. 
SELECT count(DISTINCT a) diff --git a/src/test/regress/output/multi_subquery_0.source b/src/test/regress/output/multi_subquery_0.source index ba1e1495f..8a16c9be4 100644 --- a/src/test/regress/output/multi_subquery_0.source +++ b/src/test/regress/output/multi_subquery_0.source @@ -194,7 +194,7 @@ DETAIL: Union All clauses are currently unsupported -- (SELECT l_orderkey FROM lineitem_subquery) UNION -- (SELECT l_orderkey FROM lineitem_subquery) --) b; --- Check that we error out if the outermost query has subquery join. +-- Check that we error out if inner query has limit but outer query has not. SELECT avg(o_totalprice/l_quantity) FROM @@ -214,8 +214,8 @@ FROM orders_subquery WHERE lineitem_quantities.l_orderkey = o_orderkey) orders_price ON true; -ERROR: cannot perform distributed planning on this query -DETAIL: Join in subqueries is not supported yet +ERROR: cannot push down this subquery +DETAIL: Limit in subquery without limit in the outer query is unsupported -- Check that we error out if the outermost query is a distinct clause. 
SELECT count(DISTINCT a) diff --git a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql new file mode 100644 index 000000000..c4222d004 --- /dev/null +++ b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql @@ -0,0 +1,1292 @@ +-- +-- multi subquery behavioral analytics queries aims to expand existing subquery pushdown +-- regression tests to cover more cases +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- + +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1430000; +-- ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1430000; + +SET citus.subquery_pushdown TO TRUE; +SET citus.enable_router_execution TO FALSE; + +------------------------------------ +-- Vanilla funnel query +------------------------------------ +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT u.user_id, e.event_type::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +------------------------------------ +-- Funnel grouped by whether or not a user has done an event +-- This has multiple subqueries joinin at the top level +-- Query will be supported when we enable unions +------------------------------------ +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND 
u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + UNION + ( + SELECT u.user_id, 'step=>2'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (103, 104, 105) + ) + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + +-- same query but multiple joins are one level below, returns count of row instead of actual rows +SELECT count(*) +FROM ( + SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event + FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + UNION + ( + SELECT u.user_id, 'step=>2'::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (103, 104, 105) + ) + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event + ) t GROUP BY user_id, hasdone_event + ORDER BY user_id) u; + +-- Same queries written without unions +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + 
COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + SELECT + u.user_id, + CASE WHEN e.event_type IN (100, 101, 102) THEN 'step=>1'::text else 'step==>2'::text END AS event, + e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102, 103, 104, 105) + GROUP BY 1,2,3 + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + +-- same query but multiple joins are one level below, returns count of row instead of actual rows +SELECT count(*) +FROM ( + SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event + FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + SELECT + u.user_id, + CASE WHEN e.event_type in (100, 101, 102) then 'step=>1'::text else 'step==>2'::text END AS event, + e.time + FROM users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102, 103, 104, 105) + GROUP BY 1,2,3 + ) t1 LEFT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event + ) t GROUP BY user_id, hasdone_event + ORDER BY user_id) u; + +------------------------------------ +-- Funnel, grouped by the number of times a user has done an event +-- These will be supported when we add unions +------------------------------------ +SELECT + user_id, + avg(array_length(events_table, 1)) AS 
event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; + +SELECT + user_id, + avg(array_length(events_table, 1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + 
users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +HAVING + avg(array_length(events_table, 1)) > 0 +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; + +-- Same queries rewritten without using unions + SELECT + user_id, + avg(array_length(events_table, 1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + SELECT + users_table.user_id, + CASE WHEN events_table.event_type > 10 AND events_table.event_type < 12 THEN 'action=>1' ELSE 'action=>2' END AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + (events_table.event_type > 10 AND events_table.event_type < 12 + OR + events_table.event_type > 12 AND events_table.event_type < 14) + GROUP BY 1, 2, 3 + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; + +SELECT + user_id, + avg(array_length(events_table, 
1)) AS event_average, + count_pay + FROM ( + SELECT + subquery_1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(count_pay, 0) AS count_pay + FROM + ( + SELECT + users_table.user_id, + CASE WHEN events_table.event_type > 10 AND events_table.event_type < 12 THEN 'action=>1' ELSE 'action=>2' END AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + (events_table.event_type > 10 AND events_table.event_type < 12 + OR + events_table.event_type > 12 AND events_table.event_type < 14) + GROUP BY 1, 2, 3 + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top +WHERE + array_ndims(events_table) > 0 +GROUP BY + count_pay, user_id +HAVING + avg(array_length(events_table, 1)) > 0 +ORDER BY + event_average DESC, count_pay DESC, user_id DESC; + +------------------------------------ +-- Most recently seen users_table events_table +------------------------------------ +-- Note that we don't use ORDER BY/LIMIT yet +------------------------------------ +SELECT + user_id, + user_lastseen, + array_length(event_array, 1) +FROM ( + SELECT + user_id, + max(u.time) as user_lastseen, + array_agg(event_type ORDER BY u.time) AS event_array + FROM ( + SELECT user_id, time + FROM users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 10 AND users_table.value_1 < 12 + ) u LEFT JOIN LATERAL ( + SELECT event_type, time + FROM events_table + WHERE user_id = u.user_id AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) t ON true + GROUP BY user_id +) AS shard_union +ORDER BY user_lastseen 
DESC, user_id; + +------------------------------------ +-- Count the number of distinct users_table who are in segment X and Y and Z +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT DISTINCT user_id +FROM users_table +WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); + +------------------------------------ +-- Find customers who have done X, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type>101 AND event_type < 110 AND value_3 > 100 AND user_id=users_table.user_id); + +------------------------------------ +-- Customers who haven’t done X, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_1 = 101 + AND value_2 >= 5 + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id=users_table.user_id); + +------------------------------------ +-- Customers who have done X and Y, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. 
+------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 100 + AND value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type!=100 AND value_3 > 100 AND user_id=users_table.user_id) + AND EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id=users_table.user_id); + +------------------------------------ +-- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id, value_2 FROM users_table WHERE + value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id=users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id=users_table.user_id); + +------------------------------------ +-- Customers who have done X more than 2 times, and satisfy other customer specific criteria +-- This query will be supported when we have subqueries in where clauses. 
+------------------------------------ +SELECT user_id, + value_2 + FROM users_table + WHERE value_1 > 100 + AND value_1 < 124 + AND value_2 >= 5 + AND EXISTS (SELECT user_id + FROM events_table + WHERE event_type > 100 + AND event_type < 124 + AND value_3 > 100 + AND user_id = users_table.user_id + GROUP BY user_id + HAVING Count(*) > 2); + +------------------------------------ +-- Find me all users_table who logged in more than once +------------------------------------ +SELECT user_id, value_1 from +( + SELECT user_id, value_1 From users_table + WHERE value_2 > 100 and user_id = 15 GROUP BY value_1, user_id HAVING count(*) > 1 +) AS a +ORDER BY user_id ASC, value_1 ASC; + +-- same query with additional filter to make it not router plannable +SELECT user_id, value_1 from +( + SELECT user_id, value_1 From users_table + WHERE value_2 > 100 and (user_id = 15 OR user_id = 16) GROUP BY value_1, user_id HAVING count(*) > 1 +) AS a +ORDER BY user_id ASC, value_1 ASC; + +------------------------------------ +-- Find me all users_table who has done some event and has filters +-- This query will be supported when we have subqueries in where clauses. +------------------------------------ +SELECT user_id +FROM events_table +WHERE + event_type = 16 AND value_2 > 50 +AND user_id IN + (SELECT user_id + FROM users_table + WHERE + value_1 = 15 AND value_2 > 25); + +------------------------------------ +-- Which events_table did people who has done some specific events_table +-- This query will be supported when we have subqueries in where clauses. 
+------------------------------------ +SELECT user_id, event_type FROM events_table +WHERE user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) +GROUP BY user_id, event_type; + +------------------------------------ +-- Find me all the users_table who has done some event more than three times +------------------------------------ +SELECT user_id FROM +( + SELECT + user_id + FROM + events_table + WHERE event_type = 901 + GROUP BY user_id HAVING count(*) > 3 +) AS a +ORDER BY user_id; + +------------------------------------ +-- Find my assets that have the highest probability and fetch their metadata +------------------------------------ +CREATE TEMP TABLE assets AS +SELECT + users_table.user_id, users_table.value_1, prob +FROM + users_table + JOIN + (SELECT + ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 50 + ) temp + ON users_table.user_id = temp.user_id + WHERE users_table.value_1 < 50; + + -- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM assets; + +DROP TABLE assets; + +-- count number of distinct users who have value_1 equal to 5 or 13 but not 3 +-- original query that fails +SELECT count(*) FROM +( +SELECT user_id +FROM users_table +WHERE (value_1 = '5' + OR value_1 = '13') +AND user_id NOT IN (select user_id from users_table where value_1 = '3') +GROUP BY user_id +HAVING count(distinct value_1) = 2 +) as foo; + +-- previous push down query +SELECT subquery_count FROM + (SELECT count(*) as subquery_count FROM + (SELECT + user_id + FROM + users_table + WHERE + (value_1 = '5' OR value_1 = '13') + GROUP BY user_id + HAVING count(distinct value_1) = 2) as a + LEFT JOIN + (SELECT + user_id + FROM + users_table + WHERE + (value_1 = '3') + GROUP BY user_id) 
as b on a.user_id = b.user_id WHERE b.user_id IS NULL + GROUP BY a.user_id) AS inner_subquery; + +-- new pushdown query without single range table entry at top requirement +SELECT count(*) as subquery_count +FROM ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 = '5' OR value_1 = '13') + GROUP BY user_id + HAVING count(distinct value_1) = 2 + ) as a + LEFT JOIN ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 = '3') + GROUP BY user_id) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NULL +GROUP BY a.user_id; + +-- multi-subquery-join +-- The first query has filters on partion column to make it router plannable +-- but it is processed by logical planner since we disabled router execution +SELECT + e1.user_id, + sum(view_homepage) AS viewed_homepage, + sum(use_demo) AS use_demo, + sum(enter_credit_card) AS entered_credit_card, + sum(submit_card_info) as submit_card_info, + sum(see_bought_screen) as see_bought_screen +FROM ( + -- Get the first time each user viewed the homepage. 
+ SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE user_id = 1 and + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id +) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND user_id = 1 and + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + LIMIT 1 +) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND user_id = 1 and + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + LIMIT 1 +) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND user_id = 1 and + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + LIMIT 1 +) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND user_id = 1 and + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + LIMIT 1 +) e5 ON true +where e1.user_id = 1 +group by e1.user_id +limit 1; + +-- Same query without all limitations +SELECT + e1.user_id, + sum(view_homepage) AS viewed_homepage, + sum(use_demo) AS use_demo, + sum(enter_credit_card) AS entered_credit_card, + sum(submit_card_info) as submit_card_info, + sum(see_bought_screen) as see_bought_screen +FROM ( + -- Get the first time each user viewed the homepage. 
+ SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id +) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time +) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time +) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time +) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time +) e5 ON true +GROUP BY e1.user_id +ORDER BY 6 DESC NULLS LAST, 5 DESC NULLS LAST, 4 DESC NULLS LAST, 3 DESC NULLS LAST, 2 DESC NULLS LAST, 1 +LIMIT 15; + +-- Same query without all limitations but uses having() to show only those submitted their credit card info +SELECT + e1.user_id, + sum(view_homepage) AS viewed_homepage, + sum(use_demo) AS use_demo, + sum(enter_credit_card) AS entered_credit_card, + sum(submit_card_info) as submit_card_info, + sum(see_bought_screen) as see_bought_screen +FROM ( + -- Get the first time each user viewed the homepage. 
+ SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id +) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time +) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time +) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time +) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time +) e5 ON true +group by e1.user_id +HAVING sum(submit_card_info) > 0 +ORDER BY 6 DESC NULLS LAST, 5 DESC NULLS LAST, 4 DESC NULLS LAST, 3 DESC NULLS LAST, 2 DESC NULLS LAST, 1 +LIMIT 15; + +-- Explain analyze on this query fails due to #756 +-- avg expression used on order by +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN ( + SELECT + user_id, value_2, value_3 + FROM + users_table + WHERE + (value_1 > 3)) AS b +ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + +-- add having +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM ( + SELECT + user_id + FROM + users_table + WHERE + (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN ( + SELECT + user_id, value_2, value_3 + FROM + 
users_table + WHERE + (value_1 > 3)) AS b +ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +HAVING sum(b.value_3) > 50000 +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + +-- avg on the value_3 is not a resjunk +SELECT a.user_id, avg(b.value_2) as subquery_avg, avg(b.value_3) +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT user_id, value_2, value_3 + FROM users_table + WHERE (value_1 > 3) + ) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3) DESC, 2, 1 +LIMIT 5; + +-- a powerful query structure that analyzes users/events +-- using (relation JOIN subquery JOIN relation) +SELECT u.user_id, sub.value_2, sub.value_3, COUNT(e2.user_id) counts +FROM + users_table u + LEFT OUTER JOIN LATERAL + (SELECT * + FROM events_table e1 + WHERE e1.user_id = u.user_id + ORDER BY e1.value_3 DESC + LIMIT 1 + ) sub + ON true + LEFT OUTER JOIN events_table e2 + ON e2.user_id = sub.user_id +WHERE e2.value_2 > 10 and e2.value_2 < 50 AND u.value_2 > 10 and u.value_2 < 50 +GROUP BY u.user_id, sub.value_2, sub.value_3 +ORDER BY 4 DESC, 1 DESC, 2 ASC, 3 ASC +LIMIT 10; + +-- distinct users joined with events +SELECT + avg(events_table.event_type) as avg_type, + count(*) as users_count +FROM events_table + JOIN + (SELECT DISTINCT user_id + FROM users_table + ) as distinct_users + ON distinct_users.user_id = events_table.user_id +GROUP BY distinct_users.user_id +ORDER BY users_count desc, avg_type DESC +LIMIT 5; + +-- reduce the data set, aggregate and join +SELECT + events_table.event_type, + users_count.ct +FROM events_table + JOIN + (SELECT distinct_users.user_id, count(1) as ct + FROM + (SELECT user_id + FROM users_table + ) as distinct_users + GROUP BY distinct_users.user_id + ) as users_count + ON users_count.user_id = events_table.user_id +ORDER BY users_count.ct desc, event_type DESC +LIMIT 5; + +--- now, test 
(subquery JOIN subquery) +SELECT n1.user_id, count_1, total_count +FROM + (SELECT user_id, count(1) as count_1 + FROM users_table + GROUP BY user_id + ) n1 + INNER JOIN + (SELECT user_id, count(1) as total_count + FROM events_table + GROUP BY user_id, event_type + ) n2 + ON (n2.user_id = n1.user_id) +ORDER BY total_count DESC, count_1 DESC, 1 DESC +LIMIT 10; + +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT DISTINCT ON (user_id) user_id, value_2, value_3 + FROM users_table + WHERE (value_1 > 3) + ORDER BY 1,2,3 + ) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + +-- distinct clause must include partition column +-- when used in target list +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT DISTINCT ON (value_2) value_2 , user_id, value_3 + FROM users_table + WHERE (value_1 > 3) + ORDER BY 1,2,3 + ) AS b + USING (user_id) +GROUP BY user_id; + +SELECT a.user_id, avg(b.value_2) as subquery_avg +FROM + (SELECT user_id + FROM users_table + WHERE (value_1 > 5) + GROUP BY user_id + HAVING count(distinct value_1) > 88 + ) as a + LEFT JOIN + (SELECT DISTINCT ON (value_2, user_id) value_2 , user_id, value_3 + FROM users_table + WHERE (value_1 > 3) + ORDER BY 1,2,3 + ) AS b + ON a.user_id = b.user_id +WHERE b.user_id IS NOT NULL +GROUP BY a.user_id +ORDER BY avg(b.value_3), 2, 1 +LIMIT 5; + +SELECT user_id, event_type +FROM + (SELECT * + FROM + ( + (SELECT event_type, user_id as a_user_id FROM events_table) AS a + JOIN + (SELECT + ma.user_id AS user_id, ma.value_2 AS value_2, + (GREATEST(coalesce((ma.value_3 * ma.value_2 ) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM users_table AS ma + WHERE 
(ma.value_2 > 100) + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS ma + ON (a.a_user_id = ma.user_id) + ) AS inner_sub + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS outer_sub +ORDER BY prob DESC, event_type DESC, user_id DESC +LIMIT 10; + +-- very similar query but produces different result due to +-- ordering difference in the previous one's inner query +SELECT user_id, event_type +FROM + (SELECT event_type, user_id as a_user_id FROM events_table) AS a + JOIN + (SELECT + ma.user_id AS user_id, ma.value_2 AS value_2, + (GREATEST(coalesce((ma.value_3 * ma.value_2 ) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM users_table AS ma + WHERE (ma.value_2 > 100) + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS ma + ON (a.a_user_id = ma.user_id) +ORDER BY prob DESC, event_type DESC, user_id DESC +LIMIT 10; + +-- now they produce the same result when ordering fixed in 'outer_sub' +SELECT user_id, event_type +FROM + (SELECT * + FROM + ( + (SELECT event_type, user_id as a_user_id FROM events_table) AS a + JOIN + (SELECT + ma.user_id AS user_id, ma.value_2 AS value_2, + (GREATEST(coalesce((ma.value_3 * ma.value_2 ) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM users_table AS ma + WHERE (ma.value_2 > 100) + ORDER BY prob DESC, user_id DESC + LIMIT 10 + ) AS ma + ON (a.a_user_id = ma.user_id) + ) AS inner_sub + ORDER BY prob DESC, event_type DESC, user_id DESC + LIMIT 10 + ) AS outer_sub +ORDER BY prob DESC, event_type DESC, user_id DESC +LIMIT 10; + +-- this is one complex join query derived from a user's production query +-- first declare the function on workers on master +-- With array_index: +SELECT * FROM run_command_on_workers('CREATE OR REPLACE FUNCTION array_index(ANYARRAY, ANYELEMENT) + RETURNS INT AS $$ + SELECT i + FROM (SELECT generate_series(array_lower($1, 1), array_upper($1, 1))) g(i) + WHERE $1 [i] = $2 + LIMIT 1; + $$ LANGUAGE sql') +ORDER BY 1,2; + +CREATE OR REPLACE FUNCTION array_index(ANYARRAY, ANYELEMENT) + RETURNS INT AS $$ + SELECT i + 
FROM (SELECT generate_series(array_lower($1, 1), array_upper($1, 1))) g(i) + WHERE $1 [i] = $2 + LIMIT 1; + $$ LANGUAGE sql; + +SELECT * +FROM + (SELECT * + FROM + ( + (SELECT + user_id AS user_id_e, + event_type as event_type_e + FROM + events_table + ) AS ma_e + JOIN + (SELECT + value_2, + value_3, + user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id_p AS user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id AS user_id_p + FROM events_table + WHERE (event_type IN (1, 2, 3, 4, 5)) + ) AS ma_p + JOIN + (SELECT user_id AS user_id_a + FROM users_table + WHERE (value_2 % 5 = 1) + ) AS a + ON (a.user_id_a = ma_p.user_id_p) + ) + ) AS a_ma_p + ) AS inner_filter_q + JOIN + (SELECT + value_2, + value_3, + user_id AS user_id_ck + FROM events_table + WHERE event_type = ANY(ARRAY [10, 11, 12]) + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS ma_ck + ON (ma_ck.user_id_ck = inner_filter_q.user_id) + ) AS inner_sub_q + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS outer_sub_q + ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10) AS inner_search_q + ON (ma_e.user_id_e = inner_search_q.user_id) + ) AS outer_inner_sub_q + ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, + event_type_e DESC + LIMIT 10) AS outer_outer_sub_q +ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, + event_type_e DESC +LIMIT 10; + +-- top level select * is removed now there is +-- a join at top level. 
+SELECT * +FROM + ( + (SELECT + user_id AS user_id_e, + event_type as event_type_e + FROM + events_table + ) AS ma_e + JOIN + (SELECT + value_2, + value_3, + user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id_p AS user_id + FROM + (SELECT * + FROM + ( + (SELECT user_id AS user_id_p + FROM events_table + WHERE (event_type IN (1, 2, 3, 4, 5)) + ) AS ma_p + JOIN + (SELECT user_id AS user_id_a + FROM users_table + WHERE (value_2 % 5 = 1) + ) AS a + ON (a.user_id_a = ma_p.user_id_p) + ) + ) AS a_ma_p + ) AS inner_filter_q + JOIN + (SELECT + value_2, + value_3, + user_id AS user_id_ck + FROM events_table + WHERE event_type = ANY(ARRAY [10, 11, 12]) + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS ma_ck + ON (ma_ck.user_id_ck = inner_filter_q.user_id) + ) AS inner_sub_q + ORDER BY + value_3 ASC, + user_id_ck DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10 + ) AS outer_sub_q + ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC + LIMIT 10) AS inner_search_q + ON (ma_e.user_id_e = inner_search_q.user_id) + ) AS outer_inner_sub_q +ORDER BY + value_3 ASC, + user_id DESC, + array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, + event_type_e DESC +LIMIT 10; + + +-- drop created functions +SELECT * FROM run_command_on_workers('DROP FUNCTION array_index(ANYARRAY, ANYELEMENT)') +ORDER BY 1,2; +DROP FUNCTION array_index(ANYARRAY, ANYELEMENT); + + +SET citus.subquery_pushdown TO FALSE; +SET citus.enable_router_execution TO TRUE; diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 1f4ea2e99..1f9d9ed65 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -206,6 +206,32 @@ FROM order BY user_id limit 50; +-- same query with subquery joins in topmost select +SELECT "some_users_data".user_id, lastseen +FROM +
(SELECT user_id, + Max(TIME) AS lastseen + FROM + (SELECT user_id, + TIME + FROM + (SELECT user_id, + TIME + FROM events_table as "events" + WHERE user_id > 10 and user_id < 40) "events_1" + ORDER BY TIME DESC + LIMIT 1000) "recent_events_1" + GROUP BY user_id + ORDER BY max(TIME) DESC) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND users.value_2 > 50 and users.value_2 < 55 + LIMIT 1) "some_users_data" ON TRUE +ORDER BY user_id +limit 50; + -- not supported since JOIN is not on the partition key SELECT * FROM @@ -304,6 +330,44 @@ FROM order BY user_id DESC limit 10; +-- +-- A similar query with topmost select is dropped +-- and replaced by aggregation. Notice the heavy use of limit +-- +SELECT "some_users_data".user_id, MAX(lastseen), count(*) + FROM + (SELECT filter_users_1.user_id, + TIME AS lastseen + FROM + (SELECT user_where_1_1.user_id + FROM + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 + INNER JOIN + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) filter_users_1 + JOIN LATERAL + (SELECT user_id, + TIME + FROM events_table as "events" + WHERE user_id > 12 and user_id < 16 and user_id = filter_users_1.user_id + ORDER BY TIME DESC + LIMIT 1) "last_events_1" ON TRUE + ORDER BY TIME DESC + LIMIT 10) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND "users"."value_2" > 70 + LIMIT 1) "some_users_data" ON TRUE +GROUP BY 1 +ORDER BY 2, 1 DESC +limit 10; + -- not supported since the inner JOIN is not equi join SELECT user_id, lastseen FROM @@ -545,24 +609,29 @@ SELECT SELECT "value_3", count(*) AS cnt FROM -(SELECT "value_3", 
"user_id", random() - FROM - (SELECT users_in_segment_1.user_id, value_3 - FROM - (SELECT user_id, value_3 * 2 as value_3 - FROM - (SELECT user_id, value_3 - FROM - (SELECT "users"."user_id", value_3 - FROM users_table as "users" - WHERE user_id > 10 and user_id < 40 and value_2 > 30) simple_user_where_1) all_buckets_1) users_in_segment_1 - JOIN - (SELECT "users"."user_id" - FROM users_table as "users" - WHERE user_id > 10 and user_id < 40 and value_2 > 60) some_users_data - - ON ("users_in_segment_1".user_id = "some_users_data".user_id)) segmentalias_1) "tempQuery" - GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; + (SELECT "value_3", "user_id", random() + FROM + (SELECT users_in_segment_1.user_id, value_3 + FROM + (SELECT user_id, value_3 * 2 as value_3 + FROM + (SELECT user_id, value_3 + FROM + (SELECT "users"."user_id", value_3 + FROM users_table as "users" + WHERE user_id > 10 and user_id < 40 and value_2 > 30 + ) simple_user_where_1 + ) all_buckets_1 + ) users_in_segment_1 + JOIN + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 10 and user_id < 40 and value_2 > 60 + ) some_users_data + ON ("users_in_segment_1".user_id = "some_users_data".user_id) + ) segmentalias_1) "tempQuery" +GROUP BY "value_3" +ORDER BY cnt, value_3 DESC LIMIT 10; -- not supported since there is no partition column equality at all @@ -622,6 +691,36 @@ FROM value_3 DESC limit 10; +-- nested lateral join at top most level +SELECT "some_users_data".user_id, "some_recent_users".value_3 +FROM + (SELECT filter_users_1.user_id, value_3 + FROM + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200 + ) filter_users_1 + JOIN LATERAL + (SELECT user_id, value_3 + FROM events_table as "events" + WHERE user_id > 20 and user_id < 70 + AND ("events".user_id = "filter_users_1".user_id) + ORDER BY value_3 DESC + LIMIT 1 + ) "last_events_1" ON TRUE + ORDER BY value_3 DESC + LIMIT 10 + ) "some_recent_users" + 
JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND users.value_2 > 200 + LIMIT 1 + ) "some_users_data" ON TRUE +ORDER BY value_3 DESC, user_id ASC +LIMIT 10; + -- longer nested lateral joins SELECT * FROM @@ -656,7 +755,35 @@ FROM value_3 DESC limit 10; - +-- longer nested lateral join with top level join +SELECT "some_users_data".user_id, "some_recent_users".value_3 +FROM + (SELECT filter_users_1.user_id, value_3 + FROM + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE user_id > 20 and user_id < 70 and users.value_2 = 200 + ) filter_users_1 + JOIN LATERAL + (SELECT user_id, value_3 + FROM events_table as "events" + WHERE user_id > 20 and user_id < 70 + AND ("events".user_id = "filter_users_1".user_id) + ORDER BY value_3 DESC + LIMIT 1 + ) "last_events_1" ON TRUE + ORDER BY value_3 DESC + LIMIT 10 + ) "some_recent_users" + JOIN LATERAL + (SELECT "users".user_id + FROM users_table as "users" + WHERE "users"."user_id" = "some_recent_users"."user_id" + AND users.value_2 > 200 + LIMIT 1 + ) "some_users_data" ON TRUE +ORDER BY value_3 DESC +LIMIT 10; -- LEFT JOINs used with INNER JOINs SELECT