From a5b66912d4f0505551280baf5f9a3abcba37ab05 Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Fri, 25 Aug 2017 15:05:21 +0300 Subject: [PATCH] Expand reference table support in subquery pushdown With this commit, we relax the restrictions put on the reference tables with subquery pushdown. We did three notable improvements: 1) Relax equi-join restrictions Previously, we always expected that the non-reference tables are equi joined with reference tables on the partition key of the non-reference table. With this commit, we allow any column of non-reference tables joined using non-equi joins as well. 2) Relax OUTER JOIN restrictions Previously Citus errored out if any reference table exists at any point of the outer part of an outer join. For instance, See the below sketch where (h) denotes a hash distributed relation, (r) denotes a reference table, (L) denotes LEFT JOIN and (I) denotes INNER JOIN. (L) / \ (I) h / \ r h Before this commit Citus would error out since a reference table appears on the left most part of an left join. However, that was too restrictive so that we only error out if the reference table is directly below and in the outer part of an outer join. 3) Bug fixes We've done some minor bugfixes in the existing implementation. --- .../planner/multi_logical_optimizer.c | 20 +- .../planner/multi_logical_planner.c | 155 ++++- .../relation_restriction_equivalence.c | 67 +- .../expected/multi_reference_table.out | 23 +- ...ulti_subquery_complex_reference_clause.out | 584 +++++++++++++++++- ...lti_subquery_in_where_reference_clause.out | 283 ++++++--- ...i_behavioral_analytics_create_table.source | 1 + ...i_behavioral_analytics_create_table.source | 1 + .../regress/sql/multi_reference_table.sql | 6 +- ...ulti_subquery_complex_reference_clause.sql | 425 ++++++++++++- ...lti_subquery_in_where_reference_clause.sql | 230 +++++-- 11 files changed, 1551 insertions(+), 244 deletions(-) diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index 7c7ae518c..63e0ad86d 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -2880,6 +2880,17 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query * return; } + /* + * We currently don't support finding partition keys in the subqueries + * that references from outer subqueries. For example, in corrolated + * subqueries in WHERE clause, we don't support use of partition keys + * in the subquery that is referred from the outer query. + */ + if (candidateColumn->varlevelsup > 0) + { + return; + } + rangeTableEntryIndex = candidateColumn->varno - 1; rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex); @@ -3096,7 +3107,14 @@ PartitionColumnOpExpressionList(Query *query) rangeTableEntryIndex = candidatePartitionColumn->varno - 1; rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex); - Assert(rangeTableEntry->rtekind == RTE_RELATION); + /* + * We currently don't support checking for equality when user refers + * to a column from the JOIN instead of the relation. + */ + if (rangeTableEntry->rtekind != RTE_RELATION) + { + continue; + } relationId = rangeTableEntry->relid; partitionColumn = DistPartitionKey(relationId); diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index f1da91072..545434100 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -31,6 +31,7 @@ #include "distributed/worker_protocol.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "nodes/relation.h" #include "optimizer/clauses.h" #include "optimizer/prep.h" #include "optimizer/tlist.h" @@ -69,6 +70,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryPushdown(Query * PlannerRestrictionContext * plannerRestrictionContext); +static DeferredErrorMessage * DeferErrorIfUnsupportedSublinkAndReferenceTable( + Query *queryTree); static DeferredErrorMessage * DeferErrorIfUnsupportedFilters(Query *subquery); static bool EqualOpExpressionLists(List *firstOpExpressionList, List *secondOpExpressionList); @@ -87,6 +90,7 @@ static MultiNode * MultiPlanTree(Query *queryTree); static void ErrorIfQueryNotSupported(Query *queryTree); static bool HasUnsupportedReferenceTableJoin( PlannerRestrictionContext *plannerRestrictionContext); +static bool ShouldRecurseForReferenceTableJoinChecks(RelOptInfo *relOptInfo); static bool HasUnsupportedJoinWalker(Node *node, void *context); static bool ErrorHintRequired(const char *errorHint, Query *queryTree); static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryRepartition(Query * @@ -96,8 +100,8 @@ static bool HasOuterJoin(Query *queryTree); static bool HasOuterJoinWalker(Node *node, void *maxJoinLevel); static bool HasComplexJoinOrder(Query *queryTree); static bool HasComplexRangeTableType(Query *queryTree); -static bool RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, - RelOptInfo *relationInfo); +static bool RelationInfoContainsReferenceTable(PlannerInfo *plannerInfo, + RelOptInfo *relationInfo); static void ValidateClauseList(List *clauseList); static void ValidateSubqueryPushdownClauseList(List *clauseList); static bool ExtractFromExpressionWalker(Node *node, @@ -542,7 +546,16 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, "one another relation using distribution keys and " "equality operator.", NULL); } - else if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext)) + + /* we shouldn't allow reference tables in the FROM clause when the query has sublinks */ + error = DeferErrorIfUnsupportedSublinkAndReferenceTable(originalQuery); + if (error) + { + return error; + } + + /* we shouldn't allow reference tables in the outer part of outer joins */ + if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot pushdown the subquery", @@ -582,6 +595,43 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, } +/* + * DeferErrorIfUnsupportedSublinkAndReferenceTable returns a deferred error if the + * given query is not suitable for subquery pushdown. + * + * While planning sublinks, we rely on Postgres in the sense that it converts some of + * sublinks into joins. + * + * In some cases, sublinks are pulled up and converted into outer joins. Those cases + * are already handled with HasUnsupportedReferenceTableJoin(). + * + * If the sublinks are not pulled up, we should still error out in if any reference table + * appears in the FROM clause of a subquery. + * + * Otherwise, the result would include duplicate rows. + */ +static DeferredErrorMessage * +DeferErrorIfUnsupportedSublinkAndReferenceTable(Query *queryTree) +{ + if (!queryTree->hasSubLinks) + { + return NULL; + } + + if (HasReferenceTable((Node *) queryTree->rtable)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot pushdown the subquery", + "Reference tables are not allowed in FROM " + "clause when the query has subqueries in " + "WHERE clause", + NULL); + } + + return NULL; +} + + /* * DeferErrorIfUnsupportedFilters checks if all leaf queries in the given query have * same filter on the partition column. Note that if there are queries without @@ -853,6 +903,14 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } } + deferredError = DeferErrorIfUnsupportedSublinkAndReferenceTable(subqueryTree); + if (deferredError) + { + preconditionsSatisfied = false; + errorDetail = (char *) deferredError->detail; + } + + /* finally check and return deferred if not satisfied */ if (!preconditionsSatisfied) { @@ -1027,7 +1085,7 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) /* * TargetListOnPartitionColumn checks if at least one target list entry is on - * partition column or the table is a reference table. + * partition column. */ static bool TargetListOnPartitionColumn(Query *query, List *targetEntryList) @@ -1047,15 +1105,11 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList) FindReferencedTableColumn(targetExpression, NIL, query, &relationId, &column); - /* - * If the expression belongs to reference table directly returns true. - * We can assume that target list entry always on partition column of - * reference tables. - */ + /* if the expression belongs to reference table directly returns false */ if (IsDistributedTable(relationId) && PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) { - targetListOnPartitionColumn = true; + targetListOnPartitionColumn = false; break; } @@ -1459,15 +1513,18 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo if (joinType == JOIN_SEMI || joinType == JOIN_ANTI || joinType == JOIN_LEFT) { - if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) + if (ShouldRecurseForReferenceTableJoinChecks(outerrel) && + RelationInfoContainsReferenceTable(plannerInfo, outerrel)) { return true; } } else if (joinType == JOIN_FULL) { - if (RelationInfoHasReferenceTable(plannerInfo, innerrel) || - RelationInfoHasReferenceTable(plannerInfo, outerrel)) + if ((ShouldRecurseForReferenceTableJoinChecks(innerrel) && + RelationInfoContainsReferenceTable(plannerInfo, innerrel)) || + (ShouldRecurseForReferenceTableJoinChecks(outerrel) && + RelationInfoContainsReferenceTable(plannerInfo, outerrel))) { return true; } @@ -1479,12 +1536,66 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo /* - * RelationInfoHasReferenceTable check whether the relationInfo has reference table. - * Since relation ids of relationInfo indexes to the range table entry list of - * planner info, planner info is also passed. + * ShouldRecurseForReferenceTableJoinChecks is a helper function for deciding + * on whether the input relOptInfo should be checked for unsupported reference + * tables. */ static bool -RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) +ShouldRecurseForReferenceTableJoinChecks(RelOptInfo *relOptInfo) +{ + /* + * We shouldn't recursively go down for joins since we're already + * going to process each join seperately. Otherwise we'd restrict + * the coverage. See the below sketch where (h) denotes a hash + * distributed relation, (r) denotes a reference table, (L) denotes + * LEFT JOIN and (I) denotes INNER JOIN. If we're to recurse into + * the inner join, we'd be preventing to push down the following + * join tree, which is actually safe to push down. + * + * (L) + * / \ + * (I) h + * / \ + * r h + */ + if (relOptInfo->reloptkind == RELOPT_JOINREL) + { + return false; + } + + /* + * Note that we treat the same query where relations appear in subqueries + * differently. (i.e., use SELECT * FROM r; instead of r) + * + * In that case, to relax some restrictions, we do the following optimization: + * If the subplan (i.e., plannerInfo corresponding to the subquery) contains any + * joins, we skip reference table checks keeping in mind that the join is already + * going to be processed seperately. This optimization should suffice for many + * use cases. + */ + if (relOptInfo->reloptkind == RELOPT_BASEREL && relOptInfo->subroot != NULL) + { + PlannerInfo *subroot = relOptInfo->subroot; + + if (list_length(subroot->join_rel_list) > 0) + { + return false; + } + } + + return true; +} + + +/* + * RelationInfoContainsReferenceTable checks whether the relationInfo + * contains any reference tables. If found, the function returns true. + * + * Note that since relation ids of relationInfo indexes to the range + * table entry list of planner info, planner info is also passed. + */ +static bool +RelationInfoContainsReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) { Relids relids = bms_copy(relationInfo->relids); int relationId = -1; @@ -2927,9 +3038,17 @@ ExtractRangeTableRelationWalkerWithRTEExpand(Node *node, List **rangeTableRelati rangeTableRelationList, 0); } } + else if (IsA(node, Query)) + { + walkIsComplete = query_tree_walker((Query *) node, + ExtractRangeTableRelationWalkerWithRTEExpand, + rangeTableRelationList, QTW_EXAMINE_RTES); + } else { - walkIsComplete = ExtractRangeTableRelationWalker(node, rangeTableRelationList); + walkIsComplete = expression_tree_walker(node, + ExtractRangeTableRelationWalkerWithRTEExpand, + rangeTableRelationList); } return walkIsComplete; diff --git a/src/backend/distributed/planner/relation_restriction_equivalence.c b/src/backend/distributed/planner/relation_restriction_equivalence.c index b8a61a00e..786df8736 100644 --- a/src/backend/distributed/planner/relation_restriction_equivalence.c +++ b/src/backend/distributed/planner/relation_restriction_equivalence.c @@ -62,6 +62,7 @@ typedef struct AttributeEquivalenceClassMember } AttributeEquivalenceClassMember; +static uint32 ReferenceRelationCount(RelationRestrictionContext *restrictionContext); static Var * FindTranslatedVar(List *appendRelList, Oid relationOid, Index relationRteIndex, Index *partitionKeyIndex); static bool EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, @@ -329,9 +330,8 @@ FindTranslatedVar(List *appendRelList, Oid relationOid, Index relationRteIndex, * joined on their partition keys. * * The function returns true if all relations are joined on their partition keys. - * Otherwise, the function returns false. In order to support reference tables - * with subqueries, equality between attributes of reference tables and partition - * key of distributed tables are also considered. + * Otherwise, the function returns false. We ignore reference tables at all since + * they don't have partition keys. * * In order to do that, we invented a new equivalence class namely: * AttributeEquivalenceClass. In very simple words, a AttributeEquivalenceClass is @@ -365,14 +365,24 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext * List *joinRestrictionAttributeEquivalenceList = NIL; List *allAttributeEquivalenceList = NIL; + uint32 referenceRelationCount = ReferenceRelationCount(restrictionContext); uint32 totalRelationCount = list_length(restrictionContext->relationRestrictionList); + uint32 nonReferenceRelationCount = totalRelationCount - referenceRelationCount; /* - * If the query includes only one relation, we should not check the partition - * column equality. Single table should not need to fetch data from other nodes - * except it's own node(s). + * If the query includes a single relation which is not a reference table, + * we should not check the partition column equality. + * Consider two example cases: + * (i) The query includes only a single colocated relation + * (ii) A colocated relation is joined with a (or multiple) reference + * table(s) where colocated relation is not joined on the partition key + * + * For the above two cases, we don't need to execute the partition column equality + * algorithm. The reason is that the essence of this function is to ensure that the + * tasks that are going to be created should not need data from other tasks. In both + * cases mentioned above, the necessary data per task would be on available. */ - if (totalRelationCount == 1) + if (nonReferenceRelationCount <= 1) { return true; } @@ -394,6 +404,31 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext * } +/* + * ReferenceRelationCount iterates over the relations and returns the reference table + * relation count. + */ +static uint32 +ReferenceRelationCount(RelationRestrictionContext *restrictionContext) +{ + ListCell *relationRestrictionCell = NULL; + uint32 referenceRelationCount = 0; + + foreach(relationRestrictionCell, restrictionContext->relationRestrictionList) + { + RelationRestriction *relationRestriction = + (RelationRestriction *) lfirst(relationRestrictionCell); + + if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE) + { + referenceRelationCount++; + } + } + + return referenceRelationCount; +} + + /* * EquivalenceListContainsRelationsEquality gets a list of attributed equivalence * list and a relation restriction context. The function first generates a common @@ -434,6 +469,12 @@ EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, (RelationRestriction *) lfirst(relationRestrictionCell); int rteIdentity = GetRTEIdentity(relationRestriction->rte); + /* we shouldn't check for the equality of reference tables */ + if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE) + { + continue; + } + if (!bms_is_member(rteIdentity, commonRteIdentities)) { return false; @@ -621,7 +662,7 @@ GetVarFromAssignedParam(List *parentPlannerParamList, Param *plannerParam) /* * GenerateCommonEquivalence gets a list of unrelated AttributeEquiavalanceClass - * whose all members are partition keys or a column of reference table. + * whose all members are partition keys. * * With the equivalence classes, the function follows the algorithm * outlined below: @@ -1084,8 +1125,14 @@ AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass ** Assert(rangeTableEntry->rtekind == RTE_RELATION); - if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE && - relationPartitionKey->varattno != varToBeAdded->varattno) + /* we don't need reference tables in the equality on columns */ + if (relationPartitionKey == NULL) + { + return; + } + + /* we're only interested in distribution columns */ + if (relationPartitionKey->varattno != varToBeAdded->varattno) { return; } diff --git a/src/test/regress/expected/multi_reference_table.out b/src/test/regress/expected/multi_reference_table.out index 6e6b45c59..3d56b8582 100644 --- a/src/test/regress/expected/multi_reference_table.out +++ b/src/test/regress/expected/multi_reference_table.out @@ -1122,7 +1122,7 @@ WHERE colocated_table_test.value_1 = reference_table_test.value_1; DEBUG: only reference tables may be queried when targeting a reference table with distributed INSERT ... SELECT DEBUG: Collecting INSERT ... SELECT results on coordinator --- not pushable due to lack of equality between partition column and column of reference table +-- safe to push down even lack of equality between partition column and column of reference table INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -1132,9 +1132,13 @@ FROM WHERE colocated_table_test_2.value_4 = reference_table_test.value_4 RETURNING value_1, value_2; -ERROR: cannot perform distributed planning for the given modification -DETAIL: Select query cannot be pushed down to the worker. --- some more complex queries (Note that there are more complex queries in multi_insert_select.sql) + value_1 | value_2 +---------+--------- + 1 | 1 + 2 | 2 +(2 rows) + +-- similar query with the above, this time partition key but without equality INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -1142,10 +1146,13 @@ SELECT FROM colocated_table_test_2, reference_table_test WHERE - colocated_table_test_2.value_2 = reference_table_test.value_2 + colocated_table_test_2.value_1 > reference_table_test.value_2 RETURNING value_1, value_2; -ERROR: cannot perform distributed planning for the given modification -DETAIL: Select query cannot be pushed down to the worker. + value_1 | value_2 +---------+--------- + 2 | 1 +(1 row) + -- partition column value comes from reference table, goes via coordinator INSERT INTO colocated_table_test (value_1, value_2) @@ -1606,7 +1613,7 @@ INSERT INTO reference_table_test VALUES (2, 2.0, '2', '2016-12-02'); SELECT master_modify_multiple_shards('DELETE FROM colocated_table_test'); master_modify_multiple_shards ------------------------------- - 6 + 9 (1 row) ROLLBACK; diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index ec0931d12..1d647d16a 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -60,18 +60,38 @@ SELECT count(*) FROM 1 (1 row) --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.item_id FROM + (SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1 +ORDER BY 1; + item_id +--------- + 2 + 3 + 4 + 5 +(4 rows) + +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.user_id FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1 +ORDER BY 1; + user_id +--------- + 1 + 2 + 3 + 3 + 7 + 7 + 7 + 7 + 7 + 7 +(10 rows) + -- Shouldn't work, reference table at the outer side is not allowed SELECT * FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table @@ -93,6 +113,33 @@ SELECT * FROM ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +-- Equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; + count +------- + 4 +(1 row) + +-- Non-equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + count +------- + 10 +(1 row) + +-- Non-equi left joins with reference tables on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + count +------- + 10 +(1 row) + -- Should pass since reference table locates in the inner part of each left join SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 @@ -105,13 +152,238 @@ SELECT count(*) FROM 2 (1 row) - -- Should not pass since reference table locates in the outer part of right join +-- two subqueries, each include joins with reference table +-- also, two hash distributed tables are joined on partition keys +SELECT count(*) FROM + (SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1, +(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 +WHERE subquery_1.user_id = subquery_2.user_id ; + count +------- + 4 +(1 row) + + -- Should be able to push down since reference tables are inner joined + -- with hash distributed tables, the results of those joins are the parts of + -- an outer join +SELECT subquery_2.id FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5; + id +---- + 3 + 2 + 1 +(3 rows) + +-- the same query as the above, but this Citus fails to pushdown the query +-- since the outer part of the right join doesn't include any joins SELECT * FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN - (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref - ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + user_id | sum +---------+---------- + 12 | 92221920 + 17 | 89192642 + 96 | 85143744 + 45 | 84267456 + 90 | 84157047 + 43 | 82110240 + 1 | 81735612 + 72 | 78992640 + 67 | 72385516 + 97 | 71002659 +(10 rows) + +-- same query as above, reference table is wrapped into a subquery +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + user_id | sum +---------+---------- + 12 | 92221920 + 17 | 89192642 + 96 | 85143744 + 45 | 84267456 + 90 | 84157047 + 43 | 82110240 + 1 | 81735612 + 72 | 78992640 + 67 | 72385516 + 97 | 71002659 +(10 rows) + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + user_id | sum +---------+---------- + 12 | 92221920 + 17 | 89192642 + 96 | 85143744 + 45 | 84267456 + 90 | 84157047 + 43 | 82110240 + 1 | 81735612 + 72 | 78992640 + 67 | 72385516 + 97 | 71002659 +(10 rows) + +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join wrapped into a subquery +SELECT + * +FROM + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + ON (users_table.user_id = ref_all.value_2); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should not be able to pushdown since reference table is in the +-- outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; + user_id +--------- + 89 +(1 row) + +-- the same query but this time reference table is in the outer part of the query +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + LEFT JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table +SELECT max(events_all.cnt), events_all.usr_id +FROM + (SELECT users_table.user_id as usr_id, + count(*) as cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; + max | usr_id +-------+-------- + 14605 | 23 + 13090 | 17 + 12915 | 25 + 12317 | 90 + 12285 | 87 +(5 rows) + +-- but, we fail to pushdown the following query where join that reference table appears +-- wrapped into a subquery +SELECT max(events_all.cnt), + events_all.usr_id + FROM( +SELECT *, random() FROM + (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- LATERAL JOINs used with INNER JOINs with reference tables @@ -193,10 +465,10 @@ SELECT count(*) AS value, "generated_group_field" FROM (SELECT - DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field" + DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM (SELECT - "eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" + "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM (SELECT * @@ -204,7 +476,7 @@ SELECT (SELECT "events"."time", "events"."user_id", "events"."value_2" FROM - events_reference_table as "events" + events_table as "events" WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN @@ -355,8 +627,8 @@ LIMIT 10; (6 rows) SET citus.subquery_pushdown to OFF; --- LEFT JOINs used with INNER JOINs should error out since reference table exist in the --- left side of the LEFT JOIN. +-- LEFT JOINs used with INNER JOINs should not error out since reference table joined +-- with hash table that Citus can push down SELECT count(*) AS cnt, "generated_group_field" FROM @@ -393,8 +665,20 @@ count(*) AS cnt, "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join + cnt | generated_group_field +-----+----------------------- + 176 | 551 + 176 | 569 + 176 | 645 + 176 | 713 + 176 | 734 + 88 | 3 + 88 | 5 + 88 | 15 + 88 | 32 + 88 | 68 +(10 rows) + -- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the -- right side of the RIGHT JOIN. SELECT @@ -435,6 +719,133 @@ count(*) AS cnt, "generated_group_field" LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +-- right join where the inner part of the join includes a reference table +-- joined with hash partitioned table using non-equi join +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.user_id > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + user_id | sum | length | hasdone_event +---------+-----+--------+---------------- + 11 | 306 | 14 | Has done event + 12 | 363 | 14 | Has done event + 14 | 510 | 14 | Has done event + 18 | 600 | 14 | Has done event + 19 | 618 | 14 | Has done event +(5 rows) + +-- a similar query as the above, with non-partition key comparison +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.value_1 > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type >= 125 AND e.event_type < 130 + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type >= 130 AND e.event_type < 135 + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + user_id | sum | length | hasdone_event +---------+------+--------+---------------- + 10 | 6018 | 14 | Has done event + 16 | 5373 | 14 | Has done event + 17 | 5683 | 14 | Has done event + 18 | 5321 | 14 | Has done event +(4 rows) + +-- LEFT JOINs used with INNER JOINs +-- events_table and users_reference_table joined +-- with event_table.non_part_key < reference_table.any_key +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + cnt | generated_group_field +-----+----------------------- + 540 | 814 + 533 | 746 + 473 | 914 + 449 | 684 + 445 | 715 + 423 | 191 + 419 | 39 + 415 | 108 + 414 | 819 + 411 | 642 +(10 rows) + -- Outer subquery with reference table SELECT "some_users_data".user_id, lastseen FROM @@ -468,9 +879,9 @@ limit 50; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- - -- UNIONs and JOINs with reference tables, shoukld error out + -- UNIONs and JOINs with reference tables, should error out -- -SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +SELECT ("final_query"."event_types") as types FROM ( SELECT *, random() FROM @@ -484,7 +895,7 @@ FROM (SELECT "events"."user_id", "events"."time", 0 AS event FROM - events_reference_table as "events" + events_table as "events" WHERE event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION @@ -494,7 +905,7 @@ FROM (SELECT "events"."user_id", "events"."time", 1 AS event FROM - events_table as "events" + events_reference_table as "events" WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION @@ -526,11 +937,9 @@ INNER JOIN WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY - types ORDER BY types; -ERROR: cannot push down this subquery +ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType @@ -560,15 +969,15 @@ FROM SELECT * FROM ( SELECT - max("events"."time"), + max("users"."time"), 0 AS event, - "events"."user_id" + "users"."user_id" FROM events_reference_table as "events", users_table as "users" WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) - GROUP BY "events"."user_id" + GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) UNION @@ -660,6 +1069,117 @@ GROUP BY types ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator +-- just a sanity check that we don't allow this if the reference table is on the +-- left part of the left join +SELECT count(*) FROM + (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- we don't allow non equi join among hash partitioned tables +SELECT count(*) FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1, +(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2 +WHERE subquery_1.user_id != subquery_2.user_id ; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- we cannot push this query since hash partitioned tables +-- are not joined on partition keys with equality +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- two hash partitioned relations are not joined +-- on partiton keys although reference table is fine +-- to push down +SELECT + u1.user_id, count(*) +FROM + events_table as e1, users_table as u1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) + AND u1.user_id > e1.user_id +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +SELECT foo.user_id FROM +( + SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + WHERE event_type > 100000 +) as foo; + user_id +--------- +(0 rows) + +-- not supported since group by is on the reference table column +SELECT foo.user_id FROM +( + SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + GROUP BY r.user_id +) as foo; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported +-- not supported since distinct is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +-- not supported since distinct on is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out index a0e2bf7f0..48927b35d 100644 --- a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -1,33 +1,5 @@ -- -- queries to test the subquery pushdown on reference tables --- subqueries in WHERE with greater operator -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id AND event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; - user_id ---------- - 49 - 55 - 56 - 63 -(4 rows) - -- subqueries in WHERE with IN operator SELECT user_id @@ -91,10 +63,10 @@ WHERE ) LIMIT 3; ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause -- subqueries in WHERE with IN operator without equality SELECT - user_id + users_table.user_id, count(*) FROM users_table WHERE @@ -106,59 +78,15 @@ WHERE WHERE users_table.user_id > events_reference_table.user_id ) -GROUP BY user_id -ORDER BY user_id +GROUP BY users_table.user_id +ORDER BY 2 DESC, 1 DESC LIMIT 3; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. --- have reference table without any equality, should error out -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. --- users that appeared more than 118 times, should run since the reference table --- on the right side of the semi join -SELECT - user_id -FROM - users_table -WHERE 118 <= - (SELECT - count(*) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id - GROUP BY - user_id) -GROUP BY - user_id -ORDER BY - user_id; - user_id ---------- - 13 - 17 - 23 - 25 -(4 rows) + user_id | count +---------+------- + 87 | 117 + 59 | 115 + 46 | 115 +(3 rows) -- should error out since reference table exist on the left side -- of the left lateral join @@ -224,3 +152,194 @@ SELECT user_id, value_2 FROM users_table WHERE ORDER BY 1, 2; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +-- non-partition key equality with reference table + SELECT + user_id, count(*) +FROM + users_table +WHERE + value_3 =ANY(SELECT value_2 FROM users_reference_table WHERE value_1 >= 10 AND value_1 <= 20) + GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5; + user_id | count +---------+------- + 48 | 18 + 26 | 18 + 15 | 17 + 54 | 16 + 35 | 15 +(5 rows) + +-- non-partition key comparison with reference table +SELECT + user_id, count(*) +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + user_id | count +---------+------- + 3 | 5 + 56 | 4 + 99 | 2 + 94 | 2 + 92 | 2 +(5 rows) + +-- subqueries in both WHERE and FROM clauses +-- should work since reference table is on the +-- inner part of the join +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_reference_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + user_id | value_2 +---------+--------- + 5 | 884 + 42 | 55 + 42 | 471 + 51 | 758 + 72 | 897 + 82 | 691 + 95 | 951 +(7 rows) + +-- reference tables are not allowed if there is sublink +SELECT + count(*) +FROM + users_reference_table +WHERE user_id + NOT IN +(SELECT users_table.value_2 FROM users_table JOIN users_reference_table as u2 ON users_table.value_2 = u2.value_2); +ERROR: cannot pushdown the subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- reference tables are not allowed if there is sublink +SELECT count(*) +FROM + (SELECT + user_id, random() FROM users_reference_table) AS vals + WHERE vals.user_id NOT IN + (SELECT users_table.value_2 + FROM users_table + JOIN users_reference_table AS u2 ON users_table.value_2 = u2.value_2); +ERROR: cannot pushdown the subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- reference tables are not allowed if there is sublink +SELECT user_id, + count(*) +FROM users_reference_table +WHERE value_2 > ALL + (SELECT min(value_2) + FROM events_table + WHERE event_type > 50 AND users_reference_table.user_id = events_table.user_id + GROUP BY user_id) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY 2 DESC, + 1 DESC +LIMIT 5; +ERROR: cannot pushdown the subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- reference tables are not allowed if there is sublink +-- this time in the subquery +SELECT * +FROM users_table +WHERE user_id IN + (SELECT users_table.user_id + FROM users_table, + users_reference_table + WHERE users_reference_table.user_id NOT IN + (SELECT value_2 + FROM users_reference_table AS u2)); +ERROR: cannot push down this subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- not supported since GROUP BY references to an upper level query +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + users_table.user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported diff --git a/src/test/regress/input/multi_behavioral_analytics_create_table.source b/src/test/regress/input/multi_behavioral_analytics_create_table.source index c83673a2d..b81082fe2 100644 --- a/src/test/regress/input/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/input/multi_behavioral_analytics_create_table.source @@ -356,6 +356,7 @@ SET citus.shard_max_size TO "1MB"; CREATE TABLE events_reference_table (like events_table including all); SELECT create_reference_table('events_reference_table'); +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); diff --git a/src/test/regress/output/multi_behavioral_analytics_create_table.source b/src/test/regress/output/multi_behavioral_analytics_create_table.source index 81b996c0d..026634fe6 100644 --- a/src/test/regress/output/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/output/multi_behavioral_analytics_create_table.source @@ -428,6 +428,7 @@ SELECT create_reference_table('events_reference_table'); (1 row) +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); SELECT create_reference_table('users_reference_table'); diff --git a/src/test/regress/sql/multi_reference_table.sql b/src/test/regress/sql/multi_reference_table.sql index 8c498ba42..d254b352e 100644 --- a/src/test/regress/sql/multi_reference_table.sql +++ b/src/test/regress/sql/multi_reference_table.sql @@ -707,7 +707,7 @@ FROM WHERE colocated_table_test.value_1 = reference_table_test.value_1; --- not pushable due to lack of equality between partition column and column of reference table +-- safe to push down even lack of equality between partition column and column of reference table INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -718,7 +718,7 @@ WHERE colocated_table_test_2.value_4 = reference_table_test.value_4 RETURNING value_1, value_2; --- some more complex queries (Note that there are more complex queries in multi_insert_select.sql) +-- similar query with the above, this time partition key but without equality INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -726,7 +726,7 @@ SELECT FROM colocated_table_test_2, reference_table_test WHERE - colocated_table_test_2.value_2 = reference_table_test.value_2 + colocated_table_test_2.value_1 > reference_table_test.value_2 RETURNING value_1, value_2; -- partition column value comes from reference table, goes via coordinator diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index fc4ffef7b..58d3e9975 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -42,15 +42,17 @@ SELECT count(*) FROM (SELECT random(), k_no FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1 WHERE k_no = 47; --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.item_id FROM + (SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1 +ORDER BY 1; --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1; +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.user_id FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1 +ORDER BY 1; -- Shouldn't work, reference table at the outer side is not allowed SELECT * FROM @@ -67,6 +69,21 @@ SELECT * FROM (SELECT random() FROM user_buy_test_table RIGHT JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; +-- Equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; + +-- Non-equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + +-- Non-equi left joins with reference tables on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + -- Should pass since reference table locates in the inner part of each left join SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 @@ -75,13 +92,172 @@ SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 LEFT JOIN users_ref_test_table as ref ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; - -- Should not pass since reference table locates in the outer part of right join +-- two subqueries, each include joins with reference table +-- also, two hash distributed tables are joined on partition keys +SELECT count(*) FROM + (SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1, +(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 +WHERE subquery_1.user_id = subquery_2.user_id ; + + -- Should be able to push down since reference tables are inner joined + -- with hash distributed tables, the results of those joins are the parts of + -- an outer join +SELECT subquery_2.id FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5; + +-- the same query as the above, but this Citus fails to pushdown the query +-- since the outer part of the right join doesn't include any joins SELECT * FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN - (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref - ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- same query as above, reference table is wrapped into a subquery +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join wrapped into a subquery +SELECT + * +FROM + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + ON (users_table.user_id = ref_all.value_2); + +-- should not be able to pushdown since reference table is in the +-- outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; + +-- the same query but this time reference table is in the outer part of the query +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + LEFT JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; + +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table +SELECT max(events_all.cnt), events_all.usr_id +FROM + (SELECT users_table.user_id as usr_id, + count(*) as cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; + +-- but, we fail to pushdown the following query where join that reference table appears +-- wrapped into a subquery +SELECT max(events_all.cnt), + events_all.usr_id + FROM( +SELECT *, random() FROM + (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; @@ -149,10 +325,10 @@ SELECT count(*) AS value, "generated_group_field" FROM (SELECT - DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field" + DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM (SELECT - "eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" + "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM (SELECT * @@ -160,7 +336,7 @@ SELECT (SELECT "events"."time", "events"."user_id", "events"."value_2" FROM - events_reference_table as "events" + events_table as "events" WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN @@ -270,8 +446,8 @@ ORDER BY LIMIT 10; SET citus.subquery_pushdown to OFF; --- LEFT JOINs used with INNER JOINs should error out since reference table exist in the --- left side of the LEFT JOIN. +-- LEFT JOINs used with INNER JOINs should not error out since reference table joined +-- with hash table that Citus can push down SELECT count(*) AS cnt, "generated_group_field" FROM @@ -348,6 +524,106 @@ count(*) AS cnt, "generated_group_field" cnt DESC, generated_group_field ASC LIMIT 10; +-- right join where the inner part of the join includes a reference table +-- joined with hash partitioned table using non-equi join +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.user_id > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + +-- a similar query as the above, with non-partition key comparison +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.value_1 > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type >= 125 AND e.event_type < 130 + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type >= 130 AND e.event_type < 135 + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + + +-- LEFT JOINs used with INNER JOINs +-- events_table and users_reference_table joined +-- with event_table.non_part_key < reference_table.any_key +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + -- Outer subquery with reference table SELECT "some_users_data".user_id, lastseen FROM @@ -380,9 +656,9 @@ ORDER BY limit 50; -- - -- UNIONs and JOINs with reference tables, shoukld error out + -- UNIONs and JOINs with reference tables, should error out -- -SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +SELECT ("final_query"."event_types") as types FROM ( SELECT *, random() FROM @@ -396,7 +672,7 @@ FROM (SELECT "events"."user_id", "events"."time", 0 AS event FROM - events_reference_table as "events" + events_table as "events" WHERE event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION @@ -406,7 +682,7 @@ FROM (SELECT "events"."user_id", "events"."time", 1 AS event FROM - events_table as "events" + events_reference_table as "events" WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION @@ -438,8 +714,6 @@ INNER JOIN WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY - types ORDER BY types; @@ -471,15 +745,15 @@ FROM SELECT * FROM ( SELECT - max("events"."time"), + max("users"."time"), 0 AS event, - "events"."user_id" + "users"."user_id" FROM events_reference_table as "events", users_table as "users" WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) - GROUP BY "events"."user_id" + GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) UNION @@ -569,6 +843,107 @@ INNER JOIN GROUP BY types ORDER BY types; +-- just a sanity check that we don't allow this if the reference table is on the +-- left part of the left join +SELECT count(*) FROM + (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + +-- we don't allow non equi join among hash partitioned tables +SELECT count(*) FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1, +(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2 +WHERE subquery_1.user_id != subquery_2.user_id ; + +-- we cannot push this query since hash partitioned tables +-- are not joined on partition keys with equality +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + +-- two hash partitioned relations are not joined +-- on partiton keys although reference table is fine +-- to push down +SELECT + u1.user_id, count(*) +FROM + events_table as e1, users_table as u1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) + AND u1.user_id > e1.user_id +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +SELECT foo.user_id FROM +( + SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + WHERE event_type > 100000 +) as foo; + +-- not supported since group by is on the reference table column +SELECT foo.user_id FROM +( + SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + GROUP BY r.user_id +) as foo; + +-- not supported since distinct is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; + +-- not supported since distinct on is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql index 54266b085..5e38325f0 100644 --- a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql @@ -1,27 +1,6 @@ -- -- queries to test the subquery pushdown on reference tables --- subqueries in WHERE with greater operator -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id AND event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; - -- subqueries in WHERE with IN operator SELECT user_id @@ -78,7 +57,7 @@ LIMIT 3; -- subqueries in WHERE with IN operator without equality SELECT - user_id + users_table.user_id, count(*) FROM users_table WHERE @@ -90,51 +69,10 @@ WHERE WHERE users_table.user_id > events_reference_table.user_id ) -GROUP BY user_id -ORDER BY user_id +GROUP BY users_table.user_id +ORDER BY 2 DESC, 1 DESC LIMIT 3; --- have reference table without any equality, should error out -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; - --- users that appeared more than 118 times, should run since the reference table --- on the right side of the semi join -SELECT - user_id -FROM - users_table -WHERE 118 <= - (SELECT - count(*) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id - GROUP BY - user_id) -GROUP BY - user_id -ORDER BY - user_id; - -- should error out since reference table exist on the left side -- of the left lateral join SELECT user_id, value_2 FROM users_table WHERE @@ -197,3 +135,165 @@ SELECT user_id, value_2 FROM users_table WHERE HAVING sum(submit_card_info) > 0 ) ORDER BY 1, 2; + +-- non-partition key equality with reference table + SELECT + user_id, count(*) +FROM + users_table +WHERE + value_3 =ANY(SELECT value_2 FROM users_reference_table WHERE value_1 >= 10 AND value_1 <= 20) + GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5; + + +-- non-partition key comparison with reference table +SELECT + user_id, count(*) +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +-- subqueries in both WHERE and FROM clauses +-- should work since reference table is on the +-- inner part of the join +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_reference_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + +-- reference tables are not allowed if there is sublink +SELECT + count(*) +FROM + users_reference_table +WHERE user_id + NOT IN +(SELECT users_table.value_2 FROM users_table JOIN users_reference_table as u2 ON users_table.value_2 = u2.value_2); + + +-- reference tables are not allowed if there is sublink +SELECT count(*) +FROM + (SELECT + user_id, random() FROM users_reference_table) AS vals + WHERE vals.user_id NOT IN + (SELECT users_table.value_2 + FROM users_table + JOIN users_reference_table AS u2 ON users_table.value_2 = u2.value_2); + +-- reference tables are not allowed if there is sublink +SELECT user_id, + count(*) +FROM users_reference_table +WHERE value_2 > ALL + (SELECT min(value_2) + FROM events_table + WHERE event_type > 50 AND users_reference_table.user_id = events_table.user_id + GROUP BY user_id) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY 2 DESC, + 1 DESC +LIMIT 5; + +-- reference tables are not allowed if there is sublink +-- this time in the subquery +SELECT * +FROM users_table +WHERE user_id IN + (SELECT users_table.user_id + FROM users_table, + users_reference_table + WHERE users_reference_table.user_id NOT IN + (SELECT value_2 + FROM users_reference_table AS u2)); + +-- not supported since GROUP BY references to an upper level query +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + users_table.user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5;