diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index 7c7ae518c..63e0ad86d 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -2880,6 +2880,17 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query * return; } + /* + * We currently don't support finding partition keys in the subqueries + * that references from outer subqueries. For example, in corrolated + * subqueries in WHERE clause, we don't support use of partition keys + * in the subquery that is referred from the outer query. + */ + if (candidateColumn->varlevelsup > 0) + { + return; + } + rangeTableEntryIndex = candidateColumn->varno - 1; rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex); @@ -3096,7 +3107,14 @@ PartitionColumnOpExpressionList(Query *query) rangeTableEntryIndex = candidatePartitionColumn->varno - 1; rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex); - Assert(rangeTableEntry->rtekind == RTE_RELATION); + /* + * We currently don't support checking for equality when user refers + * to a column from the JOIN instead of the relation. + */ + if (rangeTableEntry->rtekind != RTE_RELATION) + { + continue; + } relationId = rangeTableEntry->relid; partitionColumn = DistPartitionKey(relationId); diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index f1da91072..545434100 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -31,6 +31,7 @@ #include "distributed/worker_protocol.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "nodes/relation.h" #include "optimizer/clauses.h" #include "optimizer/prep.h" #include "optimizer/tlist.h" @@ -69,6 +70,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryPushdown(Query * PlannerRestrictionContext * plannerRestrictionContext); +static DeferredErrorMessage * DeferErrorIfUnsupportedSublinkAndReferenceTable( + Query *queryTree); static DeferredErrorMessage * DeferErrorIfUnsupportedFilters(Query *subquery); static bool EqualOpExpressionLists(List *firstOpExpressionList, List *secondOpExpressionList); @@ -87,6 +90,7 @@ static MultiNode * MultiPlanTree(Query *queryTree); static void ErrorIfQueryNotSupported(Query *queryTree); static bool HasUnsupportedReferenceTableJoin( PlannerRestrictionContext *plannerRestrictionContext); +static bool ShouldRecurseForReferenceTableJoinChecks(RelOptInfo *relOptInfo); static bool HasUnsupportedJoinWalker(Node *node, void *context); static bool ErrorHintRequired(const char *errorHint, Query *queryTree); static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryRepartition(Query * @@ -96,8 +100,8 @@ static bool HasOuterJoin(Query *queryTree); static bool HasOuterJoinWalker(Node *node, void *maxJoinLevel); static bool HasComplexJoinOrder(Query *queryTree); static bool HasComplexRangeTableType(Query *queryTree); -static bool RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, - RelOptInfo *relationInfo); +static bool RelationInfoContainsReferenceTable(PlannerInfo *plannerInfo, + RelOptInfo *relationInfo); static void ValidateClauseList(List *clauseList); static void ValidateSubqueryPushdownClauseList(List *clauseList); static bool ExtractFromExpressionWalker(Node *node, @@ -542,7 +546,16 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, "one another relation using distribution keys and " "equality operator.", NULL); } - else if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext)) + + /* we shouldn't allow reference tables in the FROM clause when the query has sublinks */ + error = DeferErrorIfUnsupportedSublinkAndReferenceTable(originalQuery); + if (error) + { + return error; + } + + /* we shouldn't allow reference tables in the outer part of outer joins */ + if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot pushdown the subquery", @@ -582,6 +595,43 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, } +/* + * DeferErrorIfUnsupportedSublinkAndReferenceTable returns a deferred error if the + * given query is not suitable for subquery pushdown. + * + * While planning sublinks, we rely on Postgres in the sense that it converts some of + * sublinks into joins. + * + * In some cases, sublinks are pulled up and converted into outer joins. Those cases + * are already handled with HasUnsupportedReferenceTableJoin(). + * + * If the sublinks are not pulled up, we should still error out in if any reference table + * appears in the FROM clause of a subquery. + * + * Otherwise, the result would include duplicate rows. + */ +static DeferredErrorMessage * +DeferErrorIfUnsupportedSublinkAndReferenceTable(Query *queryTree) +{ + if (!queryTree->hasSubLinks) + { + return NULL; + } + + if (HasReferenceTable((Node *) queryTree->rtable)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot pushdown the subquery", + "Reference tables are not allowed in FROM " + "clause when the query has subqueries in " + "WHERE clause", + NULL); + } + + return NULL; +} + + /* * DeferErrorIfUnsupportedFilters checks if all leaf queries in the given query have * same filter on the partition column. Note that if there are queries without @@ -853,6 +903,14 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } } + deferredError = DeferErrorIfUnsupportedSublinkAndReferenceTable(subqueryTree); + if (deferredError) + { + preconditionsSatisfied = false; + errorDetail = (char *) deferredError->detail; + } + + /* finally check and return deferred if not satisfied */ if (!preconditionsSatisfied) { @@ -1027,7 +1085,7 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) /* * TargetListOnPartitionColumn checks if at least one target list entry is on - * partition column or the table is a reference table. + * partition column. */ static bool TargetListOnPartitionColumn(Query *query, List *targetEntryList) @@ -1047,15 +1105,11 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList) FindReferencedTableColumn(targetExpression, NIL, query, &relationId, &column); - /* - * If the expression belongs to reference table directly returns true. - * We can assume that target list entry always on partition column of - * reference tables. - */ + /* if the expression belongs to reference table directly returns false */ if (IsDistributedTable(relationId) && PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) { - targetListOnPartitionColumn = true; + targetListOnPartitionColumn = false; break; } @@ -1459,15 +1513,18 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo if (joinType == JOIN_SEMI || joinType == JOIN_ANTI || joinType == JOIN_LEFT) { - if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) + if (ShouldRecurseForReferenceTableJoinChecks(outerrel) && + RelationInfoContainsReferenceTable(plannerInfo, outerrel)) { return true; } } else if (joinType == JOIN_FULL) { - if (RelationInfoHasReferenceTable(plannerInfo, innerrel) || - RelationInfoHasReferenceTable(plannerInfo, outerrel)) + if ((ShouldRecurseForReferenceTableJoinChecks(innerrel) && + RelationInfoContainsReferenceTable(plannerInfo, innerrel)) || + (ShouldRecurseForReferenceTableJoinChecks(outerrel) && + RelationInfoContainsReferenceTable(plannerInfo, outerrel))) { return true; } @@ -1479,12 +1536,66 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo /* - * RelationInfoHasReferenceTable check whether the relationInfo has reference table. - * Since relation ids of relationInfo indexes to the range table entry list of - * planner info, planner info is also passed. + * ShouldRecurseForReferenceTableJoinChecks is a helper function for deciding + * on whether the input relOptInfo should be checked for unsupported reference + * tables. */ static bool -RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) +ShouldRecurseForReferenceTableJoinChecks(RelOptInfo *relOptInfo) +{ + /* + * We shouldn't recursively go down for joins since we're already + * going to process each join seperately. Otherwise we'd restrict + * the coverage. See the below sketch where (h) denotes a hash + * distributed relation, (r) denotes a reference table, (L) denotes + * LEFT JOIN and (I) denotes INNER JOIN. If we're to recurse into + * the inner join, we'd be preventing to push down the following + * join tree, which is actually safe to push down. + * + * (L) + * / \ + * (I) h + * / \ + * r h + */ + if (relOptInfo->reloptkind == RELOPT_JOINREL) + { + return false; + } + + /* + * Note that we treat the same query where relations appear in subqueries + * differently. (i.e., use SELECT * FROM r; instead of r) + * + * In that case, to relax some restrictions, we do the following optimization: + * If the subplan (i.e., plannerInfo corresponding to the subquery) contains any + * joins, we skip reference table checks keeping in mind that the join is already + * going to be processed seperately. This optimization should suffice for many + * use cases. + */ + if (relOptInfo->reloptkind == RELOPT_BASEREL && relOptInfo->subroot != NULL) + { + PlannerInfo *subroot = relOptInfo->subroot; + + if (list_length(subroot->join_rel_list) > 0) + { + return false; + } + } + + return true; +} + + +/* + * RelationInfoContainsReferenceTable checks whether the relationInfo + * contains any reference tables. If found, the function returns true. + * + * Note that since relation ids of relationInfo indexes to the range + * table entry list of planner info, planner info is also passed. + */ +static bool +RelationInfoContainsReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) { Relids relids = bms_copy(relationInfo->relids); int relationId = -1; @@ -2927,9 +3038,17 @@ ExtractRangeTableRelationWalkerWithRTEExpand(Node *node, List **rangeTableRelati rangeTableRelationList, 0); } } + else if (IsA(node, Query)) + { + walkIsComplete = query_tree_walker((Query *) node, + ExtractRangeTableRelationWalkerWithRTEExpand, + rangeTableRelationList, QTW_EXAMINE_RTES); + } else { - walkIsComplete = ExtractRangeTableRelationWalker(node, rangeTableRelationList); + walkIsComplete = expression_tree_walker(node, + ExtractRangeTableRelationWalkerWithRTEExpand, + rangeTableRelationList); } return walkIsComplete; diff --git a/src/backend/distributed/planner/relation_restriction_equivalence.c b/src/backend/distributed/planner/relation_restriction_equivalence.c index b8a61a00e..786df8736 100644 --- a/src/backend/distributed/planner/relation_restriction_equivalence.c +++ b/src/backend/distributed/planner/relation_restriction_equivalence.c @@ -62,6 +62,7 @@ typedef struct AttributeEquivalenceClassMember } AttributeEquivalenceClassMember; +static uint32 ReferenceRelationCount(RelationRestrictionContext *restrictionContext); static Var * FindTranslatedVar(List *appendRelList, Oid relationOid, Index relationRteIndex, Index *partitionKeyIndex); static bool EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, @@ -329,9 +330,8 @@ FindTranslatedVar(List *appendRelList, Oid relationOid, Index relationRteIndex, * joined on their partition keys. * * The function returns true if all relations are joined on their partition keys. - * Otherwise, the function returns false. In order to support reference tables - * with subqueries, equality between attributes of reference tables and partition - * key of distributed tables are also considered. + * Otherwise, the function returns false. We ignore reference tables at all since + * they don't have partition keys. * * In order to do that, we invented a new equivalence class namely: * AttributeEquivalenceClass. In very simple words, a AttributeEquivalenceClass is @@ -365,14 +365,24 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext * List *joinRestrictionAttributeEquivalenceList = NIL; List *allAttributeEquivalenceList = NIL; + uint32 referenceRelationCount = ReferenceRelationCount(restrictionContext); uint32 totalRelationCount = list_length(restrictionContext->relationRestrictionList); + uint32 nonReferenceRelationCount = totalRelationCount - referenceRelationCount; /* - * If the query includes only one relation, we should not check the partition - * column equality. Single table should not need to fetch data from other nodes - * except it's own node(s). + * If the query includes a single relation which is not a reference table, + * we should not check the partition column equality. + * Consider two example cases: + * (i) The query includes only a single colocated relation + * (ii) A colocated relation is joined with a (or multiple) reference + * table(s) where colocated relation is not joined on the partition key + * + * For the above two cases, we don't need to execute the partition column equality + * algorithm. The reason is that the essence of this function is to ensure that the + * tasks that are going to be created should not need data from other tasks. In both + * cases mentioned above, the necessary data per task would be on available. */ - if (totalRelationCount == 1) + if (nonReferenceRelationCount <= 1) { return true; } @@ -394,6 +404,31 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext * } +/* + * ReferenceRelationCount iterates over the relations and returns the reference table + * relation count. + */ +static uint32 +ReferenceRelationCount(RelationRestrictionContext *restrictionContext) +{ + ListCell *relationRestrictionCell = NULL; + uint32 referenceRelationCount = 0; + + foreach(relationRestrictionCell, restrictionContext->relationRestrictionList) + { + RelationRestriction *relationRestriction = + (RelationRestriction *) lfirst(relationRestrictionCell); + + if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE) + { + referenceRelationCount++; + } + } + + return referenceRelationCount; +} + + /* * EquivalenceListContainsRelationsEquality gets a list of attributed equivalence * list and a relation restriction context. The function first generates a common @@ -434,6 +469,12 @@ EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, (RelationRestriction *) lfirst(relationRestrictionCell); int rteIdentity = GetRTEIdentity(relationRestriction->rte); + /* we shouldn't check for the equality of reference tables */ + if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE) + { + continue; + } + if (!bms_is_member(rteIdentity, commonRteIdentities)) { return false; @@ -621,7 +662,7 @@ GetVarFromAssignedParam(List *parentPlannerParamList, Param *plannerParam) /* * GenerateCommonEquivalence gets a list of unrelated AttributeEquiavalanceClass - * whose all members are partition keys or a column of reference table. + * whose all members are partition keys. * * With the equivalence classes, the function follows the algorithm * outlined below: @@ -1084,8 +1125,14 @@ AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass ** Assert(rangeTableEntry->rtekind == RTE_RELATION); - if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE && - relationPartitionKey->varattno != varToBeAdded->varattno) + /* we don't need reference tables in the equality on columns */ + if (relationPartitionKey == NULL) + { + return; + } + + /* we're only interested in distribution columns */ + if (relationPartitionKey->varattno != varToBeAdded->varattno) { return; } diff --git a/src/test/regress/expected/multi_reference_table.out b/src/test/regress/expected/multi_reference_table.out index 6e6b45c59..3d56b8582 100644 --- a/src/test/regress/expected/multi_reference_table.out +++ b/src/test/regress/expected/multi_reference_table.out @@ -1122,7 +1122,7 @@ WHERE colocated_table_test.value_1 = reference_table_test.value_1; DEBUG: only reference tables may be queried when targeting a reference table with distributed INSERT ... SELECT DEBUG: Collecting INSERT ... SELECT results on coordinator --- not pushable due to lack of equality between partition column and column of reference table +-- safe to push down even lack of equality between partition column and column of reference table INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -1132,9 +1132,13 @@ FROM WHERE colocated_table_test_2.value_4 = reference_table_test.value_4 RETURNING value_1, value_2; -ERROR: cannot perform distributed planning for the given modification -DETAIL: Select query cannot be pushed down to the worker. --- some more complex queries (Note that there are more complex queries in multi_insert_select.sql) + value_1 | value_2 +---------+--------- + 1 | 1 + 2 | 2 +(2 rows) + +-- similar query with the above, this time partition key but without equality INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -1142,10 +1146,13 @@ SELECT FROM colocated_table_test_2, reference_table_test WHERE - colocated_table_test_2.value_2 = reference_table_test.value_2 + colocated_table_test_2.value_1 > reference_table_test.value_2 RETURNING value_1, value_2; -ERROR: cannot perform distributed planning for the given modification -DETAIL: Select query cannot be pushed down to the worker. + value_1 | value_2 +---------+--------- + 2 | 1 +(1 row) + -- partition column value comes from reference table, goes via coordinator INSERT INTO colocated_table_test (value_1, value_2) @@ -1606,7 +1613,7 @@ INSERT INTO reference_table_test VALUES (2, 2.0, '2', '2016-12-02'); SELECT master_modify_multiple_shards('DELETE FROM colocated_table_test'); master_modify_multiple_shards ------------------------------- - 6 + 9 (1 row) ROLLBACK; diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index ec0931d12..1d647d16a 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -60,18 +60,38 @@ SELECT count(*) FROM 1 (1 row) --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.item_id FROM + (SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1 +ORDER BY 1; + item_id +--------- + 2 + 3 + 4 + 5 +(4 rows) + +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.user_id FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1 +ORDER BY 1; + user_id +--------- + 1 + 2 + 3 + 3 + 7 + 7 + 7 + 7 + 7 + 7 +(10 rows) + -- Shouldn't work, reference table at the outer side is not allowed SELECT * FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table @@ -93,6 +113,33 @@ SELECT * FROM ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +-- Equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; + count +------- + 4 +(1 row) + +-- Non-equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + count +------- + 10 +(1 row) + +-- Non-equi left joins with reference tables on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + count +------- + 10 +(1 row) + -- Should pass since reference table locates in the inner part of each left join SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 @@ -105,13 +152,238 @@ SELECT count(*) FROM 2 (1 row) - -- Should not pass since reference table locates in the outer part of right join +-- two subqueries, each include joins with reference table +-- also, two hash distributed tables are joined on partition keys +SELECT count(*) FROM + (SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1, +(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 +WHERE subquery_1.user_id = subquery_2.user_id ; + count +------- + 4 +(1 row) + + -- Should be able to push down since reference tables are inner joined + -- with hash distributed tables, the results of those joins are the parts of + -- an outer join +SELECT subquery_2.id FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5; + id +---- + 3 + 2 + 1 +(3 rows) + +-- the same query as the above, but this Citus fails to pushdown the query +-- since the outer part of the right join doesn't include any joins SELECT * FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN - (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref - ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + user_id | sum +---------+---------- + 12 | 92221920 + 17 | 89192642 + 96 | 85143744 + 45 | 84267456 + 90 | 84157047 + 43 | 82110240 + 1 | 81735612 + 72 | 78992640 + 67 | 72385516 + 97 | 71002659 +(10 rows) + +-- same query as above, reference table is wrapped into a subquery +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + user_id | sum +---------+---------- + 12 | 92221920 + 17 | 89192642 + 96 | 85143744 + 45 | 84267456 + 90 | 84157047 + 43 | 82110240 + 1 | 81735612 + 72 | 78992640 + 67 | 72385516 + 97 | 71002659 +(10 rows) + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + user_id | sum +---------+---------- + 12 | 92221920 + 17 | 89192642 + 96 | 85143744 + 45 | 84267456 + 90 | 84157047 + 43 | 82110240 + 1 | 81735612 + 72 | 78992640 + 67 | 72385516 + 97 | 71002659 +(10 rows) + +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join wrapped into a subquery +SELECT + * +FROM + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + ON (users_table.user_id = ref_all.value_2); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should not be able to pushdown since reference table is in the +-- outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; + user_id +--------- + 89 +(1 row) + +-- the same query but this time reference table is in the outer part of the query +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + LEFT JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table +SELECT max(events_all.cnt), events_all.usr_id +FROM + (SELECT users_table.user_id as usr_id, + count(*) as cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; + max | usr_id +-------+-------- + 14605 | 23 + 13090 | 17 + 12915 | 25 + 12317 | 90 + 12285 | 87 +(5 rows) + +-- but, we fail to pushdown the following query where join that reference table appears +-- wrapped into a subquery +SELECT max(events_all.cnt), + events_all.usr_id + FROM( +SELECT *, random() FROM + (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- LATERAL JOINs used with INNER JOINs with reference tables @@ -193,10 +465,10 @@ SELECT count(*) AS value, "generated_group_field" FROM (SELECT - DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field" + DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM (SELECT - "eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" + "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM (SELECT * @@ -204,7 +476,7 @@ SELECT (SELECT "events"."time", "events"."user_id", "events"."value_2" FROM - events_reference_table as "events" + events_table as "events" WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN @@ -355,8 +627,8 @@ LIMIT 10; (6 rows) SET citus.subquery_pushdown to OFF; --- LEFT JOINs used with INNER JOINs should error out since reference table exist in the --- left side of the LEFT JOIN. +-- LEFT JOINs used with INNER JOINs should not error out since reference table joined +-- with hash table that Citus can push down SELECT count(*) AS cnt, "generated_group_field" FROM @@ -393,8 +665,20 @@ count(*) AS cnt, "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join + cnt | generated_group_field +-----+----------------------- + 176 | 551 + 176 | 569 + 176 | 645 + 176 | 713 + 176 | 734 + 88 | 3 + 88 | 5 + 88 | 15 + 88 | 32 + 88 | 68 +(10 rows) + -- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the -- right side of the RIGHT JOIN. SELECT @@ -435,6 +719,133 @@ count(*) AS cnt, "generated_group_field" LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +-- right join where the inner part of the join includes a reference table +-- joined with hash partitioned table using non-equi join +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.user_id > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + user_id | sum | length | hasdone_event +---------+-----+--------+---------------- + 11 | 306 | 14 | Has done event + 12 | 363 | 14 | Has done event + 14 | 510 | 14 | Has done event + 18 | 600 | 14 | Has done event + 19 | 618 | 14 | Has done event +(5 rows) + +-- a similar query as the above, with non-partition key comparison +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.value_1 > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type >= 125 AND e.event_type < 130 + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type >= 130 AND e.event_type < 135 + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + user_id | sum | length | hasdone_event +---------+------+--------+---------------- + 10 | 6018 | 14 | Has done event + 16 | 5373 | 14 | Has done event + 17 | 5683 | 14 | Has done event + 18 | 5321 | 14 | Has done event +(4 rows) + +-- LEFT JOINs used with INNER JOINs +-- events_table and users_reference_table joined +-- with event_table.non_part_key < reference_table.any_key +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + cnt | generated_group_field +-----+----------------------- + 540 | 814 + 533 | 746 + 473 | 914 + 449 | 684 + 445 | 715 + 423 | 191 + 419 | 39 + 415 | 108 + 414 | 819 + 411 | 642 +(10 rows) + -- Outer subquery with reference table SELECT "some_users_data".user_id, lastseen FROM @@ -468,9 +879,9 @@ limit 50; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- - -- UNIONs and JOINs with reference tables, shoukld error out + -- UNIONs and JOINs with reference tables, should error out -- -SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +SELECT ("final_query"."event_types") as types FROM ( SELECT *, random() FROM @@ -484,7 +895,7 @@ FROM (SELECT "events"."user_id", "events"."time", 0 AS event FROM - events_reference_table as "events" + events_table as "events" WHERE event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION @@ -494,7 +905,7 @@ FROM (SELECT "events"."user_id", "events"."time", 1 AS event FROM - events_table as "events" + events_reference_table as "events" WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION @@ -526,11 +937,9 @@ INNER JOIN WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY - types ORDER BY types; -ERROR: cannot push down this subquery +ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType @@ -560,15 +969,15 @@ FROM SELECT * FROM ( SELECT - max("events"."time"), + max("users"."time"), 0 AS event, - "events"."user_id" + "users"."user_id" FROM events_reference_table as "events", users_table as "users" WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) - GROUP BY "events"."user_id" + GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) UNION @@ -660,6 +1069,117 @@ GROUP BY types ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator +-- just a sanity check that we don't allow this if the reference table is on the +-- left part of the left join +SELECT count(*) FROM + (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- we don't allow non equi join among hash partitioned tables +SELECT count(*) FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1, +(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2 +WHERE subquery_1.user_id != subquery_2.user_id ; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- we cannot push this query since hash partitioned tables +-- are not joined on partition keys with equality +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- two hash partitioned relations are not joined +-- on partiton keys although reference table is fine +-- to push down +SELECT + u1.user_id, count(*) +FROM + events_table as e1, users_table as u1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) + AND u1.user_id > e1.user_id +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +SELECT foo.user_id FROM +( + SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + WHERE event_type > 100000 +) as foo; + user_id +--------- +(0 rows) + +-- not supported since group by is on the reference table column +SELECT foo.user_id FROM +( + SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + GROUP BY r.user_id +) as foo; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported +-- not supported since distinct is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +-- not supported since distinct on is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out index a0e2bf7f0..48927b35d 100644 --- a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -1,33 +1,5 @@ -- -- queries to test the subquery pushdown on reference tables --- subqueries in WHERE with greater operator -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id AND event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; - user_id ---------- - 49 - 55 - 56 - 63 -(4 rows) - -- subqueries in WHERE with IN operator SELECT user_id @@ -91,10 +63,10 @@ WHERE ) LIMIT 3; ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause -- subqueries in WHERE with IN operator without equality SELECT - user_id + users_table.user_id, count(*) FROM users_table WHERE @@ -106,59 +78,15 @@ WHERE WHERE users_table.user_id > events_reference_table.user_id ) -GROUP BY user_id -ORDER BY user_id +GROUP BY users_table.user_id +ORDER BY 2 DESC, 1 DESC LIMIT 3; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. --- have reference table without any equality, should error out -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; -ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys -DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. --- users that appeared more than 118 times, should run since the reference table --- on the right side of the semi join -SELECT - user_id -FROM - users_table -WHERE 118 <= - (SELECT - count(*) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id - GROUP BY - user_id) -GROUP BY - user_id -ORDER BY - user_id; - user_id ---------- - 13 - 17 - 23 - 25 -(4 rows) + user_id | count +---------+------- + 87 | 117 + 59 | 115 + 46 | 115 +(3 rows) -- should error out since reference table exist on the left side -- of the left lateral join @@ -224,3 +152,194 @@ SELECT user_id, value_2 FROM users_table WHERE ORDER BY 1, 2; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +-- non-partition key equality with reference table + SELECT + user_id, count(*) +FROM + users_table +WHERE + value_3 =ANY(SELECT value_2 FROM users_reference_table WHERE value_1 >= 10 AND value_1 <= 20) + GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5; + user_id | count +---------+------- + 48 | 18 + 26 | 18 + 15 | 17 + 54 | 16 + 35 | 15 +(5 rows) + +-- non-partition key comparison with reference table +SELECT + user_id, count(*) +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + user_id | count +---------+------- + 3 | 5 + 56 | 4 + 99 | 2 + 94 | 2 + 92 | 2 +(5 rows) + +-- subqueries in both WHERE and FROM clauses +-- should work since reference table is on the +-- inner part of the join +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_reference_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + user_id | value_2 +---------+--------- + 5 | 884 + 42 | 55 + 42 | 471 + 51 | 758 + 72 | 897 + 82 | 691 + 95 | 951 +(7 rows) + +-- reference tables are not allowed if there is sublink +SELECT + count(*) +FROM + users_reference_table +WHERE user_id + NOT IN +(SELECT users_table.value_2 FROM users_table JOIN users_reference_table as u2 ON users_table.value_2 = u2.value_2); +ERROR: cannot pushdown the subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- reference tables are not allowed if there is sublink +SELECT count(*) +FROM + (SELECT + user_id, random() FROM users_reference_table) AS vals + WHERE vals.user_id NOT IN + (SELECT users_table.value_2 + FROM users_table + JOIN users_reference_table AS u2 ON users_table.value_2 = u2.value_2); +ERROR: cannot pushdown the subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- reference tables are not allowed if there is sublink +SELECT user_id, + count(*) +FROM users_reference_table +WHERE value_2 > ALL + (SELECT min(value_2) + FROM events_table + WHERE event_type > 50 AND users_reference_table.user_id = events_table.user_id + GROUP BY user_id) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY 2 DESC, + 1 DESC +LIMIT 5; +ERROR: cannot pushdown the subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- reference tables are not allowed if there is sublink +-- this time in the subquery +SELECT * +FROM users_table +WHERE user_id IN + (SELECT users_table.user_id + FROM users_table, + users_reference_table + WHERE users_reference_table.user_id NOT IN + (SELECT value_2 + FROM users_reference_table AS u2)); +ERROR: cannot push down this subquery +DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause +-- not supported since GROUP BY references to an upper level query +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + users_table.user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported diff --git a/src/test/regress/input/multi_behavioral_analytics_create_table.source b/src/test/regress/input/multi_behavioral_analytics_create_table.source index c83673a2d..b81082fe2 100644 --- a/src/test/regress/input/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/input/multi_behavioral_analytics_create_table.source @@ -356,6 +356,7 @@ SET citus.shard_max_size TO "1MB"; CREATE TABLE events_reference_table (like events_table including all); SELECT create_reference_table('events_reference_table'); +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); diff --git a/src/test/regress/output/multi_behavioral_analytics_create_table.source b/src/test/regress/output/multi_behavioral_analytics_create_table.source index 81b996c0d..026634fe6 100644 --- a/src/test/regress/output/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/output/multi_behavioral_analytics_create_table.source @@ -428,6 +428,7 @@ SELECT create_reference_table('events_reference_table'); (1 row) +CREATE INDEX events_ref_val2 on events_reference_table(value_2); INSERT INTO events_reference_table SELECT * FROM events_table; CREATE TABLE users_reference_table (like users_table including all); SELECT create_reference_table('users_reference_table'); diff --git a/src/test/regress/sql/multi_reference_table.sql b/src/test/regress/sql/multi_reference_table.sql index 8c498ba42..d254b352e 100644 --- a/src/test/regress/sql/multi_reference_table.sql +++ b/src/test/regress/sql/multi_reference_table.sql @@ -707,7 +707,7 @@ FROM WHERE colocated_table_test.value_1 = reference_table_test.value_1; --- not pushable due to lack of equality between partition column and column of reference table +-- safe to push down even lack of equality between partition column and column of reference table INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -718,7 +718,7 @@ WHERE colocated_table_test_2.value_4 = reference_table_test.value_4 RETURNING value_1, value_2; --- some more complex queries (Note that there are more complex queries in multi_insert_select.sql) +-- similar query with the above, this time partition key but without equality INSERT INTO colocated_table_test (value_1, value_2) SELECT @@ -726,7 +726,7 @@ SELECT FROM colocated_table_test_2, reference_table_test WHERE - colocated_table_test_2.value_2 = reference_table_test.value_2 + colocated_table_test_2.value_1 > reference_table_test.value_2 RETURNING value_1, value_2; -- partition column value comes from reference table, goes via coordinator diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index fc4ffef7b..58d3e9975 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -42,15 +42,17 @@ SELECT count(*) FROM (SELECT random(), k_no FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1 WHERE k_no = 47; --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.item_id FROM + (SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1 +ORDER BY 1; --- Should not work, no equality between partition column and reference table -SELECT * FROM - (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table - ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1; +-- Should work, although no equality between partition column and reference table +SELECT subquery_1.user_id FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1 +ORDER BY 1; -- Shouldn't work, reference table at the outer side is not allowed SELECT * FROM @@ -67,6 +69,21 @@ SELECT * FROM (SELECT random() FROM user_buy_test_table RIGHT JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; +-- Equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; + +-- Non-equi join test with reference table on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + +-- Non-equi left joins with reference tables on non-partition keys +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + -- Should pass since reference table locates in the inner part of each left join SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 @@ -75,13 +92,172 @@ SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 LEFT JOIN users_ref_test_table as ref ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; - -- Should not pass since reference table locates in the outer part of right join +-- two subqueries, each include joins with reference table +-- also, two hash distributed tables are joined on partition keys +SELECT count(*) FROM + (SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1, +(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 +WHERE subquery_1.user_id = subquery_2.user_id ; + + -- Should be able to push down since reference tables are inner joined + -- with hash distributed tables, the results of those joins are the parts of + -- an outer join +SELECT subquery_2.id FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5; + +-- the same query as the above, but this Citus fails to pushdown the query +-- since the outer part of the right join doesn't include any joins SELECT * FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN - (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref - ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- same query as above, reference table is wrapped into a subquery +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) + LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should not be able to pushdown since reference table is in the +-- direct outer part of the left join wrapped into a subquery +SELECT + * +FROM + (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table + ON (users_table.user_id = ref_all.value_2); + +-- should not be able to pushdown since reference table is in the +-- outer part of the left join +SELECT + user_id, sum(value_1) +FROM + (SELECT + users_table.user_id, users_table.value_1, random() + FROM + events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) + LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) + ) as foo + GROUP BY user_id ORDER BY 2 DESC LIMIT 10; + +-- should be able to pushdown since reference table is in the +-- inner part of the left join +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; + +-- the same query but this time reference table is in the outer part of the query +SELECT * FROM +( + SELECT DISTINCT foo.user_id + FROM + ((SELECT + "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() + FROM + events_reference_table as "events" + WHERE + event_type > 80) as "temp_data_queries" + LEFT JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN + (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; + +-- we could even suuport the following where the subquery +-- on the outer part of the left join contains a reference table +SELECT max(events_all.cnt), events_all.usr_id +FROM + (SELECT users_table.user_id as usr_id, + count(*) as cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; + +-- but, we fail to pushdown the following query where join that reference table appears +-- wrapped into a subquery +SELECT max(events_all.cnt), + events_all.usr_id + FROM( +SELECT *, random() FROM + (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner) AS events_all +LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; @@ -149,10 +325,10 @@ SELECT count(*) AS value, "generated_group_field" FROM (SELECT - DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field" + DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM (SELECT - "eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" + "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM (SELECT * @@ -160,7 +336,7 @@ SELECT (SELECT "events"."time", "events"."user_id", "events"."value_2" FROM - events_reference_table as "events" + events_table as "events" WHERE user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" INNER JOIN @@ -270,8 +446,8 @@ ORDER BY LIMIT 10; SET citus.subquery_pushdown to OFF; --- LEFT JOINs used with INNER JOINs should error out since reference table exist in the --- left side of the LEFT JOIN. +-- LEFT JOINs used with INNER JOINs should not error out since reference table joined +-- with hash table that Citus can push down SELECT count(*) AS cnt, "generated_group_field" FROM @@ -348,6 +524,106 @@ count(*) AS cnt, "generated_group_field" cnt DESC, generated_group_field ASC LIMIT 10; +-- right join where the inner part of the join includes a reference table +-- joined with hash partitioned table using non-equi join +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.user_id > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type IN (100, 101, 102) + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type IN (106, 107, 108) + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + +-- a similar query as the above, with non-partition key comparison +SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event +FROM ( + SELECT + t1.user_id, + array_agg(event ORDER BY time) AS events_table, + COALESCE(hasdone_event, 'Has not done event') AS hasdone_event + FROM ( + ( + SELECT u.user_id, 'step=>1'::text AS event, e.time + FROM users_table AS u, + events_reference_table AS e + WHERE u.value_1 > e.user_id + AND u.user_id >= 10 + AND u.user_id <= 25 + AND e.event_type >= 125 AND e.event_type < 130 + ) + ) t1 RIGHT JOIN ( + SELECT DISTINCT user_id, + 'Has done event'::TEXT AS hasdone_event + FROM events_table AS e + WHERE e.user_id >= 10 + AND e.user_id <= 25 + AND e.event_type >= 130 AND e.event_type < 135 + ) t2 ON (t1.user_id = t2.user_id) + GROUP BY t1.user_id, hasdone_event +) t GROUP BY user_id, hasdone_event +ORDER BY user_id; + + +-- LEFT JOINs used with INNER JOINs +-- events_table and users_reference_table joined +-- with event_table.non_part_key < reference_table.any_key +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + -- Outer subquery with reference table SELECT "some_users_data".user_id, lastseen FROM @@ -380,9 +656,9 @@ ORDER BY limit 50; -- - -- UNIONs and JOINs with reference tables, shoukld error out + -- UNIONs and JOINs with reference tables, should error out -- -SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +SELECT ("final_query"."event_types") as types FROM ( SELECT *, random() FROM @@ -396,7 +672,7 @@ FROM (SELECT "events"."user_id", "events"."time", 0 AS event FROM - events_reference_table as "events" + events_table as "events" WHERE event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) UNION @@ -406,7 +682,7 @@ FROM (SELECT "events"."user_id", "events"."time", 1 AS event FROM - events_table as "events" + events_reference_table as "events" WHERE event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) UNION @@ -438,8 +714,6 @@ INNER JOIN WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query -GROUP BY - types ORDER BY types; @@ -471,15 +745,15 @@ FROM SELECT * FROM ( SELECT - max("events"."time"), + max("users"."time"), 0 AS event, - "events"."user_id" + "users"."user_id" FROM events_reference_table as "events", users_table as "users" WHERE events.user_id = users.user_id AND event_type IN (10, 11, 12, 13, 14, 15) - GROUP BY "events"."user_id" + GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) UNION @@ -569,6 +843,107 @@ INNER JOIN GROUP BY types ORDER BY types; +-- just a sanity check that we don't allow this if the reference table is on the +-- left part of the left join +SELECT count(*) FROM + (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; + +-- we don't allow non equi join among hash partitioned tables +SELECT count(*) FROM + (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1, +(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2 +WHERE subquery_1.user_id != subquery_2.user_id ; + +-- we cannot push this query since hash partitioned tables +-- are not joined on partition keys with equality +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + +-- two hash partitioned relations are not joined +-- on partiton keys although reference table is fine +-- to push down +SELECT + u1.user_id, count(*) +FROM + events_table as e1, users_table as u1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) + AND u1.user_id > e1.user_id +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +SELECT foo.user_id FROM +( + SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + WHERE event_type > 100000 +) as foo; + +-- not supported since group by is on the reference table column +SELECT foo.user_id FROM +( + SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) + GROUP BY r.user_id +) as foo; + +-- not supported since distinct is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; + +-- not supported since distinct on is on the reference table column +SELECT foo.user_id FROM +( + SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) +) as foo; + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql index 54266b085..5e38325f0 100644 --- a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql @@ -1,27 +1,6 @@ -- -- queries to test the subquery pushdown on reference tables --- subqueries in WHERE with greater operator -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id AND event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; - -- subqueries in WHERE with IN operator SELECT user_id @@ -78,7 +57,7 @@ LIMIT 3; -- subqueries in WHERE with IN operator without equality SELECT - user_id + users_table.user_id, count(*) FROM users_table WHERE @@ -90,51 +69,10 @@ WHERE WHERE users_table.user_id > events_reference_table.user_id ) -GROUP BY user_id -ORDER BY user_id +GROUP BY users_table.user_id +ORDER BY 2 DESC, 1 DESC LIMIT 3; --- have reference table without any equality, should error out -SELECT - user_id -FROM - users_table -WHERE - value_2 > - (SELECT - max(value_2) - FROM - events_reference_table - WHERE - event_type = 50 - GROUP BY - user_id - ) -GROUP BY user_id -HAVING count(*) > 66 -ORDER BY user_id -LIMIT 5; - --- users that appeared more than 118 times, should run since the reference table --- on the right side of the semi join -SELECT - user_id -FROM - users_table -WHERE 118 <= - (SELECT - count(*) - FROM - events_reference_table - WHERE - users_table.user_id = events_reference_table.user_id - GROUP BY - user_id) -GROUP BY - user_id -ORDER BY - user_id; - -- should error out since reference table exist on the left side -- of the left lateral join SELECT user_id, value_2 FROM users_table WHERE @@ -197,3 +135,165 @@ SELECT user_id, value_2 FROM users_table WHERE HAVING sum(submit_card_info) > 0 ) ORDER BY 1, 2; + +-- non-partition key equality with reference table + SELECT + user_id, count(*) +FROM + users_table +WHERE + value_3 =ANY(SELECT value_2 FROM users_reference_table WHERE value_1 >= 10 AND value_1 <= 20) + GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5; + + +-- non-partition key comparison with reference table +SELECT + user_id, count(*) +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_reference_table as e2 + WHERE + value_2 = 15 AND + value_3 > 25 AND + e1.value_2 > e2.value_2 + ) +GROUP BY 1 +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + +-- subqueries in both WHERE and FROM clauses +-- should work since reference table is on the +-- inner part of the join +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_reference_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + +-- reference tables are not allowed if there is sublink +SELECT + count(*) +FROM + users_reference_table +WHERE user_id + NOT IN +(SELECT users_table.value_2 FROM users_table JOIN users_reference_table as u2 ON users_table.value_2 = u2.value_2); + + +-- reference tables are not allowed if there is sublink +SELECT count(*) +FROM + (SELECT + user_id, random() FROM users_reference_table) AS vals + WHERE vals.user_id NOT IN + (SELECT users_table.value_2 + FROM users_table + JOIN users_reference_table AS u2 ON users_table.value_2 = u2.value_2); + +-- reference tables are not allowed if there is sublink +SELECT user_id, + count(*) +FROM users_reference_table +WHERE value_2 > ALL + (SELECT min(value_2) + FROM events_table + WHERE event_type > 50 AND users_reference_table.user_id = events_table.user_id + GROUP BY user_id) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY 2 DESC, + 1 DESC +LIMIT 5; + +-- reference tables are not allowed if there is sublink +-- this time in the subquery +SELECT * +FROM users_table +WHERE user_id IN + (SELECT users_table.user_id + FROM users_table, + users_reference_table + WHERE users_reference_table.user_id NOT IN + (SELECT value_2 + FROM users_reference_table AS u2)); + +-- not supported since GROUP BY references to an upper level query +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + users_table.user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5;