From de33079065ba28c614174a2312588f44567536ba Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Mon, 9 Mar 2020 11:57:03 +0100 Subject: [PATCH 1/2] Improve outer join checks Before this commit, the logic was: - As long as the outer side of the JOIN is not a JOIN (e.g., relation or subquery etc.), we check for the existence of any recurring tuples. There were two implications of this decision. First, even if a subquery which is on the outer side contains distributed table JOIN reference table, Citus would unnecessarily throw an error. Note that, the JOIN inside the subquery would already be going to be tested recursively. But, as long as that check passes, there is no reason for the upper JOIN to fail. An example, which used to fail and now works: SELECT * FROM (SELECT * FROM dist JOIN ref) as foo LEFT JOIN dist; Second, certain JOINs, especially with ON (true) conditions were not represented as Citus expects the JOINs to be in the format DeferredErrorIfUnsupportedRecurringTuplesJoin(). --- .../planner/query_pushdown_planning.c | 132 ++++++------------ ...ulti_subquery_complex_reference_clause.out | 124 +++++++++++++++- ...ulti_subquery_complex_reference_clause.sql | 51 +++++++ 3 files changed, 211 insertions(+), 96 deletions(-) diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index b26b4053f..4bc5589f8 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -79,10 +79,8 @@ static DeferredErrorMessage * DeferredErrorIfUnsupportedRecurringTuplesJoin( PlannerRestrictionContext *plannerRestrictionContext); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); -static bool ShouldRecurseForRecurringTuplesJoinChecks(RelOptInfo *relOptInfo); -static bool RelationInfoContainsRecurringTuples(PlannerInfo *plannerInfo, - RelOptInfo *relationInfo, - RecurringTuplesType *recurType); +static RecurringTuplesType FetchFirstRecurType(PlannerInfo *plannerInfo, RelOptInfo * + relationInfo); static bool ContainsRecurringRTE(RangeTblEntry *rangeTableEntry, RecurringTuplesType *recurType); static bool ContainsRecurringRangeTable(List *rangeTable, RecurringTuplesType *recurType); @@ -776,7 +774,6 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( plannerRestrictionContext->joinRestrictionContext->joinRestrictionList; ListCell *joinRestrictionCell = NULL; RecurringTuplesType recurType = RECURRING_TUPLES_INVALID; - foreach(joinRestrictionCell, joinRestrictionList) { JoinRestriction *joinRestriction = (JoinRestriction *) lfirst( @@ -799,20 +796,33 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( continue; } - if (ShouldRecurseForRecurringTuplesJoinChecks(outerrel) && - RelationInfoContainsRecurringTuples(plannerInfo, outerrel, &recurType)) + + /* + * If the outer side of the join doesn't have any distributed tables + * (e.g., contains only recurring tuples), Citus should not pushdown + * the query. The reason is that recurring tuples on every shard would + * be added to the result, which is wrong. + */ + if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, outerrel)) { + recurType = FetchFirstRecurType(plannerInfo, outerrel); + break; } } else if (joinType == JOIN_FULL) { - if ((ShouldRecurseForRecurringTuplesJoinChecks(innerrel) && - RelationInfoContainsRecurringTuples(plannerInfo, innerrel, - &recurType)) || - (ShouldRecurseForRecurringTuplesJoinChecks(outerrel) && - RelationInfoContainsRecurringTuples(plannerInfo, outerrel, &recurType))) + if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, innerrel)) { + recurType = FetchFirstRecurType(plannerInfo, innerrel); + + break; + } + + if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, outerrel)) + { + recurType = FetchFirstRecurType(plannerInfo, outerrel); + break; } } @@ -846,6 +856,7 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( "Complex subqueries and CTEs cannot be in the outer " "part of the outer join", NULL); } + return NULL; } @@ -1071,7 +1082,8 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) * Extract all range table indexes from the join tree. Note that sub-queries * that get pulled up by PostgreSQL don't appear in this join tree. */ - ExtractRangeTableIndexWalker((Node *) queryTree->jointree, &joinTreeTableIndexList); + ExtractRangeTableIndexWalker((Node *) queryTree->jointree, + &joinTreeTableIndexList); foreach_int(joinTreeTableIndex, joinTreeTableIndexList) { @@ -1180,13 +1192,15 @@ DeferErrorIfUnsupportedUnionQuery(Query *subqueryTree) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot push down this subquery", - "Intersect and Except are currently unsupported", NULL); + "Intersect and Except are currently unsupported", + NULL); } if (IsA(leftArg, RangeTblRef)) { leftArgRTI = ((RangeTblRef *) leftArg)->rtindex; - Query *leftArgSubquery = rt_fetch(leftArgRTI, subqueryTree->rtable)->subquery; + Query *leftArgSubquery = rt_fetch(leftArgRTI, + subqueryTree->rtable)->subquery; recurType = FromClauseRecurringTupleType(leftArgSubquery); if (recurType != RECURRING_TUPLES_INVALID) { @@ -1260,81 +1274,14 @@ ExtractSetOperationStatmentWalker(Node *node, List **setOperationList) (*setOperationList) = lappend(*setOperationList, setOperation); } - bool walkerResult = expression_tree_walker(node, ExtractSetOperationStatmentWalker, + bool walkerResult = expression_tree_walker(node, + ExtractSetOperationStatmentWalker, setOperationList); return walkerResult; } -/* - * ShouldRecurseForRecurringTuplesJoinChecks is a helper function for deciding - * on whether the input relOptInfo should be checked for table expressions that - * generate the same tuples in every query on a shard. We use this to avoid - * redundant checks and false positives in complex join trees. - */ -static bool -ShouldRecurseForRecurringTuplesJoinChecks(RelOptInfo *relOptInfo) -{ - bool shouldRecurse = true; - - /* - * We shouldn't recursively go down for joins since we're already - * going to process each join seperately. Otherwise we'd restrict - * the coverage. See the below sketch where (h) denotes a hash - * distributed relation, (r) denotes a reference table, (L) denotes - * LEFT JOIN and (I) denotes INNER JOIN. If we're to recurse into - * the inner join, we'd be preventing to push down the following - * join tree, which is actually safe to push down. - * - * (L) - * / \ - * (I) h - * / \ - * r h - */ - if (relOptInfo->reloptkind == RELOPT_JOINREL) - { - return false; - } - - /* - * Note that we treat the same query where relations appear in subqueries - * differently. (i.e., use SELECT * FROM r; instead of r) - * - * In that case, to relax some restrictions, we do the following optimization: - * If the subplan (i.e., plannerInfo corresponding to the subquery) contains any - * joins, we skip reference table checks keeping in mind that the join is already - * going to be processed seperately. This optimization should suffice for many - * use cases. - */ - if (relOptInfo->reloptkind == RELOPT_BASEREL && relOptInfo->subroot != NULL) - { - PlannerInfo *subroot = relOptInfo->subroot; - - if (list_length(subroot->join_rel_list) > 0) - { - RelOptInfo *subqueryJoin = linitial(subroot->join_rel_list); - - /* - * Subqueries without relations (e.g. SELECT 1) are a little funny. - * They are treated as having a join, but the join is between 0 - * relations and won't be in the join restriction list and therefore - * won't be revisited in DeferredErrorIfUnsupportedRecurringTuplesJoin. - * - * We therefore only skip joins with >0 relations. - */ - if (bms_num_members(subqueryJoin->relids) > 0) - { - shouldRecurse = false; - } - } - } - - return shouldRecurse; -} - - /* * RelationInfoContainsOnlyRecurringTuples returns false if any of the relations in * a RelOptInfo is not recurring. @@ -1370,19 +1317,20 @@ RelationInfoContainsOnlyRecurringTuples(PlannerInfo *plannerInfo, /* - * RelationInfoContainsRecurringTuples checks whether the relationInfo + * FetchFirstRecurType checks whether the relationInfo * contains any recurring table expression, namely a reference table, - * or immutable function. If found, RelationInfoContainsRecurringTuples + * or immutable function. If found, FetchFirstRecurType * returns true. * * Note that since relation ids of relationInfo indexes to the range * table entry list of planner info, planner info is also passed. */ -static bool -RelationInfoContainsRecurringTuples(PlannerInfo *plannerInfo, RelOptInfo *relationInfo, - RecurringTuplesType *recurType) +static RecurringTuplesType +FetchFirstRecurType(PlannerInfo *plannerInfo, RelOptInfo * + relationInfo) { Relids relids = relationInfo->relids; + RecurringTuplesType recurType = RECURRING_TUPLES_INVALID; int relationId = -1; while ((relationId = bms_next_member(relids, relationId)) >= 0) @@ -1390,13 +1338,13 @@ RelationInfoContainsRecurringTuples(PlannerInfo *plannerInfo, RelOptInfo *relati RangeTblEntry *rangeTableEntry = plannerInfo->simple_rte_array[relationId]; /* relationInfo has this range table entry */ - if (ContainsRecurringRTE(rangeTableEntry, recurType)) + if (ContainsRecurringRTE(rangeTableEntry, &recurType)) { - return true; + return recurType; } } - return false; + return recurType; } diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 0a743e9ab..a40a6221c 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -189,8 +189,13 @@ SELECT subquery_1.user_id ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1; -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join + user_id +--------------------------------------------------------------------- + 1 + 3 + +(3 rows) + -- should be able to pushdown since reference table is in the -- inner part of the left join SELECT @@ -646,8 +651,15 @@ GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. @@ -1823,6 +1835,110 @@ LIMIT 5; 6 (1 row) +-- outer part of the LEFT JOIN consists only reference tables, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id > 5); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 5); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- outer part of the LEFT JOIN consists only reference tables within a subquery, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id > 5); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 19); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- one example where unsupported outer join is deep inside a subquery +SELECT *, random() FROM ( +SELECT *,random() FROM user_buy_test_table WHERE user_id > ( +SELECT count(*) FROM (SELECT *,random() FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as bar) as foo LEFT JOIN (SELECT *, random() FROM (SELECT *,random() FROM user_buy_test_table d1 JOIN user_buy_test_table d2 USING (user_id)) as bar_inner ) as bar ON true)) as boo; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- In theory, we should be able to pushdown this query +-- however, as the LEFT JOIN condition is between a reference table and the distributed table +-- Postgres generates a LEFT JOIN alternative among those tables +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: Complex subqueries and CTEs cannot be in the outer part of the outer join +-- same as the above query, but this time LEFT JOIN condition is between distributed tables +-- so Postgres doesn't generate join restriction between reference and distributed tables +SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.user_id = user_buy_test_table.user_id); + count +--------------------------------------------------------------------- + 3 +(1 row) + +-- again, in theory should be OK to pushdown but +-- Postgres generates join restriction between reference and distributed tables +-- in one of the cases +SELECT count(*) FROM user_buy_test_table a LEFT JOIN users_ref_test_table b ON (true) RIGHT JOIN users_ref_test_table c ON (b.id = c.id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down +SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: Complex subqueries and CTEs cannot be in the outer part of the outer join +-- should be fine as OUTER part is the distributed table +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON true; + count +--------------------------------------------------------------------- + 24 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON false; + count +--------------------------------------------------------------------- + 4 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON (ref1.id = user_id); + count +--------------------------------------------------------------------- + 4 +(1 row) + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index ec37df5ae..2704e37db 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -1329,6 +1329,57 @@ JOIN ORDER BY 1 LIMIT 5; +-- outer part of the LEFT JOIN consists only reference tables, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id > 5); +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 5); +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); + + +-- outer part of the LEFT JOIN consists only reference tables within a subquery, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id > 5); +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 19); +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); + +-- one example where unsupported outer join is deep inside a subquery +SELECT *, random() FROM ( +SELECT *,random() FROM user_buy_test_table WHERE user_id > ( +SELECT count(*) FROM (SELECT *,random() FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as bar) as foo LEFT JOIN (SELECT *, random() FROM (SELECT *,random() FROM user_buy_test_table d1 JOIN user_buy_test_table d2 USING (user_id)) as bar_inner ) as bar ON true)) as boo; + +-- In theory, we should be able to pushdown this query +-- however, as the LEFT JOIN condition is between a reference table and the distributed table +-- Postgres generates a LEFT JOIN alternative among those tables +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); + +-- same as the above query, but this time LEFT JOIN condition is between distributed tables +-- so Postgres doesn't generate join restriction between reference and distributed tables +SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.user_id = user_buy_test_table.user_id); + +-- again, in theory should be OK to pushdown but +-- Postgres generates join restriction between reference and distributed tables +-- in one of the cases +SELECT count(*) FROM user_buy_test_table a LEFT JOIN users_ref_test_table b ON (true) RIGHT JOIN users_ref_test_table c ON (b.id = c.id); + +-- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down +SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; + +-- should be fine as OUTER part is the distributed table +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON (ref1.id = user_id); + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; From 15e724c073b3990aee43a2c2d9e3e81ee4819dec Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Thu, 8 Oct 2020 14:45:47 +0200 Subject: [PATCH 2/2] Add regression tests for outer/cross JOINs --- .../planner/query_pushdown_planning.c | 12 + src/test/regress/expected/cross_join.out | 253 ++++++++++++++++++ ...ulti_subquery_complex_reference_clause.out | 132 ++++++++- .../regress/expected/sqlancer_failures.out | 43 +++ src/test/regress/multi_schedule | 2 +- src/test/regress/sql/cross_join.sql | 131 +++++++++ ...ulti_subquery_complex_reference_clause.sql | 101 ++++++- src/test/regress/sql/sqlancer_failures.sql | 19 ++ 8 files changed, 681 insertions(+), 12 deletions(-) create mode 100644 src/test/regress/expected/cross_join.out create mode 100644 src/test/regress/sql/cross_join.sql diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index 4bc5589f8..863933759 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -805,6 +805,10 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( */ if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, outerrel)) { + /* + * Find the first (or only) recurring RTE to give a meaningful + * error to the user. + */ recurType = FetchFirstRecurType(plannerInfo, outerrel); break; @@ -814,6 +818,10 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( { if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, innerrel)) { + /* + * Find the first (or only) recurring RTE to give a meaningful + * error to the user. + */ recurType = FetchFirstRecurType(plannerInfo, innerrel); break; @@ -821,6 +829,10 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, outerrel)) { + /* + * Find the first (or only) recurring RTE to give a meaningful + * error to the user. + */ recurType = FetchFirstRecurType(plannerInfo, outerrel); break; diff --git a/src/test/regress/expected/cross_join.out b/src/test/regress/expected/cross_join.out new file mode 100644 index 000000000..8ced8541d --- /dev/null +++ b/src/test/regress/expected/cross_join.out @@ -0,0 +1,253 @@ +-- this test file relies on multi_behavioral_analytics_create_table +-- and aims to have variety of tests covering CROSS JOINs +-- "t1 CROSS JOIN t2" is equivalent of "t1 JOIN t2 ON true" +-- a distributed table can be cross joined with a reference table +-- and the CROSS JOIN can be in the outer part of an outer JOIN +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 LEFT JOIN users_table u ON (e2.user_id = u.user_id); + count +--------------------------------------------------------------------- + 176649 +(1 row) + +-- two distributed tables cannot be cross joined +-- as it lacks distribution key equality +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u; +ERROR: cannot perform distributed planning on this query +DETAIL: Cartesian products are currently unsupported +SELECT count(*) FROM events_reference_table e1, events_table e2, users_table u; +ERROR: cannot perform distributed planning on this query +DETAIL: Cartesian products are currently unsupported +-- we can provide the distribution key equality via WHERE clause +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u WHERE u.user_id = e2.user_id; + count +--------------------------------------------------------------------- + 176649 +(1 row) + +-- two reference tables are JOINed, and later CROSS JOINed with a distributed table +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 RIGHT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 3636 +(1 row) + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON false; + count +--------------------------------------------------------------------- + 101 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON false; + count +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON true; + count +--------------------------------------------------------------------- + 3636 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON true; + count +--------------------------------------------------------------------- + 3636 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON (ref1.id = users_table.user_id); + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON (ref1.id = users_table.user_id); + count +--------------------------------------------------------------------- + 606 +(1 row) + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- but the reference table CROSS JOIN is in the outer side of the JOIN with the distributed table +-- so we cannot pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table ON (ref1.id = users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id != users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id > 0); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (users_table.user_id > 0); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on distribution keys +-- so safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on reference table column +-- so not safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (ref2.id = u2.user_id); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (ref2.id = u2.user_id); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (ref2.id = u2.user_id); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- via repartitioning, Citus can handle this query as the result of "u1 CROSS JOIN ref2" +-- can be repartitioned on ref2.id +Set citus.enable_repartition_joins to on; +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (ref2.id = u2.user_id); + count +--------------------------------------------------------------------- + 10201 +(1 row) + +reset citus.enable_repartition_joins; +-- although the following has the "ref LEFT JOIN dist" type of query, the LEFT JOIN is eliminated by Postgres +-- because the INNER JOIN eliminates the LEFT JOIN +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) JOIN users_table u2 ON (u2.user_id = users_table.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +-- this is the same query as the above, but this time the outer query is also LEFT JOIN, meaning that Postgres +-- cannot eliminate the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) LEFT JOIN users_table u2 ON (u2.user_id = users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- cross join that goes through non-colocated subquery logic +-- for the "events_table" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + CROSS JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- cross join that goes through non-colocated subquery logic +-- for the "events_all" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + CROSS JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- cross join is between a reference table and distributed table, and +-- deep inside a subquery. The subquery can be in the outer part of the LEFT JOIN +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + CROSS JOIN users_table + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | +(5 rows) + diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index a40a6221c..9329e2df4 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -660,6 +660,132 @@ LIMIT 5; 105 | 1 (5 rows) +-- should be fine even if the tables are deep inside subqueries +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- should be fine with FULL join as well +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + FULL JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- two levels of "(ref_table JOIN dist_table) LEFT JOIN" +-- should be fine as well +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | +(5 rows) + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in WHERE clause +SELECT count(*) +FROM events_table +WHERE user_id IN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id); + count +--------------------------------------------------------------------- + 101 +(1 row) + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in the outer part of a JOIN +SELECT count(*) +FROM users_table +RIGHT JOIN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id) AS foo USING (user_id); + count +--------------------------------------------------------------------- + 12664199 +(1 row) + -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. @@ -1910,12 +2036,6 @@ SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER 3 (1 row) --- again, in theory should be OK to pushdown but --- Postgres generates join restriction between reference and distributed tables --- in one of the cases -SELECT count(*) FROM user_buy_test_table a LEFT JOIN users_ref_test_table b ON (true) RIGHT JOIN users_ref_test_table c ON (b.id = c.id); -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join -- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; ERROR: cannot pushdown the subquery diff --git a/src/test/regress/expected/sqlancer_failures.out b/src/test/regress/expected/sqlancer_failures.out index 84d9ecf10..2cb3de1cd 100644 --- a/src/test/regress/expected/sqlancer_failures.out +++ b/src/test/regress/expected/sqlancer_failures.out @@ -108,5 +108,48 @@ EXPLAIN (COSTS FALSE) SELECT FROM t5 WHERE c0 = CASE WHEN 2 BETWEEN 1 AND 3 THEN Filter: (c0 = 2) (7 rows) +CREATE TABLE IF NOT EXISTS t6(c0 TEXT CHECK (TRUE), c1 money ) WITH (autovacuum_vacuum_threshold=1180014707, autovacuum_freeze_table_age=13771154, autovacuum_vacuum_cost_delay=23, autovacuum_analyze_threshold=1935153914, autovacuum_freeze_min_age=721733768, autovacuum_enabled=0, autovacuum_vacuum_cost_limit=9983); +CREATE UNLOGGED TABLE IF NOT EXISTS t7(LIKE t6); +CREATE TABLE t8(LIKE t6 INCLUDING INDEXES); +CREATE UNLOGGED TABLE t9(LIKE t6 EXCLUDING STATISTICS); +CREATE TABLE t10(LIKE t7); +SELECT create_distributed_table('t6', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE t6 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t7'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('t8', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE t8 ALTER COLUMN c0 SET NOT NULL; +SELECT create_distributed_table('t9', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE t9 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t10'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM ( +SELECT ALL t7.c1, t7.c0, t8.c1, t10.c1, t8.c0 FROM t7 CROSS JOIN t10 FULL OUTER JOIN t8 ON (((((((('[832125354,1134163512)'::int4range)*('(0,2106623281)'::int4range)))-('(-600267905,509840582]'::int4range)))*('(-365203965,1662828182)'::int4range)))&<((((((('(-1286467417,697584012]'::int4range)*('[-1691485781,1341103963)'::int4range)))*((('(-1768368435,1719707648)'::int4range)*('(139536997,1275813540]'::int4range)))))*((((('[-2103910157,-1961746758)'::int4range)*('[-834534078,533073939)'::int4range)))*((('[-1030552151,552856781]'::int4range)*('[-1109419376,1205173697]'::int4range)))))))) +) AS foo; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join SET client_min_messages TO WARNING; DROP SCHEMA sqlancer_failures CASCADE; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 4a772fe0a..a20731e03 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -88,7 +88,7 @@ test: subquery_prepared_statements pg12 cte_inline pg13 # ---------- test: multi_deparse_shard_query multi_distributed_transaction_id intermediate_results limit_intermediate_size rollback_to_savepoint test: multi_explain hyperscale_tutorial partitioned_intermediate_results distributed_intermediate_results multi_real_time_transaction -test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics +test: multi_basic_queries cross_join multi_complex_expressions multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql test: sql_procedure multi_function_in_join row_types materialized_view undistribute_table test: multi_subquery_in_where_reference_clause full_join adaptive_executor propagate_set_commands diff --git a/src/test/regress/sql/cross_join.sql b/src/test/regress/sql/cross_join.sql new file mode 100644 index 000000000..c301e753b --- /dev/null +++ b/src/test/regress/sql/cross_join.sql @@ -0,0 +1,131 @@ +-- this test file relies on multi_behavioral_analytics_create_table +-- and aims to have variety of tests covering CROSS JOINs +-- "t1 CROSS JOIN t2" is equivalent of "t1 JOIN t2 ON true" + +-- a distributed table can be cross joined with a reference table +-- and the CROSS JOIN can be in the outer part of an outer JOIN +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 LEFT JOIN users_table u ON (e2.user_id = u.user_id); + +-- two distributed tables cannot be cross joined +-- as it lacks distribution key equality +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u; +SELECT count(*) FROM events_reference_table e1, events_table e2, users_table u; + +-- we can provide the distribution key equality via WHERE clause +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u WHERE u.user_id = e2.user_id; + + +-- two reference tables are JOINed, and later CROSS JOINed with a distributed table +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; +SELECT count(*) FROM users_ref_test_table ref1 RIGHT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 CROSS JOIN users_table; + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON (ref1.id = users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON (ref1.id = users_table.user_id); + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- but the reference table CROSS JOIN is in the outer side of the JOIN with the distributed table +-- so we cannot pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table ON (ref1.id = users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id != users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id > 0); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (users_table.user_id > 0); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON false; + +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on distribution keys +-- so safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (u1.user_id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (u1.user_id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (u1.user_id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (u1.user_id = u2.user_id); + +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on reference table column +-- so not safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (ref2.id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (ref2.id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (ref2.id = u2.user_id); + +-- via repartitioning, Citus can handle this query as the result of "u1 CROSS JOIN ref2" +-- can be repartitioned on ref2.id +Set citus.enable_repartition_joins to on; +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (ref2.id = u2.user_id); +reset citus.enable_repartition_joins; + +-- although the following has the "ref LEFT JOIN dist" type of query, the LEFT JOIN is eliminated by Postgres +-- because the INNER JOIN eliminates the LEFT JOIN +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) JOIN users_table u2 ON (u2.user_id = users_table.user_id); + +-- this is the same query as the above, but this time the outer query is also LEFT JOIN, meaning that Postgres +-- cannot eliminate the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) LEFT JOIN users_table u2 ON (u2.user_id = users_table.user_id); + +-- cross join that goes through non-colocated subquery logic +-- for the "events_table" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + CROSS JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + +-- cross join that goes through non-colocated subquery logic +-- for the "events_all" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + CROSS JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + + +-- cross join is between a reference table and distributed table, and +-- deep inside a subquery. The subquery can be in the outer part of the LEFT JOIN +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + CROSS JOIN users_table + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + + diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index 2704e37db..b8451f559 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -401,6 +401,102 @@ ORDER BY 1 DESC, 2 DESC LIMIT 5; +-- should be fine even if the tables are deep inside subqueries +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + +-- should be fine with FULL join as well +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + FULL JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + +-- two levels of "(ref_table JOIN dist_table) LEFT JOIN" +-- should be fine as well +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in WHERE clause +SELECT count(*) +FROM events_table +WHERE user_id IN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id); + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in the outer part of a JOIN + +SELECT count(*) +FROM users_table +RIGHT JOIN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id) AS foo USING (user_id); + -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; SELECT user_id, lastseen @@ -1367,11 +1463,6 @@ SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INN -- so Postgres doesn't generate join restriction between reference and distributed tables SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.user_id = user_buy_test_table.user_id); --- again, in theory should be OK to pushdown but --- Postgres generates join restriction between reference and distributed tables --- in one of the cases -SELECT count(*) FROM user_buy_test_table a LEFT JOIN users_ref_test_table b ON (true) RIGHT JOIN users_ref_test_table c ON (b.id = c.id); - -- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; diff --git a/src/test/regress/sql/sqlancer_failures.sql b/src/test/regress/sql/sqlancer_failures.sql index 0555af383..57531ea84 100644 --- a/src/test/regress/sql/sqlancer_failures.sql +++ b/src/test/regress/sql/sqlancer_failures.sql @@ -39,5 +39,24 @@ INSERT INTO t5 VALUES (CASE WHEN 2 BETWEEN 1 AND 3 THEN 2 ELSE 1 END); EXPLAIN (COSTS FALSE) SELECT FROM t5 WHERE c0 = 2; EXPLAIN (COSTS FALSE) SELECT FROM t5 WHERE c0 = CASE WHEN 2 BETWEEN 1 AND 3 THEN 2 ELSE 1 END; +CREATE TABLE IF NOT EXISTS t6(c0 TEXT CHECK (TRUE), c1 money ) WITH (autovacuum_vacuum_threshold=1180014707, autovacuum_freeze_table_age=13771154, autovacuum_vacuum_cost_delay=23, autovacuum_analyze_threshold=1935153914, autovacuum_freeze_min_age=721733768, autovacuum_enabled=0, autovacuum_vacuum_cost_limit=9983); +CREATE UNLOGGED TABLE IF NOT EXISTS t7(LIKE t6); +CREATE TABLE t8(LIKE t6 INCLUDING INDEXES); +CREATE UNLOGGED TABLE t9(LIKE t6 EXCLUDING STATISTICS); +CREATE TABLE t10(LIKE t7); + +SELECT create_distributed_table('t6', 'c0'); +ALTER TABLE t6 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t7'); +SELECT create_distributed_table('t8', 'c0'); +ALTER TABLE t8 ALTER COLUMN c0 SET NOT NULL; +SELECT create_distributed_table('t9', 'c0'); +ALTER TABLE t9 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t10'); + +SELECT count(*) FROM ( +SELECT ALL t7.c1, t7.c0, t8.c1, t10.c1, t8.c0 FROM t7 CROSS JOIN t10 FULL OUTER JOIN t8 ON (((((((('[832125354,1134163512)'::int4range)*('(0,2106623281)'::int4range)))-('(-600267905,509840582]'::int4range)))*('(-365203965,1662828182)'::int4range)))&<((((((('(-1286467417,697584012]'::int4range)*('[-1691485781,1341103963)'::int4range)))*((('(-1768368435,1719707648)'::int4range)*('(139536997,1275813540]'::int4range)))))*((((('[-2103910157,-1961746758)'::int4range)*('[-834534078,533073939)'::int4range)))*((('[-1030552151,552856781]'::int4range)*('[-1109419376,1205173697]'::int4range)))))))) +) AS foo; + SET client_min_messages TO WARNING; DROP SCHEMA sqlancer_failures CASCADE;