diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index b26b4053f..863933759 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -79,10 +79,8 @@ static DeferredErrorMessage * DeferredErrorIfUnsupportedRecurringTuplesJoin( PlannerRestrictionContext *plannerRestrictionContext); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); -static bool ShouldRecurseForRecurringTuplesJoinChecks(RelOptInfo *relOptInfo); -static bool RelationInfoContainsRecurringTuples(PlannerInfo *plannerInfo, - RelOptInfo *relationInfo, - RecurringTuplesType *recurType); +static RecurringTuplesType FetchFirstRecurType(PlannerInfo *plannerInfo, RelOptInfo * + relationInfo); static bool ContainsRecurringRTE(RangeTblEntry *rangeTableEntry, RecurringTuplesType *recurType); static bool ContainsRecurringRangeTable(List *rangeTable, RecurringTuplesType *recurType); @@ -776,7 +774,6 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( plannerRestrictionContext->joinRestrictionContext->joinRestrictionList; ListCell *joinRestrictionCell = NULL; RecurringTuplesType recurType = RECURRING_TUPLES_INVALID; - foreach(joinRestrictionCell, joinRestrictionList) { JoinRestriction *joinRestriction = (JoinRestriction *) lfirst( @@ -799,20 +796,45 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( continue; } - if (ShouldRecurseForRecurringTuplesJoinChecks(outerrel) && - RelationInfoContainsRecurringTuples(plannerInfo, outerrel, &recurType)) + + /* + * If the outer side of the join doesn't have any distributed tables + * (e.g., contains only recurring tuples), Citus should not pushdown + * the query. The reason is that recurring tuples on every shard would + * be added to the result, which is wrong. + */ + if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, outerrel)) { + /* + * Find the first (or only) recurring RTE to give a meaningful + * error to the user. + */ + recurType = FetchFirstRecurType(plannerInfo, outerrel); + break; } } else if (joinType == JOIN_FULL) { - if ((ShouldRecurseForRecurringTuplesJoinChecks(innerrel) && - RelationInfoContainsRecurringTuples(plannerInfo, innerrel, - &recurType)) || - (ShouldRecurseForRecurringTuplesJoinChecks(outerrel) && - RelationInfoContainsRecurringTuples(plannerInfo, outerrel, &recurType))) + if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, innerrel)) { + /* + * Find the first (or only) recurring RTE to give a meaningful + * error to the user. + */ + recurType = FetchFirstRecurType(plannerInfo, innerrel); + + break; + } + + if (RelationInfoContainsOnlyRecurringTuples(plannerInfo, outerrel)) + { + /* + * Find the first (or only) recurring RTE to give a meaningful + * error to the user. + */ + recurType = FetchFirstRecurType(plannerInfo, outerrel); + break; } } @@ -846,6 +868,7 @@ DeferredErrorIfUnsupportedRecurringTuplesJoin( "Complex subqueries and CTEs cannot be in the outer " "part of the outer join", NULL); } + return NULL; } @@ -1071,7 +1094,8 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) * Extract all range table indexes from the join tree. Note that sub-queries * that get pulled up by PostgreSQL don't appear in this join tree. */ - ExtractRangeTableIndexWalker((Node *) queryTree->jointree, &joinTreeTableIndexList); + ExtractRangeTableIndexWalker((Node *) queryTree->jointree, + &joinTreeTableIndexList); foreach_int(joinTreeTableIndex, joinTreeTableIndexList) { @@ -1180,13 +1204,15 @@ DeferErrorIfUnsupportedUnionQuery(Query *subqueryTree) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot push down this subquery", - "Intersect and Except are currently unsupported", NULL); + "Intersect and Except are currently unsupported", + NULL); } if (IsA(leftArg, RangeTblRef)) { leftArgRTI = ((RangeTblRef *) leftArg)->rtindex; - Query *leftArgSubquery = rt_fetch(leftArgRTI, subqueryTree->rtable)->subquery; + Query *leftArgSubquery = rt_fetch(leftArgRTI, + subqueryTree->rtable)->subquery; recurType = FromClauseRecurringTupleType(leftArgSubquery); if (recurType != RECURRING_TUPLES_INVALID) { @@ -1260,81 +1286,14 @@ ExtractSetOperationStatmentWalker(Node *node, List **setOperationList) (*setOperationList) = lappend(*setOperationList, setOperation); } - bool walkerResult = expression_tree_walker(node, ExtractSetOperationStatmentWalker, + bool walkerResult = expression_tree_walker(node, + ExtractSetOperationStatmentWalker, setOperationList); return walkerResult; } -/* - * ShouldRecurseForRecurringTuplesJoinChecks is a helper function for deciding - * on whether the input relOptInfo should be checked for table expressions that - * generate the same tuples in every query on a shard. We use this to avoid - * redundant checks and false positives in complex join trees. - */ -static bool -ShouldRecurseForRecurringTuplesJoinChecks(RelOptInfo *relOptInfo) -{ - bool shouldRecurse = true; - - /* - * We shouldn't recursively go down for joins since we're already - * going to process each join seperately. Otherwise we'd restrict - * the coverage. See the below sketch where (h) denotes a hash - * distributed relation, (r) denotes a reference table, (L) denotes - * LEFT JOIN and (I) denotes INNER JOIN. If we're to recurse into - * the inner join, we'd be preventing to push down the following - * join tree, which is actually safe to push down. - * - * (L) - * / \ - * (I) h - * / \ - * r h - */ - if (relOptInfo->reloptkind == RELOPT_JOINREL) - { - return false; - } - - /* - * Note that we treat the same query where relations appear in subqueries - * differently. (i.e., use SELECT * FROM r; instead of r) - * - * In that case, to relax some restrictions, we do the following optimization: - * If the subplan (i.e., plannerInfo corresponding to the subquery) contains any - * joins, we skip reference table checks keeping in mind that the join is already - * going to be processed seperately. This optimization should suffice for many - * use cases. - */ - if (relOptInfo->reloptkind == RELOPT_BASEREL && relOptInfo->subroot != NULL) - { - PlannerInfo *subroot = relOptInfo->subroot; - - if (list_length(subroot->join_rel_list) > 0) - { - RelOptInfo *subqueryJoin = linitial(subroot->join_rel_list); - - /* - * Subqueries without relations (e.g. SELECT 1) are a little funny. - * They are treated as having a join, but the join is between 0 - * relations and won't be in the join restriction list and therefore - * won't be revisited in DeferredErrorIfUnsupportedRecurringTuplesJoin. - * - * We therefore only skip joins with >0 relations. - */ - if (bms_num_members(subqueryJoin->relids) > 0) - { - shouldRecurse = false; - } - } - } - - return shouldRecurse; -} - - /* * RelationInfoContainsOnlyRecurringTuples returns false if any of the relations in * a RelOptInfo is not recurring. @@ -1370,19 +1329,20 @@ RelationInfoContainsOnlyRecurringTuples(PlannerInfo *plannerInfo, /* - * RelationInfoContainsRecurringTuples checks whether the relationInfo + * FetchFirstRecurType checks whether the relationInfo * contains any recurring table expression, namely a reference table, - * or immutable function. If found, RelationInfoContainsRecurringTuples + * or immutable function. If found, FetchFirstRecurType * returns true. * * Note that since relation ids of relationInfo indexes to the range * table entry list of planner info, planner info is also passed. */ -static bool -RelationInfoContainsRecurringTuples(PlannerInfo *plannerInfo, RelOptInfo *relationInfo, - RecurringTuplesType *recurType) +static RecurringTuplesType +FetchFirstRecurType(PlannerInfo *plannerInfo, RelOptInfo * + relationInfo) { Relids relids = relationInfo->relids; + RecurringTuplesType recurType = RECURRING_TUPLES_INVALID; int relationId = -1; while ((relationId = bms_next_member(relids, relationId)) >= 0) @@ -1390,13 +1350,13 @@ RelationInfoContainsRecurringTuples(PlannerInfo *plannerInfo, RelOptInfo *relati RangeTblEntry *rangeTableEntry = plannerInfo->simple_rte_array[relationId]; /* relationInfo has this range table entry */ - if (ContainsRecurringRTE(rangeTableEntry, recurType)) + if (ContainsRecurringRTE(rangeTableEntry, &recurType)) { - return true; + return recurType; } } - return false; + return recurType; } diff --git a/src/test/regress/expected/cross_join.out b/src/test/regress/expected/cross_join.out new file mode 100644 index 000000000..8ced8541d --- /dev/null +++ b/src/test/regress/expected/cross_join.out @@ -0,0 +1,253 @@ +-- this test file relies on multi_behavioral_analytics_create_table +-- and aims to have variety of tests covering CROSS JOINs +-- "t1 CROSS JOIN t2" is equivalent of "t1 JOIN t2 ON true" +-- a distributed table can be cross joined with a reference table +-- and the CROSS JOIN can be in the outer part of an outer JOIN +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 LEFT JOIN users_table u ON (e2.user_id = u.user_id); + count +--------------------------------------------------------------------- + 176649 +(1 row) + +-- two distributed tables cannot be cross joined +-- as it lacks distribution key equality +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u; +ERROR: cannot perform distributed planning on this query +DETAIL: Cartesian products are currently unsupported +SELECT count(*) FROM events_reference_table e1, events_table e2, users_table u; +ERROR: cannot perform distributed planning on this query +DETAIL: Cartesian products are currently unsupported +-- we can provide the distribution key equality via WHERE clause +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u WHERE u.user_id = e2.user_id; + count +--------------------------------------------------------------------- + 176649 +(1 row) + +-- two reference tables are JOINed, and later CROSS JOINed with a distributed table +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 RIGHT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 CROSS JOIN users_table; + count +--------------------------------------------------------------------- + 3636 +(1 row) + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON false; + count +--------------------------------------------------------------------- + 101 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON false; + count +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON true; + count +--------------------------------------------------------------------- + 3636 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON true; + count +--------------------------------------------------------------------- + 3636 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON (ref1.id = users_table.user_id); + count +--------------------------------------------------------------------- + 606 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON (ref1.id = users_table.user_id); + count +--------------------------------------------------------------------- + 606 +(1 row) + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- but the reference table CROSS JOIN is in the outer side of the JOIN with the distributed table +-- so we cannot pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table ON (ref1.id = users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id != users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id > 0); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (users_table.user_id > 0); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on distribution keys +-- so safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (u1.user_id = u2.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on reference table column +-- so not safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (ref2.id = u2.user_id); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (ref2.id = u2.user_id); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (ref2.id = u2.user_id); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- via repartitioning, Citus can handle this query as the result of "u1 CROSS JOIN ref2" +-- can be repartitioned on ref2.id +Set citus.enable_repartition_joins to on; +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (ref2.id = u2.user_id); + count +--------------------------------------------------------------------- + 10201 +(1 row) + +reset citus.enable_repartition_joins; +-- although the following has the "ref LEFT JOIN dist" type of query, the LEFT JOIN is eliminated by Postgres +-- because the INNER JOIN eliminates the LEFT JOIN +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) JOIN users_table u2 ON (u2.user_id = users_table.user_id); + count +--------------------------------------------------------------------- + 11802 +(1 row) + +-- this is the same query as the above, but this time the outer query is also LEFT JOIN, meaning that Postgres +-- cannot eliminate the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) LEFT JOIN users_table u2 ON (u2.user_id = users_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- cross join that goes through non-colocated subquery logic +-- for the "events_table" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + CROSS JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- cross join that goes through non-colocated subquery logic +-- for the "events_all" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + CROSS JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- cross join is between a reference table and distributed table, and +-- deep inside a subquery. The subquery can be in the outer part of the LEFT JOIN +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + CROSS JOIN users_table + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | +(5 rows) + diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 0a743e9ab..9329e2df4 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -189,8 +189,13 @@ SELECT subquery_1.user_id ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1; -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join + user_id +--------------------------------------------------------------------- + 1 + 3 + +(3 rows) + -- should be able to pushdown since reference table is in the -- inner part of the left join SELECT @@ -646,8 +651,141 @@ GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; -ERROR: cannot pushdown the subquery -DETAIL: There exist a reference table in the outer part of the outer join + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- should be fine even if the tables are deep inside subqueries +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- should be fine with FULL join as well +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + FULL JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + max | usr_id +--------------------------------------------------------------------- + 432 | 2 + 391 | 4 + 364 | 5 + 357 | 3 + 105 | 1 +(5 rows) + +-- two levels of "(ref_table JOIN dist_table) LEFT JOIN" +-- should be fine as well +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | + 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | +(5 rows) + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in WHERE clause +SELECT count(*) +FROM events_table +WHERE user_id IN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id); + count +--------------------------------------------------------------------- + 101 +(1 row) + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in the outer part of a JOIN +SELECT count(*) +FROM users_table +RIGHT JOIN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id) AS foo USING (user_id); + count +--------------------------------------------------------------------- + 12664199 +(1 row) + -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. @@ -1823,6 +1961,104 @@ LIMIT 5; 6 (1 row) +-- outer part of the LEFT JOIN consists only reference tables, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id > 5); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 5); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- outer part of the LEFT JOIN consists only reference tables within a subquery, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id > 5); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 19); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- one example where unsupported outer join is deep inside a subquery +SELECT *, random() FROM ( +SELECT *,random() FROM user_buy_test_table WHERE user_id > ( +SELECT count(*) FROM (SELECT *,random() FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as bar) as foo LEFT JOIN (SELECT *, random() FROM (SELECT *,random() FROM user_buy_test_table d1 JOIN user_buy_test_table d2 USING (user_id)) as bar_inner ) as bar ON true)) as boo; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- In theory, we should be able to pushdown this query +-- however, as the LEFT JOIN condition is between a reference table and the distributed table +-- Postgres generates a LEFT JOIN alternative among those tables +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); +ERROR: cannot pushdown the subquery +DETAIL: Complex subqueries and CTEs cannot be in the outer part of the outer join +-- same as the above query, but this time LEFT JOIN condition is between distributed tables +-- so Postgres doesn't generate join restriction between reference and distributed tables +SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.user_id = user_buy_test_table.user_id); + count +--------------------------------------------------------------------- + 3 +(1 row) + +-- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down +SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; +ERROR: cannot pushdown the subquery +DETAIL: Complex subqueries and CTEs cannot be in the outer part of the outer join +-- should be fine as OUTER part is the distributed table +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON true; + count +--------------------------------------------------------------------- + 24 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON false; + count +--------------------------------------------------------------------- + 4 +(1 row) + +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON (ref1.id = user_id); + count +--------------------------------------------------------------------- + 4 +(1 row) + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/expected/sqlancer_failures.out b/src/test/regress/expected/sqlancer_failures.out index 84d9ecf10..2cb3de1cd 100644 --- a/src/test/regress/expected/sqlancer_failures.out +++ b/src/test/regress/expected/sqlancer_failures.out @@ -108,5 +108,48 @@ EXPLAIN (COSTS FALSE) SELECT FROM t5 WHERE c0 = CASE WHEN 2 BETWEEN 1 AND 3 THEN Filter: (c0 = 2) (7 rows) +CREATE TABLE IF NOT EXISTS t6(c0 TEXT CHECK (TRUE), c1 money ) WITH (autovacuum_vacuum_threshold=1180014707, autovacuum_freeze_table_age=13771154, autovacuum_vacuum_cost_delay=23, autovacuum_analyze_threshold=1935153914, autovacuum_freeze_min_age=721733768, autovacuum_enabled=0, autovacuum_vacuum_cost_limit=9983); +CREATE UNLOGGED TABLE IF NOT EXISTS t7(LIKE t6); +CREATE TABLE t8(LIKE t6 INCLUDING INDEXES); +CREATE UNLOGGED TABLE t9(LIKE t6 EXCLUDING STATISTICS); +CREATE TABLE t10(LIKE t7); +SELECT create_distributed_table('t6', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE t6 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t7'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('t8', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE t8 ALTER COLUMN c0 SET NOT NULL; +SELECT create_distributed_table('t9', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE t9 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t10'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM ( +SELECT ALL t7.c1, t7.c0, t8.c1, t10.c1, t8.c0 FROM t7 CROSS JOIN t10 FULL OUTER JOIN t8 ON (((((((('[832125354,1134163512)'::int4range)*('(0,2106623281)'::int4range)))-('(-600267905,509840582]'::int4range)))*('(-365203965,1662828182)'::int4range)))&<((((((('(-1286467417,697584012]'::int4range)*('[-1691485781,1341103963)'::int4range)))*((('(-1768368435,1719707648)'::int4range)*('(139536997,1275813540]'::int4range)))))*((((('[-2103910157,-1961746758)'::int4range)*('[-834534078,533073939)'::int4range)))*((('[-1030552151,552856781]'::int4range)*('[-1109419376,1205173697]'::int4range)))))))) +) AS foo; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join SET client_min_messages TO WARNING; DROP SCHEMA sqlancer_failures CASCADE; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 4a772fe0a..a20731e03 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -88,7 +88,7 @@ test: subquery_prepared_statements pg12 cte_inline pg13 # ---------- test: multi_deparse_shard_query multi_distributed_transaction_id intermediate_results limit_intermediate_size rollback_to_savepoint test: multi_explain hyperscale_tutorial partitioned_intermediate_results distributed_intermediate_results multi_real_time_transaction -test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics +test: multi_basic_queries cross_join multi_complex_expressions multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql test: sql_procedure multi_function_in_join row_types materialized_view undistribute_table test: multi_subquery_in_where_reference_clause full_join adaptive_executor propagate_set_commands diff --git a/src/test/regress/sql/cross_join.sql b/src/test/regress/sql/cross_join.sql new file mode 100644 index 000000000..c301e753b --- /dev/null +++ b/src/test/regress/sql/cross_join.sql @@ -0,0 +1,131 @@ +-- this test file relies on multi_behavioral_analytics_create_table +-- and aims to have variety of tests covering CROSS JOINs +-- "t1 CROSS JOIN t2" is equivalent of "t1 JOIN t2 ON true" + +-- a distributed table can be cross joined with a reference table +-- and the CROSS JOIN can be in the outer part of an outer JOIN +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 LEFT JOIN users_table u ON (e2.user_id = u.user_id); + +-- two distributed tables cannot be cross joined +-- as it lacks distribution key equality +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u; +SELECT count(*) FROM events_reference_table e1, events_table e2, users_table u; + +-- we can provide the distribution key equality via WHERE clause +SELECT count(*) FROM events_reference_table e1 CROSS JOIN events_table e2 CROSS JOIN users_table u WHERE u.user_id = e2.user_id; + + +-- two reference tables are JOINed, and later CROSS JOINed with a distributed table +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; +SELECT count(*) FROM users_ref_test_table ref1 RIGHT JOIN users_ref_test_table ref2 on ref1.id = ref2.id CROSS JOIN users_table; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 CROSS JOIN users_table; + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- it is safe to pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table ON (ref1.id = users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 JOIN users_table ON (ref1.id = users_table.user_id); + +-- two reference tables CROSS JOINNed, and later JOINED with distributed tables +-- but the reference table CROSS JOIN is in the outer side of the JOIN with the distributed table +-- so we cannot pushdown +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table ON (ref1.id = users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id != users_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id > 0); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (users_table.user_id > 0); +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON false; + +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on distribution keys +-- so safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (u1.user_id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (u1.user_id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (u1.user_id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (u1.user_id = u2.user_id); + +-- a reference tables CROSS JOINed with a distribted table, and later JOINED with distributed tables on reference table column +-- so not safe to pushdown +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table u2 ON (ref2.id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 FULL JOIN users_table u2 ON (ref2.id = u2.user_id); +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 RIGHT JOIN users_table u2 ON (ref2.id = u2.user_id); + +-- via repartitioning, Citus can handle this query as the result of "u1 CROSS JOIN ref2" +-- can be repartitioned on ref2.id +Set citus.enable_repartition_joins to on; +SELECT count(*) FROM users_table u1 CROSS JOIN users_ref_test_table ref2 JOIN users_table u2 ON (ref2.id = u2.user_id); +reset citus.enable_repartition_joins; + +-- although the following has the "ref LEFT JOIN dist" type of query, the LEFT JOIN is eliminated by Postgres +-- because the INNER JOIN eliminates the LEFT JOIN +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) JOIN users_table u2 ON (u2.user_id = users_table.user_id); + +-- this is the same query as the above, but this time the outer query is also LEFT JOIN, meaning that Postgres +-- cannot eliminate the outer join +SELECT count(*) FROM users_ref_test_table ref1 CROSS JOIN users_ref_test_table ref2 LEFT JOIN users_table ON (ref1.id = users_table.user_id) LEFT JOIN users_table u2 ON (u2.user_id = users_table.user_id); + +-- cross join that goes through non-colocated subquery logic +-- for the "events_table" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + CROSS JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + +-- cross join that goes through non-colocated subquery logic +-- for the "events_all" subquery as both distributed tables +-- do not have JOIN on the distribution key +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + CROSS JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + + +-- cross join is between a reference table and distributed table, and +-- deep inside a subquery. The subquery can be in the outer part of the LEFT JOIN +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + CROSS JOIN users_table + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + + diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index ec37df5ae..b8451f559 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -401,6 +401,102 @@ ORDER BY 1 DESC, 2 DESC LIMIT 5; +-- should be fine even if the tables are deep inside subqueries +SELECT max(events_all.cnt), + events_all.usr_id + FROM +(SELECT *, random() FROM + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + +-- should be fine with FULL join as well +SELECT max(events_all.cnt), + events_all.usr_id + FROM events_table + FULL JOIN (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (events_all.usr_id = events_table.user_id) +GROUP BY 2 +ORDER BY 1 DESC, + 2 DESC +LIMIT 5; + +-- two levels of "(ref_table JOIN dist_table) LEFT JOIN" +-- should be fine as well +SELECT + users_table.* +FROM + (SELECT + events_all.*, random() + FROM + events_reference_table JOIN users_table USING(user_id) + JOIN + (SELECT *, random() + FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt + FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table + INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) + GROUP BY users_table.user_id) AS events_all_inner + ) AS events_all ON (user_id = usr_id) + ) AS events_all + LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) + LEFT JOIN users_table USING (user_id) +ORDER BY 1,2,3,4 LIMIT 5; + + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in WHERE clause +SELECT count(*) +FROM events_table +WHERE user_id IN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id); + +-- we should be able to support OUTER joins involving +-- reference tables even if the subquery is in the outer part of a JOIN + +SELECT count(*) +FROM users_table +RIGHT JOIN + (SELECT subquery_1.user_id + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT *, + random() + FROM + (SELECT tt1.user_id, + random() + FROM users_table AS tt1 + JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id) AS foo USING (user_id); + -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; SELECT user_id, lastseen @@ -1329,6 +1425,52 @@ JOIN ORDER BY 1 LIMIT 5; +-- outer part of the LEFT JOIN consists only reference tables, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id > 5); +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 5); +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); +SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); + + +-- outer part of the LEFT JOIN consists only reference tables within a subquery, so we cannot push down +-- we have different combinations for ON condition, true/false/two column join/single column filter +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id > 5); +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 19); +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); + +-- one example where unsupported outer join is deep inside a subquery +SELECT *, random() FROM ( +SELECT *,random() FROM user_buy_test_table WHERE user_id > ( +SELECT count(*) FROM (SELECT *,random() FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as bar) as foo LEFT JOIN (SELECT *, random() FROM (SELECT *,random() FROM user_buy_test_table d1 JOIN user_buy_test_table d2 USING (user_id)) as bar_inner ) as bar ON true)) as boo; + +-- In theory, we should be able to pushdown this query +-- however, as the LEFT JOIN condition is between a reference table and the distributed table +-- Postgres generates a LEFT JOIN alternative among those tables +SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); + +-- same as the above query, but this time LEFT JOIN condition is between distributed tables +-- so Postgres doesn't generate join restriction between reference and distributed tables +SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.user_id = user_buy_test_table.user_id); + +-- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down +SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; + +-- should be fine as OUTER part is the distributed table +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON true; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON false; +SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON (ref1.id = user_id); + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/sqlancer_failures.sql b/src/test/regress/sql/sqlancer_failures.sql index 0555af383..57531ea84 100644 --- a/src/test/regress/sql/sqlancer_failures.sql +++ b/src/test/regress/sql/sqlancer_failures.sql @@ -39,5 +39,24 @@ INSERT INTO t5 VALUES (CASE WHEN 2 BETWEEN 1 AND 3 THEN 2 ELSE 1 END); EXPLAIN (COSTS FALSE) SELECT FROM t5 WHERE c0 = 2; EXPLAIN (COSTS FALSE) SELECT FROM t5 WHERE c0 = CASE WHEN 2 BETWEEN 1 AND 3 THEN 2 ELSE 1 END; +CREATE TABLE IF NOT EXISTS t6(c0 TEXT CHECK (TRUE), c1 money ) WITH (autovacuum_vacuum_threshold=1180014707, autovacuum_freeze_table_age=13771154, autovacuum_vacuum_cost_delay=23, autovacuum_analyze_threshold=1935153914, autovacuum_freeze_min_age=721733768, autovacuum_enabled=0, autovacuum_vacuum_cost_limit=9983); +CREATE UNLOGGED TABLE IF NOT EXISTS t7(LIKE t6); +CREATE TABLE t8(LIKE t6 INCLUDING INDEXES); +CREATE UNLOGGED TABLE t9(LIKE t6 EXCLUDING STATISTICS); +CREATE TABLE t10(LIKE t7); + +SELECT create_distributed_table('t6', 'c0'); +ALTER TABLE t6 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t7'); +SELECT create_distributed_table('t8', 'c0'); +ALTER TABLE t8 ALTER COLUMN c0 SET NOT NULL; +SELECT create_distributed_table('t9', 'c0'); +ALTER TABLE t9 ALTER COLUMN c0 SET NOT NULL; +SELECT create_reference_table('t10'); + +SELECT count(*) FROM ( +SELECT ALL t7.c1, t7.c0, t8.c1, t10.c1, t8.c0 FROM t7 CROSS JOIN t10 FULL OUTER JOIN t8 ON (((((((('[832125354,1134163512)'::int4range)*('(0,2106623281)'::int4range)))-('(-600267905,509840582]'::int4range)))*('(-365203965,1662828182)'::int4range)))&<((((((('(-1286467417,697584012]'::int4range)*('[-1691485781,1341103963)'::int4range)))*((('(-1768368435,1719707648)'::int4range)*('(139536997,1275813540]'::int4range)))))*((((('[-2103910157,-1961746758)'::int4range)*('[-834534078,533073939)'::int4range)))*((('[-1030552151,552856781]'::int4range)*('[-1109419376,1205173697]'::int4range)))))))) +) AS foo; + SET client_min_messages TO WARNING; DROP SCHEMA sqlancer_failures CASCADE;