From 100e5d3196bbe9b252d6ed8d84d68c90666d5d3d Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Tue, 15 Dec 2020 15:23:38 +0100 Subject: [PATCH] Address review feedback --- .../planner/query_pushdown_planning.c | 185 ++++++++++-------- .../distributed/planner/recursive_planning.c | 3 +- src/include/distributed/recursive_planning.h | 3 + .../expected/multi_mx_router_planner.out | 2 +- .../multi_subquery_complex_queries.out | 39 ++-- .../regress/expected/subquery_in_where.out | 180 +++++++++++++++++ .../regress/sql/multi_mx_router_planner.sql | 2 +- .../sql/multi_subquery_complex_queries.sql | 33 ++-- src/test/regress/sql/subquery_in_where.sql | 137 +++++++++++++ 9 files changed, 458 insertions(+), 126 deletions(-) diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index 07bb8a9b2..38cc844ab 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -34,6 +34,7 @@ #include "distributed/pg_dist_partition.h" #include "distributed/query_utils.h" #include "distributed/query_pushdown_planning.h" +#include "distributed/recursive_planning.h" #include "distributed/relation_restriction_equivalence.h" #include "distributed/version_compat.h" #include "nodes/nodeFuncs.h" @@ -78,6 +79,7 @@ static RecurringTuplesType FromClauseRecurringTupleType(Query *queryTree); static DeferredErrorMessage * DeferredErrorIfUnsupportedRecurringTuplesJoin( PlannerRestrictionContext *plannerRestrictionContext); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); +static DeferredErrorMessage * DeferErrorIfSubqueryRequiresMerge(Query *subqueryTree); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static RecurringTuplesType FetchFirstRecurType(PlannerInfo *plannerInfo, Relids relids); @@ -911,7 +913,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi { bool preconditionsSatisfied = true; char *errorDetail = NULL; - StringInfo errorInfo = NULL; DeferredErrorMessage *deferredError = DeferErrorIfUnsupportedTableCombination( subqueryTree); @@ -934,84 +935,12 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi * push down SQL features within such a function, as long as co-located join * checks are applied. */ - if (!contain_vars_of_level((Node *) subqueryTree, 1)) + if (!ContainsReferencesToOuterQuery(subqueryTree)) { - if (subqueryTree->limitOffset) + deferredError = DeferErrorIfSubqueryRequiresMerge(subqueryTree); + if (deferredError) { - preconditionsSatisfied = false; - errorDetail = "Offset clause is currently unsupported when a subquery " - "references a column from another query"; - } - - /* limit is not supported when SubqueryPushdown is not set */ - if (subqueryTree->limitCount && !SubqueryPushdown) - { - preconditionsSatisfied = false; - errorDetail = "Limit in subquery is currently unsupported when a " - "subquery references a column from another query"; - } - - /* group clause list must include partition column */ - if (subqueryTree->groupClause) - { - List *groupClauseList = subqueryTree->groupClause; - List *targetEntryList = subqueryTree->targetList; - List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, - targetEntryList); - bool groupOnPartitionColumn = - TargetListOnPartitionColumn(subqueryTree, groupTargetEntryList); - if (!groupOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Group by list without partition column is currently " - "unsupported when a subquery references a column " - "from another query"; - } - } - - /* we don't support aggregates without group by */ - if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) - { - preconditionsSatisfied = false; - errorDetail = "Aggregates without group by are currently unsupported " - "when a subquery references a column from another query"; - } - - /* having clause without group by on partition column is not supported */ - if (subqueryTree->havingQual && (subqueryTree->groupClause == NULL)) - { - preconditionsSatisfied = false; - errorDetail = "Having qual without group by on partition column is " - "currently unsupported when a subquery references " - "a column from another query"; - } - - /* - * We support window functions when the window function - * is partitioned on distribution column. - */ - if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, - &errorInfo)) - { - errorDetail = (char *) errorInfo->data; - preconditionsSatisfied = false; - } - - /* distinct clause list must include partition column */ - if (subqueryTree->distinctClause) - { - List *distinctClauseList = subqueryTree->distinctClause; - List *targetEntryList = subqueryTree->targetList; - List *distinctTargetEntryList = GroupTargetEntryList(distinctClauseList, - targetEntryList); - bool distinctOnPartitionColumn = - TargetListOnPartitionColumn(subqueryTree, distinctTargetEntryList); - if (!distinctOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Distinct on columns without partition column is " - "currently unsupported"; - } + return deferredError; } } @@ -1080,6 +1009,108 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } +/* + * DeferErrorIfSubqueryRequiresMerge returns a deferred error if the subquery + * requires a merge step on the coordinator (e.g. limit, group by non-distribution + * column, etc.). + */ +static DeferredErrorMessage * +DeferErrorIfSubqueryRequiresMerge(Query *subqueryTree) +{ + bool preconditionsSatisfied = true; + char *errorDetail = NULL; + + if (subqueryTree->limitOffset) + { + preconditionsSatisfied = false; + errorDetail = "Offset clause is currently unsupported when a subquery " + "references a column from another query"; + } + + /* limit is not supported when SubqueryPushdown is not set */ + if (subqueryTree->limitCount && !SubqueryPushdown) + { + preconditionsSatisfied = false; + errorDetail = "Limit in subquery is currently unsupported when a " + "subquery references a column from another query"; + } + + /* group clause list must include partition column */ + if (subqueryTree->groupClause) + { + List *groupClauseList = subqueryTree->groupClause; + List *targetEntryList = subqueryTree->targetList; + List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, + targetEntryList); + bool groupOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, groupTargetEntryList); + if (!groupOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Group by list without partition column is currently " + "unsupported when a subquery references a column " + "from another query"; + } + } + + /* we don't support aggregates without group by */ + if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) + { + preconditionsSatisfied = false; + errorDetail = "Aggregates without group by are currently unsupported " + "when a subquery references a column from another query"; + } + + /* having clause without group by on partition column is not supported */ + if (subqueryTree->havingQual && (subqueryTree->groupClause == NULL)) + { + preconditionsSatisfied = false; + errorDetail = "Having qual without group by on partition column is " + "currently unsupported when a subquery references " + "a column from another query"; + } + + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + StringInfo errorInfo = NULL; + if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) + { + errorDetail = (char *) errorInfo->data; + preconditionsSatisfied = false; + } + + /* distinct clause list must include partition column */ + if (subqueryTree->distinctClause) + { + List *distinctClauseList = subqueryTree->distinctClause; + List *targetEntryList = subqueryTree->targetList; + List *distinctTargetEntryList = GroupTargetEntryList(distinctClauseList, + targetEntryList); + bool distinctOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, distinctTargetEntryList); + if (!distinctOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Distinct on columns without partition column is " + "currently unsupported"; + } + } + + /* finally check and return deferred if not satisfied */ + if (!preconditionsSatisfied) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot push down this subquery", + errorDetail, NULL); + } + + return NULL; +} + + /* * DeferErrorIfUnsupportedTableCombination checks if the given query tree contains any * unsupported range table combinations. For this, the function walks over all diff --git a/src/backend/distributed/planner/recursive_planning.c b/src/backend/distributed/planner/recursive_planning.c index f8cc45cc7..a6761a7f4 100644 --- a/src/backend/distributed/planner/recursive_planning.c +++ b/src/backend/distributed/planner/recursive_planning.c @@ -180,7 +180,6 @@ static bool IsLocalTableRteOrMatView(Node *node); static DistributedSubPlan * CreateDistributedSubPlan(uint32 subPlanId, Query *subPlanQuery); static bool CteReferenceListWalker(Node *node, CteReferenceWalkerContext *context); -static bool ContainsReferencesToOuterQuery(Query *query); static bool ContainsReferencesToOuterQueryWalker(Node *node, VarLevelsUpWalkerContext *context); static bool NodeContainsSubqueryReferencingOuterQuery(Node *node); @@ -1288,7 +1287,7 @@ CteReferenceListWalker(Node *node, CteReferenceWalkerContext *context) * anything that points outside of the query itself. Such queries cannot be * planned recursively. */ -static bool +bool ContainsReferencesToOuterQuery(Query *query) { VarLevelsUpWalkerContext context = { 0 }; diff --git a/src/include/distributed/recursive_planning.h b/src/include/distributed/recursive_planning.h index 0a64f6845..98d230cb2 100644 --- a/src/include/distributed/recursive_planning.h +++ b/src/include/distributed/recursive_planning.h @@ -49,4 +49,7 @@ extern void ReplaceRTERelationWithRteSubquery(RangeTblEntry *rangeTableEntry, RecursivePlanningContext *context); extern bool IsRecursivelyPlannableRelation(RangeTblEntry *rangeTableEntry); extern bool IsRelationLocalTableOrMatView(Oid relationId); +extern bool ContainsReferencesToOuterQuery(Query *query); + + #endif /* RECURSIVE_PLANNING_H */ diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index f122bb4d0..bd82b4dd1 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -484,7 +484,7 @@ DEBUG: query has a single distribution column value: 1 41 | 11814 (5 rows) --- subqueries are not supported in SELECT clause +-- subqueries in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; DEBUG: Router planner cannot handle multi-shard select queries diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index 1d526f575..dc4e62616 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -1227,7 +1227,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1240,9 +1240,7 @@ limit 50; -- reset subquery_pushdown SET citus.subquery_pushdown to OFF; --- we recursively plan recent_events_1 --- but not some_users_data since it has a reference --- from an outer query which is not recursively planned +-- mixture of recursively planned subqueries and correlated subqueries SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen @@ -1270,24 +1268,21 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; user_id | lastseen --------------------------------------------------------------------- + 1 | Thu Nov 23 18:08:26.550729 2017 2 | Thu Nov 23 17:26:14.563216 2017 3 | Thu Nov 23 18:08:26.550729 2017 3 | Thu Nov 23 17:26:14.563216 2017 - 5 | Thu Nov 23 18:08:26.550729 2017 5 | Thu Nov 23 17:26:14.563216 2017 6 | Thu Nov 23 18:08:26.550729 2017 (6 rows) --- we recursively plan some queries but fail in the end --- since some_users_data since it has a reference --- from an outer query which is not recursively planned SELECT "some_users_data".user_id, lastseen FROM (SELECT 2 * user_id as user_id, max(time) AS lastseen @@ -1315,7 +1310,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1379,7 +1374,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC @@ -1447,7 +1442,7 @@ SELECT "some_users_data".user_id, MAX(lastseen), count(*) WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true GROUP BY 1 ORDER BY 2, 1 DESC LIMIT 10; @@ -1504,7 +1499,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1519,7 +1514,7 @@ DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.<>) user_where_1_join_1.user_id)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 DEBUG: Router planner cannot handle multi-shard select queries DEBUG: push down of limit count: 10 -DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) ORDER BY users.user_id LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 DEBUG: Creating router plan user_id | lastseen @@ -1584,7 +1579,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1595,7 +1590,7 @@ DEBUG: generating subplan XXX_1 for subquery SELECT user_id, value_1 FROM publi DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id, intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, value_1 integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.=) user_where_1_join_1.value_1)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 DEBUG: push down of limit count: 10 -DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) ORDER BY users.user_id LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 user_id | lastseen --------------------------------------------------------------------- @@ -1667,9 +1662,7 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- complex lateral join between inner join and correlated subquery SELECT user_id, lastseen FROM (SELECT @@ -1715,7 +1708,7 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -2339,9 +2332,7 @@ LIMIT 10; (1 row) SET citus.subquery_pushdown to OFF; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- on side of the lateral join can be recursively plannen, then pushed down SELECT * FROM (SELECT @@ -2372,7 +2363,7 @@ FROM WHERE "users"."value_2" = "some_recent_users"."user_id" AND value_2 > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY value_2 DESC LIMIT 10) "some_users" diff --git a/src/test/regress/expected/subquery_in_where.out b/src/test/regress/expected/subquery_in_where.out index a8f7fe2c6..ac98d85f6 100644 --- a/src/test/regress/expected/subquery_in_where.out +++ b/src/test/regress/expected/subquery_in_where.out @@ -967,6 +967,186 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS c 24 (1 row) +-- two levels of correlation should also allow +-- merge step in the subquery +SELECT sum(value_1) +FROM users_table u +WHERE EXISTS + (SELECT 1 + FROM events_table e + WHERE u.user_id = e.user_id AND + EXISTS + (SELECT 1 + FROM users_table u2 + WHERE u2.user_id = u.user_id AND u2.value_1 = 5 + LIMIT 1)); + sum +--------------------------------------------------------------------- + 216 +(1 row) + +-- correlated subquery in WHERE, with a slightly +-- different syntax that the result of the subquery +-- is compared with a constant +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115; + sum +--------------------------------------------------------------------- + +(1 row) + +-- a correlated subquery which requires merge step +-- can be pushed down on UPDATE/DELETE queries as well +-- rollback to keep the rest of the tests unchanged +BEGIN; +UPDATE users_table u1 + SET value_1 = (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id); +DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id) > 10; +ROLLBACK; +-- a correlated anti-join can also be pushed down even if the subquery +-- has a LIMIT +SELECT avg(value_1) +FROM users_table u +WHERE NOT EXISTS + (SELECT 'XXX' + FROM events_table e + WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); + avg +--------------------------------------------------------------------- + 2.5544554455445545 +(1 row) + +-- a [correlated] lateral join can also be pushed down even if the subquery +-- has an aggregate wout a GROUP BY +SELECT + max(min_of_val_2), max(u1.value_1) +FROM + users_table u1 + LEFT JOIN LATERAL + (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); + max | max +--------------------------------------------------------------------- + 1 | 5 +(1 row) + +-- a self join is followed by a correlated subquery +EXPLAIN (COSTS OFF) +SELECT + * +FROM + users_table u1 JOIN users_table u2 USING (user_id) +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id) > 10; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Hash Join + Hash Cond: (u2.user_id = u1.user_id) + Join Filter: (u1.value_1 < u2.value_1) + -> Seq Scan on users_table_1400256 u2 + Filter: ((SubPlan 1) > 10) + SubPlan 1 + -> Aggregate + -> Seq Scan on events_table_1400260 e1 + Filter: (user_id = u2.user_id) + -> Hash + -> Seq Scan on users_table_1400256 u1 +(16 rows) + +-- when the colocated join of the FROM clause +-- entries happen on WHERE clause, Citus cannot +-- pushdown +-- Likely that the colocation checks should be +-- improved +SELECT + u1.user_id, u2.user_id +FROM + users_table u1, users_table u2 +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id AND + u1.user_id = u2.user_id) > 10 +ORDER BY 1,2; +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- create a view that contains correlated subquery +CREATE TEMPORARY VIEW correlated_subquery_view AS + SELECT u1.user_id + FROM users_table u1 + WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 0; +SELECT sum(user_id) FROM correlated_subquery_view; + sum +--------------------------------------------------------------------- + 376 +(1 row) + +-- now, join the view with another correlated subquery +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); + sum +--------------------------------------------------------------------- + 459 +(1 row) + +-- as an edge case, JOIN is on false +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); + sum +--------------------------------------------------------------------- + +(1 row) + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id AND false + ) > 115; +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115 AND false; + sum +--------------------------------------------------------------------- + +(1 row) + SET client_min_messages TO DEFAULT; DROP TABLE local_table; DROP SCHEMA subquery_in_where CASCADE; diff --git a/src/test/regress/sql/multi_mx_router_planner.sql b/src/test/regress/sql/multi_mx_router_planner.sql index 688a7c944..b241439b0 100644 --- a/src/test/regress/sql/multi_mx_router_planner.sql +++ b/src/test/regress/sql/multi_mx_router_planner.sql @@ -235,7 +235,7 @@ FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id and articles_hash_mx.author_id = 1 ORDER BY articles_hash_mx.id; --- subqueries are not supported in SELECT clause +-- subqueries in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 7fe8a90bf..28e468712 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -1119,7 +1119,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1128,9 +1128,7 @@ limit 50; -- reset subquery_pushdown SET citus.subquery_pushdown to OFF; --- we recursively plan recent_events_1 --- but not some_users_data since it has a reference --- from an outer query which is not recursively planned +-- mixture of recursively planned subqueries and correlated subqueries SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen @@ -1158,15 +1156,12 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; --- we recursively plan some queries but fail in the end --- since some_users_data since it has a reference --- from an outer query which is not recursively planned SELECT "some_users_data".user_id, lastseen FROM (SELECT 2 * user_id as user_id, max(time) AS lastseen @@ -1194,7 +1189,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1251,7 +1246,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC @@ -1306,7 +1301,7 @@ SELECT "some_users_data".user_id, MAX(lastseen), count(*) WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true GROUP BY 1 ORDER BY 2, 1 DESC LIMIT 10; @@ -1360,7 +1355,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1418,7 +1413,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1484,9 +1479,7 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- complex lateral join between inner join and correlated subquery SELECT user_id, lastseen FROM (SELECT @@ -1532,7 +1525,7 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -2033,9 +2026,7 @@ ORDER BY LIMIT 10; SET citus.subquery_pushdown to OFF; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- on side of the lateral join can be recursively plannen, then pushed down SELECT * FROM (SELECT @@ -2066,7 +2057,7 @@ FROM WHERE "users"."value_2" = "some_recent_users"."user_id" AND value_2 > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY value_2 DESC LIMIT 10) "some_users" diff --git a/src/test/regress/sql/subquery_in_where.sql b/src/test/regress/sql/subquery_in_where.sql index 4f43a967b..5a383c927 100644 --- a/src/test/regress/sql/subquery_in_where.sql +++ b/src/test/regress/sql/subquery_in_where.sql @@ -713,6 +713,143 @@ WHERE WHERE u.value_2 > 3 GROUP BY r.value_2 HAVING min(r.value_3) > 0); +-- two levels of correlation should also allow +-- merge step in the subquery +SELECT sum(value_1) +FROM users_table u +WHERE EXISTS + (SELECT 1 + FROM events_table e + WHERE u.user_id = e.user_id AND + EXISTS + (SELECT 1 + FROM users_table u2 + WHERE u2.user_id = u.user_id AND u2.value_1 = 5 + LIMIT 1)); + +-- correlated subquery in WHERE, with a slightly +-- different syntax that the result of the subquery +-- is compared with a constant +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115; + + +-- a correlated subquery which requires merge step +-- can be pushed down on UPDATE/DELETE queries as well +-- rollback to keep the rest of the tests unchanged +BEGIN; +UPDATE users_table u1 + SET value_1 = (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id); + +DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id) > 10; + +ROLLBACK; + +-- a correlated anti-join can also be pushed down even if the subquery +-- has a LIMIT +SELECT avg(value_1) +FROM users_table u +WHERE NOT EXISTS + (SELECT 'XXX' + FROM events_table e + WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); + +-- a [correlated] lateral join can also be pushed down even if the subquery +-- has an aggregate wout a GROUP BY +SELECT + max(min_of_val_2), max(u1.value_1) +FROM + users_table u1 + LEFT JOIN LATERAL + (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); + + +-- a self join is followed by a correlated subquery +EXPLAIN (COSTS OFF) +SELECT + * +FROM + users_table u1 JOIN users_table u2 USING (user_id) +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id) > 10; + +-- when the colocated join of the FROM clause +-- entries happen on WHERE clause, Citus cannot +-- pushdown +-- Likely that the colocation checks should be +-- improved +SELECT + u1.user_id, u2.user_id +FROM + users_table u1, users_table u2 +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id AND + u1.user_id = u2.user_id) > 10 +ORDER BY 1,2; + + +-- create a view that contains correlated subquery +CREATE TEMPORARY VIEW correlated_subquery_view AS + SELECT u1.user_id + FROM users_table u1 + WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 0; + +SELECT sum(user_id) FROM correlated_subquery_view; + +-- now, join the view with another correlated subquery +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); + +-- as an edge case, JOIN is on false +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); + + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id AND false + ) > 115; + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115 AND false; + SET client_min_messages TO DEFAULT; DROP TABLE local_table;