diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index 0cb935a13..38cc844ab 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -34,6 +34,7 @@ #include "distributed/pg_dist_partition.h" #include "distributed/query_utils.h" #include "distributed/query_pushdown_planning.h" +#include "distributed/recursive_planning.h" #include "distributed/relation_restriction_equivalence.h" #include "distributed/version_compat.h" #include "nodes/nodeFuncs.h" @@ -78,6 +79,7 @@ static RecurringTuplesType FromClauseRecurringTupleType(Query *queryTree); static DeferredErrorMessage * DeferredErrorIfUnsupportedRecurringTuplesJoin( PlannerRestrictionContext *plannerRestrictionContext); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); +static DeferredErrorMessage * DeferErrorIfSubqueryRequiresMerge(Query *subqueryTree); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static RecurringTuplesType FetchFirstRecurType(PlannerInfo *plannerInfo, Relids relids); @@ -911,7 +913,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi { bool preconditionsSatisfied = true; char *errorDetail = NULL; - StringInfo errorInfo = NULL; DeferredErrorMessage *deferredError = DeferErrorIfUnsupportedTableCombination( subqueryTree); @@ -928,19 +929,19 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi "functions"; } - if (subqueryTree->limitOffset) + /* + * Correlated subqueries are effectively functions that are repeatedly called + * for the values of the vars that point to the outer query. We can liberally + * push down SQL features within such a function, as long as co-located join + * checks are applied. + */ + if (!ContainsReferencesToOuterQuery(subqueryTree)) { - preconditionsSatisfied = false; - errorDetail = "Offset clause is currently unsupported when a subquery " - "references a column from another query"; - } - - /* limit is not supported when SubqueryPushdown is not set */ - if (subqueryTree->limitCount && !SubqueryPushdown) - { - preconditionsSatisfied = false; - errorDetail = "Limit in subquery is currently unsupported when a " - "subquery references a column from another query"; + deferredError = DeferErrorIfSubqueryRequiresMerge(subqueryTree); + if (deferredError) + { + return deferredError; + } } /* @@ -981,24 +982,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi errorDetail = "For Update/Share commands are currently unsupported"; } - /* group clause list must include partition column */ - if (subqueryTree->groupClause) - { - List *groupClauseList = subqueryTree->groupClause; - List *targetEntryList = subqueryTree->targetList; - List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, - targetEntryList); - bool groupOnPartitionColumn = TargetListOnPartitionColumn(subqueryTree, - groupTargetEntryList); - if (!groupOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Group by list without partition column is currently " - "unsupported when a subquery references a column " - "from another query"; - } - } - /* grouping sets are not allowed in subqueries*/ if (subqueryTree->groupingSets) { @@ -1007,15 +990,67 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi "or ROLLUP"; } - /* - * We support window functions when the window function - * is partitioned on distribution column. - */ - if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, - &errorInfo)) + deferredError = DeferErrorIfFromClauseRecurs(subqueryTree); + if (deferredError) + { + return deferredError; + } + + + /* finally check and return deferred if not satisfied */ + if (!preconditionsSatisfied) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot push down this subquery", + errorDetail, NULL); + } + + return NULL; +} + + +/* + * DeferErrorIfSubqueryRequiresMerge returns a deferred error if the subquery + * requires a merge step on the coordinator (e.g. limit, group by non-distribution + * column, etc.). + */ +static DeferredErrorMessage * +DeferErrorIfSubqueryRequiresMerge(Query *subqueryTree) +{ + bool preconditionsSatisfied = true; + char *errorDetail = NULL; + + if (subqueryTree->limitOffset) { - errorDetail = (char *) errorInfo->data; preconditionsSatisfied = false; + errorDetail = "Offset clause is currently unsupported when a subquery " + "references a column from another query"; + } + + /* limit is not supported when SubqueryPushdown is not set */ + if (subqueryTree->limitCount && !SubqueryPushdown) + { + preconditionsSatisfied = false; + errorDetail = "Limit in subquery is currently unsupported when a " + "subquery references a column from another query"; + } + + /* group clause list must include partition column */ + if (subqueryTree->groupClause) + { + List *groupClauseList = subqueryTree->groupClause; + List *targetEntryList = subqueryTree->targetList; + List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, + targetEntryList); + bool groupOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, groupTargetEntryList); + if (!groupOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Group by list without partition column is currently " + "unsupported when a subquery references a column " + "from another query"; + } } /* we don't support aggregates without group by */ @@ -1035,6 +1070,18 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi "a column from another query"; } + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + StringInfo errorInfo = NULL; + if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) + { + errorDetail = (char *) errorInfo->data; + preconditionsSatisfied = false; + } + /* distinct clause list must include partition column */ if (subqueryTree->distinctClause) { @@ -1052,13 +1099,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } } - deferredError = DeferErrorIfFromClauseRecurs(subqueryTree); - if (deferredError) - { - return deferredError; - } - - /* finally check and return deferred if not satisfied */ if (!preconditionsSatisfied) { diff --git a/src/backend/distributed/planner/recursive_planning.c b/src/backend/distributed/planner/recursive_planning.c index f8cc45cc7..a6761a7f4 100644 --- a/src/backend/distributed/planner/recursive_planning.c +++ b/src/backend/distributed/planner/recursive_planning.c @@ -180,7 +180,6 @@ static bool IsLocalTableRteOrMatView(Node *node); static DistributedSubPlan * CreateDistributedSubPlan(uint32 subPlanId, Query *subPlanQuery); static bool CteReferenceListWalker(Node *node, CteReferenceWalkerContext *context); -static bool ContainsReferencesToOuterQuery(Query *query); static bool ContainsReferencesToOuterQueryWalker(Node *node, VarLevelsUpWalkerContext *context); static bool NodeContainsSubqueryReferencingOuterQuery(Node *node); @@ -1288,7 +1287,7 @@ CteReferenceListWalker(Node *node, CteReferenceWalkerContext *context) * anything that points outside of the query itself. Such queries cannot be * planned recursively. */ -static bool +bool ContainsReferencesToOuterQuery(Query *query) { VarLevelsUpWalkerContext context = { 0 }; diff --git a/src/include/distributed/recursive_planning.h b/src/include/distributed/recursive_planning.h index 0a64f6845..98d230cb2 100644 --- a/src/include/distributed/recursive_planning.h +++ b/src/include/distributed/recursive_planning.h @@ -49,4 +49,7 @@ extern void ReplaceRTERelationWithRteSubquery(RangeTblEntry *rangeTableEntry, RecursivePlanningContext *context); extern bool IsRecursivelyPlannableRelation(RangeTblEntry *rangeTableEntry); extern bool IsRelationLocalTableOrMatView(Oid relationId); +extern bool ContainsReferencesToOuterQuery(Query *query); + + #endif /* RECURSIVE_PLANNING_H */ diff --git a/src/test/regress/expected/multi_insert_select_window.out b/src/test/regress/expected/multi_insert_select_window.out index 2f522b127..0cf605990 100644 --- a/src/test/regress/expected/multi_insert_select_window.out +++ b/src/test/regress/expected/multi_insert_select_window.out @@ -773,8 +773,6 @@ WHERE ) GROUP BY user_id; -ERROR: cannot push down this subquery -DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported INSERT INTO agg_results_window(user_id, value_2_agg) SELECT * FROM ( SELECT diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index 9486225c5..bd82b4dd1 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -484,11 +484,10 @@ DEBUG: query has a single distribution column value: 1 41 | 11814 (5 rows) --- subqueries are not supported in SELECT clause +-- subqueries in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_router_planner.out b/src/test/regress/expected/multi_router_planner.out index f55c98050..047d77176 100644 --- a/src/test/regress/expected/multi_router_planner.out +++ b/src/test/regress/expected/multi_router_planner.out @@ -677,7 +677,6 @@ DEBUG: query has a single distribution column value: 1 SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash a; DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_router_planner_fast_path.out b/src/test/regress/expected/multi_router_planner_fast_path.out index 003c78162..1fef5c721 100644 --- a/src/test/regress/expected/multi_router_planner_fast_path.out +++ b/src/test/regress/expected/multi_router_planner_fast_path.out @@ -423,7 +423,6 @@ DEBUG: query has a single distribution column value: 1 SELECT a.title AS name, (SELECT a2.id FROM articles_hash a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash a; DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- simple lookup query just works SELECT * diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index d9309efe2..dc4e62616 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -1227,7 +1227,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1240,9 +1240,7 @@ limit 50; -- reset subquery_pushdown SET citus.subquery_pushdown to OFF; --- we recursively plan recent_events_1 --- but not some_users_data since it has a reference --- from an outer query which is not recursively planned +-- mixture of recursively planned subqueries and correlated subqueries SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen @@ -1270,16 +1268,21 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query --- we recursively plan some queries but fail in the end --- since some_users_data since it has a reference --- from an outer query which is not recursively planned + user_id | lastseen +--------------------------------------------------------------------- + 1 | Thu Nov 23 18:08:26.550729 2017 + 2 | Thu Nov 23 17:26:14.563216 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 6 | Thu Nov 23 18:08:26.550729 2017 +(6 rows) + SELECT "some_users_data".user_id, lastseen FROM (SELECT 2 * user_id as user_id, max(time) AS lastseen @@ -1307,13 +1310,17 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | lastseen +--------------------------------------------------------------------- + 4 | Thu Nov 23 17:26:14.563216 2017 + 6 | Thu Nov 23 18:08:26.550729 2017 +(2 rows) + -- LATERAL JOINs used with INNER JOINs SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. @@ -1367,7 +1374,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC @@ -1435,7 +1442,7 @@ SELECT "some_users_data".user_id, MAX(lastseen), count(*) WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true GROUP BY 1 ORDER BY 2, 1 DESC LIMIT 10; @@ -1492,7 +1499,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1502,11 +1509,28 @@ LIMIT 10; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table users WHERE ((user_id OPERATOR(pg_catalog.>) 1) AND (user_id OPERATOR(pg_catalog.<) 4) AND (value_2 OPERATOR(pg_catalog.>) 3)) -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.<>) user_where_1_join_1.user_id)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 +DEBUG: Router planner cannot handle multi-shard select queries +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) ORDER BY users.user_id LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 +DEBUG: Creating router plan + user_id | lastseen +--------------------------------------------------------------------- + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 +(10 rows) + SET citus.enable_repartition_joins to ON; SET client_min_messages TO DEBUG1; -- recursively planner since the inner JOIN is not on the partition key @@ -1555,7 +1579,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1563,8 +1587,25 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; DEBUG: generating subplan XXX_1 for subquery SELECT user_id, value_1 FROM public.users_table users WHERE ((user_id OPERATOR(pg_catalog.>) 1) AND (user_id OPERATOR(pg_catalog.<) 4) AND (value_2 OPERATOR(pg_catalog.>) 3)) -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id, intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, value_1 integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.=) user_where_1_join_1.value_1)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) ORDER BY users.user_id LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 + user_id | lastseen +--------------------------------------------------------------------- + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 +(10 rows) + SET citus.enable_repartition_joins to OFF; RESET client_min_messages; -- not supported since upper LATERAL JOIN is not equi join @@ -1621,9 +1662,7 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- complex lateral join between inner join and correlated subquery SELECT user_id, lastseen FROM (SELECT @@ -1669,15 +1708,27 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" ORDER BY user_id DESC, lastseen DESC LIMIT 10; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | lastseen +--------------------------------------------------------------------- + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 +(10 rows) + -- NESTED INNER JOINs SELECT count(*) AS value, "generated_group_field" @@ -2281,9 +2332,7 @@ LIMIT 10; (1 row) SET citus.subquery_pushdown to OFF; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- on side of the lateral join can be recursively plannen, then pushed down SELECT * FROM (SELECT @@ -2314,15 +2363,17 @@ FROM WHERE "users"."value_2" = "some_recent_users"."user_id" AND value_2 > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY value_2 DESC LIMIT 10) "some_users" ORDER BY value_2 DESC, user_id DESC LIMIT 10; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | value_2 +--------------------------------------------------------------------- +(0 rows) + -- lets test some unsupported set operations -- not supported since we use INTERSECT SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType diff --git a/src/test/regress/expected/multi_subquery_in_where_clause.out b/src/test/regress/expected/multi_subquery_in_where_clause.out index a7cac9cbc..c5066b275 100644 --- a/src/test/regress/expected/multi_subquery_in_where_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_clause.out @@ -610,8 +610,10 @@ WHERE user_id OFFSET 3 ); -ERROR: cannot push down this subquery -DETAIL: Offset clause is currently unsupported when a subquery references a column from another query + user_id +--------------------------------------------------------------------- +(0 rows) + -- we can detect unsupported subqueries even if they appear -- in WHERE subquery -> FROM subquery -> WHERE subquery -- but we can recursively plan that anyway diff --git a/src/test/regress/expected/subqueries_not_supported.out b/src/test/regress/expected/subqueries_not_supported.out index 0bb5da386..fcd33386e 100644 --- a/src/test/regress/expected/subqueries_not_supported.out +++ b/src/test/regress/expected/subqueries_not_supported.out @@ -83,46 +83,6 @@ DEBUG: generating subplan XXX_1 for subquery SELECT users_table.value_2 FROM pu DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT foo.value_2 FROM ((SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer)) foo LEFT JOIN (SELECT users_table.value_2 FROM public.users_table, public.events_table WHERE ((users_table.user_id OPERATOR(pg_catalog.=) events_table.user_id) AND (events_table.event_type OPERATOR(pg_catalog.=) ANY (ARRAY[5, 6, 7, 8])))) bar ON ((foo.value_2 OPERATOR(pg_catalog.=) bar.value_2))) ERROR: cannot pushdown the subquery DETAIL: Complex subqueries and CTEs cannot be in the outer part of the outer join --- Aggregates in subquery without partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - users_table.user_id = events_table.user_id - ) -; -ERROR: cannot push down this subquery -DETAIL: Aggregates without group by are currently unsupported when a subquery references a column from another query --- Having qual without group by on partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - events_table.user_id = users_table.user_id - HAVING - MIN(value_2) > 2 - ) -; -ERROR: cannot push down this subquery -DETAIL: Having qual without group by on partition column is currently unsupported when a subquery references a column from another query -- We do not support GROUPING SETS in subqueries -- This also includes ROLLUP or CUBE clauses SELECT * FROM (SELECT user_id, value_1 FROM users_table GROUP BY GROUPING SETS ((user_id), (value_1))) s; diff --git a/src/test/regress/expected/subquery_in_targetlist.out b/src/test/regress/expected/subquery_in_targetlist.out index d3352e32a..f654af850 100644 --- a/src/test/regress/expected/subquery_in_targetlist.out +++ b/src/test/regress/expected/subquery_in_targetlist.out @@ -31,14 +31,20 @@ ORDER BY 1,2 LIMIT 1; SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Aggregates without group by are currently unsupported when a subquery references a column from another query + event_type | max +--------------------------------------------------------------------- + 0 | Thu Nov 23 13:52:54.83829 2017 +(1 row) + -- correlated subquery wtth limit SELECT event_type, (SELECT time FROM users_table WHERE user_id = e.user_id ORDER BY time LIMIT 1) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + event_type | time +--------------------------------------------------------------------- + 0 | Wed Nov 22 18:19:49.944985 2017 +(1 row) + -- correlated subquery with group by distribution column SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id GROUP BY user_id) FROM events_table e @@ -52,8 +58,11 @@ ORDER BY 1,2 LIMIT 1; SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id GROUP BY e.user_id) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Group by list without partition column is currently unsupported when a subquery references a column from another query + event_type | max +--------------------------------------------------------------------- + 0 | Thu Nov 23 13:52:54.83829 2017 +(1 row) + -- correlated subquery co-located join in outer query SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id GROUP BY user_id) FROM users_table u JOIN events_table e USING (user_id) @@ -81,8 +90,11 @@ ERROR: complex joins are only supported when all distributed tables are co-loca SELECT event_type, (SELECT max(time) FROM users_reference_table WHERE user_id = e.value_2) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Aggregates without group by are currently unsupported when a subquery references a column from another query + event_type | max +--------------------------------------------------------------------- + 0 | Thu Nov 23 13:52:54.83829 2017 +(1 row) + -- correlated subquery with reference table and group by SELECT event_type, (SELECT max(time) FROM users_reference_table WHERE user_id = e.value_2 GROUP BY user_id) FROM events_table e @@ -270,8 +282,12 @@ SELECT (SELECT value_2 FROM view_1 WHERE user_id = e.user_id GROUP BY value_2) FROM events_table e GROUP BY 1 ORDER BY 1 LIMIT 3; -ERROR: cannot push down this subquery -DETAIL: Group by list without partition column is currently unsupported when a subquery references a column from another query + value_2 +--------------------------------------------------------------------- + 3 + +(2 rows) + -- without view in the outer query FROM SELECT (SELECT value_2 FROM view_1 WHERE user_id = e.user_id GROUP BY user_id, value_2) FROM view_1 e diff --git a/src/test/regress/expected/subquery_in_where.out b/src/test/regress/expected/subquery_in_where.out index 39221a7e5..ac98d85f6 100644 --- a/src/test/regress/expected/subquery_in_where.out +++ b/src/test/regress/expected/subquery_in_where.out @@ -691,6 +691,48 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS c 101 (1 row) +-- correlated subquery with aggregate in WHERE +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- +(0 rows) + +-- correlated subquery with aggregate in HAVING +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + events_table.user_id = users_table.user_id + HAVING + MIN(value_2) > 2 + ) +; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- +(0 rows) + -- Local tables also planned recursively, so using it as part of the FROM clause -- make the clause recurring CREATE TABLE local_table(id int, value_1 int); @@ -746,6 +788,365 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS c 10 (1 row) +-- basic NOT IN correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_2 NOT IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 7 +(1 row) + +-- correlated subquery with limit +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id ORDER BY value_2 LIMIT 1); + count +--------------------------------------------------------------------- + 10 +(1 row) + +-- correlated subquery with distinct +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT DISTINCT (value_3) FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 90 +(1 row) + +-- correlated subquery with aggregate +SELECT + count(*) +FROM + events_table e +WHERE + value_2 = (SELECT max(value_2) FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 11 +(1 row) + +-- correlated subquery with window function +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT row_number() OVER () FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 94 +(1 row) + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + count +--------------------------------------------------------------------- + 72 +(1 row) + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + count +--------------------------------------------------------------------- + 72 +(1 row) + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2); + count +--------------------------------------------------------------------- + 10 +(1 row) + +-- correlated subquery with having +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT 1)); + count +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT e.value_3)); +ERROR: Subqueries in HAVING cannot refer to outer query +-- nested correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE user_id = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + count +--------------------------------------------------------------------- + 0 +(1 row) + +-- not co-located correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE value_2 = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- even more subtle cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY u.value_2 HAVING min(r.value_3) > e.value_3); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- not a correlated subquery, uses recursive planning +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY r.value_2 HAVING min(r.value_3) > 0); +DEBUG: generating subplan XXX_1 for subquery SELECT min(r.value_3) AS v FROM (public.users_reference_table r JOIN public.users_table u USING (user_id)) WHERE (u.value_2 OPERATOR(pg_catalog.>) 3) GROUP BY r.value_2 HAVING (min(r.value_3) OPERATOR(pg_catalog.>) (0)::double precision) +DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM public.events_table e WHERE (value_3 OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.v FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(v double precision))) + count +--------------------------------------------------------------------- + 24 +(1 row) + +-- two levels of correlation should also allow +-- merge step in the subquery +SELECT sum(value_1) +FROM users_table u +WHERE EXISTS + (SELECT 1 + FROM events_table e + WHERE u.user_id = e.user_id AND + EXISTS + (SELECT 1 + FROM users_table u2 + WHERE u2.user_id = u.user_id AND u2.value_1 = 5 + LIMIT 1)); + sum +--------------------------------------------------------------------- + 216 +(1 row) + +-- correlated subquery in WHERE, with a slightly +-- different syntax that the result of the subquery +-- is compared with a constant +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115; + sum +--------------------------------------------------------------------- + +(1 row) + +-- a correlated subquery which requires merge step +-- can be pushed down on UPDATE/DELETE queries as well +-- rollback to keep the rest of the tests unchanged +BEGIN; +UPDATE users_table u1 + SET value_1 = (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id); +DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id) > 10; +ROLLBACK; +-- a correlated anti-join can also be pushed down even if the subquery +-- has a LIMIT +SELECT avg(value_1) +FROM users_table u +WHERE NOT EXISTS + (SELECT 'XXX' + FROM events_table e + WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); + avg +--------------------------------------------------------------------- + 2.5544554455445545 +(1 row) + +-- a [correlated] lateral join can also be pushed down even if the subquery +-- has an aggregate wout a GROUP BY +SELECT + max(min_of_val_2), max(u1.value_1) +FROM + users_table u1 + LEFT JOIN LATERAL + (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); + max | max +--------------------------------------------------------------------- + 1 | 5 +(1 row) + +-- a self join is followed by a correlated subquery +EXPLAIN (COSTS OFF) +SELECT + * +FROM + users_table u1 JOIN users_table u2 USING (user_id) +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id) > 10; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Hash Join + Hash Cond: (u2.user_id = u1.user_id) + Join Filter: (u1.value_1 < u2.value_1) + -> Seq Scan on users_table_1400256 u2 + Filter: ((SubPlan 1) > 10) + SubPlan 1 + -> Aggregate + -> Seq Scan on events_table_1400260 e1 + Filter: (user_id = u2.user_id) + -> Hash + -> Seq Scan on users_table_1400256 u1 +(16 rows) + +-- when the colocated join of the FROM clause +-- entries happen on WHERE clause, Citus cannot +-- pushdown +-- Likely that the colocation checks should be +-- improved +SELECT + u1.user_id, u2.user_id +FROM + users_table u1, users_table u2 +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id AND + u1.user_id = u2.user_id) > 10 +ORDER BY 1,2; +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- create a view that contains correlated subquery +CREATE TEMPORARY VIEW correlated_subquery_view AS + SELECT u1.user_id + FROM users_table u1 + WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 0; +SELECT sum(user_id) FROM correlated_subquery_view; + sum +--------------------------------------------------------------------- + 376 +(1 row) + +-- now, join the view with another correlated subquery +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); + sum +--------------------------------------------------------------------- + 459 +(1 row) + +-- as an edge case, JOIN is on false +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); + sum +--------------------------------------------------------------------- + +(1 row) + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id AND false + ) > 115; +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115 AND false; + sum +--------------------------------------------------------------------- + +(1 row) + SET client_min_messages TO DEFAULT; DROP TABLE local_table; DROP SCHEMA subquery_in_where CASCADE; diff --git a/src/test/regress/expected/with_set_operations.out b/src/test/regress/expected/with_set_operations.out index c3ccbd4bd..461a184e5 100644 --- a/src/test/regress/expected/with_set_operations.out +++ b/src/test/regress/expected/with_set_operations.out @@ -413,7 +413,14 @@ DEBUG: generating subplan XXX_1 for CTE cte_1: SELECT user_id FROM public.users DEBUG: generating subplan XXX_2 for CTE cte_1: SELECT user_id FROM public.users_table DEBUG: generating subplan XXX_3 for subquery SELECT cte_1.user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) cte_1 UNION SELECT cte_1.user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) cte_1 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT DISTINCT user_id FROM public.events_table WHERE (event_type OPERATOR(pg_catalog.=) ANY (SELECT users_table.user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) foo, public.users_table WHERE ((users_table.value_2 OPERATOR(pg_catalog.=) foo.user_id) AND (events_table.user_id OPERATOR(pg_catalog.=) users_table.user_id)) LIMIT 5)) ORDER BY user_id DESC -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id +--------------------------------------------------------------------- + 5 + 4 + 3 + 2 + 1 +(5 rows) + SET client_min_messages TO DEFAULT; SET search_path TO public; diff --git a/src/test/regress/sql/multi_mx_router_planner.sql b/src/test/regress/sql/multi_mx_router_planner.sql index 688a7c944..b241439b0 100644 --- a/src/test/regress/sql/multi_mx_router_planner.sql +++ b/src/test/regress/sql/multi_mx_router_planner.sql @@ -235,7 +235,7 @@ FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id and articles_hash_mx.author_id = 1 ORDER BY articles_hash_mx.id; --- subqueries are not supported in SELECT clause +-- subqueries in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 7fe8a90bf..28e468712 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -1119,7 +1119,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1128,9 +1128,7 @@ limit 50; -- reset subquery_pushdown SET citus.subquery_pushdown to OFF; --- we recursively plan recent_events_1 --- but not some_users_data since it has a reference --- from an outer query which is not recursively planned +-- mixture of recursively planned subqueries and correlated subqueries SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen @@ -1158,15 +1156,12 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; --- we recursively plan some queries but fail in the end --- since some_users_data since it has a reference --- from an outer query which is not recursively planned SELECT "some_users_data".user_id, lastseen FROM (SELECT 2 * user_id as user_id, max(time) AS lastseen @@ -1194,7 +1189,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1251,7 +1246,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC @@ -1306,7 +1301,7 @@ SELECT "some_users_data".user_id, MAX(lastseen), count(*) WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true GROUP BY 1 ORDER BY 2, 1 DESC LIMIT 10; @@ -1360,7 +1355,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1418,7 +1413,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1484,9 +1479,7 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- complex lateral join between inner join and correlated subquery SELECT user_id, lastseen FROM (SELECT @@ -1532,7 +1525,7 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -2033,9 +2026,7 @@ ORDER BY LIMIT 10; SET citus.subquery_pushdown to OFF; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- on side of the lateral join can be recursively plannen, then pushed down SELECT * FROM (SELECT @@ -2066,7 +2057,7 @@ FROM WHERE "users"."value_2" = "some_recent_users"."user_id" AND value_2 > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY value_2 DESC LIMIT 10) "some_users" diff --git a/src/test/regress/sql/subqueries_not_supported.sql b/src/test/regress/sql/subqueries_not_supported.sql index b08fe254c..242623a3f 100644 --- a/src/test/regress/sql/subqueries_not_supported.sql +++ b/src/test/regress/sql/subqueries_not_supported.sql @@ -78,46 +78,6 @@ FROM ON(foo.value_2 = bar.value_2); --- Aggregates in subquery without partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - users_table.user_id = events_table.user_id - ) -; - - --- Having qual without group by on partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - events_table.user_id = users_table.user_id - HAVING - MIN(value_2) > 2 - ) -; - - -- We do not support GROUPING SETS in subqueries -- This also includes ROLLUP or CUBE clauses SELECT * FROM (SELECT user_id, value_1 FROM users_table GROUP BY GROUPING SETS ((user_id), (value_1))) s; diff --git a/src/test/regress/sql/subquery_in_where.sql b/src/test/regress/sql/subquery_in_where.sql index 884307845..5a383c927 100644 --- a/src/test/regress/sql/subquery_in_where.sql +++ b/src/test/regress/sql/subquery_in_where.sql @@ -502,6 +502,43 @@ WHERE value_1 IN (SELECT value_1 FROM users_Table) OR (EXISTS (SELECT * FROM events_table)); +-- correlated subquery with aggregate in WHERE +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +; + +-- correlated subquery with aggregate in HAVING +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + events_table.user_id = users_table.user_id + HAVING + MIN(value_2) > 2 + ) +; + + -- Local tables also planned recursively, so using it as part of the FROM clause -- make the clause recurring CREATE TABLE local_table(id int, value_1 int); @@ -542,6 +579,277 @@ IN FROM local_table); +-- basic NOT IN correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_2 NOT IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with limit +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id ORDER BY value_2 LIMIT 1); + +-- correlated subquery with distinct +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT DISTINCT (value_3) FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with aggregate +SELECT + count(*) +FROM + events_table e +WHERE + value_2 = (SELECT max(value_2) FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with window function +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT row_number() OVER () FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2); + +-- correlated subquery with having +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT 1)); + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT e.value_3)); + +-- nested correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE user_id = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + +-- not co-located correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE value_2 = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + +-- cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + +-- even more subtle cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY u.value_2 HAVING min(r.value_3) > e.value_3); + +-- not a correlated subquery, uses recursive planning +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY r.value_2 HAVING min(r.value_3) > 0); + +-- two levels of correlation should also allow +-- merge step in the subquery +SELECT sum(value_1) +FROM users_table u +WHERE EXISTS + (SELECT 1 + FROM events_table e + WHERE u.user_id = e.user_id AND + EXISTS + (SELECT 1 + FROM users_table u2 + WHERE u2.user_id = u.user_id AND u2.value_1 = 5 + LIMIT 1)); + +-- correlated subquery in WHERE, with a slightly +-- different syntax that the result of the subquery +-- is compared with a constant +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115; + + +-- a correlated subquery which requires merge step +-- can be pushed down on UPDATE/DELETE queries as well +-- rollback to keep the rest of the tests unchanged +BEGIN; +UPDATE users_table u1 + SET value_1 = (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id); + +DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id) > 10; + +ROLLBACK; + +-- a correlated anti-join can also be pushed down even if the subquery +-- has a LIMIT +SELECT avg(value_1) +FROM users_table u +WHERE NOT EXISTS + (SELECT 'XXX' + FROM events_table e + WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); + +-- a [correlated] lateral join can also be pushed down even if the subquery +-- has an aggregate wout a GROUP BY +SELECT + max(min_of_val_2), max(u1.value_1) +FROM + users_table u1 + LEFT JOIN LATERAL + (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); + + +-- a self join is followed by a correlated subquery +EXPLAIN (COSTS OFF) +SELECT + * +FROM + users_table u1 JOIN users_table u2 USING (user_id) +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id) > 10; + +-- when the colocated join of the FROM clause +-- entries happen on WHERE clause, Citus cannot +-- pushdown +-- Likely that the colocation checks should be +-- improved +SELECT + u1.user_id, u2.user_id +FROM + users_table u1, users_table u2 +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id AND + u1.user_id = u2.user_id) > 10 +ORDER BY 1,2; + + +-- create a view that contains correlated subquery +CREATE TEMPORARY VIEW correlated_subquery_view AS + SELECT u1.user_id + FROM users_table u1 + WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 0; + +SELECT sum(user_id) FROM correlated_subquery_view; + +-- now, join the view with another correlated subquery +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); + +-- as an edge case, JOIN is on false +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); + + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id AND false + ) > 115; + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115 AND false; + SET client_min_messages TO DEFAULT; DROP TABLE local_table;