From 707a6554b1ebf7c8e1058dc1faad98ca3e8f699d Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Fri, 4 Dec 2020 00:07:29 +0100 Subject: [PATCH 1/3] Support co-located/recurring correlated subqueries --- .../planner/query_pushdown_planning.c | 157 +++++++++--------- .../expected/multi_insert_select_window.out | 2 - .../expected/multi_mx_router_planner.out | 1 - .../regress/expected/multi_router_planner.out | 1 - .../multi_router_planner_fast_path.out | 1 - .../multi_subquery_complex_queries.out | 88 ++++++++-- .../multi_subquery_in_where_clause.out | 6 +- .../expected/subqueries_not_supported.out | 40 ----- .../expected/subquery_in_targetlist.out | 28 +++- .../regress/expected/subquery_in_where.out | 42 +++++ .../regress/expected/with_set_operations.out | 11 +- .../regress/sql/subqueries_not_supported.sql | 40 ----- src/test/regress/sql/subquery_in_where.sql | 37 +++++ 13 files changed, 269 insertions(+), 185 deletions(-) diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index 0cb935a13..07bb8a9b2 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -928,19 +928,91 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi "functions"; } - if (subqueryTree->limitOffset) + /* + * Correlated subqueries are effectively functions that are repeatedly called + * for the values of the vars that point to the outer query. We can liberally + * push down SQL features within such a function, as long as co-located join + * checks are applied. + */ + if (!contain_vars_of_level((Node *) subqueryTree, 1)) { - preconditionsSatisfied = false; - errorDetail = "Offset clause is currently unsupported when a subquery " - "references a column from another query"; - } + if (subqueryTree->limitOffset) + { + preconditionsSatisfied = false; + errorDetail = "Offset clause is currently unsupported when a subquery " + "references a column from another query"; + } - /* limit is not supported when SubqueryPushdown is not set */ - if (subqueryTree->limitCount && !SubqueryPushdown) - { - preconditionsSatisfied = false; - errorDetail = "Limit in subquery is currently unsupported when a " - "subquery references a column from another query"; + /* limit is not supported when SubqueryPushdown is not set */ + if (subqueryTree->limitCount && !SubqueryPushdown) + { + preconditionsSatisfied = false; + errorDetail = "Limit in subquery is currently unsupported when a " + "subquery references a column from another query"; + } + + /* group clause list must include partition column */ + if (subqueryTree->groupClause) + { + List *groupClauseList = subqueryTree->groupClause; + List *targetEntryList = subqueryTree->targetList; + List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, + targetEntryList); + bool groupOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, groupTargetEntryList); + if (!groupOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Group by list without partition column is currently " + "unsupported when a subquery references a column " + "from another query"; + } + } + + /* we don't support aggregates without group by */ + if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) + { + preconditionsSatisfied = false; + errorDetail = "Aggregates without group by are currently unsupported " + "when a subquery references a column from another query"; + } + + /* having clause without group by on partition column is not supported */ + if (subqueryTree->havingQual && (subqueryTree->groupClause == NULL)) + { + preconditionsSatisfied = false; + errorDetail = "Having qual without group by on partition column is " + "currently unsupported when a subquery references " + "a column from another query"; + } + + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) + { + errorDetail = (char *) errorInfo->data; + preconditionsSatisfied = false; + } + + /* distinct clause list must include partition column */ + if (subqueryTree->distinctClause) + { + List *distinctClauseList = subqueryTree->distinctClause; + List *targetEntryList = subqueryTree->targetList; + List *distinctTargetEntryList = GroupTargetEntryList(distinctClauseList, + targetEntryList); + bool distinctOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, distinctTargetEntryList); + if (!distinctOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Distinct on columns without partition column is " + "currently unsupported"; + } + } } /* @@ -981,24 +1053,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi errorDetail = "For Update/Share commands are currently unsupported"; } - /* group clause list must include partition column */ - if (subqueryTree->groupClause) - { - List *groupClauseList = subqueryTree->groupClause; - List *targetEntryList = subqueryTree->targetList; - List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, - targetEntryList); - bool groupOnPartitionColumn = TargetListOnPartitionColumn(subqueryTree, - groupTargetEntryList); - if (!groupOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Group by list without partition column is currently " - "unsupported when a subquery references a column " - "from another query"; - } - } - /* grouping sets are not allowed in subqueries*/ if (subqueryTree->groupingSets) { @@ -1007,51 +1061,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi "or ROLLUP"; } - /* - * We support window functions when the window function - * is partitioned on distribution column. - */ - if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, - &errorInfo)) - { - errorDetail = (char *) errorInfo->data; - preconditionsSatisfied = false; - } - - /* we don't support aggregates without group by */ - if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) - { - preconditionsSatisfied = false; - errorDetail = "Aggregates without group by are currently unsupported " - "when a subquery references a column from another query"; - } - - /* having clause without group by on partition column is not supported */ - if (subqueryTree->havingQual && (subqueryTree->groupClause == NULL)) - { - preconditionsSatisfied = false; - errorDetail = "Having qual without group by on partition column is " - "currently unsupported when a subquery references " - "a column from another query"; - } - - /* distinct clause list must include partition column */ - if (subqueryTree->distinctClause) - { - List *distinctClauseList = subqueryTree->distinctClause; - List *targetEntryList = subqueryTree->targetList; - List *distinctTargetEntryList = GroupTargetEntryList(distinctClauseList, - targetEntryList); - bool distinctOnPartitionColumn = - TargetListOnPartitionColumn(subqueryTree, distinctTargetEntryList); - if (!distinctOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Distinct on columns without partition column is " - "currently unsupported"; - } - } - deferredError = DeferErrorIfFromClauseRecurs(subqueryTree); if (deferredError) { diff --git a/src/test/regress/expected/multi_insert_select_window.out b/src/test/regress/expected/multi_insert_select_window.out index 2f522b127..0cf605990 100644 --- a/src/test/regress/expected/multi_insert_select_window.out +++ b/src/test/regress/expected/multi_insert_select_window.out @@ -773,8 +773,6 @@ WHERE ) GROUP BY user_id; -ERROR: cannot push down this subquery -DETAIL: Window functions without PARTITION BY on distribution column is currently unsupported INSERT INTO agg_results_window(user_id, value_2_agg) SELECT * FROM ( SELECT diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index 9486225c5..f122bb4d0 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -488,7 +488,6 @@ DEBUG: query has a single distribution column value: 1 SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_router_planner.out b/src/test/regress/expected/multi_router_planner.out index f55c98050..047d77176 100644 --- a/src/test/regress/expected/multi_router_planner.out +++ b/src/test/regress/expected/multi_router_planner.out @@ -677,7 +677,6 @@ DEBUG: query has a single distribution column value: 1 SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash a; DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_router_planner_fast_path.out b/src/test/regress/expected/multi_router_planner_fast_path.out index 003c78162..1fef5c721 100644 --- a/src/test/regress/expected/multi_router_planner_fast_path.out +++ b/src/test/regress/expected/multi_router_planner_fast_path.out @@ -423,7 +423,6 @@ DEBUG: query has a single distribution column value: 1 SELECT a.title AS name, (SELECT a2.id FROM articles_hash a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash a; DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- simple lookup query just works SELECT * diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index d9309efe2..1d526f575 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -1275,8 +1275,16 @@ FROM ORDER BY user_id limit 50; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | lastseen +--------------------------------------------------------------------- + 2 | Thu Nov 23 17:26:14.563216 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 18:08:26.550729 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 6 | Thu Nov 23 18:08:26.550729 2017 +(6 rows) + -- we recursively plan some queries but fail in the end -- since some_users_data since it has a reference -- from an outer query which is not recursively planned @@ -1312,8 +1320,12 @@ FROM ORDER BY user_id limit 50; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | lastseen +--------------------------------------------------------------------- + 4 | Thu Nov 23 17:26:14.563216 2017 + 6 | Thu Nov 23 18:08:26.550729 2017 +(2 rows) + -- LATERAL JOINs used with INNER JOINs SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. @@ -1502,11 +1514,28 @@ LIMIT 10; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table users WHERE ((user_id OPERATOR(pg_catalog.>) 1) AND (user_id OPERATOR(pg_catalog.<) 4) AND (value_2 OPERATOR(pg_catalog.>) 3)) -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries DEBUG: Router planner cannot handle multi-shard select queries -DEBUG: skipping recursive planning for the subquery since it contains references to outer queries -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.<>) user_where_1_join_1.user_id)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 +DEBUG: Router planner cannot handle multi-shard select queries +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 +DEBUG: Creating router plan + user_id | lastseen +--------------------------------------------------------------------- + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 +(10 rows) + SET citus.enable_repartition_joins to ON; SET client_min_messages TO DEBUG1; -- recursively planner since the inner JOIN is not on the partition key @@ -1563,8 +1592,25 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; DEBUG: generating subplan XXX_1 for subquery SELECT user_id, value_1 FROM public.users_table users WHERE ((user_id OPERATOR(pg_catalog.>) 1) AND (user_id OPERATOR(pg_catalog.<) 4) AND (value_2 OPERATOR(pg_catalog.>) 3)) -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id, intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, value_1 integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.=) user_where_1_join_1.value_1)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 +DEBUG: push down of limit count: 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 + user_id | lastseen +--------------------------------------------------------------------- + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 + 3 | Thu Nov 23 18:08:26.550729 2017 +(10 rows) + SET citus.enable_repartition_joins to OFF; RESET client_min_messages; -- not supported since upper LATERAL JOIN is not equi join @@ -1676,8 +1722,20 @@ FROM ORDER BY user_id DESC, lastseen DESC LIMIT 10; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | lastseen +--------------------------------------------------------------------- + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 + 5 | Thu Nov 23 17:26:14.563216 2017 +(10 rows) + -- NESTED INNER JOINs SELECT count(*) AS value, "generated_group_field" @@ -2321,8 +2379,10 @@ FROM ORDER BY value_2 DESC, user_id DESC LIMIT 10; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id | value_2 +--------------------------------------------------------------------- +(0 rows) + -- lets test some unsupported set operations -- not supported since we use INTERSECT SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType diff --git a/src/test/regress/expected/multi_subquery_in_where_clause.out b/src/test/regress/expected/multi_subquery_in_where_clause.out index a7cac9cbc..c5066b275 100644 --- a/src/test/regress/expected/multi_subquery_in_where_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_clause.out @@ -610,8 +610,10 @@ WHERE user_id OFFSET 3 ); -ERROR: cannot push down this subquery -DETAIL: Offset clause is currently unsupported when a subquery references a column from another query + user_id +--------------------------------------------------------------------- +(0 rows) + -- we can detect unsupported subqueries even if they appear -- in WHERE subquery -> FROM subquery -> WHERE subquery -- but we can recursively plan that anyway diff --git a/src/test/regress/expected/subqueries_not_supported.out b/src/test/regress/expected/subqueries_not_supported.out index 0bb5da386..fcd33386e 100644 --- a/src/test/regress/expected/subqueries_not_supported.out +++ b/src/test/regress/expected/subqueries_not_supported.out @@ -83,46 +83,6 @@ DEBUG: generating subplan XXX_1 for subquery SELECT users_table.value_2 FROM pu DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT foo.value_2 FROM ((SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer)) foo LEFT JOIN (SELECT users_table.value_2 FROM public.users_table, public.events_table WHERE ((users_table.user_id OPERATOR(pg_catalog.=) events_table.user_id) AND (events_table.event_type OPERATOR(pg_catalog.=) ANY (ARRAY[5, 6, 7, 8])))) bar ON ((foo.value_2 OPERATOR(pg_catalog.=) bar.value_2))) ERROR: cannot pushdown the subquery DETAIL: Complex subqueries and CTEs cannot be in the outer part of the outer join --- Aggregates in subquery without partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - users_table.user_id = events_table.user_id - ) -; -ERROR: cannot push down this subquery -DETAIL: Aggregates without group by are currently unsupported when a subquery references a column from another query --- Having qual without group by on partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - events_table.user_id = users_table.user_id - HAVING - MIN(value_2) > 2 - ) -; -ERROR: cannot push down this subquery -DETAIL: Having qual without group by on partition column is currently unsupported when a subquery references a column from another query -- We do not support GROUPING SETS in subqueries -- This also includes ROLLUP or CUBE clauses SELECT * FROM (SELECT user_id, value_1 FROM users_table GROUP BY GROUPING SETS ((user_id), (value_1))) s; diff --git a/src/test/regress/expected/subquery_in_targetlist.out b/src/test/regress/expected/subquery_in_targetlist.out index d3352e32a..d266ff3b0 100644 --- a/src/test/regress/expected/subquery_in_targetlist.out +++ b/src/test/regress/expected/subquery_in_targetlist.out @@ -31,14 +31,20 @@ ORDER BY 1,2 LIMIT 1; SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Aggregates without group by are currently unsupported when a subquery references a column from another query + event_type | max +--------------------------------------------------------------------- + 0 | Thu Nov 23 13:52:54.83829 2017 +(1 row) + -- correlated subquery wtth limit SELECT event_type, (SELECT time FROM users_table WHERE user_id = e.user_id ORDER BY time LIMIT 1) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + event_type | time +--------------------------------------------------------------------- + 0 | Wed Nov 22 18:19:49.944985 2017 +(1 row) + -- correlated subquery with group by distribution column SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id GROUP BY user_id) FROM events_table e @@ -52,8 +58,11 @@ ORDER BY 1,2 LIMIT 1; SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id GROUP BY e.user_id) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Group by list without partition column is currently unsupported when a subquery references a column from another query + event_type | max +--------------------------------------------------------------------- + 0 | Thu Nov 23 13:52:54.83829 2017 +(1 row) + -- correlated subquery co-located join in outer query SELECT event_type, (SELECT max(time) FROM users_table WHERE user_id = e.user_id GROUP BY user_id) FROM users_table u JOIN events_table e USING (user_id) @@ -81,8 +90,11 @@ ERROR: complex joins are only supported when all distributed tables are co-loca SELECT event_type, (SELECT max(time) FROM users_reference_table WHERE user_id = e.value_2) FROM events_table e ORDER BY 1,2 LIMIT 1; -ERROR: cannot push down this subquery -DETAIL: Aggregates without group by are currently unsupported when a subquery references a column from another query + event_type | max +--------------------------------------------------------------------- + 0 | Thu Nov 23 13:52:54.83829 2017 +(1 row) + -- correlated subquery with reference table and group by SELECT event_type, (SELECT max(time) FROM users_reference_table WHERE user_id = e.value_2 GROUP BY user_id) FROM events_table e diff --git a/src/test/regress/expected/subquery_in_where.out b/src/test/regress/expected/subquery_in_where.out index 39221a7e5..138c2fcc9 100644 --- a/src/test/regress/expected/subquery_in_where.out +++ b/src/test/regress/expected/subquery_in_where.out @@ -691,6 +691,48 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS c 101 (1 row) +-- correlated subquery with aggregate in WHERE +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- +(0 rows) + +-- correlated subquery with aggregate in HAVING +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + events_table.user_id = users_table.user_id + HAVING + MIN(value_2) > 2 + ) +; + user_id | time | value_1 | value_2 | value_3 | value_4 +--------------------------------------------------------------------- +(0 rows) + -- Local tables also planned recursively, so using it as part of the FROM clause -- make the clause recurring CREATE TABLE local_table(id int, value_1 int); diff --git a/src/test/regress/expected/with_set_operations.out b/src/test/regress/expected/with_set_operations.out index c3ccbd4bd..461a184e5 100644 --- a/src/test/regress/expected/with_set_operations.out +++ b/src/test/regress/expected/with_set_operations.out @@ -413,7 +413,14 @@ DEBUG: generating subplan XXX_1 for CTE cte_1: SELECT user_id FROM public.users DEBUG: generating subplan XXX_2 for CTE cte_1: SELECT user_id FROM public.users_table DEBUG: generating subplan XXX_3 for subquery SELECT cte_1.user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) cte_1 UNION SELECT cte_1.user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) cte_1 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT DISTINCT user_id FROM public.events_table WHERE (event_type OPERATOR(pg_catalog.=) ANY (SELECT users_table.user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) foo, public.users_table WHERE ((users_table.value_2 OPERATOR(pg_catalog.=) foo.user_id) AND (events_table.user_id OPERATOR(pg_catalog.=) users_table.user_id)) LIMIT 5)) ORDER BY user_id DESC -ERROR: cannot push down this subquery -DETAIL: Limit in subquery is currently unsupported when a subquery references a column from another query + user_id +--------------------------------------------------------------------- + 5 + 4 + 3 + 2 + 1 +(5 rows) + SET client_min_messages TO DEFAULT; SET search_path TO public; diff --git a/src/test/regress/sql/subqueries_not_supported.sql b/src/test/regress/sql/subqueries_not_supported.sql index b08fe254c..242623a3f 100644 --- a/src/test/regress/sql/subqueries_not_supported.sql +++ b/src/test/regress/sql/subqueries_not_supported.sql @@ -78,46 +78,6 @@ FROM ON(foo.value_2 = bar.value_2); --- Aggregates in subquery without partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - users_table.user_id = events_table.user_id - ) -; - - --- Having qual without group by on partition column can be planned recursively --- unless there is a reference to an outer query -SELECT - * -FROM - users_table -WHERE - user_id IN - ( - SELECT - SUM(events_table.user_id) - FROM - events_table - WHERE - events_table.user_id = users_table.user_id - HAVING - MIN(value_2) > 2 - ) -; - - -- We do not support GROUPING SETS in subqueries -- This also includes ROLLUP or CUBE clauses SELECT * FROM (SELECT user_id, value_1 FROM users_table GROUP BY GROUPING SETS ((user_id), (value_1))) s; diff --git a/src/test/regress/sql/subquery_in_where.sql b/src/test/regress/sql/subquery_in_where.sql index 884307845..539fa27f7 100644 --- a/src/test/regress/sql/subquery_in_where.sql +++ b/src/test/regress/sql/subquery_in_where.sql @@ -502,6 +502,43 @@ WHERE value_1 IN (SELECT value_1 FROM users_Table) OR (EXISTS (SELECT * FROM events_table)); +-- correlated subquery with aggregate in WHERE +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + ) +; + +-- correlated subquery with aggregate in HAVING +SELECT + * +FROM + users_table +WHERE + user_id IN + ( + SELECT + SUM(events_table.user_id) + FROM + events_table + WHERE + events_table.user_id = users_table.user_id + HAVING + MIN(value_2) > 2 + ) +; + + -- Local tables also planned recursively, so using it as part of the FROM clause -- make the clause recurring CREATE TABLE local_table(id int, value_1 int); From 23dccd89418f5852280080f4601af42a49ba0c77 Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Fri, 4 Dec 2020 13:23:54 +0100 Subject: [PATCH 2/3] Add some new tests for complex correlated subqueries in WHERE --- .../expected/subquery_in_targetlist.out | 8 +- .../regress/expected/subquery_in_where.out | 179 ++++++++++++++++++ src/test/regress/sql/subquery_in_where.sql | 134 +++++++++++++ 3 files changed, 319 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/subquery_in_targetlist.out b/src/test/regress/expected/subquery_in_targetlist.out index d266ff3b0..f654af850 100644 --- a/src/test/regress/expected/subquery_in_targetlist.out +++ b/src/test/regress/expected/subquery_in_targetlist.out @@ -282,8 +282,12 @@ SELECT (SELECT value_2 FROM view_1 WHERE user_id = e.user_id GROUP BY value_2) FROM events_table e GROUP BY 1 ORDER BY 1 LIMIT 3; -ERROR: cannot push down this subquery -DETAIL: Group by list without partition column is currently unsupported when a subquery references a column from another query + value_2 +--------------------------------------------------------------------- + 3 + +(2 rows) + -- without view in the outer query FROM SELECT (SELECT value_2 FROM view_1 WHERE user_id = e.user_id GROUP BY user_id, value_2) FROM view_1 e diff --git a/src/test/regress/expected/subquery_in_where.out b/src/test/regress/expected/subquery_in_where.out index 138c2fcc9..a8f7fe2c6 100644 --- a/src/test/regress/expected/subquery_in_where.out +++ b/src/test/regress/expected/subquery_in_where.out @@ -788,6 +788,185 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS c 10 (1 row) +-- basic NOT IN correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_2 NOT IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 7 +(1 row) + +-- correlated subquery with limit +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id ORDER BY value_2 LIMIT 1); + count +--------------------------------------------------------------------- + 10 +(1 row) + +-- correlated subquery with distinct +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT DISTINCT (value_3) FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 90 +(1 row) + +-- correlated subquery with aggregate +SELECT + count(*) +FROM + events_table e +WHERE + value_2 = (SELECT max(value_2) FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 11 +(1 row) + +-- correlated subquery with window function +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT row_number() OVER () FROM users_table WHERE user_id = e.user_id); + count +--------------------------------------------------------------------- + 94 +(1 row) + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + count +--------------------------------------------------------------------- + 72 +(1 row) + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + count +--------------------------------------------------------------------- + 72 +(1 row) + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2); + count +--------------------------------------------------------------------- + 10 +(1 row) + +-- correlated subquery with having +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT 1)); + count +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT e.value_3)); +ERROR: Subqueries in HAVING cannot refer to outer query +-- nested correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE user_id = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + count +--------------------------------------------------------------------- + 0 +(1 row) + +-- not co-located correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE value_2 = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- even more subtle cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY u.value_2 HAVING min(r.value_3) > e.value_3); +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- not a correlated subquery, uses recursive planning +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY r.value_2 HAVING min(r.value_3) > 0); +DEBUG: generating subplan XXX_1 for subquery SELECT min(r.value_3) AS v FROM (public.users_reference_table r JOIN public.users_table u USING (user_id)) WHERE (u.value_2 OPERATOR(pg_catalog.>) 3) GROUP BY r.value_2 HAVING (min(r.value_3) OPERATOR(pg_catalog.>) (0)::double precision) +DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM public.events_table e WHERE (value_3 OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.v FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(v double precision))) + count +--------------------------------------------------------------------- + 24 +(1 row) + SET client_min_messages TO DEFAULT; DROP TABLE local_table; DROP SCHEMA subquery_in_where CASCADE; diff --git a/src/test/regress/sql/subquery_in_where.sql b/src/test/regress/sql/subquery_in_where.sql index 539fa27f7..4f43a967b 100644 --- a/src/test/regress/sql/subquery_in_where.sql +++ b/src/test/regress/sql/subquery_in_where.sql @@ -579,6 +579,140 @@ IN FROM local_table); +-- basic NOT IN correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_2 NOT IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with limit +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id ORDER BY value_2 LIMIT 1); + +-- correlated subquery with distinct +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT DISTINCT (value_3) FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with aggregate +SELECT + count(*) +FROM + events_table e +WHERE + value_2 = (SELECT max(value_2) FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with window function +SELECT + count(*) +FROM + events_table e +WHERE + value_2 IN (SELECT row_number() OVER () FROM users_table WHERE user_id = e.user_id); + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); + + +-- correlated subquery with group by +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2); + +-- correlated subquery with having +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT 1)); + +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT e.value_3)); + +-- nested correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE user_id = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + +-- not co-located correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE value_2 = e.user_id) u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + +-- cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); + +-- even more subtle cartesian correlated subquery +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY u.value_2 HAVING min(r.value_3) > e.value_3); + +-- not a correlated subquery, uses recursive planning +SELECT + count(*) +FROM + events_table e +WHERE + value_3 IN ( + SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) + WHERE u.value_2 > 3 + GROUP BY r.value_2 HAVING min(r.value_3) > 0); + SET client_min_messages TO DEFAULT; DROP TABLE local_table; From 100e5d3196bbe9b252d6ed8d84d68c90666d5d3d Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Tue, 15 Dec 2020 15:23:38 +0100 Subject: [PATCH 3/3] Address review feedback --- .../planner/query_pushdown_planning.c | 185 ++++++++++-------- .../distributed/planner/recursive_planning.c | 3 +- src/include/distributed/recursive_planning.h | 3 + .../expected/multi_mx_router_planner.out | 2 +- .../multi_subquery_complex_queries.out | 39 ++-- .../regress/expected/subquery_in_where.out | 180 +++++++++++++++++ .../regress/sql/multi_mx_router_planner.sql | 2 +- .../sql/multi_subquery_complex_queries.sql | 33 ++-- src/test/regress/sql/subquery_in_where.sql | 137 +++++++++++++ 9 files changed, 458 insertions(+), 126 deletions(-) diff --git a/src/backend/distributed/planner/query_pushdown_planning.c b/src/backend/distributed/planner/query_pushdown_planning.c index 07bb8a9b2..38cc844ab 100644 --- a/src/backend/distributed/planner/query_pushdown_planning.c +++ b/src/backend/distributed/planner/query_pushdown_planning.c @@ -34,6 +34,7 @@ #include "distributed/pg_dist_partition.h" #include "distributed/query_utils.h" #include "distributed/query_pushdown_planning.h" +#include "distributed/recursive_planning.h" #include "distributed/relation_restriction_equivalence.h" #include "distributed/version_compat.h" #include "nodes/nodeFuncs.h" @@ -78,6 +79,7 @@ static RecurringTuplesType FromClauseRecurringTupleType(Query *queryTree); static DeferredErrorMessage * DeferredErrorIfUnsupportedRecurringTuplesJoin( PlannerRestrictionContext *plannerRestrictionContext); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); +static DeferredErrorMessage * DeferErrorIfSubqueryRequiresMerge(Query *subqueryTree); static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static RecurringTuplesType FetchFirstRecurType(PlannerInfo *plannerInfo, Relids relids); @@ -911,7 +913,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi { bool preconditionsSatisfied = true; char *errorDetail = NULL; - StringInfo errorInfo = NULL; DeferredErrorMessage *deferredError = DeferErrorIfUnsupportedTableCombination( subqueryTree); @@ -934,84 +935,12 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi * push down SQL features within such a function, as long as co-located join * checks are applied. */ - if (!contain_vars_of_level((Node *) subqueryTree, 1)) + if (!ContainsReferencesToOuterQuery(subqueryTree)) { - if (subqueryTree->limitOffset) + deferredError = DeferErrorIfSubqueryRequiresMerge(subqueryTree); + if (deferredError) { - preconditionsSatisfied = false; - errorDetail = "Offset clause is currently unsupported when a subquery " - "references a column from another query"; - } - - /* limit is not supported when SubqueryPushdown is not set */ - if (subqueryTree->limitCount && !SubqueryPushdown) - { - preconditionsSatisfied = false; - errorDetail = "Limit in subquery is currently unsupported when a " - "subquery references a column from another query"; - } - - /* group clause list must include partition column */ - if (subqueryTree->groupClause) - { - List *groupClauseList = subqueryTree->groupClause; - List *targetEntryList = subqueryTree->targetList; - List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, - targetEntryList); - bool groupOnPartitionColumn = - TargetListOnPartitionColumn(subqueryTree, groupTargetEntryList); - if (!groupOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Group by list without partition column is currently " - "unsupported when a subquery references a column " - "from another query"; - } - } - - /* we don't support aggregates without group by */ - if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) - { - preconditionsSatisfied = false; - errorDetail = "Aggregates without group by are currently unsupported " - "when a subquery references a column from another query"; - } - - /* having clause without group by on partition column is not supported */ - if (subqueryTree->havingQual && (subqueryTree->groupClause == NULL)) - { - preconditionsSatisfied = false; - errorDetail = "Having qual without group by on partition column is " - "currently unsupported when a subquery references " - "a column from another query"; - } - - /* - * We support window functions when the window function - * is partitioned on distribution column. - */ - if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, - &errorInfo)) - { - errorDetail = (char *) errorInfo->data; - preconditionsSatisfied = false; - } - - /* distinct clause list must include partition column */ - if (subqueryTree->distinctClause) - { - List *distinctClauseList = subqueryTree->distinctClause; - List *targetEntryList = subqueryTree->targetList; - List *distinctTargetEntryList = GroupTargetEntryList(distinctClauseList, - targetEntryList); - bool distinctOnPartitionColumn = - TargetListOnPartitionColumn(subqueryTree, distinctTargetEntryList); - if (!distinctOnPartitionColumn) - { - preconditionsSatisfied = false; - errorDetail = "Distinct on columns without partition column is " - "currently unsupported"; - } + return deferredError; } } @@ -1080,6 +1009,108 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi } +/* + * DeferErrorIfSubqueryRequiresMerge returns a deferred error if the subquery + * requires a merge step on the coordinator (e.g. limit, group by non-distribution + * column, etc.). + */ +static DeferredErrorMessage * +DeferErrorIfSubqueryRequiresMerge(Query *subqueryTree) +{ + bool preconditionsSatisfied = true; + char *errorDetail = NULL; + + if (subqueryTree->limitOffset) + { + preconditionsSatisfied = false; + errorDetail = "Offset clause is currently unsupported when a subquery " + "references a column from another query"; + } + + /* limit is not supported when SubqueryPushdown is not set */ + if (subqueryTree->limitCount && !SubqueryPushdown) + { + preconditionsSatisfied = false; + errorDetail = "Limit in subquery is currently unsupported when a " + "subquery references a column from another query"; + } + + /* group clause list must include partition column */ + if (subqueryTree->groupClause) + { + List *groupClauseList = subqueryTree->groupClause; + List *targetEntryList = subqueryTree->targetList; + List *groupTargetEntryList = GroupTargetEntryList(groupClauseList, + targetEntryList); + bool groupOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, groupTargetEntryList); + if (!groupOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Group by list without partition column is currently " + "unsupported when a subquery references a column " + "from another query"; + } + } + + /* we don't support aggregates without group by */ + if (subqueryTree->hasAggs && (subqueryTree->groupClause == NULL)) + { + preconditionsSatisfied = false; + errorDetail = "Aggregates without group by are currently unsupported " + "when a subquery references a column from another query"; + } + + /* having clause without group by on partition column is not supported */ + if (subqueryTree->havingQual && (subqueryTree->groupClause == NULL)) + { + preconditionsSatisfied = false; + errorDetail = "Having qual without group by on partition column is " + "currently unsupported when a subquery references " + "a column from another query"; + } + + /* + * We support window functions when the window function + * is partitioned on distribution column. + */ + StringInfo errorInfo = NULL; + if (subqueryTree->hasWindowFuncs && !SafeToPushdownWindowFunction(subqueryTree, + &errorInfo)) + { + errorDetail = (char *) errorInfo->data; + preconditionsSatisfied = false; + } + + /* distinct clause list must include partition column */ + if (subqueryTree->distinctClause) + { + List *distinctClauseList = subqueryTree->distinctClause; + List *targetEntryList = subqueryTree->targetList; + List *distinctTargetEntryList = GroupTargetEntryList(distinctClauseList, + targetEntryList); + bool distinctOnPartitionColumn = + TargetListOnPartitionColumn(subqueryTree, distinctTargetEntryList); + if (!distinctOnPartitionColumn) + { + preconditionsSatisfied = false; + errorDetail = "Distinct on columns without partition column is " + "currently unsupported"; + } + } + + /* finally check and return deferred if not satisfied */ + if (!preconditionsSatisfied) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot push down this subquery", + errorDetail, NULL); + } + + return NULL; +} + + /* * DeferErrorIfUnsupportedTableCombination checks if the given query tree contains any * unsupported range table combinations. For this, the function walks over all diff --git a/src/backend/distributed/planner/recursive_planning.c b/src/backend/distributed/planner/recursive_planning.c index f8cc45cc7..a6761a7f4 100644 --- a/src/backend/distributed/planner/recursive_planning.c +++ b/src/backend/distributed/planner/recursive_planning.c @@ -180,7 +180,6 @@ static bool IsLocalTableRteOrMatView(Node *node); static DistributedSubPlan * CreateDistributedSubPlan(uint32 subPlanId, Query *subPlanQuery); static bool CteReferenceListWalker(Node *node, CteReferenceWalkerContext *context); -static bool ContainsReferencesToOuterQuery(Query *query); static bool ContainsReferencesToOuterQueryWalker(Node *node, VarLevelsUpWalkerContext *context); static bool NodeContainsSubqueryReferencingOuterQuery(Node *node); @@ -1288,7 +1287,7 @@ CteReferenceListWalker(Node *node, CteReferenceWalkerContext *context) * anything that points outside of the query itself. Such queries cannot be * planned recursively. */ -static bool +bool ContainsReferencesToOuterQuery(Query *query) { VarLevelsUpWalkerContext context = { 0 }; diff --git a/src/include/distributed/recursive_planning.h b/src/include/distributed/recursive_planning.h index 0a64f6845..98d230cb2 100644 --- a/src/include/distributed/recursive_planning.h +++ b/src/include/distributed/recursive_planning.h @@ -49,4 +49,7 @@ extern void ReplaceRTERelationWithRteSubquery(RangeTblEntry *rangeTableEntry, RecursivePlanningContext *context); extern bool IsRecursivelyPlannableRelation(RangeTblEntry *rangeTableEntry); extern bool IsRelationLocalTableOrMatView(Oid relationId); +extern bool ContainsReferencesToOuterQuery(Query *query); + + #endif /* RECURSIVE_PLANNING_H */ diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index f122bb4d0..bd82b4dd1 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -484,7 +484,7 @@ DEBUG: query has a single distribution column value: 1 41 | 11814 (5 rows) --- subqueries are not supported in SELECT clause +-- subqueries in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; DEBUG: Router planner cannot handle multi-shard select queries diff --git a/src/test/regress/expected/multi_subquery_complex_queries.out b/src/test/regress/expected/multi_subquery_complex_queries.out index 1d526f575..dc4e62616 100644 --- a/src/test/regress/expected/multi_subquery_complex_queries.out +++ b/src/test/regress/expected/multi_subquery_complex_queries.out @@ -1227,7 +1227,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1240,9 +1240,7 @@ limit 50; -- reset subquery_pushdown SET citus.subquery_pushdown to OFF; --- we recursively plan recent_events_1 --- but not some_users_data since it has a reference --- from an outer query which is not recursively planned +-- mixture of recursively planned subqueries and correlated subqueries SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen @@ -1270,24 +1268,21 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; user_id | lastseen --------------------------------------------------------------------- + 1 | Thu Nov 23 18:08:26.550729 2017 2 | Thu Nov 23 17:26:14.563216 2017 3 | Thu Nov 23 18:08:26.550729 2017 3 | Thu Nov 23 17:26:14.563216 2017 - 5 | Thu Nov 23 18:08:26.550729 2017 5 | Thu Nov 23 17:26:14.563216 2017 6 | Thu Nov 23 18:08:26.550729 2017 (6 rows) --- we recursively plan some queries but fail in the end --- since some_users_data since it has a reference --- from an outer query which is not recursively planned SELECT "some_users_data".user_id, lastseen FROM (SELECT 2 * user_id as user_id, max(time) AS lastseen @@ -1315,7 +1310,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1379,7 +1374,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC @@ -1447,7 +1442,7 @@ SELECT "some_users_data".user_id, MAX(lastseen), count(*) WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true GROUP BY 1 ORDER BY 2, 1 DESC LIMIT 10; @@ -1504,7 +1499,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1519,7 +1514,7 @@ DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.<>) user_where_1_join_1.user_id)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 DEBUG: Router planner cannot handle multi-shard select queries DEBUG: push down of limit count: 10 -DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) ORDER BY users.user_id LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 DEBUG: Creating router plan user_id | lastseen @@ -1584,7 +1579,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1595,7 +1590,7 @@ DEBUG: generating subplan XXX_1 for subquery SELECT user_id, value_1 FROM publi DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT filter_users_1.user_id, last_events_1."time" AS lastseen FROM ((SELECT user_where_1_1.user_id FROM ((SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 1) AND (users.user_id OPERATOR(pg_catalog.<) 4) AND (users.value_1 OPERATOR(pg_catalog.>) 2))) user_where_1_1 JOIN (SELECT intermediate_result.user_id, intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, value_1 integer)) user_where_1_join_1 ON ((user_where_1_1.user_id OPERATOR(pg_catalog.=) user_where_1_join_1.value_1)))) filter_users_1 JOIN LATERAL (SELECT events.user_id, events."time" FROM public.events_table events WHERE ((events.user_id OPERATOR(pg_catalog.>) 1) AND (events.user_id OPERATOR(pg_catalog.<) 4) AND (events.user_id OPERATOR(pg_catalog.=) filter_users_1.user_id)) ORDER BY events."time" DESC LIMIT 1) last_events_1 ON (true)) ORDER BY last_events_1."time" DESC LIMIT 10 DEBUG: push down of limit count: 10 -DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 +DEBUG: generating subplan XXX_3 for subquery SELECT some_users_data.user_id, some_recent_users.lastseen FROM ((SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_recent_users JOIN LATERAL (SELECT users.user_id FROM public.users_table users WHERE ((users.user_id OPERATOR(pg_catalog.=) some_recent_users.user_id) AND (users.value_2 OPERATOR(pg_catalog.>) 4)) ORDER BY users.user_id LIMIT 1) some_users_data ON (true)) ORDER BY some_recent_users.lastseen DESC LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, lastseen FROM (SELECT intermediate_result.user_id, intermediate_result.lastseen FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, lastseen timestamp without time zone)) some_users ORDER BY user_id DESC, lastseen DESC LIMIT 10 user_id | lastseen --------------------------------------------------------------------- @@ -1667,9 +1662,7 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- complex lateral join between inner join and correlated subquery SELECT user_id, lastseen FROM (SELECT @@ -1715,7 +1708,7 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -2339,9 +2332,7 @@ LIMIT 10; (1 row) SET citus.subquery_pushdown to OFF; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- on side of the lateral join can be recursively plannen, then pushed down SELECT * FROM (SELECT @@ -2372,7 +2363,7 @@ FROM WHERE "users"."value_2" = "some_recent_users"."user_id" AND value_2 > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY value_2 DESC LIMIT 10) "some_users" diff --git a/src/test/regress/expected/subquery_in_where.out b/src/test/regress/expected/subquery_in_where.out index a8f7fe2c6..ac98d85f6 100644 --- a/src/test/regress/expected/subquery_in_where.out +++ b/src/test/regress/expected/subquery_in_where.out @@ -967,6 +967,186 @@ DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS c 24 (1 row) +-- two levels of correlation should also allow +-- merge step in the subquery +SELECT sum(value_1) +FROM users_table u +WHERE EXISTS + (SELECT 1 + FROM events_table e + WHERE u.user_id = e.user_id AND + EXISTS + (SELECT 1 + FROM users_table u2 + WHERE u2.user_id = u.user_id AND u2.value_1 = 5 + LIMIT 1)); + sum +--------------------------------------------------------------------- + 216 +(1 row) + +-- correlated subquery in WHERE, with a slightly +-- different syntax that the result of the subquery +-- is compared with a constant +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115; + sum +--------------------------------------------------------------------- + +(1 row) + +-- a correlated subquery which requires merge step +-- can be pushed down on UPDATE/DELETE queries as well +-- rollback to keep the rest of the tests unchanged +BEGIN; +UPDATE users_table u1 + SET value_1 = (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id); +DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id) > 10; +ROLLBACK; +-- a correlated anti-join can also be pushed down even if the subquery +-- has a LIMIT +SELECT avg(value_1) +FROM users_table u +WHERE NOT EXISTS + (SELECT 'XXX' + FROM events_table e + WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); + avg +--------------------------------------------------------------------- + 2.5544554455445545 +(1 row) + +-- a [correlated] lateral join can also be pushed down even if the subquery +-- has an aggregate wout a GROUP BY +SELECT + max(min_of_val_2), max(u1.value_1) +FROM + users_table u1 + LEFT JOIN LATERAL + (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); + max | max +--------------------------------------------------------------------- + 1 | 5 +(1 row) + +-- a self join is followed by a correlated subquery +EXPLAIN (COSTS OFF) +SELECT + * +FROM + users_table u1 JOIN users_table u2 USING (user_id) +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id) > 10; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Hash Join + Hash Cond: (u2.user_id = u1.user_id) + Join Filter: (u1.value_1 < u2.value_1) + -> Seq Scan on users_table_1400256 u2 + Filter: ((SubPlan 1) > 10) + SubPlan 1 + -> Aggregate + -> Seq Scan on events_table_1400260 e1 + Filter: (user_id = u2.user_id) + -> Hash + -> Seq Scan on users_table_1400256 u1 +(16 rows) + +-- when the colocated join of the FROM clause +-- entries happen on WHERE clause, Citus cannot +-- pushdown +-- Likely that the colocation checks should be +-- improved +SELECT + u1.user_id, u2.user_id +FROM + users_table u1, users_table u2 +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id AND + u1.user_id = u2.user_id) > 10 +ORDER BY 1,2; +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- create a view that contains correlated subquery +CREATE TEMPORARY VIEW correlated_subquery_view AS + SELECT u1.user_id + FROM users_table u1 + WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 0; +SELECT sum(user_id) FROM correlated_subquery_view; + sum +--------------------------------------------------------------------- + 376 +(1 row) + +-- now, join the view with another correlated subquery +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); + sum +--------------------------------------------------------------------- + 459 +(1 row) + +-- as an edge case, JOIN is on false +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); + sum +--------------------------------------------------------------------- + +(1 row) + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id AND false + ) > 115; +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115 AND false; + sum +--------------------------------------------------------------------- + +(1 row) + SET client_min_messages TO DEFAULT; DROP TABLE local_table; DROP SCHEMA subquery_in_where CASCADE; diff --git a/src/test/regress/sql/multi_mx_router_planner.sql b/src/test/regress/sql/multi_mx_router_planner.sql index 688a7c944..b241439b0 100644 --- a/src/test/regress/sql/multi_mx_router_planner.sql +++ b/src/test/regress/sql/multi_mx_router_planner.sql @@ -235,7 +235,7 @@ FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id and articles_hash_mx.author_id = 1 ORDER BY articles_hash_mx.id; --- subqueries are not supported in SELECT clause +-- subqueries in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; diff --git a/src/test/regress/sql/multi_subquery_complex_queries.sql b/src/test/regress/sql/multi_subquery_complex_queries.sql index 7fe8a90bf..28e468712 100644 --- a/src/test/regress/sql/multi_subquery_complex_queries.sql +++ b/src/test/regress/sql/multi_subquery_complex_queries.sql @@ -1119,7 +1119,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1128,9 +1128,7 @@ limit 50; -- reset subquery_pushdown SET citus.subquery_pushdown to OFF; --- we recursively plan recent_events_1 --- but not some_users_data since it has a reference --- from an outer query which is not recursively planned +-- mixture of recursively planned subqueries and correlated subqueries SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen @@ -1158,15 +1156,12 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id limit 50; --- we recursively plan some queries but fail in the end --- since some_users_data since it has a reference --- from an outer query which is not recursively planned SELECT "some_users_data".user_id, lastseen FROM (SELECT 2 * user_id as user_id, max(time) AS lastseen @@ -1194,7 +1189,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 1 and users.value_2 < 3 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY user_id @@ -1251,7 +1246,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" + ORDER BY 1 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC @@ -1306,7 +1301,7 @@ SELECT "some_users_data".user_id, MAX(lastseen), count(*) WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true GROUP BY 1 ORDER BY 2, 1 DESC LIMIT 10; @@ -1360,7 +1355,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1418,7 +1413,7 @@ FROM WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -1484,9 +1479,7 @@ ORDER BY user_id DESC, lastseen DESC LIMIT 10; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- complex lateral join between inner join and correlated subquery SELECT user_id, lastseen FROM (SELECT @@ -1532,7 +1525,7 @@ FROM WHERE "users"."value_1" = "some_recent_users"."user_id" AND "users"."value_2" > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY lastseen DESC LIMIT 10) "some_users" @@ -2033,9 +2026,7 @@ ORDER BY LIMIT 10; SET citus.subquery_pushdown to OFF; --- not pushdownable since lower LATERAL JOIN is not on the partition key --- not recursively plannable due to LATERAL join where there is a reference --- from an outer query +-- on side of the lateral join can be recursively plannen, then pushed down SELECT * FROM (SELECT @@ -2066,7 +2057,7 @@ FROM WHERE "users"."value_2" = "some_recent_users"."user_id" AND value_2 > 4 - LIMIT 1) "some_users_data" ON true + ORDER BY 1 LIMIT 1) "some_users_data" ON true ORDER BY value_2 DESC LIMIT 10) "some_users" diff --git a/src/test/regress/sql/subquery_in_where.sql b/src/test/regress/sql/subquery_in_where.sql index 4f43a967b..5a383c927 100644 --- a/src/test/regress/sql/subquery_in_where.sql +++ b/src/test/regress/sql/subquery_in_where.sql @@ -713,6 +713,143 @@ WHERE WHERE u.value_2 > 3 GROUP BY r.value_2 HAVING min(r.value_3) > 0); +-- two levels of correlation should also allow +-- merge step in the subquery +SELECT sum(value_1) +FROM users_table u +WHERE EXISTS + (SELECT 1 + FROM events_table e + WHERE u.user_id = e.user_id AND + EXISTS + (SELECT 1 + FROM users_table u2 + WHERE u2.user_id = u.user_id AND u2.value_1 = 5 + LIMIT 1)); + +-- correlated subquery in WHERE, with a slightly +-- different syntax that the result of the subquery +-- is compared with a constant +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115; + + +-- a correlated subquery which requires merge step +-- can be pushed down on UPDATE/DELETE queries as well +-- rollback to keep the rest of the tests unchanged +BEGIN; +UPDATE users_table u1 + SET value_1 = (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id); + +DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id) > 10; + +ROLLBACK; + +-- a correlated anti-join can also be pushed down even if the subquery +-- has a LIMIT +SELECT avg(value_1) +FROM users_table u +WHERE NOT EXISTS + (SELECT 'XXX' + FROM events_table e + WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); + +-- a [correlated] lateral join can also be pushed down even if the subquery +-- has an aggregate wout a GROUP BY +SELECT + max(min_of_val_2), max(u1.value_1) +FROM + users_table u1 + LEFT JOIN LATERAL + (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); + + +-- a self join is followed by a correlated subquery +EXPLAIN (COSTS OFF) +SELECT + * +FROM + users_table u1 JOIN users_table u2 USING (user_id) +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id) > 10; + +-- when the colocated join of the FROM clause +-- entries happen on WHERE clause, Citus cannot +-- pushdown +-- Likely that the colocation checks should be +-- improved +SELECT + u1.user_id, u2.user_id +FROM + users_table u1, users_table u2 +WHERE + u1.value_1 < u2.value_1 AND + (SELECT + count(*) + FROM + events_table e1 + WHERE + e1.user_id = u2.user_id AND + u1.user_id = u2.user_id) > 10 +ORDER BY 1,2; + + +-- create a view that contains correlated subquery +CREATE TEMPORARY VIEW correlated_subquery_view AS + SELECT u1.user_id + FROM users_table u1 + WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 0; + +SELECT sum(user_id) FROM correlated_subquery_view; + +-- now, join the view with another correlated subquery +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); + +-- as an edge case, JOIN is on false +SELECT + sum(mx) +FROM + correlated_subquery_view + LEFT JOIN LATERAL + (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); + + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id AND false + ) > 115; + +SELECT sum(value_1) +FROM users_table u1 +WHERE (SELECT COUNT(DISTINCT e1.value_2) + FROM events_table e1 + WHERE e1.user_id = u1.user_id + ) > 115 AND false; + SET client_min_messages TO DEFAULT; DROP TABLE local_table;