From 6f4ee503242d00f03be7142b13805540960b8dc7 Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Wed, 19 Apr 2017 18:20:38 +0300 Subject: [PATCH] Support for subqueries in WHERE clause This commit enables subqueries in WHERE clause to be pushed down by the subquery pushdown logic. The support covers: - Correlated subqueries with IN, NOT IN, EXISTS, NOT EXISTS, operator expressions such as (>, <, =, ALL, ANY etc.) - Non-correlated subqueries with (partition_key) IN (SELECT partition_key ..) (partition_key) =ANY (SELECT partition_key ...) Note that this commit heavily utilizes the attribute equivalence logic introduced in the 1cb6a34ba8430fb2db3c481ac110923cecc0d12a. In general, this commit mostly adjusts the logical planner not to error out on the subqueries in WHERE clause. --- .../planner/multi_logical_planner.c | 201 ++++++- .../expected/multi_mx_router_planner.out | 11 +- .../regress/expected/multi_router_planner.out | 11 +- .../regress/expected/multi_simple_queries.out | 2 +- .../multi_subquery_behavioral_analytics.out | 182 ++++-- .../multi_subquery_in_where_clause.out | 555 ++++++++++++++++++ .../expected/multi_verify_no_subquery.out | 18 - src/test/regress/multi_binary_schedule | 2 +- src/test/regress/multi_schedule | 4 +- .../regress/multi_task_tracker_extra_schedule | 2 +- .../regress/sql/multi_mx_router_planner.sql | 8 - src/test/regress/sql/multi_router_planner.sql | 7 - .../multi_subquery_behavioral_analytics.sql | 104 +++- .../sql/multi_subquery_in_where_clause.sql | 486 +++++++++++++++ .../regress/sql/multi_verify_no_subquery.sql | 22 - 15 files changed, 1444 insertions(+), 171 deletions(-) create mode 100644 src/test/regress/expected/multi_subquery_in_where_clause.out delete mode 100644 src/test/regress/expected/multi_verify_no_subquery.out create mode 100644 src/test/regress/sql/multi_subquery_in_where_clause.sql delete mode 100644 src/test/regress/sql/multi_verify_no_subquery.sql diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index f2b08b096..1fb012375 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -93,6 +93,7 @@ static bool HasOuterJoinWalker(Node *node, void *maxJoinLevel); static bool HasComplexJoinOrder(Query *queryTree); static bool HasComplexRangeTableType(Query *queryTree); static void ValidateClauseList(List *clauseList); +static void ValidateSubqueryPushdownClauseList(List *clauseList); static bool ExtractFromExpressionWalker(Node *node, QualifierWalkerContext *walkerContext); static List * MultiTableNodeList(List *tableEntryList, List *rangeTableList); @@ -102,6 +103,8 @@ static MultiNode * MultiJoinTree(List *joinOrderList, List *collectTableList, static MultiCollect * CollectNodeForTable(List *collectTableList, uint32 rangeTableId); static MultiSelect * MultiSelectNode(List *whereClauseList); static bool IsSelectClause(Node *clause); +static void ErrorIfSublink(Node *clause); +static bool IsSublinkClause(Node *clause); static MultiProject * MultiProjectNode(List *targetEntryList); static MultiExtendedOp * MultiExtendedOpNode(Query *queryTree); @@ -134,6 +137,8 @@ static MultiNode * MultiSubqueryPlanTree(Query *originalQuery, Query *queryTree, PlannerRestrictionContext * plannerRestrictionContext); +static List * SublinkList(Query *originalQuery); +static bool ExtractSublinkWalker(Node *node, List **sublinkList); static MultiNode * SubqueryPushdownMultiPlanTree(Query *queryTree); static List * CreateSubqueryTargetEntryList(List *columnList); @@ -159,15 +164,18 @@ MultiLogicalPlanCreate(Query *originalQuery, Query *queryTree, { MultiNode *multiQueryNode = NULL; MultiTreeRoot *rootNode = NULL; - List *subqueryEntryList = NULL; /* - * We check the existence of subqueries in the modified query given that - * if postgres already flattened the subqueries, MultiPlanTree() can plan - * corresponding distributed plan. + * We check the existence of subqueries in FROM clause on the modified query + * given that if postgres already flattened the subqueries, MultiPlanTree() + * can plan corresponding distributed plan. + * + * We also check the existence of subqueries in WHERE clause. Note that + * this check needs to be done on the original query given that + * standard_planner() may replace the sublinks with anti/semi joins and + * MultiPlanTree() cannot plan such queries. */ - subqueryEntryList = SubqueryEntryList(queryTree); - if (subqueryEntryList != NIL) + if (SubqueryEntryList(queryTree) != NIL || SublinkList(originalQuery) != NIL) { multiQueryNode = MultiSubqueryPlanTree(originalQuery, queryTree, plannerRestrictionContext); @@ -185,6 +193,57 @@ MultiLogicalPlanCreate(Query *originalQuery, Query *queryTree, } +/* + * SublinkList finds the subquery nodes in the where clause of the given query. Note + * that the function should be called on the orignal query given that postgres + * standard_planner() may convert the subqueries in WHERE clause to joins. + */ +static List * +SublinkList(Query *originalQuery) +{ + FromExpr *joinTree = originalQuery->jointree; + Node *queryQuals = NULL; + List *sublinkList = NIL; + + if (!joinTree) + { + return NIL; + } + + queryQuals = joinTree->quals; + ExtractSublinkWalker(queryQuals, &sublinkList); + + return sublinkList; +} + + +/* + * ExtractSublinkWalker walks over a quals node, and finds all sublinks + * in that node. + */ +static bool +ExtractSublinkWalker(Node *node, List **sublinkList) +{ + bool walkerResult = false; + if (node == NULL) + { + return false; + } + + if (IsA(node, SubLink)) + { + (*sublinkList) = lappend(*sublinkList, node); + } + else + { + walkerResult = expression_tree_walker(node, ExtractSublinkWalker, + sublinkList); + } + + return walkerResult; +} + + /* * MultiSubqueryPlanTree gets the query objects and returns logical plan * for subqueries. @@ -336,8 +395,10 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, plannerRestrictionContext) { ListCell *rangeTableEntryCell = NULL; + ListCell *sublinkCell = NULL; List *subqueryEntryList = NIL; bool outerMostQueryHasLimit = false; + List *sublinkList = NIL; DeferredErrorMessage *error = NULL; RelationRestrictionContext *relationRestrictionContext = plannerRestrictionContext->relationRestrictionContext; @@ -395,6 +456,23 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, } } + sublinkList = SublinkList(originalQuery); + foreach(sublinkCell, sublinkList) + { + SubLink *sublink = (SubLink *) lfirst(sublinkCell); + Node *subselect = sublink->subselect; + + if (subselect && IsA(subselect, Query)) + { + error = DeferErrorIfCannotPushdownSubquery((Query *) subselect, + outerMostQueryHasLimit); + if (error) + { + return error; + } + } + } + return NULL; } @@ -549,12 +627,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi return deferredError; } - if (subqueryTree->hasSubLinks) - { - preconditionsSatisfied = false; - errorDetail = "Subqueries other than from-clause subqueries are unsupported"; - } - if (subqueryTree->rtable == NIL) { preconditionsSatisfied = false; @@ -1248,11 +1320,11 @@ ErrorIfQueryNotSupported(Query *queryTree) const char *filterHint = "Consider using an equality filter on the distributed " "table's partition column."; - if (queryTree->hasSubLinks) + if (queryTree->hasSubLinks && SublinkList(queryTree) == NIL) { preconditionsSatisfied = false; errorMessage = "could not run distributed query with subquery outside the " - "FROM clause"; + "FROM and WHERE clauses"; errorHint = filterHint; } @@ -1506,6 +1578,12 @@ DeferErrorIfUnsupportedSubqueryRepartition(Query *subqueryTree) errorDetail = "Subqueries with offset are not supported yet"; } + if (subqueryTree->hasSubLinks) + { + preconditionsSatisfied = false; + errorDetail = "Subqueries other than from-clause subqueries are unsupported"; + } + /* finally check and return error if conditions are not satisfied */ if (!preconditionsSatisfied) { @@ -1752,11 +1830,37 @@ ValidateClauseList(List *clauseList) { Node *clause = (Node *) lfirst(clauseCell); - bool selectClause = IsSelectClause(clause); - bool joinClause = IsJoinClause(clause); - bool orClause = or_clause(clause); + /* produce a better error message for sublinks */ + ErrorIfSublink(clause); - if (!(selectClause || joinClause || orClause)) + if (!(IsSelectClause(clause) || IsJoinClause(clause) || or_clause(clause))) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported clause type"))); + } + } +} + + +/* + * ValidateSubqueryPushdownClauseList walks over the given list of clauses, + * and checks that we can recognize all the clauses. This function ensures + * that we do not drop an unsupported clause type on the floor, and thus + * prevents erroneous results. + * + * Note that this function is slightly different than ValidateClauseList(), + * additionally allowing subkinks. + */ +static void +ValidateSubqueryPushdownClauseList(List *clauseList) +{ + ListCell *clauseCell = NULL; + foreach(clauseCell, clauseList) + { + Node *clause = (Node *) lfirst(clauseCell); + + if (!(IsSublinkClause(clause) || IsSelectClause(clause) || + IsJoinClause(clause) || or_clause(clause))) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unsupported clause type"))); @@ -1810,6 +1914,10 @@ JoinClauseList(List *whereClauseList) * subqueries but differently from the outermost query, they are run on a copy * of parse tree and changes do not get persisted as modifications to the original * query tree. + * + * Also this function adds SubLinks to the baseQualifierList when they appear on + * the query's WHERE clause. The callers of the function should consider processing + * Sublinks as well. */ static bool ExtractFromExpressionWalker(Node *node, QualifierWalkerContext *walkerContext) @@ -2203,7 +2311,8 @@ MultiSelectNode(List *whereClauseList) /* * IsSelectClause determines if the given node is a select clause according to * our criteria. Our criteria defines a select clause as an expression that has - * zero or more columns belonging to only one table. + * zero or more columns belonging to only one table. The function assumes that + * no sublinks exists in the clause. */ static bool IsSelectClause(Node *clause) @@ -2213,16 +2322,6 @@ IsSelectClause(Node *clause) Var *firstColumn = NULL; Index firstColumnTableId = 0; bool isSelectClause = true; - NodeTag nodeTag = nodeTag(clause); - - /* error out for subqueries in WHERE clause */ - if (nodeTag == T_SubLink || nodeTag == T_SubPlan) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot perform distributed planning on this query"), - errdetail("Subqueries other than in from-clause are currently " - "unsupported"))); - } /* extract columns from the clause */ columnList = pull_var_clause_default(clause); @@ -2249,6 +2348,42 @@ IsSelectClause(Node *clause) } +/* + * ErrorIfSublink errors out if the input claise is either sublink or subplan. + */ +static void +ErrorIfSublink(Node *clause) +{ + NodeTag nodeTag = nodeTag(clause); + + /* error out for subqueries in WHERE clause */ + if (nodeTag == T_SubLink || nodeTag == T_SubPlan) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot perform distributed planning on this query"), + errdetail("Subqueries other than in from-clause are currently " + "unsupported"))); + } +} + + +/* + * IsSublinkClause determines if the given node is a sublink or subplan. + */ +static bool +IsSublinkClause(Node *clause) +{ + NodeTag nodeTag = nodeTag(clause); + + if (nodeTag == T_SubLink || nodeTag == T_SubPlan) + { + return true; + } + + return false; +} + + /* * MultiProjectNode builds the project node using the target entry information * from the query tree. The project node only encapsulates projected columns, @@ -2904,9 +3039,13 @@ SubqueryPushdownMultiPlanTree(Query *queryTree) /* verify we can perform distributed planning on this query */ ErrorIfQueryNotSupported(queryTree); - /* extract qualifiers and verify we can plan for them */ + /* + * Extract qualifiers and verify we can plan for them. Note that since + * subquery pushdown join planning is based on restriction equivalence, + * checking for these qualifiers may not be necessary. + */ qualifierList = QualifierList(queryTree->jointree); - ValidateClauseList(qualifierList); + ValidateSubqueryPushdownClauseList(qualifierList); /* * We would be creating a new Query and pushing down top level query's diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index 37f3a8718..da1fa9646 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -422,15 +422,6 @@ DEBUG: Plan is router executable SELECT * FROM articles_hash_mx, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; ERROR: could not run distributed query with complex table expressions HINT: Consider using an equality filter on the distributed table's partition column. --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT id FROM authors_hash_mx WHERE name LIKE '%a'); -ERROR: cannot plan queries that include both regular and partitioned relations -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT author_id FROM articles_hash_mx WHERE author_id = 1 or author_id = 3); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. -SELECT * FROM articles_hash_mx WHERE author_id = (SELECT 1); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. -- subqueries are supported in FROM clause but they are not router plannable SELECT articles_hash_mx.id,test.word_count FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id @@ -502,7 +493,7 @@ HINT: Set citus.task_executor_type to "task-tracker". -- subqueries are not supported in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; -ERROR: could not run distributed query with subquery outside the FROM clause +ERROR: could not run distributed query with subquery outside the FROM and WHERE clauses HINT: Consider using an equality filter on the distributed table's partition column. -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_router_planner.out b/src/test/regress/expected/multi_router_planner.out index c2996c6cb..369ee2f32 100644 --- a/src/test/regress/expected/multi_router_planner.out +++ b/src/test/regress/expected/multi_router_planner.out @@ -520,15 +520,6 @@ DEBUG: Plan is router executable SELECT * FROM articles_hash, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; ERROR: could not run distributed query with complex table expressions HINT: Consider using an equality filter on the distributed table's partition column. --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash WHERE author_id IN (SELECT id FROM authors_hash WHERE name LIKE '%a'); -ERROR: cannot plan queries that include both regular and partitioned relations -SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 1 or author_id = 3); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. -SELECT * FROM articles_hash WHERE author_id = (SELECT 1); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. -- unless the query can be transformed into a join SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 2) @@ -615,7 +606,7 @@ HINT: Set citus.task_executor_type to "task-tracker". -- subqueries are not supported in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash a; -ERROR: could not run distributed query with subquery outside the FROM clause +ERROR: could not run distributed query with subquery outside the FROM and WHERE clauses HINT: Consider using an equality filter on the distributed table's partition column. -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_simple_queries.out b/src/test/regress/expected/multi_simple_queries.out index aead9dd76..b76029bd0 100644 --- a/src/test/regress/expected/multi_simple_queries.out +++ b/src/test/regress/expected/multi_simple_queries.out @@ -247,7 +247,7 @@ ORDER BY articles.id; -- subqueries are not supported in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles a; -ERROR: could not run distributed query with subquery outside the FROM clause +ERROR: could not run distributed query with subquery outside the FROM and WHERE clauses HINT: Consider using an equality filter on the distributed table's partition column. -- joins are not supported between local and distributed tables SELECT title, authors.name FROM authors, articles WHERE authors.id = articles.author_id; diff --git a/src/test/regress/expected/multi_subquery_behavioral_analytics.out b/src/test/regress/expected/multi_subquery_behavioral_analytics.out index 4340d115b..dbe572adc 100644 --- a/src/test/regress/expected/multi_subquery_behavioral_analytics.out +++ b/src/test/regress/expected/multi_subquery_behavioral_analytics.out @@ -546,62 +546,103 @@ ORDER BY user_lastseen DESC, user_id; ------------------------------------ -- Count the number of distinct users_table who are in segment X and Y and Z --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT DISTINCT user_id +SELECT user_id FROM users_table WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) - AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60) +GROUP BY + user_id +ORDER BY + user_id DESC + LIMIT 5; + user_id +--------- + 93 + 90 + 88 + 87 + 84 +(5 rows) + ------------------------------------ -- Find customers who have done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 > 101 AND value_1 < 110 AND value_2 >= 5 - AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + user_id | value_2 +---------+--------- + 95 | 951 + 4 | 934 + 2 | 908 + 90 | 900 + 49 | 847 +(5 rows) + ------------------------------------ -- Customers who haven’t done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 = 101 AND value_2 >= 5 - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; + user_id | value_2 +---------+--------- + 58 | 585 + 51 | 1000 + 48 | 861 +(3 rows) + ------------------------------------ -- Customers who have done X and Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, value_2 FROM users_table WHERE +SELECT user_id, sum(value_2) as cnt FROM users_table WHERE value_1 > 100 AND value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type != 100 AND value_3 > 100 AND user_id = users_table.user_id) - AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id) +GROUP BY + user_id +ORDER BY cnt DESC, user_id DESC +LIMIT 5; + user_id | cnt +---------+------- + 49 | 48606 + 69 | 46524 + 86 | 46163 + 80 | 45995 + 35 | 45437 +(5 rows) + ------------------------------------ -- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 4; + user_id | value_2 +---------+--------- + 8 | 996 + 96 | 995 + 8 | 995 + 96 | 989 +(4 rows) + ------------------------------------ -- Customers who have done X more than 2 times, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, - value_2 + avg(value_2) FROM users_table WHERE value_1 > 100 AND value_1 < 124 @@ -613,9 +654,21 @@ SELECT user_id, AND value_3 > 100 AND user_id = users_table.user_id GROUP BY user_id - HAVING Count(*) > 2); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. + HAVING Count(*) > 2) +GROUP BY + user_id +ORDER BY + 1 DESC, 2 DESC +LIMIT 5; + user_id | avg +---------+---------------------- + 99 | 571.6666666666666667 + 98 | 758.0000000000000000 + 96 | 459.6666666666666667 + 90 | 453.3333333333333333 + 89 | 215.0000000000000000 +(5 rows) + ------------------------------------ -- Find me all users_table who logged in more than once ------------------------------------ @@ -671,7 +724,6 @@ ORDER BY ------------------------------------ -- Find me all users_table who has done some event and has filters --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id FROM events_table @@ -684,18 +736,32 @@ WHERE users_table WHERE value_1 = 15 AND value_2 > 25 - ); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + ) +ORDER BY 1; + user_id +--------- + 7 + 53 +(2 rows) + ------------------------------------ -- Which events_table did people who has done some specific events_table --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, event_type FROM events_table -WHERE user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) -GROUP BY user_id, event_type; -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. +SELECT + user_id, event_type FROM events_table +WHERE + user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) +GROUP BY + user_id, event_type +ORDER BY 2 DESC, 1 +LIMIT 3; + user_id | event_type +---------+------------ + 18 | 999 + 23 | 999 + 26 | 999 +(3 rows) + ------------------------------------ -- Find me all the users_table who has done some event more than three times ------------------------------------ @@ -1950,5 +2016,45 @@ FROM ( ERROR: no value found for parameter 1 SET client_min_messages TO DEFAULT; DROP FUNCTION volatile_func_test(); -SET citus.subquery_pushdown to OFF; +CREATE FUNCTION test_join_function_2(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; +-- we don't support joins via functions +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT u.user_id, e.event_type::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE test_join_function_2(u.user_id, e.user_id) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- note that the following query has joins on the partition keys +-- however we fail to push down it due to the function call on the +-- where clause. We probably need to relax that check +SELECT + users_table.user_id, users_table.value_1, prob +FROM + users_table + JOIN + (SELECT + ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 50 + ) temp + ON users_table.user_id = temp.user_id + WHERE + users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id); +ERROR: unsupported clause type +DROP FUNCTION test_join_function_2(integer, integer); SET citus.enable_router_execution TO TRUE; +SET citus.subquery_pushdown to OFF; diff --git a/src/test/regress/expected/multi_subquery_in_where_clause.out b/src/test/regress/expected/multi_subquery_in_where_clause.out new file mode 100644 index 000000000..52e4a69ec --- /dev/null +++ b/src/test/regress/expected/multi_subquery_in_where_clause.out @@ -0,0 +1,555 @@ +-- +-- multi subquery in where queries aims to expand existing subquery pushdown +-- regression tests to cover more cases specifically subqueries in WHERE clause +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1430000; +-- ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1430000; +-- subqueries in WHERE with greater operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + user_id +--------- + 49 + 55 + 56 + 63 +(4 rows) + +-- subqueries in where with ALL operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL (SELECT avg(value_3) FROM events_table WHERE users_table.user_id = events_table.user_id GROUP BY user_id) +GROUP BY + 1 +ORDER BY + 1 DESC +LIMIT 3; + user_id +--------- + 69 + 52 + 12 +(3 rows) + +-- IN operator on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) +ORDER BY 1; + user_id +--------- + 8 + 17 + 33 + 47 + 54 + 54 + 56 + 71 + 79 + 86 +(10 rows) + +-- NOT IN on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type NOT IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) + GROUP BY 1 +HAVING count(*) > 122 +ORDER BY 1; + user_id +--------- + 23 + 25 +(2 rows) + +-- non-correlated query with =ANY on partition keys + SELECT + user_id, count(*) +FROM + users_table +WHERE + user_id =ANY(SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) GROUP BY 1 ORDER BY 2 DESC LIMIT 5; + user_id | count +---------+------- + 12 | 121 + 87 | 117 + 37 | 115 + 23 | 115 + 46 | 115 +(5 rows) + +-- users that appeared more than 118 times +SELECT + user_id +FROM + users_table +WHERE 118 <= + (SELECT + count(*) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + GROUP BY + user_id) +GROUP BY + user_id +ORDER BY + user_id; + user_id +--------- + 13 + 17 + 23 + 25 +(4 rows) + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + user_id | value_2 +---------+--------- + 5 | 884 + 42 | 55 + 42 | 471 + 51 | 758 + 72 | 897 + 82 | 691 + 95 | 951 +(7 rows) + +-- similar to the above query +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; + user_id +--------- + 18 + 29 + 40 + 49 + 58 + 69 +(6 rows) + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- FROM clause involving a complex query in WHERE clause +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + user_id | array_length +---------+-------------- + 96 | 12204 + 8 | 8170 +(2 rows) + +-- +-- below tests only aims for cases where all relations +-- are not joined on partition key +-- +-- e4 is not joined on the partition key +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + value_2 = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +); +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- left leaf query does not return partition key +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + 2 * users_table.user_id as user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- NOT EXISTS query has non-equi join +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id != users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- subquery in where clause doesn't have a relation +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT 1); +ERROR: cannot push down this subquery +DETAIL: Subqueries without relations are unsupported +-- OFFSET is not supported in the subquey +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + OFFSET 3 + ); +ERROR: cannot push down this subquery +DETAIL: Offset clause is currently unsupported +-- semi join is not on the partition key for the third subquery +SELECT user_id +FROM users_table +WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) + AND value_2 IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +CREATE FUNCTION test_join_function(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; +-- we disallow JOINs via functions +SELECT user_id, value_2 FROM users_table WHERE + value_1 = 101 + AND value_2 >= 5 + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND test_join_function(events_table.user_id, users_table.user_id)) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +DROP FUNCTION test_join_function(int,int); diff --git a/src/test/regress/expected/multi_verify_no_subquery.out b/src/test/regress/expected/multi_verify_no_subquery.out deleted file mode 100644 index ae1459c90..000000000 --- a/src/test/regress/expected/multi_verify_no_subquery.out +++ /dev/null @@ -1,18 +0,0 @@ --- --- MULTI_VERIFY_NO_SUBQUERY --- --- This test checks that we simply emit an error message instead of trying to --- process a distributed unsupported SQL subquery. -ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1030000; -SELECT * FROM lineitem WHERE l_orderkey IN - (SELECT l_orderkey FROM lineitem WHERE l_quantity > 0); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -SELECT l_quantity FROM lineitem WHERE EXISTS - (SELECT 1 FROM orders WHERE o_orderkey = l_orderkey); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -SELECT l_quantity FROM lineitem WHERE l_orderkey IN (SELECT o_orderkey FROM orders); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -SELECT l_orderkey FROM lineitem WHERE l_quantity > ALL(SELECT o_orderkey FROM orders); -ERROR: could not run distributed query with subquery outside the FROM clause -SELECT l_quantity FROM lineitem WHERE l_orderkey = (SELECT min(o_orderkey) FROM orders); -ERROR: could not run distributed query with subquery outside the FROM clause diff --git a/src/test/regress/multi_binary_schedule b/src/test/regress/multi_binary_schedule index d09cc9a1e..698eb69a2 100644 --- a/src/test/regress/multi_binary_schedule +++ b/src/test/regress/multi_binary_schedule @@ -21,7 +21,7 @@ test: multi_table_ddl test: multi_create_table test: multi_load_data -test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery +test: multi_basic_queries multi_complex_expressions test: multi_single_relation_subquery test: multi_binary_master_copy_format diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 1ca71eec6..d4025fbcc 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -38,10 +38,10 @@ test: multi_insert_select # Miscellaneous tests to check our query planning behavior # ---------- test: multi_deparse_shard_query -test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery +test: multi_basic_queries multi_complex_expressions test: multi_explain test: multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics -test: multi_subquery_union +test: multi_subquery_union multi_subquery_in_where_clause test: multi_reference_table test: multi_outer_join_reference test: multi_single_relation_subquery diff --git a/src/test/regress/multi_task_tracker_extra_schedule b/src/test/regress/multi_task_tracker_extra_schedule index 8051e63d1..b56ba506f 100644 --- a/src/test/regress/multi_task_tracker_extra_schedule +++ b/src/test/regress/multi_task_tracker_extra_schedule @@ -28,7 +28,7 @@ test: multi_load_data # ---------- # Miscellaneous tests to check our query planning behavior # ---------- -test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery +test: multi_basic_queries multi_complex_expressions test: multi_agg_distinct multi_limit_clause multi_limit_clause_approximate test: multi_average_expression multi_working_columns test: multi_array_agg diff --git a/src/test/regress/sql/multi_mx_router_planner.sql b/src/test/regress/sql/multi_mx_router_planner.sql index 2d9d06969..0eb66b353 100644 --- a/src/test/regress/sql/multi_mx_router_planner.sql +++ b/src/test/regress/sql/multi_mx_router_planner.sql @@ -219,14 +219,6 @@ SELECT * FROM articles_hash_mx, position('om' in 'Thomas') WHERE author_id = 1 o -- they are not supported if multiple workers are involved SELECT * FROM articles_hash_mx, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT id FROM authors_hash_mx WHERE name LIKE '%a'); - -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT author_id FROM articles_hash_mx WHERE author_id = 1 or author_id = 3); - -SELECT * FROM articles_hash_mx WHERE author_id = (SELECT 1); - - -- subqueries are supported in FROM clause but they are not router plannable SELECT articles_hash_mx.id,test.word_count FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id diff --git a/src/test/regress/sql/multi_router_planner.sql b/src/test/regress/sql/multi_router_planner.sql index c94507b17..b9a72f69c 100644 --- a/src/test/regress/sql/multi_router_planner.sql +++ b/src/test/regress/sql/multi_router_planner.sql @@ -281,13 +281,6 @@ SELECT * FROM articles_hash, position('om' in 'Thomas') WHERE author_id = 1 or a -- they are not supported if multiple workers are involved SELECT * FROM articles_hash, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash WHERE author_id IN (SELECT id FROM authors_hash WHERE name LIKE '%a'); - -SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 1 or author_id = 3); - -SELECT * FROM articles_hash WHERE author_id = (SELECT 1); - -- unless the query can be transformed into a join SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 2) diff --git a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql index 734bfd784..1e4fea414 100644 --- a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql +++ b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql @@ -458,57 +458,66 @@ ORDER BY user_lastseen DESC, user_id; ------------------------------------ -- Count the number of distinct users_table who are in segment X and Y and Z --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT DISTINCT user_id +SELECT user_id FROM users_table WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) - AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60) +GROUP BY + user_id +ORDER BY + user_id DESC + LIMIT 5; ------------------------------------ -- Find customers who have done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 > 101 AND value_1 < 110 AND value_2 >= 5 - AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id); + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 5; ------------------------------------ -- Customers who haven’t done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 = 101 AND value_2 >= 5 - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id); + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; ------------------------------------ -- Customers who have done X and Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, value_2 FROM users_table WHERE +SELECT user_id, sum(value_2) as cnt FROM users_table WHERE value_1 > 100 AND value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type != 100 AND value_3 > 100 AND user_id = users_table.user_id) - AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id); + AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id) +GROUP BY + user_id +ORDER BY cnt DESC, user_id DESC +LIMIT 5; ------------------------------------ -- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id); + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 4; ------------------------------------ -- Customers who have done X more than 2 times, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, - value_2 + avg(value_2) FROM users_table WHERE value_1 > 100 AND value_1 < 124 @@ -520,7 +529,12 @@ SELECT user_id, AND value_3 > 100 AND user_id = users_table.user_id GROUP BY user_id - HAVING Count(*) > 2); + HAVING Count(*) > 2) +GROUP BY + user_id +ORDER BY + 1 DESC, 2 DESC +LIMIT 5; ------------------------------------ -- Find me all users_table who logged in more than once @@ -555,7 +569,6 @@ ORDER BY ------------------------------------ -- Find me all users_table who has done some event and has filters --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id FROM events_table @@ -568,15 +581,20 @@ WHERE users_table WHERE value_1 = 15 AND value_2 > 25 - ); + ) +ORDER BY 1; ------------------------------------ -- Which events_table did people who has done some specific events_table --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, event_type FROM events_table -WHERE user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) -GROUP BY user_id, event_type; +SELECT + user_id, event_type FROM events_table +WHERE + user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) +GROUP BY + user_id, event_type +ORDER BY 2 DESC, 1 +LIMIT 3; ------------------------------------ -- Find me all the users_table who has done some event more than three times @@ -1600,5 +1618,47 @@ FROM ( SET client_min_messages TO DEFAULT; DROP FUNCTION volatile_func_test(); -SET citus.subquery_pushdown to OFF; +CREATE FUNCTION test_join_function_2(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +-- we don't support joins via functions +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT u.user_id, e.event_type::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE test_join_function_2(u.user_id, e.user_id) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +-- note that the following query has joins on the partition keys +-- however we fail to push down it due to the function call on the +-- where clause. We probably need to relax that check +SELECT + users_table.user_id, users_table.value_1, prob +FROM + users_table + JOIN + (SELECT + ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 50 + ) temp + ON users_table.user_id = temp.user_id + WHERE + users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id); + +DROP FUNCTION test_join_function_2(integer, integer); + SET citus.enable_router_execution TO TRUE; +SET citus.subquery_pushdown to OFF; + diff --git a/src/test/regress/sql/multi_subquery_in_where_clause.sql b/src/test/regress/sql/multi_subquery_in_where_clause.sql new file mode 100644 index 000000000..ae880400d --- /dev/null +++ b/src/test/regress/sql/multi_subquery_in_where_clause.sql @@ -0,0 +1,486 @@ +-- +-- multi subquery in where queries aims to expand existing subquery pushdown +-- regression tests to cover more cases specifically subqueries in WHERE clause +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1430000; +-- ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1430000; + +-- subqueries in WHERE with greater operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + +-- subqueries in where with ALL operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL (SELECT avg(value_3) FROM events_table WHERE users_table.user_id = events_table.user_id GROUP BY user_id) +GROUP BY + 1 +ORDER BY + 1 DESC +LIMIT 3; + +-- IN operator on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) +ORDER BY 1; + +-- NOT IN on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type NOT IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) + GROUP BY 1 + +HAVING count(*) > 122 +ORDER BY 1; + +-- non-correlated query with =ANY on partition keys + SELECT + user_id, count(*) +FROM + users_table +WHERE + user_id =ANY(SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) GROUP BY 1 ORDER BY 2 DESC LIMIT 5; + +-- users that appeared more than 118 times +SELECT + user_id +FROM + users_table +WHERE 118 <= + (SELECT + count(*) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + GROUP BY + user_id) +GROUP BY + user_id +ORDER BY + user_id; + + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + + +-- similar to the above query +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; + + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- FROM clause involving a complex query in WHERE clause +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +-- +-- below tests only aims for cases where all relations +-- are not joined on partition key +-- + +-- e4 is not joined on the partition key +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + value_2 = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +); + + +-- left leaf query does not return partition key +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + 2 * users_table.user_id as user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; + +-- NOT EXISTS query has non-equi join +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id != users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +-- subquery in where clause doesn't have a relation +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT 1); + +-- OFFSET is not supported in the subquey +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + OFFSET 3 + ); + +-- semi join is not on the partition key for the third subquery +SELECT user_id +FROM users_table +WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) + AND value_2 IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); + +CREATE FUNCTION test_join_function(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +-- we disallow JOINs via functions +SELECT user_id, value_2 FROM users_table WHERE + value_1 = 101 + AND value_2 >= 5 + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND test_join_function(events_table.user_id, users_table.user_id)) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; + +DROP FUNCTION test_join_function(int,int); diff --git a/src/test/regress/sql/multi_verify_no_subquery.sql b/src/test/regress/sql/multi_verify_no_subquery.sql deleted file mode 100644 index d475964bf..000000000 --- a/src/test/regress/sql/multi_verify_no_subquery.sql +++ /dev/null @@ -1,22 +0,0 @@ --- --- MULTI_VERIFY_NO_SUBQUERY --- - --- This test checks that we simply emit an error message instead of trying to --- process a distributed unsupported SQL subquery. - - -ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1030000; - - -SELECT * FROM lineitem WHERE l_orderkey IN - (SELECT l_orderkey FROM lineitem WHERE l_quantity > 0); - -SELECT l_quantity FROM lineitem WHERE EXISTS - (SELECT 1 FROM orders WHERE o_orderkey = l_orderkey); - -SELECT l_quantity FROM lineitem WHERE l_orderkey IN (SELECT o_orderkey FROM orders); - -SELECT l_orderkey FROM lineitem WHERE l_quantity > ALL(SELECT o_orderkey FROM orders); - -SELECT l_quantity FROM lineitem WHERE l_orderkey = (SELECT min(o_orderkey) FROM orders);