diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index f2b08b096..1fb012375 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -93,6 +93,7 @@ static bool HasOuterJoinWalker(Node *node, void *maxJoinLevel); static bool HasComplexJoinOrder(Query *queryTree); static bool HasComplexRangeTableType(Query *queryTree); static void ValidateClauseList(List *clauseList); +static void ValidateSubqueryPushdownClauseList(List *clauseList); static bool ExtractFromExpressionWalker(Node *node, QualifierWalkerContext *walkerContext); static List * MultiTableNodeList(List *tableEntryList, List *rangeTableList); @@ -102,6 +103,8 @@ static MultiNode * MultiJoinTree(List *joinOrderList, List *collectTableList, static MultiCollect * CollectNodeForTable(List *collectTableList, uint32 rangeTableId); static MultiSelect * MultiSelectNode(List *whereClauseList); static bool IsSelectClause(Node *clause); +static void ErrorIfSublink(Node *clause); +static bool IsSublinkClause(Node *clause); static MultiProject * MultiProjectNode(List *targetEntryList); static MultiExtendedOp * MultiExtendedOpNode(Query *queryTree); @@ -134,6 +137,8 @@ static MultiNode * MultiSubqueryPlanTree(Query *originalQuery, Query *queryTree, PlannerRestrictionContext * plannerRestrictionContext); +static List * SublinkList(Query *originalQuery); +static bool ExtractSublinkWalker(Node *node, List **sublinkList); static MultiNode * SubqueryPushdownMultiPlanTree(Query *queryTree); static List * CreateSubqueryTargetEntryList(List *columnList); @@ -159,15 +164,18 @@ MultiLogicalPlanCreate(Query *originalQuery, Query *queryTree, { MultiNode *multiQueryNode = NULL; MultiTreeRoot *rootNode = NULL; - List *subqueryEntryList = NULL; /* - * We check the existence of subqueries in the modified query given that - * if postgres already flattened the subqueries, MultiPlanTree() can plan - * corresponding distributed plan. + * We check the existence of subqueries in FROM clause on the modified query + * given that if postgres already flattened the subqueries, MultiPlanTree() + * can plan corresponding distributed plan. + * + * We also check the existence of subqueries in WHERE clause. Note that + * this check needs to be done on the original query given that + * standard_planner() may replace the sublinks with anti/semi joins and + * MultiPlanTree() cannot plan such queries. */ - subqueryEntryList = SubqueryEntryList(queryTree); - if (subqueryEntryList != NIL) + if (SubqueryEntryList(queryTree) != NIL || SublinkList(originalQuery) != NIL) { multiQueryNode = MultiSubqueryPlanTree(originalQuery, queryTree, plannerRestrictionContext); @@ -185,6 +193,57 @@ MultiLogicalPlanCreate(Query *originalQuery, Query *queryTree, } +/* + * SublinkList finds the subquery nodes in the where clause of the given query. Note + * that the function should be called on the orignal query given that postgres + * standard_planner() may convert the subqueries in WHERE clause to joins. + */ +static List * +SublinkList(Query *originalQuery) +{ + FromExpr *joinTree = originalQuery->jointree; + Node *queryQuals = NULL; + List *sublinkList = NIL; + + if (!joinTree) + { + return NIL; + } + + queryQuals = joinTree->quals; + ExtractSublinkWalker(queryQuals, &sublinkList); + + return sublinkList; +} + + +/* + * ExtractSublinkWalker walks over a quals node, and finds all sublinks + * in that node. + */ +static bool +ExtractSublinkWalker(Node *node, List **sublinkList) +{ + bool walkerResult = false; + if (node == NULL) + { + return false; + } + + if (IsA(node, SubLink)) + { + (*sublinkList) = lappend(*sublinkList, node); + } + else + { + walkerResult = expression_tree_walker(node, ExtractSublinkWalker, + sublinkList); + } + + return walkerResult; +} + + /* * MultiSubqueryPlanTree gets the query objects and returns logical plan * for subqueries. @@ -336,8 +395,10 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, plannerRestrictionContext) { ListCell *rangeTableEntryCell = NULL; + ListCell *sublinkCell = NULL; List *subqueryEntryList = NIL; bool outerMostQueryHasLimit = false; + List *sublinkList = NIL; DeferredErrorMessage *error = NULL; RelationRestrictionContext *relationRestrictionContext = plannerRestrictionContext->relationRestrictionContext; @@ -395,6 +456,23 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, } } + sublinkList = SublinkList(originalQuery); + foreach(sublinkCell, sublinkList) + { + SubLink *sublink = (SubLink *) lfirst(sublinkCell); + Node *subselect = sublink->subselect; + + if (subselect && IsA(subselect, Query)) + { + error = DeferErrorIfCannotPushdownSubquery((Query *) subselect, + outerMostQueryHasLimit); + if (error) + { + return error; + } + } + } + return NULL; } @@ -549,12 +627,6 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi return deferredError; } - if (subqueryTree->hasSubLinks) - { - preconditionsSatisfied = false; - errorDetail = "Subqueries other than from-clause subqueries are unsupported"; - } - if (subqueryTree->rtable == NIL) { preconditionsSatisfied = false; @@ -1248,11 +1320,11 @@ ErrorIfQueryNotSupported(Query *queryTree) const char *filterHint = "Consider using an equality filter on the distributed " "table's partition column."; - if (queryTree->hasSubLinks) + if (queryTree->hasSubLinks && SublinkList(queryTree) == NIL) { preconditionsSatisfied = false; errorMessage = "could not run distributed query with subquery outside the " - "FROM clause"; + "FROM and WHERE clauses"; errorHint = filterHint; } @@ -1506,6 +1578,12 @@ DeferErrorIfUnsupportedSubqueryRepartition(Query *subqueryTree) errorDetail = "Subqueries with offset are not supported yet"; } + if (subqueryTree->hasSubLinks) + { + preconditionsSatisfied = false; + errorDetail = "Subqueries other than from-clause subqueries are unsupported"; + } + /* finally check and return error if conditions are not satisfied */ if (!preconditionsSatisfied) { @@ -1752,11 +1830,37 @@ ValidateClauseList(List *clauseList) { Node *clause = (Node *) lfirst(clauseCell); - bool selectClause = IsSelectClause(clause); - bool joinClause = IsJoinClause(clause); - bool orClause = or_clause(clause); + /* produce a better error message for sublinks */ + ErrorIfSublink(clause); - if (!(selectClause || joinClause || orClause)) + if (!(IsSelectClause(clause) || IsJoinClause(clause) || or_clause(clause))) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported clause type"))); + } + } +} + + +/* + * ValidateSubqueryPushdownClauseList walks over the given list of clauses, + * and checks that we can recognize all the clauses. This function ensures + * that we do not drop an unsupported clause type on the floor, and thus + * prevents erroneous results. + * + * Note that this function is slightly different than ValidateClauseList(), + * additionally allowing subkinks. + */ +static void +ValidateSubqueryPushdownClauseList(List *clauseList) +{ + ListCell *clauseCell = NULL; + foreach(clauseCell, clauseList) + { + Node *clause = (Node *) lfirst(clauseCell); + + if (!(IsSublinkClause(clause) || IsSelectClause(clause) || + IsJoinClause(clause) || or_clause(clause))) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unsupported clause type"))); @@ -1810,6 +1914,10 @@ JoinClauseList(List *whereClauseList) * subqueries but differently from the outermost query, they are run on a copy * of parse tree and changes do not get persisted as modifications to the original * query tree. + * + * Also this function adds SubLinks to the baseQualifierList when they appear on + * the query's WHERE clause. The callers of the function should consider processing + * Sublinks as well. */ static bool ExtractFromExpressionWalker(Node *node, QualifierWalkerContext *walkerContext) @@ -2203,7 +2311,8 @@ MultiSelectNode(List *whereClauseList) /* * IsSelectClause determines if the given node is a select clause according to * our criteria. Our criteria defines a select clause as an expression that has - * zero or more columns belonging to only one table. + * zero or more columns belonging to only one table. The function assumes that + * no sublinks exists in the clause. */ static bool IsSelectClause(Node *clause) @@ -2213,16 +2322,6 @@ IsSelectClause(Node *clause) Var *firstColumn = NULL; Index firstColumnTableId = 0; bool isSelectClause = true; - NodeTag nodeTag = nodeTag(clause); - - /* error out for subqueries in WHERE clause */ - if (nodeTag == T_SubLink || nodeTag == T_SubPlan) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot perform distributed planning on this query"), - errdetail("Subqueries other than in from-clause are currently " - "unsupported"))); - } /* extract columns from the clause */ columnList = pull_var_clause_default(clause); @@ -2249,6 +2348,42 @@ IsSelectClause(Node *clause) } +/* + * ErrorIfSublink errors out if the input claise is either sublink or subplan. + */ +static void +ErrorIfSublink(Node *clause) +{ + NodeTag nodeTag = nodeTag(clause); + + /* error out for subqueries in WHERE clause */ + if (nodeTag == T_SubLink || nodeTag == T_SubPlan) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot perform distributed planning on this query"), + errdetail("Subqueries other than in from-clause are currently " + "unsupported"))); + } +} + + +/* + * IsSublinkClause determines if the given node is a sublink or subplan. + */ +static bool +IsSublinkClause(Node *clause) +{ + NodeTag nodeTag = nodeTag(clause); + + if (nodeTag == T_SubLink || nodeTag == T_SubPlan) + { + return true; + } + + return false; +} + + /* * MultiProjectNode builds the project node using the target entry information * from the query tree. The project node only encapsulates projected columns, @@ -2904,9 +3039,13 @@ SubqueryPushdownMultiPlanTree(Query *queryTree) /* verify we can perform distributed planning on this query */ ErrorIfQueryNotSupported(queryTree); - /* extract qualifiers and verify we can plan for them */ + /* + * Extract qualifiers and verify we can plan for them. Note that since + * subquery pushdown join planning is based on restriction equivalence, + * checking for these qualifiers may not be necessary. + */ qualifierList = QualifierList(queryTree->jointree); - ValidateClauseList(qualifierList); + ValidateSubqueryPushdownClauseList(qualifierList); /* * We would be creating a new Query and pushing down top level query's diff --git a/src/test/regress/expected/multi_mx_router_planner.out b/src/test/regress/expected/multi_mx_router_planner.out index 37f3a8718..da1fa9646 100644 --- a/src/test/regress/expected/multi_mx_router_planner.out +++ b/src/test/regress/expected/multi_mx_router_planner.out @@ -422,15 +422,6 @@ DEBUG: Plan is router executable SELECT * FROM articles_hash_mx, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; ERROR: could not run distributed query with complex table expressions HINT: Consider using an equality filter on the distributed table's partition column. --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT id FROM authors_hash_mx WHERE name LIKE '%a'); -ERROR: cannot plan queries that include both regular and partitioned relations -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT author_id FROM articles_hash_mx WHERE author_id = 1 or author_id = 3); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. -SELECT * FROM articles_hash_mx WHERE author_id = (SELECT 1); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. -- subqueries are supported in FROM clause but they are not router plannable SELECT articles_hash_mx.id,test.word_count FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id @@ -502,7 +493,7 @@ HINT: Set citus.task_executor_type to "task-tracker". -- subqueries are not supported in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash_mx a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash_mx a; -ERROR: could not run distributed query with subquery outside the FROM clause +ERROR: could not run distributed query with subquery outside the FROM and WHERE clauses HINT: Consider using an equality filter on the distributed table's partition column. -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_router_planner.out b/src/test/regress/expected/multi_router_planner.out index c2996c6cb..369ee2f32 100644 --- a/src/test/regress/expected/multi_router_planner.out +++ b/src/test/regress/expected/multi_router_planner.out @@ -520,15 +520,6 @@ DEBUG: Plan is router executable SELECT * FROM articles_hash, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; ERROR: could not run distributed query with complex table expressions HINT: Consider using an equality filter on the distributed table's partition column. --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash WHERE author_id IN (SELECT id FROM authors_hash WHERE name LIKE '%a'); -ERROR: cannot plan queries that include both regular and partitioned relations -SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 1 or author_id = 3); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. -SELECT * FROM articles_hash WHERE author_id = (SELECT 1); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. -- unless the query can be transformed into a join SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 2) @@ -615,7 +606,7 @@ HINT: Set citus.task_executor_type to "task-tracker". -- subqueries are not supported in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard_hash a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles_hash a; -ERROR: could not run distributed query with subquery outside the FROM clause +ERROR: could not run distributed query with subquery outside the FROM and WHERE clauses HINT: Consider using an equality filter on the distributed table's partition column. -- simple lookup query SELECT * diff --git a/src/test/regress/expected/multi_simple_queries.out b/src/test/regress/expected/multi_simple_queries.out index aead9dd76..b76029bd0 100644 --- a/src/test/regress/expected/multi_simple_queries.out +++ b/src/test/regress/expected/multi_simple_queries.out @@ -247,7 +247,7 @@ ORDER BY articles.id; -- subqueries are not supported in SELECT clause SELECT a.title AS name, (SELECT a2.id FROM articles_single_shard a2 WHERE a.id = a2.id LIMIT 1) AS special_price FROM articles a; -ERROR: could not run distributed query with subquery outside the FROM clause +ERROR: could not run distributed query with subquery outside the FROM and WHERE clauses HINT: Consider using an equality filter on the distributed table's partition column. -- joins are not supported between local and distributed tables SELECT title, authors.name FROM authors, articles WHERE authors.id = articles.author_id; diff --git a/src/test/regress/expected/multi_subquery_behavioral_analytics.out b/src/test/regress/expected/multi_subquery_behavioral_analytics.out index 4340d115b..dbe572adc 100644 --- a/src/test/regress/expected/multi_subquery_behavioral_analytics.out +++ b/src/test/regress/expected/multi_subquery_behavioral_analytics.out @@ -546,62 +546,103 @@ ORDER BY user_lastseen DESC, user_id; ------------------------------------ -- Count the number of distinct users_table who are in segment X and Y and Z --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT DISTINCT user_id +SELECT user_id FROM users_table WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) - AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60) +GROUP BY + user_id +ORDER BY + user_id DESC + LIMIT 5; + user_id +--------- + 93 + 90 + 88 + 87 + 84 +(5 rows) + ------------------------------------ -- Find customers who have done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 > 101 AND value_1 < 110 AND value_2 >= 5 - AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + user_id | value_2 +---------+--------- + 95 | 951 + 4 | 934 + 2 | 908 + 90 | 900 + 49 | 847 +(5 rows) + ------------------------------------ -- Customers who haven’t done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 = 101 AND value_2 >= 5 - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; + user_id | value_2 +---------+--------- + 58 | 585 + 51 | 1000 + 48 | 861 +(3 rows) + ------------------------------------ -- Customers who have done X and Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, value_2 FROM users_table WHERE +SELECT user_id, sum(value_2) as cnt FROM users_table WHERE value_1 > 100 AND value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type != 100 AND value_3 > 100 AND user_id = users_table.user_id) - AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id) +GROUP BY + user_id +ORDER BY cnt DESC, user_id DESC +LIMIT 5; + user_id | cnt +---------+------- + 49 | 48606 + 69 | 46524 + 86 | 46163 + 80 | 45995 + 35 | 45437 +(5 rows) + ------------------------------------ -- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 4; + user_id | value_2 +---------+--------- + 8 | 996 + 96 | 995 + 8 | 995 + 96 | 989 +(4 rows) + ------------------------------------ -- Customers who have done X more than 2 times, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, - value_2 + avg(value_2) FROM users_table WHERE value_1 > 100 AND value_1 < 124 @@ -613,9 +654,21 @@ SELECT user_id, AND value_3 > 100 AND user_id = users_table.user_id GROUP BY user_id - HAVING Count(*) > 2); -ERROR: could not run distributed query with subquery outside the FROM clause -HINT: Consider using an equality filter on the distributed table's partition column. + HAVING Count(*) > 2) +GROUP BY + user_id +ORDER BY + 1 DESC, 2 DESC +LIMIT 5; + user_id | avg +---------+---------------------- + 99 | 571.6666666666666667 + 98 | 758.0000000000000000 + 96 | 459.6666666666666667 + 90 | 453.3333333333333333 + 89 | 215.0000000000000000 +(5 rows) + ------------------------------------ -- Find me all users_table who logged in more than once ------------------------------------ @@ -671,7 +724,6 @@ ORDER BY ------------------------------------ -- Find me all users_table who has done some event and has filters --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id FROM events_table @@ -684,18 +736,32 @@ WHERE users_table WHERE value_1 = 15 AND value_2 > 25 - ); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. + ) +ORDER BY 1; + user_id +--------- + 7 + 53 +(2 rows) + ------------------------------------ -- Which events_table did people who has done some specific events_table --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, event_type FROM events_table -WHERE user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) -GROUP BY user_id, event_type; -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -HINT: Consider joining tables on partition column and have equal filter on joining columns. +SELECT + user_id, event_type FROM events_table +WHERE + user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) +GROUP BY + user_id, event_type +ORDER BY 2 DESC, 1 +LIMIT 3; + user_id | event_type +---------+------------ + 18 | 999 + 23 | 999 + 26 | 999 +(3 rows) + ------------------------------------ -- Find me all the users_table who has done some event more than three times ------------------------------------ @@ -1950,5 +2016,45 @@ FROM ( ERROR: no value found for parameter 1 SET client_min_messages TO DEFAULT; DROP FUNCTION volatile_func_test(); -SET citus.subquery_pushdown to OFF; +CREATE FUNCTION test_join_function_2(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; +-- we don't support joins via functions +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT u.user_id, e.event_type::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE test_join_function_2(u.user_id, e.user_id) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- note that the following query has joins on the partition keys +-- however we fail to push down it due to the function call on the +-- where clause. We probably need to relax that check +SELECT + users_table.user_id, users_table.value_1, prob +FROM + users_table + JOIN + (SELECT + ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 50 + ) temp + ON users_table.user_id = temp.user_id + WHERE + users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id); +ERROR: unsupported clause type +DROP FUNCTION test_join_function_2(integer, integer); SET citus.enable_router_execution TO TRUE; +SET citus.subquery_pushdown to OFF; diff --git a/src/test/regress/expected/multi_subquery_in_where_clause.out b/src/test/regress/expected/multi_subquery_in_where_clause.out new file mode 100644 index 000000000..52e4a69ec --- /dev/null +++ b/src/test/regress/expected/multi_subquery_in_where_clause.out @@ -0,0 +1,555 @@ +-- +-- multi subquery in where queries aims to expand existing subquery pushdown +-- regression tests to cover more cases specifically subqueries in WHERE clause +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1430000; +-- ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1430000; +-- subqueries in WHERE with greater operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + user_id +--------- + 49 + 55 + 56 + 63 +(4 rows) + +-- subqueries in where with ALL operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL (SELECT avg(value_3) FROM events_table WHERE users_table.user_id = events_table.user_id GROUP BY user_id) +GROUP BY + 1 +ORDER BY + 1 DESC +LIMIT 3; + user_id +--------- + 69 + 52 + 12 +(3 rows) + +-- IN operator on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) +ORDER BY 1; + user_id +--------- + 8 + 17 + 33 + 47 + 54 + 54 + 56 + 71 + 79 + 86 +(10 rows) + +-- NOT IN on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type NOT IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) + GROUP BY 1 +HAVING count(*) > 122 +ORDER BY 1; + user_id +--------- + 23 + 25 +(2 rows) + +-- non-correlated query with =ANY on partition keys + SELECT + user_id, count(*) +FROM + users_table +WHERE + user_id =ANY(SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) GROUP BY 1 ORDER BY 2 DESC LIMIT 5; + user_id | count +---------+------- + 12 | 121 + 87 | 117 + 37 | 115 + 23 | 115 + 46 | 115 +(5 rows) + +-- users that appeared more than 118 times +SELECT + user_id +FROM + users_table +WHERE 118 <= + (SELECT + count(*) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + GROUP BY + user_id) +GROUP BY + user_id +ORDER BY + user_id; + user_id +--------- + 13 + 17 + 23 + 25 +(4 rows) + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + user_id | value_2 +---------+--------- + 5 | 884 + 42 | 55 + 42 | 471 + 51 | 758 + 72 | 897 + 82 | 691 + 95 | 951 +(7 rows) + +-- similar to the above query +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; + user_id +--------- + 18 + 29 + 40 + 49 + 58 + 69 +(6 rows) + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- FROM clause involving a complex query in WHERE clause +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + user_id | array_length +---------+-------------- + 96 | 12204 + 8 | 8170 +(2 rows) + +-- +-- below tests only aims for cases where all relations +-- are not joined on partition key +-- +-- e4 is not joined on the partition key +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + value_2 = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +); +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- left leaf query does not return partition key +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + 2 * users_table.user_id as user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- NOT EXISTS query has non-equi join +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id != users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- subquery in where clause doesn't have a relation +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT 1); +ERROR: cannot push down this subquery +DETAIL: Subqueries without relations are unsupported +-- OFFSET is not supported in the subquey +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + OFFSET 3 + ); +ERROR: cannot push down this subquery +DETAIL: Offset clause is currently unsupported +-- semi join is not on the partition key for the third subquery +SELECT user_id +FROM users_table +WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) + AND value_2 IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +CREATE FUNCTION test_join_function(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; +-- we disallow JOINs via functions +SELECT user_id, value_2 FROM users_table WHERE + value_1 = 101 + AND value_2 >= 5 + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND test_join_function(events_table.user_id, users_table.user_id)) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +DROP FUNCTION test_join_function(int,int); diff --git a/src/test/regress/expected/multi_verify_no_subquery.out b/src/test/regress/expected/multi_verify_no_subquery.out deleted file mode 100644 index ae1459c90..000000000 --- a/src/test/regress/expected/multi_verify_no_subquery.out +++ /dev/null @@ -1,18 +0,0 @@ --- --- MULTI_VERIFY_NO_SUBQUERY --- --- This test checks that we simply emit an error message instead of trying to --- process a distributed unsupported SQL subquery. -ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1030000; -SELECT * FROM lineitem WHERE l_orderkey IN - (SELECT l_orderkey FROM lineitem WHERE l_quantity > 0); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -SELECT l_quantity FROM lineitem WHERE EXISTS - (SELECT 1 FROM orders WHERE o_orderkey = l_orderkey); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -SELECT l_quantity FROM lineitem WHERE l_orderkey IN (SELECT o_orderkey FROM orders); -ERROR: could not run distributed query with join types other than INNER or OUTER JOINS -SELECT l_orderkey FROM lineitem WHERE l_quantity > ALL(SELECT o_orderkey FROM orders); -ERROR: could not run distributed query with subquery outside the FROM clause -SELECT l_quantity FROM lineitem WHERE l_orderkey = (SELECT min(o_orderkey) FROM orders); -ERROR: could not run distributed query with subquery outside the FROM clause diff --git a/src/test/regress/multi_binary_schedule b/src/test/regress/multi_binary_schedule index d09cc9a1e..698eb69a2 100644 --- a/src/test/regress/multi_binary_schedule +++ b/src/test/regress/multi_binary_schedule @@ -21,7 +21,7 @@ test: multi_table_ddl test: multi_create_table test: multi_load_data -test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery +test: multi_basic_queries multi_complex_expressions test: multi_single_relation_subquery test: multi_binary_master_copy_format diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 1ca71eec6..d4025fbcc 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -38,10 +38,10 @@ test: multi_insert_select # Miscellaneous tests to check our query planning behavior # ---------- test: multi_deparse_shard_query -test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery +test: multi_basic_queries multi_complex_expressions test: multi_explain test: multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics -test: multi_subquery_union +test: multi_subquery_union multi_subquery_in_where_clause test: multi_reference_table test: multi_outer_join_reference test: multi_single_relation_subquery diff --git a/src/test/regress/multi_task_tracker_extra_schedule b/src/test/regress/multi_task_tracker_extra_schedule index 8051e63d1..b56ba506f 100644 --- a/src/test/regress/multi_task_tracker_extra_schedule +++ b/src/test/regress/multi_task_tracker_extra_schedule @@ -28,7 +28,7 @@ test: multi_load_data # ---------- # Miscellaneous tests to check our query planning behavior # ---------- -test: multi_basic_queries multi_complex_expressions multi_verify_no_subquery +test: multi_basic_queries multi_complex_expressions test: multi_agg_distinct multi_limit_clause multi_limit_clause_approximate test: multi_average_expression multi_working_columns test: multi_array_agg diff --git a/src/test/regress/sql/multi_mx_router_planner.sql b/src/test/regress/sql/multi_mx_router_planner.sql index 2d9d06969..0eb66b353 100644 --- a/src/test/regress/sql/multi_mx_router_planner.sql +++ b/src/test/regress/sql/multi_mx_router_planner.sql @@ -219,14 +219,6 @@ SELECT * FROM articles_hash_mx, position('om' in 'Thomas') WHERE author_id = 1 o -- they are not supported if multiple workers are involved SELECT * FROM articles_hash_mx, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT id FROM authors_hash_mx WHERE name LIKE '%a'); - -SELECT * FROM articles_hash_mx WHERE author_id IN (SELECT author_id FROM articles_hash_mx WHERE author_id = 1 or author_id = 3); - -SELECT * FROM articles_hash_mx WHERE author_id = (SELECT 1); - - -- subqueries are supported in FROM clause but they are not router plannable SELECT articles_hash_mx.id,test.word_count FROM articles_hash_mx, (SELECT id, word_count FROM articles_hash_mx) AS test WHERE test.id = articles_hash_mx.id diff --git a/src/test/regress/sql/multi_router_planner.sql b/src/test/regress/sql/multi_router_planner.sql index c94507b17..b9a72f69c 100644 --- a/src/test/regress/sql/multi_router_planner.sql +++ b/src/test/regress/sql/multi_router_planner.sql @@ -281,13 +281,6 @@ SELECT * FROM articles_hash, position('om' in 'Thomas') WHERE author_id = 1 or a -- they are not supported if multiple workers are involved SELECT * FROM articles_hash, position('om' in 'Thomas') WHERE author_id = 1 or author_id = 2; --- subqueries are not supported in WHERE clause in Citus -SELECT * FROM articles_hash WHERE author_id IN (SELECT id FROM authors_hash WHERE name LIKE '%a'); - -SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 1 or author_id = 3); - -SELECT * FROM articles_hash WHERE author_id = (SELECT 1); - -- unless the query can be transformed into a join SELECT * FROM articles_hash WHERE author_id IN (SELECT author_id FROM articles_hash WHERE author_id = 2) diff --git a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql index 734bfd784..1e4fea414 100644 --- a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql +++ b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql @@ -458,57 +458,66 @@ ORDER BY user_lastseen DESC, user_id; ------------------------------------ -- Count the number of distinct users_table who are in segment X and Y and Z --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT DISTINCT user_id +SELECT user_id FROM users_table WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) - AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60) +GROUP BY + user_id +ORDER BY + user_id DESC + LIMIT 5; ------------------------------------ -- Find customers who have done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 > 101 AND value_1 < 110 AND value_2 >= 5 - AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id); + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 101 AND event_type < 110 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 5; ------------------------------------ -- Customers who haven’t done X, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_1 = 101 AND value_2 >= 5 - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id); + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; ------------------------------------ -- Customers who have done X and Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, value_2 FROM users_table WHERE +SELECT user_id, sum(value_2) as cnt FROM users_table WHERE value_1 > 100 AND value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type != 100 AND value_3 > 100 AND user_id = users_table.user_id) - AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id); + AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 101 AND value_3 > 100 AND user_id = users_table.user_id) +GROUP BY + user_id +ORDER BY cnt DESC, user_id DESC +LIMIT 5; ------------------------------------ -- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, value_2 FROM users_table WHERE value_2 >= 5 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) - AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id); + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) +ORDER BY 2 DESC, 1 DESC +LIMIT 4; ------------------------------------ -- Customers who have done X more than 2 times, and satisfy other customer specific criteria --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id, - value_2 + avg(value_2) FROM users_table WHERE value_1 > 100 AND value_1 < 124 @@ -520,7 +529,12 @@ SELECT user_id, AND value_3 > 100 AND user_id = users_table.user_id GROUP BY user_id - HAVING Count(*) > 2); + HAVING Count(*) > 2) +GROUP BY + user_id +ORDER BY + 1 DESC, 2 DESC +LIMIT 5; ------------------------------------ -- Find me all users_table who logged in more than once @@ -555,7 +569,6 @@ ORDER BY ------------------------------------ -- Find me all users_table who has done some event and has filters --- This query will be supported when we have subqueries in where clauses. ------------------------------------ SELECT user_id FROM events_table @@ -568,15 +581,20 @@ WHERE users_table WHERE value_1 = 15 AND value_2 > 25 - ); + ) +ORDER BY 1; ------------------------------------ -- Which events_table did people who has done some specific events_table --- This query will be supported when we have subqueries in where clauses. ------------------------------------ -SELECT user_id, event_type FROM events_table -WHERE user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) -GROUP BY user_id, event_type; +SELECT + user_id, event_type FROM events_table +WHERE + user_id in (SELECT user_id from events_table WHERE event_type > 500 and event_type < 505) +GROUP BY + user_id, event_type +ORDER BY 2 DESC, 1 +LIMIT 3; ------------------------------------ -- Find me all the users_table who has done some event more than three times @@ -1600,5 +1618,47 @@ FROM ( SET client_min_messages TO DEFAULT; DROP FUNCTION volatile_func_test(); -SET citus.subquery_pushdown to OFF; +CREATE FUNCTION test_join_function_2(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +-- we don't support joins via functions +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT u.user_id, e.event_type::text AS event, e.time + FROM users_table AS u, + events_table AS e + WHERE test_join_function_2(u.user_id, e.user_id) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +-- note that the following query has joins on the partition keys +-- however we fail to push down it due to the function call on the +-- where clause. We probably need to relax that check +SELECT + users_table.user_id, users_table.value_1, prob +FROM + users_table + JOIN + (SELECT + ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 50 + ) temp + ON users_table.user_id = temp.user_id + WHERE + users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id); + +DROP FUNCTION test_join_function_2(integer, integer); + SET citus.enable_router_execution TO TRUE; +SET citus.subquery_pushdown to OFF; + diff --git a/src/test/regress/sql/multi_subquery_in_where_clause.sql b/src/test/regress/sql/multi_subquery_in_where_clause.sql new file mode 100644 index 000000000..ae880400d --- /dev/null +++ b/src/test/regress/sql/multi_subquery_in_where_clause.sql @@ -0,0 +1,486 @@ +-- +-- multi subquery in where queries aims to expand existing subquery pushdown +-- regression tests to cover more cases specifically subqueries in WHERE clause +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1430000; +-- ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1430000; + +-- subqueries in WHERE with greater operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + +-- subqueries in where with ALL operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > 545 AND + value_2 < ALL (SELECT avg(value_3) FROM events_table WHERE users_table.user_id = events_table.user_id GROUP BY user_id) +GROUP BY + 1 +ORDER BY + 1 DESC +LIMIT 3; + +-- IN operator on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) +ORDER BY 1; + +-- NOT IN on non-partition key +SELECT + user_id +FROM + events_table as e1 +WHERE + event_type NOT IN + (SELECT + event_type + FROM + events_table as e2 + WHERE + value_2 = 15 AND value_3 > 25 AND + e1.user_id = e2.user_id + ) + GROUP BY 1 + +HAVING count(*) > 122 +ORDER BY 1; + +-- non-correlated query with =ANY on partition keys + SELECT + user_id, count(*) +FROM + users_table +WHERE + user_id =ANY(SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) GROUP BY 1 ORDER BY 2 DESC LIMIT 5; + +-- users that appeared more than 118 times +SELECT + user_id +FROM + users_table +WHERE 118 <= + (SELECT + count(*) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id + GROUP BY + user_id) +GROUP BY + user_id +ORDER BY + user_id; + + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + + +-- similar to the above query +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- WHERE clause +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + users_table.user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; + + +-- the following query doesn't have a meaningul result +-- but it is a valid query with an arbitrary subquery in +-- FROM clause involving a complex query in WHERE clause +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id = users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +-- +-- below tests only aims for cases where all relations +-- are not joined on partition key +-- + +-- e4 is not joined on the partition key +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_table + WHERE + value_2 = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +); + + +-- left leaf query does not return partition key +SELECT + user_id +FROM + users_table +WHERE + user_id IN +( + SELECT + user_id + FROM ( + SELECT + subquery_1.user_id, count_pay + FROM + ( + (SELECT + 2 * users_table.user_id as user_id, + 'action=>1'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 10 AND events_table.event_type < 12 + ) + UNION + (SELECT + users_table.user_id, + 'action=>2'AS event, + events_table.time + FROM + users_table, + events_table + WHERE + users_table.user_id = events_table.user_id AND + users_table.user_id >= 10 AND + users_table.user_id <= 70 AND + events_table.event_type > 12 AND events_table.event_type < 14 + ) + ) AS subquery_1 + LEFT JOIN + (SELECT + user_id, + COUNT(*) AS count_pay + FROM + users_table + WHERE + user_id >= 10 AND + user_id <= 70 AND + users_table.value_1 > 15 AND users_table.value_1 < 17 + GROUP BY + user_id + HAVING + COUNT(*) > 1) AS subquery_2 + ON + subquery_1.user_id = subquery_2.user_id + GROUP BY + subquery_1.user_id, + count_pay) AS subquery_top + GROUP BY + count_pay, user_id +) +GROUP BY user_id +HAVING count(*) > 3 AND sum(value_2) > 49000 +ORDER BY 1; + +-- NOT EXISTS query has non-equi join +SELECT user_id, array_length(events_table, 1) +FROM ( + SELECT user_id, array_agg(event ORDER BY time) AS events_table + FROM ( + SELECT + u.user_id, e.event_type::text AS event, e.time + FROM + users_table AS u, + events_table AS e + WHERE u.user_id = e.user_id AND + u.user_id IN + ( + SELECT + user_id + FROM + users_table + WHERE value_2 >= 5 + AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 100 AND event_type <= 300 AND value_3 > 100 AND user_id = users_table.user_id) + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 300 AND event_type <= 350 AND value_3 > 100 AND user_id != users_table.user_id) + ) + ) t + GROUP BY user_id +) q +ORDER BY 2 DESC, 1; + +-- subquery in where clause doesn't have a relation +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT 1); + +-- OFFSET is not supported in the subquey +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_table + WHERE + users_table.user_id = events_table.user_id AND event_type = 50 + GROUP BY + user_id + OFFSET 3 + ); + +-- semi join is not on the partition key for the third subquery +SELECT user_id +FROM users_table +WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 10 AND value_1 <= 20) + AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 30 AND value_1 <= 40) + AND value_2 IN (SELECT user_id FROM users_table WHERE value_1 >= 50 AND value_1 <= 60); + +CREATE FUNCTION test_join_function(integer, integer) RETURNS bool + AS 'select $1 > $2;' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +-- we disallow JOINs via functions +SELECT user_id, value_2 FROM users_table WHERE + value_1 = 101 + AND value_2 >= 5 + AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=101 AND value_3 > 100 AND test_join_function(events_table.user_id, users_table.user_id)) +ORDER BY 1 DESC, 2 DESC +LIMIT 3; + +DROP FUNCTION test_join_function(int,int); diff --git a/src/test/regress/sql/multi_verify_no_subquery.sql b/src/test/regress/sql/multi_verify_no_subquery.sql deleted file mode 100644 index d475964bf..000000000 --- a/src/test/regress/sql/multi_verify_no_subquery.sql +++ /dev/null @@ -1,22 +0,0 @@ --- --- MULTI_VERIFY_NO_SUBQUERY --- - --- This test checks that we simply emit an error message instead of trying to --- process a distributed unsupported SQL subquery. - - -ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1030000; - - -SELECT * FROM lineitem WHERE l_orderkey IN - (SELECT l_orderkey FROM lineitem WHERE l_quantity > 0); - -SELECT l_quantity FROM lineitem WHERE EXISTS - (SELECT 1 FROM orders WHERE o_orderkey = l_orderkey); - -SELECT l_quantity FROM lineitem WHERE l_orderkey IN (SELECT o_orderkey FROM orders); - -SELECT l_orderkey FROM lineitem WHERE l_quantity > ALL(SELECT o_orderkey FROM orders); - -SELECT l_quantity FROM lineitem WHERE l_orderkey = (SELECT min(o_orderkey) FROM orders);