From 771e7450cce3cd21065457f456a0c1eb283d4ed3 Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Fri, 17 Nov 2017 08:02:22 +0200 Subject: [PATCH] Relax the checks on ensuring distribution columns for target entries With this commit, we allow pushing down subqueries with only reference tables where GROUP BY or DISTINCT clause or Window functions include only columns from reference tables. --- .../planner/multi_logical_planner.c | 110 ++++++++++++++++++ ...ulti_subquery_complex_reference_clause.out | 110 ++++++++++++++++++ ...lti_subquery_in_where_reference_clause.out | 23 ++++ ...ulti_subquery_complex_reference_clause.sql | 75 ++++++++++++ ...lti_subquery_in_where_reference_clause.sql | 22 ++++ 5 files changed, 340 insertions(+) diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index e514731df..51e1c48ed 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -84,6 +84,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedUnionQuery(Query *queryTree static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); static bool WindowPartitionOnDistributionColumn(Query *query); +static bool AllTargetExpressionsAreColumnReferences(List *targetEntryList); +static bool RangeTableListContainsOnlyReferenceTables(List *rangeTableList); static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query); static bool FullCompositeFieldList(List *compositeFieldList); static MultiNode * MultiPlanTree(Query *queryTree); @@ -1234,10 +1236,118 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList) } } + /* + * We could still behave as if the target list is on partition column if + * all range table entries are reference tables and all target expressions + * are column references to the given query level. + */ + if (!targetListOnPartitionColumn) + { + if (RangeTableListContainsOnlyReferenceTables(query->rtable) && + AllTargetExpressionsAreColumnReferences(targetEntryList)) + { + targetListOnPartitionColumn = true; + } + } + return targetListOnPartitionColumn; } +/* + * AllTargetExpressionsAreColumnReferences returns true if non of the + * elements in the target entry list belong to an outer query (for + * example the query is a sublink and references to another query + * in the from list). + * + * The function also returns true if any of the target entries is not + * a column itself. This might be too restrictive, but, given that we're + * handling a very specific type of queries, that seems acceptable for now. + */ +static bool +AllTargetExpressionsAreColumnReferences(List *targetEntryList) +{ + ListCell *targetEntryCell = NULL; + + foreach(targetEntryCell, targetEntryList) + { + TargetEntry *targetEntry = lfirst(targetEntryCell); + Var *candidateColumn = NULL; + Expr *strippedColumnExpression = (Expr *) strip_implicit_coercions( + (Node *) targetEntry->expr); + + if (IsA(strippedColumnExpression, Var)) + { + candidateColumn = (Var *) strippedColumnExpression; + } + else if (IsA(strippedColumnExpression, FieldSelect)) + { + FieldSelect *compositeField = (FieldSelect *) strippedColumnExpression; + Expr *fieldExpression = compositeField->arg; + + if (IsA(fieldExpression, Var)) + { + candidateColumn = (Var *) fieldExpression; + } + } + + /* we don't support target entries that are not columns */ + if (candidateColumn == NULL) + { + return false; + } + + if (candidateColumn->varlevelsup > 0) + { + return false; + } + } + + return true; +} + + +/* + * RangeTableListContainsOnlyReferenceTables returns true if all range table + * entries are reference tables. + * + * The function returns false for range table entries that are not relations. + * + * Note that the function doesn't recurse into subqueries, returns false when + * a subquery is found. + */ +static bool +RangeTableListContainsOnlyReferenceTables(List *rangeTableList) +{ + ListCell *rangeTableCell = NULL; + foreach(rangeTableCell, rangeTableList) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell); + + if (rangeTableEntry->rtekind == RTE_RELATION) + { + Oid relationId = rangeTableEntry->relid; + + if (!IsDistributedTable(relationId)) + { + return false; + } + + if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE) + { + return false; + } + } + else + { + return false; + } + } + + return true; +} + + /* * FullCompositeFieldList gets a composite field list, and checks if all fields * of composite type are used in the list. diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 3ca038a5a..209029da3 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -1193,6 +1193,116 @@ ORDER BY 1 LIMIT 3; 2 (3 rows) +-- should be able to pushdown since one of the subqueries has distinct on reference tables +-- and there is only reference table in that subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + distinct_users | event_type | time +----------------+------------+--------------------------------- + 78 | 815 | Tue Jan 21 05:59:54.833395 2014 + 92 | 826 | Tue Jan 21 05:57:26.643861 2014 + 65 | 241 | Tue Jan 21 05:56:52.624231 2014 + 23 | 573 | Tue Jan 21 05:55:28.796818 2014 + 98 | 23 | Tue Jan 21 05:54:57.987456 2014 +(5 rows) + +-- the same query wuth multiple reference tables in the subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table + WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref +ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + distinct_users | event_type | time +----------------+------------+--------------------------------- + 65 | 241 | Tue Jan 21 05:56:52.624231 2014 + 98 | 23 | Tue Jan 21 05:54:57.987456 2014 + 26 | 957 | Tue Jan 21 05:43:16.99674 2014 + 44 | 682 | Tue Jan 21 05:43:00.838945 2014 + 81 | 852 | Tue Jan 21 05:34:56.310878 2014 +(5 rows) + +-- similar query as the above, but with group bys +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + distinct_users | event_type | time +----------------+------------+--------------------------------- + 78 | 815 | Tue Jan 21 05:59:54.833395 2014 + 92 | 826 | Tue Jan 21 05:57:26.643861 2014 + 65 | 241 | Tue Jan 21 05:56:52.624231 2014 + 23 | 573 | Tue Jan 21 05:55:28.796818 2014 + 98 | 23 | Tue Jan 21 05:54:57.987456 2014 +(5 rows) + +-- should not push down this query since there is a distributed table (i.e., events_table) +-- which is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +SELECT * FROM +( + SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 + GROUP BY 1 +) as foo; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported +-- similiar to the above examples, this time there is a subquery +-- whose output is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +-- the following query is safe to push down since the DISTINCT clause include distribution column +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; + user_id | user_id +---------+--------- + 98 | 98 + 96 | 96 + 90 | 90 + 81 | 81 +(4 rows) + +-- should not pushdown since there is a non partition column on the DISTINCT clause +SELECT * FROM +( + SELECT + DISTINCT users_reference_table.user_id, us_events.value_4 + FROM + users_reference_table, + (SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events + WHERE + users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out index 48927b35d..ec1399f65 100644 --- a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -343,3 +343,26 @@ ORDER BY user_id LIMIT 5; ERROR: cannot push down this subquery DETAIL: Group by list without partition column is currently unsupported +-- similar query with slightly more complex group by +-- though the error message is a bit confusing +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + (users_table.user_id * 2) + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index c76237c1d..4aacb3c6f 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -950,6 +950,81 @@ SELECT foo.user_id FROM ) as foo ORDER BY 1 LIMIT 3; +-- should be able to pushdown since one of the subqueries has distinct on reference tables +-- and there is only reference table in that subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + +-- the same query wuth multiple reference tables in the subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table + WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref +ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + +-- similar query as the above, but with group bys +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + +-- should not push down this query since there is a distributed table (i.e., events_table) +-- which is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 +) as foo; + +SELECT * FROM +( + SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 + GROUP BY 1 +) as foo; + +-- similiar to the above examples, this time there is a subquery +-- whose output is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo; + +-- the following query is safe to push down since the DISTINCT clause include distribution column +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; + +-- should not pushdown since there is a non partition column on the DISTINCT clause +SELECT * FROM +( + SELECT + DISTINCT users_reference_table.user_id, us_events.value_4 + FROM + users_reference_table, + (SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events + WHERE + users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql index 5e38325f0..c996df7af 100644 --- a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql @@ -297,3 +297,25 @@ GROUP BY user_id HAVING count(*) > 66 ORDER BY user_id LIMIT 5; + +-- similar query with slightly more complex group by +-- though the error message is a bit confusing +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + (users_table.user_id * 2) + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5;