diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index e514731df..51e1c48ed 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -84,6 +84,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedUnionQuery(Query *queryTree static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList); static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree); static bool WindowPartitionOnDistributionColumn(Query *query); +static bool AllTargetExpressionsAreColumnReferences(List *targetEntryList); +static bool RangeTableListContainsOnlyReferenceTables(List *rangeTableList); static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query); static bool FullCompositeFieldList(List *compositeFieldList); static MultiNode * MultiPlanTree(Query *queryTree); @@ -1234,10 +1236,118 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList) } } + /* + * We could still behave as if the target list is on partition column if + * all range table entries are reference tables and all target expressions + * are column references to the given query level. + */ + if (!targetListOnPartitionColumn) + { + if (RangeTableListContainsOnlyReferenceTables(query->rtable) && + AllTargetExpressionsAreColumnReferences(targetEntryList)) + { + targetListOnPartitionColumn = true; + } + } + return targetListOnPartitionColumn; } +/* + * AllTargetExpressionsAreColumnReferences returns true if non of the + * elements in the target entry list belong to an outer query (for + * example the query is a sublink and references to another query + * in the from list). + * + * The function also returns true if any of the target entries is not + * a column itself. This might be too restrictive, but, given that we're + * handling a very specific type of queries, that seems acceptable for now. + */ +static bool +AllTargetExpressionsAreColumnReferences(List *targetEntryList) +{ + ListCell *targetEntryCell = NULL; + + foreach(targetEntryCell, targetEntryList) + { + TargetEntry *targetEntry = lfirst(targetEntryCell); + Var *candidateColumn = NULL; + Expr *strippedColumnExpression = (Expr *) strip_implicit_coercions( + (Node *) targetEntry->expr); + + if (IsA(strippedColumnExpression, Var)) + { + candidateColumn = (Var *) strippedColumnExpression; + } + else if (IsA(strippedColumnExpression, FieldSelect)) + { + FieldSelect *compositeField = (FieldSelect *) strippedColumnExpression; + Expr *fieldExpression = compositeField->arg; + + if (IsA(fieldExpression, Var)) + { + candidateColumn = (Var *) fieldExpression; + } + } + + /* we don't support target entries that are not columns */ + if (candidateColumn == NULL) + { + return false; + } + + if (candidateColumn->varlevelsup > 0) + { + return false; + } + } + + return true; +} + + +/* + * RangeTableListContainsOnlyReferenceTables returns true if all range table + * entries are reference tables. + * + * The function returns false for range table entries that are not relations. + * + * Note that the function doesn't recurse into subqueries, returns false when + * a subquery is found. + */ +static bool +RangeTableListContainsOnlyReferenceTables(List *rangeTableList) +{ + ListCell *rangeTableCell = NULL; + foreach(rangeTableCell, rangeTableList) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell); + + if (rangeTableEntry->rtekind == RTE_RELATION) + { + Oid relationId = rangeTableEntry->relid; + + if (!IsDistributedTable(relationId)) + { + return false; + } + + if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE) + { + return false; + } + } + else + { + return false; + } + } + + return true; +} + + /* * FullCompositeFieldList gets a composite field list, and checks if all fields * of composite type are used in the list. diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 2a808d81e..3f0ad2069 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -1193,6 +1193,116 @@ ORDER BY 1 LIMIT 3; 2 (3 rows) +-- should be able to pushdown since one of the subqueries has distinct on reference tables +-- and there is only reference table in that subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + distinct_users | event_type | time +----------------+------------+--------------------------------- + 78 | 815 | Tue Jan 21 05:59:54.833395 2014 + 92 | 826 | Tue Jan 21 05:57:26.643861 2014 + 65 | 241 | Tue Jan 21 05:56:52.624231 2014 + 23 | 573 | Tue Jan 21 05:55:28.796818 2014 + 98 | 23 | Tue Jan 21 05:54:57.987456 2014 +(5 rows) + +-- the same query wuth multiple reference tables in the subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table + WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref +ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + distinct_users | event_type | time +----------------+------------+--------------------------------- + 65 | 241 | Tue Jan 21 05:56:52.624231 2014 + 98 | 23 | Tue Jan 21 05:54:57.987456 2014 + 26 | 957 | Tue Jan 21 05:43:16.99674 2014 + 44 | 682 | Tue Jan 21 05:43:00.838945 2014 + 81 | 852 | Tue Jan 21 05:34:56.310878 2014 +(5 rows) + +-- similar query as the above, but with group bys +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + distinct_users | event_type | time +----------------+------------+--------------------------------- + 78 | 815 | Tue Jan 21 05:59:54.833395 2014 + 92 | 826 | Tue Jan 21 05:57:26.643861 2014 + 65 | 241 | Tue Jan 21 05:56:52.624231 2014 + 23 | 573 | Tue Jan 21 05:55:28.796818 2014 + 98 | 23 | Tue Jan 21 05:54:57.987456 2014 +(5 rows) + +-- should not push down this query since there is a distributed table (i.e., events_table) +-- which is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +SELECT * FROM +( + SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 + GROUP BY 1 +) as foo; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported +-- similiar to the above examples, this time there is a subquery +-- whose output is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported +-- the following query is safe to push down since the DISTINCT clause include distribution column +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; + user_id | user_id +---------+--------- + 98 | 98 + 96 | 96 + 90 | 90 + 81 | 81 +(4 rows) + +-- should not pushdown since there is a non partition column on the DISTINCT clause +SELECT * FROM +( + SELECT + DISTINCT users_reference_table.user_id, us_events.value_4 + FROM + users_reference_table, + (SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events + WHERE + users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; +ERROR: cannot push down this subquery +DETAIL: Distinct on columns without partition column is currently unsupported DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out index 48927b35d..ec1399f65 100644 --- a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -343,3 +343,26 @@ ORDER BY user_id LIMIT 5; ERROR: cannot push down this subquery DETAIL: Group by list without partition column is currently unsupported +-- similar query with slightly more complex group by +-- though the error message is a bit confusing +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + (users_table.user_id * 2) + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; +ERROR: cannot push down this subquery +DETAIL: Group by list without partition column is currently unsupported diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index 183d8c0e0..731daa60a 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -950,6 +950,81 @@ SELECT foo.user_id FROM ) as foo ORDER BY 1 LIMIT 3; +-- should be able to pushdown since one of the subqueries has distinct on reference tables +-- and there is only reference table in that subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + +-- the same query wuth multiple reference tables in the subquery +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table + WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref +ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + +-- similar query as the above, but with group bys +SELECT + distinct_users, event_type, time +FROM +(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN +(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users) +ORDER BY time DESC +LIMIT 5 +OFFSET 0; + +-- should not push down this query since there is a distributed table (i.e., events_table) +-- which is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 +) as foo; + +SELECT * FROM +( + SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 + GROUP BY 1 +) as foo; + +-- similiar to the above examples, this time there is a subquery +-- whose output is not in the DISTINCT clause +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo; + +-- the following query is safe to push down since the DISTINCT clause include distribution column +SELECT * FROM +( + SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; + +-- should not pushdown since there is a non partition column on the DISTINCT clause +SELECT * FROM +( + SELECT + DISTINCT users_reference_table.user_id, us_events.value_4 + FROM + users_reference_table, + (SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events + WHERE + users_reference_table.user_id = us_events.user_id +) as foo +ORDER BY 1 DESC +LIMIT 4; + DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql index 5e38325f0..c996df7af 100644 --- a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql @@ -297,3 +297,25 @@ GROUP BY user_id HAVING count(*) > 66 ORDER BY user_id LIMIT 5; + +-- similar query with slightly more complex group by +-- though the error message is a bit confusing +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + (users_table.user_id * 2) + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5;