Relax the checks on ensuring distribution columns for target entries

With this commit, we allow pushing down subqueries with only
reference tables where GROUP BY or DISTINCT clause or Window
functions include only columns from reference tables.
pull/1816/head
Onder Kalaci 2017-11-17 08:02:22 +02:00
parent d063658d6d
commit d558ebb923
5 changed files with 340 additions and 0 deletions

View File

@ -84,6 +84,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedUnionQuery(Query *queryTree
static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList);
static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree);
static bool WindowPartitionOnDistributionColumn(Query *query);
static bool AllTargetExpressionsAreColumnReferences(List *targetEntryList);
static bool RangeTableListContainsOnlyReferenceTables(List *rangeTableList);
static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query);
static bool FullCompositeFieldList(List *compositeFieldList);
static MultiNode * MultiPlanTree(Query *queryTree);
@ -1234,10 +1236,118 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList)
}
}
/*
* We could still behave as if the target list is on partition column if
* all range table entries are reference tables and all target expressions
* are column references to the given query level.
*/
if (!targetListOnPartitionColumn)
{
if (RangeTableListContainsOnlyReferenceTables(query->rtable) &&
AllTargetExpressionsAreColumnReferences(targetEntryList))
{
targetListOnPartitionColumn = true;
}
}
return targetListOnPartitionColumn;
}
/*
* AllTargetExpressionsAreColumnReferences returns true if non of the
* elements in the target entry list belong to an outer query (for
* example the query is a sublink and references to another query
* in the from list).
*
* The function also returns true if any of the target entries is not
* a column itself. This might be too restrictive, but, given that we're
* handling a very specific type of queries, that seems acceptable for now.
*/
static bool
AllTargetExpressionsAreColumnReferences(List *targetEntryList)
{
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, targetEntryList)
{
TargetEntry *targetEntry = lfirst(targetEntryCell);
Var *candidateColumn = NULL;
Expr *strippedColumnExpression = (Expr *) strip_implicit_coercions(
(Node *) targetEntry->expr);
if (IsA(strippedColumnExpression, Var))
{
candidateColumn = (Var *) strippedColumnExpression;
}
else if (IsA(strippedColumnExpression, FieldSelect))
{
FieldSelect *compositeField = (FieldSelect *) strippedColumnExpression;
Expr *fieldExpression = compositeField->arg;
if (IsA(fieldExpression, Var))
{
candidateColumn = (Var *) fieldExpression;
}
}
/* we don't support target entries that are not columns */
if (candidateColumn == NULL)
{
return false;
}
if (candidateColumn->varlevelsup > 0)
{
return false;
}
}
return true;
}
/*
* RangeTableListContainsOnlyReferenceTables returns true if all range table
* entries are reference tables.
*
* The function returns false for range table entries that are not relations.
*
* Note that the function doesn't recurse into subqueries, returns false when
* a subquery is found.
*/
static bool
RangeTableListContainsOnlyReferenceTables(List *rangeTableList)
{
ListCell *rangeTableCell = NULL;
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
if (rangeTableEntry->rtekind == RTE_RELATION)
{
Oid relationId = rangeTableEntry->relid;
if (!IsDistributedTable(relationId))
{
return false;
}
if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE)
{
return false;
}
}
else
{
return false;
}
}
return true;
}
/*
* FullCompositeFieldList gets a composite field list, and checks if all fields
* of composite type are used in the list.

View File

@ -1193,6 +1193,116 @@ ORDER BY 1 LIMIT 3;
2
(3 rows)
-- should be able to pushdown since one of the subqueries has distinct on reference tables
-- and there is only reference table in that subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
distinct_users | event_type | time
----------------+------------+---------------------------------
78 | 815 | Tue Jan 21 05:59:54.833395 2014
92 | 826 | Tue Jan 21 05:57:26.643861 2014
65 | 241 | Tue Jan 21 05:56:52.624231 2014
23 | 573 | Tue Jan 21 05:55:28.796818 2014
98 | 23 | Tue Jan 21 05:54:57.987456 2014
(5 rows)
-- the same query wuth multiple reference tables in the subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table
WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref
ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
distinct_users | event_type | time
----------------+------------+---------------------------------
65 | 241 | Tue Jan 21 05:56:52.624231 2014
98 | 23 | Tue Jan 21 05:54:57.987456 2014
26 | 957 | Tue Jan 21 05:43:16.99674 2014
44 | 682 | Tue Jan 21 05:43:00.838945 2014
81 | 852 | Tue Jan 21 05:34:56.310878 2014
(5 rows)
-- similar query as the above, but with group bys
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
distinct_users | event_type | time
----------------+------------+---------------------------------
78 | 815 | Tue Jan 21 05:59:54.833395 2014
92 | 826 | Tue Jan 21 05:57:26.643861 2014
65 | 241 | Tue Jan 21 05:56:52.624231 2014
23 | 573 | Tue Jan 21 05:55:28.796818 2014
98 | 23 | Tue Jan 21 05:54:57.987456 2014
(5 rows)
-- should not push down this query since there is a distributed table (i.e., events_table)
-- which is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
) as foo;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
SELECT * FROM
(
SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
GROUP BY 1
) as foo;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
-- similiar to the above examples, this time there is a subquery
-- whose output is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
-- the following query is safe to push down since the DISTINCT clause include distribution column
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
user_id | user_id
---------+---------
98 | 98
96 | 96
90 | 90
81 | 81
(4 rows)
-- should not pushdown since there is a non partition column on the DISTINCT clause
SELECT * FROM
(
SELECT
DISTINCT users_reference_table.user_id, us_events.value_4
FROM
users_reference_table,
(SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events
WHERE
users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -343,3 +343,26 @@ ORDER BY user_id
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
-- similar query with slightly more complex group by
-- though the error message is a bit confusing
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
(users_table.user_id * 2)
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported

View File

@ -950,6 +950,81 @@ SELECT foo.user_id FROM
) as foo
ORDER BY 1 LIMIT 3;
-- should be able to pushdown since one of the subqueries has distinct on reference tables
-- and there is only reference table in that subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
-- the same query wuth multiple reference tables in the subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table
WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref
ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
-- similar query as the above, but with group bys
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
-- should not push down this query since there is a distributed table (i.e., events_table)
-- which is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
) as foo;
SELECT * FROM
(
SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
GROUP BY 1
) as foo;
-- similiar to the above examples, this time there is a subquery
-- whose output is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo;
-- the following query is safe to push down since the DISTINCT clause include distribution column
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
-- should not pushdown since there is a non partition column on the DISTINCT clause
SELECT * FROM
(
SELECT
DISTINCT users_reference_table.user_id, us_events.value_4
FROM
users_reference_table,
(SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events
WHERE
users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -297,3 +297,25 @@ GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
-- similar query with slightly more complex group by
-- though the error message is a bit confusing
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
(users_table.user_id * 2)
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;