Merge pull request #1816 from citusdata/fix_reference_regression

Relax the checks for ensuring distribution columns in the target list
pull/1805/head
Önder Kalacı 2017-11-21 16:07:14 +03:00 committed by GitHub
commit 46c9922def
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 340 additions and 0 deletions

View File

@ -84,6 +84,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedUnionQuery(Query *queryTree
static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationList);
static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree);
static bool WindowPartitionOnDistributionColumn(Query *query);
static bool AllTargetExpressionsAreColumnReferences(List *targetEntryList);
static bool RangeTableListContainsOnlyReferenceTables(List *rangeTableList);
static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query);
static bool FullCompositeFieldList(List *compositeFieldList);
static MultiNode * MultiPlanTree(Query *queryTree);
@ -1234,10 +1236,118 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList)
}
}
/*
* We could still behave as if the target list is on partition column if
* all range table entries are reference tables and all target expressions
* are column references to the given query level.
*/
if (!targetListOnPartitionColumn)
{
if (RangeTableListContainsOnlyReferenceTables(query->rtable) &&
AllTargetExpressionsAreColumnReferences(targetEntryList))
{
targetListOnPartitionColumn = true;
}
}
return targetListOnPartitionColumn;
}
/*
* AllTargetExpressionsAreColumnReferences returns true if non of the
* elements in the target entry list belong to an outer query (for
* example the query is a sublink and references to another query
* in the from list).
*
* The function also returns true if any of the target entries is not
* a column itself. This might be too restrictive, but, given that we're
* handling a very specific type of queries, that seems acceptable for now.
*/
static bool
AllTargetExpressionsAreColumnReferences(List *targetEntryList)
{
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, targetEntryList)
{
TargetEntry *targetEntry = lfirst(targetEntryCell);
Var *candidateColumn = NULL;
Expr *strippedColumnExpression = (Expr *) strip_implicit_coercions(
(Node *) targetEntry->expr);
if (IsA(strippedColumnExpression, Var))
{
candidateColumn = (Var *) strippedColumnExpression;
}
else if (IsA(strippedColumnExpression, FieldSelect))
{
FieldSelect *compositeField = (FieldSelect *) strippedColumnExpression;
Expr *fieldExpression = compositeField->arg;
if (IsA(fieldExpression, Var))
{
candidateColumn = (Var *) fieldExpression;
}
}
/* we don't support target entries that are not columns */
if (candidateColumn == NULL)
{
return false;
}
if (candidateColumn->varlevelsup > 0)
{
return false;
}
}
return true;
}
/*
* RangeTableListContainsOnlyReferenceTables returns true if all range table
* entries are reference tables.
*
* The function returns false for range table entries that are not relations.
*
* Note that the function doesn't recurse into subqueries, returns false when
* a subquery is found.
*/
static bool
RangeTableListContainsOnlyReferenceTables(List *rangeTableList)
{
ListCell *rangeTableCell = NULL;
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
if (rangeTableEntry->rtekind == RTE_RELATION)
{
Oid relationId = rangeTableEntry->relid;
if (!IsDistributedTable(relationId))
{
return false;
}
if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE)
{
return false;
}
}
else
{
return false;
}
}
return true;
}
/*
* FullCompositeFieldList gets a composite field list, and checks if all fields
* of composite type are used in the list.

View File

@ -1193,6 +1193,116 @@ ORDER BY 1 LIMIT 3;
2
(3 rows)
-- should be able to pushdown since one of the subqueries has distinct on reference tables
-- and there is only reference table in that subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
distinct_users | event_type | time
----------------+------------+---------------------------------
78 | 815 | Tue Jan 21 05:59:54.833395 2014
92 | 826 | Tue Jan 21 05:57:26.643861 2014
65 | 241 | Tue Jan 21 05:56:52.624231 2014
23 | 573 | Tue Jan 21 05:55:28.796818 2014
98 | 23 | Tue Jan 21 05:54:57.987456 2014
(5 rows)
-- the same query wuth multiple reference tables in the subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table
WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref
ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
distinct_users | event_type | time
----------------+------------+---------------------------------
65 | 241 | Tue Jan 21 05:56:52.624231 2014
98 | 23 | Tue Jan 21 05:54:57.987456 2014
26 | 957 | Tue Jan 21 05:43:16.99674 2014
44 | 682 | Tue Jan 21 05:43:00.838945 2014
81 | 852 | Tue Jan 21 05:34:56.310878 2014
(5 rows)
-- similar query as the above, but with group bys
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
distinct_users | event_type | time
----------------+------------+---------------------------------
78 | 815 | Tue Jan 21 05:59:54.833395 2014
92 | 826 | Tue Jan 21 05:57:26.643861 2014
65 | 241 | Tue Jan 21 05:56:52.624231 2014
23 | 573 | Tue Jan 21 05:55:28.796818 2014
98 | 23 | Tue Jan 21 05:54:57.987456 2014
(5 rows)
-- should not push down this query since there is a distributed table (i.e., events_table)
-- which is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
) as foo;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
SELECT * FROM
(
SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
GROUP BY 1
) as foo;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
-- similiar to the above examples, this time there is a subquery
-- whose output is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
-- the following query is safe to push down since the DISTINCT clause include distribution column
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
user_id | user_id
---------+---------
98 | 98
96 | 96
90 | 90
81 | 81
(4 rows)
-- should not pushdown since there is a non partition column on the DISTINCT clause
SELECT * FROM
(
SELECT
DISTINCT users_reference_table.user_id, us_events.value_4
FROM
users_reference_table,
(SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events
WHERE
users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -343,3 +343,26 @@ ORDER BY user_id
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
-- similar query with slightly more complex group by
-- though the error message is a bit confusing
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
(users_table.user_id * 2)
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported

View File

@ -950,6 +950,81 @@ SELECT foo.user_id FROM
) as foo
ORDER BY 1 LIMIT 3;
-- should be able to pushdown since one of the subqueries has distinct on reference tables
-- and there is only reference table in that subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
-- the same query wuth multiple reference tables in the subquery
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table
WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref
ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
-- similar query as the above, but with group bys
SELECT
distinct_users, event_type, time
FROM
(SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN
(SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users)
ORDER BY time DESC
LIMIT 5
OFFSET 0;
-- should not push down this query since there is a distributed table (i.e., events_table)
-- which is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
) as foo;
SELECT * FROM
(
SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4
GROUP BY 1
) as foo;
-- similiar to the above examples, this time there is a subquery
-- whose output is not in the DISTINCT clause
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo;
-- the following query is safe to push down since the DISTINCT clause include distribution column
SELECT * FROM
(
SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
-- should not pushdown since there is a non partition column on the DISTINCT clause
SELECT * FROM
(
SELECT
DISTINCT users_reference_table.user_id, us_events.value_4
FROM
users_reference_table,
(SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events
WHERE
users_reference_table.user_id = us_events.user_id
) as foo
ORDER BY 1 DESC
LIMIT 4;
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -297,3 +297,25 @@ GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
-- similar query with slightly more complex group by
-- though the error message is a bit confusing
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
(users_table.user_id * 2)
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;