Merge pull request #1852 from citusdata/group_by_on_function

Treat recurring tuples as reference tables for GROUP BY checks
pull/1874/head
Önder Kalacı 2017-12-13 16:37:31 +03:00 committed by GitHub
commit b5784ca03a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 358 additions and 30 deletions

View File

@ -109,7 +109,8 @@ static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationLis
static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree);
static bool WindowPartitionOnDistributionColumn(Query *query);
static bool AllTargetExpressionsAreColumnReferences(List *targetEntryList);
static bool RangeTableListContainsOnlyReferenceTables(List *rangeTableList);
static bool FindNodeCheckInRangeTableList(List *rtable, bool (*check)(Node *));
static bool IsDistributedTableRTE(Node *node);
static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query);
static bool FullCompositeFieldList(List *compositeFieldList);
static MultiNode * MultiNodeTree(Query *queryTree);
@ -1510,12 +1511,12 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList)
/*
* We could still behave as if the target list is on partition column if
* all range table entries are reference tables and all target expressions
* are column references to the given query level.
* all range table entries are reference tables or intermediate results,
* and all target expressions are column references to the given query level.
*/
if (!targetListOnPartitionColumn)
{
if (RangeTableListContainsOnlyReferenceTables(query->rtable) &&
if (!FindNodeCheckInRangeTableList(query->rtable, IsDistributedTableRTE) &&
AllTargetExpressionsAreColumnReferences(targetEntryList))
{
targetListOnPartitionColumn = true;
@ -1580,40 +1581,51 @@ AllTargetExpressionsAreColumnReferences(List *targetEntryList)
/*
* RangeTableListContainsOnlyReferenceTables returns true if all range table
* entries are reference tables.
* FindNodeCheckInRangeTableList finds a node for which the check
* function returns true.
*
* The function returns false for range table entries that are not relations.
*
* Note that the function doesn't recurse into subqueries, returns false when
* a subquery is found.
* FindNodeCheckInRangeTableList relies on FindNodeCheck() but only
* considers the range table entries.
*/
static bool
RangeTableListContainsOnlyReferenceTables(List *rangeTableList)
FindNodeCheckInRangeTableList(List *rtable, bool (*check)(Node *))
{
ListCell *rangeTableCell = NULL;
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
return range_table_walker(rtable, FindNodeCheck, check, QTW_EXAMINE_RTES);
}
if (rangeTableEntry->rtekind == RTE_RELATION)
{
Oid relationId = rangeTableEntry->relid;
if (!IsDistributedTable(relationId))
/*
* IsDistributedTableRTE gets a node and returns true if the node
* is a range table relation entry that points to a distributed
* relation (i.e., excluding reference tables).
*/
static bool
IsDistributedTableRTE(Node *node)
{
RangeTblEntry *rangeTableEntry = NULL;
Oid relationId = InvalidOid;
if (node == NULL)
{
return false;
}
if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE)
if (!IsA(node, RangeTblEntry))
{
return false;
}
}
else
rangeTableEntry = (RangeTblEntry *) node;
if (rangeTableEntry->rtekind != RTE_RELATION)
{
return false;
}
relationId = rangeTableEntry->relid;
if (!IsDistributedTable(relationId) ||
PartitionMethod(relationId) == DISTRIBUTE_BY_NONE)
{
return false;
}
return true;

View File

@ -1361,6 +1361,186 @@ ORDER BY 1 DESC
LIMIT 4;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
-- test the read_intermediate_result() for GROUP BYs
BEGIN;
SELECT broadcast_intermediate_result('squares', 'SELECT s, s*s FROM generate_series(1,200) s');
broadcast_intermediate_result
-------------------------------
200
(1 row)
-- single appereance of read_intermediate_result
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(res.val) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int)
GROUP BY res.val_square) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
user_id
---------
1
2
3
4
5
(5 rows)
-- similar to the above, with DISTINCT on intermediate result
SELECT DISTINCT user_id
FROM users_table
JOIN
(SELECT DISTINCT res.val AS mx
FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) squares ON (mx = user_id)
ORDER BY 1
LIMIT 5;
user_id
---------
1
2
3
4
5
(5 rows)
-- single appereance of read_intermediate_result but inside a subquery
SELECT
DISTINCT user_id
FROM
users_table
JOIN (
SELECT *,random() FROM (SELECT
max(res.val) as mx
FROM
(SELECT val, val_square FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) res
GROUP BY res.val_square) foo)
squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
user_id
---------
1
2
3
4
5
(5 rows)
-- multiple read_intermediate_results in the same subquery is OK
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(res.val) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int),
read_intermediate_result('squares', 'binary') AS res2 (val int, val_square int)
WHERE res.val = res2.val_square
GROUP BY res2.val_square) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
user_id
---------
1
4
(2 rows)
-- mixed recurring tuples should be supported
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(res.val) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int),
generate_series(0, 10) i
WHERE
res.val = i
GROUP BY
i) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
user_id
---------
1
2
3
4
5
(5 rows)
-- should error out since there is a distributed table and
-- there are no columns on the GROUP BY from the distributed table
SELECT
DISTINCT user_id
FROM
users_reference_table
JOIN
(SELECT
max(val_square) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int), events_table
WHERE
events_table.user_id = res.val GROUP BY res.val) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
ROLLBACK;
-- should work since we're using an immutable function as recurring tuple
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(i+5)as mx
FROM
generate_series(0, 10) as i GROUP BY i) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
user_id
---------
5
6
(2 rows)
-- should not work since we're
-- using an immutable function as recurring tuple
-- along with a distributed table, where GROUP BY is
-- on the recurring tuple
SELECT
DISTINCT user_id
FROM
users_reference_table
JOIN
(SELECT
max(i+5)as mx
FROM
generate_series(0, 10) as i, events_table
WHERE
events_table.user_id = i GROUP BY i) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -1089,6 +1089,142 @@ SELECT * FROM
ORDER BY 1 DESC
LIMIT 4;
-- test the read_intermediate_result() for GROUP BYs
BEGIN;
SELECT broadcast_intermediate_result('squares', 'SELECT s, s*s FROM generate_series(1,200) s');
-- single appereance of read_intermediate_result
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(res.val) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int)
GROUP BY res.val_square) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
-- similar to the above, with DISTINCT on intermediate result
SELECT DISTINCT user_id
FROM users_table
JOIN
(SELECT DISTINCT res.val AS mx
FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) squares ON (mx = user_id)
ORDER BY 1
LIMIT 5;
-- single appereance of read_intermediate_result but inside a subquery
SELECT
DISTINCT user_id
FROM
users_table
JOIN (
SELECT *,random() FROM (SELECT
max(res.val) as mx
FROM
(SELECT val, val_square FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) res
GROUP BY res.val_square) foo)
squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
-- multiple read_intermediate_results in the same subquery is OK
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(res.val) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int),
read_intermediate_result('squares', 'binary') AS res2 (val int, val_square int)
WHERE res.val = res2.val_square
GROUP BY res2.val_square) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
-- mixed recurring tuples should be supported
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(res.val) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int),
generate_series(0, 10) i
WHERE
res.val = i
GROUP BY
i) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
-- should error out since there is a distributed table and
-- there are no columns on the GROUP BY from the distributed table
SELECT
DISTINCT user_id
FROM
users_reference_table
JOIN
(SELECT
max(val_square) as mx
FROM
read_intermediate_result('squares', 'binary') AS res (val int, val_square int), events_table
WHERE
events_table.user_id = res.val GROUP BY res.val) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
ROLLBACK;
-- should work since we're using an immutable function as recurring tuple
SELECT
DISTINCT user_id
FROM
users_table
JOIN
(SELECT
max(i+5)as mx
FROM
generate_series(0, 10) as i GROUP BY i) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
-- should not work since we're
-- using an immutable function as recurring tuple
-- along with a distributed table, where GROUP BY is
-- on the recurring tuple
SELECT
DISTINCT user_id
FROM
users_reference_table
JOIN
(SELECT
max(i+5)as mx
FROM
generate_series(0, 10) as i, events_table
WHERE
events_table.user_id = i GROUP BY i) squares
ON (mx = user_id)
ORDER BY 1
LIMIT 5;
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;