Merge pull request #3722 from citusdata/optimistic-duplicate-grouping

Duplicate grouping on worker whenever possible
pull/3710/head^2
Philip Dubé 2020-04-06 21:31:23 +00:00 committed by GitHub
commit ec734a643b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 57 deletions

View File

@ -328,9 +328,6 @@ static bool HasOrderByAggregate(List *sortClauseList, List *targetList);
static bool HasOrderByNonCommutativeAggregate(List *sortClauseList, List *targetList);
static bool HasOrderByComplexExpression(List *sortClauseList, List *targetList);
static bool HasOrderByHllType(List *sortClauseList, List *targetList);
static bool ShouldPushDownGroupingToWorker(MultiExtendedOp *opNode,
ExtendedOpNodeProperties *
extendedOpNodeProperties);
static bool ShouldProcessDistinctOrderAndLimitForWorker(
ExtendedOpNodeProperties *extendedOpNodeProperties,
bool pushingDownOriginalGrouping,
@ -2223,7 +2220,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
/* targetProjectionNumber starts from 1 */
queryTargetList.targetProjectionNumber = 1;
if (ShouldPushDownGroupingToWorker(originalOpNode, extendedOpNodeProperties))
if (!extendedOpNodeProperties->pullUpIntermediateRows)
{
queryGroupClause.groupClauseList = copyObject(originalGroupClauseList);
}
@ -4717,46 +4714,6 @@ HasOrderByHllType(List *sortClauseList, List *targetList)
}
/*
* ShouldPushDownGroupingToWorker returns whether we push down GROUP BY.
* This may return true even when GROUP BY is necessary on master.
*/
static bool
ShouldPushDownGroupingToWorker(MultiExtendedOp *opNode,
ExtendedOpNodeProperties *extendedOpNodeProperties)
{
if (extendedOpNodeProperties->pushDownGroupingAndHaving)
{
return true;
}
if (extendedOpNodeProperties->pullUpIntermediateRows)
{
return false;
}
/*
* Duplicate grouping if we have LIMIT without HAVING, as this can
* often result in LIMIT being pushed down.
*/
if (opNode->havingQual == NULL && opNode->limitCount != NULL)
{
return true;
}
/*
* If aggregates are being split across worker & master, so must grouping.
*/
if (contain_aggs_of_level(opNode->havingQual, 0) ||
contain_aggs_of_level((Node *) opNode->targetList, 0))
{
return true;
}
return false;
}
/*
* ShouldProcessDistinctOrderAndLimitForWorker returns whether
* ProcessDistinctClauseForWorkerQuery should be called. If not,

View File

@ -453,8 +453,11 @@ HashAggregate
Tasks Shown: One of 2
-> Task
Node: host=localhost port=xxxxx dbname=regression
-> Seq Scan on public.lineitem_290000 lineitem
-> HashAggregate
Output: l_quantity, l_quantity
Group Key: lineitem.l_quantity
-> Seq Scan on public.lineitem_290000 lineitem
Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment
-- Subquery pushdown tests with explain
EXPLAIN (COSTS OFF)
SELECT

View File

@ -747,7 +747,7 @@ RESET citus.subquery_pushdown;
VACUUM ANALYZE users_table;
-- explain tests
EXPLAIN (COSTS FALSE) SELECT user_id FROM recent_selected_users GROUP BY 1 ORDER BY 1;
QUERY PLAN
QUERY PLAN
---------------------------------------------------------------------
Sort
Sort Key: remote_scan.user_id
@ -758,17 +758,19 @@ EXPLAIN (COSTS FALSE) SELECT user_id FROM recent_selected_users GROUP BY 1 ORDER
Tasks Shown: One of 4
-> Task
Node: host=localhost port=xxxxx dbname=regression
-> Nested Loop
Join Filter: (users_table.user_id = users_table_1.user_id)
-> Sort
Sort Key: (max(users_table_1."time")) DESC
-> HashAggregate
Group Key: users_table_1.user_id
Filter: (max(users_table_1."time") > '2017-11-23 16:20:33.264457'::timestamp without time zone)
-> Seq Scan on users_table_1400256 users_table_1
-> Seq Scan on users_table_1400256 users_table
Filter: ((value_1 >= 1) AND (value_1 < 3))
(19 rows)
-> HashAggregate
Group Key: users_table.user_id
-> Nested Loop
Join Filter: (users_table.user_id = users_table_1.user_id)
-> Sort
Sort Key: (max(users_table_1."time")) DESC
-> HashAggregate
Group Key: users_table_1.user_id
Filter: (max(users_table_1."time") > '2017-11-23 16:20:33.264457'::timestamp without time zone)
-> Seq Scan on users_table_1400256 users_table_1
-> Seq Scan on users_table_1400256 users_table
Filter: ((value_1 >= 1) AND (value_1 < 3))
(21 rows)
EXPLAIN (COSTS FALSE) SELECT *
FROM (