Remove RemoveDuplicateJoinRestrictions() function

RemoveDuplicateJoinRestrictions() function was introduced with the aim of decrasing the overall planning times by eliminating the duplicate JOIN restriction entries (#1989). However, it turns out that the function itself is so CPU intensive with a very high algorithmic complexity, it hurts a lot more than it helps. The function is a clear example of premature optimization.

The table below shows the difference clearly:

"distributed query planning
 time master"	RemoveDuplicateJoinRestrictions() execution time on master	"Remove the function RemoveDuplicateJoinRestrictions()
this PR"
5 table INNER JOIN	9 msec	2msec	7 msec
10 table INNER JOIN	227 msec	194 msec	29  msec
20 table INNER JOIN	1 sec 235 msec	1  sec 139  msec	90 msecs
50 table INNER JOIN	24 seconds	21 seconds	1.5 seconds
100 table INNER JOIN	2 minutes 16 secods	1 minute 53 seconds	23 seconds
250 table INNER JOIN	Bottleneck on JoinClauseList	18 minutes 52 seconds	Bottleneck on JoinClauseList

5 table INNER JOIN in subquery	9 msec	0 msec	6 msec
10 table INNER JOIN subquery	33 msec	10 msec	32 msec
20 table INNER JOIN subquery	132 msec	67 msec	123 msec
50 table INNER JOIN subquery	1.2  seconds	900 msec	500 msec
100 table INNER JOIN subquery	6 seconds	5  seconds	2 seconds
250 table INNER JOIN subquery	54 seconds	37 seconds	20  seconds

5 table LEFT JOIN	5 msec	0 msec	5 msec
10 table LEFT JOIN	11 msec	0 msec	13 msec
20 table LEFT JOIN	26 msec	2 msec	30 msec
50 table LEFT JOIN	150 msec	15 msec	193 msec
100 table LEFT JOIN	757 msec	71 msec	722 msec
250 table LEFT JOIN	8 seconds	600 msec	8 seconds

5 JOINs among 2 table JOINs 	37 msec	11 msec	25 msec
10 JOINs among 2 table JOINs 	536 msec	306 msec	352 msec
20 JOINs among 2 table JOINs 	794 msec	181 msec	640 msec
50 JOINs among 2 table JOINs 	25 seconds	2 seconds	22 seconds
100 JOINs among 2 table JOINs 	Bottleneck on JoinClauseList	9 seconds	Bottleneck on JoinClauseList
150 JOINs among 2 table JOINs 	Bottleneck on JoinClauseList	46 seconds	Bottleneck on JoinClauseList

On top of the performance penalty, the function had a critical bug #4255, and with #4254 we hit one more important bug. It should be fixed by adding the followig check to the ContextCoversJoinRestriction():
```
static bool
JoinRelIdsSame(JoinRestriction *leftRestriction, JoinRestriction *rightRestriction)
{
	Relids leftInnerRelIds = leftRestriction->innerrel->relids;
	Relids rightInnerRelIds = rightRestriction->innerrel->relids;
	if (!bms_equal(leftInnerRelIds, rightInnerRelIds))
	{
		return false;
	}

	Relids leftOuterRelIds = leftRestriction->outerrel->relids;
	Relids rightOuterRelIds = rightRestriction->outerrel->relids;
	if (!bms_equal(leftOuterRelIds, rightOuterRelIds))
	{
		return false;
	}

	return true;
}
```

However, adding this eliminates all the benefits tha RemoveDuplicateJoinRestrictions() brings.

I've used the commands here to generate the JOINs mentioned in the PR: https://gist.github.com/onderkalaci/fe8654f9df5916c7af4c7c5eb892561e#file-gistfile1-txt

Inner and outer JOINs behave roughly the same, to simplify the table only added INNER joins.
pull/4264/head
Onder Kalaci 2020-10-21 10:29:39 +02:00
parent 790beea59f
commit 5c4c9304ba
6 changed files with 0 additions and 131 deletions

View File

@ -615,8 +615,6 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext)
{
uint64 planId = NextPlanId++;
bool hasUnresolvedParams = false;
JoinRestrictionContext *joinRestrictionContext =
planContext->plannerRestrictionContext->joinRestrictionContext;
PlannedStmt *resultPlan = NULL;
@ -646,9 +644,6 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext)
hasUnresolvedParams = true;
}
planContext->plannerRestrictionContext->joinRestrictionContext =
RemoveDuplicateJoinRestrictions(joinRestrictionContext);
DistributedPlan *distributedPlan =
CreateDistributedPlan(planId, planContext->originalQuery, planContext->query,
planContext->boundParams,
@ -1744,8 +1739,6 @@ multi_join_restriction_hook(PlannerInfo *root,
*/
joinRestrictionContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin ||
extra->sjinfo->jointype == JOIN_SEMI;
joinRestrictionContext->hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin &&
extra->sjinfo->jointype == JOIN_INNER;
MemoryContextSwitchTo(oldMemoryContext);
}
@ -2135,7 +2128,6 @@ CreateAndPushPlannerRestrictionContext(void)
/* we'll apply logical AND as we add tables */
plannerRestrictionContext->relationRestrictionContext->allReferenceTables = true;
plannerRestrictionContext->joinRestrictionContext->hasOnlyInnerJoin = true;
plannerRestrictionContextList = lcons(plannerRestrictionContext,
plannerRestrictionContextList);

View File

@ -151,8 +151,6 @@ static bool RangeTableArrayContainsAnyRTEIdentities(RangeTblEntry **rangeTableEn
queryRteIdentities);
static int RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo);
static Relids QueryRteIdentities(Query *queryTree);
static bool ContextCoversJoinRestriction(JoinRestrictionContext *joinRestrictionContext,
JoinRestriction *joinRestrictionInTest);
/*
@ -1910,8 +1908,6 @@ FilterJoinRestrictionContext(JoinRestrictionContext *joinRestrictionContext, Rel
* No need to re calculate has join fields as we are still operating on
* the same query and as these values are calculated per-query basis.
*/
filtererdJoinRestrictionContext->hasOnlyInnerJoin =
joinRestrictionContext->hasOnlyInnerJoin;
filtererdJoinRestrictionContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin;
return filtererdJoinRestrictionContext;
@ -2000,105 +1996,3 @@ QueryRteIdentities(Query *queryTree)
return queryRteIdentities;
}
/*
* RemoveDuplicateJoinRestrictions gets a join restriction context and returns a
* newly allocated join restriction context where the duplicate join restrictions
* removed.
*
* Note that we use PostgreSQL hooks to accumulate the join restrictions and PostgreSQL
* gives us all the join paths it tries while deciding on the join order. Thus, for
* queries that has many joins, this function is likely to remove lots of duplicate join
* restrictions. This becomes relevant for Citus on query pushdown check peformance.
*/
JoinRestrictionContext *
RemoveDuplicateJoinRestrictions(JoinRestrictionContext *joinRestrictionContext)
{
JoinRestrictionContext *filteredContext = palloc0(sizeof(JoinRestrictionContext));
ListCell *joinRestrictionCell = NULL;
filteredContext->joinRestrictionList = NIL;
foreach(joinRestrictionCell, joinRestrictionContext->joinRestrictionList)
{
JoinRestriction *joinRestriction = lfirst(joinRestrictionCell);
if (ContextCoversJoinRestriction(filteredContext, joinRestriction))
{
continue;
}
filteredContext->joinRestrictionList =
lappend(filteredContext->joinRestrictionList, joinRestriction);
}
/*
* No need to re calculate has join fields as we are still operating on
* the same query and as these values are calculated per-query basis.
*/
filteredContext->hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin;
filteredContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin;
return filteredContext;
}
/*
* ContextCoversJoinRestriction returns true if the given joinRestriction
* has an equivalent of in the given joinRestrictionContext.
*/
static bool
ContextCoversJoinRestriction(JoinRestrictionContext *joinRestrictionContext,
JoinRestriction *joinRestrictionInTest)
{
JoinRestriction *joinRestrictionInContext = NULL;
List *joinRestrictionInContextList = joinRestrictionContext->joinRestrictionList;
foreach_ptr(joinRestrictionInContext, joinRestrictionInContextList)
{
/* obviously we shouldn't treat different join types as being the same */
if (joinRestrictionInContext->joinType != joinRestrictionInTest->joinType)
{
continue;
}
/*
* If we're dealing with different queries, we shouldn't treat their
* restrictions as being the same.
*/
if (joinRestrictionInContext->plannerInfo != joinRestrictionInTest->plannerInfo)
{
continue;
}
List *joinRestrictInfoListInTest =
joinRestrictionInTest->joinRestrictInfoList;
bool hasJoinRestriction = list_length(joinRestrictInfoListInTest) > 0;
bool hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin;
if (!hasOnlyInnerJoin && !hasJoinRestriction)
{
/*
* If join doesn't have a restriction (e.g., ON (true)) and planner
* is aware of at least one non-inner JOIN (e.g., outer/semi joins),
* we should not eliminiate joinRestrictionInTest. It can still be
* useful for detecting not supported outer-join checks even if it
* doesn't help for colocation checks.
*/
continue;
}
/*
* We check whether the restrictions in joinRestrictionInTest is a subset
* of the restrictions in joinRestrictionInContext in the sense that all the
* restrictions in the latter already exists in the former.
*/
List *joinRestrictInfoListInContext =
joinRestrictionInContext->joinRestrictInfoList;
if (LeftListIsSubset(joinRestrictInfoListInTest, joinRestrictInfoListInContext))
{
return true;
}
}
return false;
}

View File

@ -258,15 +258,3 @@ GenerateListFromElement(void *listElement, int listLength)
return list;
}
/*
* LeftListIsSubset returns true if lhs is subset of rhs list. Note that input
* lists' entries should implement equal() function.
*/
bool
LeftListIsSubset(const List *lhs, const List *rhs)
{
List *listDifference = list_difference(lhs, rhs);
return list_length(listDifference) == 0;
}

View File

@ -76,7 +76,6 @@ typedef struct JoinRestrictionContext
{
List *joinRestrictionList;
bool hasSemiJoin;
bool hasOnlyInnerJoin;
} JoinRestrictionContext;
typedef struct JoinRestriction

View File

@ -93,6 +93,5 @@ extern List * ListTake(List *pointerList, int size);
extern void * safe_list_nth(const List *list, int index);
extern List * GeneratePositiveIntSequenceList(int upTo);
extern List * GenerateListFromElement(void *listElement, int listLength);
extern bool LeftListIsSubset(const List *lhs, const List *rhs);
#endif /* CITUS_LISTUTILS_H */

View File

@ -36,9 +36,6 @@ extern List * DistributedRelationIdList(Query *query);
extern PlannerRestrictionContext * FilterPlannerRestrictionForQuery(
PlannerRestrictionContext *plannerRestrictionContext,
Query *query);
extern JoinRestrictionContext * RemoveDuplicateJoinRestrictions(JoinRestrictionContext *
joinRestrictionContext);
extern bool EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList,
RelationRestrictionContext *
restrictionContext);