mirror of https://github.com/citusdata/citus.git
Remove RemoveDuplicateJoinRestrictions() function
RemoveDuplicateJoinRestrictions() function was introduced with the aim of decrasing the overall planning times by eliminating the duplicate JOIN restriction entries (#1989). However, it turns out that the function itself is so CPU intensive with a very high algorithmic complexity, it hurts a lot more than it helps. The function is a clear example of premature optimization. The table below shows the difference clearly: "distributed query planning time master" RemoveDuplicateJoinRestrictions() execution time on master "Remove the function RemoveDuplicateJoinRestrictions() this PR" 5 table INNER JOIN 9 msec 2msec 7 msec 10 table INNER JOIN 227 msec 194 msec 29 msec 20 table INNER JOIN 1 sec 235 msec 1 sec 139 msec 90 msecs 50 table INNER JOIN 24 seconds 21 seconds 1.5 seconds 100 table INNER JOIN 2 minutes 16 secods 1 minute 53 seconds 23 seconds 250 table INNER JOIN Bottleneck on JoinClauseList 18 minutes 52 seconds Bottleneck on JoinClauseList 5 table INNER JOIN in subquery 9 msec 0 msec 6 msec 10 table INNER JOIN subquery 33 msec 10 msec 32 msec 20 table INNER JOIN subquery 132 msec 67 msec 123 msec 50 table INNER JOIN subquery 1.2 seconds 900 msec 500 msec 100 table INNER JOIN subquery 6 seconds 5 seconds 2 seconds 250 table INNER JOIN subquery 54 seconds 37 seconds 20 seconds 5 table LEFT JOIN 5 msec 0 msec 5 msec 10 table LEFT JOIN 11 msec 0 msec 13 msec 20 table LEFT JOIN 26 msec 2 msec 30 msec 50 table LEFT JOIN 150 msec 15 msec 193 msec 100 table LEFT JOIN 757 msec 71 msec 722 msec 250 table LEFT JOIN 8 seconds 600 msec 8 seconds 5 JOINs among 2 table JOINs 37 msec 11 msec 25 msec 10 JOINs among 2 table JOINs 536 msec 306 msec 352 msec 20 JOINs among 2 table JOINs 794 msec 181 msec 640 msec 50 JOINs among 2 table JOINs 25 seconds 2 seconds 22 seconds 100 JOINs among 2 table JOINs Bottleneck on JoinClauseList 9 seconds Bottleneck on JoinClauseList 150 JOINs among 2 table JOINs Bottleneck on JoinClauseList 46 seconds Bottleneck on JoinClauseList On top of the performance penalty, the function had a critical bug #4255, and with #4254 we hit one more important bug. It should be fixed by adding the followig check to the ContextCoversJoinRestriction(): ``` static bool JoinRelIdsSame(JoinRestriction *leftRestriction, JoinRestriction *rightRestriction) { Relids leftInnerRelIds = leftRestriction->innerrel->relids; Relids rightInnerRelIds = rightRestriction->innerrel->relids; if (!bms_equal(leftInnerRelIds, rightInnerRelIds)) { return false; } Relids leftOuterRelIds = leftRestriction->outerrel->relids; Relids rightOuterRelIds = rightRestriction->outerrel->relids; if (!bms_equal(leftOuterRelIds, rightOuterRelIds)) { return false; } return true; } ``` However, adding this eliminates all the benefits tha RemoveDuplicateJoinRestrictions() brings. I've used the commands here to generate the JOINs mentioned in the PR: https://gist.github.com/onderkalaci/fe8654f9df5916c7af4c7c5eb892561e#file-gistfile1-txt Inner and outer JOINs behave roughly the same, to simplify the table only added INNER joins.pull/4264/head
parent
790beea59f
commit
5c4c9304ba
|
@ -615,8 +615,6 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext)
|
|||
{
|
||||
uint64 planId = NextPlanId++;
|
||||
bool hasUnresolvedParams = false;
|
||||
JoinRestrictionContext *joinRestrictionContext =
|
||||
planContext->plannerRestrictionContext->joinRestrictionContext;
|
||||
|
||||
PlannedStmt *resultPlan = NULL;
|
||||
|
||||
|
@ -646,9 +644,6 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext)
|
|||
hasUnresolvedParams = true;
|
||||
}
|
||||
|
||||
planContext->plannerRestrictionContext->joinRestrictionContext =
|
||||
RemoveDuplicateJoinRestrictions(joinRestrictionContext);
|
||||
|
||||
DistributedPlan *distributedPlan =
|
||||
CreateDistributedPlan(planId, planContext->originalQuery, planContext->query,
|
||||
planContext->boundParams,
|
||||
|
@ -1744,8 +1739,6 @@ multi_join_restriction_hook(PlannerInfo *root,
|
|||
*/
|
||||
joinRestrictionContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin ||
|
||||
extra->sjinfo->jointype == JOIN_SEMI;
|
||||
joinRestrictionContext->hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin &&
|
||||
extra->sjinfo->jointype == JOIN_INNER;
|
||||
|
||||
MemoryContextSwitchTo(oldMemoryContext);
|
||||
}
|
||||
|
@ -2135,7 +2128,6 @@ CreateAndPushPlannerRestrictionContext(void)
|
|||
|
||||
/* we'll apply logical AND as we add tables */
|
||||
plannerRestrictionContext->relationRestrictionContext->allReferenceTables = true;
|
||||
plannerRestrictionContext->joinRestrictionContext->hasOnlyInnerJoin = true;
|
||||
|
||||
plannerRestrictionContextList = lcons(plannerRestrictionContext,
|
||||
plannerRestrictionContextList);
|
||||
|
|
|
@ -151,8 +151,6 @@ static bool RangeTableArrayContainsAnyRTEIdentities(RangeTblEntry **rangeTableEn
|
|||
queryRteIdentities);
|
||||
static int RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo);
|
||||
static Relids QueryRteIdentities(Query *queryTree);
|
||||
static bool ContextCoversJoinRestriction(JoinRestrictionContext *joinRestrictionContext,
|
||||
JoinRestriction *joinRestrictionInTest);
|
||||
|
||||
|
||||
/*
|
||||
|
@ -1910,8 +1908,6 @@ FilterJoinRestrictionContext(JoinRestrictionContext *joinRestrictionContext, Rel
|
|||
* No need to re calculate has join fields as we are still operating on
|
||||
* the same query and as these values are calculated per-query basis.
|
||||
*/
|
||||
filtererdJoinRestrictionContext->hasOnlyInnerJoin =
|
||||
joinRestrictionContext->hasOnlyInnerJoin;
|
||||
filtererdJoinRestrictionContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin;
|
||||
|
||||
return filtererdJoinRestrictionContext;
|
||||
|
@ -2000,105 +1996,3 @@ QueryRteIdentities(Query *queryTree)
|
|||
|
||||
return queryRteIdentities;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* RemoveDuplicateJoinRestrictions gets a join restriction context and returns a
|
||||
* newly allocated join restriction context where the duplicate join restrictions
|
||||
* removed.
|
||||
*
|
||||
* Note that we use PostgreSQL hooks to accumulate the join restrictions and PostgreSQL
|
||||
* gives us all the join paths it tries while deciding on the join order. Thus, for
|
||||
* queries that has many joins, this function is likely to remove lots of duplicate join
|
||||
* restrictions. This becomes relevant for Citus on query pushdown check peformance.
|
||||
*/
|
||||
JoinRestrictionContext *
|
||||
RemoveDuplicateJoinRestrictions(JoinRestrictionContext *joinRestrictionContext)
|
||||
{
|
||||
JoinRestrictionContext *filteredContext = palloc0(sizeof(JoinRestrictionContext));
|
||||
ListCell *joinRestrictionCell = NULL;
|
||||
|
||||
filteredContext->joinRestrictionList = NIL;
|
||||
|
||||
foreach(joinRestrictionCell, joinRestrictionContext->joinRestrictionList)
|
||||
{
|
||||
JoinRestriction *joinRestriction = lfirst(joinRestrictionCell);
|
||||
|
||||
if (ContextCoversJoinRestriction(filteredContext, joinRestriction))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
filteredContext->joinRestrictionList =
|
||||
lappend(filteredContext->joinRestrictionList, joinRestriction);
|
||||
}
|
||||
|
||||
/*
|
||||
* No need to re calculate has join fields as we are still operating on
|
||||
* the same query and as these values are calculated per-query basis.
|
||||
*/
|
||||
filteredContext->hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin;
|
||||
filteredContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin;
|
||||
|
||||
return filteredContext;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ContextCoversJoinRestriction returns true if the given joinRestriction
|
||||
* has an equivalent of in the given joinRestrictionContext.
|
||||
*/
|
||||
static bool
|
||||
ContextCoversJoinRestriction(JoinRestrictionContext *joinRestrictionContext,
|
||||
JoinRestriction *joinRestrictionInTest)
|
||||
{
|
||||
JoinRestriction *joinRestrictionInContext = NULL;
|
||||
List *joinRestrictionInContextList = joinRestrictionContext->joinRestrictionList;
|
||||
foreach_ptr(joinRestrictionInContext, joinRestrictionInContextList)
|
||||
{
|
||||
/* obviously we shouldn't treat different join types as being the same */
|
||||
if (joinRestrictionInContext->joinType != joinRestrictionInTest->joinType)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we're dealing with different queries, we shouldn't treat their
|
||||
* restrictions as being the same.
|
||||
*/
|
||||
if (joinRestrictionInContext->plannerInfo != joinRestrictionInTest->plannerInfo)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
List *joinRestrictInfoListInTest =
|
||||
joinRestrictionInTest->joinRestrictInfoList;
|
||||
bool hasJoinRestriction = list_length(joinRestrictInfoListInTest) > 0;
|
||||
bool hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin;
|
||||
if (!hasOnlyInnerJoin && !hasJoinRestriction)
|
||||
{
|
||||
/*
|
||||
* If join doesn't have a restriction (e.g., ON (true)) and planner
|
||||
* is aware of at least one non-inner JOIN (e.g., outer/semi joins),
|
||||
* we should not eliminiate joinRestrictionInTest. It can still be
|
||||
* useful for detecting not supported outer-join checks even if it
|
||||
* doesn't help for colocation checks.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We check whether the restrictions in joinRestrictionInTest is a subset
|
||||
* of the restrictions in joinRestrictionInContext in the sense that all the
|
||||
* restrictions in the latter already exists in the former.
|
||||
*/
|
||||
List *joinRestrictInfoListInContext =
|
||||
joinRestrictionInContext->joinRestrictInfoList;
|
||||
if (LeftListIsSubset(joinRestrictInfoListInTest, joinRestrictInfoListInContext))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -258,15 +258,3 @@ GenerateListFromElement(void *listElement, int listLength)
|
|||
|
||||
return list;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* LeftListIsSubset returns true if lhs is subset of rhs list. Note that input
|
||||
* lists' entries should implement equal() function.
|
||||
*/
|
||||
bool
|
||||
LeftListIsSubset(const List *lhs, const List *rhs)
|
||||
{
|
||||
List *listDifference = list_difference(lhs, rhs);
|
||||
return list_length(listDifference) == 0;
|
||||
}
|
||||
|
|
|
@ -76,7 +76,6 @@ typedef struct JoinRestrictionContext
|
|||
{
|
||||
List *joinRestrictionList;
|
||||
bool hasSemiJoin;
|
||||
bool hasOnlyInnerJoin;
|
||||
} JoinRestrictionContext;
|
||||
|
||||
typedef struct JoinRestriction
|
||||
|
|
|
@ -93,6 +93,5 @@ extern List * ListTake(List *pointerList, int size);
|
|||
extern void * safe_list_nth(const List *list, int index);
|
||||
extern List * GeneratePositiveIntSequenceList(int upTo);
|
||||
extern List * GenerateListFromElement(void *listElement, int listLength);
|
||||
extern bool LeftListIsSubset(const List *lhs, const List *rhs);
|
||||
|
||||
#endif /* CITUS_LISTUTILS_H */
|
||||
|
|
|
@ -36,9 +36,6 @@ extern List * DistributedRelationIdList(Query *query);
|
|||
extern PlannerRestrictionContext * FilterPlannerRestrictionForQuery(
|
||||
PlannerRestrictionContext *plannerRestrictionContext,
|
||||
Query *query);
|
||||
extern JoinRestrictionContext * RemoveDuplicateJoinRestrictions(JoinRestrictionContext *
|
||||
joinRestrictionContext);
|
||||
|
||||
extern bool EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList,
|
||||
RelationRestrictionContext *
|
||||
restrictionContext);
|
||||
|
|
Loading…
Reference in New Issue