/*-------------------------------------------------------------------------
*
* shard_pruning.c
* Shard pruning related code.
*
* The goal of shard pruning is to find a minimal (super)set of shards that
* need to be queried to find rows matching the expression in a query.
*
* In PruneShards, we first compute a simplified disjunctive normal form (DNF)
* of the expression as a list of pruning instances. Each pruning instance
* contains all AND-ed constraints on the partition column. An OR expression
* will result in two or more new pruning instances being added for the
* subexpressions. The "parent" instance is marked isPartial and ignored
* during pruning.
*
* We use the distributive property for constraints of the form P AND (Q OR R)
* to rewrite it to (P AND Q) OR (P AND R) by copying constraints from parent
* to "child" pruning instances. However, we do not distribute nested
* expressions. While (P OR Q) AND (R OR S) is logically equivalent to (P AND
* R) OR (P AND S) OR (Q AND R) OR (Q AND S), in our implementation it becomes
* P OR Q OR R OR S. This is acceptable since it will always result in a
* superset of shards. If this proves to be an issue in practice, a more
* complete algorithm could be implemented.
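*
* As an illustrative example (a hypothetical query on partition column a):
* for WHERE a = 1 AND (a = 2 OR b = 3), two non-partial pruning instances
* are built, one containing {a = 1, a = 2}, which can never be true, and
* one containing just {a = 1}, since b = 3 is not usable for pruning. The
* partial "parent" instance holding only a = 1 is skipped during pruning.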
*
* We then evaluate each non-partial pruning instance in the disjunction
* through the following, increasingly expensive, steps:
*
* 1) If there is a constant equality constraint on the partition column, and
* no overlapping shards exist, find the shard interval in which the
* constant falls
*
* 2) If there is a hash range constraint on the partition column, find the
* shard interval matching the range
*
* 3) If there are range constraints (e.g. (a > 0 AND a < 10)) on the
* partition column, find the shard intervals that overlap with the range
*
* 4) If there are overlapping shards, exhaustively search all shards that are
* not excluded by constraints
*
* Finally, the union of the shards found by each pruning instance is
* returned.
*
* Copyright (c) 2014-2017, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "distributed/shard_pruning.h"
#include "access/nbtree.h"
#include "catalog/pg_am.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "distributed/metadata_cache.h"
#include "distributed/distributed_planner.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/version_compat.h"
#include "distributed/worker_protocol.h"
#include "nodes/nodeFuncs.h"
#include "nodes/makefuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/planner.h"
#include "parser/parse_coerce.h"
#include "utils/arrayaccess.h"
#include "utils/catcache.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
/*
* A pruning instance is a set of ANDed constraints on a partition key.
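* For example, WHERE a >= 5 AND a < 10 on partition column a yields an
* instance with greaterEqualConsts = 5 and lessConsts = 10.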
*/
typedef struct PruningInstance
{
/* Does this instance contain any prunable expressions? */
bool hasValidConstraint;
/*
* This constraint can never evaluate to true, i.e. the instance cannot
* match any rows and prunes away all shards.
*/
bool evaluatesToFalse;
/*
* Constraints on the partition column value. If multiple values are
* found the more restrictive one should be stored here. Even in case of
* a hash-partitioned table, actual column-values are stored here, *not*
* hashed values.
*/
Const *lessConsts;
Const *lessEqualConsts;
Const *equalConsts;
Const *greaterEqualConsts;
Const *greaterConsts;
/*
* Constraint using a pre-hashed column value. The constant will store the
* hashed value, not the original value of the restriction.
*/
Const *hashedEqualConsts;
/*
* Types of constraints not understood. We could theoretically try more
* expensive methods of pruning if any such restrictions are found.
*
* TODO: any actual use for this? Right now there seems little point.
*/
List *otherRestrictions;
/*
* Has this PruningInstance been added to
* ClauseWalkerContext->pruningInstances? This is not done immediately,
* but the first time a constraint is found (whether or not we are able to
* handle that constraint).
*/
bool addedToPruningInstances;
/*
* When OR clauses are found, the non-ORed part (think of a < 3 AND (a > 5
* OR a > 7)) of the expression is stored in one PruningInstance which is
* then copied for the ORed expressions. The original is marked as
* isPartial, to avoid it being used for pruning.
*/
bool isPartial;
} PruningInstance;
/*
* Partial instances that need to be finished building. This is used to
* collect all ANDed restrictions, before looking into ORed expressions.
*/
typedef struct PendingPruningInstance
{
PruningInstance *instance;
Node *continueAt;
} PendingPruningInstance;
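/*
* On PostgreSQL 12+, FunctionCallInfoBaseData ends in a flexible array of
* arguments, so the union below reserves inline space for two arguments.
* On older versions, FunctionCallInfoData already has fixed-size argument
* storage.
*/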
#if PG_VERSION_NUM >= 120000
typedef union
{
FunctionCallInfoBaseData fcinfo;
/* ensure enough space for nargs args is available */
char fcinfo_data[SizeForFunctionCallInfo(2)];
} FunctionCall2InfoData;
#else
typedef FunctionCallInfoData FunctionCall2InfoData;
#endif
/*
* Data necessary to perform a single PruneShards().
*/
typedef struct ClauseWalkerContext
{
Var *partitionColumn;
char partitionMethod;
/* ORed list of pruning targets */
List *pruningInstances;
/*
* Partially built PruningInstances, that need to be completed by doing a
* separate PrunableExpressionsWalker() pass.
*/
List *pendingInstances;
/* PruningInstance currently being built; all eligible constraints are added here */
PruningInstance *currentPruningInstance;
/*
* Information about function calls we need to perform. Re-using the same
* FunctionCall2InfoData, instead of using FunctionCall2Coll, is often
* cheaper.
*/
FunctionCall2InfoData compareValueFunctionCall;
FunctionCall2InfoData compareIntervalFunctionCall;
} ClauseWalkerContext;
static void PrunableExpressions(Node *originalNode, ClauseWalkerContext *context);
static bool PrunableExpressionsWalker(Node *originalNode, ClauseWalkerContext *context);
static void AddPartitionKeyRestrictionToInstance(ClauseWalkerContext *context,
OpExpr *opClause, Var *varClause,
Const *constantClause);
static Const * TransformPartitionRestrictionValue(Var *partitionColumn,
Const *restrictionValue);
static void AddSAOPartitionKeyRestrictionToInstance(ClauseWalkerContext *context,
ScalarArrayOpExpr *
arrayOperatorExpression);
static void AddHashRestrictionToInstance(ClauseWalkerContext *context, OpExpr *opClause,
Var *varClause, Const *constantClause);
static void AddNewConjuction(ClauseWalkerContext *context, OpExpr *op);
static PruningInstance * CopyPartialPruningInstance(PruningInstance *sourceInstance);
static List * ShardArrayToList(ShardInterval **shardArray, int length);
static List * DeepCopyShardIntervalList(List *originalShardIntervalList);
static int PerformValueCompare(FunctionCallInfo compareFunctionCall, Datum a,
Datum b);
static int PerformCompare(FunctionCallInfo compareFunctionCall);
static List * PruneOne(DistTableCacheEntry *cacheEntry, ClauseWalkerContext *context,
PruningInstance *prune);
static List * PruneWithBoundaries(DistTableCacheEntry *cacheEntry,
ClauseWalkerContext *context,
PruningInstance *prune);
static List * ExhaustivePrune(DistTableCacheEntry *cacheEntry,
ClauseWalkerContext *context,
PruningInstance *prune);
static bool ExhaustivePruneOne(ShardInterval *curInterval,
ClauseWalkerContext *context,
PruningInstance *prune);
static int UpperShardBoundary(Datum partitionColumnValue,
ShardInterval **shardIntervalCache,
int shardCount, FunctionCallInfo compareFunction,
bool includeMin);
static int LowerShardBoundary(Datum partitionColumnValue,
ShardInterval **shardIntervalCache,
int shardCount, FunctionCallInfo compareFunction,
bool includeMax);
/*
* PruneShards returns all shards from a distributed table that cannot be
* proven to be eliminated by whereClauseList.
*
* For reference tables, the function simply returns the single shard that the
* table has.
*
* When there is a single <partition column> = <constant> filter in the where
* clause list, the constant is written to the partitionValueConst pointer.
*/
List *
PruneShards(Oid relationId, Index rangeTableId, List *whereClauseList,
Const **partitionValueConst)
{
DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
int shardCount = cacheEntry->shardIntervalArrayLength;
char partitionMethod = cacheEntry->partitionMethod;
ClauseWalkerContext context = { 0 };
ListCell *pruneCell;
List *prunedList = NIL;
bool foundRestriction = false;
bool foundPartitionColumnValue = false;
Const *singlePartitionValueConst = NULL;
/* there are no shards to return */
if (shardCount == 0)
{
return NIL;
}
/* always return empty result if WHERE clause is of the form: false (AND ..) */
if (ContainsFalseClause(whereClauseList))
{
return NIL;
}
/* short circuit for reference tables */
if (partitionMethod == DISTRIBUTE_BY_NONE)
{
prunedList = ShardArrayToList(cacheEntry->sortedShardIntervalArray,
cacheEntry->shardIntervalArrayLength);
return DeepCopyShardIntervalList(prunedList);
}
context.partitionMethod = partitionMethod;
context.partitionColumn = PartitionColumn(relationId, rangeTableId);
context.currentPruningInstance = palloc0(sizeof(PruningInstance));
if (cacheEntry->shardIntervalCompareFunction)
{
/* initiate function call info once (allows comparators to cache metadata) */
InitFunctionCallInfoData(*(FunctionCallInfo) &
context.compareIntervalFunctionCall,
cacheEntry->shardIntervalCompareFunction,
2, DEFAULT_COLLATION_OID, NULL, NULL);
}
else
{
ereport(ERROR, (errmsg("shard pruning not possible without "
"a shard interval comparator")));
}
if (cacheEntry->shardColumnCompareFunction)
{
/* initiate function call info once (allows comparators to cache metadata) */
InitFunctionCallInfoData(*(FunctionCallInfo) &
context.compareValueFunctionCall,
cacheEntry->shardColumnCompareFunction,
2, DEFAULT_COLLATION_OID, NULL, NULL);
}
else
{
ereport(ERROR, (errmsg("shard pruning not possible without "
"a partition column comparator")));
}
/* Figure out what we can prune on */
PrunableExpressions((Node *) whereClauseList, &context);
/*
* Prune using each of the PrunableInstances we found, and OR results
* together.
*/
foreach(pruneCell, context.pruningInstances)
{
PruningInstance *prune = (PruningInstance *) lfirst(pruneCell);
/*
* If this is a partial instance, a fully built one has also been
* added. Skip.
*/
if (prune->isPartial)
{
continue;
}
/*
* If the current instance has no prunable expressions, we'll have to
* return all shards. No point in continuing pruning in that case.
*/
if (!prune->hasValidConstraint)
{
foundRestriction = false;
break;
}
if (context.partitionMethod == DISTRIBUTE_BY_HASH)
{
if (!prune->evaluatesToFalse && !prune->equalConsts &&
!prune->hashedEqualConsts)
{
/* if hash-partitioned and no equals constraints, return all shards */
foundRestriction = false;
break;
}
else if (partitionValueConst != NULL && prune->equalConsts != NULL)
{
if (!foundPartitionColumnValue)
{
/* remember the partition column value */
singlePartitionValueConst = prune->equalConsts;
foundPartitionColumnValue = true;
}
else if (singlePartitionValueConst == NULL)
{
/* already found multiple partition column values */
}
else if (!equal(prune->equalConsts, singlePartitionValueConst))
{
/* found multiple partition column values */
singlePartitionValueConst = NULL;
}
}
}
List *pruneOneList = PruneOne(cacheEntry, &context, prune);
if (prunedList)
{
/*
* We can use list_union_ptr, which is a lot faster than comparing
* shards by value, because all the ShardIntervals are
* guaranteed to be from
* DistTableCacheEntry->sortedShardIntervalArray (thus having the
* same pointer values).
*/
prunedList = list_union_ptr(prunedList, pruneOneList);
}
else
{
prunedList = pruneOneList;
}
foundRestriction = true;
}
/* found no valid restriction, build list of all shards */
if (!foundRestriction)
{
prunedList = ShardArrayToList(cacheEntry->sortedShardIntervalArray,
cacheEntry->shardIntervalArrayLength);
}
/* if requested, copy the partition value constant */
if (partitionValueConst != NULL)
{
if (singlePartitionValueConst != NULL)
{
*partitionValueConst = copyObject(singlePartitionValueConst);
}
else
{
*partitionValueConst = NULL;
}
}
/*
* Deep copy list, so it's independent of the DistTableCacheEntry
* contents.
*/
return DeepCopyShardIntervalList(prunedList);
}
/*
* ContainsFalseClause returns whether the flattened where clause list
* contains false as a clause.
*/
bool
ContainsFalseClause(List *whereClauseList)
{
bool containsFalseClause = false;
ListCell *clauseCell = NULL;
foreach(clauseCell, whereClauseList)
{
Node *clause = (Node *) lfirst(clauseCell);
if (IsA(clause, Const))
{
Const *constant = (Const *) clause;
if (constant->consttype == BOOLOID && !DatumGetBool(constant->constvalue))
{
containsFalseClause = true;
break;
}
}
}
return containsFalseClause;
}
/*
* PrunableExpressions builds a list of all prunable expressions in node,
* storing them in context->pruningInstances.
*/
static void
PrunableExpressions(Node *node, ClauseWalkerContext *context)
{
/*
* Build the initial list of prunable expressions. As long as only
* (implicitly or explicitly) ANDed expressions are found, this performs a
* depth-first search. When an ORed expression is found, a reference to the
* current PruningInstance is queued in context->pendingInstances (once for
* each ORed subexpression), and the tree traversal continues without
* recursing into the OR. Once back at the top level, we process all pending
* instances - that allows us to collect all ANDed expressions before
* recursing into an ORed expression.
*/
PrunableExpressionsWalker(node, context);
/*
* Process all pending instances. While processing, new ones might be
* added to the list, so don't use foreach().
*
* See the places in PrunableExpressionsWalker() that push onto
* context->pendingInstances to understand why construction of a
* PruningInstance might be pending.
*
* We copy the partial PruningInstance and continue adding information by
* calling PrunableExpressionsWalker() on the copy, continuing at the
* node stored in PendingPruningInstance->continueAt.
*/
while (context->pendingInstances != NIL)
{
PendingPruningInstance *instance =
(PendingPruningInstance *) linitial(context->pendingInstances);
PruningInstance *newPrune = CopyPartialPruningInstance(instance->instance);
context->pendingInstances = list_delete_first(context->pendingInstances);
context->currentPruningInstance = newPrune;
PrunableExpressionsWalker(instance->continueAt, context);
context->currentPruningInstance = NULL;
}
}
/*
* PrunableExpressionsWalker() is the main work horse for
* PrunableExpressions().
*/
static bool
PrunableExpressionsWalker(Node *node, ClauseWalkerContext *context)
{
if (node == NULL)
{
return false;
}
/*
* Check for expressions understood by this routine.
*/
if (IsA(node, List))
{
/* at the top of quals we'll frequently see lists, those are to be treated as ANDs */
}
else if (IsA(node, BoolExpr))
{
BoolExpr *boolExpr = (BoolExpr *) node;
if (boolExpr->boolop == NOT_EXPR)
{
return false;
}
else if (boolExpr->boolop == AND_EXPR)
{
return expression_tree_walker((Node *) boolExpr->args,
PrunableExpressionsWalker, context);
}
else if (boolExpr->boolop == OR_EXPR)
{
ListCell *opCell = NULL;
/*
* "Queue" partial pruning instances. This is used to convert
* expressions like (A AND (B OR C) AND D) into (A AND B AND D),
* (A AND C AND D), with A, B, C, D being restrictions. When the
* OR is encountered, a reference to the partially built
* PruningInstance (containing A at this point), is added to
* context->pendingInstances once for B and once for C. Once the
* full tree walk has completed, PrunableExpressions() will complete
* the pending instances, which'll now also know about restriction
* D, by calling PrunableExpressionsWalker() once for B and once
* for C.
*/
foreach(opCell, boolExpr->args)
{
AddNewConjuction(context, lfirst(opCell));
}
return false;
}
}
else if (IsA(node, OpExpr))
{
OpExpr *opClause = (OpExpr *) node;
PruningInstance *prune = context->currentPruningInstance;
Node *leftOperand = NULL;
Node *rightOperand = NULL;
Const *constantClause = NULL;
Var *varClause = NULL;
if (!prune->addedToPruningInstances)
{
context->pruningInstances = lappend(context->pruningInstances, prune);
prune->addedToPruningInstances = true;
}
if (list_length(opClause->args) == 2)
{
leftOperand = get_leftop((Expr *) opClause);
rightOperand = get_rightop((Expr *) opClause);
leftOperand = strip_implicit_coercions(leftOperand);
rightOperand = strip_implicit_coercions(rightOperand);
if (IsA(rightOperand, Const) && IsA(leftOperand, Var))
{
constantClause = (Const *) rightOperand;
varClause = (Var *) leftOperand;
}
else if (IsA(leftOperand, Const) && IsA(rightOperand, Var))
{
constantClause = (Const *) leftOperand;
varClause = (Var *) rightOperand;
}
}
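/*
* If recognized above, the operands have the shape <Var> <op> <Const> or
* <Const> <op> <Var> (with implicit coercions stripped), e.g.
* partition_col = 42 or 42 = partition_col.
*/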
if (constantClause && varClause && equal(varClause, context->partitionColumn))
{
/*
* Found a restriction on the partition column itself. Update the
* current constraint with the new information.
*/
AddPartitionKeyRestrictionToInstance(context, opClause, varClause,
constantClause);
}
else if (constantClause && varClause &&
varClause->varattno == RESERVED_HASHED_COLUMN_ID)
{
/*
* Found restriction that directly specifies the boundaries of a
* hashed column.
*/
AddHashRestrictionToInstance(context, opClause, varClause, constantClause);
}
return false;
}
else if (IsA(node, ScalarArrayOpExpr))
{
ScalarArrayOpExpr *arrayOperatorExpression = (ScalarArrayOpExpr *) node;
AddSAOPartitionKeyRestrictionToInstance(context, arrayOperatorExpression);
return false;
}
else
{
PruningInstance *prune = context->currentPruningInstance;
/*
* Mark the instance as added, so we'll fail pruning if there are no ANDed
* restrictions that we know how to deal with.
*/
if (!prune->addedToPruningInstances)
{
context->pruningInstances = lappend(context->pruningInstances, prune);
prune->addedToPruningInstances = true;
}
return false;
}
return expression_tree_walker(node, PrunableExpressionsWalker, context);
}
/*
* AddSAOPartitionKeyRestrictionToInstance adds a partcol = arrayelem
* restriction to the current pruning instance for each element of the
* array. These restrictions allow shards to be pruned based on IN and
* = ANY constraints.
*/
static void
AddSAOPartitionKeyRestrictionToInstance(ClauseWalkerContext *context,
ScalarArrayOpExpr *arrayOperatorExpression)
{
PruningInstance *prune = context->currentPruningInstance;
Node *leftOpExpression = linitial(arrayOperatorExpression->args);
Node *strippedLeftOpExpression = strip_implicit_coercions(leftOpExpression);
bool usingEqualityOperator = OperatorImplementsEquality(
arrayOperatorExpression->opno);
Expr *arrayArgument = (Expr *) lsecond(arrayOperatorExpression->args);
/* check for the form partcol = ANY(ARRAY[a, b, c]) or partcol IN (a, b, c) */
if (usingEqualityOperator && strippedLeftOpExpression != NULL &&
equal(strippedLeftOpExpression, context->partitionColumn) &&
IsA(arrayArgument, Const))
{
int16 typlen = 0;
bool typbyval = false;
char typalign = '\0';
Datum arrayElement = 0;
Datum inArray = ((Const *) arrayArgument)->constvalue;
bool isNull = false;
/* check for a NULL right-hand expression */
if (inArray == 0)
{
return;
}
ArrayType *array = DatumGetArrayTypeP(((Const *) arrayArgument)->constvalue);
/* get the necessary information from array type to iterate over it */
Oid elementType = ARR_ELEMTYPE(array);
get_typlenbyvalalign(elementType,
&typlen,
&typbyval,
&typalign);
/* Iterate over the righthand array of expression */
ArrayIterator arrayIterator = array_create_iterator(array, 0, NULL);
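/*
* Each element becomes its own partcol = element conjunction below, e.g.
* partcol IN (1, 2, 3) queues three pending pruning instances, one per
* element.
*/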
while (array_iterate(arrayIterator, &arrayElement, &isNull))
{
Const *constElement = makeConst(elementType, -1,
DEFAULT_COLLATION_OID, typlen, arrayElement,
isNull, typbyval);
/* build partcol = arrayelem operator */
OpExpr *arrayEqualityOp = makeNode(OpExpr);
arrayEqualityOp->opno = arrayOperatorExpression->opno;
arrayEqualityOp->opfuncid = arrayOperatorExpression->opfuncid;
arrayEqualityOp->inputcollid = arrayOperatorExpression->inputcollid;
arrayEqualityOp->opresulttype = get_func_rettype(
arrayOperatorExpression->opfuncid);
arrayEqualityOp->opcollid = DEFAULT_COLLATION_OID;
arrayEqualityOp->location = -1;
arrayEqualityOp->args = list_make2(strippedLeftOpExpression, constElement);
AddNewConjuction(context, arrayEqualityOp);
}
}
/*
* Since we could not deal with the constraint, add the pruning instance
* to the pruning instance list and mark it as added.
*/
else if (!prune->addedToPruningInstances)
{
context->pruningInstances = lappend(context->pruningInstances, prune);
prune->addedToPruningInstances = true;
}
}
/*
* AddNewConjuction adds the given OpExpr to the context's pending instance
* list, marking the current pruning instance as partial so that it is
* completed later for this conjunction.
*/
static void
AddNewConjuction(ClauseWalkerContext *context, OpExpr *op)
{
PendingPruningInstance *instance = palloc0(sizeof(PendingPruningInstance));
instance->instance = context->currentPruningInstance;
instance->continueAt = (Node *) op;
/*
* Signal that this instance is not to be used for pruning on
* its own. Once the pending instance is processed, it'll be
* used.
*/
instance->instance->isPartial = true;
context->pendingInstances = lappend(context->pendingInstances, instance);
}
/*
* AddPartitionKeyRestrictionToInstance adds information about a PartitionKey
* $op Const restriction to the current pruning instance.
*/
static void
AddPartitionKeyRestrictionToInstance(ClauseWalkerContext *context, OpExpr *opClause,
Var *partitionColumn, Const *constantClause)
{
PruningInstance *prune = context->currentPruningInstance;
ListCell *btreeInterpretationCell = NULL;
bool matchedOp = false;
/* only have extra work to do if const isn't same type as partition column */
if (constantClause->consttype != partitionColumn->vartype)
{
/* we want our restriction value in terms of the type of the partition column */
constantClause = TransformPartitionRestrictionValue(partitionColumn,
constantClause);
if (constantClause == NULL)
{
/* couldn't coerce value, so we note this as a restriction we don't grok */
prune->otherRestrictions = lappend(prune->otherRestrictions, opClause);
return;
}
}
/* at this point, we'd better be able to pass binary Datums to comparison functions */
Assert(IsBinaryCoercible(constantClause->consttype, partitionColumn->vartype));
List *btreeInterpretationList = get_op_btree_interpretation(opClause->opno);
foreach(btreeInterpretationCell, btreeInterpretationList)
{
OpBtreeInterpretation *btreeInterpretation =
(OpBtreeInterpretation *) lfirst(btreeInterpretationCell);
switch (btreeInterpretation->strategy)
{
case BTLessStrategyNumber:
{
if (!prune->lessConsts ||
PerformValueCompare((FunctionCallInfo) &
context->compareValueFunctionCall,
constantClause->constvalue,
prune->lessConsts->constvalue) < 0)
{
prune->lessConsts = constantClause;
}
matchedOp = true;
break;
}
case BTLessEqualStrategyNumber:
{
if (!prune->lessEqualConsts ||
PerformValueCompare((FunctionCallInfo) &
context->compareValueFunctionCall,
constantClause->constvalue,
prune->lessEqualConsts->constvalue) < 0)
{
prune->lessEqualConsts = constantClause;
}
matchedOp = true;
break;
}
case BTEqualStrategyNumber:
{
if (!prune->equalConsts)
{
prune->equalConsts = constantClause;
}
else if (PerformValueCompare((FunctionCallInfo) &
context->compareValueFunctionCall,
constantClause->constvalue,
prune->equalConsts->constvalue) != 0)
{
/* key can't be equal to two values */
prune->evaluatesToFalse = true;
}
matchedOp = true;
break;
}
case BTGreaterEqualStrategyNumber:
{
if (!prune->greaterEqualConsts ||
PerformValueCompare((FunctionCallInfo) &
context->compareValueFunctionCall,
constantClause->constvalue,
prune->greaterEqualConsts->constvalue) > 0
)
{
prune->greaterEqualConsts = constantClause;
}
matchedOp = true;
break;
}
case BTGreaterStrategyNumber:
{
if (!prune->greaterConsts ||
PerformValueCompare((FunctionCallInfo) &
context->compareValueFunctionCall,
constantClause->constvalue,
prune->greaterConsts->constvalue) > 0)
{
prune->greaterConsts = constantClause;
}
matchedOp = true;
break;
}
case ROWCOMPARE_NE:
{
/*
* This case should only arise when ALL list elements have this
* "strategy" number set. Skipping to the end of the list might
* protect us if that assumption is violated, and an Assert can
* notify us if it ever is...
*/
/* should see this value immediately */
Assert(btreeInterpretationCell == btreeInterpretationList->head);
/* stop processing early, would only see unsupported nodes anyhow */
btreeInterpretationCell = btreeInterpretationList->tail;
/* TODO: could add support for this, if we feel like it */
matchedOp = false;
break;
}
default:
Assert(false);
}
}
if (!matchedOp)
{
prune->otherRestrictions = lappend(prune->otherRestrictions, opClause);
}
else
{
prune->hasValidConstraint = true;
}
}
/*
* TransformPartitionRestrictionValue coerces a restriction value into the
* type of the partition column. Sometimes PostgreSQL chooses to wrap our
* Var in a coercion rather than the Const; to deal with this, we strip the
* coercions from both and manually coerce the Const into the type of our
* partition column. It is conceivable that in some instances this may not
* be possible; in those cases we simply fail to prune shards based on this
* clause.
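*
* For example, an int4 partition column compared against a bigint or
* numeric constant gets coerced back to int4 here.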
*/
static Const *
TransformPartitionRestrictionValue(Var *partitionColumn, Const *restrictionValue)
{
Node *transformedValue = coerce_to_target_type(NULL, (Node *) restrictionValue,
restrictionValue->consttype,
partitionColumn->vartype,
partitionColumn->vartypmod,
COERCION_ASSIGNMENT,
COERCE_IMPLICIT_CAST, -1);
/* if NULL, no implicit coercion is possible between the types */
if (transformedValue == NULL)
{
return NULL;
}
/* if still not a constant, evaluate coercion */
if (!IsA(transformedValue, Const))
{
transformedValue = (Node *) expression_planner((Expr *) transformedValue);
}
/* if still not a constant, no immutable coercion matched */
if (!IsA(transformedValue, Const))
{
return NULL;
}
return (Const *) transformedValue;
}
/*
* AddHashRestrictionToInstance adds information about a
* RESERVED_HASHED_COLUMN_ID = Const restriction to the current pruning
* instance.
*/
static void
AddHashRestrictionToInstance(ClauseWalkerContext *context, OpExpr *opClause,
Var *varClause, Const *constantClause)
{
PruningInstance *prune = context->currentPruningInstance;
ListCell *btreeInterpretationCell = NULL;
/* be paranoid */
Assert(IsBinaryCoercible(constantClause->consttype, INT4OID));
List *btreeInterpretationList =
get_op_btree_interpretation(opClause->opno);
foreach(btreeInterpretationCell, btreeInterpretationList)
{
OpBtreeInterpretation *btreeInterpretation =
(OpBtreeInterpretation *) lfirst(btreeInterpretationCell);
/*
* Ladidadida, dirty hackety hack. We only add such
* constraints (in ShardIntervalOpExpressions()) to select a
* shard based on its exact boundaries. For efficient binary
* search it's better to simply use one representative value
* to look up the shard. In practice, this is sufficient for
* now.
*/
if (btreeInterpretation->strategy == BTGreaterEqualStrategyNumber)
{
Assert(!prune->hashedEqualConsts);
prune->hashedEqualConsts = constantClause;
prune->hasValidConstraint = true;
}
}
}
/*
* CopyPartialPruningInstance copies a partial PruningInstance, so it can be
* completed.
*/
static PruningInstance *
CopyPartialPruningInstance(PruningInstance *sourceInstance)
{
PruningInstance *newInstance = palloc(sizeof(PruningInstance));
Assert(sourceInstance->isPartial);
/*
* To make the new PruningInstance useful for pruning, we have to reset its
* partial flag - if necessary it'll be marked as partial again by
* PrunableExpressionsWalker().
*/
memcpy(newInstance, sourceInstance, sizeof(PruningInstance));
newInstance->addedToPruningInstances = false;
newInstance->isPartial = false;
return newInstance;
}
/*
* ShardArrayToList builds a list out of the given array of ShardInterval pointers.
*/
static List *
ShardArrayToList(ShardInterval **shardArray, int length)
{
List *shardIntervalList = NIL;
for (int shardIndex = 0; shardIndex < length; shardIndex++)
{
ShardInterval *shardInterval =
shardArray[shardIndex];
shardIntervalList = lappend(shardIntervalList, shardInterval);
}
return shardIntervalList;
}
/*
* DeepCopyShardIntervalList copies originalShardIntervalList and the
* contained ShardIntervals into a new list.
*/
static List *
DeepCopyShardIntervalList(List *originalShardIntervalList)
{
List *copiedShardIntervalList = NIL;
ListCell *shardIntervalCell = NULL;
foreach(shardIntervalCell, originalShardIntervalList)
{
ShardInterval *originalShardInterval =
(ShardInterval *) lfirst(shardIntervalCell);
ShardInterval *copiedShardInterval =
(ShardInterval *) palloc0(sizeof(ShardInterval));
CopyShardInterval(originalShardInterval, copiedShardInterval);
copiedShardIntervalList = lappend(copiedShardIntervalList, copiedShardInterval);
}
return copiedShardIntervalList;
}
/*
* PruneOne returns all shards in the table that match a single
* PruningInstance.
*/
static List *
PruneOne(DistTableCacheEntry *cacheEntry, ClauseWalkerContext *context,
PruningInstance *prune)
{
ShardInterval *shardInterval = NULL;
/* Well, if life always were this easy... */
if (prune->evaluatesToFalse)
{
return NIL;
}
/*
* For an equality constraint, if there are no overlapping shards (always
* the case for hash and range partitioning, sometimes for append), we can
* perform a binary search for the right interval. That's usually the
* fastest, so try that first.
*/
if (prune->equalConsts &&
!cacheEntry->hasOverlappingShardInterval)
{
shardInterval = FindShardInterval(prune->equalConsts->constvalue, cacheEntry);
/*
* If pruned down to nothing, we're done. Otherwise see if other
* methods prune down further / to nothing.
*/
if (!shardInterval)
{
return NIL;
}
}
/*
* If the hash value we're looking for is known, we can search for the
* interval directly. That's fast and should only ever be the case for a
* hash-partitioned table.
*/
if (prune->hashedEqualConsts)
{
ShardInterval **sortedShardIntervalArray = cacheEntry->sortedShardIntervalArray;
Assert(context->partitionMethod == DISTRIBUTE_BY_HASH);
int shardIndex = FindShardIntervalIndex(prune->hashedEqualConsts->constvalue,
cacheEntry);
if (shardIndex == INVALID_SHARD_INDEX)
{
return NIL;
}
else if (shardInterval &&
sortedShardIntervalArray[shardIndex]->shardId != shardInterval->shardId)
{
/*
* equalConst based pruning above yielded a different shard than
* pruning based on pre-hashed equality. This is useful in case
* of INSERT ... SELECT, where both can occur together (one via
* join/colocation, the other via a plain equality restriction).
*/
return NIL;
}
else
{
return list_make1(sortedShardIntervalArray[shardIndex]);
}
}
/*
* If the previous pruning method yielded a single shard and the table is
* not hash partitioned, attempt range-based pruning to exclude it further.
*
* That's particularly important for subquery pushdown, where it's very
* common to have a user-specified equality restriction and a range-based
* restriction on the shard boundaries added by the subquery machinery.
*/
if (shardInterval)
{
if (context->partitionMethod != DISTRIBUTE_BY_HASH &&
ExhaustivePruneOne(shardInterval, context, prune))
{
return NIL;
}
else
{
/* no chance to prune further, return */
return list_make1(shardInterval);
}
}
/*
* Should never get here for hashing, we've filtered down to either zero
* or one shard, and returned.
*/
Assert(context->partitionMethod != DISTRIBUTE_BY_HASH);
/*
* Next method: binary search with fuzzy boundaries. Can't trivially do so
* if shards have overlapping boundaries.
*
* TODO: If we kept shard intervals separately sorted by both upper and
* lower boundaries, this should be possible?
*/
if (!cacheEntry->hasOverlappingShardInterval && (
prune->greaterConsts || prune->greaterEqualConsts ||
prune->lessConsts || prune->lessEqualConsts))
{
return PruneWithBoundaries(cacheEntry, context, prune);
}
/*
* Brute force: Check each shard.
*/
return ExhaustivePrune(cacheEntry, context, prune);
}
/*
* PerformCompare invokes the comparator with the previously prepared
* arguments and checks for unexpected NULL returns.
*/
static int
PerformCompare(FunctionCallInfo compareFunctionCall)
{
Datum result = FunctionCallInvoke(compareFunctionCall);
if (compareFunctionCall->isnull)
{
elog(ERROR, "function %u returned NULL", compareFunctionCall->flinfo->fn_oid);
}
return DatumGetInt32(result);
}
/*
* PerformValueCompare invokes the comparator with arguments a and b, and
* checks for unexpected NULL returns.
*/
static int
PerformValueCompare(FunctionCallInfo compareFunctionCall, Datum a, Datum b)
{
fcSetArg(compareFunctionCall, 0, a);
fcSetArg(compareFunctionCall, 1, b);
return PerformCompare(compareFunctionCall);
}
/*
* LowerShardBoundary returns the index of the first shard interval that may
* contain values >= (if includeMax) or > partitionColumnValue, i.e. the
* lower end of the range of shards that have to be scanned.
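*
* For example, given intervals [0, 9], [10, 19] and [20, 29], a value of
* 15 yields index 1, while a value of 35 yields INVALID_SHARD_INDEX.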
*/
static int
LowerShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCache,
int shardCount, FunctionCallInfo compareFunction, bool includeMax)
{
int lowerBoundIndex = 0;
int upperBoundIndex = shardCount;
Assert(shardCount != 0);
/* setup partitionColumnValue argument once */
fcSetArg(compareFunction, 0, partitionColumnValue);
while (lowerBoundIndex < upperBoundIndex)
{
int middleIndex = lowerBoundIndex + ((upperBoundIndex - lowerBoundIndex) / 2);
/* setup minValue as argument */
fcSetArg(compareFunction, 1, shardIntervalCache[middleIndex]->minValue);
/* execute cmp(partitionValue, lowerBound) */
int minValueComparison = PerformCompare(compareFunction);
/* and evaluate results */
if (minValueComparison < 0)
{
/* value smaller than entire range */
upperBoundIndex = middleIndex;
continue;
}
/* setup maxValue as argument */
fcSetArg(compareFunction, 1, shardIntervalCache[middleIndex]->maxValue);
/* execute cmp(partitionValue, upperBound) */
int maxValueComparison = PerformCompare(compareFunction);
if ((maxValueComparison == 0 && !includeMax) ||
maxValueComparison > 0)
{
/* value bigger than entire range */
lowerBoundIndex = middleIndex + 1;
continue;
}
/* found interval containing partitionValue */
return middleIndex;
}
Assert(lowerBoundIndex == upperBoundIndex);
/*
* If we get here, none of the ShardIntervals exactly contain the value
* (we'd have hit the return middleIndex; case otherwise). Figure out
* whether there's possibly any interval containing a value that's bigger
* than the partition key one.
*/
if (lowerBoundIndex == 0)
{
/* all intervals are bigger, thus return 0 */
return 0;
}
else if (lowerBoundIndex == shardCount)
{
/* partition value is bigger than all partition values */
return INVALID_SHARD_INDEX;
}
/*
* The value falls in between two intervals. lowerBoundIndex already points
* at the first interval whose minimum value exceeds the partition value,
* so it is the lower bound of the range of shards that may match.
*/
return lowerBoundIndex;
}
/*
* UpperShardBoundary returns the index of the last shard interval that may
* contain values <= (if includeMin) or < partitionColumnValue, i.e. the
* upper end of the range of shards that have to be scanned.
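*
* For example, given intervals [0, 9], [10, 19] and [20, 29], a value of
* 15 yields index 1, while a value of -5 yields INVALID_SHARD_INDEX.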
*/
static int
UpperShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCache,
int shardCount, FunctionCallInfo compareFunction, bool includeMin)
{
int lowerBoundIndex = 0;
int upperBoundIndex = shardCount;
Assert(shardCount != 0);
/* setup partitionColumnValue argument once */
fcSetArg(compareFunction, 0, partitionColumnValue);
while (lowerBoundIndex < upperBoundIndex)
{
int middleIndex = lowerBoundIndex + ((upperBoundIndex - lowerBoundIndex) / 2);
/* setup minValue as argument */
fcSetArg(compareFunction, 1, shardIntervalCache[middleIndex]->minValue);
/* execute cmp(partitionValue, lowerBound) */
int minValueComparison = PerformCompare(compareFunction);
/* and evaluate results */
if ((minValueComparison == 0 && !includeMin) ||
minValueComparison < 0)
{
/* value smaller than entire range */
upperBoundIndex = middleIndex;
continue;
}
/* setup maxValue as argument */
fcSetArg(compareFunction, 1, shardIntervalCache[middleIndex]->maxValue);
/* execute cmp(partitionValue, upperBound) */
int maxValueComparison = PerformCompare(compareFunction);
if (maxValueComparison > 0)
{
/* value bigger than entire range */
lowerBoundIndex = middleIndex + 1;
continue;
}
/* found interval containing partitionValue */
return middleIndex;
}
Assert(lowerBoundIndex == upperBoundIndex);
/*
* If we get here, none of the ShardIntervals exactly contain the value
* (we'd have hit the return middleIndex; case otherwise). Figure out
* whether there's possibly any interval containing a value that's smaller
* than the partition key one.
*/
if (upperBoundIndex == shardCount)
{
/* all intervals are smaller, thus return the last shard index */
return shardCount - 1;
}
else if (upperBoundIndex == 0)
{
/* partition value is smaller than all partition values */
return INVALID_SHARD_INDEX;
}
/* value falls in between intervals, return the interval one smaller as the bound */
return upperBoundIndex - 1;
}
/*
* PruneWithBoundaries searches for shards that match inequality constraints,
* using binary search on both the upper and lower boundary, and returns a
* list of surviving shards.
*/
static List *
PruneWithBoundaries(DistTableCacheEntry *cacheEntry, ClauseWalkerContext *context,
PruningInstance *prune)
{
List *remainingShardList = NIL;
int shardCount = cacheEntry->shardIntervalArrayLength;
ShardInterval **sortedShardIntervalArray = cacheEntry->sortedShardIntervalArray;
bool hasLowerBound = false;
bool hasUpperBound = false;
Datum lowerBound = 0;
Datum upperBound = 0;
bool lowerBoundInclusive = false;
bool upperBoundInclusive = false;
int lowerBoundIdx = -1;
int upperBoundIdx = -1;
FunctionCallInfo compareFunctionCall = (FunctionCallInfo) &
context->compareIntervalFunctionCall;
if (prune->greaterEqualConsts)
{
lowerBound = prune->greaterEqualConsts->constvalue;
lowerBoundInclusive = true;
hasLowerBound = true;
}
if (prune->greaterConsts)
{
/*
* Use the more restrictive one, if both greater and greaterEqual
* constraints are specified.
*/
if (!hasLowerBound ||
PerformValueCompare(compareFunctionCall,
prune->greaterConsts->constvalue,
lowerBound) >= 0)
{
lowerBound = prune->greaterConsts->constvalue;
lowerBoundInclusive = false;
hasLowerBound = true;
}
}
if (prune->lessEqualConsts)
{
upperBound = prune->lessEqualConsts->constvalue;
upperBoundInclusive = true;
hasUpperBound = true;
}
if (prune->lessConsts)
{
/*
* Use the more restrictive one, if both less and lessEqual
* constraints are specified.
*/
if (!hasUpperBound ||
PerformValueCompare(compareFunctionCall,
prune->lessConsts->constvalue,
upperBound) <= 0)
{
upperBound = prune->lessConsts->constvalue;
upperBoundInclusive = false;
hasUpperBound = true;
}
}
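/*
* At this point a restriction such as a > 5 AND a <= 20 has been folded
* into lowerBound = 5 (exclusive) and upperBound = 20 (inclusive).
*/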
Assert(hasLowerBound || hasUpperBound);
/* find lower bound */
if (hasLowerBound)
{
lowerBoundIdx = LowerShardBoundary(lowerBound, sortedShardIntervalArray,
shardCount, compareFunctionCall,
lowerBoundInclusive);
}
else
{
lowerBoundIdx = 0;
}
/* find upper bound */
if (hasUpperBound)
{
upperBoundIdx = UpperShardBoundary(upperBound, sortedShardIntervalArray,
shardCount, compareFunctionCall,
upperBoundInclusive);
}
else
{
upperBoundIdx = shardCount - 1;
}
if (lowerBoundIdx == INVALID_SHARD_INDEX)
{
return NIL;
}
else if (upperBoundIdx == INVALID_SHARD_INDEX)
{
return NIL;
}
/*
* Build a list of all shards within the computed boundary range (possibly empty).
*/
for (int curIdx = lowerBoundIdx; curIdx <= upperBoundIdx; curIdx++)
{
remainingShardList = lappend(remainingShardList,
sortedShardIntervalArray[curIdx]);
}
return remainingShardList;
}
/*
* ExhaustivePrune returns a list of shards matching the PruningInstance's
* constraints, by simply checking them against each individual shard.
*/
static List *
ExhaustivePrune(DistTableCacheEntry *cacheEntry, ClauseWalkerContext *context,
PruningInstance *prune)
{
List *remainingShardList = NIL;
int shardCount = cacheEntry->shardIntervalArrayLength;
ShardInterval **sortedShardIntervalArray = cacheEntry->sortedShardIntervalArray;
for (int curIdx = 0; curIdx < shardCount; curIdx++)
{
ShardInterval *curInterval = sortedShardIntervalArray[curIdx];
if (!ExhaustivePruneOne(curInterval, context, prune))
{
remainingShardList = lappend(remainingShardList, curInterval);
}
}
return remainingShardList;
}
/*
* ExhaustivePruneOne returns true if curInterval is pruned away, false
* otherwise.
*/
static bool
ExhaustivePruneOne(ShardInterval *curInterval,
ClauseWalkerContext *context,
PruningInstance *prune)
{
FunctionCallInfo compareFunctionCall = (FunctionCallInfo) &
context->compareIntervalFunctionCall;
Datum compareWith = 0;
/* NULL boundaries can't be compared to */
if (!curInterval->minValueExists || !curInterval->maxValueExists)
{
return false;
}
if (prune->equalConsts)
{
compareWith = prune->equalConsts->constvalue;
if (PerformValueCompare(compareFunctionCall,
compareWith,
curInterval->minValue) < 0)
{
return true;
}
if (PerformValueCompare(compareFunctionCall,
compareWith,
curInterval->maxValue) > 0)
{
return true;
}
}
if (prune->greaterEqualConsts)
{
compareWith = prune->greaterEqualConsts->constvalue;
if (PerformValueCompare(compareFunctionCall,
curInterval->maxValue,
compareWith) < 0)
{
return true;
}
}
if (prune->greaterConsts)
{
compareWith = prune->greaterConsts->constvalue;
if (PerformValueCompare(compareFunctionCall,
curInterval->maxValue,
compareWith) <= 0)
{
return true;
}
}
if (prune->lessEqualConsts)
{
compareWith = prune->lessEqualConsts->constvalue;
if (PerformValueCompare(compareFunctionCall,
curInterval->minValue,
compareWith) > 0)
{
return true;
}
}
if (prune->lessConsts)
{
compareWith = prune->lessConsts->constvalue;
if (PerformValueCompare(compareFunctionCall,
curInterval->minValue,
compareWith) >= 0)
{
return true;
}
}
return false;
}