mirror of https://github.com/citusdata/citus.git
Add select distinct, and distinct on support
Distinct, and distinct on() clauses are supported in simple selects, joins, subqueries, and insert into select queries.pull/1701/head
parent
6879f92e23
commit
f7ab901766
|
@ -30,6 +30,7 @@
|
||||||
#include "optimizer/clauses.h"
|
#include "optimizer/clauses.h"
|
||||||
#include "optimizer/planner.h"
|
#include "optimizer/planner.h"
|
||||||
#include "optimizer/restrictinfo.h"
|
#include "optimizer/restrictinfo.h"
|
||||||
|
#include "optimizer/tlist.h"
|
||||||
#include "optimizer/var.h"
|
#include "optimizer/var.h"
|
||||||
#include "parser/parsetree.h"
|
#include "parser/parsetree.h"
|
||||||
#include "parser/parse_coerce.h"
|
#include "parser/parse_coerce.h"
|
||||||
|
@ -53,6 +54,7 @@ static DeferredErrorMessage * DistributedInsertSelectSupported(Query *queryTree,
|
||||||
RangeTblEntry *subqueryRte,
|
RangeTblEntry *subqueryRte,
|
||||||
bool allReferenceTables);
|
bool allReferenceTables);
|
||||||
static DeferredErrorMessage * MultiTaskRouterSelectQuerySupported(Query *query);
|
static DeferredErrorMessage * MultiTaskRouterSelectQuerySupported(Query *query);
|
||||||
|
static bool HasUnsupportedDistinctOn(Query *query);
|
||||||
static DeferredErrorMessage * InsertPartitionColumnMatchesSelect(Query *query,
|
static DeferredErrorMessage * InsertPartitionColumnMatchesSelect(Query *query,
|
||||||
RangeTblEntry *insertRte,
|
RangeTblEntry *insertRte,
|
||||||
RangeTblEntry *
|
RangeTblEntry *
|
||||||
|
@ -777,6 +779,7 @@ MultiTaskRouterSelectQuerySupported(Query *query)
|
||||||
List *queryList = NIL;
|
List *queryList = NIL;
|
||||||
ListCell *queryCell = NULL;
|
ListCell *queryCell = NULL;
|
||||||
StringInfo errorDetail = NULL;
|
StringInfo errorDetail = NULL;
|
||||||
|
bool hasUnsupportedDistinctOn = false;
|
||||||
|
|
||||||
ExtractQueryWalker((Node *) query, &queryList);
|
ExtractQueryWalker((Node *) query, &queryList);
|
||||||
foreach(queryCell, queryList)
|
foreach(queryCell, queryList)
|
||||||
|
@ -865,14 +868,14 @@ MultiTaskRouterSelectQuerySupported(Query *query)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We cannot support DISTINCT ON clauses since it could be on a non-partition column.
|
* We don't support DISTINCT ON clauses on non-partition columns.
|
||||||
* In that case, there is no way that Citus can support this.
|
|
||||||
*/
|
*/
|
||||||
if (subquery->hasDistinctOn)
|
hasUnsupportedDistinctOn = HasUnsupportedDistinctOn(subquery);
|
||||||
|
if (hasUnsupportedDistinctOn)
|
||||||
{
|
{
|
||||||
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
|
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
|
||||||
"DISTINCT ON clauses are not allowed in distributed "
|
"DISTINCT ON (non-partition column) clauses are not "
|
||||||
"INSERT ... SELECT queries",
|
"allowed in distributed INSERT ... SELECT queries",
|
||||||
NULL, NULL);
|
NULL, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -881,6 +884,36 @@ MultiTaskRouterSelectQuerySupported(Query *query)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* HasUnsupportedDistinctOn returns true if the query has distinct on and
|
||||||
|
* distinct targets do not contain partition column.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
HasUnsupportedDistinctOn(Query *query)
|
||||||
|
{
|
||||||
|
ListCell *distinctCell = NULL;
|
||||||
|
|
||||||
|
if (!query->hasDistinctOn)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach(distinctCell, query->distinctClause)
|
||||||
|
{
|
||||||
|
SortGroupClause *distinctClause = lfirst(distinctCell);
|
||||||
|
TargetEntry *distinctEntry = get_sortgroupclause_tle(distinctClause,
|
||||||
|
query->targetList);
|
||||||
|
|
||||||
|
if (IsPartitionColumn(distinctEntry->expr, query))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* InsertPartitionColumnMatchesSelect returns NULL the partition column in the
|
* InsertPartitionColumnMatchesSelect returns NULL the partition column in the
|
||||||
* table targeted by INSERTed matches with the any of the SELECTed table's
|
* table targeted by INSERTed matches with the any of the SELECTed table's
|
||||||
|
|
|
@ -1298,6 +1298,8 @@ MasterExtendedOpNode(MultiExtendedOp *originalOpNode)
|
||||||
masterExtendedOpNode->targetList = newTargetEntryList;
|
masterExtendedOpNode->targetList = newTargetEntryList;
|
||||||
masterExtendedOpNode->groupClauseList = originalOpNode->groupClauseList;
|
masterExtendedOpNode->groupClauseList = originalOpNode->groupClauseList;
|
||||||
masterExtendedOpNode->sortClauseList = originalOpNode->sortClauseList;
|
masterExtendedOpNode->sortClauseList = originalOpNode->sortClauseList;
|
||||||
|
masterExtendedOpNode->distinctClause = originalOpNode->distinctClause;
|
||||||
|
masterExtendedOpNode->hasDistinctOn = originalOpNode->hasDistinctOn;
|
||||||
masterExtendedOpNode->limitCount = originalOpNode->limitCount;
|
masterExtendedOpNode->limitCount = originalOpNode->limitCount;
|
||||||
masterExtendedOpNode->limitOffset = originalOpNode->limitOffset;
|
masterExtendedOpNode->limitOffset = originalOpNode->limitOffset;
|
||||||
masterExtendedOpNode->havingQual = newHavingQual;
|
masterExtendedOpNode->havingQual = newHavingQual;
|
||||||
|
@ -1787,6 +1789,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
|
||||||
WorkerAggregateWalkerContext *walkerContext =
|
WorkerAggregateWalkerContext *walkerContext =
|
||||||
palloc0(sizeof(WorkerAggregateWalkerContext));
|
palloc0(sizeof(WorkerAggregateWalkerContext));
|
||||||
Index nextSortGroupRefIndex = 0;
|
Index nextSortGroupRefIndex = 0;
|
||||||
|
bool queryHasAggregates = false;
|
||||||
|
|
||||||
walkerContext->repartitionSubquery = false;
|
walkerContext->repartitionSubquery = false;
|
||||||
walkerContext->expressionList = NIL;
|
walkerContext->expressionList = NIL;
|
||||||
|
@ -1826,6 +1829,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
|
||||||
WorkerAggregateWalker((Node *) originalExpression, walkerContext);
|
WorkerAggregateWalker((Node *) originalExpression, walkerContext);
|
||||||
|
|
||||||
newExpressionList = walkerContext->expressionList;
|
newExpressionList = walkerContext->expressionList;
|
||||||
|
queryHasAggregates = true;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1924,6 +1928,15 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode,
|
||||||
|
|
||||||
workerExtendedOpNode = CitusMakeNode(MultiExtendedOp);
|
workerExtendedOpNode = CitusMakeNode(MultiExtendedOp);
|
||||||
workerExtendedOpNode->targetList = newTargetEntryList;
|
workerExtendedOpNode->targetList = newTargetEntryList;
|
||||||
|
workerExtendedOpNode->distinctClause = NIL;
|
||||||
|
workerExtendedOpNode->hasDistinctOn = false;
|
||||||
|
|
||||||
|
if (!queryHasAggregates)
|
||||||
|
{
|
||||||
|
workerExtendedOpNode->distinctClause = originalOpNode->distinctClause;
|
||||||
|
workerExtendedOpNode->hasDistinctOn = originalOpNode->hasDistinctOn;
|
||||||
|
}
|
||||||
|
|
||||||
workerExtendedOpNode->groupClauseList = groupClauseList;
|
workerExtendedOpNode->groupClauseList = groupClauseList;
|
||||||
|
|
||||||
/* if we can push down the limit, also set related fields */
|
/* if we can push down the limit, also set related fields */
|
||||||
|
@ -3356,10 +3369,10 @@ WorkerLimitCount(MultiExtendedOp *originalOpNode,
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* WorkerSortClauseList first checks if the given extended node contains a limit
|
* WorkerSortClauseList first checks if the given extended node contains a limit
|
||||||
* that can be pushed down. If it does, the function then checks if we need to
|
* or hasDistinctOn that can be pushed down. If it does, the function then
|
||||||
* add any sorting and grouping clauses to the sort list we push down for the
|
* checks if we need to add any sorting and grouping clauses to the sort list we
|
||||||
* limit. If we do, the function adds these clauses and returns them. Otherwise,
|
* push down for the limit. If we do, the function adds these clauses and
|
||||||
* the function returns null.
|
* returns them. Otherwise, the function returns null.
|
||||||
*/
|
*/
|
||||||
static List *
|
static List *
|
||||||
WorkerSortClauseList(MultiExtendedOp *originalOpNode,
|
WorkerSortClauseList(MultiExtendedOp *originalOpNode,
|
||||||
|
@ -3370,8 +3383,8 @@ WorkerSortClauseList(MultiExtendedOp *originalOpNode,
|
||||||
List *sortClauseList = originalOpNode->sortClauseList;
|
List *sortClauseList = originalOpNode->sortClauseList;
|
||||||
List *targetList = originalOpNode->targetList;
|
List *targetList = originalOpNode->targetList;
|
||||||
|
|
||||||
/* if no limit node, no need to push down sort clauses */
|
/* if no limit node and no hasDistinctOn, no need to push down sort clauses */
|
||||||
if (originalOpNode->limitCount == NULL)
|
if (originalOpNode->limitCount == NULL && !originalOpNode->hasDistinctOn)
|
||||||
{
|
{
|
||||||
return NIL;
|
return NIL;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1805,13 +1805,6 @@ ErrorIfQueryNotSupported(Query *queryTree)
|
||||||
errorHint = filterHint;
|
errorHint = filterHint;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (queryTree->distinctClause)
|
|
||||||
{
|
|
||||||
preconditionsSatisfied = false;
|
|
||||||
errorMessage = "could not run distributed query with DISTINCT clause";
|
|
||||||
errorHint = filterHint;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (queryTree->groupingSets)
|
if (queryTree->groupingSets)
|
||||||
{
|
{
|
||||||
preconditionsSatisfied = false;
|
preconditionsSatisfied = false;
|
||||||
|
@ -2850,6 +2843,8 @@ MultiExtendedOpNode(Query *queryTree)
|
||||||
extendedOpNode->limitCount = queryTree->limitCount;
|
extendedOpNode->limitCount = queryTree->limitCount;
|
||||||
extendedOpNode->limitOffset = queryTree->limitOffset;
|
extendedOpNode->limitOffset = queryTree->limitOffset;
|
||||||
extendedOpNode->havingQual = queryTree->havingQual;
|
extendedOpNode->havingQual = queryTree->havingQual;
|
||||||
|
extendedOpNode->distinctClause = queryTree->distinctClause;
|
||||||
|
extendedOpNode->hasDistinctOn = queryTree->hasDistinctOn;
|
||||||
|
|
||||||
return extendedOpNode;
|
return extendedOpNode;
|
||||||
}
|
}
|
||||||
|
|
|
@ -151,6 +151,73 @@ BuildAggregatePlan(Query *masterQuery, Plan *subPlan)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BuildDistinctPlan creates an returns a plan for distinct. Depending on
|
||||||
|
* availability of hash function it chooses HashAgg over Sort/Unique
|
||||||
|
* plans.
|
||||||
|
* This function has a potential performance issue since we blindly set
|
||||||
|
* Plan nodes without looking at cost. We might need to revisit this
|
||||||
|
* if we have performance issues with select distinct queries.
|
||||||
|
*/
|
||||||
|
static Plan *
|
||||||
|
BuildDistinctPlan(Query *masterQuery, Plan *subPlan)
|
||||||
|
{
|
||||||
|
Plan *distinctPlan = NULL;
|
||||||
|
bool distinctClausesHashable = true;
|
||||||
|
List *distinctClauseList = masterQuery->distinctClause;
|
||||||
|
List *targetList = copyObject(masterQuery->targetList);
|
||||||
|
List *columnList = pull_var_clause_default((Node *) targetList);
|
||||||
|
ListCell *columnCell = NULL;
|
||||||
|
|
||||||
|
if (IsA(subPlan, Agg))
|
||||||
|
{
|
||||||
|
return subPlan;
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert(masterQuery->distinctClause);
|
||||||
|
Assert(!masterQuery->hasDistinctOn);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For upper level plans above the sequential scan, the planner expects the
|
||||||
|
* table id (varno) to be set to OUTER_VAR.
|
||||||
|
*/
|
||||||
|
foreach(columnCell, columnList)
|
||||||
|
{
|
||||||
|
Var *column = (Var *) lfirst(columnCell);
|
||||||
|
column->varno = OUTER_VAR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create group by plan with HashAggregate if all distinct
|
||||||
|
* members are hashable, Otherwise create sort+unique plan.
|
||||||
|
*/
|
||||||
|
distinctClausesHashable = grouping_is_hashable(distinctClauseList);
|
||||||
|
if (distinctClausesHashable)
|
||||||
|
{
|
||||||
|
const long rowEstimate = 10; /* using the same value as BuildAggregatePlan() */
|
||||||
|
AttrNumber *distinctColumnIdArray = extract_grouping_cols(distinctClauseList,
|
||||||
|
subPlan->targetlist);
|
||||||
|
Oid *distinctColumnOpArray = extract_grouping_ops(distinctClauseList);
|
||||||
|
uint32 distinctClauseCount = list_length(distinctClauseList);
|
||||||
|
|
||||||
|
distinctPlan = (Plan *) make_agg(targetList, NIL, AGG_HASHED,
|
||||||
|
AGGSPLIT_SIMPLE, distinctClauseCount,
|
||||||
|
distinctColumnIdArray,
|
||||||
|
distinctColumnOpArray, NIL, NIL,
|
||||||
|
rowEstimate, subPlan);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Sort *sortPlan = make_sort_from_sortclauses(masterQuery->distinctClause,
|
||||||
|
subPlan);
|
||||||
|
distinctPlan = (Plan *) make_unique_from_sortclauses((Plan *) sortPlan,
|
||||||
|
masterQuery->distinctClause);
|
||||||
|
}
|
||||||
|
|
||||||
|
return distinctPlan;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* BuildSelectStatement builds the final select statement to run on the master
|
* BuildSelectStatement builds the final select statement to run on the master
|
||||||
* node, before returning results to the user. The function first gets the custom
|
* node, before returning results to the user. The function first gets the custom
|
||||||
|
@ -166,6 +233,7 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem
|
||||||
Plan *topLevelPlan = NULL;
|
Plan *topLevelPlan = NULL;
|
||||||
ListCell *targetEntryCell = NULL;
|
ListCell *targetEntryCell = NULL;
|
||||||
List *columnNameList = NULL;
|
List *columnNameList = NULL;
|
||||||
|
List *sortClauseList = copyObject(masterQuery->sortClause);
|
||||||
|
|
||||||
/* (1) make PlannedStmt and set basic information */
|
/* (1) make PlannedStmt and set basic information */
|
||||||
selectStatement = makeNode(PlannedStmt);
|
selectStatement = makeNode(PlannedStmt);
|
||||||
|
@ -203,10 +271,47 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem
|
||||||
topLevelPlan = &remoteScan->scan.plan;
|
topLevelPlan = &remoteScan->scan.plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* (3) add a sorting plan if needed */
|
/*
|
||||||
if (masterQuery->sortClause)
|
* (3) create distinct plan if needed.
|
||||||
|
*
|
||||||
|
* distinct on() requires sort + unique plans. Unique itself is not enough
|
||||||
|
* as it only compares the current value with previous one when checking
|
||||||
|
* uniqueness, thus ordering is necessary. If already has order by
|
||||||
|
* clause we append distinct clauses to the end of it. Postgresql requires
|
||||||
|
* that if both distinct on() and order by exists, ordering shall start
|
||||||
|
* on distinct clauses. Therefore we can safely append distinct clauses to
|
||||||
|
* the end of order by clauses. Although the same column may appear more
|
||||||
|
* than once in order by clauses, created plan uses only one instance, for
|
||||||
|
* example order by a,b,a,a,b,c is translated to equivalent order by a,b,c.
|
||||||
|
*
|
||||||
|
* If the query has distinct clause but not distinct on, we first create
|
||||||
|
* distinct plan that is either HashAggreate or Sort + Unique plans depending
|
||||||
|
* on hashable property of columns in distinct clause. If there is order by
|
||||||
|
* clause, it is handled after distinct planning.
|
||||||
|
*/
|
||||||
|
if (masterQuery->hasDistinctOn)
|
||||||
|
{
|
||||||
|
ListCell *distinctCell = NULL;
|
||||||
|
foreach(distinctCell, masterQuery->distinctClause)
|
||||||
|
{
|
||||||
|
SortGroupClause *singleDistinctClause = lfirst(distinctCell);
|
||||||
|
Index sortGroupRef = singleDistinctClause->tleSortGroupRef;
|
||||||
|
|
||||||
|
if (get_sortgroupref_clause_noerr(sortGroupRef, sortClauseList) == NULL)
|
||||||
|
{
|
||||||
|
sortClauseList = lappend(sortClauseList, singleDistinctClause);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (masterQuery->distinctClause)
|
||||||
|
{
|
||||||
|
Plan *distinctPlan = BuildDistinctPlan(masterQuery, topLevelPlan);
|
||||||
|
topLevelPlan = distinctPlan;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* (4) add a sorting plan if needed */
|
||||||
|
if (sortClauseList)
|
||||||
{
|
{
|
||||||
List *sortClauseList = masterQuery->sortClause;
|
|
||||||
Sort *sortPlan = make_sort_from_sortclauses(sortClauseList, topLevelPlan);
|
Sort *sortPlan = make_sort_from_sortclauses(sortClauseList, topLevelPlan);
|
||||||
|
|
||||||
/* just for reproducible costs between different PostgreSQL versions */
|
/* just for reproducible costs between different PostgreSQL versions */
|
||||||
|
@ -217,7 +322,20 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem
|
||||||
topLevelPlan = (Plan *) sortPlan;
|
topLevelPlan = (Plan *) sortPlan;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* (4) add a limit plan if needed */
|
/*
|
||||||
|
* (5) add a unique plan for distinctOn.
|
||||||
|
* If the query has distinct on we add a sort clause in step 3. Therefore
|
||||||
|
* Step 4 always creates a sort plan.
|
||||||
|
* */
|
||||||
|
if (masterQuery->hasDistinctOn)
|
||||||
|
{
|
||||||
|
Assert(IsA(topLevelPlan, Sort));
|
||||||
|
topLevelPlan =
|
||||||
|
(Plan *) make_unique_from_sortclauses(topLevelPlan,
|
||||||
|
masterQuery->distinctClause);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* (5) add a limit plan if needed */
|
||||||
if (masterQuery->limitCount || masterQuery->limitOffset)
|
if (masterQuery->limitCount || masterQuery->limitOffset)
|
||||||
{
|
{
|
||||||
Node *limitCount = masterQuery->limitCount;
|
Node *limitCount = masterQuery->limitCount;
|
||||||
|
@ -226,7 +344,7 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem
|
||||||
topLevelPlan = (Plan *) limitPlan;
|
topLevelPlan = (Plan *) limitPlan;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* (5) finally set our top level plan in the plan tree */
|
/* (6) finally set our top level plan in the plan tree */
|
||||||
selectStatement->planTree = topLevelPlan;
|
selectStatement->planTree = topLevelPlan;
|
||||||
|
|
||||||
return selectStatement;
|
return selectStatement;
|
||||||
|
|
|
@ -578,6 +578,8 @@ BuildJobQuery(MultiNode *multiNode, List *dependedJobList)
|
||||||
FromExpr *joinTree = NULL;
|
FromExpr *joinTree = NULL;
|
||||||
Node *joinRoot = NULL;
|
Node *joinRoot = NULL;
|
||||||
Node *havingQual = NULL;
|
Node *havingQual = NULL;
|
||||||
|
bool hasDistinctOn = false;
|
||||||
|
List *distinctClause = NIL;
|
||||||
|
|
||||||
/* we start building jobs from below the collect node */
|
/* we start building jobs from below the collect node */
|
||||||
Assert(!CitusIsA(multiNode, MultiCollect));
|
Assert(!CitusIsA(multiNode, MultiCollect));
|
||||||
|
@ -625,6 +627,8 @@ BuildJobQuery(MultiNode *multiNode, List *dependedJobList)
|
||||||
{
|
{
|
||||||
MultiExtendedOp *extendedOp = (MultiExtendedOp *) linitial(extendedOpNodeList);
|
MultiExtendedOp *extendedOp = (MultiExtendedOp *) linitial(extendedOpNodeList);
|
||||||
targetList = copyObject(extendedOp->targetList);
|
targetList = copyObject(extendedOp->targetList);
|
||||||
|
distinctClause = extendedOp->distinctClause;
|
||||||
|
hasDistinctOn = extendedOp->hasDistinctOn;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -699,6 +703,8 @@ BuildJobQuery(MultiNode *multiNode, List *dependedJobList)
|
||||||
jobQuery->limitCount = limitCount;
|
jobQuery->limitCount = limitCount;
|
||||||
jobQuery->havingQual = havingQual;
|
jobQuery->havingQual = havingQual;
|
||||||
jobQuery->hasAggs = contain_agg_clause((Node *) targetList);
|
jobQuery->hasAggs = contain_agg_clause((Node *) targetList);
|
||||||
|
jobQuery->distinctClause = distinctClause;
|
||||||
|
jobQuery->hasDistinctOn = hasDistinctOn;
|
||||||
|
|
||||||
return jobQuery;
|
return jobQuery;
|
||||||
}
|
}
|
||||||
|
@ -1405,6 +1411,9 @@ BuildSubqueryJobQuery(MultiNode *multiNode)
|
||||||
Node *limitCount = NULL;
|
Node *limitCount = NULL;
|
||||||
Node *limitOffset = NULL;
|
Node *limitOffset = NULL;
|
||||||
FromExpr *joinTree = NULL;
|
FromExpr *joinTree = NULL;
|
||||||
|
bool hasAggregates = false;
|
||||||
|
List *distinctClause = NIL;
|
||||||
|
bool hasDistinctOn = false;
|
||||||
|
|
||||||
/* we start building jobs from below the collect node */
|
/* we start building jobs from below the collect node */
|
||||||
Assert(!CitusIsA(multiNode, MultiCollect));
|
Assert(!CitusIsA(multiNode, MultiCollect));
|
||||||
|
@ -1450,6 +1459,8 @@ BuildSubqueryJobQuery(MultiNode *multiNode)
|
||||||
limitOffset = extendedOp->limitOffset;
|
limitOffset = extendedOp->limitOffset;
|
||||||
sortClauseList = extendedOp->sortClauseList;
|
sortClauseList = extendedOp->sortClauseList;
|
||||||
havingQual = extendedOp->havingQual;
|
havingQual = extendedOp->havingQual;
|
||||||
|
distinctClause = extendedOp->distinctClause;
|
||||||
|
hasDistinctOn = extendedOp->hasDistinctOn;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* build group clauses */
|
/* build group clauses */
|
||||||
|
@ -1458,6 +1469,19 @@ BuildSubqueryJobQuery(MultiNode *multiNode)
|
||||||
/* build the where clause list using select predicates */
|
/* build the where clause list using select predicates */
|
||||||
whereClauseList = QuerySelectClauseList(multiNode);
|
whereClauseList = QuerySelectClauseList(multiNode);
|
||||||
|
|
||||||
|
if (contain_agg_clause((Node *) targetList) ||
|
||||||
|
contain_agg_clause((Node *) havingQual))
|
||||||
|
{
|
||||||
|
hasAggregates = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* distinct is not send to worker query if there are top level aggregates */
|
||||||
|
if (hasAggregates)
|
||||||
|
{
|
||||||
|
hasDistinctOn = false;
|
||||||
|
distinctClause = NIL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build the From/Where construct. We keep the where-clause list implicitly
|
* Build the From/Where construct. We keep the where-clause list implicitly
|
||||||
* AND'd, since both partition and join pruning depends on the clauses being
|
* AND'd, since both partition and join pruning depends on the clauses being
|
||||||
|
@ -1480,8 +1504,9 @@ BuildSubqueryJobQuery(MultiNode *multiNode)
|
||||||
jobQuery->limitOffset = limitOffset;
|
jobQuery->limitOffset = limitOffset;
|
||||||
jobQuery->limitCount = limitCount;
|
jobQuery->limitCount = limitCount;
|
||||||
jobQuery->havingQual = havingQual;
|
jobQuery->havingQual = havingQual;
|
||||||
jobQuery->hasAggs = contain_agg_clause((Node *) targetList) ||
|
jobQuery->hasAggs = hasAggregates;
|
||||||
contain_agg_clause((Node *) havingQual);
|
jobQuery->hasDistinctOn = hasDistinctOn;
|
||||||
|
jobQuery->distinctClause = distinctClause;
|
||||||
|
|
||||||
return jobQuery;
|
return jobQuery;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* postgres_planning_function.c
|
||||||
|
* Includes planning routines copied from
|
||||||
|
* src/backend/optimizer/plan/createplan.c
|
||||||
|
*
|
||||||
|
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
|
*
|
||||||
|
* This needs to be closely in sync with the core code.
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include "distributed/multi_master_planner.h"
|
||||||
|
#include "nodes/plannodes.h"
|
||||||
|
#include "optimizer/tlist.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* make_unique_from_sortclauses creates and returns a unique node
|
||||||
|
* from provided distinct clause list.
|
||||||
|
* The functions is copied from postgresql from
|
||||||
|
* src/backend/optimizer/plan/createplan.c.
|
||||||
|
*
|
||||||
|
* distinctList is a list of SortGroupClauses, identifying the targetlist items
|
||||||
|
* that should be considered by the Unique filter. The input path must
|
||||||
|
* already be sorted accordingly.
|
||||||
|
*/
|
||||||
|
Unique *
|
||||||
|
make_unique_from_sortclauses(Plan *lefttree, List *distinctList)
|
||||||
|
{
|
||||||
|
Unique *node = makeNode(Unique);
|
||||||
|
Plan *plan = &node->plan;
|
||||||
|
int numCols = list_length(distinctList);
|
||||||
|
int keyno = 0;
|
||||||
|
AttrNumber *uniqColIdx;
|
||||||
|
Oid *uniqOperators;
|
||||||
|
ListCell *slitem;
|
||||||
|
|
||||||
|
plan->targetlist = lefttree->targetlist;
|
||||||
|
plan->qual = NIL;
|
||||||
|
plan->lefttree = lefttree;
|
||||||
|
plan->righttree = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* convert SortGroupClause list into arrays of attr indexes and equality
|
||||||
|
* operators, as wanted by executor
|
||||||
|
*/
|
||||||
|
Assert(numCols > 0);
|
||||||
|
uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols);
|
||||||
|
uniqOperators = (Oid *) palloc(sizeof(Oid) * numCols);
|
||||||
|
|
||||||
|
foreach(slitem, distinctList)
|
||||||
|
{
|
||||||
|
SortGroupClause *sortcl = (SortGroupClause *) lfirst(slitem);
|
||||||
|
TargetEntry *tle = get_sortgroupclause_tle(sortcl, plan->targetlist);
|
||||||
|
|
||||||
|
uniqColIdx[keyno] = tle->resno;
|
||||||
|
uniqOperators[keyno] = sortcl->eqop;
|
||||||
|
Assert(OidIsValid(uniqOperators[keyno]));
|
||||||
|
keyno++;
|
||||||
|
}
|
||||||
|
|
||||||
|
node->numCols = numCols;
|
||||||
|
node->uniqColIdx = uniqColIdx;
|
||||||
|
node->uniqOperators = uniqOperators;
|
||||||
|
|
||||||
|
return node;
|
||||||
|
}
|
|
@ -292,6 +292,8 @@ OutMultiExtendedOp(OUTFUNC_ARGS)
|
||||||
WRITE_NODE_FIELD(limitCount);
|
WRITE_NODE_FIELD(limitCount);
|
||||||
WRITE_NODE_FIELD(limitOffset);
|
WRITE_NODE_FIELD(limitOffset);
|
||||||
WRITE_NODE_FIELD(havingQual);
|
WRITE_NODE_FIELD(havingQual);
|
||||||
|
WRITE_BOOL_FIELD(hasDistinctOn);
|
||||||
|
WRITE_NODE_FIELD(distinctClause);
|
||||||
|
|
||||||
OutMultiUnaryNodeFields(str, (const MultiUnaryNode *) node);
|
OutMultiUnaryNodeFields(str, (const MultiUnaryNode *) node);
|
||||||
}
|
}
|
||||||
|
|
|
@ -173,6 +173,8 @@ typedef struct MultiExtendedOp
|
||||||
Node *limitCount;
|
Node *limitCount;
|
||||||
Node *limitOffset;
|
Node *limitOffset;
|
||||||
Node *havingQual;
|
Node *havingQual;
|
||||||
|
List *distinctClause;
|
||||||
|
bool hasDistinctOn;
|
||||||
} MultiExtendedOp;
|
} MultiExtendedOp;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ struct MultiPlan;
|
||||||
struct CustomScan;
|
struct CustomScan;
|
||||||
extern PlannedStmt * MasterNodeSelectPlan(struct MultiPlan *multiPlan,
|
extern PlannedStmt * MasterNodeSelectPlan(struct MultiPlan *multiPlan,
|
||||||
struct CustomScan *dataScan);
|
struct CustomScan *dataScan);
|
||||||
|
extern Unique * make_unique_from_sortclauses(Plan *lefttree, List *distinctList);
|
||||||
|
|
||||||
|
|
||||||
#endif /* MULTI_MASTER_PLANNER_H */
|
#endif /* MULTI_MASTER_PLANNER_H */
|
||||||
|
|
|
@ -454,4 +454,74 @@ SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results;
|
||||||
14371 | 101 | 50.5232064574490293
|
14371 | 101 | 50.5232064574490293
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
-- DISTINCT in the outer query and DISTINCT in the subquery
|
||||||
|
TRUNCATE agg_results;
|
||||||
|
INSERT INTO agg_results(user_id)
|
||||||
|
SELECT
|
||||||
|
DISTINCT users_ids.user_id
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50;
|
||||||
|
-- get some statistics from the aggregated results to ensure the results are correct
|
||||||
|
SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results;
|
||||||
|
count | count | avg
|
||||||
|
-------+-------+---------------------
|
||||||
|
27 | 27 | 54.0000000000000000
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT in the subquery
|
||||||
|
TRUNCATE agg_results;
|
||||||
|
INSERT INTO agg_results(user_id, value_1_agg, value_2_agg)
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results;
|
||||||
|
count | count | avg | avg
|
||||||
|
-------+-------+---------------------+---------------------
|
||||||
|
80 | 80 | 50.7875000000000000 | 10.0125000000000000
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT ON in the subquery
|
||||||
|
TRUNCATE agg_results;
|
||||||
|
INSERT INTO agg_results(user_id, value_1_agg, value_2_agg)
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT ON (user_id) user_id, value_2 FROM users_table ORDER BY 1,2) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 5000 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results;
|
||||||
|
count | count | avg | avg
|
||||||
|
-------+-------+---------------------+--------------------
|
||||||
|
27 | 27 | 54.0000000000000000 | 9.8518518518518519
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
|
|
@ -626,16 +626,60 @@ DEBUG: Plan is router executable
|
||||||
SET client_min_messages TO INFO;
|
SET client_min_messages TO INFO;
|
||||||
truncate agg_events;
|
truncate agg_events;
|
||||||
SET client_min_messages TO DEBUG2;
|
SET client_min_messages TO DEBUG2;
|
||||||
-- we do not support DISTINCT ON clauses
|
-- DISTINCT ON clauses are supported
|
||||||
|
-- distinct on(non-partition column)
|
||||||
|
-- values are pulled to master
|
||||||
INSERT INTO agg_events (value_1_agg, user_id)
|
INSERT INTO agg_events (value_1_agg, user_id)
|
||||||
SELECT
|
SELECT
|
||||||
DISTINCT ON (value_1) value_1, user_id
|
DISTINCT ON (value_1) value_1, user_id
|
||||||
FROM
|
FROM
|
||||||
raw_events_first;
|
raw_events_first;
|
||||||
DEBUG: DISTINCT ON clauses are not allowed in distributed INSERT ... SELECT queries
|
DEBUG: DISTINCT ON (non-partition column) clauses are not allowed in distributed INSERT ... SELECT queries
|
||||||
DEBUG: Collecting INSERT ... SELECT results on coordinator
|
DEBUG: Collecting INSERT ... SELECT results on coordinator
|
||||||
ERROR: could not run distributed query with DISTINCT clause
|
SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2;
|
||||||
HINT: Consider using an equality filter on the distributed table's partition column.
|
user_id | value_1_agg
|
||||||
|
---------+-------------
|
||||||
|
1 | 10
|
||||||
|
2 | 20
|
||||||
|
3 | 30
|
||||||
|
4 | 40
|
||||||
|
5 | 50
|
||||||
|
6 | 60
|
||||||
|
7 |
|
||||||
|
8 | 80
|
||||||
|
9 | 90
|
||||||
|
(9 rows)
|
||||||
|
|
||||||
|
-- we don't want to see constraint vialotions, so truncate first
|
||||||
|
SET client_min_messages TO INFO;
|
||||||
|
truncate agg_events;
|
||||||
|
SET client_min_messages TO DEBUG2;
|
||||||
|
-- distinct on(partition column)
|
||||||
|
-- queries are forwared to workers
|
||||||
|
INSERT INTO agg_events (value_1_agg, user_id)
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (user_id) value_1, user_id
|
||||||
|
FROM
|
||||||
|
raw_events_first;
|
||||||
|
DEBUG: distributed statement: INSERT INTO public.agg_events_13300008 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300000 raw_events_first WHERE ((worker_hash(user_id) >= '-2147483648'::integer) AND (worker_hash(user_id) <= '-1073741825'::integer))
|
||||||
|
DEBUG: distributed statement: INSERT INTO public.agg_events_13300009 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300001 raw_events_first WHERE ((worker_hash(user_id) >= '-1073741824'::integer) AND (worker_hash(user_id) <= '-1'::integer))
|
||||||
|
DEBUG: distributed statement: INSERT INTO public.agg_events_13300010 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300002 raw_events_first WHERE ((worker_hash(user_id) >= 0) AND (worker_hash(user_id) <= 1073741823))
|
||||||
|
DEBUG: distributed statement: INSERT INTO public.agg_events_13300011 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300003 raw_events_first WHERE ((worker_hash(user_id) >= 1073741824) AND (worker_hash(user_id) <= 2147483647))
|
||||||
|
DEBUG: Plan is router executable
|
||||||
|
SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2;
|
||||||
|
user_id | value_1_agg
|
||||||
|
---------+-------------
|
||||||
|
1 | 10
|
||||||
|
2 | 20
|
||||||
|
3 | 30
|
||||||
|
4 | 40
|
||||||
|
5 | 50
|
||||||
|
6 | 60
|
||||||
|
7 |
|
||||||
|
8 | 80
|
||||||
|
9 | 90
|
||||||
|
(9 rows)
|
||||||
|
|
||||||
-- We do not support some CTEs
|
-- We do not support some CTEs
|
||||||
WITH fist_table_agg AS
|
WITH fist_table_agg AS
|
||||||
(SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id)
|
(SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id)
|
||||||
|
|
|
@ -0,0 +1,824 @@
|
||||||
|
--
|
||||||
|
-- MULTI_SELECT_DISTINCT
|
||||||
|
--
|
||||||
|
-- Tests select distinct, and select distinct on features.
|
||||||
|
--
|
||||||
|
-- function calls are supported
|
||||||
|
SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0;
|
||||||
|
l_orderkey | now
|
||||||
|
------------+-----
|
||||||
|
(0 rows)
|
||||||
|
|
||||||
|
SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3;
|
||||||
|
l_partkey | ?column?
|
||||||
|
-----------+----------
|
||||||
|
199973 | 1
|
||||||
|
199946 | 1
|
||||||
|
199943 | 1
|
||||||
|
(3 rows)
|
||||||
|
|
||||||
|
-- const expressions are supported
|
||||||
|
SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5;
|
||||||
|
l_orderkey | ?column?
|
||||||
|
------------+----------
|
||||||
|
1 | 2
|
||||||
|
2 | 2
|
||||||
|
3 | 2
|
||||||
|
4 | 2
|
||||||
|
5 | 2
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- non const expressions are also supported
|
||||||
|
SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5;
|
||||||
|
l_orderkey | ?column?
|
||||||
|
------------+----------
|
||||||
|
1 | 2133
|
||||||
|
1 | 15636
|
||||||
|
1 | 24028
|
||||||
|
1 | 63701
|
||||||
|
1 | 67311
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- column expressions are supported
|
||||||
|
SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5;
|
||||||
|
l_orderkey | ?column?
|
||||||
|
------------+----------------
|
||||||
|
32 | COLLECT CODAIR
|
||||||
|
39 | COLLECT CODAIR
|
||||||
|
66 | COLLECT CODAIR
|
||||||
|
70 | COLLECT CODAIR
|
||||||
|
98 | COLLECT CODAIR
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- function calls with const input are supported
|
||||||
|
SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5;
|
||||||
|
l_orderkey | strpos
|
||||||
|
------------+--------
|
||||||
|
1 | 1
|
||||||
|
2 | 1
|
||||||
|
3 | 1
|
||||||
|
4 | 1
|
||||||
|
5 | 1
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- function calls with non-const input are supported
|
||||||
|
SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I')
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE strpos(l_shipmode, 'I') > 1
|
||||||
|
ORDER BY 2, 1
|
||||||
|
LIMIT 5;
|
||||||
|
l_orderkey | strpos
|
||||||
|
------------+--------
|
||||||
|
1 | 2
|
||||||
|
3 | 2
|
||||||
|
5 | 2
|
||||||
|
32 | 2
|
||||||
|
33 | 2
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- distinct on partition column
|
||||||
|
-- verify counts match with respect to count(distinct)
|
||||||
|
CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part;
|
||||||
|
SELECT COUNT(*) FROM temp_orderkeys;
|
||||||
|
count
|
||||||
|
-------
|
||||||
|
2985
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part;
|
||||||
|
count
|
||||||
|
-------
|
||||||
|
2985
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1;
|
||||||
|
l_orderkey
|
||||||
|
------------
|
||||||
|
1
|
||||||
|
3
|
||||||
|
32
|
||||||
|
35
|
||||||
|
39
|
||||||
|
65
|
||||||
|
129
|
||||||
|
130
|
||||||
|
134
|
||||||
|
164
|
||||||
|
194
|
||||||
|
228
|
||||||
|
261
|
||||||
|
290
|
||||||
|
320
|
||||||
|
321
|
||||||
|
354
|
||||||
|
418
|
||||||
|
(18 rows)
|
||||||
|
|
||||||
|
-- distinct on non-partition column
|
||||||
|
SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1;
|
||||||
|
l_partkey
|
||||||
|
-----------
|
||||||
|
79251
|
||||||
|
94780
|
||||||
|
139636
|
||||||
|
145243
|
||||||
|
151894
|
||||||
|
157238
|
||||||
|
163073
|
||||||
|
182052
|
||||||
|
(8 rows)
|
||||||
|
|
||||||
|
SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC;
|
||||||
|
l_shipmode
|
||||||
|
------------
|
||||||
|
TRUCK
|
||||||
|
SHIP
|
||||||
|
REG AIR
|
||||||
|
RAIL
|
||||||
|
MAIL
|
||||||
|
FOB
|
||||||
|
AIR
|
||||||
|
(7 rows)
|
||||||
|
|
||||||
|
-- distinct with multiple columns
|
||||||
|
SELECT DISTINCT l_orderkey, o_orderdate
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE l_orderkey < 10
|
||||||
|
ORDER BY l_orderkey;
|
||||||
|
l_orderkey | o_orderdate
|
||||||
|
------------+-------------
|
||||||
|
1 | 01-02-1996
|
||||||
|
2 | 12-01-1996
|
||||||
|
3 | 10-14-1993
|
||||||
|
4 | 10-11-1995
|
||||||
|
5 | 07-30-1994
|
||||||
|
6 | 02-21-1992
|
||||||
|
7 | 01-10-1996
|
||||||
|
(7 rows)
|
||||||
|
|
||||||
|
-- distinct on partition column with aggregate
|
||||||
|
-- this is the same as the one without distinct due to group by
|
||||||
|
SELECT DISTINCT l_orderkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 200
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 5
|
||||||
|
ORDER BY 2 DESC, 1;
|
||||||
|
l_orderkey | count
|
||||||
|
------------+-------
|
||||||
|
7 | 7
|
||||||
|
68 | 7
|
||||||
|
129 | 7
|
||||||
|
164 | 7
|
||||||
|
194 | 7
|
||||||
|
1 | 6
|
||||||
|
3 | 6
|
||||||
|
32 | 6
|
||||||
|
35 | 6
|
||||||
|
39 | 6
|
||||||
|
67 | 6
|
||||||
|
69 | 6
|
||||||
|
70 | 6
|
||||||
|
71 | 6
|
||||||
|
134 | 6
|
||||||
|
135 | 6
|
||||||
|
163 | 6
|
||||||
|
192 | 6
|
||||||
|
197 | 6
|
||||||
|
(19 rows)
|
||||||
|
|
||||||
|
|
||||||
|
-- explain the query to see actual plan
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_orderkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 200
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 5
|
||||||
|
ORDER BY 2 DESC, 1;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
Sort
|
||||||
|
Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, remote_scan.l_orderkey
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: remote_scan.l_orderkey
|
||||||
|
Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 5)
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: l_orderkey
|
||||||
|
Filter: (count(*) > 5)
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
Filter: (l_orderkey < 200)
|
||||||
|
(15 rows)
|
||||||
|
|
||||||
|
-- distinct on non-partition column with aggregate
|
||||||
|
-- this is the same as non-distinct version due to group by
|
||||||
|
SELECT DISTINCT l_partkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 2
|
||||||
|
ORDER BY 1;
|
||||||
|
l_partkey | count
|
||||||
|
-----------+-------
|
||||||
|
1051 | 3
|
||||||
|
1927 | 3
|
||||||
|
6983 | 3
|
||||||
|
15283 | 3
|
||||||
|
87761 | 3
|
||||||
|
136884 | 3
|
||||||
|
149926 | 3
|
||||||
|
160895 | 3
|
||||||
|
177771 | 3
|
||||||
|
188804 | 3
|
||||||
|
199146 | 3
|
||||||
|
(11 rows)
|
||||||
|
|
||||||
|
|
||||||
|
-- explain the query to see actual plan
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_partkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 2
|
||||||
|
ORDER BY 1;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------
|
||||||
|
Sort
|
||||||
|
Sort Key: remote_scan.l_partkey
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: remote_scan.l_partkey
|
||||||
|
Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 2)
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: l_partkey
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
(13 rows)
|
||||||
|
|
||||||
|
-- distinct on non-partition column and avg
|
||||||
|
SELECT DISTINCT l_partkey, avg(l_linenumber)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_partkey < 500
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING avg(l_linenumber) > 2
|
||||||
|
ORDER BY 1;
|
||||||
|
l_partkey | avg
|
||||||
|
-----------+--------------------
|
||||||
|
18 | 7.0000000000000000
|
||||||
|
79 | 6.0000000000000000
|
||||||
|
149 | 4.5000000000000000
|
||||||
|
175 | 5.0000000000000000
|
||||||
|
179 | 6.0000000000000000
|
||||||
|
182 | 3.0000000000000000
|
||||||
|
222 | 4.0000000000000000
|
||||||
|
278 | 3.0000000000000000
|
||||||
|
299 | 7.0000000000000000
|
||||||
|
308 | 7.0000000000000000
|
||||||
|
309 | 5.0000000000000000
|
||||||
|
321 | 3.0000000000000000
|
||||||
|
337 | 6.0000000000000000
|
||||||
|
364 | 3.0000000000000000
|
||||||
|
403 | 4.0000000000000000
|
||||||
|
(15 rows)
|
||||||
|
|
||||||
|
-- distinct on multiple non-partition columns
|
||||||
|
SELECT DISTINCT l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_shipmode = 'AIR' AND l_orderkey < 100
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
l_partkey | l_suppkey
|
||||||
|
-----------+-----------
|
||||||
|
2132 | 4633
|
||||||
|
4297 | 1798
|
||||||
|
37531 | 35
|
||||||
|
44161 | 6666
|
||||||
|
44706 | 4707
|
||||||
|
67831 | 5350
|
||||||
|
85811 | 8320
|
||||||
|
94368 | 6878
|
||||||
|
108338 | 849
|
||||||
|
108570 | 8571
|
||||||
|
137267 | 4807
|
||||||
|
137469 | 9983
|
||||||
|
173489 | 3490
|
||||||
|
196156 | 1195
|
||||||
|
197921 | 441
|
||||||
|
(15 rows)
|
||||||
|
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_shipmode = 'AIR' AND l_orderkey < 100
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
QUERY PLAN
|
||||||
|
-----------------------------------------------------------------------------------------------------
|
||||||
|
Sort
|
||||||
|
Sort Key: remote_scan.l_partkey, remote_scan.l_suppkey
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: remote_scan.l_partkey, remote_scan.l_suppkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: l_partkey, l_suppkey
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
Filter: ((l_orderkey < 100) AND (l_shipmode = 'AIR'::bpchar))
|
||||||
|
(14 rows)
|
||||||
|
|
||||||
|
-- distinct on partition column
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 35
|
||||||
|
ORDER BY 1;
|
||||||
|
l_orderkey | l_partkey | l_suppkey
|
||||||
|
------------+-----------+-----------
|
||||||
|
1 | 155190 | 7706
|
||||||
|
2 | 106170 | 1191
|
||||||
|
3 | 4297 | 1798
|
||||||
|
4 | 88035 | 5560
|
||||||
|
5 | 108570 | 8571
|
||||||
|
6 | 139636 | 2150
|
||||||
|
7 | 182052 | 9607
|
||||||
|
32 | 82704 | 7721
|
||||||
|
33 | 61336 | 8855
|
||||||
|
34 | 88362 | 871
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 35
|
||||||
|
ORDER BY 1;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------
|
||||||
|
Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.l_orderkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: l_orderkey
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
Filter: (l_orderkey < 35)
|
||||||
|
(13 rows)
|
||||||
|
|
||||||
|
-- distinct on non-partition column
|
||||||
|
-- note order by is required here
|
||||||
|
-- otherwise query results will be different since
|
||||||
|
-- distinct on clause is on non-partition column
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 20;
|
||||||
|
l_partkey | l_orderkey
|
||||||
|
-----------+------------
|
||||||
|
18 | 12005
|
||||||
|
79 | 5121
|
||||||
|
91 | 2883
|
||||||
|
149 | 807
|
||||||
|
175 | 4102
|
||||||
|
179 | 2117
|
||||||
|
182 | 548
|
||||||
|
195 | 2528
|
||||||
|
204 | 10048
|
||||||
|
222 | 9413
|
||||||
|
245 | 9446
|
||||||
|
278 | 1287
|
||||||
|
299 | 1122
|
||||||
|
308 | 11137
|
||||||
|
309 | 2374
|
||||||
|
318 | 321
|
||||||
|
321 | 5984
|
||||||
|
337 | 10403
|
||||||
|
350 | 13698
|
||||||
|
358 | 4323
|
||||||
|
(20 rows)
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 20;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------
|
||||||
|
Limit
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Limit
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: l_partkey, l_orderkey
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
(14 rows)
|
||||||
|
|
||||||
|
-- distinct on with joins
|
||||||
|
-- each customer's first order key
|
||||||
|
SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15
|
||||||
|
ORDER BY 1,2;
|
||||||
|
o_custkey | l_orderkey
|
||||||
|
-----------+------------
|
||||||
|
1 | 9154
|
||||||
|
2 | 10563
|
||||||
|
4 | 320
|
||||||
|
5 | 11682
|
||||||
|
7 | 10402
|
||||||
|
8 | 102
|
||||||
|
10 | 1602
|
||||||
|
11 | 12800
|
||||||
|
13 | 994
|
||||||
|
14 | 11011
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15
|
||||||
|
ORDER BY 1,2;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------
|
||||||
|
Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: orders_hash_part.o_custkey, lineitem_hash_part.l_orderkey
|
||||||
|
-> Hash Join
|
||||||
|
Hash Cond: (lineitem_hash_part.l_orderkey = orders_hash_part.o_orderkey)
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
-> Hash
|
||||||
|
-> Seq Scan on orders_hash_part_360294 orders_hash_part
|
||||||
|
Filter: (o_custkey < 15)
|
||||||
|
(17 rows)
|
||||||
|
|
||||||
|
-- explain without order by
|
||||||
|
-- notice master plan has order by on distinct on column
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------
|
||||||
|
Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.o_custkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: orders_hash_part.o_custkey
|
||||||
|
-> Hash Join
|
||||||
|
Hash Cond: (lineitem_hash_part.l_orderkey = orders_hash_part.o_orderkey)
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
-> Hash
|
||||||
|
-> Seq Scan on orders_hash_part_360294 orders_hash_part
|
||||||
|
Filter: (o_custkey < 15)
|
||||||
|
(17 rows)
|
||||||
|
|
||||||
|
-- each customer's each order's first l_partkey
|
||||||
|
SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 20
|
||||||
|
ORDER BY 1,2,3;
|
||||||
|
o_custkey | l_orderkey | l_linenumber | l_partkey
|
||||||
|
-----------+------------+--------------+-----------
|
||||||
|
1 | 9154 | 1 | 86513
|
||||||
|
1 | 14656 | 1 | 59539
|
||||||
|
2 | 10563 | 1 | 147459
|
||||||
|
4 | 320 | 1 | 4415
|
||||||
|
4 | 739 | 1 | 84489
|
||||||
|
4 | 10688 | 1 | 45037
|
||||||
|
4 | 10788 | 1 | 50814
|
||||||
|
4 | 13728 | 1 | 86216
|
||||||
|
5 | 11682 | 1 | 31634
|
||||||
|
5 | 11746 | 1 | 180724
|
||||||
|
5 | 14308 | 1 | 157430
|
||||||
|
7 | 10402 | 1 | 53661
|
||||||
|
7 | 13031 | 1 | 112161
|
||||||
|
7 | 14145 | 1 | 138729
|
||||||
|
7 | 14404 | 1 | 143034
|
||||||
|
8 | 102 | 1 | 88914
|
||||||
|
8 | 164 | 1 | 91309
|
||||||
|
8 | 13601 | 1 | 40504
|
||||||
|
10 | 1602 | 1 | 182806
|
||||||
|
10 | 9862 | 1 | 86241
|
||||||
|
10 | 11431 | 1 | 62112
|
||||||
|
10 | 13124 | 1 | 29414
|
||||||
|
11 | 12800 | 1 | 152806
|
||||||
|
13 | 994 | 1 | 64486
|
||||||
|
13 | 1603 | 1 | 38191
|
||||||
|
13 | 4704 | 1 | 77934
|
||||||
|
13 | 9927 | 1 | 875
|
||||||
|
14 | 11011 | 1 | 172485
|
||||||
|
17 | 896 | 1 | 38675
|
||||||
|
17 | 5507 | 1 | 9600
|
||||||
|
19 | 353 | 1 | 119305
|
||||||
|
19 | 1504 | 1 | 81389
|
||||||
|
19 | 1669 | 1 | 78373
|
||||||
|
19 | 5893 | 1 | 133707
|
||||||
|
19 | 9954 | 1 | 92138
|
||||||
|
19 | 14885 | 1 | 36154
|
||||||
|
(36 rows)
|
||||||
|
|
||||||
|
-- explain without order by
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 20;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------
|
||||||
|
Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: orders_hash_part.o_custkey, lineitem_hash_part.l_orderkey
|
||||||
|
-> Hash Join
|
||||||
|
Hash Cond: (lineitem_hash_part.l_orderkey = orders_hash_part.o_orderkey)
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
-> Hash
|
||||||
|
-> Seq Scan on orders_hash_part_360294 orders_hash_part
|
||||||
|
Filter: (o_custkey < 20)
|
||||||
|
(17 rows)
|
||||||
|
|
||||||
|
-- each customer's each order's last l_partkey
|
||||||
|
SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15
|
||||||
|
ORDER BY 1,2,3 DESC;
|
||||||
|
o_custkey | l_orderkey | l_linenumber | l_partkey
|
||||||
|
-----------+------------+--------------+-----------
|
||||||
|
1 | 9154 | 7 | 173448
|
||||||
|
1 | 14656 | 1 | 59539
|
||||||
|
2 | 10563 | 4 | 110741
|
||||||
|
4 | 320 | 2 | 192158
|
||||||
|
4 | 739 | 5 | 187523
|
||||||
|
4 | 10688 | 2 | 132574
|
||||||
|
4 | 10788 | 4 | 196473
|
||||||
|
4 | 13728 | 3 | 12450
|
||||||
|
5 | 11682 | 3 | 177152
|
||||||
|
5 | 11746 | 7 | 193807
|
||||||
|
5 | 14308 | 3 | 140916
|
||||||
|
7 | 10402 | 2 | 64514
|
||||||
|
7 | 13031 | 6 | 7761
|
||||||
|
7 | 14145 | 6 | 130723
|
||||||
|
7 | 14404 | 7 | 35349
|
||||||
|
8 | 102 | 4 | 61158
|
||||||
|
8 | 164 | 7 | 3037
|
||||||
|
8 | 13601 | 5 | 12470
|
||||||
|
10 | 1602 | 1 | 182806
|
||||||
|
10 | 9862 | 5 | 135675
|
||||||
|
10 | 11431 | 7 | 8563
|
||||||
|
10 | 13124 | 3 | 67055
|
||||||
|
11 | 12800 | 5 | 179110
|
||||||
|
13 | 994 | 4 | 130471
|
||||||
|
13 | 1603 | 2 | 65209
|
||||||
|
13 | 4704 | 3 | 63081
|
||||||
|
13 | 9927 | 6 | 119356
|
||||||
|
14 | 11011 | 7 | 95939
|
||||||
|
(28 rows)
|
||||||
|
|
||||||
|
-- subqueries
|
||||||
|
SELECT DISTINCT l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
l_orderkey | l_partkey
|
||||||
|
------------+-----------
|
||||||
|
1 | 2132
|
||||||
|
1 | 15635
|
||||||
|
1 | 24027
|
||||||
|
1 | 63700
|
||||||
|
1 | 67310
|
||||||
|
1 | 155190
|
||||||
|
2 | 106170
|
||||||
|
3 | 4297
|
||||||
|
3 | 19036
|
||||||
|
3 | 29380
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------
|
||||||
|
Limit
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: remote_scan.l_orderkey, remote_scan.l_partkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Limit
|
||||||
|
-> Sort
|
||||||
|
Sort Key: l_orderkey, l_partkey
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: l_orderkey, l_partkey
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
(16 rows)
|
||||||
|
|
||||||
|
SELECT DISTINCT l_orderkey, cnt
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, count(*) as cnt
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
l_orderkey | cnt
|
||||||
|
------------+-----
|
||||||
|
1 | 6
|
||||||
|
2 | 1
|
||||||
|
3 | 6
|
||||||
|
4 | 1
|
||||||
|
5 | 3
|
||||||
|
6 | 1
|
||||||
|
7 | 7
|
||||||
|
32 | 6
|
||||||
|
33 | 4
|
||||||
|
34 | 3
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_orderkey, cnt
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, count(*) as cnt
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------
|
||||||
|
Limit
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.l_orderkey, remote_scan.cnt
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: remote_scan.l_orderkey, remote_scan.cnt
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Limit
|
||||||
|
-> Sort
|
||||||
|
Sort Key: lineitem_hash_part.l_orderkey, (count(*))
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: lineitem_hash_part.l_orderkey, count(*)
|
||||||
|
-> HashAggregate
|
||||||
|
Group Key: lineitem_hash_part.l_orderkey
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
(18 rows)
|
||||||
|
|
||||||
|
-- distinct on partition column
|
||||||
|
-- random() is added to inner query to prevent flattening
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
l_orderkey | l_partkey
|
||||||
|
------------+-----------
|
||||||
|
1 | 2132
|
||||||
|
2 | 106170
|
||||||
|
3 | 4297
|
||||||
|
4 | 88035
|
||||||
|
5 | 37531
|
||||||
|
6 | 139636
|
||||||
|
7 | 79251
|
||||||
|
32 | 2743
|
||||||
|
33 | 33918
|
||||||
|
34 | 88362
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------
|
||||||
|
Limit
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Limit
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: q.l_orderkey, q.l_partkey
|
||||||
|
-> Subquery Scan on q
|
||||||
|
Filter: (q.r > 1)
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
(16 rows)
|
||||||
|
|
||||||
|
-- distinct on non-partition column
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 2,1
|
||||||
|
LIMIT 10;
|
||||||
|
l_orderkey | l_partkey
|
||||||
|
------------+-----------
|
||||||
|
12005 | 18
|
||||||
|
5121 | 79
|
||||||
|
2883 | 91
|
||||||
|
807 | 149
|
||||||
|
4102 | 175
|
||||||
|
2117 | 179
|
||||||
|
548 | 182
|
||||||
|
2528 | 195
|
||||||
|
10048 | 204
|
||||||
|
9413 | 222
|
||||||
|
(10 rows)
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 2,1
|
||||||
|
LIMIT 10;
|
||||||
|
QUERY PLAN
|
||||||
|
----------------------------------------------------------------------------------------------------------------
|
||||||
|
Limit
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey
|
||||||
|
-> Custom Scan (Citus Real-Time)
|
||||||
|
Task Count: 4
|
||||||
|
Tasks Shown: One of 4
|
||||||
|
-> Task
|
||||||
|
Node: host=localhost port=57637 dbname=regression
|
||||||
|
-> Limit
|
||||||
|
-> Unique
|
||||||
|
-> Sort
|
||||||
|
Sort Key: q.l_partkey, q.l_orderkey
|
||||||
|
-> Subquery Scan on q
|
||||||
|
Filter: (q.r > 1)
|
||||||
|
-> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part
|
||||||
|
(16 rows)
|
||||||
|
|
|
@ -2071,6 +2071,83 @@ FROM
|
||||||
WHERE
|
WHERE
|
||||||
users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id);
|
users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id);
|
||||||
ERROR: unsupported clause type
|
ERROR: unsupported clause type
|
||||||
|
-- DISTINCT in the outer query and DISTINCT in the subquery
|
||||||
|
SELECT
|
||||||
|
DISTINCT users_ids.user_id
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50
|
||||||
|
ORDER BY 1
|
||||||
|
LIMIT 5;
|
||||||
|
user_id
|
||||||
|
---------
|
||||||
|
1
|
||||||
|
6
|
||||||
|
16
|
||||||
|
21
|
||||||
|
26
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT in the subquery
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50
|
||||||
|
ORDER BY 1, 2
|
||||||
|
LIMIT 5;
|
||||||
|
user_id | value_1 | prob
|
||||||
|
---------+---------+------------------------
|
||||||
|
1 | 6 | 0.50000000000000000000
|
||||||
|
2 | 2 | 0.50000000000000000000
|
||||||
|
4 | 3 | 0.50000000000000000000
|
||||||
|
6 | 3 | 0.50000000000000000000
|
||||||
|
7 | 2 | 0.50000000000000000000
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT ON in the subquery
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT ON (user_id) user_id, value_1 FROM users_table ORDER BY 1,2) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 25 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 5;
|
||||||
|
user_id | value_1 | prob
|
||||||
|
---------+---------+------------------------
|
||||||
|
1 | 6 | 0.50000000000000000000
|
||||||
|
6 | 3 | 0.50000000000000000000
|
||||||
|
16 | 4 | 0.50000000000000000000
|
||||||
|
21 | 0 | 0.50000000000000000000
|
||||||
|
26 | 5 | 0.50000000000000000000
|
||||||
|
(5 rows)
|
||||||
|
|
||||||
DROP FUNCTION test_join_function_2(integer, integer);
|
DROP FUNCTION test_join_function_2(integer, integer);
|
||||||
SET citus.enable_router_execution TO TRUE;
|
SET citus.enable_router_execution TO TRUE;
|
||||||
SET citus.subquery_pushdown to OFF;
|
SET citus.subquery_pushdown to OFF;
|
||||||
|
|
|
@ -143,7 +143,7 @@ test: multi_outer_join
|
||||||
# is independed from the rest of the group, it is added to increase parallelism.
|
# is independed from the rest of the group, it is added to increase parallelism.
|
||||||
# ---
|
# ---
|
||||||
test: multi_create_fdw
|
test: multi_create_fdw
|
||||||
test: multi_complex_count_distinct
|
test: multi_complex_count_distinct multi_select_distinct
|
||||||
test: multi_distribution_metadata
|
test: multi_distribution_metadata
|
||||||
test: multi_generate_ddl_commands
|
test: multi_generate_ddl_commands
|
||||||
test: multi_create_shards
|
test: multi_create_shards
|
||||||
|
|
|
@ -418,3 +418,67 @@ FROM
|
||||||
-- get some statistics from the aggregated results to ensure the results are correct
|
-- get some statistics from the aggregated results to ensure the results are correct
|
||||||
SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results;
|
SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results;
|
||||||
|
|
||||||
|
-- DISTINCT in the outer query and DISTINCT in the subquery
|
||||||
|
TRUNCATE agg_results;
|
||||||
|
|
||||||
|
INSERT INTO agg_results(user_id)
|
||||||
|
SELECT
|
||||||
|
DISTINCT users_ids.user_id
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50;
|
||||||
|
|
||||||
|
-- get some statistics from the aggregated results to ensure the results are correct
|
||||||
|
SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results;
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT in the subquery
|
||||||
|
TRUNCATE agg_results;
|
||||||
|
|
||||||
|
INSERT INTO agg_results(user_id, value_1_agg, value_2_agg)
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
|
||||||
|
SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results;
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT ON in the subquery
|
||||||
|
TRUNCATE agg_results;
|
||||||
|
|
||||||
|
INSERT INTO agg_results(user_id, value_1_agg, value_2_agg)
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT ON (user_id) user_id, value_2 FROM users_table ORDER BY 1,2) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 5000 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
|
||||||
|
SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results;
|
||||||
|
|
|
@ -478,13 +478,32 @@ SET client_min_messages TO INFO;
|
||||||
truncate agg_events;
|
truncate agg_events;
|
||||||
SET client_min_messages TO DEBUG2;
|
SET client_min_messages TO DEBUG2;
|
||||||
|
|
||||||
-- we do not support DISTINCT ON clauses
|
-- DISTINCT ON clauses are supported
|
||||||
|
-- distinct on(non-partition column)
|
||||||
|
-- values are pulled to master
|
||||||
INSERT INTO agg_events (value_1_agg, user_id)
|
INSERT INTO agg_events (value_1_agg, user_id)
|
||||||
SELECT
|
SELECT
|
||||||
DISTINCT ON (value_1) value_1, user_id
|
DISTINCT ON (value_1) value_1, user_id
|
||||||
FROM
|
FROM
|
||||||
raw_events_first;
|
raw_events_first;
|
||||||
|
|
||||||
|
SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2;
|
||||||
|
|
||||||
|
-- we don't want to see constraint vialotions, so truncate first
|
||||||
|
SET client_min_messages TO INFO;
|
||||||
|
truncate agg_events;
|
||||||
|
SET client_min_messages TO DEBUG2;
|
||||||
|
|
||||||
|
-- distinct on(partition column)
|
||||||
|
-- queries are forwared to workers
|
||||||
|
INSERT INTO agg_events (value_1_agg, user_id)
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (user_id) value_1, user_id
|
||||||
|
FROM
|
||||||
|
raw_events_first;
|
||||||
|
|
||||||
|
SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2;
|
||||||
|
|
||||||
-- We do not support some CTEs
|
-- We do not support some CTEs
|
||||||
WITH fist_table_agg AS
|
WITH fist_table_agg AS
|
||||||
(SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id)
|
(SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id)
|
||||||
|
|
|
@ -0,0 +1,245 @@
|
||||||
|
--
|
||||||
|
-- MULTI_SELECT_DISTINCT
|
||||||
|
--
|
||||||
|
-- Tests select distinct, and select distinct on features.
|
||||||
|
--
|
||||||
|
|
||||||
|
|
||||||
|
-- function calls are supported
|
||||||
|
SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0;
|
||||||
|
|
||||||
|
SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3;
|
||||||
|
|
||||||
|
-- const expressions are supported
|
||||||
|
SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5;
|
||||||
|
|
||||||
|
-- non const expressions are also supported
|
||||||
|
SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5;
|
||||||
|
|
||||||
|
-- column expressions are supported
|
||||||
|
SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5;
|
||||||
|
|
||||||
|
-- function calls with const input are supported
|
||||||
|
SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5;
|
||||||
|
|
||||||
|
-- function calls with non-const input are supported
|
||||||
|
SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I')
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE strpos(l_shipmode, 'I') > 1
|
||||||
|
ORDER BY 2, 1
|
||||||
|
LIMIT 5;
|
||||||
|
|
||||||
|
-- distinct on partition column
|
||||||
|
-- verify counts match with respect to count(distinct)
|
||||||
|
CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part;
|
||||||
|
SELECT COUNT(*) FROM temp_orderkeys;
|
||||||
|
SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part;
|
||||||
|
|
||||||
|
SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1;
|
||||||
|
|
||||||
|
-- distinct on non-partition column
|
||||||
|
SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1;
|
||||||
|
|
||||||
|
SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC;
|
||||||
|
|
||||||
|
-- distinct with multiple columns
|
||||||
|
SELECT DISTINCT l_orderkey, o_orderdate
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE l_orderkey < 10
|
||||||
|
ORDER BY l_orderkey;
|
||||||
|
|
||||||
|
-- distinct on partition column with aggregate
|
||||||
|
-- this is the same as the one without distinct due to group by
|
||||||
|
SELECT DISTINCT l_orderkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 200
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 5
|
||||||
|
ORDER BY 2 DESC, 1;
|
||||||
|
|
||||||
|
-- explain the query to see actual plan
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_orderkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 200
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 5
|
||||||
|
ORDER BY 2 DESC, 1;
|
||||||
|
|
||||||
|
-- distinct on non-partition column with aggregate
|
||||||
|
-- this is the same as non-distinct version due to group by
|
||||||
|
SELECT DISTINCT l_partkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 2
|
||||||
|
ORDER BY 1;
|
||||||
|
|
||||||
|
-- explain the query to see actual plan
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_partkey, count(*)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING count(*) > 2
|
||||||
|
ORDER BY 1;
|
||||||
|
|
||||||
|
-- distinct on non-partition column and avg
|
||||||
|
SELECT DISTINCT l_partkey, avg(l_linenumber)
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_partkey < 500
|
||||||
|
GROUP BY 1
|
||||||
|
HAVING avg(l_linenumber) > 2
|
||||||
|
ORDER BY 1;
|
||||||
|
|
||||||
|
-- distinct on multiple non-partition columns
|
||||||
|
SELECT DISTINCT l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_shipmode = 'AIR' AND l_orderkey < 100
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_shipmode = 'AIR' AND l_orderkey < 100
|
||||||
|
ORDER BY 1, 2;
|
||||||
|
|
||||||
|
-- distinct on partition column
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 35
|
||||||
|
ORDER BY 1;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
WHERE l_orderkey < 35
|
||||||
|
ORDER BY 1;
|
||||||
|
|
||||||
|
-- distinct on non-partition column
|
||||||
|
-- note order by is required here
|
||||||
|
-- otherwise query results will be different since
|
||||||
|
-- distinct on clause is on non-partition column
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 20;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 20;
|
||||||
|
|
||||||
|
-- distinct on with joins
|
||||||
|
-- each customer's first order key
|
||||||
|
SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15
|
||||||
|
ORDER BY 1,2;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15
|
||||||
|
ORDER BY 1,2;
|
||||||
|
|
||||||
|
-- explain without order by
|
||||||
|
-- notice master plan has order by on distinct on column
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15;
|
||||||
|
|
||||||
|
-- each customer's each order's first l_partkey
|
||||||
|
SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 20
|
||||||
|
ORDER BY 1,2,3;
|
||||||
|
|
||||||
|
-- explain without order by
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 20;
|
||||||
|
|
||||||
|
-- each customer's each order's last l_partkey
|
||||||
|
SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey
|
||||||
|
FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey)
|
||||||
|
WHERE o_custkey < 15
|
||||||
|
ORDER BY 1,2,3 DESC;
|
||||||
|
|
||||||
|
-- subqueries
|
||||||
|
SELECT DISTINCT l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
SELECT DISTINCT l_orderkey, cnt
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, count(*) as cnt
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT l_orderkey, cnt
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, count(*) as cnt
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
GROUP BY 1
|
||||||
|
) q
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
-- distinct on partition column
|
||||||
|
-- random() is added to inner query to prevent flattening
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
-- distinct on non-partition column
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 2,1
|
||||||
|
LIMIT 10;
|
||||||
|
|
||||||
|
EXPLAIN (COSTS FALSE)
|
||||||
|
SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey
|
||||||
|
FROM (
|
||||||
|
SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r
|
||||||
|
FROM lineitem_hash_part
|
||||||
|
) q
|
||||||
|
WHERE r > 1
|
||||||
|
ORDER BY 2,1
|
||||||
|
LIMIT 10;
|
|
@ -1659,8 +1659,60 @@ FROM
|
||||||
WHERE
|
WHERE
|
||||||
users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id);
|
users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id);
|
||||||
|
|
||||||
|
-- DISTINCT in the outer query and DISTINCT in the subquery
|
||||||
|
SELECT
|
||||||
|
DISTINCT users_ids.user_id
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50
|
||||||
|
ORDER BY 1
|
||||||
|
LIMIT 5;
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT in the subquery
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT user_id FROM users_table) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
WHERE temp.value_1 < 50
|
||||||
|
ORDER BY 1, 2
|
||||||
|
LIMIT 5;
|
||||||
|
|
||||||
|
-- DISTINCT ON in the outer query and DISTINCT ON in the subquery
|
||||||
|
SELECT
|
||||||
|
DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob
|
||||||
|
FROM
|
||||||
|
(SELECT DISTINCT ON (user_id) user_id, value_1 FROM users_table ORDER BY 1,2) as users_ids
|
||||||
|
JOIN
|
||||||
|
(SELECT
|
||||||
|
ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob
|
||||||
|
FROM
|
||||||
|
users_table AS ma, events_table as short_list
|
||||||
|
WHERE
|
||||||
|
short_list.user_id = ma.user_id and ma.value_1 < 25 and short_list.event_type < 3
|
||||||
|
) temp
|
||||||
|
ON users_ids.user_id = temp.user_id
|
||||||
|
ORDER BY 1,2
|
||||||
|
LIMIT 5;
|
||||||
|
|
||||||
DROP FUNCTION test_join_function_2(integer, integer);
|
DROP FUNCTION test_join_function_2(integer, integer);
|
||||||
|
|
||||||
SET citus.enable_router_execution TO TRUE;
|
SET citus.enable_router_execution TO TRUE;
|
||||||
SET citus.subquery_pushdown to OFF;
|
SET citus.subquery_pushdown to OFF;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue