From f7ab901766faf827790040aa20c6cd26d425e257 Mon Sep 17 00:00:00 2001 From: Murat Tuncer Date: Wed, 4 Oct 2017 13:54:30 +0300 Subject: [PATCH] Add select distinct, and distinct on support Distinct, and distinct on() clauses are supported in simple selects, joins, subqueries, and insert into select queries. --- .../planner/insert_select_planner.c | 43 +- .../planner/multi_logical_optimizer.c | 25 +- .../planner/multi_logical_planner.c | 9 +- .../planner/multi_master_planner.c | 128 ++- .../planner/multi_physical_planner.c | 29 +- .../planner/postgres_planning_functions.c | 71 ++ .../distributed/utils/citus_outfuncs.c | 2 + .../distributed/multi_logical_planner.h | 2 + .../distributed/multi_master_planner.h | 1 + .../multi_behavioral_analytics_basics.out | 72 +- .../regress/expected/multi_insert_select.out | 52 +- .../expected/multi_select_distinct.out | 824 ++++++++++++++++++ .../multi_subquery_behavioral_analytics.out | 77 ++ src/test/regress/multi_schedule | 2 +- .../sql/multi_behavioral_analytics_basics.sql | 66 +- src/test/regress/sql/multi_insert_select.sql | 21 +- .../regress/sql/multi_select_distinct.sql | 245 ++++++ .../multi_subquery_behavioral_analytics.sql | 54 +- 18 files changed, 1689 insertions(+), 34 deletions(-) create mode 100644 src/backend/distributed/planner/postgres_planning_functions.c create mode 100644 src/test/regress/expected/multi_select_distinct.out create mode 100644 src/test/regress/sql/multi_select_distinct.sql diff --git a/src/backend/distributed/planner/insert_select_planner.c b/src/backend/distributed/planner/insert_select_planner.c index a3f7b107a..538f0c929 100644 --- a/src/backend/distributed/planner/insert_select_planner.c +++ b/src/backend/distributed/planner/insert_select_planner.c @@ -30,6 +30,7 @@ #include "optimizer/clauses.h" #include "optimizer/planner.h" #include "optimizer/restrictinfo.h" +#include "optimizer/tlist.h" #include "optimizer/var.h" #include "parser/parsetree.h" #include "parser/parse_coerce.h" @@ -53,6 +54,7 @@ static DeferredErrorMessage * DistributedInsertSelectSupported(Query *queryTree, RangeTblEntry *subqueryRte, bool allReferenceTables); static DeferredErrorMessage * MultiTaskRouterSelectQuerySupported(Query *query); +static bool HasUnsupportedDistinctOn(Query *query); static DeferredErrorMessage * InsertPartitionColumnMatchesSelect(Query *query, RangeTblEntry *insertRte, RangeTblEntry * @@ -777,6 +779,7 @@ MultiTaskRouterSelectQuerySupported(Query *query) List *queryList = NIL; ListCell *queryCell = NULL; StringInfo errorDetail = NULL; + bool hasUnsupportedDistinctOn = false; ExtractQueryWalker((Node *) query, &queryList); foreach(queryCell, queryList) @@ -865,14 +868,14 @@ MultiTaskRouterSelectQuerySupported(Query *query) } /* - * We cannot support DISTINCT ON clauses since it could be on a non-partition column. - * In that case, there is no way that Citus can support this. + * We don't support DISTINCT ON clauses on non-partition columns. */ - if (subquery->hasDistinctOn) + hasUnsupportedDistinctOn = HasUnsupportedDistinctOn(subquery); + if (hasUnsupportedDistinctOn) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, - "DISTINCT ON clauses are not allowed in distributed " - "INSERT ... SELECT queries", + "DISTINCT ON (non-partition column) clauses are not " + "allowed in distributed INSERT ... SELECT queries", NULL, NULL); } } @@ -881,6 +884,36 @@ MultiTaskRouterSelectQuerySupported(Query *query) } +/* + * HasUnsupportedDistinctOn returns true if the query has distinct on and + * distinct targets do not contain partition column. + */ +static bool +HasUnsupportedDistinctOn(Query *query) +{ + ListCell *distinctCell = NULL; + + if (!query->hasDistinctOn) + { + return false; + } + + foreach(distinctCell, query->distinctClause) + { + SortGroupClause *distinctClause = lfirst(distinctCell); + TargetEntry *distinctEntry = get_sortgroupclause_tle(distinctClause, + query->targetList); + + if (IsPartitionColumn(distinctEntry->expr, query)) + { + return false; + } + } + + return true; +} + + /* * InsertPartitionColumnMatchesSelect returns NULL the partition column in the * table targeted by INSERTed matches with the any of the SELECTed table's diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index e679da44c..03bd54cbd 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -1298,6 +1298,8 @@ MasterExtendedOpNode(MultiExtendedOp *originalOpNode) masterExtendedOpNode->targetList = newTargetEntryList; masterExtendedOpNode->groupClauseList = originalOpNode->groupClauseList; masterExtendedOpNode->sortClauseList = originalOpNode->sortClauseList; + masterExtendedOpNode->distinctClause = originalOpNode->distinctClause; + masterExtendedOpNode->hasDistinctOn = originalOpNode->hasDistinctOn; masterExtendedOpNode->limitCount = originalOpNode->limitCount; masterExtendedOpNode->limitOffset = originalOpNode->limitOffset; masterExtendedOpNode->havingQual = newHavingQual; @@ -1787,6 +1789,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, WorkerAggregateWalkerContext *walkerContext = palloc0(sizeof(WorkerAggregateWalkerContext)); Index nextSortGroupRefIndex = 0; + bool queryHasAggregates = false; walkerContext->repartitionSubquery = false; walkerContext->expressionList = NIL; @@ -1826,6 +1829,7 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, WorkerAggregateWalker((Node *) originalExpression, walkerContext); newExpressionList = walkerContext->expressionList; + queryHasAggregates = true; } else { @@ -1924,6 +1928,15 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, workerExtendedOpNode = CitusMakeNode(MultiExtendedOp); workerExtendedOpNode->targetList = newTargetEntryList; + workerExtendedOpNode->distinctClause = NIL; + workerExtendedOpNode->hasDistinctOn = false; + + if (!queryHasAggregates) + { + workerExtendedOpNode->distinctClause = originalOpNode->distinctClause; + workerExtendedOpNode->hasDistinctOn = originalOpNode->hasDistinctOn; + } + workerExtendedOpNode->groupClauseList = groupClauseList; /* if we can push down the limit, also set related fields */ @@ -3356,10 +3369,10 @@ WorkerLimitCount(MultiExtendedOp *originalOpNode, /* * WorkerSortClauseList first checks if the given extended node contains a limit - * that can be pushed down. If it does, the function then checks if we need to - * add any sorting and grouping clauses to the sort list we push down for the - * limit. If we do, the function adds these clauses and returns them. Otherwise, - * the function returns null. + * or hasDistinctOn that can be pushed down. If it does, the function then + * checks if we need to add any sorting and grouping clauses to the sort list we + * push down for the limit. If we do, the function adds these clauses and + * returns them. Otherwise, the function returns null. */ static List * WorkerSortClauseList(MultiExtendedOp *originalOpNode, @@ -3370,8 +3383,8 @@ WorkerSortClauseList(MultiExtendedOp *originalOpNode, List *sortClauseList = originalOpNode->sortClauseList; List *targetList = originalOpNode->targetList; - /* if no limit node, no need to push down sort clauses */ - if (originalOpNode->limitCount == NULL) + /* if no limit node and no hasDistinctOn, no need to push down sort clauses */ + if (originalOpNode->limitCount == NULL && !originalOpNode->hasDistinctOn) { return NIL; } diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index af7e4bb24..e514731df 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -1805,13 +1805,6 @@ ErrorIfQueryNotSupported(Query *queryTree) errorHint = filterHint; } - if (queryTree->distinctClause) - { - preconditionsSatisfied = false; - errorMessage = "could not run distributed query with DISTINCT clause"; - errorHint = filterHint; - } - if (queryTree->groupingSets) { preconditionsSatisfied = false; @@ -2850,6 +2843,8 @@ MultiExtendedOpNode(Query *queryTree) extendedOpNode->limitCount = queryTree->limitCount; extendedOpNode->limitOffset = queryTree->limitOffset; extendedOpNode->havingQual = queryTree->havingQual; + extendedOpNode->distinctClause = queryTree->distinctClause; + extendedOpNode->hasDistinctOn = queryTree->hasDistinctOn; return extendedOpNode; } diff --git a/src/backend/distributed/planner/multi_master_planner.c b/src/backend/distributed/planner/multi_master_planner.c index b2db9d2bb..1ac9f3a67 100644 --- a/src/backend/distributed/planner/multi_master_planner.c +++ b/src/backend/distributed/planner/multi_master_planner.c @@ -151,6 +151,73 @@ BuildAggregatePlan(Query *masterQuery, Plan *subPlan) } +/* + * BuildDistinctPlan creates an returns a plan for distinct. Depending on + * availability of hash function it chooses HashAgg over Sort/Unique + * plans. + * This function has a potential performance issue since we blindly set + * Plan nodes without looking at cost. We might need to revisit this + * if we have performance issues with select distinct queries. + */ +static Plan * +BuildDistinctPlan(Query *masterQuery, Plan *subPlan) +{ + Plan *distinctPlan = NULL; + bool distinctClausesHashable = true; + List *distinctClauseList = masterQuery->distinctClause; + List *targetList = copyObject(masterQuery->targetList); + List *columnList = pull_var_clause_default((Node *) targetList); + ListCell *columnCell = NULL; + + if (IsA(subPlan, Agg)) + { + return subPlan; + } + + Assert(masterQuery->distinctClause); + Assert(!masterQuery->hasDistinctOn); + + /* + * For upper level plans above the sequential scan, the planner expects the + * table id (varno) to be set to OUTER_VAR. + */ + foreach(columnCell, columnList) + { + Var *column = (Var *) lfirst(columnCell); + column->varno = OUTER_VAR; + } + + /* + * Create group by plan with HashAggregate if all distinct + * members are hashable, Otherwise create sort+unique plan. + */ + distinctClausesHashable = grouping_is_hashable(distinctClauseList); + if (distinctClausesHashable) + { + const long rowEstimate = 10; /* using the same value as BuildAggregatePlan() */ + AttrNumber *distinctColumnIdArray = extract_grouping_cols(distinctClauseList, + subPlan->targetlist); + Oid *distinctColumnOpArray = extract_grouping_ops(distinctClauseList); + uint32 distinctClauseCount = list_length(distinctClauseList); + + distinctPlan = (Plan *) make_agg(targetList, NIL, AGG_HASHED, + AGGSPLIT_SIMPLE, distinctClauseCount, + distinctColumnIdArray, + distinctColumnOpArray, NIL, NIL, + rowEstimate, subPlan); + } + else + { + Sort *sortPlan = make_sort_from_sortclauses(masterQuery->distinctClause, + subPlan); + distinctPlan = (Plan *) make_unique_from_sortclauses((Plan *) sortPlan, + masterQuery->distinctClause); + } + + return distinctPlan; +} + + /* * BuildSelectStatement builds the final select statement to run on the master * node, before returning results to the user. The function first gets the custom @@ -166,6 +233,7 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem Plan *topLevelPlan = NULL; ListCell *targetEntryCell = NULL; List *columnNameList = NULL; + List *sortClauseList = copyObject(masterQuery->sortClause); /* (1) make PlannedStmt and set basic information */ selectStatement = makeNode(PlannedStmt); @@ -203,10 +271,47 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem topLevelPlan = &remoteScan->scan.plan; } - /* (3) add a sorting plan if needed */ - if (masterQuery->sortClause) + /* + * (3) create distinct plan if needed. + * + * distinct on() requires sort + unique plans. Unique itself is not enough + * as it only compares the current value with previous one when checking + * uniqueness, thus ordering is necessary. If already has order by + * clause we append distinct clauses to the end of it. Postgresql requires + * that if both distinct on() and order by exists, ordering shall start + * on distinct clauses. Therefore we can safely append distinct clauses to + * the end of order by clauses. Although the same column may appear more + * than once in order by clauses, created plan uses only one instance, for + * example order by a,b,a,a,b,c is translated to equivalent order by a,b,c. + * + * If the query has distinct clause but not distinct on, we first create + * distinct plan that is either HashAggreate or Sort + Unique plans depending + * on hashable property of columns in distinct clause. If there is order by + * clause, it is handled after distinct planning. + */ + if (masterQuery->hasDistinctOn) + { + ListCell *distinctCell = NULL; + foreach(distinctCell, masterQuery->distinctClause) + { + SortGroupClause *singleDistinctClause = lfirst(distinctCell); + Index sortGroupRef = singleDistinctClause->tleSortGroupRef; + + if (get_sortgroupref_clause_noerr(sortGroupRef, sortClauseList) == NULL) + { + sortClauseList = lappend(sortClauseList, singleDistinctClause); + } + } + } + else if (masterQuery->distinctClause) + { + Plan *distinctPlan = BuildDistinctPlan(masterQuery, topLevelPlan); + topLevelPlan = distinctPlan; + } + + /* (4) add a sorting plan if needed */ + if (sortClauseList) { - List *sortClauseList = masterQuery->sortClause; Sort *sortPlan = make_sort_from_sortclauses(sortClauseList, topLevelPlan); /* just for reproducible costs between different PostgreSQL versions */ @@ -217,7 +322,20 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem topLevelPlan = (Plan *) sortPlan; } - /* (4) add a limit plan if needed */ + /* + * (5) add a unique plan for distinctOn. + * If the query has distinct on we add a sort clause in step 3. Therefore + * Step 4 always creates a sort plan. + * */ + if (masterQuery->hasDistinctOn) + { + Assert(IsA(topLevelPlan, Sort)); + topLevelPlan = + (Plan *) make_unique_from_sortclauses(topLevelPlan, + masterQuery->distinctClause); + } + + /* (5) add a limit plan if needed */ if (masterQuery->limitCount || masterQuery->limitOffset) { Node *limitCount = masterQuery->limitCount; @@ -226,7 +344,7 @@ BuildSelectStatement(Query *masterQuery, List *masterTargetList, CustomScan *rem topLevelPlan = (Plan *) limitPlan; } - /* (5) finally set our top level plan in the plan tree */ + /* (6) finally set our top level plan in the plan tree */ selectStatement->planTree = topLevelPlan; return selectStatement; diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 9c0d1341f..50ec33878 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -578,6 +578,8 @@ BuildJobQuery(MultiNode *multiNode, List *dependedJobList) FromExpr *joinTree = NULL; Node *joinRoot = NULL; Node *havingQual = NULL; + bool hasDistinctOn = false; + List *distinctClause = NIL; /* we start building jobs from below the collect node */ Assert(!CitusIsA(multiNode, MultiCollect)); @@ -625,6 +627,8 @@ BuildJobQuery(MultiNode *multiNode, List *dependedJobList) { MultiExtendedOp *extendedOp = (MultiExtendedOp *) linitial(extendedOpNodeList); targetList = copyObject(extendedOp->targetList); + distinctClause = extendedOp->distinctClause; + hasDistinctOn = extendedOp->hasDistinctOn; } else { @@ -699,6 +703,8 @@ BuildJobQuery(MultiNode *multiNode, List *dependedJobList) jobQuery->limitCount = limitCount; jobQuery->havingQual = havingQual; jobQuery->hasAggs = contain_agg_clause((Node *) targetList); + jobQuery->distinctClause = distinctClause; + jobQuery->hasDistinctOn = hasDistinctOn; return jobQuery; } @@ -1405,6 +1411,9 @@ BuildSubqueryJobQuery(MultiNode *multiNode) Node *limitCount = NULL; Node *limitOffset = NULL; FromExpr *joinTree = NULL; + bool hasAggregates = false; + List *distinctClause = NIL; + bool hasDistinctOn = false; /* we start building jobs from below the collect node */ Assert(!CitusIsA(multiNode, MultiCollect)); @@ -1450,6 +1459,8 @@ BuildSubqueryJobQuery(MultiNode *multiNode) limitOffset = extendedOp->limitOffset; sortClauseList = extendedOp->sortClauseList; havingQual = extendedOp->havingQual; + distinctClause = extendedOp->distinctClause; + hasDistinctOn = extendedOp->hasDistinctOn; } /* build group clauses */ @@ -1458,6 +1469,19 @@ BuildSubqueryJobQuery(MultiNode *multiNode) /* build the where clause list using select predicates */ whereClauseList = QuerySelectClauseList(multiNode); + if (contain_agg_clause((Node *) targetList) || + contain_agg_clause((Node *) havingQual)) + { + hasAggregates = true; + } + + /* distinct is not send to worker query if there are top level aggregates */ + if (hasAggregates) + { + hasDistinctOn = false; + distinctClause = NIL; + } + /* * Build the From/Where construct. We keep the where-clause list implicitly * AND'd, since both partition and join pruning depends on the clauses being @@ -1480,8 +1504,9 @@ BuildSubqueryJobQuery(MultiNode *multiNode) jobQuery->limitOffset = limitOffset; jobQuery->limitCount = limitCount; jobQuery->havingQual = havingQual; - jobQuery->hasAggs = contain_agg_clause((Node *) targetList) || - contain_agg_clause((Node *) havingQual); + jobQuery->hasAggs = hasAggregates; + jobQuery->hasDistinctOn = hasDistinctOn; + jobQuery->distinctClause = distinctClause; return jobQuery; } diff --git a/src/backend/distributed/planner/postgres_planning_functions.c b/src/backend/distributed/planner/postgres_planning_functions.c new file mode 100644 index 000000000..20b05f921 --- /dev/null +++ b/src/backend/distributed/planner/postgres_planning_functions.c @@ -0,0 +1,71 @@ +/*------------------------------------------------------------------------- + * + * postgres_planning_function.c + * Includes planning routines copied from + * src/backend/optimizer/plan/createplan.c + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * This needs to be closely in sync with the core code. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "distributed/multi_master_planner.h" +#include "nodes/plannodes.h" +#include "optimizer/tlist.h" + + +/* + * make_unique_from_sortclauses creates and returns a unique node + * from provided distinct clause list. + * The functions is copied from postgresql from + * src/backend/optimizer/plan/createplan.c. + * + * distinctList is a list of SortGroupClauses, identifying the targetlist items + * that should be considered by the Unique filter. The input path must + * already be sorted accordingly. + */ +Unique * +make_unique_from_sortclauses(Plan *lefttree, List *distinctList) +{ + Unique *node = makeNode(Unique); + Plan *plan = &node->plan; + int numCols = list_length(distinctList); + int keyno = 0; + AttrNumber *uniqColIdx; + Oid *uniqOperators; + ListCell *slitem; + + plan->targetlist = lefttree->targetlist; + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + + /* + * convert SortGroupClause list into arrays of attr indexes and equality + * operators, as wanted by executor + */ + Assert(numCols > 0); + uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); + uniqOperators = (Oid *) palloc(sizeof(Oid) * numCols); + + foreach(slitem, distinctList) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(slitem); + TargetEntry *tle = get_sortgroupclause_tle(sortcl, plan->targetlist); + + uniqColIdx[keyno] = tle->resno; + uniqOperators[keyno] = sortcl->eqop; + Assert(OidIsValid(uniqOperators[keyno])); + keyno++; + } + + node->numCols = numCols; + node->uniqColIdx = uniqColIdx; + node->uniqOperators = uniqOperators; + + return node; +} diff --git a/src/backend/distributed/utils/citus_outfuncs.c b/src/backend/distributed/utils/citus_outfuncs.c index f2d988299..24241ef90 100644 --- a/src/backend/distributed/utils/citus_outfuncs.c +++ b/src/backend/distributed/utils/citus_outfuncs.c @@ -292,6 +292,8 @@ OutMultiExtendedOp(OUTFUNC_ARGS) WRITE_NODE_FIELD(limitCount); WRITE_NODE_FIELD(limitOffset); WRITE_NODE_FIELD(havingQual); + WRITE_BOOL_FIELD(hasDistinctOn); + WRITE_NODE_FIELD(distinctClause); OutMultiUnaryNodeFields(str, (const MultiUnaryNode *) node); } diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index b3ff12aac..2465b93ad 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -173,6 +173,8 @@ typedef struct MultiExtendedOp Node *limitCount; Node *limitOffset; Node *havingQual; + List *distinctClause; + bool hasDistinctOn; } MultiExtendedOp; diff --git a/src/include/distributed/multi_master_planner.h b/src/include/distributed/multi_master_planner.h index 9873ce4ff..54f4e2d95 100644 --- a/src/include/distributed/multi_master_planner.h +++ b/src/include/distributed/multi_master_planner.h @@ -24,6 +24,7 @@ struct MultiPlan; struct CustomScan; extern PlannedStmt * MasterNodeSelectPlan(struct MultiPlan *multiPlan, struct CustomScan *dataScan); +extern Unique * make_unique_from_sortclauses(Plan *lefttree, List *distinctList); #endif /* MULTI_MASTER_PLANNER_H */ diff --git a/src/test/regress/expected/multi_behavioral_analytics_basics.out b/src/test/regress/expected/multi_behavioral_analytics_basics.out index eb6da2a9c..413bb0a3b 100644 --- a/src/test/regress/expected/multi_behavioral_analytics_basics.out +++ b/src/test/regress/expected/multi_behavioral_analytics_basics.out @@ -454,4 +454,74 @@ SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; 14371 | 101 | 50.5232064574490293 (1 row) - +-- DISTINCT in the outer query and DISTINCT in the subquery +TRUNCATE agg_results; +INSERT INTO agg_results(user_id) +SELECT + DISTINCT users_ids.user_id +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50; +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + count | count | avg +-------+-------+--------------------- + 27 | 27 | 54.0000000000000000 +(1 row) + +-- DISTINCT ON in the outer query and DISTINCT in the subquery +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg, value_2_agg) +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50 + ORDER BY 1, 2; +SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results; + count | count | avg | avg +-------+-------+---------------------+--------------------- + 80 | 80 | 50.7875000000000000 | 10.0125000000000000 +(1 row) + +-- DISTINCT ON in the outer query and DISTINCT ON in the subquery +TRUNCATE agg_results; +INSERT INTO agg_results(user_id, value_1_agg, value_2_agg) +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT ON (user_id) user_id, value_2 FROM users_table ORDER BY 1,2) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 5000 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + ORDER BY 1, 2; +SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results; + count | count | avg | avg +-------+-------+---------------------+-------------------- + 27 | 27 | 54.0000000000000000 | 9.8518518518518519 +(1 row) + diff --git a/src/test/regress/expected/multi_insert_select.out b/src/test/regress/expected/multi_insert_select.out index fb2c20853..9c1bbfc49 100644 --- a/src/test/regress/expected/multi_insert_select.out +++ b/src/test/regress/expected/multi_insert_select.out @@ -626,16 +626,60 @@ DEBUG: Plan is router executable SET client_min_messages TO INFO; truncate agg_events; SET client_min_messages TO DEBUG2; --- we do not support DISTINCT ON clauses +-- DISTINCT ON clauses are supported +-- distinct on(non-partition column) +-- values are pulled to master INSERT INTO agg_events (value_1_agg, user_id) SELECT DISTINCT ON (value_1) value_1, user_id FROM raw_events_first; -DEBUG: DISTINCT ON clauses are not allowed in distributed INSERT ... SELECT queries +DEBUG: DISTINCT ON (non-partition column) clauses are not allowed in distributed INSERT ... SELECT queries DEBUG: Collecting INSERT ... SELECT results on coordinator -ERROR: could not run distributed query with DISTINCT clause -HINT: Consider using an equality filter on the distributed table's partition column. +SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2; + user_id | value_1_agg +---------+------------- + 1 | 10 + 2 | 20 + 3 | 30 + 4 | 40 + 5 | 50 + 6 | 60 + 7 | + 8 | 80 + 9 | 90 +(9 rows) + +-- we don't want to see constraint vialotions, so truncate first +SET client_min_messages TO INFO; +truncate agg_events; +SET client_min_messages TO DEBUG2; +-- distinct on(partition column) +-- queries are forwared to workers +INSERT INTO agg_events (value_1_agg, user_id) + SELECT + DISTINCT ON (user_id) value_1, user_id + FROM + raw_events_first; +DEBUG: distributed statement: INSERT INTO public.agg_events_13300008 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300000 raw_events_first WHERE ((worker_hash(user_id) >= '-2147483648'::integer) AND (worker_hash(user_id) <= '-1073741825'::integer)) +DEBUG: distributed statement: INSERT INTO public.agg_events_13300009 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300001 raw_events_first WHERE ((worker_hash(user_id) >= '-1073741824'::integer) AND (worker_hash(user_id) <= '-1'::integer)) +DEBUG: distributed statement: INSERT INTO public.agg_events_13300010 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300002 raw_events_first WHERE ((worker_hash(user_id) >= 0) AND (worker_hash(user_id) <= 1073741823)) +DEBUG: distributed statement: INSERT INTO public.agg_events_13300011 AS citus_table_alias (user_id, value_1_agg) SELECT DISTINCT ON (user_id) user_id, value_1 FROM public.raw_events_first_13300003 raw_events_first WHERE ((worker_hash(user_id) >= 1073741824) AND (worker_hash(user_id) <= 2147483647)) +DEBUG: Plan is router executable +SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2; + user_id | value_1_agg +---------+------------- + 1 | 10 + 2 | 20 + 3 | 30 + 4 | 40 + 5 | 50 + 6 | 60 + 7 | + 8 | 80 + 9 | 90 +(9 rows) + -- We do not support some CTEs WITH fist_table_agg AS (SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id) diff --git a/src/test/regress/expected/multi_select_distinct.out b/src/test/regress/expected/multi_select_distinct.out new file mode 100644 index 000000000..ad2891d61 --- /dev/null +++ b/src/test/regress/expected/multi_select_distinct.out @@ -0,0 +1,824 @@ +-- +-- MULTI_SELECT_DISTINCT +-- +-- Tests select distinct, and select distinct on features. +-- +-- function calls are supported +SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; + l_orderkey | now +------------+----- +(0 rows) + +SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3; + l_partkey | ?column? +-----------+---------- + 199973 | 1 + 199946 | 1 + 199943 | 1 +(3 rows) + +-- const expressions are supported +SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + l_orderkey | ?column? +------------+---------- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +-- non const expressions are also supported +SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5; + l_orderkey | ?column? +------------+---------- + 1 | 2133 + 1 | 15636 + 1 | 24028 + 1 | 63701 + 1 | 67311 +(5 rows) + +-- column expressions are supported +SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5; + l_orderkey | ?column? +------------+---------------- + 32 | COLLECT CODAIR + 39 | COLLECT CODAIR + 66 | COLLECT CODAIR + 70 | COLLECT CODAIR + 98 | COLLECT CODAIR +(5 rows) + +-- function calls with const input are supported +SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5; + l_orderkey | strpos +------------+-------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- function calls with non-const input are supported +SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I') + FROM lineitem_hash_part + WHERE strpos(l_shipmode, 'I') > 1 + ORDER BY 2, 1 + LIMIT 5; + l_orderkey | strpos +------------+-------- + 1 | 2 + 3 | 2 + 5 | 2 + 32 | 2 + 33 | 2 +(5 rows) + +-- distinct on partition column +-- verify counts match with respect to count(distinct) +CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part; +SELECT COUNT(*) FROM temp_orderkeys; + count +------- + 2985 +(1 row) + +SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part; + count +------- + 2985 +(1 row) + +SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1; + l_orderkey +------------ + 1 + 3 + 32 + 35 + 39 + 65 + 129 + 130 + 134 + 164 + 194 + 228 + 261 + 290 + 320 + 321 + 354 + 418 +(18 rows) + +-- distinct on non-partition column +SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1; + l_partkey +----------- + 79251 + 94780 + 139636 + 145243 + 151894 + 157238 + 163073 + 182052 +(8 rows) + +SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC; + l_shipmode +------------ + TRUCK + SHIP + REG AIR + RAIL + MAIL + FOB + AIR +(7 rows) + +-- distinct with multiple columns +SELECT DISTINCT l_orderkey, o_orderdate + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE l_orderkey < 10 + ORDER BY l_orderkey; + l_orderkey | o_orderdate +------------+------------- + 1 | 01-02-1996 + 2 | 12-01-1996 + 3 | 10-14-1993 + 4 | 10-11-1995 + 5 | 07-30-1994 + 6 | 02-21-1992 + 7 | 01-10-1996 +(7 rows) + +-- distinct on partition column with aggregate +-- this is the same as the one without distinct due to group by +SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + l_orderkey | count +------------+------- + 7 | 7 + 68 | 7 + 129 | 7 + 164 | 7 + 194 | 7 + 1 | 6 + 3 | 6 + 32 | 6 + 35 | 6 + 39 | 6 + 67 | 6 + 69 | 6 + 70 | 6 + 71 | 6 + 134 | 6 + 135 | 6 + 163 | 6 + 192 | 6 + 197 | 6 +(19 rows) + + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, remote_scan.l_orderkey + -> HashAggregate + Group Key: remote_scan.l_orderkey + Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 5) + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part + Filter: (l_orderkey < 200) +(15 rows) + +-- distinct on non-partition column with aggregate +-- this is the same as non-distinct version due to group by +SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + l_partkey | count +-----------+------- + 1051 | 3 + 1927 | 3 + 6983 | 3 + 15283 | 3 + 87761 | 3 + 136884 | 3 + 149926 | 3 + 160895 | 3 + 177771 | 3 + 188804 | 3 + 199146 | 3 +(11 rows) + + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_partkey + Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 2) + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> HashAggregate + Group Key: l_partkey + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part +(13 rows) + +-- distinct on non-partition column and avg +SELECT DISTINCT l_partkey, avg(l_linenumber) + FROM lineitem_hash_part + WHERE l_partkey < 500 + GROUP BY 1 + HAVING avg(l_linenumber) > 2 + ORDER BY 1; + l_partkey | avg +-----------+-------------------- + 18 | 7.0000000000000000 + 79 | 6.0000000000000000 + 149 | 4.5000000000000000 + 175 | 5.0000000000000000 + 179 | 6.0000000000000000 + 182 | 3.0000000000000000 + 222 | 4.0000000000000000 + 278 | 3.0000000000000000 + 299 | 7.0000000000000000 + 308 | 7.0000000000000000 + 309 | 5.0000000000000000 + 321 | 3.0000000000000000 + 337 | 6.0000000000000000 + 364 | 3.0000000000000000 + 403 | 4.0000000000000000 +(15 rows) + +-- distinct on multiple non-partition columns +SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + l_partkey | l_suppkey +-----------+----------- + 2132 | 4633 + 4297 | 1798 + 37531 | 35 + 44161 | 6666 + 44706 | 4707 + 67831 | 5350 + 85811 | 8320 + 94368 | 6878 + 108338 | 849 + 108570 | 8571 + 137267 | 4807 + 137469 | 9983 + 173489 | 3490 + 196156 | 1195 + 197921 | 441 +(15 rows) + + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> HashAggregate + Group Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Unique + -> Sort + Sort Key: l_partkey, l_suppkey + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part + Filter: ((l_orderkey < 100) AND (l_shipmode = 'AIR'::bpchar)) +(14 rows) + +-- distinct on partition column +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1; + l_orderkey | l_partkey | l_suppkey +------------+-----------+----------- + 1 | 155190 | 7706 + 2 | 106170 | 1191 + 3 | 4297 | 1798 + 4 | 88035 | 5560 + 5 | 108570 | 8571 + 6 | 139636 | 2150 + 7 | 182052 | 9607 + 32 | 82704 | 7721 + 33 | 61336 | 8855 + 34 | 88362 | 871 +(10 rows) + + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_orderkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Unique + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part + Filter: (l_orderkey < 35) +(13 rows) + +-- distinct on non-partition column +-- note order by is required here +-- otherwise query results will be different since +-- distinct on clause is on non-partition column +SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + l_partkey | l_orderkey +-----------+------------ + 18 | 12005 + 79 | 5121 + 91 | 2883 + 149 | 807 + 175 | 4102 + 179 | 2117 + 182 | 548 + 195 | 2528 + 204 | 10048 + 222 | 9413 + 245 | 9446 + 278 | 1287 + 299 | 1122 + 308 | 11137 + 309 | 2374 + 318 | 321 + 321 | 5984 + 337 | 10403 + 350 | 13698 + 358 | 4323 +(20 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: l_partkey, l_orderkey + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part +(14 rows) + +-- distinct on with joins +-- each customer's first order key +SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + o_custkey | l_orderkey +-----------+------------ + 1 | 9154 + 2 | 10563 + 4 | 320 + 5 | 11682 + 7 | 10402 + 8 | 102 + 10 | 1602 + 11 | 12800 + 13 | 994 + 14 | 11011 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Unique + -> Sort + Sort Key: orders_hash_part.o_custkey, lineitem_hash_part.l_orderkey + -> Hash Join + Hash Cond: (lineitem_hash_part.l_orderkey = orders_hash_part.o_orderkey) + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part + -> Hash + -> Seq Scan on orders_hash_part_360294 orders_hash_part + Filter: (o_custkey < 15) +(17 rows) + +-- explain without order by +-- notice master plan has order by on distinct on column +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Unique + -> Sort + Sort Key: orders_hash_part.o_custkey + -> Hash Join + Hash Cond: (lineitem_hash_part.l_orderkey = orders_hash_part.o_orderkey) + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part + -> Hash + -> Seq Scan on orders_hash_part_360294 orders_hash_part + Filter: (o_custkey < 15) +(17 rows) + +-- each customer's each order's first l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20 + ORDER BY 1,2,3; + o_custkey | l_orderkey | l_linenumber | l_partkey +-----------+------------+--------------+----------- + 1 | 9154 | 1 | 86513 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 1 | 147459 + 4 | 320 | 1 | 4415 + 4 | 739 | 1 | 84489 + 4 | 10688 | 1 | 45037 + 4 | 10788 | 1 | 50814 + 4 | 13728 | 1 | 86216 + 5 | 11682 | 1 | 31634 + 5 | 11746 | 1 | 180724 + 5 | 14308 | 1 | 157430 + 7 | 10402 | 1 | 53661 + 7 | 13031 | 1 | 112161 + 7 | 14145 | 1 | 138729 + 7 | 14404 | 1 | 143034 + 8 | 102 | 1 | 88914 + 8 | 164 | 1 | 91309 + 8 | 13601 | 1 | 40504 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 1 | 86241 + 10 | 11431 | 1 | 62112 + 10 | 13124 | 1 | 29414 + 11 | 12800 | 1 | 152806 + 13 | 994 | 1 | 64486 + 13 | 1603 | 1 | 38191 + 13 | 4704 | 1 | 77934 + 13 | 9927 | 1 | 875 + 14 | 11011 | 1 | 172485 + 17 | 896 | 1 | 38675 + 17 | 5507 | 1 | 9600 + 19 | 353 | 1 | 119305 + 19 | 1504 | 1 | 81389 + 19 | 1669 | 1 | 78373 + 19 | 5893 | 1 | 133707 + 19 | 9954 | 1 | 92138 + 19 | 14885 | 1 | 36154 +(36 rows) + +-- explain without order by +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Unique + -> Sort + Sort Key: orders_hash_part.o_custkey, lineitem_hash_part.l_orderkey + -> Hash Join + Hash Cond: (lineitem_hash_part.l_orderkey = orders_hash_part.o_orderkey) + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part + -> Hash + -> Seq Scan on orders_hash_part_360294 orders_hash_part + Filter: (o_custkey < 20) +(17 rows) + +-- each customer's each order's last l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2,3 DESC; + o_custkey | l_orderkey | l_linenumber | l_partkey +-----------+------------+--------------+----------- + 1 | 9154 | 7 | 173448 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 4 | 110741 + 4 | 320 | 2 | 192158 + 4 | 739 | 5 | 187523 + 4 | 10688 | 2 | 132574 + 4 | 10788 | 4 | 196473 + 4 | 13728 | 3 | 12450 + 5 | 11682 | 3 | 177152 + 5 | 11746 | 7 | 193807 + 5 | 14308 | 3 | 140916 + 7 | 10402 | 2 | 64514 + 7 | 13031 | 6 | 7761 + 7 | 14145 | 6 | 130723 + 7 | 14404 | 7 | 35349 + 8 | 102 | 4 | 61158 + 8 | 164 | 7 | 3037 + 8 | 13601 | 5 | 12470 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 5 | 135675 + 10 | 11431 | 7 | 8563 + 10 | 13124 | 3 | 67055 + 11 | 12800 | 5 | 179110 + 13 | 994 | 4 | 130471 + 13 | 1603 | 2 | 65209 + 13 | 4704 | 3 | 63081 + 13 | 9927 | 6 | 119356 + 14 | 11011 | 7 | 95939 +(28 rows) + +-- subqueries +SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +------------+----------- + 1 | 2132 + 1 | 15635 + 1 | 24027 + 1 | 63700 + 1 | 67310 + 1 | 155190 + 2 | 106170 + 3 | 4297 + 3 | 19036 + 3 | 29380 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Sort + Sort Key: l_orderkey, l_partkey + -> HashAggregate + Group Key: l_orderkey, l_partkey + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part +(16 rows) + +SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | cnt +------------+----- + 1 | 6 + 2 | 1 + 3 | 6 + 4 | 1 + 5 | 3 + 6 | 1 + 7 | 7 + 32 | 6 + 33 | 4 + 34 | 3 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.cnt + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.cnt + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Sort + Sort Key: lineitem_hash_part.l_orderkey, (count(*)) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey, count(*) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part +(18 rows) + +-- distinct on partition column +-- random() is added to inner query to prevent flattening +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +------------+----------- + 1 | 2132 + 2 | 106170 + 3 | 4297 + 4 | 88035 + 5 | 37531 + 6 | 139636 + 7 | 79251 + 32 | 2743 + 33 | 33918 + 34 | 88362 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_orderkey, q.l_partkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part +(16 rows) + +-- distinct on non-partition column +SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + l_orderkey | l_partkey +------------+----------- + 12005 | 18 + 5121 | 79 + 2883 | 91 + 807 | 149 + 4102 | 175 + 2117 | 179 + 548 | 182 + 2528 | 195 + 10048 | 204 + 9413 | 222 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Real-Time) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=57637 dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_partkey, q.l_orderkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360290 lineitem_hash_part +(16 rows) + diff --git a/src/test/regress/expected/multi_subquery_behavioral_analytics.out b/src/test/regress/expected/multi_subquery_behavioral_analytics.out index b0e25d366..0ff786728 100644 --- a/src/test/regress/expected/multi_subquery_behavioral_analytics.out +++ b/src/test/regress/expected/multi_subquery_behavioral_analytics.out @@ -2071,6 +2071,83 @@ FROM WHERE users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id); ERROR: unsupported clause type +-- DISTINCT in the outer query and DISTINCT in the subquery +SELECT + DISTINCT users_ids.user_id +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50 + ORDER BY 1 + LIMIT 5; + user_id +--------- + 1 + 6 + 16 + 21 + 26 +(5 rows) + +-- DISTINCT ON in the outer query and DISTINCT in the subquery +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50 + ORDER BY 1, 2 + LIMIT 5; + user_id | value_1 | prob +---------+---------+------------------------ + 1 | 6 | 0.50000000000000000000 + 2 | 2 | 0.50000000000000000000 + 4 | 3 | 0.50000000000000000000 + 6 | 3 | 0.50000000000000000000 + 7 | 2 | 0.50000000000000000000 +(5 rows) + +-- DISTINCT ON in the outer query and DISTINCT ON in the subquery +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT ON (user_id) user_id, value_1 FROM users_table ORDER BY 1,2) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 25 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + ORDER BY 1,2 + LIMIT 5; + user_id | value_1 | prob +---------+---------+------------------------ + 1 | 6 | 0.50000000000000000000 + 6 | 3 | 0.50000000000000000000 + 16 | 4 | 0.50000000000000000000 + 21 | 0 | 0.50000000000000000000 + 26 | 5 | 0.50000000000000000000 +(5 rows) + DROP FUNCTION test_join_function_2(integer, integer); SET citus.enable_router_execution TO TRUE; SET citus.subquery_pushdown to OFF; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 96a991fff..ca9822e6c 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -143,7 +143,7 @@ test: multi_outer_join # is independed from the rest of the group, it is added to increase parallelism. # --- test: multi_create_fdw -test: multi_complex_count_distinct +test: multi_complex_count_distinct multi_select_distinct test: multi_distribution_metadata test: multi_generate_ddl_commands test: multi_create_shards diff --git a/src/test/regress/sql/multi_behavioral_analytics_basics.sql b/src/test/regress/sql/multi_behavioral_analytics_basics.sql index 720d55fe2..e67ab6f8f 100644 --- a/src/test/regress/sql/multi_behavioral_analytics_basics.sql +++ b/src/test/regress/sql/multi_behavioral_analytics_basics.sql @@ -417,4 +417,68 @@ FROM -- get some statistics from the aggregated results to ensure the results are correct SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; - + +-- DISTINCT in the outer query and DISTINCT in the subquery +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id) +SELECT + DISTINCT users_ids.user_id +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50; + +-- get some statistics from the aggregated results to ensure the results are correct +SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM agg_results; + +-- DISTINCT ON in the outer query and DISTINCT in the subquery +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg, value_2_agg) +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50 + ORDER BY 1, 2; + +SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results; + +-- DISTINCT ON in the outer query and DISTINCT ON in the subquery +TRUNCATE agg_results; + +INSERT INTO agg_results(user_id, value_1_agg, value_2_agg) +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT ON (user_id) user_id, value_2 FROM users_table ORDER BY 1,2) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 5000 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + ORDER BY 1, 2; + +SELECT count(*), count(DISTINCT user_id), avg(user_id), avg(value_1_agg) FROM agg_results; diff --git a/src/test/regress/sql/multi_insert_select.sql b/src/test/regress/sql/multi_insert_select.sql index e2fb272db..9f78ed93d 100644 --- a/src/test/regress/sql/multi_insert_select.sql +++ b/src/test/regress/sql/multi_insert_select.sql @@ -478,13 +478,32 @@ SET client_min_messages TO INFO; truncate agg_events; SET client_min_messages TO DEBUG2; --- we do not support DISTINCT ON clauses +-- DISTINCT ON clauses are supported +-- distinct on(non-partition column) +-- values are pulled to master INSERT INTO agg_events (value_1_agg, user_id) SELECT DISTINCT ON (value_1) value_1, user_id FROM raw_events_first; +SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2; + +-- we don't want to see constraint vialotions, so truncate first +SET client_min_messages TO INFO; +truncate agg_events; +SET client_min_messages TO DEBUG2; + +-- distinct on(partition column) +-- queries are forwared to workers +INSERT INTO agg_events (value_1_agg, user_id) + SELECT + DISTINCT ON (user_id) value_1, user_id + FROM + raw_events_first; + +SELECT user_id, value_1_agg FROM agg_events ORDER BY 1,2; + -- We do not support some CTEs WITH fist_table_agg AS (SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id) diff --git a/src/test/regress/sql/multi_select_distinct.sql b/src/test/regress/sql/multi_select_distinct.sql new file mode 100644 index 000000000..eb01eceac --- /dev/null +++ b/src/test/regress/sql/multi_select_distinct.sql @@ -0,0 +1,245 @@ +-- +-- MULTI_SELECT_DISTINCT +-- +-- Tests select distinct, and select distinct on features. +-- + + +-- function calls are supported +SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; + +SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3; + +-- const expressions are supported +SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + +-- non const expressions are also supported +SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5; + +-- column expressions are supported +SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5; + +-- function calls with const input are supported +SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5; + +-- function calls with non-const input are supported +SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I') + FROM lineitem_hash_part + WHERE strpos(l_shipmode, 'I') > 1 + ORDER BY 2, 1 + LIMIT 5; + +-- distinct on partition column +-- verify counts match with respect to count(distinct) +CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part; +SELECT COUNT(*) FROM temp_orderkeys; +SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part; + +SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1; + +-- distinct on non-partition column +SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1; + +SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC; + +-- distinct with multiple columns +SELECT DISTINCT l_orderkey, o_orderdate + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE l_orderkey < 10 + ORDER BY l_orderkey; + +-- distinct on partition column with aggregate +-- this is the same as the one without distinct due to group by +SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + +-- distinct on non-partition column with aggregate +-- this is the same as non-distinct version due to group by +SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + +-- distinct on non-partition column and avg +SELECT DISTINCT l_partkey, avg(l_linenumber) + FROM lineitem_hash_part + WHERE l_partkey < 500 + GROUP BY 1 + HAVING avg(l_linenumber) > 2 + ORDER BY 1; + +-- distinct on multiple non-partition columns +SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + +-- distinct on partition column +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1; + +-- distinct on non-partition column +-- note order by is required here +-- otherwise query results will be different since +-- distinct on clause is on non-partition column +SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + +-- distinct on with joins +-- each customer's first order key +SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + +-- explain without order by +-- notice master plan has order by on distinct on column +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15; + +-- each customer's each order's first l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20 + ORDER BY 1,2,3; + +-- explain without order by +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20; + +-- each customer's each order's last l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2,3 DESC; + +-- subqueries +SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + +SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; +-- distinct on partition column +-- random() is added to inner query to prevent flattening +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + +-- distinct on non-partition column +SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; diff --git a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql index 71373b2e7..051a1265f 100644 --- a/src/test/regress/sql/multi_subquery_behavioral_analytics.sql +++ b/src/test/regress/sql/multi_subquery_behavioral_analytics.sql @@ -1659,8 +1659,60 @@ FROM WHERE users_table.value_1 < 50 AND test_join_function_2(users_table.user_id, temp.user_id); +-- DISTINCT in the outer query and DISTINCT in the subquery +SELECT + DISTINCT users_ids.user_id +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50 + ORDER BY 1 + LIMIT 5; + +-- DISTINCT ON in the outer query and DISTINCT in the subquery +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT user_id FROM users_table) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 50 and short_list.event_type < 15 + ) temp + ON users_ids.user_id = temp.user_id + WHERE temp.value_1 < 50 + ORDER BY 1, 2 + LIMIT 5; + +-- DISTINCT ON in the outer query and DISTINCT ON in the subquery +SELECT + DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob +FROM + (SELECT DISTINCT ON (user_id) user_id, value_1 FROM users_table ORDER BY 1,2) as users_ids + JOIN + (SELECT + ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob + FROM + users_table AS ma, events_table as short_list + WHERE + short_list.user_id = ma.user_id and ma.value_1 < 25 and short_list.event_type < 3 + ) temp + ON users_ids.user_id = temp.user_id + ORDER BY 1,2 + LIMIT 5; + DROP FUNCTION test_join_function_2(integer, integer); SET citus.enable_router_execution TO TRUE; SET citus.subquery_pushdown to OFF; -