From 45717dd01302d1570339f766217634c671f41d34 Mon Sep 17 00:00:00 2001 From: velioglu Date: Mon, 31 Jul 2017 15:44:36 +0300 Subject: [PATCH 1/6] Check equivalence on reference tables for subquery pushdown --- .../relation_restriction_equivalence.c | 74 ++++--------------- .../expected/multi_reference_table.out | 18 ++--- 2 files changed, 19 insertions(+), 73 deletions(-) diff --git a/src/backend/distributed/planner/relation_restriction_equivalence.c b/src/backend/distributed/planner/relation_restriction_equivalence.c index 8b45da19e..657e8751e 100644 --- a/src/backend/distributed/planner/relation_restriction_equivalence.c +++ b/src/backend/distributed/planner/relation_restriction_equivalence.c @@ -67,7 +67,6 @@ static Var * FindTranslatedVar(List *appendRelList, Oid relationOid, static bool EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, RelationRestrictionContext * restrictionContext); -static uint32 ReferenceRelationCount(RelationRestrictionContext *restrictionContext); static List * GenerateAttributeEquivalencesForRelationRestrictions( RelationRestrictionContext *restrictionContext); static AttributeEquivalenceClass * AttributeEquivalenceClassForEquivalenceClass( @@ -314,10 +313,9 @@ FindTranslatedVar(List *appendRelList, Oid relationOid, Index relationRteIndex, * joined on their partition keys. * * The function returns true if all relations are joined on their partition keys. - * Otherwise, the function returns false. Since reference tables do not have partition - * keys, we skip processing them. Also, if the query includes only a single non-reference - * distributed relation, the function returns true since it doesn't make sense to check - * for partition key equality in that case. + * Otherwise, the function returns false. In order to support reference tables + * with subqueries, equality between attributes of reference tables and partition + * key of distributed tables are also considered. * * In order to do that, we invented a new equivalence class namely: * AttributeEquivalenceClass. In very simple words, a AttributeEquivalenceClass is @@ -350,24 +348,15 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext * List *relationRestrictionAttributeEquivalenceList = NIL; List *joinRestrictionAttributeEquivalenceList = NIL; List *allAttributeEquivalenceList = NIL; - uint32 referenceRelationCount = ReferenceRelationCount(restrictionContext); + uint32 totalRelationCount = list_length(restrictionContext->relationRestrictionList); - uint32 nonReferenceRelationCount = totalRelationCount - referenceRelationCount; /* - * If the query includes a single relation which is not a reference table, - * we should not check the partition column equality. - * Consider two example cases: - * (i) The query includes only a single colocated relation - * (ii) A colocated relation is joined with a (or multiple) reference - * table(s) where colocated relation is not joined on the partition key - * - * For the above two cases, we don't need to execute the partition column equality - * algorithm. The reason is that the essence of this function is to ensure that the - * tasks that are going to be created should not need data from other tasks. In both - * cases mentioned above, the necessary data per task would be on available. + * If the query includes only one relation, we should not check the partition + * column equality. Single table should not need to fetch data from other nodes + * except it's own node(s). */ - if (nonReferenceRelationCount <= 1) + if (totalRelationCount == 1) { return true; } @@ -429,8 +418,7 @@ EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, (RelationRestriction *) lfirst(relationRestrictionCell); int rteIdentity = GetRTEIdentity(relationRestriction->rte); - if (DistPartitionKey(relationRestriction->relationId) && - !bms_is_member(rteIdentity, commonRteIdentities)) + if (!bms_is_member(rteIdentity, commonRteIdentities)) { return false; } @@ -440,31 +428,6 @@ EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList, } -/* - * ReferenceRelationCount iterates over the relations and returns the reference table - * relation count. - */ -static uint32 -ReferenceRelationCount(RelationRestrictionContext *restrictionContext) -{ - ListCell *relationRestrictionCell = NULL; - uint32 referenceRelationCount = 0; - - foreach(relationRestrictionCell, restrictionContext->relationRestrictionList) - { - RelationRestriction *relationRestriction = - (RelationRestriction *) lfirst(relationRestrictionCell); - - if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE) - { - referenceRelationCount++; - } - } - - return referenceRelationCount; -} - - /* * GenerateAttributeEquivalencesForRelationRestrictions gets a relation restriction * context and returns a list of AttributeEquivalenceClass. @@ -642,7 +605,7 @@ GetVarFromAssignedParam(List *parentPlannerParamList, Param *plannerParam) /* * GenerateCommonEquivalence gets a list of unrelated AttributeEquiavalanceClass - * whose all members are partition keys. + * whose all members are partition keys or a column of reference table. * * With the equivalence classes, the function follows the algorithm * outlined below: @@ -1092,9 +1055,6 @@ AddUnionSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass ** * class using the rteIdentity provided by the rangeTableEntry. Note that * rteIdentities are only assigned to RTE_RELATIONs and this function asserts * the input rte to be an RTE_RELATION. - * - * Note that this function only adds partition keys to the attributeEquivalanceClass. - * This implies that there wouldn't be any columns for reference tables. */ static void AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass ** @@ -1103,19 +1063,13 @@ AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass ** Var *varToBeAdded) { AttributeEquivalenceClassMember *attributeEqMember = NULL; - Oid relationId = InvalidOid; - Var *relationPartitionKey = NULL; + Oid relationId = rangeTableEntry->relid; + Var *relationPartitionKey = DistPartitionKey(relationId); Assert(rangeTableEntry->rtekind == RTE_RELATION); - relationId = rangeTableEntry->relid; - if (PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) - { - return; - } - - relationPartitionKey = DistPartitionKey(relationId); - if (relationPartitionKey->varattno != varToBeAdded->varattno) + if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE && + relationPartitionKey->varattno != varToBeAdded->varattno) { return; } diff --git a/src/test/regress/expected/multi_reference_table.out b/src/test/regress/expected/multi_reference_table.out index 7b8fcf8ac..fcd5f7c2c 100644 --- a/src/test/regress/expected/multi_reference_table.out +++ b/src/test/regress/expected/multi_reference_table.out @@ -1133,12 +1133,8 @@ FROM WHERE colocated_table_test_2.value_4 = reference_table_test.value_4 RETURNING value_1, value_2; - value_1 | value_2 ----------+--------- - 1 | 1 - 2 | 2 -(2 rows) - +ERROR: cannot perform distributed planning for the given modification +DETAIL: Select query cannot be pushed down to the worker. -- some more complex queries (Note that there are more complex queries in multi_insert_select.sql) INSERT INTO colocated_table_test (value_1, value_2) @@ -1149,12 +1145,8 @@ FROM WHERE colocated_table_test_2.value_2 = reference_table_test.value_2 RETURNING value_1, value_2; - value_1 | value_2 ----------+--------- - 1 | 1 - 2 | 2 -(2 rows) - +ERROR: cannot perform distributed planning for the given modification +DETAIL: Select query cannot be pushed down to the worker. -- partition column value comes from reference table, goes via coordinator INSERT INTO colocated_table_test (value_1, value_2) @@ -1615,7 +1607,7 @@ INSERT INTO reference_table_test VALUES (2, 2.0, '2', '2016-12-02'); SELECT master_modify_multiple_shards('DELETE FROM colocated_table_test'); master_modify_multiple_shards ------------------------------- - 10 + 6 (1 row) ROLLBACK; From c4e3b8b5e12327b91d2adcd97f5213f94eb8d884 Mon Sep 17 00:00:00 2001 From: velioglu Date: Tue, 1 Aug 2017 12:13:49 +0300 Subject: [PATCH 2/6] Add planner changes and tests for subquery on reference tables --- .../planner/insert_select_planner.c | 7 + .../planner/multi_logical_planner.c | 61 +++ .../planner/multi_physical_planner.c | 179 ++++++++- .../distributed/planner/multi_planner.c | 2 + .../planner/multi_router_planner.c | 6 +- .../distributed/multi_logical_planner.h | 1 + .../distributed/multi_physical_planner.h | 1 + src/include/distributed/multi_planner.h | 2 + ...ulti_subquery_complex_reference_clause.out | 376 ++++++++++++++++++ ...lti_subquery_in_where_reference_clause.out | 144 +++++++ src/test/regress/multi_schedule | 2 + ...ulti_subquery_complex_reference_clause.sql | 313 +++++++++++++++ ...lti_subquery_in_where_reference_clause.sql | 130 ++++++ 13 files changed, 1212 insertions(+), 12 deletions(-) create mode 100644 src/test/regress/expected/multi_subquery_complex_reference_clause.out create mode 100644 src/test/regress/expected/multi_subquery_in_where_reference_clause.out create mode 100644 src/test/regress/sql/multi_subquery_complex_reference_clause.sql create mode 100644 src/test/regress/sql/multi_subquery_in_where_reference_clause.sql diff --git a/src/backend/distributed/planner/insert_select_planner.c b/src/backend/distributed/planner/insert_select_planner.c index c7cc50fb8..896163784 100644 --- a/src/backend/distributed/planner/insert_select_planner.c +++ b/src/backend/distributed/planner/insert_select_planner.c @@ -464,6 +464,13 @@ RouterModifyTaskForShardInterval(Query *originalQuery, ShardInterval *shardInter } shardOpExpressions = ShardIntervalOpExpressions(shardInterval, rteIndex); + + /* means it is a reference table and do not add any shard interval information */ + if (shardOpExpressions == NIL) + { + continue; + } + shardRestrictionList = make_simple_restrictinfo((Expr *) shardOpExpressions); extendedBaseRestrictInfo = lappend(extendedBaseRestrictInfo, shardRestrictionList); diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index 8641af7fd..a0ef2c739 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -997,6 +997,23 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList) Expr *targetExpression = targetEntry->expr; bool isPartitionColumn = IsPartitionColumn(targetExpression, query); + Oid relationId = InvalidOid; + Var *column = NULL; + + FindReferencedTableColumn(targetExpression, NIL, query, &relationId, &column); + + /* + * If the expression belongs to reference table directly returns true, + * since logic of caller function checks whether it can find the necessaary + * data from each node. + */ + if (IsDistributedTable(relationId) && PartitionMethod(relationId) == + DISTRIBUTE_BY_NONE) + { + targetListOnPartitionColumn = true; + break; + } + if (isPartitionColumn) { FieldSelect *compositeField = CompositeFieldRecursive(targetExpression, @@ -2728,6 +2745,50 @@ ExtractRangeTableRelationWalker(Node *node, List **rangeTableRelationList) } +/* Get the list of relations from the given node. Note that the difference between + * this function and ExtractRangeTableRelationWalker is that this one recursively + * walk into range table entries if it can. + */ +bool +ExtractRTRelationFromNode(Node *node, List **rangeTableList) +{ + bool walkIsComplete = false; + if (node == NULL) + { + return false; + } + + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) node; + + if (rangeTableEntry->rtekind == RTE_RELATION && + rangeTableEntry->relkind != RELKIND_VIEW) + { + (*rangeTableList) = lappend(*rangeTableList, rangeTableEntry); + } + else if (rangeTableEntry->rtekind == RTE_SUBQUERY) + { + walkIsComplete = query_tree_walker(rangeTableEntry->subquery, + ExtractRTRelationFromNode, + rangeTableList, QTW_EXAMINE_RTES); + } + } + else if (IsA(node, Query)) + { + walkIsComplete = query_tree_walker((Query *) node, ExtractRTRelationFromNode, + rangeTableList, QTW_EXAMINE_RTES); + } + else + { + walkIsComplete = expression_tree_walker(node, ExtractRTRelationFromNode, + rangeTableList); + } + + return walkIsComplete; +} + + /* * ExtractRangeTableEntryWalker walks over a query tree, and finds all range * table entries. For recursing into the query tree, this function uses the diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index fcee39f40..9e3fd4458 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -127,6 +127,8 @@ static Job * BuildJobTreeTaskList(Job *jobTree, static List * SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionContext); static void ErrorIfUnsupportedShardDistribution(Query *query); +static void ErrorIfUnsupportedJoinReferenceTable( + PlannerRestrictionContext *plannerRestrictionContext); static bool CoPartitionedTables(Oid firstRelationId, Oid secondRelationId); static bool ShardIntervalsEqual(FmgrInfo *comparisonFunction, ShardInterval *firstInterval, @@ -195,6 +197,7 @@ static StringInfo MergeTableQueryString(uint32 taskIdIndex, List *targetEntryLis static StringInfo IntermediateTableQueryString(uint64 jobId, uint32 taskIdIndex, Query *reduceQuery); static uint32 FinalTargetEntryCount(List *targetEntryList); +static bool ReferenceTableExist(PlannerInfo *plannerInfo, RelOptInfo *relationInfo); /* @@ -2038,6 +2041,9 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte /* error if shards are not co-partitioned */ ErrorIfUnsupportedShardDistribution(subquery); + /* error if unsupported join on reference tables */ + ErrorIfUnsupportedJoinReferenceTable(plannerRestrictionContext); + /* get list of all range tables in subquery tree */ ExtractRangeTableRelationWalker((Node *) subquery, &rangeTableList); @@ -2061,7 +2067,16 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte break; } - Assert(targetCacheEntry != NULL); + /* + * That means all table are reference table and we can assign any reference + * table as an anchor one . + */ + if (targetCacheEntry == NULL) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) linitial(rangeTableList); + relationId = rangeTableEntry->relid; + targetCacheEntry = DistributedTableCacheEntry(relationId); + } shardCount = targetCacheEntry->shardIntervalArrayLength; for (shardOffset = 0; shardOffset < shardCount; shardOffset++) @@ -2088,6 +2103,146 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte } +/* + * ErrorIfUnsupportedJoinReferenceTable errors out if there exists a outer join + * exist between reference table and distributed tables. + */ +static void +ErrorIfUnsupportedJoinReferenceTable(PlannerRestrictionContext *plannerRestrictionContext) +{ + List *joinRestrictionList = + plannerRestrictionContext->joinRestrictionContext->joinRestrictionList; + ListCell *joinRestrictionCell = NULL; + + foreach(joinRestrictionCell, joinRestrictionList) + { + JoinRestriction *joinRestriction = (JoinRestriction *) lfirst( + joinRestrictionCell); + JoinType joinType = joinRestriction->joinType; + PlannerInfo *plannerInfo = joinRestriction->plannerInfo; + RelOptInfo *innerrel = joinRestriction->innerrel; + RelOptInfo *outerrel = joinRestriction->outerrel; + + switch (joinType) + { + case JOIN_SEMI: + { + if (ReferenceTableExist(plannerInfo, outerrel)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not plan query having reference table on" + " the left part of semi join"))); + } + } + break; + + case JOIN_ANTI: + { + if (ReferenceTableExist(plannerInfo, innerrel)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not plan query having reference table on" + " the left part of anti join"))); + } + } + break; + + case JOIN_LEFT: + { + if (ReferenceTableExist(plannerInfo, outerrel)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not plan query having reference table on" + " the left part of left join"))); + } + } + break; + + case JOIN_RIGHT: + { + if (ReferenceTableExist(plannerInfo, innerrel)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not plan query having reference table on" + " the right part of right join"))); + } + } + break; + + case JOIN_FULL: + { + if (ReferenceTableExist(plannerInfo, innerrel) || ReferenceTableExist( + plannerInfo, outerrel)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg( + "can not plan query having reference table as a" + " part of full join"))); + } + } + break; + + default: + { } + break; + } + } +} + + +/* + * ReferenceTableExist check whether the relationInfo has reference table. + * Since relation ids of relationInfo indexes to the range table entry list of + * query, query is also passed. + */ +static bool +ReferenceTableExist(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) +{ + Relids relids = bms_copy(relationInfo->relids); + int relationId = -1; + + while ((relationId = bms_first_member(relids)) >= 0) + { + RangeTblEntry *rangeTableEntry = plannerInfo->simple_rte_array[relationId]; + + /* relationInfo has this range table entry */ + if (RTEContainsReferenceTable(rangeTableEntry)) + { + return true; + } + } + + return false; +} + + +/* + * RTEContainsReferenceTable checks whether there exist a reference table in the + * given range table entry. + */ +bool +RTEContainsReferenceTable(RangeTblEntry *rangeTableEntry) +{ + List *relationList = NIL; + ListCell *relationCell = NULL; + ExtractRTRelationFromNode((Node *) rangeTableEntry, &relationList); + + foreach(relationCell, relationList) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(relationCell); + Oid relationId = rangeTableEntry->relid; + + if (IsDistributedTable(relationId) && PartitionMethod(relationId) == + DISTRIBUTE_BY_NONE) + { + return true; + } + } + + return false; +} + + /* * ErrorIfUnsupportedShardDistribution gets list of relations in the given query * and checks if two conditions below hold for them, otherwise it errors out. @@ -2121,10 +2276,8 @@ ErrorIfUnsupportedShardDistribution(Query *query) } else { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot push down this subquery"), - errdetail("Currently range and hash partitioned " - "relations are supported"))); + /* do not need to handle reference tables */ + continue; } } @@ -2141,6 +2294,13 @@ ErrorIfUnsupportedShardDistribution(Query *query) Oid relationId = lfirst_oid(relationIdCell); bool coPartitionedTables = false; Oid currentRelationId = relationId; + char partitionMethod = PartitionMethod(relationId); + + /* do not need to check reference tables */ + if (partitionMethod == DISTRIBUTE_BY_NONE) + { + continue; + } /* get shard list of first relation and continue for the next relation */ if (relationIndex == 0) @@ -2298,9 +2458,6 @@ SubqueryTaskCreate(Query *originalQuery, ShardInterval *shardInterval, RestrictInfo *shardRestrictionList = NULL; DeferredErrorMessage *planningError = NULL; - /* such queries should go through router planner */ - Assert(!restrictionContext->allReferenceTables); - /* * Add the restriction qual parameter value in all baserestrictinfos. * Note that this has to be done on a copy, as the originals are needed @@ -2315,6 +2472,12 @@ SubqueryTaskCreate(Query *originalQuery, ShardInterval *shardInterval, shardOpExpressions = ShardIntervalOpExpressions(shardInterval, rteIndex); + /* means it is a reference table and do not add any shard interval info */ + if (shardOpExpressions == NIL) + { + continue; + } + shardRestrictionList = make_simple_restrictinfo((Expr *) shardOpExpressions); extendedBaseRestrictInfo = lappend(extendedBaseRestrictInfo, shardRestrictionList); diff --git a/src/backend/distributed/planner/multi_planner.c b/src/backend/distributed/planner/multi_planner.c index 74af951f2..5de8d5bd1 100644 --- a/src/backend/distributed/planner/multi_planner.c +++ b/src/backend/distributed/planner/multi_planner.c @@ -740,6 +740,8 @@ multi_join_restriction_hook(PlannerInfo *root, joinRestriction->joinType = jointype; joinRestriction->joinRestrictInfoList = restrictInfoList; joinRestriction->plannerInfo = root; + joinRestriction->innerrel = innerrel; + joinRestriction->outerrel = outerrel; joinRestrictionContext->joinRestrictionList = lappend(joinRestrictionContext->joinRestrictionList, joinRestriction); diff --git a/src/backend/distributed/planner/multi_router_planner.c b/src/backend/distributed/planner/multi_router_planner.c index ac71a5f29..7734ae32c 100644 --- a/src/backend/distributed/planner/multi_router_planner.c +++ b/src/backend/distributed/planner/multi_router_planner.c @@ -300,10 +300,8 @@ ShardIntervalOpExpressions(ShardInterval *shardInterval, Index rteIndex) } else { - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("cannot create shard interval operator expression for " - "distributed relations other than hash, range and append distributed " - "relations"))); + /* do not add any shard range interval for reference tables */ + return NIL; } /* build the base expression for constraint */ diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index 594f2ce11..f73deb79b 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -206,6 +206,7 @@ extern List * TableEntryList(List *rangeTableList); extern List * UsedTableEntryList(Query *query); extern bool ExtractRangeTableRelationWalker(Node *node, List **rangeTableList); extern bool ExtractRangeTableEntryWalker(Node *node, List **rangeTableList); +extern bool ExtractRTRelationFromNode(Node *node, List **rangeTableList); extern List * pull_var_clause_default(Node *node); extern bool OperatorImplementsEquality(Oid opno); diff --git a/src/include/distributed/multi_physical_planner.h b/src/include/distributed/multi_physical_planner.h index bbff3a83e..5f2f08f1d 100644 --- a/src/include/distributed/multi_physical_planner.h +++ b/src/include/distributed/multi_physical_planner.h @@ -286,6 +286,7 @@ extern Const * MakeInt4Constant(Datum constantValue); extern int CompareShardPlacements(const void *leftElement, const void *rightElement); extern bool ShardIntervalsOverlap(ShardInterval *firstInterval, ShardInterval *secondInterval); +extern bool RTEContainsReferenceTable(RangeTblEntry *rangeTableEntry); /* function declarations for Task and Task list operations */ extern bool TasksEqual(const Task *a, const Task *b); diff --git a/src/include/distributed/multi_planner.h b/src/include/distributed/multi_planner.h index 952f2fa53..25c2ee4c5 100644 --- a/src/include/distributed/multi_planner.h +++ b/src/include/distributed/multi_planner.h @@ -52,6 +52,8 @@ typedef struct JoinRestriction JoinType joinType; List *joinRestrictInfoList; PlannerInfo *plannerInfo; + RelOptInfo *innerrel; + RelOptInfo *outerrel; } JoinRestriction; typedef struct PlannerRestrictionContext diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out new file mode 100644 index 000000000..6a28a0226 --- /dev/null +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -0,0 +1,376 @@ +-- +-- multi subquery complex queries aims to expand existing subquery pushdown +-- regression tests to cover more caeses +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; +ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; + +SET citus.enable_router_execution TO FALSE; +CREATE TABLE events_reference_table as SELECT * FROM events_table; +CREATE TABLE users_reference_table as SELECT * FROM users_table; +SELECT create_reference_table('events_reference_table'); +NOTICE: Copying data from local table... + create_reference_table +------------------------ + +(1 row) + +SELECT create_reference_table('users_reference_table'); +NOTICE: Copying data from local table... + create_reference_table +------------------------ + +(1 row) + +-- LATERAL JOINs used with INNER JOINs with reference tables +SET citus.subquery_pushdown to ON; +SELECT user_id, lastseen +FROM + (SELECT + "some_users_data".user_id, lastseen + FROM + (SELECT + filter_users_1.user_id, time AS lastseen + FROM + (SELECT + user_where_1_1.user_id + FROM + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) + filter_users_1 + JOIN LATERAL + (SELECT + user_id, time + FROM + events_reference_table as "events" + WHERE + user_id > 12 and user_id < 16 AND + user_id = filter_users_1.user_id + ORDER BY + time DESC + LIMIT 1) "last_events_1" + ON TRUE + ORDER BY + time DESC + LIMIT 10) "some_recent_users" + JOIN LATERAL + (SELECT + "users".user_id + FROM + users_reference_table as "users" + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND + "users"."value_2" > 70 + LIMIT 1) "some_users_data" + ON TRUE + ORDER BY + lastseen DESC + LIMIT 10) "some_users" +ORDER BY + user_id DESC +LIMIT 10; + user_id | lastseen +---------+--------------------------------- + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 + 14 | Tue Jan 21 05:46:51.286381 2014 +(10 rows) + +SET citus.subquery_pushdown to OFF; +-- NESTED INNER JOINs with reference tables +SELECT + count(*) AS value, "generated_group_field" + FROM + (SELECT + DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field" + FROM + (SELECT + "eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" + FROM + (SELECT + * + FROM + (SELECT + "events"."time", "events"."user_id", "events"."value_2" + FROM + events_reference_table as "events" + WHERE + user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" + INNER JOIN + (SELECT + user_where_1_1.real_user_id + FROM + (SELECT + "users"."user_id" as real_user_id + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_2 > 50 ) user_where_1_1 + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 + ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" + ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" +GROUP BY + "generated_group_field" +ORDER BY + generated_group_field DESC, value DESC; + value | generated_group_field +-------+----------------------- + 1 | 966 + 1 | 917 + 1 | 905 + 1 | 868 + 1 | 836 + 1 | 791 + 1 | 671 + 1 | 642 + 1 | 358 + 1 | 317 + 1 | 307 + 1 | 302 + 1 | 214 + 1 | 166 + 1 | 116 + 1 | 1 +(16 rows) + +-- single level inner joins with reference tables +SELECT + "value_3", count(*) AS cnt +FROM + (SELECT + "value_3", "user_id", random() + FROM + (SELECT + users_in_segment_1.user_id, value_3 + FROM + (SELECT + user_id, value_3 * 2 as value_3 + FROM + (SELECT + user_id, value_3 + FROM + (SELECT + "users"."user_id", value_3 + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_2 > 30 + ) simple_user_where_1 + ) all_buckets_1 + ) users_in_segment_1 + JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_2 > 60 + ) some_users_data + ON ("users_in_segment_1".user_id = "some_users_data".user_id) + ) segmentalias_1) "tempQuery" +GROUP BY "value_3" +ORDER BY cnt, value_3 DESC LIMIT 10; + value_3 | cnt +---------+----- + 556 | 75 + 228 | 75 + 146 | 75 + 70 | 75 + 1442 | 79 + 1232 | 79 + 1090 | 79 + 1012 | 79 + 886 | 79 + 674 | 79 +(10 rows) + +-- nested LATERAL JOINs with reference tables +SET citus.subquery_pushdown to ON; +SELECT * +FROM + (SELECT "some_users_data".user_id, "some_recent_users".value_3 + FROM + (SELECT + filter_users_1.user_id, value_3 + FROM + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 20 and user_id < 70 and users.value_2 = 200) filter_users_1 + JOIN LATERAL + (SELECT + user_id, value_3 + FROM + events_reference_table as "events" + WHERE + user_id > 20 and user_id < 70 AND + ("events".user_id = "filter_users_1".user_id) + ORDER BY + value_3 DESC + LIMIT 1) "last_events_1" ON true + ORDER BY value_3 DESC + LIMIT 10) "some_recent_users" + JOIN LATERAL + (SELECT + "users".user_id + FROM + users_reference_table as "users" + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND + users.value_2 > 200 + LIMIT 1) "some_users_data" ON true + ORDER BY + value_3 DESC +LIMIT 10) "some_users" +ORDER BY + value_3 DESC +LIMIT 10; + user_id | value_3 +---------+--------- + 44 | 998 + 65 | 996 + 66 | 996 + 37 | 995 + 57 | 989 + 21 | 985 +(6 rows) + +SET citus.subquery_pushdown to OFF; +-- LEFT JOINs used with INNER JOINs should error out since reference table exist in the +-- left side of the LEFT JOIN. +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" + LEFT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; +ERROR: can not plan query having reference table on the left part of left join + -- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the +-- right side of the RIGHT JOIN. +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_reference_table as "users") "right_group_by_1" + ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; +ERROR: can not plan query having reference table on the left part of left join + -- Outer subquery with reference table +SELECT "some_users_data".user_id, lastseen +FROM + (SELECT user_id, max(time) AS lastseen + FROM + (SELECT user_id, time + FROM + (SELECT + user_id, time + FROM + events_reference_table as "events" + WHERE + user_id > 10 and user_id < 40) "events_1" + ORDER BY + time DESC) "recent_events_1" + GROUP BY + user_id + ORDER BY + max(TIME) DESC) "some_recent_users" + FULL JOIN + (SELECT + "users".user_id + FROM + users_table as "users" + WHERE + users.value_2 > 50 and users.value_2 < 55) "some_users_data" + ON "some_users_data"."user_id" = "some_recent_users"."user_id" +ORDER BY + user_id +limit 50; +ERROR: can not plan query having reference table as a part of full join diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out new file mode 100644 index 000000000..0159510b2 --- /dev/null +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -0,0 +1,144 @@ +-- +-- queries to test the subquery pushdown on reference tables +-- subqueries in WHERE with greater operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + user_id +--------- + 49 + 55 + 56 + 63 +(4 rows) + +-- have reference table without any equality, should error out +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- users that appeared more than 118 times, should run since the reference table +-- on the right side of the semi join +SELECT + user_id +FROM + users_table +WHERE 118 <= + (SELECT + count(*) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id + GROUP BY + user_id) +GROUP BY + user_id +ORDER BY + user_id; + user_id +--------- + 13 + 17 + 23 + 25 +(4 rows) + +-- should error out since reference table exist on the left side +-- of the left lateral join +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_reference_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_reference_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_reference_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_reference_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_reference_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; +ERROR: can not plan query having reference table on the left part of left join +DROP TABLE events_reference_table; +DROP TABLE users_reference_table; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 9311b5d23..b860d0e71 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -51,6 +51,8 @@ test: multi_deparse_shard_query multi_distributed_transaction_id test: multi_basic_queries multi_complex_expressions test: multi_explain test: multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics +test: multi_subquery_complex_reference_clause +test: multi_subquery_in_where_reference_clause test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc test: multi_reference_table test: multi_outer_join_reference diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql new file mode 100644 index 000000000..337b4f52d --- /dev/null +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -0,0 +1,313 @@ +-- +-- multi subquery complex queries aims to expand existing subquery pushdown +-- regression tests to cover more caeses +-- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql +-- + +-- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests +-- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1400000; +ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; + +SET citus.enable_router_execution TO FALSE; + +CREATE TABLE events_reference_table as SELECT * FROM events_table; +CREATE TABLE users_reference_table as SELECT * FROM users_table; + +SELECT create_reference_table('events_reference_table'); +SELECT create_reference_table('users_reference_table'); + +-- LATERAL JOINs used with INNER JOINs with reference tables +SET citus.subquery_pushdown to ON; +SELECT user_id, lastseen +FROM + (SELECT + "some_users_data".user_id, lastseen + FROM + (SELECT + filter_users_1.user_id, time AS lastseen + FROM + (SELECT + user_where_1_1.user_id + FROM + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 12 and user_id < 16 and value_1 > 20) user_where_1_1 + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 12 and user_id < 16 and value_2 > 60) user_where_1_join_1 + ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) + filter_users_1 + JOIN LATERAL + (SELECT + user_id, time + FROM + events_reference_table as "events" + WHERE + user_id > 12 and user_id < 16 AND + user_id = filter_users_1.user_id + ORDER BY + time DESC + LIMIT 1) "last_events_1" + ON TRUE + ORDER BY + time DESC + LIMIT 10) "some_recent_users" + JOIN LATERAL + (SELECT + "users".user_id + FROM + users_reference_table as "users" + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND + "users"."value_2" > 70 + LIMIT 1) "some_users_data" + ON TRUE + ORDER BY + lastseen DESC + LIMIT 10) "some_users" +ORDER BY + user_id DESC +LIMIT 10; +SET citus.subquery_pushdown to OFF; + +-- NESTED INNER JOINs with reference tables +SELECT + count(*) AS value, "generated_group_field" + FROM + (SELECT + DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field" + FROM + (SELECT + "eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" + FROM + (SELECT + * + FROM + (SELECT + "events"."time", "events"."user_id", "events"."value_2" + FROM + events_reference_table as "events" + WHERE + user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries" + INNER JOIN + (SELECT + user_where_1_1.real_user_id + FROM + (SELECT + "users"."user_id" as real_user_id + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_2 > 50 ) user_where_1_1 + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_3 > 50 ) user_where_1_join_1 + ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" + ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" +GROUP BY + "generated_group_field" +ORDER BY + generated_group_field DESC, value DESC; + +-- single level inner joins with reference tables +SELECT + "value_3", count(*) AS cnt +FROM + (SELECT + "value_3", "user_id", random() + FROM + (SELECT + users_in_segment_1.user_id, value_3 + FROM + (SELECT + user_id, value_3 * 2 as value_3 + FROM + (SELECT + user_id, value_3 + FROM + (SELECT + "users"."user_id", value_3 + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_2 > 30 + ) simple_user_where_1 + ) all_buckets_1 + ) users_in_segment_1 + JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 10 and user_id < 40 and value_2 > 60 + ) some_users_data + ON ("users_in_segment_1".user_id = "some_users_data".user_id) + ) segmentalias_1) "tempQuery" +GROUP BY "value_3" +ORDER BY cnt, value_3 DESC LIMIT 10; + +-- nested LATERAL JOINs with reference tables +SET citus.subquery_pushdown to ON; +SELECT * +FROM + (SELECT "some_users_data".user_id, "some_recent_users".value_3 + FROM + (SELECT + filter_users_1.user_id, value_3 + FROM + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 20 and user_id < 70 and users.value_2 = 200) filter_users_1 + JOIN LATERAL + (SELECT + user_id, value_3 + FROM + events_reference_table as "events" + WHERE + user_id > 20 and user_id < 70 AND + ("events".user_id = "filter_users_1".user_id) + ORDER BY + value_3 DESC + LIMIT 1) "last_events_1" ON true + ORDER BY value_3 DESC + LIMIT 10) "some_recent_users" + JOIN LATERAL + (SELECT + "users".user_id + FROM + users_reference_table as "users" + WHERE + "users"."user_id" = "some_recent_users"."user_id" AND + users.value_2 > 200 + LIMIT 1) "some_users_data" ON true + ORDER BY + value_3 DESC +LIMIT 10) "some_users" +ORDER BY + value_3 DESC +LIMIT 10; +SET citus.subquery_pushdown to OFF; + +-- LEFT JOINs used with INNER JOINs should error out since reference table exist in the +-- left side of the LEFT JOIN. +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_reference_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" + LEFT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_table as "users") "left_group_by_1" + ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + + -- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the +-- right side of the RIGHT JOIN. +SELECT +count(*) AS cnt, "generated_group_field" + FROM + (SELECT + "eventQuery"."user_id", random(), generated_group_field + FROM + (SELECT + "multi_group_wrapper_1".*, generated_group_field, random() + FROM + (SELECT * + FROM + (SELECT + "events"."time", "events"."user_id" as event_user_id + FROM + events_table as "events" + WHERE + user_id > 80) "temp_data_queries" + INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + user_id > 80 and value_2 = 5) "user_filters_1" + ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" + RIGHT JOIN + (SELECT + "users"."user_id" AS "user_id", value_2 AS "generated_group_field" + FROM + users_reference_table as "users") "right_group_by_1" + ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" + group BY + "generated_group_field" + ORDER BY + cnt DESC, generated_group_field ASC + LIMIT 10; + + -- Outer subquery with reference table +SELECT "some_users_data".user_id, lastseen +FROM + (SELECT user_id, max(time) AS lastseen + FROM + (SELECT user_id, time + FROM + (SELECT + user_id, time + FROM + events_reference_table as "events" + WHERE + user_id > 10 and user_id < 40) "events_1" + ORDER BY + time DESC) "recent_events_1" + GROUP BY + user_id + ORDER BY + max(TIME) DESC) "some_recent_users" + FULL JOIN + (SELECT + "users".user_id + FROM + users_table as "users" + WHERE + users.value_2 > 50 and users.value_2 < 55) "some_users_data" + ON "some_users_data"."user_id" = "some_recent_users"."user_id" +ORDER BY + user_id +limit 50; \ No newline at end of file diff --git a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql new file mode 100644 index 000000000..0126f362d --- /dev/null +++ b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql @@ -0,0 +1,130 @@ +-- +-- queries to test the subquery pushdown on reference tables + +-- subqueries in WHERE with greater operator +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id AND event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + +-- have reference table without any equality, should error out +SELECT + user_id +FROM + users_table +WHERE + value_2 > + (SELECT + max(value_2) + FROM + events_reference_table + WHERE + event_type = 50 + GROUP BY + user_id + ) +GROUP BY user_id +HAVING count(*) > 66 +ORDER BY user_id +LIMIT 5; + +-- users that appeared more than 118 times, should run since the reference table +-- on the right side of the semi join +SELECT + user_id +FROM + users_table +WHERE 118 <= + (SELECT + count(*) + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id + GROUP BY + user_id) +GROUP BY + user_id +ORDER BY + user_id; + +-- should error out since reference table exist on the left side +-- of the left lateral join +SELECT user_id, value_2 FROM users_table WHERE + value_1 > 101 AND value_1 < 110 + AND value_2 >= 5 + AND user_id IN + ( + SELECT + e1.user_id + FROM ( + -- Get the first time each user viewed the homepage. + SELECT + user_id, + 1 AS view_homepage, + min(time) AS view_homepage_time + FROM events_reference_table + WHERE + event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90) + GROUP BY user_id + ) e1 LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS use_demo, + time AS use_demo_time + FROM events_reference_table + WHERE + user_id = e1.user_id AND + event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91) + ORDER BY time + ) e2 ON true LEFT JOIN LATERAL ( + SELECT + user_id, + 1 AS enter_credit_card, + time AS enter_credit_card_time + FROM events_reference_table + WHERE + user_id = e2.user_id AND + event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92) + ORDER BY time + ) e3 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS submit_card_info, + user_id, + time AS enter_credit_card_time + FROM events_reference_table + WHERE + user_id = e3.user_id AND + event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93) + ORDER BY time + ) e4 ON true LEFT JOIN LATERAL ( + SELECT + 1 AS see_bought_screen + FROM events_reference_table + WHERE + user_id = e4.user_id AND + event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94) + ORDER BY time + ) e5 ON true + group by e1.user_id + HAVING sum(submit_card_info) > 0 +) +ORDER BY 1, 2; + +DROP TABLE events_reference_table; +DROP TABLE users_reference_table; \ No newline at end of file From 0359d035303fc6185b4eeb30e6e962869b5b5cf9 Mon Sep 17 00:00:00 2001 From: velioglu Date: Wed, 2 Aug 2017 15:42:40 +0300 Subject: [PATCH 3/6] Add set operation check for reference tables --- .../planner/multi_physical_planner.c | 54 +++++ ...ulti_subquery_complex_reference_clause.out | 190 +++++++++++++++++ ...ulti_subquery_complex_reference_clause.sql | 192 +++++++++++++++++- 3 files changed, 435 insertions(+), 1 deletion(-) diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 9e3fd4458..a70527b70 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -198,6 +198,7 @@ static StringInfo IntermediateTableQueryString(uint64 jobId, uint32 taskIdIndex, Query *reduceQuery); static uint32 FinalTargetEntryCount(List *targetEntryList); static bool ReferenceTableExist(PlannerInfo *plannerInfo, RelOptInfo *relationInfo); +static void ErrorIfSetOpWithReferenceTable(Query *queryTree); /* @@ -2044,6 +2045,9 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte /* error if unsupported join on reference tables */ ErrorIfUnsupportedJoinReferenceTable(plannerRestrictionContext); + /* error if reference table exists as a part of any set operation */ + ErrorIfSetOpWithReferenceTable(subquery); + /* get list of all range tables in subquery tree */ ExtractRangeTableRelationWalker((Node *) subquery, &rangeTableList); @@ -2243,6 +2247,56 @@ RTEContainsReferenceTable(RangeTblEntry *rangeTableEntry) } +/* + * ErrorIfSetOpWithReferenceTable checks whether there exist a reference table + * as a part of any set operation. + */ +static void +ErrorIfSetOpWithReferenceTable(Query *queryTree) +{ + List *joinTreeTableIndexList = NIL; + Index subqueryRteIndex = 0; + RangeTblEntry *rangeTableEntry = NULL; + Query *subqueryTree = NULL; + List *rangeTableList = queryTree->rtable; + Node *setOperations = queryTree->setOperations; + ExtractRangeTableIndexWalker((Node *) queryTree->jointree, &joinTreeTableIndexList); + + if (setOperations != NULL) + { + List *rangeTableList = NIL; + ListCell *rangeTableCell = NULL; + ExtractRangeTableRelationWalker((Node *) queryTree, &rangeTableList); + + foreach(rangeTableCell, rangeTableList) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell); + Oid relationId = rangeTableEntry->relid; + if (PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg( + "can not plan query having reference table with union"))); + } + } + } + + if (list_length(joinTreeTableIndexList) < 1) + { + return; + } + + subqueryRteIndex = linitial_int(joinTreeTableIndexList); + rangeTableEntry = rt_fetch(subqueryRteIndex, rangeTableList); + subqueryTree = rangeTableEntry->subquery; + + if (subqueryTree != NULL) + { + return ErrorIfSetOpWithReferenceTable(subqueryTree); + } +} + + /* * ErrorIfUnsupportedShardDistribution gets list of relations in the given query * and checks if two conditions below hold for them, otherwise it errors out. diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 6a28a0226..d57954b5b 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -374,3 +374,193 @@ ORDER BY user_id limit 50; ERROR: can not plan query having reference table as a part of full join + -- + -- UNIONs and JOINs with reference tables, shoukld error out + -- +SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +FROM + ( SELECT *, random() + FROM + ( SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" + FROM + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events + FROM ( + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 0 AS event + FROM + events_reference_table as "events" + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 1 AS event + FROM + events_table as "events" + WHERE + event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) + UNION + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 2 AS event + FROM + events_table as "events" + WHERE + event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) + UNION + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 3 AS event + FROM + events_table as "events" + WHERE + event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 + GROUP BY "t1"."user_id") AS t) "q" +INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + value_1 > 50 and value_1 < 70) AS t + ON (t.user_id = q.user_id)) as final_query +GROUP BY + types +ORDER BY + types; +ERROR: can not plan query having reference table with union + -- reference table exist in the subquery of union, should error out +SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +FROM + ( SELECT + *, random() + FROM + (SELECT + "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" + FROM + ( SELECT + "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events + FROM ( + (SELECT + * + FROM + (SELECT + "events"."time", 0 AS event, "events"."user_id" + FROM + events_table as "events" + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT * + FROM + ( + SELECT * FROM + ( + SELECT + max("events"."time"), + 0 AS event, + "events"."user_id" + FROM + events_reference_table as "events", users_table as "users" + WHERE + events.user_id = users.user_id AND + event_type IN (10, 11, 12, 13, 14, 15) + GROUP BY "events"."user_id" + ) as events_subquery_5 + ) events_subquery_2) + UNION + (SELECT * + FROM + (SELECT + "events"."time", 2 AS event, "events"."user_id" + FROM + events_table as "events" + WHERE + event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) + UNION + (SELECT * + FROM + (SELECT + "events"."time", 3 AS event, "events"."user_id" + FROM + events_table as "events" + WHERE + event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4) + ) t1 + GROUP BY "t1"."user_id") AS t) "q" +INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + value_1 > 50 and value_1 < 70) AS t + ON (t.user_id = q.user_id)) as final_query +GROUP BY + types +ORDER BY + types; +ERROR: can not plan query having reference table with union +-- +-- Should error out with UNION ALL Queries on reference tables +-- +SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +FROM + ( SELECT *, random() + FROM + ( SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" + FROM + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events + FROM ( + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 0 AS event + FROM + events_table as "events" + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION ALL + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 1 AS event + FROM + events_table as "events" + WHERE + event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) + UNION ALL + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 2 AS event + FROM + events_reference_table as "events" + WHERE + event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) + UNION ALL + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 3 AS event + FROM + events_table as "events" + WHERE + event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 + GROUP BY "t1"."user_id") AS t) "q" +INNER JOIN + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query +GROUP BY types +ORDER BY types; +ERROR: can not plan query having reference table with union diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index 337b4f52d..103a3dbbe 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -310,4 +310,194 @@ FROM ON "some_users_data"."user_id" = "some_recent_users"."user_id" ORDER BY user_id -limit 50; \ No newline at end of file +limit 50; + + -- + -- UNIONs and JOINs with reference tables, shoukld error out + -- +SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +FROM + ( SELECT *, random() + FROM + ( SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" + FROM + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events + FROM ( + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 0 AS event + FROM + events_reference_table as "events" + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 1 AS event + FROM + events_table as "events" + WHERE + event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) + UNION + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 2 AS event + FROM + events_table as "events" + WHERE + event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) + UNION + (SELECT + * + FROM + (SELECT + "events"."user_id", "events"."time", 3 AS event + FROM + events_table as "events" + WHERE + event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 + GROUP BY "t1"."user_id") AS t) "q" +INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + value_1 > 50 and value_1 < 70) AS t + ON (t.user_id = q.user_id)) as final_query +GROUP BY + types +ORDER BY + types; + + -- reference table exist in the subquery of union, should error out +SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +FROM + ( SELECT + *, random() + FROM + (SELECT + "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" + FROM + ( SELECT + "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events + FROM ( + (SELECT + * + FROM + (SELECT + "events"."time", 0 AS event, "events"."user_id" + FROM + events_table as "events" + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION + (SELECT * + FROM + ( + SELECT * FROM + ( + SELECT + max("events"."time"), + 0 AS event, + "events"."user_id" + FROM + events_reference_table as "events", users_table as "users" + WHERE + events.user_id = users.user_id AND + event_type IN (10, 11, 12, 13, 14, 15) + GROUP BY "events"."user_id" + ) as events_subquery_5 + ) events_subquery_2) + UNION + (SELECT * + FROM + (SELECT + "events"."time", 2 AS event, "events"."user_id" + FROM + events_table as "events" + WHERE + event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) + UNION + (SELECT * + FROM + (SELECT + "events"."time", 3 AS event, "events"."user_id" + FROM + events_table as "events" + WHERE + event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4) + ) t1 + GROUP BY "t1"."user_id") AS t) "q" +INNER JOIN + (SELECT + "users"."user_id" + FROM + users_table as "users" + WHERE + value_1 > 50 and value_1 < 70) AS t + ON (t.user_id = q.user_id)) as final_query +GROUP BY + types +ORDER BY + types; + +-- +-- Should error out with UNION ALL Queries on reference tables +-- +SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType +FROM + ( SELECT *, random() + FROM + ( SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" + FROM + ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events + FROM ( + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 0 AS event + FROM + events_table as "events" + WHERE + event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1) + UNION ALL + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 1 AS event + FROM + events_table as "events" + WHERE + event_type IN (15, 16, 17, 18, 19) ) events_subquery_2) + UNION ALL + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 2 AS event + FROM + events_reference_table as "events" + WHERE + event_type IN (20, 21, 22, 23, 24, 25) ) events_subquery_3) + UNION ALL + (SELECT * + FROM + (SELECT + "events"."user_id", "events"."time", 3 AS event + FROM + events_table as "events" + WHERE + event_type IN (26, 27, 28, 29, 30, 13)) events_subquery_4)) t1 + GROUP BY "t1"."user_id") AS t) "q" +INNER JOIN + (SELECT "users"."user_id" + FROM users_table as "users" + WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query +GROUP BY types +ORDER BY types; \ No newline at end of file From ceba81ce354e0fe4dafcb1f8d3ba704d740178a7 Mon Sep 17 00:00:00 2001 From: velioglu Date: Tue, 8 Aug 2017 09:16:27 +0300 Subject: [PATCH 4/6] Move physical planner checks to logical planner --- .../planner/multi_logical_planner.c | 234 ++++++++++++++++-- .../planner/multi_physical_planner.c | 228 ++--------------- .../planner/multi_router_planner.c | 2 +- .../relation_restriction_equivalence.c | 15 ++ .../distributed/multi_logical_planner.h | 2 +- .../distributed/multi_physical_planner.h | 2 +- ...ulti_subquery_complex_reference_clause.out | 18 +- ...lti_subquery_in_where_reference_clause.out | 3 +- 8 files changed, 258 insertions(+), 246 deletions(-) diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index a0ef2c739..16fb25336 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -85,6 +85,8 @@ static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query); static bool FullCompositeFieldList(List *compositeFieldList); static MultiNode * MultiPlanTree(Query *queryTree); static void ErrorIfQueryNotSupported(Query *queryTree); +static bool HasUnsupportedReferenceTableJoin( + PlannerRestrictionContext *plannerRestrictionContext); static bool HasUnsupportedJoinWalker(Node *node, void *context); static bool ErrorHintRequired(const char *errorHint, Query *queryTree); static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryRepartition(Query * @@ -94,6 +96,8 @@ static bool HasOuterJoin(Query *queryTree); static bool HasOuterJoinWalker(Node *node, void *maxJoinLevel); static bool HasComplexJoinOrder(Query *queryTree); static bool HasComplexRangeTableType(Query *queryTree); +static bool RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, + RelOptInfo *relationInfo); static void ValidateClauseList(List *clauseList); static void ValidateSubqueryPushdownClauseList(List *clauseList); static bool ExtractFromExpressionWalker(Node *node, @@ -188,7 +192,6 @@ MultiLogicalPlanCreate(Query *originalQuery, Query *queryTree, { originalQuery = (Query *) ResolveExternalParams((Node *) originalQuery, boundParams); - multiQueryNode = MultiSubqueryPlanTree(originalQuery, queryTree, plannerRestrictionContext); } @@ -539,6 +542,13 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, "one another relation using distribution keys and " "equality operator.", NULL); } + else if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot pushdown the subquery", + "There exist a reference table in the outer part of the outer join", + NULL); + } /* * We first extract all the queries that appear in the original query. Later, @@ -871,6 +881,10 @@ DeferErrorIfUnsupportedUnionQuery(Query *subqueryTree, { SetOperationStmt *setOperation = (SetOperationStmt *) lfirst(setOperationStatmentCell); + Node *leftArg = setOperation->larg; + Node *rightArg = setOperation->rarg; + int leftArgRTI = 0; + int rightArgRTI = 0; if (setOperation->op != SETOP_UNION) { @@ -878,6 +892,36 @@ DeferErrorIfUnsupportedUnionQuery(Query *subqueryTree, "cannot push down this subquery", "Intersect and Except are currently unsupported", NULL); } + + if (IsA(leftArg, RangeTblRef)) + { + Node *leftArgSubquery = NULL; + leftArgRTI = ((RangeTblRef *) leftArg)->rtindex; + leftArgSubquery = (Node *) rt_fetch(leftArgRTI, + subqueryTree->rtable)->subquery; + if (HasReferenceTable(leftArgSubquery)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot push down this subquery ", + "Reference tables are not supported with union" + " operator", NULL); + } + } + + if (IsA(rightArg, RangeTblRef)) + { + Node *rightArgSubquery = NULL; + rightArgRTI = ((RangeTblRef *) rightArg)->rtindex; + rightArgSubquery = (Node *) rt_fetch(rightArgRTI, + subqueryTree->rtable)->subquery; + if (HasReferenceTable(rightArgSubquery)) + { + return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, + "cannot push down this subquery", + "Reference tables are not supported with union" + " operator", NULL); + } + } } return NULL; @@ -982,7 +1026,7 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree) /* * TargetListOnPartitionColumn checks if at least one target list entry is on - * partition column. + * partition column or the table is a reference table. */ static bool TargetListOnPartitionColumn(Query *query, List *targetEntryList) @@ -1003,9 +1047,9 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList) FindReferencedTableColumn(targetExpression, NIL, query, &relationId, &column); /* - * If the expression belongs to reference table directly returns true, - * since logic of caller function checks whether it can find the necessaary - * data from each node. + * If the expression belongs to reference table directly returns true. + * We can assume that target list entry always on partition column of + * reference tables. */ if (IsDistributedTable(relationId) && PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) @@ -1380,6 +1424,149 @@ MultiPlanTree(Query *queryTree) } +/* + * HasUnsupportedReferenceTableJoin returns true if there exists a outer join + * exist between reference table and distributed tables which does not obey the + * rules : + * - Reference tables can not be located in the outer part of the semi join (or + * the inner part of the anti join). Otherwise, we may have duplicate results. + * Although getting duplicate results is not possible by checking the equality + * on the column of the reference table and partition column of distributed table, + * we still keep these checks. Because, using the reference table in the outer + * part of the semi join is not very common. + * - Reference tables can not be located in the outer part of the left join and + * inner part of the right join. Otherwise we will definitely have duplicate rows. + * Beside, reference tables can not be used with full outer joins because of the + * same reason. + */ +static bool +HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionContext) +{ + List *joinRestrictionList = + plannerRestrictionContext->joinRestrictionContext->joinRestrictionList; + ListCell *joinRestrictionCell = NULL; + + foreach(joinRestrictionCell, joinRestrictionList) + { + JoinRestriction *joinRestriction = (JoinRestriction *) lfirst( + joinRestrictionCell); + JoinType joinType = joinRestriction->joinType; + PlannerInfo *plannerInfo = joinRestriction->plannerInfo; + RelOptInfo *innerrel = joinRestriction->innerrel; + RelOptInfo *outerrel = joinRestriction->outerrel; + + switch (joinType) + { + case JOIN_SEMI: + { + if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) + { + return true; + } + } + break; + + case JOIN_ANTI: + { + if (RelationInfoHasReferenceTable(plannerInfo, innerrel)) + { + return true; + } + } + break; + + case JOIN_LEFT: + { + if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) + { + return true; + } + } + break; + + case JOIN_RIGHT: + { + if (RelationInfoHasReferenceTable(plannerInfo, innerrel)) + { + return true; + } + } + break; + + case JOIN_FULL: + { + if (RelationInfoHasReferenceTable(plannerInfo, innerrel) || + RelationInfoHasReferenceTable( + plannerInfo, outerrel)) + { + return true; + } + } + break; + + default: + { } + break; + } + } + + return false; +} + + +/* + * ReferenceTableExist check whether the relationInfo has reference table. + * Since relation ids of relationInfo indexes to the range table entry list of + * planner info, planner info is also passed. + */ +static bool +RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) +{ + Relids relids = bms_copy(relationInfo->relids); + int relationId = -1; + + while ((relationId = bms_first_member(relids)) >= 0) + { + RangeTblEntry *rangeTableEntry = plannerInfo->simple_rte_array[relationId]; + + /* relationInfo has this range table entry */ + if (HasReferenceTable((Node *) rangeTableEntry)) + { + return true; + } + } + + return false; +} + + +/* + * HasReferenceTable checks whether there exist a reference table in the + * given node. + */ +bool +HasReferenceTable(Node *node) +{ + List *relationList = NIL; + ListCell *relationCell = NULL; + ExtractRangeTableRelationWalkerInRTE(node, &relationList); + + foreach(relationCell, relationList) + { + RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(relationCell); + Oid relationId = rangeTableEntry->relid; + + if (IsDistributedTable(relationId) && PartitionMethod(relationId) == + DISTRIBUTE_BY_NONE) + { + return true; + } + } + + return false; +} + + /* * ErrorIfQueryNotSupported checks that we can perform distributed planning for * the given query. The checks in this function will be removed as we support @@ -2745,44 +2932,41 @@ ExtractRangeTableRelationWalker(Node *node, List **rangeTableRelationList) } -/* Get the list of relations from the given node. Note that the difference between - * this function and ExtractRangeTableRelationWalker is that this one recursively +/* + * ExtractRangeTableRelationWalkerInRTE obtains the list of relations from the + * given node. Note that the difference between this function and + * ExtractRangeTableRelationWalker is that this one recursively * walk into range table entries if it can. */ bool -ExtractRTRelationFromNode(Node *node, List **rangeTableList) +ExtractRangeTableRelationWalkerInRTE(Node *node, List **rangeTableRelationList) { bool walkIsComplete = false; + if (node == NULL) { - return false; + return walkIsComplete; } - - if (IsA(node, RangeTblEntry)) + else if (IsA(node, RangeTblEntry)) { RangeTblEntry *rangeTableEntry = (RangeTblEntry *) node; + List *rangeTableList = NIL; + rangeTableList = lappend(rangeTableList, rangeTableEntry); - if (rangeTableEntry->rtekind == RTE_RELATION && - rangeTableEntry->relkind != RELKIND_VIEW) + if (rangeTableEntry->rtekind == RTE_RELATION) { - (*rangeTableList) = lappend(*rangeTableList, rangeTableEntry); + (*rangeTableRelationList) = lappend(*rangeTableRelationList, rangeTableEntry); } - else if (rangeTableEntry->rtekind == RTE_SUBQUERY) + else { - walkIsComplete = query_tree_walker(rangeTableEntry->subquery, - ExtractRTRelationFromNode, - rangeTableList, QTW_EXAMINE_RTES); + walkIsComplete = range_table_walker(rangeTableList, + ExtractRangeTableRelationWalkerInRTE, + rangeTableRelationList, 0); } } - else if (IsA(node, Query)) - { - walkIsComplete = query_tree_walker((Query *) node, ExtractRTRelationFromNode, - rangeTableList, QTW_EXAMINE_RTES); - } else { - walkIsComplete = expression_tree_walker(node, ExtractRTRelationFromNode, - rangeTableList); + walkIsComplete = ExtractRangeTableRelationWalker(node, rangeTableRelationList); } return walkIsComplete; diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index a70527b70..53eeba729 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -127,8 +127,6 @@ static Job * BuildJobTreeTaskList(Job *jobTree, static List * SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionContext); static void ErrorIfUnsupportedShardDistribution(Query *query); -static void ErrorIfUnsupportedJoinReferenceTable( - PlannerRestrictionContext *plannerRestrictionContext); static bool CoPartitionedTables(Oid firstRelationId, Oid secondRelationId); static bool ShardIntervalsEqual(FmgrInfo *comparisonFunction, ShardInterval *firstInterval, @@ -197,8 +195,6 @@ static StringInfo MergeTableQueryString(uint32 taskIdIndex, List *targetEntryLis static StringInfo IntermediateTableQueryString(uint64 jobId, uint32 taskIdIndex, Query *reduceQuery); static uint32 FinalTargetEntryCount(List *targetEntryList); -static bool ReferenceTableExist(PlannerInfo *plannerInfo, RelOptInfo *relationInfo); -static void ErrorIfSetOpWithReferenceTable(Query *queryTree); /* @@ -2042,12 +2038,6 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte /* error if shards are not co-partitioned */ ErrorIfUnsupportedShardDistribution(subquery); - /* error if unsupported join on reference tables */ - ErrorIfUnsupportedJoinReferenceTable(plannerRestrictionContext); - - /* error if reference table exists as a part of any set operation */ - ErrorIfSetOpWithReferenceTable(subquery); - /* get list of all range tables in subquery tree */ ExtractRangeTableRelationWalker((Node *) subquery, &rangeTableList); @@ -2072,8 +2062,8 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte } /* - * That means all table are reference table and we can assign any reference - * table as an anchor one . + * That means all tables are reference tables and we can pick any any of them + * as an anchor table. */ if (targetCacheEntry == NULL) { @@ -2107,196 +2097,6 @@ SubquerySqlTaskList(Job *job, PlannerRestrictionContext *plannerRestrictionConte } -/* - * ErrorIfUnsupportedJoinReferenceTable errors out if there exists a outer join - * exist between reference table and distributed tables. - */ -static void -ErrorIfUnsupportedJoinReferenceTable(PlannerRestrictionContext *plannerRestrictionContext) -{ - List *joinRestrictionList = - plannerRestrictionContext->joinRestrictionContext->joinRestrictionList; - ListCell *joinRestrictionCell = NULL; - - foreach(joinRestrictionCell, joinRestrictionList) - { - JoinRestriction *joinRestriction = (JoinRestriction *) lfirst( - joinRestrictionCell); - JoinType joinType = joinRestriction->joinType; - PlannerInfo *plannerInfo = joinRestriction->plannerInfo; - RelOptInfo *innerrel = joinRestriction->innerrel; - RelOptInfo *outerrel = joinRestriction->outerrel; - - switch (joinType) - { - case JOIN_SEMI: - { - if (ReferenceTableExist(plannerInfo, outerrel)) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("can not plan query having reference table on" - " the left part of semi join"))); - } - } - break; - - case JOIN_ANTI: - { - if (ReferenceTableExist(plannerInfo, innerrel)) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("can not plan query having reference table on" - " the left part of anti join"))); - } - } - break; - - case JOIN_LEFT: - { - if (ReferenceTableExist(plannerInfo, outerrel)) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("can not plan query having reference table on" - " the left part of left join"))); - } - } - break; - - case JOIN_RIGHT: - { - if (ReferenceTableExist(plannerInfo, innerrel)) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("can not plan query having reference table on" - " the right part of right join"))); - } - } - break; - - case JOIN_FULL: - { - if (ReferenceTableExist(plannerInfo, innerrel) || ReferenceTableExist( - plannerInfo, outerrel)) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg( - "can not plan query having reference table as a" - " part of full join"))); - } - } - break; - - default: - { } - break; - } - } -} - - -/* - * ReferenceTableExist check whether the relationInfo has reference table. - * Since relation ids of relationInfo indexes to the range table entry list of - * query, query is also passed. - */ -static bool -ReferenceTableExist(PlannerInfo *plannerInfo, RelOptInfo *relationInfo) -{ - Relids relids = bms_copy(relationInfo->relids); - int relationId = -1; - - while ((relationId = bms_first_member(relids)) >= 0) - { - RangeTblEntry *rangeTableEntry = plannerInfo->simple_rte_array[relationId]; - - /* relationInfo has this range table entry */ - if (RTEContainsReferenceTable(rangeTableEntry)) - { - return true; - } - } - - return false; -} - - -/* - * RTEContainsReferenceTable checks whether there exist a reference table in the - * given range table entry. - */ -bool -RTEContainsReferenceTable(RangeTblEntry *rangeTableEntry) -{ - List *relationList = NIL; - ListCell *relationCell = NULL; - ExtractRTRelationFromNode((Node *) rangeTableEntry, &relationList); - - foreach(relationCell, relationList) - { - RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(relationCell); - Oid relationId = rangeTableEntry->relid; - - if (IsDistributedTable(relationId) && PartitionMethod(relationId) == - DISTRIBUTE_BY_NONE) - { - return true; - } - } - - return false; -} - - -/* - * ErrorIfSetOpWithReferenceTable checks whether there exist a reference table - * as a part of any set operation. - */ -static void -ErrorIfSetOpWithReferenceTable(Query *queryTree) -{ - List *joinTreeTableIndexList = NIL; - Index subqueryRteIndex = 0; - RangeTblEntry *rangeTableEntry = NULL; - Query *subqueryTree = NULL; - List *rangeTableList = queryTree->rtable; - Node *setOperations = queryTree->setOperations; - ExtractRangeTableIndexWalker((Node *) queryTree->jointree, &joinTreeTableIndexList); - - if (setOperations != NULL) - { - List *rangeTableList = NIL; - ListCell *rangeTableCell = NULL; - ExtractRangeTableRelationWalker((Node *) queryTree, &rangeTableList); - - foreach(rangeTableCell, rangeTableList) - { - RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell); - Oid relationId = rangeTableEntry->relid; - if (PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) - { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg( - "can not plan query having reference table with union"))); - } - } - } - - if (list_length(joinTreeTableIndexList) < 1) - { - return; - } - - subqueryRteIndex = linitial_int(joinTreeTableIndexList); - rangeTableEntry = rt_fetch(subqueryRteIndex, rangeTableList); - subqueryTree = rangeTableEntry->subquery; - - if (subqueryTree != NULL) - { - return ErrorIfSetOpWithReferenceTable(subqueryTree); - } -} - - /* * ErrorIfUnsupportedShardDistribution gets list of relations in the given query * and checks if two conditions below hold for them, otherwise it errors out. @@ -2311,6 +2111,7 @@ ErrorIfUnsupportedShardDistribution(Query *query) { Oid firstTableRelationId = InvalidOid; List *relationIdList = RelationIdList(query); + List *distributedRelationIdList = NIL; ListCell *relationIdCell = NULL; uint32 relationIndex = 0; uint32 rangeDistributedRelationCount = 0; @@ -2320,19 +2121,31 @@ ErrorIfUnsupportedShardDistribution(Query *query) { Oid relationId = lfirst_oid(relationIdCell); char partitionMethod = PartitionMethod(relationId); + if (partitionMethod == DISTRIBUTE_BY_RANGE) { rangeDistributedRelationCount++; + distributedRelationIdList = lappend_oid(distributedRelationIdList, + relationId); } else if (partitionMethod == DISTRIBUTE_BY_HASH) { hashDistributedRelationCount++; + distributedRelationIdList = lappend_oid(distributedRelationIdList, + relationId); } - else + else if (partitionMethod == DISTRIBUTE_BY_NONE) { /* do not need to handle reference tables */ continue; } + else + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot pushdown this subquery"), + errdetail("Currently append partitioned relations " + "are not supported"))); + } } if ((rangeDistributedRelationCount > 0) && (hashDistributedRelationCount > 0)) @@ -2343,18 +2156,11 @@ ErrorIfUnsupportedShardDistribution(Query *query) "partitioned relations are unsupported"))); } - foreach(relationIdCell, relationIdList) + foreach(relationIdCell, distributedRelationIdList) { Oid relationId = lfirst_oid(relationIdCell); bool coPartitionedTables = false; Oid currentRelationId = relationId; - char partitionMethod = PartitionMethod(relationId); - - /* do not need to check reference tables */ - if (partitionMethod == DISTRIBUTE_BY_NONE) - { - continue; - } /* get shard list of first relation and continue for the next relation */ if (relationIndex == 0) diff --git a/src/backend/distributed/planner/multi_router_planner.c b/src/backend/distributed/planner/multi_router_planner.c index 7734ae32c..284496427 100644 --- a/src/backend/distributed/planner/multi_router_planner.c +++ b/src/backend/distributed/planner/multi_router_planner.c @@ -275,7 +275,7 @@ CreateSingleTaskRouterPlan(Query *originalQuery, Query *query, * The function returns hashed columns generated by MakeInt4Column() for the hash * partitioned tables in place of partition columns. * - * The function errors out if the given shard interval does not belong to a hash, + * The function returns NIL if shard interval does not belong to a hash, * range and append distributed tables. * * NB: If you update this, also look at PrunableExpressionsWalker(). diff --git a/src/backend/distributed/planner/relation_restriction_equivalence.c b/src/backend/distributed/planner/relation_restriction_equivalence.c index 657e8751e..1fa38f8d6 100644 --- a/src/backend/distributed/planner/relation_restriction_equivalence.c +++ b/src/backend/distributed/planner/relation_restriction_equivalence.c @@ -153,6 +153,7 @@ SafeToPushdownUnionSubquery(RelationRestrictionContext *restrictionContext) foreach(relationRestrictionCell, restrictionContext->relationRestrictionList) { RelationRestriction *relationRestriction = lfirst(relationRestrictionCell); + Oid relationId = relationRestriction->relationId; Index partitionKeyIndex = InvalidAttrNumber; PlannerInfo *relationPlannerRoot = relationRestriction->plannerInfo; List *targetList = relationPlannerRoot->parse->targetList; @@ -160,6 +161,20 @@ SafeToPushdownUnionSubquery(RelationRestrictionContext *restrictionContext) Var *varToBeAdded = NULL; TargetEntry *targetEntryToAdd = NULL; + /* + * Although it is not the best place to error out when facing with reference + * tables, we decide to error out here. Otherwise, we need to add equality + * for each reference table and it is more complex to implement. In the + * future implementation all checks will be gathered to single point. + */ + if (PartitionMethod(relationId) == DISTRIBUTE_BY_NONE) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot pushdown this query"), + errdetail( + "Reference tables are not allowed with set operations"))); + } + /* * We first check whether UNION ALLs are pulled up or not. Note that Postgres * planner creates AppendRelInfos per each UNION ALL query that is pulled up. diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index f73deb79b..bc4795d65 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -206,7 +206,7 @@ extern List * TableEntryList(List *rangeTableList); extern List * UsedTableEntryList(Query *query); extern bool ExtractRangeTableRelationWalker(Node *node, List **rangeTableList); extern bool ExtractRangeTableEntryWalker(Node *node, List **rangeTableList); -extern bool ExtractRTRelationFromNode(Node *node, List **rangeTableList); +extern bool ExtractRangeTableRelationWalkerInRTE(Node *node, List **rangeTableList); extern List * pull_var_clause_default(Node *node); extern bool OperatorImplementsEquality(Oid opno); diff --git a/src/include/distributed/multi_physical_planner.h b/src/include/distributed/multi_physical_planner.h index 5f2f08f1d..8f753a4e5 100644 --- a/src/include/distributed/multi_physical_planner.h +++ b/src/include/distributed/multi_physical_planner.h @@ -286,7 +286,7 @@ extern Const * MakeInt4Constant(Datum constantValue); extern int CompareShardPlacements(const void *leftElement, const void *rightElement); extern bool ShardIntervalsOverlap(ShardInterval *firstInterval, ShardInterval *secondInterval); -extern bool RTEContainsReferenceTable(RangeTblEntry *rangeTableEntry); +extern bool HasReferenceTable(Node *node); /* function declarations for Task and Task list operations */ extern bool TasksEqual(const Task *a, const Task *b); diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index d57954b5b..865d44e8e 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -303,7 +303,8 @@ count(*) AS cnt, "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -ERROR: can not plan query having reference table on the left part of left join +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join -- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the -- right side of the RIGHT JOIN. SELECT @@ -342,7 +343,8 @@ count(*) AS cnt, "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; -ERROR: can not plan query having reference table on the left part of left join +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join -- Outer subquery with reference table SELECT "some_users_data".user_id, lastseen FROM @@ -373,7 +375,8 @@ FROM ORDER BY user_id limit 50; -ERROR: can not plan query having reference table as a part of full join +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join -- -- UNIONs and JOINs with reference tables, shoukld error out -- @@ -437,7 +440,8 @@ GROUP BY types ORDER BY types; -ERROR: can not plan query having reference table with union +ERROR: cannot push down this subquery +DETAIL: Reference tables are not supported with union operator -- reference table exist in the subquery of union, should error out SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM @@ -509,7 +513,8 @@ GROUP BY types ORDER BY types; -ERROR: can not plan query having reference table with union +ERROR: cannot push down this subquery +DETAIL: Reference tables are not supported with union operator -- -- Should error out with UNION ALL Queries on reference tables -- @@ -563,4 +568,5 @@ INNER JOIN WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query GROUP BY types ORDER BY types; -ERROR: can not plan query having reference table with union +ERROR: cannot push down this subquery +DETAIL: Reference tables are not supported with union operator diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out index 0159510b2..d45ec005f 100644 --- a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -139,6 +139,7 @@ SELECT user_id, value_2 FROM users_table WHERE HAVING sum(submit_card_info) > 0 ) ORDER BY 1, 2; -ERROR: can not plan query having reference table on the left part of left join +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join DROP TABLE events_reference_table; DROP TABLE users_reference_table; From 7550b8ad52f2ecf1e051931518fed272eba2e3b3 Mon Sep 17 00:00:00 2001 From: velioglu Date: Wed, 9 Aug 2017 11:26:19 +0300 Subject: [PATCH 5/6] Fix anchor shard id selection when reference table exists --- src/backend/distributed/planner/multi_physical_planner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 53eeba729..851597393 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -2392,7 +2392,7 @@ SubqueryTaskCreate(Query *originalQuery, ShardInterval *shardInterval, subqueryTask = CreateBasicTask(jobId, taskId, SQL_TASK, queryString->data); subqueryTask->dependedTaskList = NULL; - subqueryTask->anchorShardId = selectAnchorShardId; + subqueryTask->anchorShardId = shardInterval->shardId; subqueryTask->taskPlacementList = selectPlacementList; subqueryTask->upsertQuery = false; subqueryTask->relationShardList = relationShardList; From b0efffae1cb63683a7e9ede5b068122cbc7b9808 Mon Sep 17 00:00:00 2001 From: velioglu Date: Wed, 9 Aug 2017 17:57:57 +0300 Subject: [PATCH 6/6] Correct planner and add more tests --- .../planner/multi_logical_planner.c | 99 +++++----------- .../planner/multi_physical_planner.c | 15 ++- .../relation_restriction_equivalence.c | 3 +- .../distributed/multi_logical_planner.h | 3 +- src/test/regress/expected/multi_explain.out | 61 +++++----- src/test/regress/expected/multi_explain_0.out | 61 +++++----- .../expected/multi_reference_table.out | 3 +- ...ulti_subquery_complex_reference_clause.out | 111 ++++++++++++++++-- ...lti_subquery_in_where_reference_clause.out | 85 +++++++++++++- .../regress/expected/multi_subquery_union.out | 95 +++++++++++++++ ...i_behavioral_analytics_create_table.source | 8 ++ ...i_behavioral_analytics_create_table.source | 16 +++ .../regress/sql/multi_reference_table.sql | 3 +- ...ulti_subquery_complex_reference_clause.sql | 81 ++++++++++++- ...lti_subquery_in_where_reference_clause.sql | 75 +++++++++++- src/test/regress/sql/multi_subquery_union.sql | 91 ++++++++++++++ 16 files changed, 649 insertions(+), 161 deletions(-) diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index 16fb25336..f1da91072 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -546,7 +546,8 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery, { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot pushdown the subquery", - "There exist a reference table in the outer part of the outer join", + "There exist a reference table in the outer part of the " + "outer join", NULL); } @@ -1426,18 +1427,19 @@ MultiPlanTree(Query *queryTree) /* * HasUnsupportedReferenceTableJoin returns true if there exists a outer join - * exist between reference table and distributed tables which does not obey the - * rules : - * - Reference tables can not be located in the outer part of the semi join (or - * the inner part of the anti join). Otherwise, we may have duplicate results. - * Although getting duplicate results is not possible by checking the equality - * on the column of the reference table and partition column of distributed table, - * we still keep these checks. Because, using the reference table in the outer - * part of the semi join is not very common. - * - Reference tables can not be located in the outer part of the left join and - * inner part of the right join. Otherwise we will definitely have duplicate rows. - * Beside, reference tables can not be used with full outer joins because of the - * same reason. + * between reference table and distributed tables which does not follow + * the rules : + * - Reference tables can not be located in the outer part of the semi join or the + * anti join. Otherwise, we may have duplicate results. Although getting duplicate + * results is not possible by checking the equality on the column of the reference + * table and partition column of distributed table, we still keep these checks. + * Because, using the reference table in the outer part of the semi join or anti + * join is not very common. + * - Reference tables can not be located in the outer part of the left join + * (Note that PostgreSQL converts right joins to left joins. While converting + * join types, innerrel and outerrel are also switched.) Otherwise we will + * definitely have duplicate rows. Beside, reference tables can not be used + * with full outer joins because of the same reason. */ static bool HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionContext) @@ -1455,58 +1457,20 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo RelOptInfo *innerrel = joinRestriction->innerrel; RelOptInfo *outerrel = joinRestriction->outerrel; - switch (joinType) + if (joinType == JOIN_SEMI || joinType == JOIN_ANTI || joinType == JOIN_LEFT) { - case JOIN_SEMI: + if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) { - if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) - { - return true; - } + return true; } - break; - - case JOIN_ANTI: + } + else if (joinType == JOIN_FULL) + { + if (RelationInfoHasReferenceTable(plannerInfo, innerrel) || + RelationInfoHasReferenceTable(plannerInfo, outerrel)) { - if (RelationInfoHasReferenceTable(plannerInfo, innerrel)) - { - return true; - } + return true; } - break; - - case JOIN_LEFT: - { - if (RelationInfoHasReferenceTable(plannerInfo, outerrel)) - { - return true; - } - } - break; - - case JOIN_RIGHT: - { - if (RelationInfoHasReferenceTable(plannerInfo, innerrel)) - { - return true; - } - } - break; - - case JOIN_FULL: - { - if (RelationInfoHasReferenceTable(plannerInfo, innerrel) || - RelationInfoHasReferenceTable( - plannerInfo, outerrel)) - { - return true; - } - } - break; - - default: - { } - break; } } @@ -1515,7 +1479,7 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo /* - * ReferenceTableExist check whether the relationInfo has reference table. + * RelationInfoHasReferenceTable check whether the relationInfo has reference table. * Since relation ids of relationInfo indexes to the range table entry list of * planner info, planner info is also passed. */ @@ -1549,7 +1513,7 @@ HasReferenceTable(Node *node) { List *relationList = NIL; ListCell *relationCell = NULL; - ExtractRangeTableRelationWalkerInRTE(node, &relationList); + ExtractRangeTableRelationWalkerWithRTEExpand(node, &relationList); foreach(relationCell, relationList) { @@ -2933,13 +2897,13 @@ ExtractRangeTableRelationWalker(Node *node, List **rangeTableRelationList) /* - * ExtractRangeTableRelationWalkerInRTE obtains the list of relations from the - * given node. Note that the difference between this function and + * ExtractRangeTableRelationWalkerWithRTEExpand obtains the list of relations + * from the given node. Note that the difference between this function and * ExtractRangeTableRelationWalker is that this one recursively * walk into range table entries if it can. */ bool -ExtractRangeTableRelationWalkerInRTE(Node *node, List **rangeTableRelationList) +ExtractRangeTableRelationWalkerWithRTEExpand(Node *node, List **rangeTableRelationList) { bool walkIsComplete = false; @@ -2950,8 +2914,7 @@ ExtractRangeTableRelationWalkerInRTE(Node *node, List **rangeTableRelationList) else if (IsA(node, RangeTblEntry)) { RangeTblEntry *rangeTableEntry = (RangeTblEntry *) node; - List *rangeTableList = NIL; - rangeTableList = lappend(rangeTableList, rangeTableEntry); + List *rangeTableList = list_make1(rangeTableEntry); if (rangeTableEntry->rtekind == RTE_RELATION) { @@ -2960,7 +2923,7 @@ ExtractRangeTableRelationWalkerInRTE(Node *node, List **rangeTableRelationList) else { walkIsComplete = range_table_walker(rangeTableList, - ExtractRangeTableRelationWalkerInRTE, + ExtractRangeTableRelationWalkerWithRTEExpand, rangeTableRelationList, 0); } } diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 851597393..9c0d1341f 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -2111,7 +2111,7 @@ ErrorIfUnsupportedShardDistribution(Query *query) { Oid firstTableRelationId = InvalidOid; List *relationIdList = RelationIdList(query); - List *distributedRelationIdList = NIL; + List *nonReferenceRelations = NIL; ListCell *relationIdCell = NULL; uint32 relationIndex = 0; uint32 rangeDistributedRelationCount = 0; @@ -2121,18 +2121,17 @@ ErrorIfUnsupportedShardDistribution(Query *query) { Oid relationId = lfirst_oid(relationIdCell); char partitionMethod = PartitionMethod(relationId); - if (partitionMethod == DISTRIBUTE_BY_RANGE) { rangeDistributedRelationCount++; - distributedRelationIdList = lappend_oid(distributedRelationIdList, - relationId); + nonReferenceRelations = lappend_oid(nonReferenceRelations, + relationId); } else if (partitionMethod == DISTRIBUTE_BY_HASH) { hashDistributedRelationCount++; - distributedRelationIdList = lappend_oid(distributedRelationIdList, - relationId); + nonReferenceRelations = lappend_oid(nonReferenceRelations, + relationId); } else if (partitionMethod == DISTRIBUTE_BY_NONE) { @@ -2142,7 +2141,7 @@ ErrorIfUnsupportedShardDistribution(Query *query) else { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot pushdown this subquery"), + errmsg("cannot push down this subquery"), errdetail("Currently append partitioned relations " "are not supported"))); } @@ -2156,7 +2155,7 @@ ErrorIfUnsupportedShardDistribution(Query *query) "partitioned relations are unsupported"))); } - foreach(relationIdCell, distributedRelationIdList) + foreach(relationIdCell, nonReferenceRelations) { Oid relationId = lfirst_oid(relationIdCell); bool coPartitionedTables = false; diff --git a/src/backend/distributed/planner/relation_restriction_equivalence.c b/src/backend/distributed/planner/relation_restriction_equivalence.c index 1fa38f8d6..b8a61a00e 100644 --- a/src/backend/distributed/planner/relation_restriction_equivalence.c +++ b/src/backend/distributed/planner/relation_restriction_equivalence.c @@ -124,7 +124,8 @@ static Index RelationRestrictionPartitionKeyIndex(RelationRestriction * /* * SafeToPushdownUnionSubquery returns true if all the relations are returns - * partition keys in the same ordinal position. + * partition keys in the same ordinal position and there is no reference table + * exists. * * Note that the function expects (and asserts) the input query to be a top * level union query defined by TopLevelUnionQuery(). diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index bc4795d65..6ebebc80f 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -206,7 +206,8 @@ extern List * TableEntryList(List *rangeTableList); extern List * UsedTableEntryList(Query *query); extern bool ExtractRangeTableRelationWalker(Node *node, List **rangeTableList); extern bool ExtractRangeTableEntryWalker(Node *node, List **rangeTableList); -extern bool ExtractRangeTableRelationWalkerInRTE(Node *node, List **rangeTableList); +extern bool ExtractRangeTableRelationWalkerWithRTEExpand(Node *node, + List **rangeTableList); extern List * pull_var_clause_default(Node *node); extern bool OperatorImplementsEquality(Oid opno); diff --git a/src/test/regress/expected/multi_explain.out b/src/test/regress/expected/multi_explain.out index e78b1b1eb..4ce4b795d 100644 --- a/src/test/regress/expected/multi_explain.out +++ b/src/test/regress/expected/multi_explain.out @@ -446,15 +446,16 @@ Aggregate Node: host=localhost port=57637 dbname=regression -> Aggregate -> GroupAggregate - Group Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id) + Group Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id) -> Sort - Sort Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id) - -> Nested Loop - Join Filter: ((NULL::user_composite_type) = events.composite_id) - -> Result - One-Time Filter: false - -> Seq Scan on events_1400027 events - Filter: ((event_type)::text = ANY ('{click,submit,pay}'::text[])) + Sort Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id) + -> Hash Join + Hash Cond: (users.composite_id = events.composite_id) + -> Seq Scan on users_1400029 users + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) + -> Hash + -> Seq Scan on events_1400025 events + Filter: ((event_type)::text = ANY ('{click,submit,pay}'::text[])) -- Union and left join subquery pushdown EXPLAIN (COSTS OFF) SELECT @@ -531,33 +532,34 @@ HashAggregate Sort Key: subquery_top.hasdone -> Subquery Scan on subquery_top -> GroupAggregate - Group Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id), subquery_2.hasdone + Group Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id), subquery_2.hasdone -> Sort - Sort Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id), subquery_2.hasdone + Sort Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id), subquery_2.hasdone -> Hash Left Join - Hash Cond: ((NULL::user_composite_type) = subquery_2.composite_id) - -> Unique - -> Sort - Sort Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id), (NULL::user_composite_type), ('action=>1'::text), events.event_time - -> Append - -> Nested Loop - Join Filter: ((NULL::user_composite_type) = events.composite_id) - -> Result - One-Time Filter: false - -> Seq Scan on events_1400027 events + Hash Cond: (users.composite_id = subquery_2.composite_id) + -> HashAggregate + Group Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id), users.composite_id, ('action=>1'::text), events.event_time + -> Append + -> Hash Join + Hash Cond: (users.composite_id = events.composite_id) + -> Seq Scan on users_1400029 users + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) + -> Hash + -> Seq Scan on events_1400025 events Filter: ((event_type)::text = 'click'::text) - -> Nested Loop - Join Filter: ((NULL::user_composite_type) = events_1.composite_id) - -> Result - One-Time Filter: false - -> Seq Scan on events_1400027 events_1 + -> Hash Join + Hash Cond: (users_1.composite_id = events_1.composite_id) + -> Seq Scan on users_1400029 users_1 + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) + -> Hash + -> Seq Scan on events_1400025 events_1 Filter: ((event_type)::text = 'submit'::text) -> Hash -> Subquery Scan on subquery_2 -> Unique -> Sort Sort Key: ((events_2.composite_id).tenant_id), ((events_2.composite_id).user_id) - -> Seq Scan on events_1400027 events_2 + -> Seq Scan on events_1400025 events_2 Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type) AND ((event_type)::text = 'pay'::text)) -- Union, left join and having subquery pushdown EXPLAIN (COSTS OFF) @@ -703,13 +705,12 @@ Limit -> Limit -> Sort Sort Key: users.lastseen DESC - -> Subquery Scan on users - -> Result - One-Time Filter: false + -> Seq Scan on users_1400029 users + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) -> Limit -> Sort Sort Key: events.event_time DESC - -> Seq Scan on events_1400027 events + -> Seq Scan on events_1400025 events Filter: (composite_id = users.composite_id) -- Test all tasks output SET citus.explain_all_tasks TO on; diff --git a/src/test/regress/expected/multi_explain_0.out b/src/test/regress/expected/multi_explain_0.out index 998b97578..f44ac8d9c 100644 --- a/src/test/regress/expected/multi_explain_0.out +++ b/src/test/regress/expected/multi_explain_0.out @@ -446,15 +446,16 @@ Aggregate Node: host=localhost port=57637 dbname=regression -> Aggregate -> GroupAggregate - Group Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id) + Group Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id) -> Sort - Sort Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id) - -> Nested Loop - Join Filter: ((NULL::user_composite_type) = events.composite_id) - -> Result - One-Time Filter: false - -> Seq Scan on events_1400027 events - Filter: ((event_type)::text = ANY ('{click,submit,pay}'::text[])) + Sort Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id) + -> Hash Join + Hash Cond: (users.composite_id = events.composite_id) + -> Seq Scan on users_1400029 users + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) + -> Hash + -> Seq Scan on events_1400025 events + Filter: ((event_type)::text = ANY ('{click,submit,pay}'::text[])) -- Union and left join subquery pushdown EXPLAIN (COSTS OFF) SELECT @@ -531,33 +532,34 @@ HashAggregate Sort Key: subquery_top.hasdone -> Subquery Scan on subquery_top -> GroupAggregate - Group Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id), subquery_2.hasdone + Group Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id), subquery_2.hasdone -> Sort - Sort Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id), subquery_2.hasdone + Sort Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id), subquery_2.hasdone -> Hash Left Join - Hash Cond: ((NULL::user_composite_type) = subquery_2.composite_id) - -> Unique - -> Sort - Sort Key: (((NULL::user_composite_type)).tenant_id), (((NULL::user_composite_type)).user_id), (NULL::user_composite_type), ('action=>1'::text), events.event_time - -> Append - -> Nested Loop - Join Filter: ((NULL::user_composite_type) = events.composite_id) - -> Result - One-Time Filter: false - -> Seq Scan on events_1400027 events + Hash Cond: (users.composite_id = subquery_2.composite_id) + -> HashAggregate + Group Key: ((users.composite_id).tenant_id), ((users.composite_id).user_id), users.composite_id, ('action=>1'::text), events.event_time + -> Append + -> Hash Join + Hash Cond: (users.composite_id = events.composite_id) + -> Seq Scan on users_1400029 users + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) + -> Hash + -> Seq Scan on events_1400025 events Filter: ((event_type)::text = 'click'::text) - -> Nested Loop - Join Filter: ((NULL::user_composite_type) = events_1.composite_id) - -> Result - One-Time Filter: false - -> Seq Scan on events_1400027 events_1 + -> Hash Join + Hash Cond: (users_1.composite_id = events_1.composite_id) + -> Seq Scan on users_1400029 users_1 + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) + -> Hash + -> Seq Scan on events_1400025 events_1 Filter: ((event_type)::text = 'submit'::text) -> Hash -> Subquery Scan on subquery_2 -> Unique -> Sort Sort Key: ((events_2.composite_id).tenant_id), ((events_2.composite_id).user_id) - -> Seq Scan on events_1400027 events_2 + -> Seq Scan on events_1400025 events_2 Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type) AND ((event_type)::text = 'pay'::text)) -- Union, left join and having subquery pushdown EXPLAIN (COSTS OFF) @@ -703,13 +705,12 @@ Limit -> Limit -> Sort Sort Key: users.lastseen DESC - -> Subquery Scan on users - -> Result - One-Time Filter: false + -> Seq Scan on users_1400029 users + Filter: ((composite_id >= '(1,-9223372036854775808)'::user_composite_type) AND (composite_id <= '(1,9223372036854775807)'::user_composite_type)) -> Limit -> Sort Sort Key: events.event_time DESC - -> Seq Scan on events_1400027 events + -> Seq Scan on events_1400025 events Filter: (composite_id = users.composite_id) -- Test all tasks output SET citus.explain_all_tasks TO on; diff --git a/src/test/regress/expected/multi_reference_table.out b/src/test/regress/expected/multi_reference_table.out index fcd5f7c2c..6e6b45c59 100644 --- a/src/test/regress/expected/multi_reference_table.out +++ b/src/test/regress/expected/multi_reference_table.out @@ -1122,8 +1122,7 @@ WHERE colocated_table_test.value_1 = reference_table_test.value_1; DEBUG: only reference tables may be queried when targeting a reference table with distributed INSERT ... SELECT DEBUG: Collecting INSERT ... SELECT results on coordinator --- now, insert into the hash partitioned table and use reference --- tables in the SELECT queries +-- not pushable due to lack of equality between partition column and column of reference table INSERT INTO colocated_table_test (value_1, value_2) SELECT diff --git a/src/test/regress/expected/multi_subquery_complex_reference_clause.out b/src/test/regress/expected/multi_subquery_complex_reference_clause.out index 865d44e8e..ec0931d12 100644 --- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out @@ -8,22 +8,112 @@ ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; SET citus.enable_router_execution TO FALSE; -CREATE TABLE events_reference_table as SELECT * FROM events_table; -CREATE TABLE users_reference_table as SELECT * FROM users_table; -SELECT create_reference_table('events_reference_table'); -NOTICE: Copying data from local table... +CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); +SELECT create_distributed_table('user_buy_test_table', 'user_id'); + create_distributed_table +-------------------------- + +(1 row) + +INSERT INTO user_buy_test_table VALUES(1,2,1); +INSERT INTO user_buy_test_table VALUES(2,3,4); +INSERT INTO user_buy_test_table VALUES(3,4,2); +INSERT INTO user_buy_test_table VALUES(7,5,2); +CREATE TABLE users_return_test_table(user_id int, item_id int, buy_count int); +SELECT create_distributed_table('users_return_test_table', 'user_id'); + create_distributed_table +-------------------------- + +(1 row) + +INSERT INTO users_return_test_table VALUES(4,1,1); +INSERT INTO users_return_test_table VALUES(1,3,1); +INSERT INTO users_return_test_table VALUES(3,2,2); +CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('users_ref_test_table'); create_reference_table ------------------------ (1 row) -SELECT create_reference_table('users_reference_table'); -NOTICE: Copying data from local table... - create_reference_table ------------------------- - +INSERT INTO users_ref_test_table VALUES(1,'User_1',45); +INSERT INTO users_ref_test_table VALUES(2,'User_2',46); +INSERT INTO users_ref_test_table VALUES(3,'User_3',47); +INSERT INTO users_ref_test_table VALUES(4,'User_4',48); +INSERT INTO users_ref_test_table VALUES(5,'User_5',49); +INSERT INTO users_ref_test_table VALUES(6,'User_6',50); +-- Simple Join test with reference table +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; + count +------- + 3 (1 row) +-- Should work, reference table at the inner side is allowed +SELECT count(*) FROM + (SELECT random(), k_no FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1 WHERE k_no = 47; + count +------- + 1 +(1 row) + +-- Should not work, no equality between partition column and reference table +SELECT * FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- Should not work, no equality between partition column and reference table +SELECT * FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. +-- Shouldn't work, reference table at the outer side is not allowed +SELECT * FROM + (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table + ON users_ref_test_table.id = user_buy_test_table.user_id) subquery_1; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- Should work, reference table at the inner side is allowed +SELECT count(*) FROM + (SELECT random() FROM users_ref_test_table RIGHT JOIN user_buy_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; + count +------- + 4 +(1 row) + +-- Shouldn't work, reference table at the outer side is not allowed +SELECT * FROM + (SELECT random() FROM user_buy_test_table RIGHT JOIN users_ref_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- Should pass since reference table locates in the inner part of each left join +SELECT count(*) FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + LEFT JOIN + (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 LEFT JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + count +------- + 2 +(1 row) + + -- Should not pass since reference table locates in the outer part of right join +SELECT * FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; SELECT user_id, lastseen @@ -570,3 +660,6 @@ GROUP BY types ORDER BY types; ERROR: cannot push down this subquery DETAIL: Reference tables are not supported with union operator +DROP TABLE user_buy_test_table; +DROP TABLE users_ref_test_table; +DROP TABLE users_return_test_table; diff --git a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out index d45ec005f..a0e2bf7f0 100644 --- a/src/test/regress/expected/multi_subquery_in_where_reference_clause.out +++ b/src/test/regress/expected/multi_subquery_in_where_reference_clause.out @@ -28,6 +28,89 @@ LIMIT 5; 63 (4 rows) +-- subqueries in WHERE with IN operator +SELECT + user_id +FROM + users_table +WHERE + value_2 IN + (SELECT + value_2 + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id + ) +GROUP BY user_id +ORDER BY user_id +LIMIT 3; + user_id +--------- + 0 + 1 + 2 +(3 rows) + +-- subqueries in WHERE with NOT EXISTS operator, should work since +-- reference table in the inner part of the join +SELECT + user_id +FROM + users_table +WHERE + NOT EXISTS + (SELECT + value_2 + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id + ) +GROUP BY user_id +ORDER BY user_id +LIMIT 3; + user_id +--------- +(0 rows) + +-- subqueries in WHERE with NOT EXISTS operator, should not work +-- there is a reference table in the outer part of the join +SELECT + user_id +FROM + users_reference_table +WHERE + NOT EXISTS + (SELECT + value_2 + FROM + events_table + WHERE + users_reference_table.user_id = events_table.user_id + ) +LIMIT 3; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- subqueries in WHERE with IN operator without equality +SELECT + user_id +FROM + users_table +WHERE + value_2 IN + (SELECT + value_2 + FROM + events_reference_table + WHERE + users_table.user_id > events_reference_table.user_id + ) +GROUP BY user_id +ORDER BY user_id +LIMIT 3; +ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys +DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator. -- have reference table without any equality, should error out SELECT user_id @@ -141,5 +224,3 @@ SELECT user_id, value_2 FROM users_table WHERE ORDER BY 1, 2; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -DROP TABLE events_reference_table; -DROP TABLE users_reference_table; diff --git a/src/test/regress/expected/multi_subquery_union.out b/src/test/regress/expected/multi_subquery_union.out index 0a858baf7..3a5e00400 100644 --- a/src/test/regress/expected/multi_subquery_union.out +++ b/src/test/regress/expected/multi_subquery_union.out @@ -23,6 +23,17 @@ LIMIT 5; 20 | 9 (5 rows) +-- a very simple union query with reference table +SELECT user_id, counter +FROM ( + SELECT user_id, value_2 % 10 AS counter FROM events_table WHERE event_type IN (1, 2, 3, 4, 5) + UNION + SELECT user_id, value_2 % 10 AS counter FROM events_reference_table WHERE event_type IN (5, 6, 7, 8, 9, 10) +) user_id +ORDER BY 2 DESC,1 +LIMIT 5; +ERROR: cannot pushdown this query +DETAIL: Reference tables are not allowed with set operations -- the same query with union all SELECT user_id, counter FROM ( @@ -41,6 +52,17 @@ LIMIT 5; 15 | 9 (5 rows) +-- the same query with union all and reference table +SELECT user_id, counter +FROM ( + SELECT user_id, value_2 % 10 AS counter FROM events_table WHERE event_type IN (1, 2, 3, 4, 5) + UNION ALL + SELECT user_id, value_2 % 10 AS counter FROM events_reference_table WHERE event_type IN (5, 6, 7, 8, 9, 10) +) user_id +ORDER BY 2 DESC,1 +LIMIT 5; +ERROR: cannot pushdown this query +DETAIL: Reference tables are not allowed with set operations -- the same query with group by SELECT user_id, sum(counter) FROM ( @@ -162,6 +184,22 @@ GROUP BY user_id ORDER BY 1 DESC LIMIT 5; 23508 (5 rows) +-- similar query this time more subqueries with reference table and target list contains a resjunk entry +SELECT sum(counter) +FROM ( + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 20 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 40 and value_1 < 60 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_reference_table where value_1 < 60 and value_1 < 80 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 80 and value_1 < 100 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 100 and value_1 < 120 GROUP BY user_id HAVING sum(value_2) > 500 +) user_id +GROUP BY user_id ORDER BY 1 DESC LIMIT 5; +ERROR: cannot pushdown this query +DETAIL: Reference tables are not allowed with set operations -- similar query as above, with UNION ALL SELECT sum(counter) FROM ( @@ -236,6 +274,50 @@ LIMIT 5; 90 | 115843 (5 rows) +-- unions within unions with reference table +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) AS counter + FROM + users_table + GROUP BY + user_id + UNION + SELECT + user_id, sum(value_2) AS counter + FROM + events_reference_table + GROUP BY + user_id) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) AS counter + FROM + users_table + GROUP BY + user_id + UNION + SELECT + user_id, sum(value_2) AS counter + FROM + events_table + GROUP BY + user_id) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; +ERROR: cannot pushdown this query +DETAIL: Reference tables are not allowed with set operations -- top level unions are wrapped into top level aggregations SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM @@ -493,6 +575,17 @@ FROM 20002 (1 row) +-- some UNION ALL queries that are going to be pulled up with reference table +SELECT + count(*) +FROM +( + (SELECT user_id FROM users_table) + UNION ALL + (SELECT user_id FROM events_reference_table) +) b; +ERROR: cannot pushdown this query +DETAIL: Reference tables are not allowed with set operations -- similar query without top level agg SELECT user_id @@ -899,3 +992,5 @@ ORDER BY types; ERROR: cannot push down this subquery DETAIL: Subqueries without relations are unsupported SET citus.enable_router_execution TO true; +DROP TABLE events_reference_table; +DROP TABLE users_reference_table; diff --git a/src/test/regress/input/multi_behavioral_analytics_create_table.source b/src/test/regress/input/multi_behavioral_analytics_create_table.source index fcb1ade20..c83673a2d 100644 --- a/src/test/regress/input/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/input/multi_behavioral_analytics_create_table.source @@ -353,3 +353,11 @@ SET citus.shard_max_size TO "1MB"; \copy orders_subquery FROM '@abs_srcdir@/data/orders.1.data' with delimiter '|' \copy orders_subquery FROM '@abs_srcdir@/data/orders.2.data' with delimiter '|' + +CREATE TABLE events_reference_table (like events_table including all); +SELECT create_reference_table('events_reference_table'); +INSERT INTO events_reference_table SELECT * FROM events_table; + +CREATE TABLE users_reference_table (like users_table including all); +SELECT create_reference_table('users_reference_table'); +INSERT INTO users_reference_table SELECT * FROM users_table; diff --git a/src/test/regress/output/multi_behavioral_analytics_create_table.source b/src/test/regress/output/multi_behavioral_analytics_create_table.source index 093bc1d15..81b996c0d 100644 --- a/src/test/regress/output/multi_behavioral_analytics_create_table.source +++ b/src/test/regress/output/multi_behavioral_analytics_create_table.source @@ -421,3 +421,19 @@ SET citus.shard_max_size TO "1MB"; \copy lineitem_subquery FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|' \copy orders_subquery FROM '@abs_srcdir@/data/orders.1.data' with delimiter '|' \copy orders_subquery FROM '@abs_srcdir@/data/orders.2.data' with delimiter '|' +CREATE TABLE events_reference_table (like events_table including all); +SELECT create_reference_table('events_reference_table'); + create_reference_table +------------------------ + +(1 row) + +INSERT INTO events_reference_table SELECT * FROM events_table; +CREATE TABLE users_reference_table (like users_table including all); +SELECT create_reference_table('users_reference_table'); + create_reference_table +------------------------ + +(1 row) + +INSERT INTO users_reference_table SELECT * FROM users_table; diff --git a/src/test/regress/sql/multi_reference_table.sql b/src/test/regress/sql/multi_reference_table.sql index 336b38bee..8c498ba42 100644 --- a/src/test/regress/sql/multi_reference_table.sql +++ b/src/test/regress/sql/multi_reference_table.sql @@ -707,8 +707,7 @@ FROM WHERE colocated_table_test.value_1 = reference_table_test.value_1; --- now, insert into the hash partitioned table and use reference --- tables in the SELECT queries +-- not pushable due to lack of equality between partition column and column of reference table INSERT INTO colocated_table_test (value_1, value_2) SELECT diff --git a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql index 103a3dbbe..fc4ffef7b 100644 --- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql @@ -10,11 +10,78 @@ ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 1400000; SET citus.enable_router_execution TO FALSE; -CREATE TABLE events_reference_table as SELECT * FROM events_table; -CREATE TABLE users_reference_table as SELECT * FROM users_table; +CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); +SELECT create_distributed_table('user_buy_test_table', 'user_id'); +INSERT INTO user_buy_test_table VALUES(1,2,1); +INSERT INTO user_buy_test_table VALUES(2,3,4); +INSERT INTO user_buy_test_table VALUES(3,4,2); +INSERT INTO user_buy_test_table VALUES(7,5,2); -SELECT create_reference_table('events_reference_table'); -SELECT create_reference_table('users_reference_table'); +CREATE TABLE users_return_test_table(user_id int, item_id int, buy_count int); +SELECT create_distributed_table('users_return_test_table', 'user_id'); +INSERT INTO users_return_test_table VALUES(4,1,1); +INSERT INTO users_return_test_table VALUES(1,3,1); +INSERT INTO users_return_test_table VALUES(3,2,2); + +CREATE TABLE users_ref_test_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('users_ref_test_table'); +INSERT INTO users_ref_test_table VALUES(1,'User_1',45); +INSERT INTO users_ref_test_table VALUES(2,'User_2',46); +INSERT INTO users_ref_test_table VALUES(3,'User_3',47); +INSERT INTO users_ref_test_table VALUES(4,'User_4',48); +INSERT INTO users_ref_test_table VALUES(5,'User_5',49); +INSERT INTO users_ref_test_table VALUES(6,'User_6',50); + +-- Simple Join test with reference table +SELECT count(*) FROM + (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; + +-- Should work, reference table at the inner side is allowed +SELECT count(*) FROM + (SELECT random(), k_no FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1 WHERE k_no = 47; + +-- Should not work, no equality between partition column and reference table +SELECT * FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; + +-- Should not work, no equality between partition column and reference table +SELECT * FROM + (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table + ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1; + +-- Shouldn't work, reference table at the outer side is not allowed +SELECT * FROM + (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table + ON users_ref_test_table.id = user_buy_test_table.user_id) subquery_1; + +-- Should work, reference table at the inner side is allowed +SELECT count(*) FROM + (SELECT random() FROM users_ref_test_table RIGHT JOIN user_buy_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; + +-- Shouldn't work, reference table at the outer side is not allowed +SELECT * FROM + (SELECT random() FROM user_buy_test_table RIGHT JOIN users_ref_test_table + ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; + +-- Should pass since reference table locates in the inner part of each left join +SELECT count(*) FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + LEFT JOIN + (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 LEFT JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; + + -- Should not pass since reference table locates in the outer part of right join +SELECT * FROM + (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 + ON tt1.user_id = tt2.user_id) subquery_1 + RIGHT JOIN + (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref + ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; @@ -500,4 +567,8 @@ INNER JOIN FROM users_table as "users" WHERE value_1 > 50 and value_1 < 70) AS t ON (t.user_id = q.user_id)) as final_query GROUP BY types -ORDER BY types; \ No newline at end of file +ORDER BY types; + +DROP TABLE user_buy_test_table; +DROP TABLE users_ref_test_table; +DROP TABLE users_return_test_table; diff --git a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql index 0126f362d..54266b085 100644 --- a/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql +++ b/src/test/regress/sql/multi_subquery_in_where_reference_clause.sql @@ -22,6 +22,78 @@ HAVING count(*) > 66 ORDER BY user_id LIMIT 5; +-- subqueries in WHERE with IN operator +SELECT + user_id +FROM + users_table +WHERE + value_2 IN + (SELECT + value_2 + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id + ) +GROUP BY user_id +ORDER BY user_id +LIMIT 3; + +-- subqueries in WHERE with NOT EXISTS operator, should work since +-- reference table in the inner part of the join +SELECT + user_id +FROM + users_table +WHERE + NOT EXISTS + (SELECT + value_2 + FROM + events_reference_table + WHERE + users_table.user_id = events_reference_table.user_id + ) +GROUP BY user_id +ORDER BY user_id +LIMIT 3; + +-- subqueries in WHERE with NOT EXISTS operator, should not work +-- there is a reference table in the outer part of the join +SELECT + user_id +FROM + users_reference_table +WHERE + NOT EXISTS + (SELECT + value_2 + FROM + events_table + WHERE + users_reference_table.user_id = events_table.user_id + ) +LIMIT 3; + +-- subqueries in WHERE with IN operator without equality +SELECT + user_id +FROM + users_table +WHERE + value_2 IN + (SELECT + value_2 + FROM + events_reference_table + WHERE + users_table.user_id > events_reference_table.user_id + ) +GROUP BY user_id +ORDER BY user_id +LIMIT 3; + -- have reference table without any equality, should error out SELECT user_id @@ -125,6 +197,3 @@ SELECT user_id, value_2 FROM users_table WHERE HAVING sum(submit_card_info) > 0 ) ORDER BY 1, 2; - -DROP TABLE events_reference_table; -DROP TABLE users_reference_table; \ No newline at end of file diff --git a/src/test/regress/sql/multi_subquery_union.sql b/src/test/regress/sql/multi_subquery_union.sql index ef960e49e..f48d13603 100644 --- a/src/test/regress/sql/multi_subquery_union.sql +++ b/src/test/regress/sql/multi_subquery_union.sql @@ -17,6 +17,16 @@ FROM ( ORDER BY 2 DESC,1 LIMIT 5; +-- a very simple union query with reference table +SELECT user_id, counter +FROM ( + SELECT user_id, value_2 % 10 AS counter FROM events_table WHERE event_type IN (1, 2, 3, 4, 5) + UNION + SELECT user_id, value_2 % 10 AS counter FROM events_reference_table WHERE event_type IN (5, 6, 7, 8, 9, 10) +) user_id +ORDER BY 2 DESC,1 +LIMIT 5; + -- the same query with union all SELECT user_id, counter FROM ( @@ -27,6 +37,16 @@ FROM ( ORDER BY 2 DESC,1 LIMIT 5; +-- the same query with union all and reference table +SELECT user_id, counter +FROM ( + SELECT user_id, value_2 % 10 AS counter FROM events_table WHERE event_type IN (1, 2, 3, 4, 5) + UNION ALL + SELECT user_id, value_2 % 10 AS counter FROM events_reference_table WHERE event_type IN (5, 6, 7, 8, 9, 10) +) user_id +ORDER BY 2 DESC,1 +LIMIT 5; + -- the same query with group by SELECT user_id, sum(counter) FROM ( @@ -102,6 +122,21 @@ FROM ( ) user_id GROUP BY user_id ORDER BY 1 DESC LIMIT 5; +-- similar query this time more subqueries with reference table and target list contains a resjunk entry +SELECT sum(counter) +FROM ( + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 20 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 40 and value_1 < 60 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_reference_table where value_1 < 60 and value_1 < 80 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 80 and value_1 < 100 GROUP BY user_id HAVING sum(value_2) > 500 + UNION + SELECT user_id, sum(value_2) AS counter FROM users_table where value_1 < 100 and value_1 < 120 GROUP BY user_id HAVING sum(value_2) > 500 +) user_id +GROUP BY user_id ORDER BY 1 DESC LIMIT 5; + -- similar query as above, with UNION ALL SELECT sum(counter) FROM ( @@ -160,6 +195,49 @@ FROM ( ORDER BY 2 DESC, 1 DESC LIMIT 5; +-- unions within unions with reference table +SELECT * +FROM ( + ( SELECT user_id, + sum(counter) + FROM + (SELECT + user_id, sum(value_2) AS counter + FROM + users_table + GROUP BY + user_id + UNION + SELECT + user_id, sum(value_2) AS counter + FROM + events_reference_table + GROUP BY + user_id) user_id_1 + GROUP BY + user_id) + UNION + (SELECT + user_id, sum(counter) + FROM + (SELECT + user_id, sum(value_2) AS counter + FROM + users_table + GROUP BY + user_id + UNION + SELECT + user_id, sum(value_2) AS counter + FROM + events_table + GROUP BY + user_id) user_id_2 + GROUP BY + user_id)) AS ftop +ORDER BY 2 DESC, 1 DESC +LIMIT 5; + -- top level unions are wrapped into top level aggregations SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM @@ -377,6 +455,16 @@ FROM (SELECT user_id FROM events_table) ) b; +-- some UNION ALL queries that are going to be pulled up with reference table +SELECT + count(*) +FROM +( + (SELECT user_id FROM users_table) + UNION ALL + (SELECT user_id FROM events_reference_table) +) b; + -- similar query without top level agg SELECT user_id @@ -724,3 +812,6 @@ GROUP BY types ORDER BY types; SET citus.enable_router_execution TO true; + +DROP TABLE events_reference_table; +DROP TABLE users_reference_table;