Expand reference table support in subquery pushdown

With this commit, we relax the restrictions put on the reference
tables with subquery pushdown.

We did three notable improvements:

1) Relax equi-join restrictions

 Previously, we always expected that the non-reference tables are
 equi joined with reference tables on the partition key of the
 non-reference table.

 With this commit, we allow any column of non-reference tables
 joined using non-equi joins as well.

2) Relax OUTER JOIN restrictions

 Previously Citus errored out if any reference table exists at
 any point of the outer part of an outer join. For instance,
 See the below sketch where (h) denotes a hash distributed relation,
 (r) denotes a reference table, (L) denotes LEFT JOIN and
 (I) denotes INNER JOIN.

             (L)
             /  \
           (I)     h
          /  \
        r      h

 Before this commit Citus would error out since a reference table
 appears on the left most part of an left join. However, that was
 too restrictive so that we only error out if the reference table
 is directly below and in the outer part of an outer join.

3) Bug fixes

 We've done some minor bugfixes in the existing implementation.
pull/1628/head
Onder Kalaci 2017-08-25 15:05:21 +03:00
parent 18b9be3dfa
commit a5b66912d4
11 changed files with 1551 additions and 244 deletions

View File

@ -2880,6 +2880,17 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query *
return;
}
/*
* We currently don't support finding partition keys in the subqueries
* that references from outer subqueries. For example, in corrolated
* subqueries in WHERE clause, we don't support use of partition keys
* in the subquery that is referred from the outer query.
*/
if (candidateColumn->varlevelsup > 0)
{
return;
}
rangeTableEntryIndex = candidateColumn->varno - 1;
rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex);
@ -3096,7 +3107,14 @@ PartitionColumnOpExpressionList(Query *query)
rangeTableEntryIndex = candidatePartitionColumn->varno - 1;
rangeTableEntry = list_nth(rangetableList, rangeTableEntryIndex);
Assert(rangeTableEntry->rtekind == RTE_RELATION);
/*
* We currently don't support checking for equality when user refers
* to a column from the JOIN instead of the relation.
*/
if (rangeTableEntry->rtekind != RTE_RELATION)
{
continue;
}
relationId = rangeTableEntry->relid;
partitionColumn = DistPartitionKey(relationId);

View File

@ -31,6 +31,7 @@
#include "distributed/worker_protocol.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/relation.h"
#include "optimizer/clauses.h"
#include "optimizer/prep.h"
#include "optimizer/tlist.h"
@ -69,6 +70,8 @@ static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryPushdown(Query *
PlannerRestrictionContext
*
plannerRestrictionContext);
static DeferredErrorMessage * DeferErrorIfUnsupportedSublinkAndReferenceTable(
Query *queryTree);
static DeferredErrorMessage * DeferErrorIfUnsupportedFilters(Query *subquery);
static bool EqualOpExpressionLists(List *firstOpExpressionList,
List *secondOpExpressionList);
@ -87,6 +90,7 @@ static MultiNode * MultiPlanTree(Query *queryTree);
static void ErrorIfQueryNotSupported(Query *queryTree);
static bool HasUnsupportedReferenceTableJoin(
PlannerRestrictionContext *plannerRestrictionContext);
static bool ShouldRecurseForReferenceTableJoinChecks(RelOptInfo *relOptInfo);
static bool HasUnsupportedJoinWalker(Node *node, void *context);
static bool ErrorHintRequired(const char *errorHint, Query *queryTree);
static DeferredErrorMessage * DeferErrorIfUnsupportedSubqueryRepartition(Query *
@ -96,8 +100,8 @@ static bool HasOuterJoin(Query *queryTree);
static bool HasOuterJoinWalker(Node *node, void *maxJoinLevel);
static bool HasComplexJoinOrder(Query *queryTree);
static bool HasComplexRangeTableType(Query *queryTree);
static bool RelationInfoHasReferenceTable(PlannerInfo *plannerInfo,
RelOptInfo *relationInfo);
static bool RelationInfoContainsReferenceTable(PlannerInfo *plannerInfo,
RelOptInfo *relationInfo);
static void ValidateClauseList(List *clauseList);
static void ValidateSubqueryPushdownClauseList(List *clauseList);
static bool ExtractFromExpressionWalker(Node *node,
@ -542,7 +546,16 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery,
"one another relation using distribution keys and "
"equality operator.", NULL);
}
else if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext))
/* we shouldn't allow reference tables in the FROM clause when the query has sublinks */
error = DeferErrorIfUnsupportedSublinkAndReferenceTable(originalQuery);
if (error)
{
return error;
}
/* we shouldn't allow reference tables in the outer part of outer joins */
if (HasUnsupportedReferenceTableJoin(plannerRestrictionContext))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot pushdown the subquery",
@ -582,6 +595,43 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery,
}
/*
* DeferErrorIfUnsupportedSublinkAndReferenceTable returns a deferred error if the
* given query is not suitable for subquery pushdown.
*
* While planning sublinks, we rely on Postgres in the sense that it converts some of
* sublinks into joins.
*
* In some cases, sublinks are pulled up and converted into outer joins. Those cases
* are already handled with HasUnsupportedReferenceTableJoin().
*
* If the sublinks are not pulled up, we should still error out in if any reference table
* appears in the FROM clause of a subquery.
*
* Otherwise, the result would include duplicate rows.
*/
static DeferredErrorMessage *
DeferErrorIfUnsupportedSublinkAndReferenceTable(Query *queryTree)
{
if (!queryTree->hasSubLinks)
{
return NULL;
}
if (HasReferenceTable((Node *) queryTree->rtable))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot pushdown the subquery",
"Reference tables are not allowed in FROM "
"clause when the query has subqueries in "
"WHERE clause",
NULL);
}
return NULL;
}
/*
* DeferErrorIfUnsupportedFilters checks if all leaf queries in the given query have
* same filter on the partition column. Note that if there are queries without
@ -853,6 +903,14 @@ DeferErrorIfCannotPushdownSubquery(Query *subqueryTree, bool outerMostQueryHasLi
}
}
deferredError = DeferErrorIfUnsupportedSublinkAndReferenceTable(subqueryTree);
if (deferredError)
{
preconditionsSatisfied = false;
errorDetail = (char *) deferredError->detail;
}
/* finally check and return deferred if not satisfied */
if (!preconditionsSatisfied)
{
@ -1027,7 +1085,7 @@ DeferErrorIfUnsupportedTableCombination(Query *queryTree)
/*
* TargetListOnPartitionColumn checks if at least one target list entry is on
* partition column or the table is a reference table.
* partition column.
*/
static bool
TargetListOnPartitionColumn(Query *query, List *targetEntryList)
@ -1047,15 +1105,11 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList)
FindReferencedTableColumn(targetExpression, NIL, query, &relationId, &column);
/*
* If the expression belongs to reference table directly returns true.
* We can assume that target list entry always on partition column of
* reference tables.
*/
/* if the expression belongs to reference table directly returns false */
if (IsDistributedTable(relationId) && PartitionMethod(relationId) ==
DISTRIBUTE_BY_NONE)
{
targetListOnPartitionColumn = true;
targetListOnPartitionColumn = false;
break;
}
@ -1459,15 +1513,18 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo
if (joinType == JOIN_SEMI || joinType == JOIN_ANTI || joinType == JOIN_LEFT)
{
if (RelationInfoHasReferenceTable(plannerInfo, outerrel))
if (ShouldRecurseForReferenceTableJoinChecks(outerrel) &&
RelationInfoContainsReferenceTable(plannerInfo, outerrel))
{
return true;
}
}
else if (joinType == JOIN_FULL)
{
if (RelationInfoHasReferenceTable(plannerInfo, innerrel) ||
RelationInfoHasReferenceTable(plannerInfo, outerrel))
if ((ShouldRecurseForReferenceTableJoinChecks(innerrel) &&
RelationInfoContainsReferenceTable(plannerInfo, innerrel)) ||
(ShouldRecurseForReferenceTableJoinChecks(outerrel) &&
RelationInfoContainsReferenceTable(plannerInfo, outerrel)))
{
return true;
}
@ -1479,12 +1536,66 @@ HasUnsupportedReferenceTableJoin(PlannerRestrictionContext *plannerRestrictionCo
/*
* RelationInfoHasReferenceTable check whether the relationInfo has reference table.
* Since relation ids of relationInfo indexes to the range table entry list of
* planner info, planner info is also passed.
* ShouldRecurseForReferenceTableJoinChecks is a helper function for deciding
* on whether the input relOptInfo should be checked for unsupported reference
* tables.
*/
static bool
RelationInfoHasReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo)
ShouldRecurseForReferenceTableJoinChecks(RelOptInfo *relOptInfo)
{
/*
* We shouldn't recursively go down for joins since we're already
* going to process each join seperately. Otherwise we'd restrict
* the coverage. See the below sketch where (h) denotes a hash
* distributed relation, (r) denotes a reference table, (L) denotes
* LEFT JOIN and (I) denotes INNER JOIN. If we're to recurse into
* the inner join, we'd be preventing to push down the following
* join tree, which is actually safe to push down.
*
* (L)
* / \
* (I) h
* / \
* r h
*/
if (relOptInfo->reloptkind == RELOPT_JOINREL)
{
return false;
}
/*
* Note that we treat the same query where relations appear in subqueries
* differently. (i.e., use SELECT * FROM r; instead of r)
*
* In that case, to relax some restrictions, we do the following optimization:
* If the subplan (i.e., plannerInfo corresponding to the subquery) contains any
* joins, we skip reference table checks keeping in mind that the join is already
* going to be processed seperately. This optimization should suffice for many
* use cases.
*/
if (relOptInfo->reloptkind == RELOPT_BASEREL && relOptInfo->subroot != NULL)
{
PlannerInfo *subroot = relOptInfo->subroot;
if (list_length(subroot->join_rel_list) > 0)
{
return false;
}
}
return true;
}
/*
* RelationInfoContainsReferenceTable checks whether the relationInfo
* contains any reference tables. If found, the function returns true.
*
* Note that since relation ids of relationInfo indexes to the range
* table entry list of planner info, planner info is also passed.
*/
static bool
RelationInfoContainsReferenceTable(PlannerInfo *plannerInfo, RelOptInfo *relationInfo)
{
Relids relids = bms_copy(relationInfo->relids);
int relationId = -1;
@ -2927,9 +3038,17 @@ ExtractRangeTableRelationWalkerWithRTEExpand(Node *node, List **rangeTableRelati
rangeTableRelationList, 0);
}
}
else if (IsA(node, Query))
{
walkIsComplete = query_tree_walker((Query *) node,
ExtractRangeTableRelationWalkerWithRTEExpand,
rangeTableRelationList, QTW_EXAMINE_RTES);
}
else
{
walkIsComplete = ExtractRangeTableRelationWalker(node, rangeTableRelationList);
walkIsComplete = expression_tree_walker(node,
ExtractRangeTableRelationWalkerWithRTEExpand,
rangeTableRelationList);
}
return walkIsComplete;

View File

@ -62,6 +62,7 @@ typedef struct AttributeEquivalenceClassMember
} AttributeEquivalenceClassMember;
static uint32 ReferenceRelationCount(RelationRestrictionContext *restrictionContext);
static Var * FindTranslatedVar(List *appendRelList, Oid relationOid,
Index relationRteIndex, Index *partitionKeyIndex);
static bool EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList,
@ -329,9 +330,8 @@ FindTranslatedVar(List *appendRelList, Oid relationOid, Index relationRteIndex,
* joined on their partition keys.
*
* The function returns true if all relations are joined on their partition keys.
* Otherwise, the function returns false. In order to support reference tables
* with subqueries, equality between attributes of reference tables and partition
* key of distributed tables are also considered.
* Otherwise, the function returns false. We ignore reference tables at all since
* they don't have partition keys.
*
* In order to do that, we invented a new equivalence class namely:
* AttributeEquivalenceClass. In very simple words, a AttributeEquivalenceClass is
@ -365,14 +365,24 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext *
List *joinRestrictionAttributeEquivalenceList = NIL;
List *allAttributeEquivalenceList = NIL;
uint32 referenceRelationCount = ReferenceRelationCount(restrictionContext);
uint32 totalRelationCount = list_length(restrictionContext->relationRestrictionList);
uint32 nonReferenceRelationCount = totalRelationCount - referenceRelationCount;
/*
* If the query includes only one relation, we should not check the partition
* column equality. Single table should not need to fetch data from other nodes
* except it's own node(s).
* If the query includes a single relation which is not a reference table,
* we should not check the partition column equality.
* Consider two example cases:
* (i) The query includes only a single colocated relation
* (ii) A colocated relation is joined with a (or multiple) reference
* table(s) where colocated relation is not joined on the partition key
*
* For the above two cases, we don't need to execute the partition column equality
* algorithm. The reason is that the essence of this function is to ensure that the
* tasks that are going to be created should not need data from other tasks. In both
* cases mentioned above, the necessary data per task would be on available.
*/
if (totalRelationCount == 1)
if (nonReferenceRelationCount <= 1)
{
return true;
}
@ -394,6 +404,31 @@ RestrictionEquivalenceForPartitionKeys(PlannerRestrictionContext *
}
/*
* ReferenceRelationCount iterates over the relations and returns the reference table
* relation count.
*/
static uint32
ReferenceRelationCount(RelationRestrictionContext *restrictionContext)
{
ListCell *relationRestrictionCell = NULL;
uint32 referenceRelationCount = 0;
foreach(relationRestrictionCell, restrictionContext->relationRestrictionList)
{
RelationRestriction *relationRestriction =
(RelationRestriction *) lfirst(relationRestrictionCell);
if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE)
{
referenceRelationCount++;
}
}
return referenceRelationCount;
}
/*
* EquivalenceListContainsRelationsEquality gets a list of attributed equivalence
* list and a relation restriction context. The function first generates a common
@ -434,6 +469,12 @@ EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList,
(RelationRestriction *) lfirst(relationRestrictionCell);
int rteIdentity = GetRTEIdentity(relationRestriction->rte);
/* we shouldn't check for the equality of reference tables */
if (PartitionMethod(relationRestriction->relationId) == DISTRIBUTE_BY_NONE)
{
continue;
}
if (!bms_is_member(rteIdentity, commonRteIdentities))
{
return false;
@ -621,7 +662,7 @@ GetVarFromAssignedParam(List *parentPlannerParamList, Param *plannerParam)
/*
* GenerateCommonEquivalence gets a list of unrelated AttributeEquiavalanceClass
* whose all members are partition keys or a column of reference table.
* whose all members are partition keys.
*
* With the equivalence classes, the function follows the algorithm
* outlined below:
@ -1084,8 +1125,14 @@ AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass **
Assert(rangeTableEntry->rtekind == RTE_RELATION);
if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE &&
relationPartitionKey->varattno != varToBeAdded->varattno)
/* we don't need reference tables in the equality on columns */
if (relationPartitionKey == NULL)
{
return;
}
/* we're only interested in distribution columns */
if (relationPartitionKey->varattno != varToBeAdded->varattno)
{
return;
}

View File

@ -1122,7 +1122,7 @@ WHERE
colocated_table_test.value_1 = reference_table_test.value_1;
DEBUG: only reference tables may be queried when targeting a reference table with distributed INSERT ... SELECT
DEBUG: Collecting INSERT ... SELECT results on coordinator
-- not pushable due to lack of equality between partition column and column of reference table
-- safe to push down even lack of equality between partition column and column of reference table
INSERT INTO
colocated_table_test (value_1, value_2)
SELECT
@ -1132,9 +1132,13 @@ FROM
WHERE
colocated_table_test_2.value_4 = reference_table_test.value_4
RETURNING value_1, value_2;
ERROR: cannot perform distributed planning for the given modification
DETAIL: Select query cannot be pushed down to the worker.
-- some more complex queries (Note that there are more complex queries in multi_insert_select.sql)
value_1 | value_2
---------+---------
1 | 1
2 | 2
(2 rows)
-- similar query with the above, this time partition key but without equality
INSERT INTO
colocated_table_test (value_1, value_2)
SELECT
@ -1142,10 +1146,13 @@ SELECT
FROM
colocated_table_test_2, reference_table_test
WHERE
colocated_table_test_2.value_2 = reference_table_test.value_2
colocated_table_test_2.value_1 > reference_table_test.value_2
RETURNING value_1, value_2;
ERROR: cannot perform distributed planning for the given modification
DETAIL: Select query cannot be pushed down to the worker.
value_1 | value_2
---------+---------
2 | 1
(1 row)
-- partition column value comes from reference table, goes via coordinator
INSERT INTO
colocated_table_test (value_1, value_2)
@ -1606,7 +1613,7 @@ INSERT INTO reference_table_test VALUES (2, 2.0, '2', '2016-12-02');
SELECT master_modify_multiple_shards('DELETE FROM colocated_table_test');
master_modify_multiple_shards
-------------------------------
6
9
(1 row)
ROLLBACK;

View File

@ -60,18 +60,38 @@ SELECT count(*) FROM
1
(1 row)
-- Should not work, no equality between partition column and reference table
SELECT * FROM
(SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
-- Should not work, no equality between partition column and reference table
SELECT * FROM
(SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
-- Should work, although no equality between partition column and reference table
SELECT subquery_1.item_id FROM
(SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1
ORDER BY 1;
item_id
---------
2
3
4
5
(4 rows)
-- Should work, although no equality between partition column and reference table
SELECT subquery_1.user_id FROM
(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1
ORDER BY 1;
user_id
---------
1
2
3
3
7
7
7
7
7
7
(10 rows)
-- Shouldn't work, reference table at the outer side is not allowed
SELECT * FROM
(SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table
@ -93,6 +113,33 @@ SELECT * FROM
ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- Equi join test with reference table on non-partition keys
SELECT count(*) FROM
(SELECT random() FROM user_buy_test_table JOIN users_ref_test_table
ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1;
count
-------
4
(1 row)
-- Non-equi join test with reference table on non-partition keys
SELECT count(*) FROM
(SELECT random() FROM user_buy_test_table JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1;
count
-------
10
(1 row)
-- Non-equi left joins with reference tables on non-partition keys
SELECT count(*) FROM
(SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1;
count
-------
10
(1 row)
-- Should pass since reference table locates in the inner part of each left join
SELECT count(*) FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2
@ -105,13 +152,238 @@ SELECT count(*) FROM
2
(1 row)
-- Should not pass since reference table locates in the outer part of right join
-- two subqueries, each include joins with reference table
-- also, two hash distributed tables are joined on partition keys
SELECT count(*) FROM
(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1,
(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2
WHERE subquery_1.user_id = subquery_2.user_id ;
count
-------
4
(1 row)
-- Should be able to push down since reference tables are inner joined
-- with hash distributed tables, the results of those joins are the parts of
-- an outer join
SELECT subquery_2.id FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2
ON tt1.user_id = tt2.user_id) subquery_1
RIGHT JOIN
(SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5;
id
----
3
2
1
(3 rows)
-- the same query as the above, but this Citus fails to pushdown the query
-- since the outer part of the right join doesn't include any joins
SELECT * FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2
ON tt1.user_id = tt2.user_id) subquery_1
RIGHT JOIN
(SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id;
(SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- should be able to pushdown since reference table is in the
-- inner part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id)
INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
user_id | sum
---------+----------
12 | 92221920
17 | 89192642
96 | 85143744
45 | 84267456
90 | 84157047
43 | 82110240
1 | 81735612
72 | 78992640
67 | 72385516
97 | 71002659
(10 rows)
-- same query as above, reference table is wrapped into a subquery
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id)
INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
user_id | sum
---------+----------
12 | 92221920
17 | 89192642
96 | 85143744
45 | 84267456
90 | 84157047
43 | 82110240
1 | 81735612
72 | 78992640
67 | 72385516
97 | 71002659
(10 rows)
-- should be able to pushdown since reference table is in the
-- inner part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id)
LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
user_id | sum
---------+----------
12 | 92221920
17 | 89192642
96 | 85143744
45 | 84267456
90 | 84157047
43 | 82110240
1 | 81735612
72 | 78992640
67 | 72385516
97 | 71002659
(10 rows)
-- should not be able to pushdown since reference table is in the
-- direct outer part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2)
LEFT JOIN events_table ON (events_table.user_id = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- should not be able to pushdown since reference table is in the
-- direct outer part of the left join wrapped into a subquery
SELECT
*
FROM
(SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table
ON (users_table.user_id = ref_all.value_2);
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- should not be able to pushdown since reference table is in the
-- outer part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2)
LEFT JOIN events_table ON (events_table.user_id = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- should be able to pushdown since reference table is in the
-- inner part of the left join
SELECT * FROM
(
SELECT DISTINCT foo.user_id
FROM
((SELECT
"events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random()
FROM
events_reference_table as "events"
WHERE
event_type > 80) as "temp_data_queries"
INNER JOIN
(SELECT
"users"."user_id"
FROM
users_table as "users"
WHERE
user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN
(SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar;
user_id
---------
89
(1 row)
-- the same query but this time reference table is in the outer part of the query
SELECT * FROM
(
SELECT DISTINCT foo.user_id
FROM
((SELECT
"events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random()
FROM
events_reference_table as "events"
WHERE
event_type > 80) as "temp_data_queries"
LEFT JOIN
(SELECT
"users"."user_id"
FROM
users_table as "users"
WHERE
user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN
(SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- we could even suuport the following where the subquery
-- on the outer part of the left join contains a reference table
SELECT max(events_all.cnt), events_all.usr_id
FROM
(SELECT users_table.user_id as usr_id,
count(*) as cnt
FROM events_reference_table
INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all
LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5;
max | usr_id
-------+--------
14605 | 23
13090 | 17
12915 | 25
12317 | 90
12285 | 87
(5 rows)
-- but, we fail to pushdown the following query where join that reference table appears
-- wrapped into a subquery
SELECT max(events_all.cnt),
events_all.usr_id
FROM(
SELECT *, random() FROM
(SELECT users_table.user_id AS usr_id, count(*) AS cnt
FROM events_reference_table
INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id)
GROUP BY users_table.user_id) AS events_all_inner) AS events_all
LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id)
GROUP BY 2
ORDER BY 1 DESC,
2 DESC
LIMIT 5;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- LATERAL JOINs used with INNER JOINs with reference tables
@ -193,10 +465,10 @@ SELECT
count(*) AS value, "generated_group_field"
FROM
(SELECT
DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field"
DISTINCT "pushedDownQuery"."user_id", "generated_group_field"
FROM
(SELECT
"eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field"
"eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field"
FROM
(SELECT
*
@ -204,7 +476,7 @@ SELECT
(SELECT
"events"."time", "events"."user_id", "events"."value_2"
FROM
events_reference_table as "events"
events_table as "events"
WHERE
user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries"
INNER JOIN
@ -355,8 +627,8 @@ LIMIT 10;
(6 rows)
SET citus.subquery_pushdown to OFF;
-- LEFT JOINs used with INNER JOINs should error out since reference table exist in the
-- left side of the LEFT JOIN.
-- LEFT JOINs used with INNER JOINs should not error out since reference table joined
-- with hash table that Citus can push down
SELECT
count(*) AS cnt, "generated_group_field"
FROM
@ -393,8 +665,20 @@ count(*) AS cnt, "generated_group_field"
ORDER BY
cnt DESC, generated_group_field ASC
LIMIT 10;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
cnt | generated_group_field
-----+-----------------------
176 | 551
176 | 569
176 | 645
176 | 713
176 | 734
88 | 3
88 | 5
88 | 15
88 | 32
88 | 68
(10 rows)
-- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the
-- right side of the RIGHT JOIN.
SELECT
@ -435,6 +719,133 @@ count(*) AS cnt, "generated_group_field"
LIMIT 10;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- right join where the inner part of the join includes a reference table
-- joined with hash partitioned table using non-equi join
SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event
FROM (
SELECT
t1.user_id,
array_agg(event ORDER BY time) AS events_table,
COALESCE(hasdone_event, 'Has not done event') AS hasdone_event
FROM (
(
SELECT u.user_id, 'step=>1'::text AS event, e.time
FROM users_table AS u,
events_reference_table AS e
WHERE u.user_id > e.user_id
AND u.user_id >= 10
AND u.user_id <= 25
AND e.event_type IN (100, 101, 102)
)
) t1 RIGHT JOIN (
SELECT DISTINCT user_id,
'Has done event'::TEXT AS hasdone_event
FROM events_table AS e
WHERE e.user_id >= 10
AND e.user_id <= 25
AND e.event_type IN (106, 107, 108)
) t2 ON (t1.user_id = t2.user_id)
GROUP BY t1.user_id, hasdone_event
) t GROUP BY user_id, hasdone_event
ORDER BY user_id;
user_id | sum | length | hasdone_event
---------+-----+--------+----------------
11 | 306 | 14 | Has done event
12 | 363 | 14 | Has done event
14 | 510 | 14 | Has done event
18 | 600 | 14 | Has done event
19 | 618 | 14 | Has done event
(5 rows)
-- a similar query as the above, with non-partition key comparison
SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event
FROM (
SELECT
t1.user_id,
array_agg(event ORDER BY time) AS events_table,
COALESCE(hasdone_event, 'Has not done event') AS hasdone_event
FROM (
(
SELECT u.user_id, 'step=>1'::text AS event, e.time
FROM users_table AS u,
events_reference_table AS e
WHERE u.value_1 > e.user_id
AND u.user_id >= 10
AND u.user_id <= 25
AND e.event_type >= 125 AND e.event_type < 130
)
) t1 RIGHT JOIN (
SELECT DISTINCT user_id,
'Has done event'::TEXT AS hasdone_event
FROM events_table AS e
WHERE e.user_id >= 10
AND e.user_id <= 25
AND e.event_type >= 130 AND e.event_type < 135
) t2 ON (t1.user_id = t2.user_id)
GROUP BY t1.user_id, hasdone_event
) t GROUP BY user_id, hasdone_event
ORDER BY user_id;
user_id | sum | length | hasdone_event
---------+------+--------+----------------
10 | 6018 | 14 | Has done event
16 | 5373 | 14 | Has done event
17 | 5683 | 14 | Has done event
18 | 5321 | 14 | Has done event
(4 rows)
-- LEFT JOINs used with INNER JOINs
-- events_table and users_reference_table joined
-- with event_table.non_part_key < reference_table.any_key
SELECT
count(*) AS cnt, "generated_group_field"
FROM
(SELECT
"eventQuery"."user_id", random(), generated_group_field
FROM
(SELECT
"multi_group_wrapper_1".*, generated_group_field, random()
FROM
(SELECT *
FROM
(SELECT
"events"."time", "events"."user_id" as event_user_id
FROM
events_table as "events"
WHERE
user_id > 80) "temp_data_queries"
INNER JOIN
(SELECT
"users"."user_id"
FROM
users_reference_table as "users"
WHERE
user_id > 80 and value_2 = 5) "user_filters_1"
ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1"
RIGHT JOIN
(SELECT
"users"."user_id" AS "user_id", value_2 AS "generated_group_field"
FROM
users_table as "users") "left_group_by_1"
ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery"
group BY
"generated_group_field"
ORDER BY
cnt DESC, generated_group_field ASC
LIMIT 10;
cnt | generated_group_field
-----+-----------------------
540 | 814
533 | 746
473 | 914
449 | 684
445 | 715
423 | 191
419 | 39
415 | 108
414 | 819
411 | 642
(10 rows)
-- Outer subquery with reference table
SELECT "some_users_data".user_id, lastseen
FROM
@ -468,9 +879,9 @@ limit 50;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
--
-- UNIONs and JOINs with reference tables, shoukld error out
-- UNIONs and JOINs with reference tables, should error out
--
SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType
SELECT ("final_query"."event_types") as types
FROM
( SELECT *, random()
FROM
@ -484,7 +895,7 @@ FROM
(SELECT
"events"."user_id", "events"."time", 0 AS event
FROM
events_reference_table as "events"
events_table as "events"
WHERE
event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1)
UNION
@ -494,7 +905,7 @@ FROM
(SELECT
"events"."user_id", "events"."time", 1 AS event
FROM
events_table as "events"
events_reference_table as "events"
WHERE
event_type IN (15, 16, 17, 18, 19) ) events_subquery_2)
UNION
@ -526,8 +937,6 @@ INNER JOIN
WHERE
value_1 > 50 and value_1 < 70) AS t
ON (t.user_id = q.user_id)) as final_query
GROUP BY
types
ORDER BY
types;
ERROR: cannot push down this subquery
@ -560,15 +969,15 @@ FROM
SELECT * FROM
(
SELECT
max("events"."time"),
max("users"."time"),
0 AS event,
"events"."user_id"
"users"."user_id"
FROM
events_reference_table as "events", users_table as "users"
WHERE
events.user_id = users.user_id AND
event_type IN (10, 11, 12, 13, 14, 15)
GROUP BY "events"."user_id"
GROUP BY "users"."user_id"
) as events_subquery_5
) events_subquery_2)
UNION
@ -660,6 +1069,117 @@ GROUP BY types
ORDER BY types;
ERROR: cannot push down this subquery
DETAIL: Reference tables are not supported with union operator
-- just a sanity check that we don't allow this if the reference table is on the
-- left part of the left join
SELECT count(*) FROM
(SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- we don't allow non equi join among hash partitioned tables
SELECT count(*) FROM
(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1,
(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2
WHERE subquery_1.user_id != subquery_2.user_id ;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
-- we cannot push this query since hash partitioned tables
-- are not joined on partition keys with equality
SELECT
count(*) AS cnt, "generated_group_field"
FROM
(SELECT
"eventQuery"."user_id", random(), generated_group_field
FROM
(SELECT
"multi_group_wrapper_1".*, generated_group_field, random()
FROM
(SELECT *
FROM
(SELECT
"events"."time", "events"."user_id" as event_user_id
FROM
events_table as "events"
WHERE
user_id > 80) "temp_data_queries"
INNER JOIN
(SELECT
"users"."user_id"
FROM
users_reference_table as "users"
WHERE
user_id > 80 and value_2 = 5) "user_filters_1"
ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1"
RIGHT JOIN
(SELECT
"users"."user_id" AS "user_id", value_2 AS "generated_group_field"
FROM
users_table as "users") "left_group_by_1"
ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery"
group BY
"generated_group_field"
ORDER BY
cnt DESC, generated_group_field ASC
LIMIT 10;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
-- two hash partitioned relations are not joined
-- on partiton keys although reference table is fine
-- to push down
SELECT
u1.user_id, count(*)
FROM
events_table as e1, users_table as u1
WHERE
event_type IN
(SELECT
event_type
FROM
events_reference_table as e2
WHERE
value_2 = 15 AND
value_3 > 25 AND
e1.value_2 > e2.value_2
)
AND u1.user_id > e1.user_id
GROUP BY 1
ORDER BY 2 DESC, 1 DESC
LIMIT 5;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
SELECT foo.user_id FROM
(
SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
WHERE event_type > 100000
) as foo;
user_id
---------
(0 rows)
-- not supported since group by is on the reference table column
SELECT foo.user_id FROM
(
SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
GROUP BY r.user_id
) as foo;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported
-- not supported since distinct is on the reference table column
SELECT foo.user_id FROM
(
SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
) as foo;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
-- not supported since distinct on is on the reference table column
SELECT foo.user_id FROM
(
SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
) as foo;
ERROR: cannot push down this subquery
DETAIL: Distinct on columns without partition column is currently unsupported
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -1,33 +1,5 @@
--
-- queries to test the subquery pushdown on reference tables
-- subqueries in WHERE with greater operator
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
user_id
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
user_id
---------
49
55
56
63
(4 rows)
-- subqueries in WHERE with IN operator
SELECT
user_id
@ -91,10 +63,10 @@ WHERE
)
LIMIT 3;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause
-- subqueries in WHERE with IN operator without equality
SELECT
user_id
users_table.user_id, count(*)
FROM
users_table
WHERE
@ -106,59 +78,15 @@ WHERE
WHERE
users_table.user_id > events_reference_table.user_id
)
GROUP BY user_id
ORDER BY user_id
GROUP BY users_table.user_id
ORDER BY 2 DESC, 1 DESC
LIMIT 3;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
-- have reference table without any equality, should error out
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
event_type = 50
GROUP BY
user_id
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
ERROR: cannot pushdown the subquery since all relations are not joined using distribution keys
DETAIL: Each relation should be joined with at least one another relation using distribution keys and equality operator.
-- users that appeared more than 118 times, should run since the reference table
-- on the right side of the semi join
SELECT
user_id
FROM
users_table
WHERE 118 <=
(SELECT
count(*)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id
GROUP BY
user_id)
GROUP BY
user_id
ORDER BY
user_id;
user_id
---------
13
17
23
25
(4 rows)
user_id | count
---------+-------
87 | 117
59 | 115
46 | 115
(3 rows)
-- should error out since reference table exist on the left side
-- of the left lateral join
@ -224,3 +152,194 @@ SELECT user_id, value_2 FROM users_table WHERE
ORDER BY 1, 2;
ERROR: cannot pushdown the subquery
DETAIL: There exist a reference table in the outer part of the outer join
-- non-partition key equality with reference table
SELECT
user_id, count(*)
FROM
users_table
WHERE
value_3 =ANY(SELECT value_2 FROM users_reference_table WHERE value_1 >= 10 AND value_1 <= 20)
GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5;
user_id | count
---------+-------
48 | 18
26 | 18
15 | 17
54 | 16
35 | 15
(5 rows)
-- non-partition key comparison with reference table
SELECT
user_id, count(*)
FROM
events_table as e1
WHERE
event_type IN
(SELECT
event_type
FROM
events_reference_table as e2
WHERE
value_2 = 15 AND
value_3 > 25 AND
e1.value_2 > e2.value_2
)
GROUP BY 1
ORDER BY 2 DESC, 1 DESC
LIMIT 5;
user_id | count
---------+-------
3 | 5
56 | 4
99 | 2
94 | 2
92 | 2
(5 rows)
-- subqueries in both WHERE and FROM clauses
-- should work since reference table is on the
-- inner part of the join
SELECT user_id, value_2 FROM users_table WHERE
value_1 > 101 AND value_1 < 110
AND value_2 >= 5
AND user_id IN
(
SELECT
e1.user_id
FROM (
-- Get the first time each user viewed the homepage.
SELECT
user_id,
1 AS view_homepage,
min(time) AS view_homepage_time
FROM events_table
WHERE
event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90)
GROUP BY user_id
) e1 LEFT JOIN LATERAL (
SELECT
user_id,
1 AS use_demo,
time AS use_demo_time
FROM events_table
WHERE
user_id = e1.user_id AND
event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91)
ORDER BY time
) e2 ON true LEFT JOIN LATERAL (
SELECT
user_id,
1 AS enter_credit_card,
time AS enter_credit_card_time
FROM events_table
WHERE
user_id = e2.user_id AND
event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92)
ORDER BY time
) e3 ON true LEFT JOIN LATERAL (
SELECT
1 AS submit_card_info,
user_id,
time AS enter_credit_card_time
FROM events_table
WHERE
user_id = e3.user_id AND
event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93)
ORDER BY time
) e4 ON true LEFT JOIN LATERAL (
SELECT
1 AS see_bought_screen
FROM events_reference_table
WHERE
user_id = e4.user_id AND
event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94)
ORDER BY time
) e5 ON true
group by e1.user_id
HAVING sum(submit_card_info) > 0
)
ORDER BY 1, 2;
user_id | value_2
---------+---------
5 | 884
42 | 55
42 | 471
51 | 758
72 | 897
82 | 691
95 | 951
(7 rows)
-- reference tables are not allowed if there is sublink
SELECT
count(*)
FROM
users_reference_table
WHERE user_id
NOT IN
(SELECT users_table.value_2 FROM users_table JOIN users_reference_table as u2 ON users_table.value_2 = u2.value_2);
ERROR: cannot pushdown the subquery
DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause
-- reference tables are not allowed if there is sublink
SELECT count(*)
FROM
(SELECT
user_id, random() FROM users_reference_table) AS vals
WHERE vals.user_id NOT IN
(SELECT users_table.value_2
FROM users_table
JOIN users_reference_table AS u2 ON users_table.value_2 = u2.value_2);
ERROR: cannot pushdown the subquery
DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause
-- reference tables are not allowed if there is sublink
SELECT user_id,
count(*)
FROM users_reference_table
WHERE value_2 > ALL
(SELECT min(value_2)
FROM events_table
WHERE event_type > 50 AND users_reference_table.user_id = events_table.user_id
GROUP BY user_id)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY 2 DESC,
1 DESC
LIMIT 5;
ERROR: cannot pushdown the subquery
DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause
-- reference tables are not allowed if there is sublink
-- this time in the subquery
SELECT *
FROM users_table
WHERE user_id IN
(SELECT users_table.user_id
FROM users_table,
users_reference_table
WHERE users_reference_table.user_id NOT IN
(SELECT value_2
FROM users_reference_table AS u2));
ERROR: cannot push down this subquery
DETAIL: Reference tables are not allowed in FROM clause when the query has subqueries in WHERE clause
-- not supported since GROUP BY references to an upper level query
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
users_table.user_id
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
ERROR: cannot push down this subquery
DETAIL: Group by list without partition column is currently unsupported

View File

@ -356,6 +356,7 @@ SET citus.shard_max_size TO "1MB";
CREATE TABLE events_reference_table (like events_table including all);
SELECT create_reference_table('events_reference_table');
CREATE INDEX events_ref_val2 on events_reference_table(value_2);
INSERT INTO events_reference_table SELECT * FROM events_table;
CREATE TABLE users_reference_table (like users_table including all);

View File

@ -428,6 +428,7 @@ SELECT create_reference_table('events_reference_table');
(1 row)
CREATE INDEX events_ref_val2 on events_reference_table(value_2);
INSERT INTO events_reference_table SELECT * FROM events_table;
CREATE TABLE users_reference_table (like users_table including all);
SELECT create_reference_table('users_reference_table');

View File

@ -707,7 +707,7 @@ FROM
WHERE
colocated_table_test.value_1 = reference_table_test.value_1;
-- not pushable due to lack of equality between partition column and column of reference table
-- safe to push down even lack of equality between partition column and column of reference table
INSERT INTO
colocated_table_test (value_1, value_2)
SELECT
@ -718,7 +718,7 @@ WHERE
colocated_table_test_2.value_4 = reference_table_test.value_4
RETURNING value_1, value_2;
-- some more complex queries (Note that there are more complex queries in multi_insert_select.sql)
-- similar query with the above, this time partition key but without equality
INSERT INTO
colocated_table_test (value_1, value_2)
SELECT
@ -726,7 +726,7 @@ SELECT
FROM
colocated_table_test_2, reference_table_test
WHERE
colocated_table_test_2.value_2 = reference_table_test.value_2
colocated_table_test_2.value_1 > reference_table_test.value_2
RETURNING value_1, value_2;
-- partition column value comes from reference table, goes via coordinator

View File

@ -42,15 +42,17 @@ SELECT count(*) FROM
(SELECT random(), k_no FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1 WHERE k_no = 47;
-- Should not work, no equality between partition column and reference table
SELECT * FROM
(SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1;
-- Should work, although no equality between partition column and reference table
SELECT subquery_1.item_id FROM
(SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1
ORDER BY 1;
-- Should not work, no equality between partition column and reference table
SELECT * FROM
(SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1;
-- Should work, although no equality between partition column and reference table
SELECT subquery_1.user_id FROM
(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1
ORDER BY 1;
-- Shouldn't work, reference table at the outer side is not allowed
SELECT * FROM
@ -67,6 +69,21 @@ SELECT * FROM
(SELECT random() FROM user_buy_test_table RIGHT JOIN users_ref_test_table
ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1;
-- Equi join test with reference table on non-partition keys
SELECT count(*) FROM
(SELECT random() FROM user_buy_test_table JOIN users_ref_test_table
ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1;
-- Non-equi join test with reference table on non-partition keys
SELECT count(*) FROM
(SELECT random() FROM user_buy_test_table JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1;
-- Non-equi left joins with reference tables on non-partition keys
SELECT count(*) FROM
(SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1;
-- Should pass since reference table locates in the inner part of each left join
SELECT count(*) FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2
@ -75,13 +92,172 @@ SELECT count(*) FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 LEFT JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id;
-- Should not pass since reference table locates in the outer part of right join
-- two subqueries, each include joins with reference table
-- also, two hash distributed tables are joined on partition keys
SELECT count(*) FROM
(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1,
(SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2
WHERE subquery_1.user_id = subquery_2.user_id ;
-- Should be able to push down since reference tables are inner joined
-- with hash distributed tables, the results of those joins are the parts of
-- an outer join
SELECT subquery_2.id FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2
ON tt1.user_id = tt2.user_id) subquery_1
RIGHT JOIN
(SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5;
-- the same query as the above, but this Citus fails to pushdown the query
-- since the outer part of the right join doesn't include any joins
SELECT * FROM
(SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2
ON tt1.user_id = tt2.user_id) subquery_1
RIGHT JOIN
(SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id;
(SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref
ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id;
-- should be able to pushdown since reference table is in the
-- inner part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id)
INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
-- same query as above, reference table is wrapped into a subquery
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id)
INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
-- should be able to pushdown since reference table is in the
-- inner part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id)
LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
-- should not be able to pushdown since reference table is in the
-- direct outer part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2)
LEFT JOIN events_table ON (events_table.user_id = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
-- should not be able to pushdown since reference table is in the
-- direct outer part of the left join wrapped into a subquery
SELECT
*
FROM
(SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table
ON (users_table.user_id = ref_all.value_2);
-- should not be able to pushdown since reference table is in the
-- outer part of the left join
SELECT
user_id, sum(value_1)
FROM
(SELECT
users_table.user_id, users_table.value_1, random()
FROM
events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2)
LEFT JOIN events_table ON (events_table.user_id = users_table.user_id)
) as foo
GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
-- should be able to pushdown since reference table is in the
-- inner part of the left join
SELECT * FROM
(
SELECT DISTINCT foo.user_id
FROM
((SELECT
"events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random()
FROM
events_reference_table as "events"
WHERE
event_type > 80) as "temp_data_queries"
INNER JOIN
(SELECT
"users"."user_id"
FROM
users_table as "users"
WHERE
user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN
(SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar;
-- the same query but this time reference table is in the outer part of the query
SELECT * FROM
(
SELECT DISTINCT foo.user_id
FROM
((SELECT
"events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random()
FROM
events_reference_table as "events"
WHERE
event_type > 80) as "temp_data_queries"
LEFT JOIN
(SELECT
"users"."user_id"
FROM
users_table as "users"
WHERE
user_id > 80 and value_2 = 5) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN
(SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar;
-- we could even suuport the following where the subquery
-- on the outer part of the left join contains a reference table
SELECT max(events_all.cnt), events_all.usr_id
FROM
(SELECT users_table.user_id as usr_id,
count(*) as cnt
FROM events_reference_table
INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all
LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5;
-- but, we fail to pushdown the following query where join that reference table appears
-- wrapped into a subquery
SELECT max(events_all.cnt),
events_all.usr_id
FROM(
SELECT *, random() FROM
(SELECT users_table.user_id AS usr_id, count(*) AS cnt
FROM events_reference_table
INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id)
GROUP BY users_table.user_id) AS events_all_inner) AS events_all
LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id)
GROUP BY 2
ORDER BY 1 DESC,
2 DESC
LIMIT 5;
-- LATERAL JOINs used with INNER JOINs with reference tables
SET citus.subquery_pushdown to ON;
@ -149,10 +325,10 @@ SELECT
count(*) AS value, "generated_group_field"
FROM
(SELECT
DISTINCT "pushedDownQuery"."real_user_id", "generated_group_field"
DISTINCT "pushedDownQuery"."user_id", "generated_group_field"
FROM
(SELECT
"eventQuery"."real_user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field"
"eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field"
FROM
(SELECT
*
@ -160,7 +336,7 @@ SELECT
(SELECT
"events"."time", "events"."user_id", "events"."value_2"
FROM
events_reference_table as "events"
events_table as "events"
WHERE
user_id > 10 and user_id < 40 AND event_type IN (40, 41, 42, 43, 44, 45) ) "temp_data_queries"
INNER JOIN
@ -270,8 +446,8 @@ ORDER BY
LIMIT 10;
SET citus.subquery_pushdown to OFF;
-- LEFT JOINs used with INNER JOINs should error out since reference table exist in the
-- left side of the LEFT JOIN.
-- LEFT JOINs used with INNER JOINs should not error out since reference table joined
-- with hash table that Citus can push down
SELECT
count(*) AS cnt, "generated_group_field"
FROM
@ -348,6 +524,106 @@ count(*) AS cnt, "generated_group_field"
cnt DESC, generated_group_field ASC
LIMIT 10;
-- right join where the inner part of the join includes a reference table
-- joined with hash partitioned table using non-equi join
SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event
FROM (
SELECT
t1.user_id,
array_agg(event ORDER BY time) AS events_table,
COALESCE(hasdone_event, 'Has not done event') AS hasdone_event
FROM (
(
SELECT u.user_id, 'step=>1'::text AS event, e.time
FROM users_table AS u,
events_reference_table AS e
WHERE u.user_id > e.user_id
AND u.user_id >= 10
AND u.user_id <= 25
AND e.event_type IN (100, 101, 102)
)
) t1 RIGHT JOIN (
SELECT DISTINCT user_id,
'Has done event'::TEXT AS hasdone_event
FROM events_table AS e
WHERE e.user_id >= 10
AND e.user_id <= 25
AND e.event_type IN (106, 107, 108)
) t2 ON (t1.user_id = t2.user_id)
GROUP BY t1.user_id, hasdone_event
) t GROUP BY user_id, hasdone_event
ORDER BY user_id;
-- a similar query as the above, with non-partition key comparison
SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event
FROM (
SELECT
t1.user_id,
array_agg(event ORDER BY time) AS events_table,
COALESCE(hasdone_event, 'Has not done event') AS hasdone_event
FROM (
(
SELECT u.user_id, 'step=>1'::text AS event, e.time
FROM users_table AS u,
events_reference_table AS e
WHERE u.value_1 > e.user_id
AND u.user_id >= 10
AND u.user_id <= 25
AND e.event_type >= 125 AND e.event_type < 130
)
) t1 RIGHT JOIN (
SELECT DISTINCT user_id,
'Has done event'::TEXT AS hasdone_event
FROM events_table AS e
WHERE e.user_id >= 10
AND e.user_id <= 25
AND e.event_type >= 130 AND e.event_type < 135
) t2 ON (t1.user_id = t2.user_id)
GROUP BY t1.user_id, hasdone_event
) t GROUP BY user_id, hasdone_event
ORDER BY user_id;
-- LEFT JOINs used with INNER JOINs
-- events_table and users_reference_table joined
-- with event_table.non_part_key < reference_table.any_key
SELECT
count(*) AS cnt, "generated_group_field"
FROM
(SELECT
"eventQuery"."user_id", random(), generated_group_field
FROM
(SELECT
"multi_group_wrapper_1".*, generated_group_field, random()
FROM
(SELECT *
FROM
(SELECT
"events"."time", "events"."user_id" as event_user_id
FROM
events_table as "events"
WHERE
user_id > 80) "temp_data_queries"
INNER JOIN
(SELECT
"users"."user_id"
FROM
users_reference_table as "users"
WHERE
user_id > 80 and value_2 = 5) "user_filters_1"
ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1"
RIGHT JOIN
(SELECT
"users"."user_id" AS "user_id", value_2 AS "generated_group_field"
FROM
users_table as "users") "left_group_by_1"
ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery"
group BY
"generated_group_field"
ORDER BY
cnt DESC, generated_group_field ASC
LIMIT 10;
-- Outer subquery with reference table
SELECT "some_users_data".user_id, lastseen
FROM
@ -380,9 +656,9 @@ ORDER BY
limit 50;
--
-- UNIONs and JOINs with reference tables, shoukld error out
-- UNIONs and JOINs with reference tables, should error out
--
SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType
SELECT ("final_query"."event_types") as types
FROM
( SELECT *, random()
FROM
@ -396,7 +672,7 @@ FROM
(SELECT
"events"."user_id", "events"."time", 0 AS event
FROM
events_reference_table as "events"
events_table as "events"
WHERE
event_type IN (10, 11, 12, 13, 14, 15) ) events_subquery_1)
UNION
@ -406,7 +682,7 @@ FROM
(SELECT
"events"."user_id", "events"."time", 1 AS event
FROM
events_table as "events"
events_reference_table as "events"
WHERE
event_type IN (15, 16, 17, 18, 19) ) events_subquery_2)
UNION
@ -438,8 +714,6 @@ INNER JOIN
WHERE
value_1 > 50 and value_1 < 70) AS t
ON (t.user_id = q.user_id)) as final_query
GROUP BY
types
ORDER BY
types;
@ -471,15 +745,15 @@ FROM
SELECT * FROM
(
SELECT
max("events"."time"),
max("users"."time"),
0 AS event,
"events"."user_id"
"users"."user_id"
FROM
events_reference_table as "events", users_table as "users"
WHERE
events.user_id = users.user_id AND
event_type IN (10, 11, 12, 13, 14, 15)
GROUP BY "events"."user_id"
GROUP BY "users"."user_id"
) as events_subquery_5
) events_subquery_2)
UNION
@ -569,6 +843,107 @@ INNER JOIN
GROUP BY types
ORDER BY types;
-- just a sanity check that we don't allow this if the reference table is on the
-- left part of the left join
SELECT count(*) FROM
(SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1;
-- we don't allow non equi join among hash partitioned tables
SELECT count(*) FROM
(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1,
(SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table
ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2
WHERE subquery_1.user_id != subquery_2.user_id ;
-- we cannot push this query since hash partitioned tables
-- are not joined on partition keys with equality
SELECT
count(*) AS cnt, "generated_group_field"
FROM
(SELECT
"eventQuery"."user_id", random(), generated_group_field
FROM
(SELECT
"multi_group_wrapper_1".*, generated_group_field, random()
FROM
(SELECT *
FROM
(SELECT
"events"."time", "events"."user_id" as event_user_id
FROM
events_table as "events"
WHERE
user_id > 80) "temp_data_queries"
INNER JOIN
(SELECT
"users"."user_id"
FROM
users_reference_table as "users"
WHERE
user_id > 80 and value_2 = 5) "user_filters_1"
ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1"
RIGHT JOIN
(SELECT
"users"."user_id" AS "user_id", value_2 AS "generated_group_field"
FROM
users_table as "users") "left_group_by_1"
ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery"
group BY
"generated_group_field"
ORDER BY
cnt DESC, generated_group_field ASC
LIMIT 10;
-- two hash partitioned relations are not joined
-- on partiton keys although reference table is fine
-- to push down
SELECT
u1.user_id, count(*)
FROM
events_table as e1, users_table as u1
WHERE
event_type IN
(SELECT
event_type
FROM
events_reference_table as e2
WHERE
value_2 = 15 AND
value_3 > 25 AND
e1.value_2 > e2.value_2
)
AND u1.user_id > e1.user_id
GROUP BY 1
ORDER BY 2 DESC, 1 DESC
LIMIT 5;
SELECT foo.user_id FROM
(
SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
WHERE event_type > 100000
) as foo;
-- not supported since group by is on the reference table column
SELECT foo.user_id FROM
(
SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
GROUP BY r.user_id
) as foo;
-- not supported since distinct is on the reference table column
SELECT foo.user_id FROM
(
SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
) as foo;
-- not supported since distinct on is on the reference table column
SELECT foo.user_id FROM
(
SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id)
) as foo;
DROP TABLE user_buy_test_table;
DROP TABLE users_ref_test_table;
DROP TABLE users_return_test_table;

View File

@ -1,27 +1,6 @@
--
-- queries to test the subquery pushdown on reference tables
-- subqueries in WHERE with greater operator
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
user_id
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
-- subqueries in WHERE with IN operator
SELECT
user_id
@ -78,7 +57,7 @@ LIMIT 3;
-- subqueries in WHERE with IN operator without equality
SELECT
user_id
users_table.user_id, count(*)
FROM
users_table
WHERE
@ -90,51 +69,10 @@ WHERE
WHERE
users_table.user_id > events_reference_table.user_id
)
GROUP BY user_id
ORDER BY user_id
GROUP BY users_table.user_id
ORDER BY 2 DESC, 1 DESC
LIMIT 3;
-- have reference table without any equality, should error out
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
event_type = 50
GROUP BY
user_id
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;
-- users that appeared more than 118 times, should run since the reference table
-- on the right side of the semi join
SELECT
user_id
FROM
users_table
WHERE 118 <=
(SELECT
count(*)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id
GROUP BY
user_id)
GROUP BY
user_id
ORDER BY
user_id;
-- should error out since reference table exist on the left side
-- of the left lateral join
SELECT user_id, value_2 FROM users_table WHERE
@ -197,3 +135,165 @@ SELECT user_id, value_2 FROM users_table WHERE
HAVING sum(submit_card_info) > 0
)
ORDER BY 1, 2;
-- non-partition key equality with reference table
SELECT
user_id, count(*)
FROM
users_table
WHERE
value_3 =ANY(SELECT value_2 FROM users_reference_table WHERE value_1 >= 10 AND value_1 <= 20)
GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5;
-- non-partition key comparison with reference table
SELECT
user_id, count(*)
FROM
events_table as e1
WHERE
event_type IN
(SELECT
event_type
FROM
events_reference_table as e2
WHERE
value_2 = 15 AND
value_3 > 25 AND
e1.value_2 > e2.value_2
)
GROUP BY 1
ORDER BY 2 DESC, 1 DESC
LIMIT 5;
-- subqueries in both WHERE and FROM clauses
-- should work since reference table is on the
-- inner part of the join
SELECT user_id, value_2 FROM users_table WHERE
value_1 > 101 AND value_1 < 110
AND value_2 >= 5
AND user_id IN
(
SELECT
e1.user_id
FROM (
-- Get the first time each user viewed the homepage.
SELECT
user_id,
1 AS view_homepage,
min(time) AS view_homepage_time
FROM events_table
WHERE
event_type IN (10, 20, 30, 40, 50, 60, 70, 80, 90)
GROUP BY user_id
) e1 LEFT JOIN LATERAL (
SELECT
user_id,
1 AS use_demo,
time AS use_demo_time
FROM events_table
WHERE
user_id = e1.user_id AND
event_type IN (11, 21, 31, 41, 51, 61, 71, 81, 91)
ORDER BY time
) e2 ON true LEFT JOIN LATERAL (
SELECT
user_id,
1 AS enter_credit_card,
time AS enter_credit_card_time
FROM events_table
WHERE
user_id = e2.user_id AND
event_type IN (12, 22, 32, 42, 52, 62, 72, 82, 92)
ORDER BY time
) e3 ON true LEFT JOIN LATERAL (
SELECT
1 AS submit_card_info,
user_id,
time AS enter_credit_card_time
FROM events_table
WHERE
user_id = e3.user_id AND
event_type IN (13, 23, 33, 43, 53, 63, 73, 83, 93)
ORDER BY time
) e4 ON true LEFT JOIN LATERAL (
SELECT
1 AS see_bought_screen
FROM events_reference_table
WHERE
user_id = e4.user_id AND
event_type IN (14, 24, 34, 44, 54, 64, 74, 84, 94)
ORDER BY time
) e5 ON true
group by e1.user_id
HAVING sum(submit_card_info) > 0
)
ORDER BY 1, 2;
-- reference tables are not allowed if there is sublink
SELECT
count(*)
FROM
users_reference_table
WHERE user_id
NOT IN
(SELECT users_table.value_2 FROM users_table JOIN users_reference_table as u2 ON users_table.value_2 = u2.value_2);
-- reference tables are not allowed if there is sublink
SELECT count(*)
FROM
(SELECT
user_id, random() FROM users_reference_table) AS vals
WHERE vals.user_id NOT IN
(SELECT users_table.value_2
FROM users_table
JOIN users_reference_table AS u2 ON users_table.value_2 = u2.value_2);
-- reference tables are not allowed if there is sublink
SELECT user_id,
count(*)
FROM users_reference_table
WHERE value_2 > ALL
(SELECT min(value_2)
FROM events_table
WHERE event_type > 50 AND users_reference_table.user_id = events_table.user_id
GROUP BY user_id)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY 2 DESC,
1 DESC
LIMIT 5;
-- reference tables are not allowed if there is sublink
-- this time in the subquery
SELECT *
FROM users_table
WHERE user_id IN
(SELECT users_table.user_id
FROM users_table,
users_reference_table
WHERE users_reference_table.user_id NOT IN
(SELECT value_2
FROM users_reference_table AS u2));
-- not supported since GROUP BY references to an upper level query
SELECT
user_id
FROM
users_table
WHERE
value_2 >
(SELECT
max(value_2)
FROM
events_reference_table
WHERE
users_table.user_id = events_reference_table.user_id AND event_type = 50
GROUP BY
users_table.user_id
)
GROUP BY user_id
HAVING count(*) > 66
ORDER BY user_id
LIMIT 5;