From 217890af5f687e2a804b00c36f069025fe8fc769 Mon Sep 17 00:00:00 2001 From: Nils Dijk Date: Mon, 18 Nov 2019 16:25:46 +0100 Subject: [PATCH] Feature: Expression in reference join (#3180) DESCRIPTION: Expression in reference join Fixed: #2582 This patch allows arbitrary expressions in the join clause when joining to a reference table. An example of such joins could be found in CHbenCHmark queries 7, 8, 9 and 11; `mod((s_w_id * s_i_id),10000) = su_suppkey` and `ascii(substr(c_state,1,1)) = n2.n_nationkey`. Since the join is on a reference table these queries are able to be pushed down to the workers. To implement these queries we will widen the `IsJoinClause` predicate to not check if the expressions are a type `Var` after stripping the implicit coerciens. Instead we define a join clause when the `Var`'s in a clause come from more than 1 table. This allows more clauses to pass into the logical planner's `MultiNodeTree(...)` planning function. To compensate for this we tighten down the `LocalJoin`, `SinglePartitionJoin` and `DualPartitionJoin` to check for direct column references when planning. This allows the planner to work with arbitrary join expressions on reference tables. --- .../distributed/planner/multi_join_order.c | 282 ++++-- .../planner/multi_logical_planner.c | 89 +- .../planner/multi_physical_planner.c | 9 +- src/include/distributed/multi_join_order.h | 6 +- .../expected/chbenchmark_all_queries.out | 903 ++++++++++++++++++ .../expected/expression_reference_join.out | 56 ++ .../expected/single_hash_repartition_join.out | 2 +- src/test/regress/multi_schedule | 2 +- .../regress/sql/chbenchmark_all_queries.sql | 722 ++++++++++++++ .../regress/sql/expression_reference_join.sql | 41 + 10 files changed, 1963 insertions(+), 149 deletions(-) create mode 100644 src/test/regress/expected/chbenchmark_all_queries.out create mode 100644 src/test/regress/expected/expression_reference_join.out create mode 100644 src/test/regress/sql/chbenchmark_all_queries.sql create mode 100644 src/test/regress/sql/expression_reference_join.sql diff --git a/src/backend/distributed/planner/multi_join_order.c b/src/backend/distributed/planner/multi_join_order.c index 27b3d9d0d..e54bd0967 100644 --- a/src/backend/distributed/planner/multi_join_order.c +++ b/src/backend/distributed/planner/multi_join_order.c @@ -18,6 +18,7 @@ #include "access/heapam.h" #include "access/htup_details.h" #include "catalog/pg_am.h" +#include "distributed/listutils.h" #include "distributed/metadata_cache.h" #include "distributed/multi_join_order.h" #include "distributed/multi_physical_planner.h" @@ -208,37 +209,45 @@ ExtractLeftMostRangeTableIndex(Node *node, int *rangeTableIndex) /* - * JoinOnColumns determines whether two columns are joined by a given join clause - * list. + * JoinOnColumns determines whether two columns are joined by a given join clause list. */ static bool JoinOnColumns(Var *currentColumn, Var *candidateColumn, List *joinClauseList) { ListCell *joinClauseCell = NULL; - bool joinOnColumns = false; + if (currentColumn == NULL || candidateColumn == NULL) + { + /* + * LocalJoin can only be happening if we have both a current column and a target + * column, otherwise we are not joining two local tables + */ + return false; + } foreach(joinClauseCell, joinClauseList) { - OpExpr *joinClause = (OpExpr *) lfirst(joinClauseCell); - Var *leftColumn = LeftColumn(joinClause); - Var *rightColumn = RightColumn(joinClause); + OpExpr *joinClause = castNode(OpExpr, lfirst(joinClauseCell)); + Var *leftColumn = LeftColumnOrNULL(joinClause); + Var *rightColumn = RightColumnOrNULL(joinClause); - /* check if both join columns and both partition key columns match */ + /* + * Check if both join columns and both partition key columns match, since the + * current and candidate column's can't be NULL we know they wont match if either + * of the columns resolved to NULL above. + */ if (equal(leftColumn, currentColumn) && equal(rightColumn, candidateColumn)) { - joinOnColumns = true; - break; + return true; } if (equal(leftColumn, candidateColumn) && equal(rightColumn, currentColumn)) { - joinOnColumns = true; - break; + return true; } } - return joinOnColumns; + return false; } @@ -265,7 +274,19 @@ JoinOrderList(List *tableEntryList, List *joinClauseList) candidateJoinOrder = JoinOrderForTable(startingTable, tableEntryList, joinClauseList); - candidateJoinOrderList = lappend(candidateJoinOrderList, candidateJoinOrder); + if (candidateJoinOrder != NULL) + { + candidateJoinOrderList = lappend(candidateJoinOrderList, candidateJoinOrder); + } + } + + if (list_length(candidateJoinOrderList) == 0) + { + /* there are no plans that we can create, time to error */ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("complex joins are only supported when all distributed " + "tables are joined on their distribution columns with " + "equal operator"))); } bestJoinOrder = BestJoinOrder(candidateJoinOrderList); @@ -341,6 +362,12 @@ JoinOrderForTable(TableEntry *firstTable, List *tableEntryList, List *joinClause pendingJoinNode = EvaluateJoinRules(joinedTableList, currentJoinNode, pendingTable, joinClauseList, joinType); + if (pendingJoinNode == NULL) + { + /* no join order could be generated, we try our next pending table */ + continue; + } + /* if this rule is better than previous ones, keep it */ pendingJoinRuleType = pendingJoinNode->joinRuleType; if (pendingJoinRuleType < nextJoinRuleType) @@ -350,6 +377,15 @@ JoinOrderForTable(TableEntry *firstTable, List *tableEntryList, List *joinClause } } + if (nextJoinNode == NULL) + { + /* + * There is no next join node found, this will repeat indefinitely hence we + * bail and let JoinOrderList try a new initial table + */ + return NULL; + } + Assert(nextJoinNode != NULL); nextJoinedTable = nextJoinNode->tableEntry; @@ -660,6 +696,11 @@ EvaluateJoinRules(List *joinedTableList, JoinOrderNode *currentJoinNode, } } + if (nextJoinNode == NULL) + { + return NULL; + } + Assert(nextJoinNode != NULL); nextJoinNode->joinType = joinType; nextJoinNode->joinClauseList = applicableJoinClauses; @@ -824,22 +865,18 @@ LocalJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, char candidatePartitionMethod = PartitionMethod(relationId); char currentPartitionMethod = currentJoinNode->partitionMethod; TableEntry *currentAnchorTable = currentJoinNode->anchorTable; - JoinRuleType currentJoinRuleType = currentJoinNode->joinRuleType; bool joinOnPartitionColumns = false; bool coPartitionedTables = false; /* - * If we previously dual-hash re-partitioned the tables for a join or made - * cartesian product, we currently don't allow local join. + * If we previously dual-hash re-partitioned the tables for a join or made cartesian + * product, there is no anchor table anymore. In that case we don't allow local join. */ - if (currentJoinRuleType == DUAL_PARTITION_JOIN || - currentJoinRuleType == CARTESIAN_PRODUCT) + if (currentAnchorTable == NULL) { return NULL; } - Assert(currentAnchorTable != NULL); - /* the partition method should be the same for a local join */ if (currentPartitionMethod != candidatePartitionMethod) { @@ -883,7 +920,6 @@ static JoinOrderNode * SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, List *applicableJoinClauses, JoinType joinType) { - JoinOrderNode *nextJoinNode = NULL; Var *currentPartitionColumn = currentJoinNode->partitionColumn; char currentPartitionMethod = currentJoinNode->partitionMethod; TableEntry *currentAnchorTable = currentJoinNode->anchorTable; @@ -927,22 +963,22 @@ SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, return NULL; } - nextJoinNode = MakeJoinOrderNode(candidateTable, SINGLE_HASH_PARTITION_JOIN, - currentPartitionColumn, - currentPartitionMethod, - currentAnchorTable); + return MakeJoinOrderNode(candidateTable, SINGLE_HASH_PARTITION_JOIN, + currentPartitionColumn, + currentPartitionMethod, + currentAnchorTable); } else { - nextJoinNode = MakeJoinOrderNode(candidateTable, SINGLE_RANGE_PARTITION_JOIN, - currentPartitionColumn, - currentPartitionMethod, - currentAnchorTable); + return MakeJoinOrderNode(candidateTable, SINGLE_RANGE_PARTITION_JOIN, + currentPartitionColumn, + currentPartitionMethod, + currentAnchorTable); } } /* evaluate re-partitioning the current table only if the rule didn't apply above */ - if (nextJoinNode == NULL && candidatePartitionMethod != DISTRIBUTE_BY_NONE) + if (candidatePartitionMethod != DISTRIBUTE_BY_NONE) { joinClause = SinglePartitionJoinClause(candidatePartitionColumn, applicableJoinClauses); @@ -959,24 +995,24 @@ SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, return NULL; } - nextJoinNode = MakeJoinOrderNode(candidateTable, - SINGLE_HASH_PARTITION_JOIN, - candidatePartitionColumn, - candidatePartitionMethod, - candidateTable); + return MakeJoinOrderNode(candidateTable, + SINGLE_HASH_PARTITION_JOIN, + candidatePartitionColumn, + candidatePartitionMethod, + candidateTable); } else { - nextJoinNode = MakeJoinOrderNode(candidateTable, - SINGLE_RANGE_PARTITION_JOIN, - candidatePartitionColumn, - candidatePartitionMethod, - candidateTable); + return MakeJoinOrderNode(candidateTable, + SINGLE_RANGE_PARTITION_JOIN, + candidatePartitionColumn, + candidatePartitionMethod, + candidateTable); } } } - return nextJoinNode; + return NULL; } @@ -988,14 +1024,23 @@ SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, OpExpr * SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses) { - OpExpr *joinClause = NULL; ListCell *applicableJoinClauseCell = NULL; + if (partitionColumn == NULL) + { + return NULL; + } foreach(applicableJoinClauseCell, applicableJoinClauses) { - OpExpr *applicableJoinClause = (OpExpr *) lfirst(applicableJoinClauseCell); - Var *leftColumn = LeftColumn(applicableJoinClause); - Var *rightColumn = RightColumn(applicableJoinClause); + OpExpr *applicableJoinClause = castNode(OpExpr, lfirst(applicableJoinClauseCell)); + Var *leftColumn = LeftColumnOrNULL(applicableJoinClause); + Var *rightColumn = RightColumnOrNULL(applicableJoinClause); + if (leftColumn == NULL || rightColumn == NULL) + { + /* not a simple partition column join */ + continue; + } + /* * We first check if partition column matches either of the join columns @@ -1007,8 +1052,7 @@ SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses) { if (leftColumn->vartype == rightColumn->vartype) { - joinClause = applicableJoinClause; - break; + return applicableJoinClause; } else { @@ -1017,7 +1061,7 @@ SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses) } } - return joinClause; + return NULL; } @@ -1032,20 +1076,18 @@ static JoinOrderNode * DualPartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, List *applicableJoinClauses, JoinType joinType) { - /* Because of the dual partition, anchor table information got lost */ - TableEntry *anchorTable = NULL; - JoinOrderNode *nextJoinNode = NULL; - OpExpr *joinClause = DualPartitionJoinClause(applicableJoinClauses); if (joinClause) { - Var *nextPartitionColumn = LeftColumn(joinClause); - nextJoinNode = MakeJoinOrderNode(candidateTable, DUAL_PARTITION_JOIN, - nextPartitionColumn, REDISTRIBUTE_BY_HASH, - anchorTable); + /* because of the dual partition, anchor table and partition column get lost */ + return MakeJoinOrderNode(candidateTable, + DUAL_PARTITION_JOIN, + NULL, + REDISTRIBUTE_BY_HASH, + NULL); } - return nextJoinNode; + return NULL; } @@ -1057,20 +1099,23 @@ DualPartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, OpExpr * DualPartitionJoinClause(List *applicableJoinClauses) { - OpExpr *joinClause = NULL; ListCell *applicableJoinClauseCell = NULL; foreach(applicableJoinClauseCell, applicableJoinClauses) { OpExpr *applicableJoinClause = (OpExpr *) lfirst(applicableJoinClauseCell); - Var *leftColumn = LeftColumn(applicableJoinClause); - Var *rightColumn = RightColumn(applicableJoinClause); + Var *leftColumn = LeftColumnOrNULL(applicableJoinClause); + Var *rightColumn = RightColumnOrNULL(applicableJoinClause); + + if (leftColumn == NULL || rightColumn == NULL) + { + continue; + } /* we only need to check that the join column types match */ if (leftColumn->vartype == rightColumn->vartype) { - joinClause = applicableJoinClause; - break; + return applicableJoinClause; } else { @@ -1078,7 +1123,7 @@ DualPartitionJoinClause(List *applicableJoinClauses) } } - return joinClause; + return NULL; } @@ -1091,15 +1136,16 @@ static JoinOrderNode * CartesianProduct(JoinOrderNode *currentJoinNode, TableEntry *candidateTable, List *applicableJoinClauses, JoinType joinType) { - /* Because of the cartesian product, anchor table information got lost */ - TableEntry *anchorTable = NULL; + if (list_length(applicableJoinClauses) == 0) + { + /* Because of the cartesian product, anchor table information got lost */ + return MakeJoinOrderNode(candidateTable, CARTESIAN_PRODUCT, + currentJoinNode->partitionColumn, + currentJoinNode->partitionMethod, + NULL); + } - JoinOrderNode *nextJoinNode = MakeJoinOrderNode(candidateTable, CARTESIAN_PRODUCT, - currentJoinNode->partitionColumn, - currentJoinNode->partitionMethod, - anchorTable); - - return nextJoinNode; + return NULL; } @@ -1121,6 +1167,46 @@ MakeJoinOrderNode(TableEntry *tableEntry, JoinRuleType joinRuleType, } +/* + * IsApplicableJoinClause tests if the current joinClause is applicable to the join at + * hand. + * + * Given a list of left hand tables and a candidate right hand table the join clause is + * valid if atleast 1 column is from the right hand table AND all columns can be found + * in either the list of tables on the left *or* in the right hand table. + */ +bool +IsApplicableJoinClause(List *leftTableIdList, uint32 rightTableId, OpExpr *joinClause) +{ + List *varList = pull_var_clause_default((Node *) joinClause); + Var *var = NULL; + bool joinContainsRightTable = false; + foreach_ptr(var, varList) + { + uint32 columnTableId = var->varno; + if (rightTableId == columnTableId) + { + joinContainsRightTable = true; + } + else if (!list_member_int(leftTableIdList, columnTableId)) + { + /* + * We couldn't find this column either on the right hand side (first if + * statement), nor in the list on the left. This join clause involves a table + * not yet available during the candidate join. + */ + return false; + } + } + + /* + * All columns referenced in this clause are available during this join, now the join + * is applicable if we found our candidate table as well + */ + return joinContainsRightTable; +} + + /* * ApplicableJoinClauses finds all join clauses that apply between the given * left table list and the right table, and returns these found join clauses. @@ -1136,18 +1222,8 @@ ApplicableJoinClauses(List *leftTableIdList, uint32 rightTableId, List *joinClau foreach(joinClauseCell, joinClauseList) { - OpExpr *joinClause = (OpExpr *) lfirst(joinClauseCell); - Var *joinLeftColumn = LeftColumn(joinClause); - Var *joinRightColumn = RightColumn(joinClause); - - uint32 joinLeftTableId = joinLeftColumn->varno; - uint32 joinRightTableId = joinRightColumn->varno; - - bool leftListHasJoinLeft = list_member_int(leftTableIdList, joinLeftTableId); - bool leftListHasJoinRight = list_member_int(leftTableIdList, joinRightTableId); - - if ((leftListHasJoinLeft && (rightTableId == joinRightTableId)) || - (leftListHasJoinRight && (rightTableId == joinLeftTableId))) + OpExpr *joinClause = castNode(OpExpr, lfirst(joinClauseCell)); + if (IsApplicableJoinClause(leftTableIdList, rightTableId, joinClause)) { applicableJoinClauses = lappend(applicableJoinClauses, joinClause); } @@ -1157,37 +1233,41 @@ ApplicableJoinClauses(List *leftTableIdList, uint32 rightTableId, List *joinClau } -/* Returns the left column in the given join clause. */ +/* + * Returns the left column only when directly referenced in the given join clause, + * otherwise NULL is returned. + */ Var * -LeftColumn(OpExpr *joinClause) +LeftColumnOrNULL(OpExpr *joinClause) { List *argumentList = joinClause->args; Node *leftArgument = (Node *) linitial(argumentList); - List *varList = pull_var_clause_default(leftArgument); - Var *leftColumn = NULL; - - Assert(list_length(varList) == 1); - leftColumn = (Var *) linitial(varList); - - return leftColumn; + leftArgument = strip_implicit_coercions(leftArgument); + if (!IsA(leftArgument, Var)) + { + return NULL; + } + return castNode(Var, leftArgument); } -/* Returns the right column in the given join clause. */ +/* + * Returns the right column only when directly referenced in the given join clause, + * otherwise NULL is returned. + * */ Var * -RightColumn(OpExpr *joinClause) +RightColumnOrNULL(OpExpr *joinClause) { List *argumentList = joinClause->args; Node *rightArgument = (Node *) lsecond(argumentList); - List *varList = pull_var_clause_default(rightArgument); - Var *rightColumn = NULL; - - Assert(list_length(varList) == 1); - rightColumn = (Var *) linitial(varList); - - return rightColumn; + rightArgument = strip_implicit_coercions(rightArgument); + if (!IsA(rightArgument, Var)) + { + return NULL; + } + return castNode(Var, rightArgument); } diff --git a/src/backend/distributed/planner/multi_logical_planner.c b/src/backend/distributed/planner/multi_logical_planner.c index c7a70c030..bd8519d48 100644 --- a/src/backend/distributed/planner/multi_logical_planner.c +++ b/src/backend/distributed/planner/multi_logical_planner.c @@ -23,6 +23,7 @@ #include "distributed/colocation_utils.h" #include "distributed/metadata_cache.h" #include "distributed/insert_select_planner.h" +#include "distributed/listutils.h" #include "distributed/multi_logical_optimizer.h" #include "distributed/multi_logical_planner.h" #include "distributed/multi_physical_planner.h" @@ -1387,62 +1388,59 @@ ExtractFromExpressionWalker(Node *node, QualifierWalkerContext *walkerContext) bool IsJoinClause(Node *clause) { - bool isJoinClause = false; OpExpr *operatorExpression = NULL; - List *argumentList = NIL; - Node *leftArgument = NULL; - Node *rightArgument = NULL; - Node *strippedLeftArgument = NULL; - Node *strippedRightArgument = NULL; + bool equalsOperator = false; + List *varList = NIL; + Var *initialVar = NULL; + Var *var = NULL; if (!IsA(clause, OpExpr)) { return false; } - operatorExpression = (OpExpr *) clause; - argumentList = operatorExpression->args; + operatorExpression = castNode(OpExpr, clause); + equalsOperator = OperatorImplementsEquality(operatorExpression->opno); - /* join clauses must have two arguments */ - if (list_length(argumentList) != 2) + if (!equalsOperator) { + /* + * The single and dual repartition join and local join planners expect the clauses + * to be equi-join to calculate a hash on which to distribute. + * + * In the future we should move this clause to those planners and allow + * non-equi-join's in the reference join and cartesian product. This is tracked in + * https://github.com/citusdata/citus/issues/3198 + */ return false; } - /* get left and right side of the expression */ - leftArgument = (Node *) linitial(argumentList); - rightArgument = (Node *) lsecond(argumentList); - - strippedLeftArgument = strip_implicit_coercions(leftArgument); - strippedRightArgument = strip_implicit_coercions(rightArgument); - - /* each side of the expression should have only one column */ - if (IsA(strippedLeftArgument, Var) && IsA(strippedRightArgument, Var)) + /* + * take all column references from the clause, if we find 2 column references from a + * different relation we assume this is a join clause + */ + varList = pull_var_clause_default(clause); + if (list_length(varList) <= 0) { - Var *leftColumn = (Var *) strippedLeftArgument; - Var *rightColumn = (Var *) strippedRightArgument; - bool equiJoin = false; - bool joinBetweenDifferentTables = false; + /* no column references in query, not describing a join */ + return false; + } + initialVar = castNode(Var, linitial(varList)); - bool equalsOperator = OperatorImplementsEquality(operatorExpression->opno); - if (equalsOperator) + foreach_ptr(var, varList) + { + if (var->varno != initialVar->varno) { - equiJoin = true; - } - - if (leftColumn->varno != rightColumn->varno) - { - joinBetweenDifferentTables = true; - } - - /* codifies our logic for determining if this node is a join clause */ - if (equiJoin && joinBetweenDifferentTables) - { - isJoinClause = true; + /* + * this column reference comes from a different relation, hence describing a + * join + */ + return true; } } - return isJoinClause; + /* all column references were to the same relation, no join */ + return false; } @@ -2143,8 +2141,12 @@ ApplySinglePartitionJoin(MultiNode *leftNode, MultiNode *rightNode, joinClause = SinglePartitionJoinClause(partitionColumn, applicableJoinClauses); Assert(joinClause != NULL); - leftColumn = LeftColumn(joinClause); - rightColumn = RightColumn(joinClause); + /* both are verified in SinglePartitionJoinClause to not be NULL, assert is to guard */ + leftColumn = LeftColumnOrNULL(joinClause); + rightColumn = RightColumnOrNULL(joinClause); + + Assert(leftColumn != NULL); + Assert(rightColumn != NULL); if (equal(partitionColumn, leftColumn)) { @@ -2218,8 +2220,11 @@ ApplyDualPartitionJoin(MultiNode *leftNode, MultiNode *rightNode, joinClause = DualPartitionJoinClause(applicableJoinClauses); Assert(joinClause != NULL); - leftColumn = LeftColumn(joinClause); - rightColumn = RightColumn(joinClause); + /* both are verified in DualPartitionJoinClause to not be NULL, assert is to guard */ + leftColumn = LeftColumnOrNULL(joinClause); + rightColumn = RightColumnOrNULL(joinClause); + Assert(leftColumn != NULL); + Assert(rightColumn != NULL); rightTableIdList = OutputTableIdList(rightNode); rightTableId = (uint32) linitial_int(rightTableIdList); diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 9e1991921..14ff78f4b 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -3603,8 +3603,13 @@ JoinSequenceArray(List *rangeTableFragmentsList, Query *jobQuery, List *depended continue; } - leftColumn = LeftColumn(nextJoinClause); - rightColumn = RightColumn(nextJoinClause); + leftColumn = LeftColumnOrNULL(nextJoinClause); + rightColumn = RightColumnOrNULL(nextJoinClause); + if (leftColumn == NULL || rightColumn == NULL) + { + continue; + } + leftRangeTableId = leftColumn->varno; rightRangeTableId = rightColumn->varno; diff --git a/src/include/distributed/multi_join_order.h b/src/include/distributed/multi_join_order.h index e8cb4f2dd..230726ef0 100644 --- a/src/include/distributed/multi_join_order.h +++ b/src/include/distributed/multi_join_order.h @@ -82,13 +82,15 @@ extern bool EnableSingleHashRepartitioning; /* Function declaration for determining table join orders */ extern List * JoinExprList(FromExpr *fromExpr); extern List * JoinOrderList(List *rangeTableEntryList, List *joinClauseList); +extern bool IsApplicableJoinClause(List *leftTableIdList, uint32 rightTableId, + OpExpr *joinClause); extern List * ApplicableJoinClauses(List *leftTableIdList, uint32 rightTableId, List *joinClauseList); extern OpExpr * SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses); extern OpExpr * DualPartitionJoinClause(List *applicableJoinClauses); -extern Var * LeftColumn(OpExpr *joinClause); -extern Var * RightColumn(OpExpr *joinClause); +extern Var * LeftColumnOrNULL(OpExpr *joinClause); +extern Var * RightColumnOrNULL(OpExpr *joinClause); extern Var * PartitionColumn(Oid relationId, uint32 rangeTableId); extern Var * DistPartitionKey(Oid relationId); extern char PartitionMethod(Oid relationId); diff --git a/src/test/regress/expected/chbenchmark_all_queries.out b/src/test/regress/expected/chbenchmark_all_queries.out new file mode 100644 index 000000000..45400e8f1 --- /dev/null +++ b/src/test/regress/expected/chbenchmark_all_queries.out @@ -0,0 +1,903 @@ +SET citus.next_shard_id TO 1650000; +CREATE SCHEMA chbenchmark_all_queries; +SET search_path TO chbenchmark_all_queries; +SET citus.enable_repartition_joins TO on; +CREATE TABLE order_line ( + ol_w_id int NOT NULL, + ol_d_id int NOT NULL, + ol_o_id int NOT NULL, + ol_number int NOT NULL, + ol_i_id int NOT NULL, + ol_delivery_d timestamp NULL DEFAULT NULL, + ol_amount decimal(6,2) NOT NULL, + ol_supply_w_id int NOT NULL, + ol_quantity decimal(2,0) NOT NULL, + ol_dist_info char(24) NOT NULL, + PRIMARY KEY (ol_w_id,ol_d_id,ol_o_id,ol_number) +); +CREATE TABLE new_order ( + no_w_id int NOT NULL, + no_d_id int NOT NULL, + no_o_id int NOT NULL, + PRIMARY KEY (no_w_id,no_d_id,no_o_id) +); +CREATE TABLE stock ( + s_w_id int NOT NULL, + s_i_id int NOT NULL, + s_quantity decimal(4,0) NOT NULL, + s_ytd decimal(8,2) NOT NULL, + s_order_cnt int NOT NULL, + s_remote_cnt int NOT NULL, + s_data varchar(50) NOT NULL, + s_dist_01 char(24) NOT NULL, + s_dist_02 char(24) NOT NULL, + s_dist_03 char(24) NOT NULL, + s_dist_04 char(24) NOT NULL, + s_dist_05 char(24) NOT NULL, + s_dist_06 char(24) NOT NULL, + s_dist_07 char(24) NOT NULL, + s_dist_08 char(24) NOT NULL, + s_dist_09 char(24) NOT NULL, + s_dist_10 char(24) NOT NULL, + PRIMARY KEY (s_w_id,s_i_id) +); +CREATE TABLE oorder ( + o_w_id int NOT NULL, + o_d_id int NOT NULL, + o_id int NOT NULL, + o_c_id int NOT NULL, + o_carrier_id int DEFAULT NULL, + o_ol_cnt decimal(2,0) NOT NULL, + o_all_local decimal(1,0) NOT NULL, + o_entry_d timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (o_w_id,o_d_id,o_id), + UNIQUE (o_w_id,o_d_id,o_c_id,o_id) +); +CREATE TABLE history ( + h_c_id int NOT NULL, + h_c_d_id int NOT NULL, + h_c_w_id int NOT NULL, + h_d_id int NOT NULL, + h_w_id int NOT NULL, + h_date timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + h_amount decimal(6,2) NOT NULL, + h_data varchar(24) NOT NULL +); +CREATE TABLE customer ( + c_w_id int NOT NULL, + c_d_id int NOT NULL, + c_id int NOT NULL, + c_discount decimal(4,4) NOT NULL, + c_credit char(2) NOT NULL, + c_last varchar(16) NOT NULL, + c_first varchar(16) NOT NULL, + c_credit_lim decimal(12,2) NOT NULL, + c_balance decimal(12,2) NOT NULL, + c_ytd_payment float NOT NULL, + c_payment_cnt int NOT NULL, + c_delivery_cnt int NOT NULL, + c_street_1 varchar(20) NOT NULL, + c_street_2 varchar(20) NOT NULL, + c_city varchar(20) NOT NULL, + c_state char(2) NOT NULL, + c_zip char(9) NOT NULL, + c_phone char(16) NOT NULL, + c_since timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + c_middle char(2) NOT NULL, + c_data varchar(500) NOT NULL, + PRIMARY KEY (c_w_id,c_d_id,c_id) +); +CREATE TABLE district ( + d_w_id int NOT NULL, + d_id int NOT NULL, + d_ytd decimal(12,2) NOT NULL, + d_tax decimal(4,4) NOT NULL, + d_next_o_id int NOT NULL, + d_name varchar(10) NOT NULL, + d_street_1 varchar(20) NOT NULL, + d_street_2 varchar(20) NOT NULL, + d_city varchar(20) NOT NULL, + d_state char(2) NOT NULL, + d_zip char(9) NOT NULL, + PRIMARY KEY (d_w_id,d_id) +); +CREATE TABLE item ( + i_id int NOT NULL, + i_name varchar(24) NOT NULL, + i_price decimal(5,2) NOT NULL, + i_data varchar(50) NOT NULL, + i_im_id int NOT NULL, + PRIMARY KEY (i_id) +); +CREATE TABLE warehouse ( + w_id int NOT NULL, + w_ytd decimal(12,2) NOT NULL, + w_tax decimal(4,4) NOT NULL, + w_name varchar(10) NOT NULL, + w_street_1 varchar(20) NOT NULL, + w_street_2 varchar(20) NOT NULL, + w_city varchar(20) NOT NULL, + w_state char(2) NOT NULL, + w_zip char(9) NOT NULL, + PRIMARY KEY (w_id) +); +CREATE TABLE region ( + r_regionkey int not null, + r_name char(55) not null, + r_comment char(152) not null, + PRIMARY KEY ( r_regionkey ) +); +CREATE TABLE nation ( + n_nationkey int not null, + n_name char(25) not null, + n_regionkey int not null, + n_comment char(152) not null, + PRIMARY KEY ( n_nationkey ) +); +CREATE TABLE supplier ( + su_suppkey int not null, + su_name char(25) not null, + su_address varchar(40) not null, + su_nationkey int not null, + su_phone char(15) not null, + su_acctbal numeric(12,2) not null, + su_comment char(101) not null, + PRIMARY KEY ( su_suppkey ) +); +SELECT create_distributed_table('order_line','ol_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('new_order','no_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('stock','s_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('oorder','o_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('history','h_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('customer','c_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('district','d_w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_distributed_table('warehouse','w_id'); + create_distributed_table +-------------------------- + +(1 row) + +SELECT create_reference_table('item'); + create_reference_table +------------------------ + +(1 row) + +SELECT create_reference_table('region'); + create_reference_table +------------------------ + +(1 row) + +SELECT create_reference_table('nation'); + create_reference_table +------------------------ + +(1 row) + +SELECT create_reference_table('supplier'); + create_reference_table +------------------------ + +(1 row) + +TRUNCATE order_line, new_order, stock, oorder, history, customer, district, warehouse, item, region, nation, supplier; -- for easy copy in development +INSERT INTO supplier SELECT c, 'abc', 'def', c, 'ghi', c, 'jkl' FROM generate_series(0,10) AS c; +INSERT INTO new_order SELECT c, c, c FROM generate_series(0,10) AS c; +INSERT INTO stock SELECT c,c,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c; +INSERT INTO stock SELECT c, 5000,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c; -- mod(2*5000,10000) == 0 +INSERT INTO order_line SELECT c, c, c, c, c, '2008-10-17 00:00:00.000000', c, c, c, 'abc' FROM generate_series(0,10) AS c; +INSERT INTO oorder SELECT c, c, c, c, c, 1, 1, '2008-10-17 00:00:00.000000' FROM generate_series(0,10) AS c; +INSERT INTO customer SELECT c, c, c, 0, 'XX', 'John', 'Doe', 1000, 0, 0, c, c, 'Name', 'Street', 'Some City', 'CA', '12345', '+1 000 0000000', '2007-01-02 00:00:00.000000', 'NA', 'nothing special' FROM generate_series(0,10) AS c; +INSERT INTO item SELECT c, 'Keyboard', 50, 'co b', c FROM generate_series(0,10) AS c; --co% and %b filters all around +INSERT INTO region VALUES + (1, 'Not Europe', 'Big'), + (2, 'Europe', 'Big'); +INSERT INTO nation VALUES + (1, 'United States', 1, 'Also Big'), + (4, 'The Netherlands', 2, 'Flat'), + (9, 'Germany', 2, 'Germany must be in here for Q7'), + (67, 'Cambodia', 2, 'I don''t understand how we got from California to Cambodia but I will take it, it also is not in Europe, but we need it to be for Q8'); +-- Query 1 +SELECT + ol_number, + sum(ol_quantity) as sum_qty, + sum(ol_amount) as sum_amount, + avg(ol_quantity) as avg_qty, + avg(ol_amount) as avg_amount, + count(*) as count_order +FROM order_line +WHERE ol_delivery_d > '2007-01-02 00:00:00.000000' +GROUP BY ol_number +ORDER BY ol_number; + ol_number | sum_qty | sum_amount | avg_qty | avg_amount | count_order +-----------+---------+------------+------------------------+------------------------+------------- + 0 | 0 | 0.00 | 0.00000000000000000000 | 0.00000000000000000000 | 1 + 1 | 1 | 1.00 | 1.00000000000000000000 | 1.00000000000000000000 | 1 + 2 | 2 | 2.00 | 2.0000000000000000 | 2.0000000000000000 | 1 + 3 | 3 | 3.00 | 3.0000000000000000 | 3.0000000000000000 | 1 + 4 | 4 | 4.00 | 4.0000000000000000 | 4.0000000000000000 | 1 + 5 | 5 | 5.00 | 5.0000000000000000 | 5.0000000000000000 | 1 + 6 | 6 | 6.00 | 6.0000000000000000 | 6.0000000000000000 | 1 + 7 | 7 | 7.00 | 7.0000000000000000 | 7.0000000000000000 | 1 + 8 | 8 | 8.00 | 8.0000000000000000 | 8.0000000000000000 | 1 + 9 | 9 | 9.00 | 9.0000000000000000 | 9.0000000000000000 | 1 + 10 | 10 | 10.00 | 10.0000000000000000 | 10.0000000000000000 | 1 +(11 rows) + +-- Query 2 +SELECT + su_suppkey, + su_name, + n_name, + i_id, + i_name, + su_address, + su_phone, + su_comment +FROM + item, + supplier, + stock, + nation, + region, + (SELECT + s_i_id AS m_i_id, + min(s_quantity) as m_s_quantity + FROM + stock, + supplier, + nation, + region + WHERE mod((s_w_id*s_i_id),10000)=su_suppkey + AND su_nationkey=n_nationkey + AND n_regionkey=r_regionkey + AND r_name LIKE 'Europ%' + GROUP BY s_i_id) m +WHERE i_id = s_i_id + AND mod((s_w_id * s_i_id), 10000) = su_suppkey + AND su_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND i_data LIKE '%b' + AND r_name LIKE 'Europ%' + AND i_id = m_i_id + AND s_quantity = m_s_quantity +ORDER BY + n_name, + su_name, + i_id; + su_suppkey | su_name | n_name | i_id | i_name | su_address | su_phone | su_comment +------------+---------------------------+---------------------------+------+----------+------------+-----------------+------------------------------------------------------------------------------------------------------- + 9 | abc | Germany | 3 | Keyboard | def | ghi | jkl + 4 | abc | The Netherlands | 2 | Keyboard | def | ghi | jkl +(2 rows) + +-- Query 3 +SELECT + ol_o_id, + ol_w_id, + ol_d_id, + sum(ol_amount) AS revenue, + o_entry_d +FROM + customer, + new_order, + oorder, + order_line +WHERE c_state LIKE 'C%' -- used to ba A%, but C% works with our small data + AND c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND no_w_id = o_w_id + AND no_d_id = o_d_id + AND no_o_id = o_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND o_entry_d > '2007-01-02 00:00:00.000000' +GROUP BY + ol_o_id, + ol_w_id, + ol_d_id, + o_entry_d +ORDER BY + revenue DESC, + o_entry_d; + ol_o_id | ol_w_id | ol_d_id | revenue | o_entry_d +---------+---------+---------+---------+-------------------------- + 10 | 10 | 10 | 10.00 | Fri Oct 17 00:00:00 2008 + 9 | 9 | 9 | 9.00 | Fri Oct 17 00:00:00 2008 + 8 | 8 | 8 | 8.00 | Fri Oct 17 00:00:00 2008 + 7 | 7 | 7 | 7.00 | Fri Oct 17 00:00:00 2008 + 6 | 6 | 6 | 6.00 | Fri Oct 17 00:00:00 2008 + 5 | 5 | 5 | 5.00 | Fri Oct 17 00:00:00 2008 + 4 | 4 | 4 | 4.00 | Fri Oct 17 00:00:00 2008 + 3 | 3 | 3 | 3.00 | Fri Oct 17 00:00:00 2008 + 2 | 2 | 2 | 2.00 | Fri Oct 17 00:00:00 2008 + 1 | 1 | 1 | 1.00 | Fri Oct 17 00:00:00 2008 + 0 | 0 | 0 | 0.00 | Fri Oct 17 00:00:00 2008 +(11 rows) + +-- Query 4 +SELECT + o_ol_cnt, + count(*) as order_count +FROM + oorder +WHERE o_entry_d >= '2007-01-02 00:00:00.000000' + AND o_entry_d < '2012-01-02 00:00:00.000000' + AND exists (SELECT * + FROM order_line + WHERE o_id = ol_o_id + AND o_w_id = ol_w_id + AND o_d_id = ol_d_id + AND ol_delivery_d >= o_entry_d) +GROUP BY o_ol_cnt +ORDER BY o_ol_cnt; + o_ol_cnt | order_count +----------+------------- + 1 | 11 +(1 row) + +-- Query 5 +SELECT + n_name, + sum(ol_amount) AS revenue +FROM + customer, + oorder, + order_line, + stock, + supplier, + nation, + region +WHERE c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND ol_o_id = o_id + AND ol_w_id = o_w_id + AND ol_d_id=o_d_id + AND ol_w_id = s_w_id + AND ol_i_id = s_i_id + AND mod((s_w_id * s_i_id),10000) = su_suppkey +-- our dataset does not have the supplier in the same nation as the customer causing this +-- join to filter out all the data. We verify later on that we can actually perform an +-- ascii(substr(c_state,1,1)) == reference table column join later on so it should not +-- matter we skip this filter here. +--AND ascii(substr(c_state,1,1)) = su_nationkey + AND su_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND r_name = 'Europe' + AND o_entry_d >= '2007-01-02 00:00:00.000000' +GROUP BY n_name +ORDER BY revenue DESC; + n_name | revenue +---------------------------+--------- + Germany | 3.00 + The Netherlands | 2.00 +(2 rows) + +-- Query 6 +SELECT + sum(ol_amount) AS revenue +FROM order_line +WHERE ol_delivery_d >= '1999-01-01 00:00:00.000000' + AND ol_delivery_d < '2020-01-01 00:00:00.000000' + AND ol_quantity BETWEEN 1 AND 100000; + revenue +--------- + 55.00 +(1 row) + +-- Query 7 +SELECT + su_nationkey as supp_nation, + substr(c_state,1,1) as cust_nation, + extract(year from o_entry_d) as l_year, + sum(ol_amount) as revenue +FROM + supplier, + stock, + order_line, + oorder, + customer, + nation n1, + nation n2 +WHERE ol_supply_w_id = s_w_id + AND ol_i_id = s_i_id + AND mod((s_w_id * s_i_id), 10000) = su_suppkey + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND su_nationkey = n1.n_nationkey + AND ascii(substr(c_state,1,1)) = n2.n_nationkey + AND ( + (n1.n_name = 'Germany' AND n2.n_name = 'Cambodia') + OR (n1.n_name = 'Cambodia' AND n2.n_name = 'Germany') + ) + AND ol_delivery_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000' +GROUP BY + su_nationkey, + substr(c_state,1,1), + extract(year from o_entry_d) +ORDER BY + su_nationkey, + cust_nation, + l_year; + supp_nation | cust_nation | l_year | revenue +-------------+-------------+--------+--------- + 9 | C | 2008 | 3.00 +(1 row) + +-- Query 8 +SELECT + extract(year from o_entry_d) as l_year, + sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share +FROM + item, + supplier, + stock, + order_line, + oorder, + customer, + nation n1, + nation n2, + region +WHERE i_id = s_i_id + AND ol_i_id = s_i_id + AND ol_supply_w_id = s_w_id + AND mod((s_w_id * s_i_id),10000) = su_suppkey + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND n1.n_nationkey = ascii(substr(c_state,1,1)) + AND n1.n_regionkey = r_regionkey + AND ol_i_id < 1000 + AND r_name = 'Europe' + AND su_nationkey = n2.n_nationkey + AND o_entry_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000' + AND i_data LIKE '%b' + AND i_id = ol_i_id +GROUP BY extract(YEAR FROM o_entry_d) +ORDER BY l_year; + l_year | mkt_share +--------+------------------------ + 2008 | 0.50000000000000000000 +(1 row) + +-- Query 9 +SELECT + n_name, + extract(year from o_entry_d) as l_year, + sum(ol_amount) as sum_profit +FROM + item, + stock, + supplier, + order_line, + oorder, + nation +WHERE ol_i_id = s_i_id + AND ol_supply_w_id = s_w_id + AND mod((s_w_id * s_i_id), 10000) = su_suppkey + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND ol_i_id = i_id + AND su_nationkey = n_nationkey + AND i_data LIKE '%b' -- this used to be %BB but that will not work with our small dataset +GROUP BY + n_name, + extract(YEAR FROM o_entry_d) +ORDER BY + n_name, + l_year DESC; + n_name | l_year | sum_profit +---------------------------+--------+------------ + Germany | 2008 | 3.00 + The Netherlands | 2008 | 2.00 + United States | 2008 | 1.00 +(3 rows) + +-- Query 10 +SELECT + c_id, + c_last, + sum(ol_amount) AS revenue, + c_city, + c_phone, + n_name +FROM + customer, + oorder, + order_line, + nation +WHERE c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND o_entry_d >= '2007-01-02 00:00:00.000000' + AND o_entry_d <= ol_delivery_d + AND n_nationkey = ascii(substr(c_state,1,1)) +GROUP BY + c_id, + c_last, + c_city, + c_phone, + n_name +ORDER BY revenue DESC; + c_id | c_last | revenue | c_city | c_phone | n_name +------+--------+---------+-----------+------------------+--------------------------- + 10 | John | 10.00 | Some City | +1 000 0000000 | Cambodia + 9 | John | 9.00 | Some City | +1 000 0000000 | Cambodia + 8 | John | 8.00 | Some City | +1 000 0000000 | Cambodia + 7 | John | 7.00 | Some City | +1 000 0000000 | Cambodia + 6 | John | 6.00 | Some City | +1 000 0000000 | Cambodia + 5 | John | 5.00 | Some City | +1 000 0000000 | Cambodia + 4 | John | 4.00 | Some City | +1 000 0000000 | Cambodia + 3 | John | 3.00 | Some City | +1 000 0000000 | Cambodia + 2 | John | 2.00 | Some City | +1 000 0000000 | Cambodia + 1 | John | 1.00 | Some City | +1 000 0000000 | Cambodia + 0 | John | 0.00 | Some City | +1 000 0000000 | Cambodia +(11 rows) + +-- Query 11 +SELECT + s_i_id, + sum(s_order_cnt) AS ordercount +FROM + stock, + supplier, + nation +WHERE mod((s_w_id * s_i_id),10000) = su_suppkey + AND su_nationkey = n_nationkey + AND n_name = 'Germany' +GROUP BY s_i_id +HAVING sum(s_order_cnt) > + (SELECT sum(s_order_cnt) * .005 + FROM + stock, + supplier, + nation + WHERE mod((s_w_id * s_i_id),10000) = su_suppkey + AND su_nationkey = n_nationkey + AND n_name = 'Germany') +ORDER BY ordercount DESC; + s_i_id | ordercount +--------+------------ + 3 | 3 +(1 row) + +-- Query 12 +SELECT + o_ol_cnt, + sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, + sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count +FROM + oorder, + order_line +WHERE ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND o_entry_d <= ol_delivery_d + AND ol_delivery_d < '2020-01-01 00:00:00.000000' +GROUP BY o_ol_cnt +ORDER BY o_ol_cnt; + o_ol_cnt | high_line_count | low_line_count +----------+-----------------+---------------- + 1 | 2 | 9 +(1 row) + +-- Query 13 +SELECT + c_count, + count(*) AS custdist +FROM (SELECT + c_id, + count(o_id) + FROM customer + LEFT OUTER JOIN oorder ON ( + c_w_id = o_w_id + AND c_d_id = o_d_id + AND c_id = o_c_id + AND o_carrier_id > 8) + GROUP BY c_id) AS c_orders (c_id, c_count) +GROUP BY c_count +ORDER BY + custdist DESC, + c_count DESC; + c_count | custdist +---------+---------- + 0 | 9 + 1 | 2 +(2 rows) + +-- Query 14 +SELECT + 100.00 * sum(CASE WHEN i_data LIKE 'PR%' THEN ol_amount ELSE 0 END) / (1+sum(ol_amount)) AS promo_revenue +FROM + order_line, + item +WHERE ol_i_id = i_id + AND ol_delivery_d >= '2007-01-02 00:00:00.000000' + AND ol_delivery_d < '2020-01-02 00:00:00.000000'; + promo_revenue +------------------------ + 0.00000000000000000000 +(1 row) + +-- Query 15 +WITH revenue (supplier_no, total_revenue) AS ( + SELECT + mod((s_w_id * s_i_id),10000) AS supplier_no, + sum(ol_amount) AS total_revenue + FROM + order_line, + stock + WHERE ol_i_id = s_i_id + AND ol_supply_w_id = s_w_id + AND ol_delivery_d >= '2007-01-02 00:00:00.000000' + GROUP BY mod((s_w_id * s_i_id),10000)) +SELECT + su_suppkey, + su_name, + su_address, + su_phone, + total_revenue +FROM + supplier, + revenue +WHERE su_suppkey = supplier_no + AND total_revenue = (SELECT max(total_revenue) FROM revenue) +ORDER BY su_suppkey; + su_suppkey | su_name | su_address | su_phone | total_revenue +------------+---------------------------+------------+-----------------+--------------- + 9 | abc | def | ghi | 3.00 +(1 row) + +--Q16 +SELECT + i_name, + substr(i_data, 1, 3) AS brand, + i_price, + count(DISTINCT (mod((s_w_id * s_i_id),10000))) AS supplier_cnt +FROM + stock, + item +WHERE i_id = s_i_id + AND i_data NOT LIKE 'zz%' + AND (mod((s_w_id * s_i_id),10000) NOT IN + (SELECT su_suppkey + FROM supplier + WHERE su_comment LIKE '%bad%')) +GROUP BY + i_name, + substr(i_data, 1, 3), + i_price +ORDER BY supplier_cnt DESC; + i_name | brand | i_price | supplier_cnt +----------+-------+---------+-------------- + Keyboard | co | 50.00 | 3 +(1 row) + +--Q17 +SELECT + sum(ol_amount) / 2.0 AS avg_yearly +FROM + order_line, + (SELECT + i_id, + avg(ol_quantity) AS a + FROM + item, + order_line + WHERE i_data LIKE '%b' + AND ol_i_id = i_id + GROUP BY i_id) t +WHERE ol_i_id = t.i_id; + avg_yearly +--------------------- + 27.5000000000000000 +(1 row) + +-- this filter was at the end causing the dataset to be empty. it should not have any +-- influence on how the query gets planned so I removed the clause +--AND ol_quantity < t.a; +-- Query 18 +SELECT + c_last, + c_id o_id, + o_entry_d, + o_ol_cnt, + sum(ol_amount) +FROM + customer, + oorder, + order_line +WHERE c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id +GROUP BY + o_id, + o_w_id, + o_d_id, + c_id, + c_last, + o_entry_d, + o_ol_cnt +HAVING sum(ol_amount) > 5 -- was 200, but thats too big for the dataset +ORDER BY + sum(ol_amount) DESC, + o_entry_d; + c_last | o_id | o_entry_d | o_ol_cnt | sum +--------+------+--------------------------+----------+------- + John | 10 | Fri Oct 17 00:00:00 2008 | 1 | 10.00 + John | 9 | Fri Oct 17 00:00:00 2008 | 1 | 9.00 + John | 8 | Fri Oct 17 00:00:00 2008 | 1 | 8.00 + John | 7 | Fri Oct 17 00:00:00 2008 | 1 | 7.00 + John | 6 | Fri Oct 17 00:00:00 2008 | 1 | 6.00 +(5 rows) + +-- Query 19 +SELECT + sum(ol_amount) AS revenue +FROM + order_line, + item +WHERE ( ol_i_id = i_id + AND i_data LIKE '%a' + AND ol_quantity >= 1 + AND ol_quantity <= 10 + AND i_price BETWEEN 1 AND 400000 + AND ol_w_id IN (1,2,3)) + OR ( ol_i_id = i_id + AND i_data LIKE '%b' + AND ol_quantity >= 1 + AND ol_quantity <= 10 + AND i_price BETWEEN 1 AND 400000 + AND ol_w_id IN (1,2,4)) + OR ( ol_i_id = i_id + AND i_data LIKE '%c' + AND ol_quantity >= 1 + AND ol_quantity <= 10 + AND i_price BETWEEN 1 AND 400000 + AND ol_w_id IN (1,5,3)); + revenue +--------- + 7.00 +(1 row) + +-- Query 20 +SELECT + su_name, + su_address +FROM + supplier, + nation +WHERE su_suppkey in + (SELECT + mod(s_i_id * s_w_id, 10000) + FROM + stock, + order_line + WHERE s_i_id IN + (SELECT i_id + FROM item + WHERE i_data LIKE 'co%') + AND ol_i_id = s_i_id + AND ol_delivery_d > '2008-05-23 12:00:00' -- was 2010, but our order is in 2008 + GROUP BY s_i_id, s_w_id, s_quantity + HAVING 2*s_quantity > sum(ol_quantity)) + AND su_nationkey = n_nationkey + AND n_name = 'Germany' +ORDER BY su_name; + su_name | su_address +---------------------------+------------ + abc | def +(1 row) + +-- Query 21 +-- DATA SET DOES NOT COVER THIS QUERY +SELECT + su_name, + count(*) AS numwait +FROM + supplier, + order_line l1, + oorder, + stock, + nation +WHERE ol_o_id = o_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_w_id = s_w_id + AND ol_i_id = s_i_id + AND mod((s_w_id * s_i_id),10000) = su_suppkey + AND l1.ol_delivery_d > o_entry_d + AND NOT exists (SELECT * + FROM order_line l2 + WHERE l2.ol_o_id = l1.ol_o_id + AND l2.ol_w_id = l1.ol_w_id + AND l2.ol_d_id = l1.ol_d_id + AND l2.ol_delivery_d > l1.ol_delivery_d) + AND su_nationkey = n_nationkey + AND n_name = 'Germany' +GROUP BY su_name +ORDER BY + numwait desc, + su_name; + su_name | numwait +---------+--------- +(0 rows) + +-- Query 22 +-- DATA SET DOES NOT COVER THIS QUERY +SELECT + substr(c_state,1,1) AS country, + count(*) AS numcust, + sum(c_balance) AS totacctbal +FROM customer +WHERE substr(c_phone,1,1) in ('1','2','3','4','5','6','7') + AND c_balance > (SELECT avg(c_BALANCE) + FROM customer + WHERE c_balance > 0.00 + AND substr(c_phone,1,1) in ('1','2','3','4','5','6','7')) + AND NOT exists (SELECT * + FROM oorder + WHERE o_c_id = c_id + AND o_w_id = c_w_id + AND o_d_id = c_d_id) +GROUP BY substr(c_state,1,1) +ORDER BY substr(c_state,1,1); + country | numcust | totacctbal +---------+---------+------------ +(0 rows) + +SET client_min_messages TO WARNING; +DROP SCHEMA chbenchmark_all_queries CASCADE; diff --git a/src/test/regress/expected/expression_reference_join.out b/src/test/regress/expected/expression_reference_join.out new file mode 100644 index 000000000..80c8d55c2 --- /dev/null +++ b/src/test/regress/expected/expression_reference_join.out @@ -0,0 +1,56 @@ +SET citus.next_shard_id TO 1670000; +CREATE SCHEMA expression_reference_join; +SET search_path TO expression_reference_join; +SET citus.shard_count TO 4; +SET citus.enable_repartition_joins TO on; +CREATE TABLE ref (a int, b int); +CREATE TABLE test (x int, y int); +INSERT INTO ref VALUES + (2,2), + (4,4); +INSERT INTO test VALUES + (1,2), + (2,2); +SELECT create_reference_table('ref'); +NOTICE: Copying data from local table... + create_reference_table +------------------------ + +(1 row) + +SELECT create_distributed_table('test', 'x'); +NOTICE: Copying data from local table... + create_distributed_table +-------------------------- + +(1 row) + +-- PR 3180 implements expressions in join clauses to reference tables to support CHbenCHmark queries 7/8/9 +-- plannable as a repartition + reference join, now with an expression in the join clause +SELECT * +FROM + test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable + ref a +WHERE t2.y * 2 = a.a +ORDER BY 1,2,3; + y | x | x | a | b +---+---+---+---+--- + 2 | 1 | 1 | 4 | 4 + 2 | 1 | 2 | 4 | 4 + 2 | 2 | 1 | 4 | 4 + 2 | 2 | 2 | 4 | 4 +(4 rows) + +-- The join clause is wider than it used to be, causing this query to be recognized by the LogicalPlanner as a repartition join. +-- Unplannable query due to a three-way join which causes no valid path (besides the cartesian product) to be found +SELECT * +FROM + test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable + ref a, + ref b +WHERE t2.y - a.a - b.b = 0 +ORDER BY 1,2,3; +ERROR: cannot perform distributed planning on this query +DETAIL: Cartesian products are currently unsupported +SET client_min_messages TO WARNING; +DROP SCHEMA expression_reference_join CASCADE; diff --git a/src/test/regress/expected/single_hash_repartition_join.out b/src/test/regress/expected/single_hash_repartition_join.out index 9e87d43c1..2f6e88b88 100644 --- a/src/test/regress/expected/single_hash_repartition_join.out +++ b/src/test/regress/expected/single_hash_repartition_join.out @@ -243,7 +243,7 @@ DEBUG: dual partition column types do not match DEBUG: single partition column types do not match DEBUG: single partition column types do not match DEBUG: dual partition column types do not match -LOG: join order: [ "single_hash_repartition_first" ][ local partition join "single_hash_repartition_first" ][ cartesian product "single_hash_repartition_second" ] +LOG: join order: [ "single_hash_repartition_second" ][ cartesian product "single_hash_repartition_first" ][ dual partition join "single_hash_repartition_first" ] ERROR: cannot perform distributed planning on this query DETAIL: Cartesian products are currently unsupported -- single repartition query in CTE diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 0127673ba..0e481cd1c 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -73,7 +73,7 @@ test: multi_reference_table multi_select_for_update relation_access_tracking test: custom_aggregate_support aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown -test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having ch_bench_subquery_repartition +test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having ch_bench_subquery_repartition chbenchmark_all_queries expression_reference_join test: multi_agg_type_conversion multi_count_type_conversion test: multi_partition_pruning single_hash_repartition_join test: multi_join_pruning multi_hash_pruning diff --git a/src/test/regress/sql/chbenchmark_all_queries.sql b/src/test/regress/sql/chbenchmark_all_queries.sql new file mode 100644 index 000000000..4307aaa98 --- /dev/null +++ b/src/test/regress/sql/chbenchmark_all_queries.sql @@ -0,0 +1,722 @@ +SET citus.next_shard_id TO 1650000; +CREATE SCHEMA chbenchmark_all_queries; +SET search_path TO chbenchmark_all_queries; + +SET citus.enable_repartition_joins TO on; +CREATE TABLE order_line ( + ol_w_id int NOT NULL, + ol_d_id int NOT NULL, + ol_o_id int NOT NULL, + ol_number int NOT NULL, + ol_i_id int NOT NULL, + ol_delivery_d timestamp NULL DEFAULT NULL, + ol_amount decimal(6,2) NOT NULL, + ol_supply_w_id int NOT NULL, + ol_quantity decimal(2,0) NOT NULL, + ol_dist_info char(24) NOT NULL, + PRIMARY KEY (ol_w_id,ol_d_id,ol_o_id,ol_number) +); +CREATE TABLE new_order ( + no_w_id int NOT NULL, + no_d_id int NOT NULL, + no_o_id int NOT NULL, + PRIMARY KEY (no_w_id,no_d_id,no_o_id) +); +CREATE TABLE stock ( + s_w_id int NOT NULL, + s_i_id int NOT NULL, + s_quantity decimal(4,0) NOT NULL, + s_ytd decimal(8,2) NOT NULL, + s_order_cnt int NOT NULL, + s_remote_cnt int NOT NULL, + s_data varchar(50) NOT NULL, + s_dist_01 char(24) NOT NULL, + s_dist_02 char(24) NOT NULL, + s_dist_03 char(24) NOT NULL, + s_dist_04 char(24) NOT NULL, + s_dist_05 char(24) NOT NULL, + s_dist_06 char(24) NOT NULL, + s_dist_07 char(24) NOT NULL, + s_dist_08 char(24) NOT NULL, + s_dist_09 char(24) NOT NULL, + s_dist_10 char(24) NOT NULL, + PRIMARY KEY (s_w_id,s_i_id) +); +CREATE TABLE oorder ( + o_w_id int NOT NULL, + o_d_id int NOT NULL, + o_id int NOT NULL, + o_c_id int NOT NULL, + o_carrier_id int DEFAULT NULL, + o_ol_cnt decimal(2,0) NOT NULL, + o_all_local decimal(1,0) NOT NULL, + o_entry_d timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (o_w_id,o_d_id,o_id), + UNIQUE (o_w_id,o_d_id,o_c_id,o_id) +); +CREATE TABLE history ( + h_c_id int NOT NULL, + h_c_d_id int NOT NULL, + h_c_w_id int NOT NULL, + h_d_id int NOT NULL, + h_w_id int NOT NULL, + h_date timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + h_amount decimal(6,2) NOT NULL, + h_data varchar(24) NOT NULL +); +CREATE TABLE customer ( + c_w_id int NOT NULL, + c_d_id int NOT NULL, + c_id int NOT NULL, + c_discount decimal(4,4) NOT NULL, + c_credit char(2) NOT NULL, + c_last varchar(16) NOT NULL, + c_first varchar(16) NOT NULL, + c_credit_lim decimal(12,2) NOT NULL, + c_balance decimal(12,2) NOT NULL, + c_ytd_payment float NOT NULL, + c_payment_cnt int NOT NULL, + c_delivery_cnt int NOT NULL, + c_street_1 varchar(20) NOT NULL, + c_street_2 varchar(20) NOT NULL, + c_city varchar(20) NOT NULL, + c_state char(2) NOT NULL, + c_zip char(9) NOT NULL, + c_phone char(16) NOT NULL, + c_since timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + c_middle char(2) NOT NULL, + c_data varchar(500) NOT NULL, + PRIMARY KEY (c_w_id,c_d_id,c_id) +); +CREATE TABLE district ( + d_w_id int NOT NULL, + d_id int NOT NULL, + d_ytd decimal(12,2) NOT NULL, + d_tax decimal(4,4) NOT NULL, + d_next_o_id int NOT NULL, + d_name varchar(10) NOT NULL, + d_street_1 varchar(20) NOT NULL, + d_street_2 varchar(20) NOT NULL, + d_city varchar(20) NOT NULL, + d_state char(2) NOT NULL, + d_zip char(9) NOT NULL, + PRIMARY KEY (d_w_id,d_id) +); +CREATE TABLE item ( + i_id int NOT NULL, + i_name varchar(24) NOT NULL, + i_price decimal(5,2) NOT NULL, + i_data varchar(50) NOT NULL, + i_im_id int NOT NULL, + PRIMARY KEY (i_id) +); +CREATE TABLE warehouse ( + w_id int NOT NULL, + w_ytd decimal(12,2) NOT NULL, + w_tax decimal(4,4) NOT NULL, + w_name varchar(10) NOT NULL, + w_street_1 varchar(20) NOT NULL, + w_street_2 varchar(20) NOT NULL, + w_city varchar(20) NOT NULL, + w_state char(2) NOT NULL, + w_zip char(9) NOT NULL, + PRIMARY KEY (w_id) +); +CREATE TABLE region ( + r_regionkey int not null, + r_name char(55) not null, + r_comment char(152) not null, + PRIMARY KEY ( r_regionkey ) +); +CREATE TABLE nation ( + n_nationkey int not null, + n_name char(25) not null, + n_regionkey int not null, + n_comment char(152) not null, + PRIMARY KEY ( n_nationkey ) +); +CREATE TABLE supplier ( + su_suppkey int not null, + su_name char(25) not null, + su_address varchar(40) not null, + su_nationkey int not null, + su_phone char(15) not null, + su_acctbal numeric(12,2) not null, + su_comment char(101) not null, + PRIMARY KEY ( su_suppkey ) +); + +SELECT create_distributed_table('order_line','ol_w_id'); +SELECT create_distributed_table('new_order','no_w_id'); +SELECT create_distributed_table('stock','s_w_id'); +SELECT create_distributed_table('oorder','o_w_id'); +SELECT create_distributed_table('history','h_w_id'); +SELECT create_distributed_table('customer','c_w_id'); +SELECT create_distributed_table('district','d_w_id'); +SELECT create_distributed_table('warehouse','w_id'); +SELECT create_reference_table('item'); +SELECT create_reference_table('region'); +SELECT create_reference_table('nation'); +SELECT create_reference_table('supplier'); + +TRUNCATE order_line, new_order, stock, oorder, history, customer, district, warehouse, item, region, nation, supplier; -- for easy copy in development +INSERT INTO supplier SELECT c, 'abc', 'def', c, 'ghi', c, 'jkl' FROM generate_series(0,10) AS c; +INSERT INTO new_order SELECT c, c, c FROM generate_series(0,10) AS c; +INSERT INTO stock SELECT c,c,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c; +INSERT INTO stock SELECT c, 5000,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c; -- mod(2*5000,10000) == 0 +INSERT INTO order_line SELECT c, c, c, c, c, '2008-10-17 00:00:00.000000', c, c, c, 'abc' FROM generate_series(0,10) AS c; +INSERT INTO oorder SELECT c, c, c, c, c, 1, 1, '2008-10-17 00:00:00.000000' FROM generate_series(0,10) AS c; +INSERT INTO customer SELECT c, c, c, 0, 'XX', 'John', 'Doe', 1000, 0, 0, c, c, 'Name', 'Street', 'Some City', 'CA', '12345', '+1 000 0000000', '2007-01-02 00:00:00.000000', 'NA', 'nothing special' FROM generate_series(0,10) AS c; +INSERT INTO item SELECT c, 'Keyboard', 50, 'co b', c FROM generate_series(0,10) AS c; --co% and %b filters all around +INSERT INTO region VALUES + (1, 'Not Europe', 'Big'), + (2, 'Europe', 'Big'); +INSERT INTO nation VALUES + (1, 'United States', 1, 'Also Big'), + (4, 'The Netherlands', 2, 'Flat'), + (9, 'Germany', 2, 'Germany must be in here for Q7'), + (67, 'Cambodia', 2, 'I don''t understand how we got from California to Cambodia but I will take it, it also is not in Europe, but we need it to be for Q8'); + +-- Query 1 +SELECT + ol_number, + sum(ol_quantity) as sum_qty, + sum(ol_amount) as sum_amount, + avg(ol_quantity) as avg_qty, + avg(ol_amount) as avg_amount, + count(*) as count_order +FROM order_line +WHERE ol_delivery_d > '2007-01-02 00:00:00.000000' +GROUP BY ol_number +ORDER BY ol_number; + +-- Query 2 +SELECT + su_suppkey, + su_name, + n_name, + i_id, + i_name, + su_address, + su_phone, + su_comment +FROM + item, + supplier, + stock, + nation, + region, + (SELECT + s_i_id AS m_i_id, + min(s_quantity) as m_s_quantity + FROM + stock, + supplier, + nation, + region + WHERE mod((s_w_id*s_i_id),10000)=su_suppkey + AND su_nationkey=n_nationkey + AND n_regionkey=r_regionkey + AND r_name LIKE 'Europ%' + GROUP BY s_i_id) m +WHERE i_id = s_i_id + AND mod((s_w_id * s_i_id), 10000) = su_suppkey + AND su_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND i_data LIKE '%b' + AND r_name LIKE 'Europ%' + AND i_id = m_i_id + AND s_quantity = m_s_quantity +ORDER BY + n_name, + su_name, + i_id; + +-- Query 3 +SELECT + ol_o_id, + ol_w_id, + ol_d_id, + sum(ol_amount) AS revenue, + o_entry_d +FROM + customer, + new_order, + oorder, + order_line +WHERE c_state LIKE 'C%' -- used to ba A%, but C% works with our small data + AND c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND no_w_id = o_w_id + AND no_d_id = o_d_id + AND no_o_id = o_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND o_entry_d > '2007-01-02 00:00:00.000000' +GROUP BY + ol_o_id, + ol_w_id, + ol_d_id, + o_entry_d +ORDER BY + revenue DESC, + o_entry_d; + +-- Query 4 +SELECT + o_ol_cnt, + count(*) as order_count +FROM + oorder +WHERE o_entry_d >= '2007-01-02 00:00:00.000000' + AND o_entry_d < '2012-01-02 00:00:00.000000' + AND exists (SELECT * + FROM order_line + WHERE o_id = ol_o_id + AND o_w_id = ol_w_id + AND o_d_id = ol_d_id + AND ol_delivery_d >= o_entry_d) +GROUP BY o_ol_cnt +ORDER BY o_ol_cnt; + +-- Query 5 +SELECT + n_name, + sum(ol_amount) AS revenue +FROM + customer, + oorder, + order_line, + stock, + supplier, + nation, + region +WHERE c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND ol_o_id = o_id + AND ol_w_id = o_w_id + AND ol_d_id=o_d_id + AND ol_w_id = s_w_id + AND ol_i_id = s_i_id + AND mod((s_w_id * s_i_id),10000) = su_suppkey +-- our dataset does not have the supplier in the same nation as the customer causing this +-- join to filter out all the data. We verify later on that we can actually perform an +-- ascii(substr(c_state,1,1)) == reference table column join later on so it should not +-- matter we skip this filter here. +--AND ascii(substr(c_state,1,1)) = su_nationkey + AND su_nationkey = n_nationkey + AND n_regionkey = r_regionkey + AND r_name = 'Europe' + AND o_entry_d >= '2007-01-02 00:00:00.000000' +GROUP BY n_name +ORDER BY revenue DESC; + +-- Query 6 +SELECT + sum(ol_amount) AS revenue +FROM order_line +WHERE ol_delivery_d >= '1999-01-01 00:00:00.000000' + AND ol_delivery_d < '2020-01-01 00:00:00.000000' + AND ol_quantity BETWEEN 1 AND 100000; + +-- Query 7 +SELECT + su_nationkey as supp_nation, + substr(c_state,1,1) as cust_nation, + extract(year from o_entry_d) as l_year, + sum(ol_amount) as revenue +FROM + supplier, + stock, + order_line, + oorder, + customer, + nation n1, + nation n2 +WHERE ol_supply_w_id = s_w_id + AND ol_i_id = s_i_id + AND mod((s_w_id * s_i_id), 10000) = su_suppkey + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND su_nationkey = n1.n_nationkey + AND ascii(substr(c_state,1,1)) = n2.n_nationkey + AND ( + (n1.n_name = 'Germany' AND n2.n_name = 'Cambodia') + OR (n1.n_name = 'Cambodia' AND n2.n_name = 'Germany') + ) + AND ol_delivery_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000' +GROUP BY + su_nationkey, + substr(c_state,1,1), + extract(year from o_entry_d) +ORDER BY + su_nationkey, + cust_nation, + l_year; + +-- Query 8 +SELECT + extract(year from o_entry_d) as l_year, + sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share +FROM + item, + supplier, + stock, + order_line, + oorder, + customer, + nation n1, + nation n2, + region +WHERE i_id = s_i_id + AND ol_i_id = s_i_id + AND ol_supply_w_id = s_w_id + AND mod((s_w_id * s_i_id),10000) = su_suppkey + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND n1.n_nationkey = ascii(substr(c_state,1,1)) + AND n1.n_regionkey = r_regionkey + AND ol_i_id < 1000 + AND r_name = 'Europe' + AND su_nationkey = n2.n_nationkey + AND o_entry_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000' + AND i_data LIKE '%b' + AND i_id = ol_i_id +GROUP BY extract(YEAR FROM o_entry_d) +ORDER BY l_year; + +-- Query 9 +SELECT + n_name, + extract(year from o_entry_d) as l_year, + sum(ol_amount) as sum_profit +FROM + item, + stock, + supplier, + order_line, + oorder, + nation +WHERE ol_i_id = s_i_id + AND ol_supply_w_id = s_w_id + AND mod((s_w_id * s_i_id), 10000) = su_suppkey + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND ol_i_id = i_id + AND su_nationkey = n_nationkey + AND i_data LIKE '%b' -- this used to be %BB but that will not work with our small dataset +GROUP BY + n_name, + extract(YEAR FROM o_entry_d) +ORDER BY + n_name, + l_year DESC; + +-- Query 10 +SELECT + c_id, + c_last, + sum(ol_amount) AS revenue, + c_city, + c_phone, + n_name +FROM + customer, + oorder, + order_line, + nation +WHERE c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND o_entry_d >= '2007-01-02 00:00:00.000000' + AND o_entry_d <= ol_delivery_d + AND n_nationkey = ascii(substr(c_state,1,1)) +GROUP BY + c_id, + c_last, + c_city, + c_phone, + n_name +ORDER BY revenue DESC; + +-- Query 11 +SELECT + s_i_id, + sum(s_order_cnt) AS ordercount +FROM + stock, + supplier, + nation +WHERE mod((s_w_id * s_i_id),10000) = su_suppkey + AND su_nationkey = n_nationkey + AND n_name = 'Germany' +GROUP BY s_i_id +HAVING sum(s_order_cnt) > + (SELECT sum(s_order_cnt) * .005 + FROM + stock, + supplier, + nation + WHERE mod((s_w_id * s_i_id),10000) = su_suppkey + AND su_nationkey = n_nationkey + AND n_name = 'Germany') +ORDER BY ordercount DESC; + +-- Query 12 +SELECT + o_ol_cnt, + sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count, + sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count +FROM + oorder, + order_line +WHERE ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id + AND o_entry_d <= ol_delivery_d + AND ol_delivery_d < '2020-01-01 00:00:00.000000' +GROUP BY o_ol_cnt +ORDER BY o_ol_cnt; + +-- Query 13 +SELECT + c_count, + count(*) AS custdist +FROM (SELECT + c_id, + count(o_id) + FROM customer + LEFT OUTER JOIN oorder ON ( + c_w_id = o_w_id + AND c_d_id = o_d_id + AND c_id = o_c_id + AND o_carrier_id > 8) + GROUP BY c_id) AS c_orders (c_id, c_count) +GROUP BY c_count +ORDER BY + custdist DESC, + c_count DESC; + +-- Query 14 +SELECT + 100.00 * sum(CASE WHEN i_data LIKE 'PR%' THEN ol_amount ELSE 0 END) / (1+sum(ol_amount)) AS promo_revenue +FROM + order_line, + item +WHERE ol_i_id = i_id + AND ol_delivery_d >= '2007-01-02 00:00:00.000000' + AND ol_delivery_d < '2020-01-02 00:00:00.000000'; + +-- Query 15 +WITH revenue (supplier_no, total_revenue) AS ( + SELECT + mod((s_w_id * s_i_id),10000) AS supplier_no, + sum(ol_amount) AS total_revenue + FROM + order_line, + stock + WHERE ol_i_id = s_i_id + AND ol_supply_w_id = s_w_id + AND ol_delivery_d >= '2007-01-02 00:00:00.000000' + GROUP BY mod((s_w_id * s_i_id),10000)) +SELECT + su_suppkey, + su_name, + su_address, + su_phone, + total_revenue +FROM + supplier, + revenue +WHERE su_suppkey = supplier_no + AND total_revenue = (SELECT max(total_revenue) FROM revenue) +ORDER BY su_suppkey; + +--Q16 +SELECT + i_name, + substr(i_data, 1, 3) AS brand, + i_price, + count(DISTINCT (mod((s_w_id * s_i_id),10000))) AS supplier_cnt +FROM + stock, + item +WHERE i_id = s_i_id + AND i_data NOT LIKE 'zz%' + AND (mod((s_w_id * s_i_id),10000) NOT IN + (SELECT su_suppkey + FROM supplier + WHERE su_comment LIKE '%bad%')) +GROUP BY + i_name, + substr(i_data, 1, 3), + i_price +ORDER BY supplier_cnt DESC; + +--Q17 +SELECT + sum(ol_amount) / 2.0 AS avg_yearly +FROM + order_line, + (SELECT + i_id, + avg(ol_quantity) AS a + FROM + item, + order_line + WHERE i_data LIKE '%b' + AND ol_i_id = i_id + GROUP BY i_id) t +WHERE ol_i_id = t.i_id; +-- this filter was at the end causing the dataset to be empty. it should not have any +-- influence on how the query gets planned so I removed the clause +--AND ol_quantity < t.a; + +-- Query 18 +SELECT + c_last, + c_id o_id, + o_entry_d, + o_ol_cnt, + sum(ol_amount) +FROM + customer, + oorder, + order_line +WHERE c_id = o_c_id + AND c_w_id = o_w_id + AND c_d_id = o_d_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_o_id = o_id +GROUP BY + o_id, + o_w_id, + o_d_id, + c_id, + c_last, + o_entry_d, + o_ol_cnt +HAVING sum(ol_amount) > 5 -- was 200, but thats too big for the dataset +ORDER BY + sum(ol_amount) DESC, + o_entry_d; + +-- Query 19 +SELECT + sum(ol_amount) AS revenue +FROM + order_line, + item +WHERE ( ol_i_id = i_id + AND i_data LIKE '%a' + AND ol_quantity >= 1 + AND ol_quantity <= 10 + AND i_price BETWEEN 1 AND 400000 + AND ol_w_id IN (1,2,3)) + OR ( ol_i_id = i_id + AND i_data LIKE '%b' + AND ol_quantity >= 1 + AND ol_quantity <= 10 + AND i_price BETWEEN 1 AND 400000 + AND ol_w_id IN (1,2,4)) + OR ( ol_i_id = i_id + AND i_data LIKE '%c' + AND ol_quantity >= 1 + AND ol_quantity <= 10 + AND i_price BETWEEN 1 AND 400000 + AND ol_w_id IN (1,5,3)); + +-- Query 20 +SELECT + su_name, + su_address +FROM + supplier, + nation +WHERE su_suppkey in + (SELECT + mod(s_i_id * s_w_id, 10000) + FROM + stock, + order_line + WHERE s_i_id IN + (SELECT i_id + FROM item + WHERE i_data LIKE 'co%') + AND ol_i_id = s_i_id + AND ol_delivery_d > '2008-05-23 12:00:00' -- was 2010, but our order is in 2008 + GROUP BY s_i_id, s_w_id, s_quantity + HAVING 2*s_quantity > sum(ol_quantity)) + AND su_nationkey = n_nationkey + AND n_name = 'Germany' +ORDER BY su_name; + +-- Query 21 +-- DATA SET DOES NOT COVER THIS QUERY +SELECT + su_name, + count(*) AS numwait +FROM + supplier, + order_line l1, + oorder, + stock, + nation +WHERE ol_o_id = o_id + AND ol_w_id = o_w_id + AND ol_d_id = o_d_id + AND ol_w_id = s_w_id + AND ol_i_id = s_i_id + AND mod((s_w_id * s_i_id),10000) = su_suppkey + AND l1.ol_delivery_d > o_entry_d + AND NOT exists (SELECT * + FROM order_line l2 + WHERE l2.ol_o_id = l1.ol_o_id + AND l2.ol_w_id = l1.ol_w_id + AND l2.ol_d_id = l1.ol_d_id + AND l2.ol_delivery_d > l1.ol_delivery_d) + AND su_nationkey = n_nationkey + AND n_name = 'Germany' +GROUP BY su_name +ORDER BY + numwait desc, + su_name; + +-- Query 22 +-- DATA SET DOES NOT COVER THIS QUERY +SELECT + substr(c_state,1,1) AS country, + count(*) AS numcust, + sum(c_balance) AS totacctbal +FROM customer +WHERE substr(c_phone,1,1) in ('1','2','3','4','5','6','7') + AND c_balance > (SELECT avg(c_BALANCE) + FROM customer + WHERE c_balance > 0.00 + AND substr(c_phone,1,1) in ('1','2','3','4','5','6','7')) + AND NOT exists (SELECT * + FROM oorder + WHERE o_c_id = c_id + AND o_w_id = c_w_id + AND o_d_id = c_d_id) +GROUP BY substr(c_state,1,1) +ORDER BY substr(c_state,1,1); + +SET client_min_messages TO WARNING; +DROP SCHEMA chbenchmark_all_queries CASCADE; diff --git a/src/test/regress/sql/expression_reference_join.sql b/src/test/regress/sql/expression_reference_join.sql new file mode 100644 index 000000000..148f4a954 --- /dev/null +++ b/src/test/regress/sql/expression_reference_join.sql @@ -0,0 +1,41 @@ +SET citus.next_shard_id TO 1670000; +CREATE SCHEMA expression_reference_join; +SET search_path TO expression_reference_join; +SET citus.shard_count TO 4; +SET citus.enable_repartition_joins TO on; + +CREATE TABLE ref (a int, b int); +CREATE TABLE test (x int, y int); + +INSERT INTO ref VALUES + (2,2), + (4,4); + +INSERT INTO test VALUES + (1,2), + (2,2); + +SELECT create_reference_table('ref'); +SELECT create_distributed_table('test', 'x'); + +-- PR 3180 implements expressions in join clauses to reference tables to support CHbenCHmark queries 7/8/9 +-- plannable as a repartition + reference join, now with an expression in the join clause +SELECT * +FROM + test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable + ref a +WHERE t2.y * 2 = a.a +ORDER BY 1,2,3; + +-- The join clause is wider than it used to be, causing this query to be recognized by the LogicalPlanner as a repartition join. +-- Unplannable query due to a three-way join which causes no valid path (besides the cartesian product) to be found +SELECT * +FROM + test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable + ref a, + ref b +WHERE t2.y - a.a - b.b = 0 +ORDER BY 1,2,3; + +SET client_min_messages TO WARNING; +DROP SCHEMA expression_reference_join CASCADE;