Feature: Expression in reference join (#3180)

DESCRIPTION: Expression in reference join

Fixed: #2582

This patch allows arbitrary expressions in the join clause when joining to a reference table. An example of such joins could be found in CHbenCHmark queries 7, 8, 9 and 11; `mod((s_w_id * s_i_id),10000) = su_suppkey` and `ascii(substr(c_state,1,1)) = n2.n_nationkey`. Since the join is on a reference table these queries are able to be pushed down to the workers.

To implement these queries we will widen the `IsJoinClause` predicate to not check if the expressions are a type `Var` after stripping the implicit coerciens. Instead we define a join clause when the `Var`'s in a clause come from more than 1 table.

This allows more clauses to pass into the logical planner's `MultiNodeTree(...)` planning function. To compensate for this we tighten down the `LocalJoin`, `SinglePartitionJoin` and `DualPartitionJoin` to check for direct column references when planning. This allows the planner to work with arbitrary join expressions on reference tables.
pull/3184/head
Nils Dijk 2019-11-18 16:25:46 +01:00 committed by GitHub
parent a4c90b6ee1
commit 217890af5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 1963 additions and 149 deletions

View File

@ -18,6 +18,7 @@
#include "access/heapam.h"
#include "access/htup_details.h"
#include "catalog/pg_am.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_physical_planner.h"
@ -208,37 +209,45 @@ ExtractLeftMostRangeTableIndex(Node *node, int *rangeTableIndex)
/*
* JoinOnColumns determines whether two columns are joined by a given join clause
* list.
* JoinOnColumns determines whether two columns are joined by a given join clause list.
*/
static bool
JoinOnColumns(Var *currentColumn, Var *candidateColumn, List *joinClauseList)
{
ListCell *joinClauseCell = NULL;
bool joinOnColumns = false;
if (currentColumn == NULL || candidateColumn == NULL)
{
/*
* LocalJoin can only be happening if we have both a current column and a target
* column, otherwise we are not joining two local tables
*/
return false;
}
foreach(joinClauseCell, joinClauseList)
{
OpExpr *joinClause = (OpExpr *) lfirst(joinClauseCell);
Var *leftColumn = LeftColumn(joinClause);
Var *rightColumn = RightColumn(joinClause);
OpExpr *joinClause = castNode(OpExpr, lfirst(joinClauseCell));
Var *leftColumn = LeftColumnOrNULL(joinClause);
Var *rightColumn = RightColumnOrNULL(joinClause);
/* check if both join columns and both partition key columns match */
/*
* Check if both join columns and both partition key columns match, since the
* current and candidate column's can't be NULL we know they wont match if either
* of the columns resolved to NULL above.
*/
if (equal(leftColumn, currentColumn) &&
equal(rightColumn, candidateColumn))
{
joinOnColumns = true;
break;
return true;
}
if (equal(leftColumn, candidateColumn) &&
equal(rightColumn, currentColumn))
{
joinOnColumns = true;
break;
return true;
}
}
return joinOnColumns;
return false;
}
@ -265,7 +274,19 @@ JoinOrderList(List *tableEntryList, List *joinClauseList)
candidateJoinOrder = JoinOrderForTable(startingTable, tableEntryList,
joinClauseList);
candidateJoinOrderList = lappend(candidateJoinOrderList, candidateJoinOrder);
if (candidateJoinOrder != NULL)
{
candidateJoinOrderList = lappend(candidateJoinOrderList, candidateJoinOrder);
}
}
if (list_length(candidateJoinOrderList) == 0)
{
/* there are no plans that we can create, time to error */
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("complex joins are only supported when all distributed "
"tables are joined on their distribution columns with "
"equal operator")));
}
bestJoinOrder = BestJoinOrder(candidateJoinOrderList);
@ -341,6 +362,12 @@ JoinOrderForTable(TableEntry *firstTable, List *tableEntryList, List *joinClause
pendingJoinNode = EvaluateJoinRules(joinedTableList, currentJoinNode,
pendingTable, joinClauseList, joinType);
if (pendingJoinNode == NULL)
{
/* no join order could be generated, we try our next pending table */
continue;
}
/* if this rule is better than previous ones, keep it */
pendingJoinRuleType = pendingJoinNode->joinRuleType;
if (pendingJoinRuleType < nextJoinRuleType)
@ -350,6 +377,15 @@ JoinOrderForTable(TableEntry *firstTable, List *tableEntryList, List *joinClause
}
}
if (nextJoinNode == NULL)
{
/*
* There is no next join node found, this will repeat indefinitely hence we
* bail and let JoinOrderList try a new initial table
*/
return NULL;
}
Assert(nextJoinNode != NULL);
nextJoinedTable = nextJoinNode->tableEntry;
@ -660,6 +696,11 @@ EvaluateJoinRules(List *joinedTableList, JoinOrderNode *currentJoinNode,
}
}
if (nextJoinNode == NULL)
{
return NULL;
}
Assert(nextJoinNode != NULL);
nextJoinNode->joinType = joinType;
nextJoinNode->joinClauseList = applicableJoinClauses;
@ -824,22 +865,18 @@ LocalJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
char candidatePartitionMethod = PartitionMethod(relationId);
char currentPartitionMethod = currentJoinNode->partitionMethod;
TableEntry *currentAnchorTable = currentJoinNode->anchorTable;
JoinRuleType currentJoinRuleType = currentJoinNode->joinRuleType;
bool joinOnPartitionColumns = false;
bool coPartitionedTables = false;
/*
* If we previously dual-hash re-partitioned the tables for a join or made
* cartesian product, we currently don't allow local join.
* If we previously dual-hash re-partitioned the tables for a join or made cartesian
* product, there is no anchor table anymore. In that case we don't allow local join.
*/
if (currentJoinRuleType == DUAL_PARTITION_JOIN ||
currentJoinRuleType == CARTESIAN_PRODUCT)
if (currentAnchorTable == NULL)
{
return NULL;
}
Assert(currentAnchorTable != NULL);
/* the partition method should be the same for a local join */
if (currentPartitionMethod != candidatePartitionMethod)
{
@ -883,7 +920,6 @@ static JoinOrderNode *
SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
List *applicableJoinClauses, JoinType joinType)
{
JoinOrderNode *nextJoinNode = NULL;
Var *currentPartitionColumn = currentJoinNode->partitionColumn;
char currentPartitionMethod = currentJoinNode->partitionMethod;
TableEntry *currentAnchorTable = currentJoinNode->anchorTable;
@ -927,22 +963,22 @@ SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
return NULL;
}
nextJoinNode = MakeJoinOrderNode(candidateTable, SINGLE_HASH_PARTITION_JOIN,
currentPartitionColumn,
currentPartitionMethod,
currentAnchorTable);
return MakeJoinOrderNode(candidateTable, SINGLE_HASH_PARTITION_JOIN,
currentPartitionColumn,
currentPartitionMethod,
currentAnchorTable);
}
else
{
nextJoinNode = MakeJoinOrderNode(candidateTable, SINGLE_RANGE_PARTITION_JOIN,
currentPartitionColumn,
currentPartitionMethod,
currentAnchorTable);
return MakeJoinOrderNode(candidateTable, SINGLE_RANGE_PARTITION_JOIN,
currentPartitionColumn,
currentPartitionMethod,
currentAnchorTable);
}
}
/* evaluate re-partitioning the current table only if the rule didn't apply above */
if (nextJoinNode == NULL && candidatePartitionMethod != DISTRIBUTE_BY_NONE)
if (candidatePartitionMethod != DISTRIBUTE_BY_NONE)
{
joinClause = SinglePartitionJoinClause(candidatePartitionColumn,
applicableJoinClauses);
@ -959,24 +995,24 @@ SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
return NULL;
}
nextJoinNode = MakeJoinOrderNode(candidateTable,
SINGLE_HASH_PARTITION_JOIN,
candidatePartitionColumn,
candidatePartitionMethod,
candidateTable);
return MakeJoinOrderNode(candidateTable,
SINGLE_HASH_PARTITION_JOIN,
candidatePartitionColumn,
candidatePartitionMethod,
candidateTable);
}
else
{
nextJoinNode = MakeJoinOrderNode(candidateTable,
SINGLE_RANGE_PARTITION_JOIN,
candidatePartitionColumn,
candidatePartitionMethod,
candidateTable);
return MakeJoinOrderNode(candidateTable,
SINGLE_RANGE_PARTITION_JOIN,
candidatePartitionColumn,
candidatePartitionMethod,
candidateTable);
}
}
}
return nextJoinNode;
return NULL;
}
@ -988,14 +1024,23 @@ SinglePartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
OpExpr *
SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses)
{
OpExpr *joinClause = NULL;
ListCell *applicableJoinClauseCell = NULL;
if (partitionColumn == NULL)
{
return NULL;
}
foreach(applicableJoinClauseCell, applicableJoinClauses)
{
OpExpr *applicableJoinClause = (OpExpr *) lfirst(applicableJoinClauseCell);
Var *leftColumn = LeftColumn(applicableJoinClause);
Var *rightColumn = RightColumn(applicableJoinClause);
OpExpr *applicableJoinClause = castNode(OpExpr, lfirst(applicableJoinClauseCell));
Var *leftColumn = LeftColumnOrNULL(applicableJoinClause);
Var *rightColumn = RightColumnOrNULL(applicableJoinClause);
if (leftColumn == NULL || rightColumn == NULL)
{
/* not a simple partition column join */
continue;
}
/*
* We first check if partition column matches either of the join columns
@ -1007,8 +1052,7 @@ SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses)
{
if (leftColumn->vartype == rightColumn->vartype)
{
joinClause = applicableJoinClause;
break;
return applicableJoinClause;
}
else
{
@ -1017,7 +1061,7 @@ SinglePartitionJoinClause(Var *partitionColumn, List *applicableJoinClauses)
}
}
return joinClause;
return NULL;
}
@ -1032,20 +1076,18 @@ static JoinOrderNode *
DualPartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
List *applicableJoinClauses, JoinType joinType)
{
/* Because of the dual partition, anchor table information got lost */
TableEntry *anchorTable = NULL;
JoinOrderNode *nextJoinNode = NULL;
OpExpr *joinClause = DualPartitionJoinClause(applicableJoinClauses);
if (joinClause)
{
Var *nextPartitionColumn = LeftColumn(joinClause);
nextJoinNode = MakeJoinOrderNode(candidateTable, DUAL_PARTITION_JOIN,
nextPartitionColumn, REDISTRIBUTE_BY_HASH,
anchorTable);
/* because of the dual partition, anchor table and partition column get lost */
return MakeJoinOrderNode(candidateTable,
DUAL_PARTITION_JOIN,
NULL,
REDISTRIBUTE_BY_HASH,
NULL);
}
return nextJoinNode;
return NULL;
}
@ -1057,20 +1099,23 @@ DualPartitionJoin(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
OpExpr *
DualPartitionJoinClause(List *applicableJoinClauses)
{
OpExpr *joinClause = NULL;
ListCell *applicableJoinClauseCell = NULL;
foreach(applicableJoinClauseCell, applicableJoinClauses)
{
OpExpr *applicableJoinClause = (OpExpr *) lfirst(applicableJoinClauseCell);
Var *leftColumn = LeftColumn(applicableJoinClause);
Var *rightColumn = RightColumn(applicableJoinClause);
Var *leftColumn = LeftColumnOrNULL(applicableJoinClause);
Var *rightColumn = RightColumnOrNULL(applicableJoinClause);
if (leftColumn == NULL || rightColumn == NULL)
{
continue;
}
/* we only need to check that the join column types match */
if (leftColumn->vartype == rightColumn->vartype)
{
joinClause = applicableJoinClause;
break;
return applicableJoinClause;
}
else
{
@ -1078,7 +1123,7 @@ DualPartitionJoinClause(List *applicableJoinClauses)
}
}
return joinClause;
return NULL;
}
@ -1091,15 +1136,16 @@ static JoinOrderNode *
CartesianProduct(JoinOrderNode *currentJoinNode, TableEntry *candidateTable,
List *applicableJoinClauses, JoinType joinType)
{
/* Because of the cartesian product, anchor table information got lost */
TableEntry *anchorTable = NULL;
if (list_length(applicableJoinClauses) == 0)
{
/* Because of the cartesian product, anchor table information got lost */
return MakeJoinOrderNode(candidateTable, CARTESIAN_PRODUCT,
currentJoinNode->partitionColumn,
currentJoinNode->partitionMethod,
NULL);
}
JoinOrderNode *nextJoinNode = MakeJoinOrderNode(candidateTable, CARTESIAN_PRODUCT,
currentJoinNode->partitionColumn,
currentJoinNode->partitionMethod,
anchorTable);
return nextJoinNode;
return NULL;
}
@ -1121,6 +1167,46 @@ MakeJoinOrderNode(TableEntry *tableEntry, JoinRuleType joinRuleType,
}
/*
* IsApplicableJoinClause tests if the current joinClause is applicable to the join at
* hand.
*
* Given a list of left hand tables and a candidate right hand table the join clause is
* valid if atleast 1 column is from the right hand table AND all columns can be found
* in either the list of tables on the left *or* in the right hand table.
*/
bool
IsApplicableJoinClause(List *leftTableIdList, uint32 rightTableId, OpExpr *joinClause)
{
List *varList = pull_var_clause_default((Node *) joinClause);
Var *var = NULL;
bool joinContainsRightTable = false;
foreach_ptr(var, varList)
{
uint32 columnTableId = var->varno;
if (rightTableId == columnTableId)
{
joinContainsRightTable = true;
}
else if (!list_member_int(leftTableIdList, columnTableId))
{
/*
* We couldn't find this column either on the right hand side (first if
* statement), nor in the list on the left. This join clause involves a table
* not yet available during the candidate join.
*/
return false;
}
}
/*
* All columns referenced in this clause are available during this join, now the join
* is applicable if we found our candidate table as well
*/
return joinContainsRightTable;
}
/*
* ApplicableJoinClauses finds all join clauses that apply between the given
* left table list and the right table, and returns these found join clauses.
@ -1136,18 +1222,8 @@ ApplicableJoinClauses(List *leftTableIdList, uint32 rightTableId, List *joinClau
foreach(joinClauseCell, joinClauseList)
{
OpExpr *joinClause = (OpExpr *) lfirst(joinClauseCell);
Var *joinLeftColumn = LeftColumn(joinClause);
Var *joinRightColumn = RightColumn(joinClause);
uint32 joinLeftTableId = joinLeftColumn->varno;
uint32 joinRightTableId = joinRightColumn->varno;
bool leftListHasJoinLeft = list_member_int(leftTableIdList, joinLeftTableId);
bool leftListHasJoinRight = list_member_int(leftTableIdList, joinRightTableId);
if ((leftListHasJoinLeft && (rightTableId == joinRightTableId)) ||
(leftListHasJoinRight && (rightTableId == joinLeftTableId)))
OpExpr *joinClause = castNode(OpExpr, lfirst(joinClauseCell));
if (IsApplicableJoinClause(leftTableIdList, rightTableId, joinClause))
{
applicableJoinClauses = lappend(applicableJoinClauses, joinClause);
}
@ -1157,37 +1233,41 @@ ApplicableJoinClauses(List *leftTableIdList, uint32 rightTableId, List *joinClau
}
/* Returns the left column in the given join clause. */
/*
* Returns the left column only when directly referenced in the given join clause,
* otherwise NULL is returned.
*/
Var *
LeftColumn(OpExpr *joinClause)
LeftColumnOrNULL(OpExpr *joinClause)
{
List *argumentList = joinClause->args;
Node *leftArgument = (Node *) linitial(argumentList);
List *varList = pull_var_clause_default(leftArgument);
Var *leftColumn = NULL;
Assert(list_length(varList) == 1);
leftColumn = (Var *) linitial(varList);
return leftColumn;
leftArgument = strip_implicit_coercions(leftArgument);
if (!IsA(leftArgument, Var))
{
return NULL;
}
return castNode(Var, leftArgument);
}
/* Returns the right column in the given join clause. */
/*
* Returns the right column only when directly referenced in the given join clause,
* otherwise NULL is returned.
* */
Var *
RightColumn(OpExpr *joinClause)
RightColumnOrNULL(OpExpr *joinClause)
{
List *argumentList = joinClause->args;
Node *rightArgument = (Node *) lsecond(argumentList);
List *varList = pull_var_clause_default(rightArgument);
Var *rightColumn = NULL;
Assert(list_length(varList) == 1);
rightColumn = (Var *) linitial(varList);
return rightColumn;
rightArgument = strip_implicit_coercions(rightArgument);
if (!IsA(rightArgument, Var))
{
return NULL;
}
return castNode(Var, rightArgument);
}

View File

@ -23,6 +23,7 @@
#include "distributed/colocation_utils.h"
#include "distributed/metadata_cache.h"
#include "distributed/insert_select_planner.h"
#include "distributed/listutils.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
@ -1387,62 +1388,59 @@ ExtractFromExpressionWalker(Node *node, QualifierWalkerContext *walkerContext)
bool
IsJoinClause(Node *clause)
{
bool isJoinClause = false;
OpExpr *operatorExpression = NULL;
List *argumentList = NIL;
Node *leftArgument = NULL;
Node *rightArgument = NULL;
Node *strippedLeftArgument = NULL;
Node *strippedRightArgument = NULL;
bool equalsOperator = false;
List *varList = NIL;
Var *initialVar = NULL;
Var *var = NULL;
if (!IsA(clause, OpExpr))
{
return false;
}
operatorExpression = (OpExpr *) clause;
argumentList = operatorExpression->args;
operatorExpression = castNode(OpExpr, clause);
equalsOperator = OperatorImplementsEquality(operatorExpression->opno);
/* join clauses must have two arguments */
if (list_length(argumentList) != 2)
if (!equalsOperator)
{
/*
* The single and dual repartition join and local join planners expect the clauses
* to be equi-join to calculate a hash on which to distribute.
*
* In the future we should move this clause to those planners and allow
* non-equi-join's in the reference join and cartesian product. This is tracked in
* https://github.com/citusdata/citus/issues/3198
*/
return false;
}
/* get left and right side of the expression */
leftArgument = (Node *) linitial(argumentList);
rightArgument = (Node *) lsecond(argumentList);
strippedLeftArgument = strip_implicit_coercions(leftArgument);
strippedRightArgument = strip_implicit_coercions(rightArgument);
/* each side of the expression should have only one column */
if (IsA(strippedLeftArgument, Var) && IsA(strippedRightArgument, Var))
/*
* take all column references from the clause, if we find 2 column references from a
* different relation we assume this is a join clause
*/
varList = pull_var_clause_default(clause);
if (list_length(varList) <= 0)
{
Var *leftColumn = (Var *) strippedLeftArgument;
Var *rightColumn = (Var *) strippedRightArgument;
bool equiJoin = false;
bool joinBetweenDifferentTables = false;
/* no column references in query, not describing a join */
return false;
}
initialVar = castNode(Var, linitial(varList));
bool equalsOperator = OperatorImplementsEquality(operatorExpression->opno);
if (equalsOperator)
foreach_ptr(var, varList)
{
if (var->varno != initialVar->varno)
{
equiJoin = true;
}
if (leftColumn->varno != rightColumn->varno)
{
joinBetweenDifferentTables = true;
}
/* codifies our logic for determining if this node is a join clause */
if (equiJoin && joinBetweenDifferentTables)
{
isJoinClause = true;
/*
* this column reference comes from a different relation, hence describing a
* join
*/
return true;
}
}
return isJoinClause;
/* all column references were to the same relation, no join */
return false;
}
@ -2143,8 +2141,12 @@ ApplySinglePartitionJoin(MultiNode *leftNode, MultiNode *rightNode,
joinClause = SinglePartitionJoinClause(partitionColumn, applicableJoinClauses);
Assert(joinClause != NULL);
leftColumn = LeftColumn(joinClause);
rightColumn = RightColumn(joinClause);
/* both are verified in SinglePartitionJoinClause to not be NULL, assert is to guard */
leftColumn = LeftColumnOrNULL(joinClause);
rightColumn = RightColumnOrNULL(joinClause);
Assert(leftColumn != NULL);
Assert(rightColumn != NULL);
if (equal(partitionColumn, leftColumn))
{
@ -2218,8 +2220,11 @@ ApplyDualPartitionJoin(MultiNode *leftNode, MultiNode *rightNode,
joinClause = DualPartitionJoinClause(applicableJoinClauses);
Assert(joinClause != NULL);
leftColumn = LeftColumn(joinClause);
rightColumn = RightColumn(joinClause);
/* both are verified in DualPartitionJoinClause to not be NULL, assert is to guard */
leftColumn = LeftColumnOrNULL(joinClause);
rightColumn = RightColumnOrNULL(joinClause);
Assert(leftColumn != NULL);
Assert(rightColumn != NULL);
rightTableIdList = OutputTableIdList(rightNode);
rightTableId = (uint32) linitial_int(rightTableIdList);

View File

@ -3603,8 +3603,13 @@ JoinSequenceArray(List *rangeTableFragmentsList, Query *jobQuery, List *depended
continue;
}
leftColumn = LeftColumn(nextJoinClause);
rightColumn = RightColumn(nextJoinClause);
leftColumn = LeftColumnOrNULL(nextJoinClause);
rightColumn = RightColumnOrNULL(nextJoinClause);
if (leftColumn == NULL || rightColumn == NULL)
{
continue;
}
leftRangeTableId = leftColumn->varno;
rightRangeTableId = rightColumn->varno;

View File

@ -82,13 +82,15 @@ extern bool EnableSingleHashRepartitioning;
/* Function declaration for determining table join orders */
extern List * JoinExprList(FromExpr *fromExpr);
extern List * JoinOrderList(List *rangeTableEntryList, List *joinClauseList);
extern bool IsApplicableJoinClause(List *leftTableIdList, uint32 rightTableId,
OpExpr *joinClause);
extern List * ApplicableJoinClauses(List *leftTableIdList, uint32 rightTableId,
List *joinClauseList);
extern OpExpr * SinglePartitionJoinClause(Var *partitionColumn,
List *applicableJoinClauses);
extern OpExpr * DualPartitionJoinClause(List *applicableJoinClauses);
extern Var * LeftColumn(OpExpr *joinClause);
extern Var * RightColumn(OpExpr *joinClause);
extern Var * LeftColumnOrNULL(OpExpr *joinClause);
extern Var * RightColumnOrNULL(OpExpr *joinClause);
extern Var * PartitionColumn(Oid relationId, uint32 rangeTableId);
extern Var * DistPartitionKey(Oid relationId);
extern char PartitionMethod(Oid relationId);

View File

@ -0,0 +1,903 @@
SET citus.next_shard_id TO 1650000;
CREATE SCHEMA chbenchmark_all_queries;
SET search_path TO chbenchmark_all_queries;
SET citus.enable_repartition_joins TO on;
CREATE TABLE order_line (
ol_w_id int NOT NULL,
ol_d_id int NOT NULL,
ol_o_id int NOT NULL,
ol_number int NOT NULL,
ol_i_id int NOT NULL,
ol_delivery_d timestamp NULL DEFAULT NULL,
ol_amount decimal(6,2) NOT NULL,
ol_supply_w_id int NOT NULL,
ol_quantity decimal(2,0) NOT NULL,
ol_dist_info char(24) NOT NULL,
PRIMARY KEY (ol_w_id,ol_d_id,ol_o_id,ol_number)
);
CREATE TABLE new_order (
no_w_id int NOT NULL,
no_d_id int NOT NULL,
no_o_id int NOT NULL,
PRIMARY KEY (no_w_id,no_d_id,no_o_id)
);
CREATE TABLE stock (
s_w_id int NOT NULL,
s_i_id int NOT NULL,
s_quantity decimal(4,0) NOT NULL,
s_ytd decimal(8,2) NOT NULL,
s_order_cnt int NOT NULL,
s_remote_cnt int NOT NULL,
s_data varchar(50) NOT NULL,
s_dist_01 char(24) NOT NULL,
s_dist_02 char(24) NOT NULL,
s_dist_03 char(24) NOT NULL,
s_dist_04 char(24) NOT NULL,
s_dist_05 char(24) NOT NULL,
s_dist_06 char(24) NOT NULL,
s_dist_07 char(24) NOT NULL,
s_dist_08 char(24) NOT NULL,
s_dist_09 char(24) NOT NULL,
s_dist_10 char(24) NOT NULL,
PRIMARY KEY (s_w_id,s_i_id)
);
CREATE TABLE oorder (
o_w_id int NOT NULL,
o_d_id int NOT NULL,
o_id int NOT NULL,
o_c_id int NOT NULL,
o_carrier_id int DEFAULT NULL,
o_ol_cnt decimal(2,0) NOT NULL,
o_all_local decimal(1,0) NOT NULL,
o_entry_d timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (o_w_id,o_d_id,o_id),
UNIQUE (o_w_id,o_d_id,o_c_id,o_id)
);
CREATE TABLE history (
h_c_id int NOT NULL,
h_c_d_id int NOT NULL,
h_c_w_id int NOT NULL,
h_d_id int NOT NULL,
h_w_id int NOT NULL,
h_date timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
h_amount decimal(6,2) NOT NULL,
h_data varchar(24) NOT NULL
);
CREATE TABLE customer (
c_w_id int NOT NULL,
c_d_id int NOT NULL,
c_id int NOT NULL,
c_discount decimal(4,4) NOT NULL,
c_credit char(2) NOT NULL,
c_last varchar(16) NOT NULL,
c_first varchar(16) NOT NULL,
c_credit_lim decimal(12,2) NOT NULL,
c_balance decimal(12,2) NOT NULL,
c_ytd_payment float NOT NULL,
c_payment_cnt int NOT NULL,
c_delivery_cnt int NOT NULL,
c_street_1 varchar(20) NOT NULL,
c_street_2 varchar(20) NOT NULL,
c_city varchar(20) NOT NULL,
c_state char(2) NOT NULL,
c_zip char(9) NOT NULL,
c_phone char(16) NOT NULL,
c_since timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
c_middle char(2) NOT NULL,
c_data varchar(500) NOT NULL,
PRIMARY KEY (c_w_id,c_d_id,c_id)
);
CREATE TABLE district (
d_w_id int NOT NULL,
d_id int NOT NULL,
d_ytd decimal(12,2) NOT NULL,
d_tax decimal(4,4) NOT NULL,
d_next_o_id int NOT NULL,
d_name varchar(10) NOT NULL,
d_street_1 varchar(20) NOT NULL,
d_street_2 varchar(20) NOT NULL,
d_city varchar(20) NOT NULL,
d_state char(2) NOT NULL,
d_zip char(9) NOT NULL,
PRIMARY KEY (d_w_id,d_id)
);
CREATE TABLE item (
i_id int NOT NULL,
i_name varchar(24) NOT NULL,
i_price decimal(5,2) NOT NULL,
i_data varchar(50) NOT NULL,
i_im_id int NOT NULL,
PRIMARY KEY (i_id)
);
CREATE TABLE warehouse (
w_id int NOT NULL,
w_ytd decimal(12,2) NOT NULL,
w_tax decimal(4,4) NOT NULL,
w_name varchar(10) NOT NULL,
w_street_1 varchar(20) NOT NULL,
w_street_2 varchar(20) NOT NULL,
w_city varchar(20) NOT NULL,
w_state char(2) NOT NULL,
w_zip char(9) NOT NULL,
PRIMARY KEY (w_id)
);
CREATE TABLE region (
r_regionkey int not null,
r_name char(55) not null,
r_comment char(152) not null,
PRIMARY KEY ( r_regionkey )
);
CREATE TABLE nation (
n_nationkey int not null,
n_name char(25) not null,
n_regionkey int not null,
n_comment char(152) not null,
PRIMARY KEY ( n_nationkey )
);
CREATE TABLE supplier (
su_suppkey int not null,
su_name char(25) not null,
su_address varchar(40) not null,
su_nationkey int not null,
su_phone char(15) not null,
su_acctbal numeric(12,2) not null,
su_comment char(101) not null,
PRIMARY KEY ( su_suppkey )
);
SELECT create_distributed_table('order_line','ol_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('new_order','no_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('stock','s_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('oorder','o_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('history','h_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('customer','c_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('district','d_w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_distributed_table('warehouse','w_id');
create_distributed_table
--------------------------
(1 row)
SELECT create_reference_table('item');
create_reference_table
------------------------
(1 row)
SELECT create_reference_table('region');
create_reference_table
------------------------
(1 row)
SELECT create_reference_table('nation');
create_reference_table
------------------------
(1 row)
SELECT create_reference_table('supplier');
create_reference_table
------------------------
(1 row)
TRUNCATE order_line, new_order, stock, oorder, history, customer, district, warehouse, item, region, nation, supplier; -- for easy copy in development
INSERT INTO supplier SELECT c, 'abc', 'def', c, 'ghi', c, 'jkl' FROM generate_series(0,10) AS c;
INSERT INTO new_order SELECT c, c, c FROM generate_series(0,10) AS c;
INSERT INTO stock SELECT c,c,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c;
INSERT INTO stock SELECT c, 5000,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c; -- mod(2*5000,10000) == 0
INSERT INTO order_line SELECT c, c, c, c, c, '2008-10-17 00:00:00.000000', c, c, c, 'abc' FROM generate_series(0,10) AS c;
INSERT INTO oorder SELECT c, c, c, c, c, 1, 1, '2008-10-17 00:00:00.000000' FROM generate_series(0,10) AS c;
INSERT INTO customer SELECT c, c, c, 0, 'XX', 'John', 'Doe', 1000, 0, 0, c, c, 'Name', 'Street', 'Some City', 'CA', '12345', '+1 000 0000000', '2007-01-02 00:00:00.000000', 'NA', 'nothing special' FROM generate_series(0,10) AS c;
INSERT INTO item SELECT c, 'Keyboard', 50, 'co b', c FROM generate_series(0,10) AS c; --co% and %b filters all around
INSERT INTO region VALUES
(1, 'Not Europe', 'Big'),
(2, 'Europe', 'Big');
INSERT INTO nation VALUES
(1, 'United States', 1, 'Also Big'),
(4, 'The Netherlands', 2, 'Flat'),
(9, 'Germany', 2, 'Germany must be in here for Q7'),
(67, 'Cambodia', 2, 'I don''t understand how we got from California to Cambodia but I will take it, it also is not in Europe, but we need it to be for Q8');
-- Query 1
SELECT
ol_number,
sum(ol_quantity) as sum_qty,
sum(ol_amount) as sum_amount,
avg(ol_quantity) as avg_qty,
avg(ol_amount) as avg_amount,
count(*) as count_order
FROM order_line
WHERE ol_delivery_d > '2007-01-02 00:00:00.000000'
GROUP BY ol_number
ORDER BY ol_number;
ol_number | sum_qty | sum_amount | avg_qty | avg_amount | count_order
-----------+---------+------------+------------------------+------------------------+-------------
0 | 0 | 0.00 | 0.00000000000000000000 | 0.00000000000000000000 | 1
1 | 1 | 1.00 | 1.00000000000000000000 | 1.00000000000000000000 | 1
2 | 2 | 2.00 | 2.0000000000000000 | 2.0000000000000000 | 1
3 | 3 | 3.00 | 3.0000000000000000 | 3.0000000000000000 | 1
4 | 4 | 4.00 | 4.0000000000000000 | 4.0000000000000000 | 1
5 | 5 | 5.00 | 5.0000000000000000 | 5.0000000000000000 | 1
6 | 6 | 6.00 | 6.0000000000000000 | 6.0000000000000000 | 1
7 | 7 | 7.00 | 7.0000000000000000 | 7.0000000000000000 | 1
8 | 8 | 8.00 | 8.0000000000000000 | 8.0000000000000000 | 1
9 | 9 | 9.00 | 9.0000000000000000 | 9.0000000000000000 | 1
10 | 10 | 10.00 | 10.0000000000000000 | 10.0000000000000000 | 1
(11 rows)
-- Query 2
SELECT
su_suppkey,
su_name,
n_name,
i_id,
i_name,
su_address,
su_phone,
su_comment
FROM
item,
supplier,
stock,
nation,
region,
(SELECT
s_i_id AS m_i_id,
min(s_quantity) as m_s_quantity
FROM
stock,
supplier,
nation,
region
WHERE mod((s_w_id*s_i_id),10000)=su_suppkey
AND su_nationkey=n_nationkey
AND n_regionkey=r_regionkey
AND r_name LIKE 'Europ%'
GROUP BY s_i_id) m
WHERE i_id = s_i_id
AND mod((s_w_id * s_i_id), 10000) = su_suppkey
AND su_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND i_data LIKE '%b'
AND r_name LIKE 'Europ%'
AND i_id = m_i_id
AND s_quantity = m_s_quantity
ORDER BY
n_name,
su_name,
i_id;
su_suppkey | su_name | n_name | i_id | i_name | su_address | su_phone | su_comment
------------+---------------------------+---------------------------+------+----------+------------+-----------------+-------------------------------------------------------------------------------------------------------
9 | abc | Germany | 3 | Keyboard | def | ghi | jkl
4 | abc | The Netherlands | 2 | Keyboard | def | ghi | jkl
(2 rows)
-- Query 3
SELECT
ol_o_id,
ol_w_id,
ol_d_id,
sum(ol_amount) AS revenue,
o_entry_d
FROM
customer,
new_order,
oorder,
order_line
WHERE c_state LIKE 'C%' -- used to ba A%, but C% works with our small data
AND c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND no_w_id = o_w_id
AND no_d_id = o_d_id
AND no_o_id = o_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND o_entry_d > '2007-01-02 00:00:00.000000'
GROUP BY
ol_o_id,
ol_w_id,
ol_d_id,
o_entry_d
ORDER BY
revenue DESC,
o_entry_d;
ol_o_id | ol_w_id | ol_d_id | revenue | o_entry_d
---------+---------+---------+---------+--------------------------
10 | 10 | 10 | 10.00 | Fri Oct 17 00:00:00 2008
9 | 9 | 9 | 9.00 | Fri Oct 17 00:00:00 2008
8 | 8 | 8 | 8.00 | Fri Oct 17 00:00:00 2008
7 | 7 | 7 | 7.00 | Fri Oct 17 00:00:00 2008
6 | 6 | 6 | 6.00 | Fri Oct 17 00:00:00 2008
5 | 5 | 5 | 5.00 | Fri Oct 17 00:00:00 2008
4 | 4 | 4 | 4.00 | Fri Oct 17 00:00:00 2008
3 | 3 | 3 | 3.00 | Fri Oct 17 00:00:00 2008
2 | 2 | 2 | 2.00 | Fri Oct 17 00:00:00 2008
1 | 1 | 1 | 1.00 | Fri Oct 17 00:00:00 2008
0 | 0 | 0 | 0.00 | Fri Oct 17 00:00:00 2008
(11 rows)
-- Query 4
SELECT
o_ol_cnt,
count(*) as order_count
FROM
oorder
WHERE o_entry_d >= '2007-01-02 00:00:00.000000'
AND o_entry_d < '2012-01-02 00:00:00.000000'
AND exists (SELECT *
FROM order_line
WHERE o_id = ol_o_id
AND o_w_id = ol_w_id
AND o_d_id = ol_d_id
AND ol_delivery_d >= o_entry_d)
GROUP BY o_ol_cnt
ORDER BY o_ol_cnt;
o_ol_cnt | order_count
----------+-------------
1 | 11
(1 row)
-- Query 5
SELECT
n_name,
sum(ol_amount) AS revenue
FROM
customer,
oorder,
order_line,
stock,
supplier,
nation,
region
WHERE c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND ol_o_id = o_id
AND ol_w_id = o_w_id
AND ol_d_id=o_d_id
AND ol_w_id = s_w_id
AND ol_i_id = s_i_id
AND mod((s_w_id * s_i_id),10000) = su_suppkey
-- our dataset does not have the supplier in the same nation as the customer causing this
-- join to filter out all the data. We verify later on that we can actually perform an
-- ascii(substr(c_state,1,1)) == reference table column join later on so it should not
-- matter we skip this filter here.
--AND ascii(substr(c_state,1,1)) = su_nationkey
AND su_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'Europe'
AND o_entry_d >= '2007-01-02 00:00:00.000000'
GROUP BY n_name
ORDER BY revenue DESC;
n_name | revenue
---------------------------+---------
Germany | 3.00
The Netherlands | 2.00
(2 rows)
-- Query 6
SELECT
sum(ol_amount) AS revenue
FROM order_line
WHERE ol_delivery_d >= '1999-01-01 00:00:00.000000'
AND ol_delivery_d < '2020-01-01 00:00:00.000000'
AND ol_quantity BETWEEN 1 AND 100000;
revenue
---------
55.00
(1 row)
-- Query 7
SELECT
su_nationkey as supp_nation,
substr(c_state,1,1) as cust_nation,
extract(year from o_entry_d) as l_year,
sum(ol_amount) as revenue
FROM
supplier,
stock,
order_line,
oorder,
customer,
nation n1,
nation n2
WHERE ol_supply_w_id = s_w_id
AND ol_i_id = s_i_id
AND mod((s_w_id * s_i_id), 10000) = su_suppkey
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND su_nationkey = n1.n_nationkey
AND ascii(substr(c_state,1,1)) = n2.n_nationkey
AND (
(n1.n_name = 'Germany' AND n2.n_name = 'Cambodia')
OR (n1.n_name = 'Cambodia' AND n2.n_name = 'Germany')
)
AND ol_delivery_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000'
GROUP BY
su_nationkey,
substr(c_state,1,1),
extract(year from o_entry_d)
ORDER BY
su_nationkey,
cust_nation,
l_year;
supp_nation | cust_nation | l_year | revenue
-------------+-------------+--------+---------
9 | C | 2008 | 3.00
(1 row)
-- Query 8
SELECT
extract(year from o_entry_d) as l_year,
sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share
FROM
item,
supplier,
stock,
order_line,
oorder,
customer,
nation n1,
nation n2,
region
WHERE i_id = s_i_id
AND ol_i_id = s_i_id
AND ol_supply_w_id = s_w_id
AND mod((s_w_id * s_i_id),10000) = su_suppkey
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND n1.n_nationkey = ascii(substr(c_state,1,1))
AND n1.n_regionkey = r_regionkey
AND ol_i_id < 1000
AND r_name = 'Europe'
AND su_nationkey = n2.n_nationkey
AND o_entry_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000'
AND i_data LIKE '%b'
AND i_id = ol_i_id
GROUP BY extract(YEAR FROM o_entry_d)
ORDER BY l_year;
l_year | mkt_share
--------+------------------------
2008 | 0.50000000000000000000
(1 row)
-- Query 9
SELECT
n_name,
extract(year from o_entry_d) as l_year,
sum(ol_amount) as sum_profit
FROM
item,
stock,
supplier,
order_line,
oorder,
nation
WHERE ol_i_id = s_i_id
AND ol_supply_w_id = s_w_id
AND mod((s_w_id * s_i_id), 10000) = su_suppkey
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND ol_i_id = i_id
AND su_nationkey = n_nationkey
AND i_data LIKE '%b' -- this used to be %BB but that will not work with our small dataset
GROUP BY
n_name,
extract(YEAR FROM o_entry_d)
ORDER BY
n_name,
l_year DESC;
n_name | l_year | sum_profit
---------------------------+--------+------------
Germany | 2008 | 3.00
The Netherlands | 2008 | 2.00
United States | 2008 | 1.00
(3 rows)
-- Query 10
SELECT
c_id,
c_last,
sum(ol_amount) AS revenue,
c_city,
c_phone,
n_name
FROM
customer,
oorder,
order_line,
nation
WHERE c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND o_entry_d >= '2007-01-02 00:00:00.000000'
AND o_entry_d <= ol_delivery_d
AND n_nationkey = ascii(substr(c_state,1,1))
GROUP BY
c_id,
c_last,
c_city,
c_phone,
n_name
ORDER BY revenue DESC;
c_id | c_last | revenue | c_city | c_phone | n_name
------+--------+---------+-----------+------------------+---------------------------
10 | John | 10.00 | Some City | +1 000 0000000 | Cambodia
9 | John | 9.00 | Some City | +1 000 0000000 | Cambodia
8 | John | 8.00 | Some City | +1 000 0000000 | Cambodia
7 | John | 7.00 | Some City | +1 000 0000000 | Cambodia
6 | John | 6.00 | Some City | +1 000 0000000 | Cambodia
5 | John | 5.00 | Some City | +1 000 0000000 | Cambodia
4 | John | 4.00 | Some City | +1 000 0000000 | Cambodia
3 | John | 3.00 | Some City | +1 000 0000000 | Cambodia
2 | John | 2.00 | Some City | +1 000 0000000 | Cambodia
1 | John | 1.00 | Some City | +1 000 0000000 | Cambodia
0 | John | 0.00 | Some City | +1 000 0000000 | Cambodia
(11 rows)
-- Query 11
SELECT
s_i_id,
sum(s_order_cnt) AS ordercount
FROM
stock,
supplier,
nation
WHERE mod((s_w_id * s_i_id),10000) = su_suppkey
AND su_nationkey = n_nationkey
AND n_name = 'Germany'
GROUP BY s_i_id
HAVING sum(s_order_cnt) >
(SELECT sum(s_order_cnt) * .005
FROM
stock,
supplier,
nation
WHERE mod((s_w_id * s_i_id),10000) = su_suppkey
AND su_nationkey = n_nationkey
AND n_name = 'Germany')
ORDER BY ordercount DESC;
s_i_id | ordercount
--------+------------
3 | 3
(1 row)
-- Query 12
SELECT
o_ol_cnt,
sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count,
sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count
FROM
oorder,
order_line
WHERE ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND o_entry_d <= ol_delivery_d
AND ol_delivery_d < '2020-01-01 00:00:00.000000'
GROUP BY o_ol_cnt
ORDER BY o_ol_cnt;
o_ol_cnt | high_line_count | low_line_count
----------+-----------------+----------------
1 | 2 | 9
(1 row)
-- Query 13
SELECT
c_count,
count(*) AS custdist
FROM (SELECT
c_id,
count(o_id)
FROM customer
LEFT OUTER JOIN oorder ON (
c_w_id = o_w_id
AND c_d_id = o_d_id
AND c_id = o_c_id
AND o_carrier_id > 8)
GROUP BY c_id) AS c_orders (c_id, c_count)
GROUP BY c_count
ORDER BY
custdist DESC,
c_count DESC;
c_count | custdist
---------+----------
0 | 9
1 | 2
(2 rows)
-- Query 14
SELECT
100.00 * sum(CASE WHEN i_data LIKE 'PR%' THEN ol_amount ELSE 0 END) / (1+sum(ol_amount)) AS promo_revenue
FROM
order_line,
item
WHERE ol_i_id = i_id
AND ol_delivery_d >= '2007-01-02 00:00:00.000000'
AND ol_delivery_d < '2020-01-02 00:00:00.000000';
promo_revenue
------------------------
0.00000000000000000000
(1 row)
-- Query 15
WITH revenue (supplier_no, total_revenue) AS (
SELECT
mod((s_w_id * s_i_id),10000) AS supplier_no,
sum(ol_amount) AS total_revenue
FROM
order_line,
stock
WHERE ol_i_id = s_i_id
AND ol_supply_w_id = s_w_id
AND ol_delivery_d >= '2007-01-02 00:00:00.000000'
GROUP BY mod((s_w_id * s_i_id),10000))
SELECT
su_suppkey,
su_name,
su_address,
su_phone,
total_revenue
FROM
supplier,
revenue
WHERE su_suppkey = supplier_no
AND total_revenue = (SELECT max(total_revenue) FROM revenue)
ORDER BY su_suppkey;
su_suppkey | su_name | su_address | su_phone | total_revenue
------------+---------------------------+------------+-----------------+---------------
9 | abc | def | ghi | 3.00
(1 row)
--Q16
SELECT
i_name,
substr(i_data, 1, 3) AS brand,
i_price,
count(DISTINCT (mod((s_w_id * s_i_id),10000))) AS supplier_cnt
FROM
stock,
item
WHERE i_id = s_i_id
AND i_data NOT LIKE 'zz%'
AND (mod((s_w_id * s_i_id),10000) NOT IN
(SELECT su_suppkey
FROM supplier
WHERE su_comment LIKE '%bad%'))
GROUP BY
i_name,
substr(i_data, 1, 3),
i_price
ORDER BY supplier_cnt DESC;
i_name | brand | i_price | supplier_cnt
----------+-------+---------+--------------
Keyboard | co | 50.00 | 3
(1 row)
--Q17
SELECT
sum(ol_amount) / 2.0 AS avg_yearly
FROM
order_line,
(SELECT
i_id,
avg(ol_quantity) AS a
FROM
item,
order_line
WHERE i_data LIKE '%b'
AND ol_i_id = i_id
GROUP BY i_id) t
WHERE ol_i_id = t.i_id;
avg_yearly
---------------------
27.5000000000000000
(1 row)
-- this filter was at the end causing the dataset to be empty. it should not have any
-- influence on how the query gets planned so I removed the clause
--AND ol_quantity < t.a;
-- Query 18
SELECT
c_last,
c_id o_id,
o_entry_d,
o_ol_cnt,
sum(ol_amount)
FROM
customer,
oorder,
order_line
WHERE c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
GROUP BY
o_id,
o_w_id,
o_d_id,
c_id,
c_last,
o_entry_d,
o_ol_cnt
HAVING sum(ol_amount) > 5 -- was 200, but thats too big for the dataset
ORDER BY
sum(ol_amount) DESC,
o_entry_d;
c_last | o_id | o_entry_d | o_ol_cnt | sum
--------+------+--------------------------+----------+-------
John | 10 | Fri Oct 17 00:00:00 2008 | 1 | 10.00
John | 9 | Fri Oct 17 00:00:00 2008 | 1 | 9.00
John | 8 | Fri Oct 17 00:00:00 2008 | 1 | 8.00
John | 7 | Fri Oct 17 00:00:00 2008 | 1 | 7.00
John | 6 | Fri Oct 17 00:00:00 2008 | 1 | 6.00
(5 rows)
-- Query 19
SELECT
sum(ol_amount) AS revenue
FROM
order_line,
item
WHERE ( ol_i_id = i_id
AND i_data LIKE '%a'
AND ol_quantity >= 1
AND ol_quantity <= 10
AND i_price BETWEEN 1 AND 400000
AND ol_w_id IN (1,2,3))
OR ( ol_i_id = i_id
AND i_data LIKE '%b'
AND ol_quantity >= 1
AND ol_quantity <= 10
AND i_price BETWEEN 1 AND 400000
AND ol_w_id IN (1,2,4))
OR ( ol_i_id = i_id
AND i_data LIKE '%c'
AND ol_quantity >= 1
AND ol_quantity <= 10
AND i_price BETWEEN 1 AND 400000
AND ol_w_id IN (1,5,3));
revenue
---------
7.00
(1 row)
-- Query 20
SELECT
su_name,
su_address
FROM
supplier,
nation
WHERE su_suppkey in
(SELECT
mod(s_i_id * s_w_id, 10000)
FROM
stock,
order_line
WHERE s_i_id IN
(SELECT i_id
FROM item
WHERE i_data LIKE 'co%')
AND ol_i_id = s_i_id
AND ol_delivery_d > '2008-05-23 12:00:00' -- was 2010, but our order is in 2008
GROUP BY s_i_id, s_w_id, s_quantity
HAVING 2*s_quantity > sum(ol_quantity))
AND su_nationkey = n_nationkey
AND n_name = 'Germany'
ORDER BY su_name;
su_name | su_address
---------------------------+------------
abc | def
(1 row)
-- Query 21
-- DATA SET DOES NOT COVER THIS QUERY
SELECT
su_name,
count(*) AS numwait
FROM
supplier,
order_line l1,
oorder,
stock,
nation
WHERE ol_o_id = o_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_w_id = s_w_id
AND ol_i_id = s_i_id
AND mod((s_w_id * s_i_id),10000) = su_suppkey
AND l1.ol_delivery_d > o_entry_d
AND NOT exists (SELECT *
FROM order_line l2
WHERE l2.ol_o_id = l1.ol_o_id
AND l2.ol_w_id = l1.ol_w_id
AND l2.ol_d_id = l1.ol_d_id
AND l2.ol_delivery_d > l1.ol_delivery_d)
AND su_nationkey = n_nationkey
AND n_name = 'Germany'
GROUP BY su_name
ORDER BY
numwait desc,
su_name;
su_name | numwait
---------+---------
(0 rows)
-- Query 22
-- DATA SET DOES NOT COVER THIS QUERY
SELECT
substr(c_state,1,1) AS country,
count(*) AS numcust,
sum(c_balance) AS totacctbal
FROM customer
WHERE substr(c_phone,1,1) in ('1','2','3','4','5','6','7')
AND c_balance > (SELECT avg(c_BALANCE)
FROM customer
WHERE c_balance > 0.00
AND substr(c_phone,1,1) in ('1','2','3','4','5','6','7'))
AND NOT exists (SELECT *
FROM oorder
WHERE o_c_id = c_id
AND o_w_id = c_w_id
AND o_d_id = c_d_id)
GROUP BY substr(c_state,1,1)
ORDER BY substr(c_state,1,1);
country | numcust | totacctbal
---------+---------+------------
(0 rows)
SET client_min_messages TO WARNING;
DROP SCHEMA chbenchmark_all_queries CASCADE;

View File

@ -0,0 +1,56 @@
SET citus.next_shard_id TO 1670000;
CREATE SCHEMA expression_reference_join;
SET search_path TO expression_reference_join;
SET citus.shard_count TO 4;
SET citus.enable_repartition_joins TO on;
CREATE TABLE ref (a int, b int);
CREATE TABLE test (x int, y int);
INSERT INTO ref VALUES
(2,2),
(4,4);
INSERT INTO test VALUES
(1,2),
(2,2);
SELECT create_reference_table('ref');
NOTICE: Copying data from local table...
create_reference_table
------------------------
(1 row)
SELECT create_distributed_table('test', 'x');
NOTICE: Copying data from local table...
create_distributed_table
--------------------------
(1 row)
-- PR 3180 implements expressions in join clauses to reference tables to support CHbenCHmark queries 7/8/9
-- plannable as a repartition + reference join, now with an expression in the join clause
SELECT *
FROM
test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable
ref a
WHERE t2.y * 2 = a.a
ORDER BY 1,2,3;
y | x | x | a | b
---+---+---+---+---
2 | 1 | 1 | 4 | 4
2 | 1 | 2 | 4 | 4
2 | 2 | 1 | 4 | 4
2 | 2 | 2 | 4 | 4
(4 rows)
-- The join clause is wider than it used to be, causing this query to be recognized by the LogicalPlanner as a repartition join.
-- Unplannable query due to a three-way join which causes no valid path (besides the cartesian product) to be found
SELECT *
FROM
test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable
ref a,
ref b
WHERE t2.y - a.a - b.b = 0
ORDER BY 1,2,3;
ERROR: cannot perform distributed planning on this query
DETAIL: Cartesian products are currently unsupported
SET client_min_messages TO WARNING;
DROP SCHEMA expression_reference_join CASCADE;

View File

@ -243,7 +243,7 @@ DEBUG: dual partition column types do not match
DEBUG: single partition column types do not match
DEBUG: single partition column types do not match
DEBUG: dual partition column types do not match
LOG: join order: [ "single_hash_repartition_first" ][ local partition join "single_hash_repartition_first" ][ cartesian product "single_hash_repartition_second" ]
LOG: join order: [ "single_hash_repartition_second" ][ cartesian product "single_hash_repartition_first" ][ dual partition join "single_hash_repartition_first" ]
ERROR: cannot perform distributed planning on this query
DETAIL: Cartesian products are currently unsupported
-- single repartition query in CTE

View File

@ -73,7 +73,7 @@ test: multi_reference_table multi_select_for_update relation_access_tracking
test: custom_aggregate_support aggregate_support
test: multi_average_expression multi_working_columns multi_having_pushdown
test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown
test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having ch_bench_subquery_repartition
test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having ch_bench_subquery_repartition chbenchmark_all_queries expression_reference_join
test: multi_agg_type_conversion multi_count_type_conversion
test: multi_partition_pruning single_hash_repartition_join
test: multi_join_pruning multi_hash_pruning

View File

@ -0,0 +1,722 @@
SET citus.next_shard_id TO 1650000;
CREATE SCHEMA chbenchmark_all_queries;
SET search_path TO chbenchmark_all_queries;
SET citus.enable_repartition_joins TO on;
CREATE TABLE order_line (
ol_w_id int NOT NULL,
ol_d_id int NOT NULL,
ol_o_id int NOT NULL,
ol_number int NOT NULL,
ol_i_id int NOT NULL,
ol_delivery_d timestamp NULL DEFAULT NULL,
ol_amount decimal(6,2) NOT NULL,
ol_supply_w_id int NOT NULL,
ol_quantity decimal(2,0) NOT NULL,
ol_dist_info char(24) NOT NULL,
PRIMARY KEY (ol_w_id,ol_d_id,ol_o_id,ol_number)
);
CREATE TABLE new_order (
no_w_id int NOT NULL,
no_d_id int NOT NULL,
no_o_id int NOT NULL,
PRIMARY KEY (no_w_id,no_d_id,no_o_id)
);
CREATE TABLE stock (
s_w_id int NOT NULL,
s_i_id int NOT NULL,
s_quantity decimal(4,0) NOT NULL,
s_ytd decimal(8,2) NOT NULL,
s_order_cnt int NOT NULL,
s_remote_cnt int NOT NULL,
s_data varchar(50) NOT NULL,
s_dist_01 char(24) NOT NULL,
s_dist_02 char(24) NOT NULL,
s_dist_03 char(24) NOT NULL,
s_dist_04 char(24) NOT NULL,
s_dist_05 char(24) NOT NULL,
s_dist_06 char(24) NOT NULL,
s_dist_07 char(24) NOT NULL,
s_dist_08 char(24) NOT NULL,
s_dist_09 char(24) NOT NULL,
s_dist_10 char(24) NOT NULL,
PRIMARY KEY (s_w_id,s_i_id)
);
CREATE TABLE oorder (
o_w_id int NOT NULL,
o_d_id int NOT NULL,
o_id int NOT NULL,
o_c_id int NOT NULL,
o_carrier_id int DEFAULT NULL,
o_ol_cnt decimal(2,0) NOT NULL,
o_all_local decimal(1,0) NOT NULL,
o_entry_d timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (o_w_id,o_d_id,o_id),
UNIQUE (o_w_id,o_d_id,o_c_id,o_id)
);
CREATE TABLE history (
h_c_id int NOT NULL,
h_c_d_id int NOT NULL,
h_c_w_id int NOT NULL,
h_d_id int NOT NULL,
h_w_id int NOT NULL,
h_date timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
h_amount decimal(6,2) NOT NULL,
h_data varchar(24) NOT NULL
);
CREATE TABLE customer (
c_w_id int NOT NULL,
c_d_id int NOT NULL,
c_id int NOT NULL,
c_discount decimal(4,4) NOT NULL,
c_credit char(2) NOT NULL,
c_last varchar(16) NOT NULL,
c_first varchar(16) NOT NULL,
c_credit_lim decimal(12,2) NOT NULL,
c_balance decimal(12,2) NOT NULL,
c_ytd_payment float NOT NULL,
c_payment_cnt int NOT NULL,
c_delivery_cnt int NOT NULL,
c_street_1 varchar(20) NOT NULL,
c_street_2 varchar(20) NOT NULL,
c_city varchar(20) NOT NULL,
c_state char(2) NOT NULL,
c_zip char(9) NOT NULL,
c_phone char(16) NOT NULL,
c_since timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
c_middle char(2) NOT NULL,
c_data varchar(500) NOT NULL,
PRIMARY KEY (c_w_id,c_d_id,c_id)
);
CREATE TABLE district (
d_w_id int NOT NULL,
d_id int NOT NULL,
d_ytd decimal(12,2) NOT NULL,
d_tax decimal(4,4) NOT NULL,
d_next_o_id int NOT NULL,
d_name varchar(10) NOT NULL,
d_street_1 varchar(20) NOT NULL,
d_street_2 varchar(20) NOT NULL,
d_city varchar(20) NOT NULL,
d_state char(2) NOT NULL,
d_zip char(9) NOT NULL,
PRIMARY KEY (d_w_id,d_id)
);
CREATE TABLE item (
i_id int NOT NULL,
i_name varchar(24) NOT NULL,
i_price decimal(5,2) NOT NULL,
i_data varchar(50) NOT NULL,
i_im_id int NOT NULL,
PRIMARY KEY (i_id)
);
CREATE TABLE warehouse (
w_id int NOT NULL,
w_ytd decimal(12,2) NOT NULL,
w_tax decimal(4,4) NOT NULL,
w_name varchar(10) NOT NULL,
w_street_1 varchar(20) NOT NULL,
w_street_2 varchar(20) NOT NULL,
w_city varchar(20) NOT NULL,
w_state char(2) NOT NULL,
w_zip char(9) NOT NULL,
PRIMARY KEY (w_id)
);
CREATE TABLE region (
r_regionkey int not null,
r_name char(55) not null,
r_comment char(152) not null,
PRIMARY KEY ( r_regionkey )
);
CREATE TABLE nation (
n_nationkey int not null,
n_name char(25) not null,
n_regionkey int not null,
n_comment char(152) not null,
PRIMARY KEY ( n_nationkey )
);
CREATE TABLE supplier (
su_suppkey int not null,
su_name char(25) not null,
su_address varchar(40) not null,
su_nationkey int not null,
su_phone char(15) not null,
su_acctbal numeric(12,2) not null,
su_comment char(101) not null,
PRIMARY KEY ( su_suppkey )
);
SELECT create_distributed_table('order_line','ol_w_id');
SELECT create_distributed_table('new_order','no_w_id');
SELECT create_distributed_table('stock','s_w_id');
SELECT create_distributed_table('oorder','o_w_id');
SELECT create_distributed_table('history','h_w_id');
SELECT create_distributed_table('customer','c_w_id');
SELECT create_distributed_table('district','d_w_id');
SELECT create_distributed_table('warehouse','w_id');
SELECT create_reference_table('item');
SELECT create_reference_table('region');
SELECT create_reference_table('nation');
SELECT create_reference_table('supplier');
TRUNCATE order_line, new_order, stock, oorder, history, customer, district, warehouse, item, region, nation, supplier; -- for easy copy in development
INSERT INTO supplier SELECT c, 'abc', 'def', c, 'ghi', c, 'jkl' FROM generate_series(0,10) AS c;
INSERT INTO new_order SELECT c, c, c FROM generate_series(0,10) AS c;
INSERT INTO stock SELECT c,c,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c;
INSERT INTO stock SELECT c, 5000,c,c,c,c, 'abc','abc','abc','abc','abc','abc','abc','abc','abc','abc','abc' FROM generate_series(1,3) AS c; -- mod(2*5000,10000) == 0
INSERT INTO order_line SELECT c, c, c, c, c, '2008-10-17 00:00:00.000000', c, c, c, 'abc' FROM generate_series(0,10) AS c;
INSERT INTO oorder SELECT c, c, c, c, c, 1, 1, '2008-10-17 00:00:00.000000' FROM generate_series(0,10) AS c;
INSERT INTO customer SELECT c, c, c, 0, 'XX', 'John', 'Doe', 1000, 0, 0, c, c, 'Name', 'Street', 'Some City', 'CA', '12345', '+1 000 0000000', '2007-01-02 00:00:00.000000', 'NA', 'nothing special' FROM generate_series(0,10) AS c;
INSERT INTO item SELECT c, 'Keyboard', 50, 'co b', c FROM generate_series(0,10) AS c; --co% and %b filters all around
INSERT INTO region VALUES
(1, 'Not Europe', 'Big'),
(2, 'Europe', 'Big');
INSERT INTO nation VALUES
(1, 'United States', 1, 'Also Big'),
(4, 'The Netherlands', 2, 'Flat'),
(9, 'Germany', 2, 'Germany must be in here for Q7'),
(67, 'Cambodia', 2, 'I don''t understand how we got from California to Cambodia but I will take it, it also is not in Europe, but we need it to be for Q8');
-- Query 1
SELECT
ol_number,
sum(ol_quantity) as sum_qty,
sum(ol_amount) as sum_amount,
avg(ol_quantity) as avg_qty,
avg(ol_amount) as avg_amount,
count(*) as count_order
FROM order_line
WHERE ol_delivery_d > '2007-01-02 00:00:00.000000'
GROUP BY ol_number
ORDER BY ol_number;
-- Query 2
SELECT
su_suppkey,
su_name,
n_name,
i_id,
i_name,
su_address,
su_phone,
su_comment
FROM
item,
supplier,
stock,
nation,
region,
(SELECT
s_i_id AS m_i_id,
min(s_quantity) as m_s_quantity
FROM
stock,
supplier,
nation,
region
WHERE mod((s_w_id*s_i_id),10000)=su_suppkey
AND su_nationkey=n_nationkey
AND n_regionkey=r_regionkey
AND r_name LIKE 'Europ%'
GROUP BY s_i_id) m
WHERE i_id = s_i_id
AND mod((s_w_id * s_i_id), 10000) = su_suppkey
AND su_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND i_data LIKE '%b'
AND r_name LIKE 'Europ%'
AND i_id = m_i_id
AND s_quantity = m_s_quantity
ORDER BY
n_name,
su_name,
i_id;
-- Query 3
SELECT
ol_o_id,
ol_w_id,
ol_d_id,
sum(ol_amount) AS revenue,
o_entry_d
FROM
customer,
new_order,
oorder,
order_line
WHERE c_state LIKE 'C%' -- used to ba A%, but C% works with our small data
AND c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND no_w_id = o_w_id
AND no_d_id = o_d_id
AND no_o_id = o_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND o_entry_d > '2007-01-02 00:00:00.000000'
GROUP BY
ol_o_id,
ol_w_id,
ol_d_id,
o_entry_d
ORDER BY
revenue DESC,
o_entry_d;
-- Query 4
SELECT
o_ol_cnt,
count(*) as order_count
FROM
oorder
WHERE o_entry_d >= '2007-01-02 00:00:00.000000'
AND o_entry_d < '2012-01-02 00:00:00.000000'
AND exists (SELECT *
FROM order_line
WHERE o_id = ol_o_id
AND o_w_id = ol_w_id
AND o_d_id = ol_d_id
AND ol_delivery_d >= o_entry_d)
GROUP BY o_ol_cnt
ORDER BY o_ol_cnt;
-- Query 5
SELECT
n_name,
sum(ol_amount) AS revenue
FROM
customer,
oorder,
order_line,
stock,
supplier,
nation,
region
WHERE c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND ol_o_id = o_id
AND ol_w_id = o_w_id
AND ol_d_id=o_d_id
AND ol_w_id = s_w_id
AND ol_i_id = s_i_id
AND mod((s_w_id * s_i_id),10000) = su_suppkey
-- our dataset does not have the supplier in the same nation as the customer causing this
-- join to filter out all the data. We verify later on that we can actually perform an
-- ascii(substr(c_state,1,1)) == reference table column join later on so it should not
-- matter we skip this filter here.
--AND ascii(substr(c_state,1,1)) = su_nationkey
AND su_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'Europe'
AND o_entry_d >= '2007-01-02 00:00:00.000000'
GROUP BY n_name
ORDER BY revenue DESC;
-- Query 6
SELECT
sum(ol_amount) AS revenue
FROM order_line
WHERE ol_delivery_d >= '1999-01-01 00:00:00.000000'
AND ol_delivery_d < '2020-01-01 00:00:00.000000'
AND ol_quantity BETWEEN 1 AND 100000;
-- Query 7
SELECT
su_nationkey as supp_nation,
substr(c_state,1,1) as cust_nation,
extract(year from o_entry_d) as l_year,
sum(ol_amount) as revenue
FROM
supplier,
stock,
order_line,
oorder,
customer,
nation n1,
nation n2
WHERE ol_supply_w_id = s_w_id
AND ol_i_id = s_i_id
AND mod((s_w_id * s_i_id), 10000) = su_suppkey
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND su_nationkey = n1.n_nationkey
AND ascii(substr(c_state,1,1)) = n2.n_nationkey
AND (
(n1.n_name = 'Germany' AND n2.n_name = 'Cambodia')
OR (n1.n_name = 'Cambodia' AND n2.n_name = 'Germany')
)
AND ol_delivery_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000'
GROUP BY
su_nationkey,
substr(c_state,1,1),
extract(year from o_entry_d)
ORDER BY
su_nationkey,
cust_nation,
l_year;
-- Query 8
SELECT
extract(year from o_entry_d) as l_year,
sum(case when n2.n_name = 'Germany' then ol_amount else 0 end) / sum(ol_amount) as mkt_share
FROM
item,
supplier,
stock,
order_line,
oorder,
customer,
nation n1,
nation n2,
region
WHERE i_id = s_i_id
AND ol_i_id = s_i_id
AND ol_supply_w_id = s_w_id
AND mod((s_w_id * s_i_id),10000) = su_suppkey
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND n1.n_nationkey = ascii(substr(c_state,1,1))
AND n1.n_regionkey = r_regionkey
AND ol_i_id < 1000
AND r_name = 'Europe'
AND su_nationkey = n2.n_nationkey
AND o_entry_d BETWEEN '2007-01-02 00:00:00.000000' AND '2012-01-02 00:00:00.000000'
AND i_data LIKE '%b'
AND i_id = ol_i_id
GROUP BY extract(YEAR FROM o_entry_d)
ORDER BY l_year;
-- Query 9
SELECT
n_name,
extract(year from o_entry_d) as l_year,
sum(ol_amount) as sum_profit
FROM
item,
stock,
supplier,
order_line,
oorder,
nation
WHERE ol_i_id = s_i_id
AND ol_supply_w_id = s_w_id
AND mod((s_w_id * s_i_id), 10000) = su_suppkey
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND ol_i_id = i_id
AND su_nationkey = n_nationkey
AND i_data LIKE '%b' -- this used to be %BB but that will not work with our small dataset
GROUP BY
n_name,
extract(YEAR FROM o_entry_d)
ORDER BY
n_name,
l_year DESC;
-- Query 10
SELECT
c_id,
c_last,
sum(ol_amount) AS revenue,
c_city,
c_phone,
n_name
FROM
customer,
oorder,
order_line,
nation
WHERE c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND o_entry_d >= '2007-01-02 00:00:00.000000'
AND o_entry_d <= ol_delivery_d
AND n_nationkey = ascii(substr(c_state,1,1))
GROUP BY
c_id,
c_last,
c_city,
c_phone,
n_name
ORDER BY revenue DESC;
-- Query 11
SELECT
s_i_id,
sum(s_order_cnt) AS ordercount
FROM
stock,
supplier,
nation
WHERE mod((s_w_id * s_i_id),10000) = su_suppkey
AND su_nationkey = n_nationkey
AND n_name = 'Germany'
GROUP BY s_i_id
HAVING sum(s_order_cnt) >
(SELECT sum(s_order_cnt) * .005
FROM
stock,
supplier,
nation
WHERE mod((s_w_id * s_i_id),10000) = su_suppkey
AND su_nationkey = n_nationkey
AND n_name = 'Germany')
ORDER BY ordercount DESC;
-- Query 12
SELECT
o_ol_cnt,
sum(case when o_carrier_id = 1 or o_carrier_id = 2 then 1 else 0 end) as high_line_count,
sum(case when o_carrier_id <> 1 and o_carrier_id <> 2 then 1 else 0 end) as low_line_count
FROM
oorder,
order_line
WHERE ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
AND o_entry_d <= ol_delivery_d
AND ol_delivery_d < '2020-01-01 00:00:00.000000'
GROUP BY o_ol_cnt
ORDER BY o_ol_cnt;
-- Query 13
SELECT
c_count,
count(*) AS custdist
FROM (SELECT
c_id,
count(o_id)
FROM customer
LEFT OUTER JOIN oorder ON (
c_w_id = o_w_id
AND c_d_id = o_d_id
AND c_id = o_c_id
AND o_carrier_id > 8)
GROUP BY c_id) AS c_orders (c_id, c_count)
GROUP BY c_count
ORDER BY
custdist DESC,
c_count DESC;
-- Query 14
SELECT
100.00 * sum(CASE WHEN i_data LIKE 'PR%' THEN ol_amount ELSE 0 END) / (1+sum(ol_amount)) AS promo_revenue
FROM
order_line,
item
WHERE ol_i_id = i_id
AND ol_delivery_d >= '2007-01-02 00:00:00.000000'
AND ol_delivery_d < '2020-01-02 00:00:00.000000';
-- Query 15
WITH revenue (supplier_no, total_revenue) AS (
SELECT
mod((s_w_id * s_i_id),10000) AS supplier_no,
sum(ol_amount) AS total_revenue
FROM
order_line,
stock
WHERE ol_i_id = s_i_id
AND ol_supply_w_id = s_w_id
AND ol_delivery_d >= '2007-01-02 00:00:00.000000'
GROUP BY mod((s_w_id * s_i_id),10000))
SELECT
su_suppkey,
su_name,
su_address,
su_phone,
total_revenue
FROM
supplier,
revenue
WHERE su_suppkey = supplier_no
AND total_revenue = (SELECT max(total_revenue) FROM revenue)
ORDER BY su_suppkey;
--Q16
SELECT
i_name,
substr(i_data, 1, 3) AS brand,
i_price,
count(DISTINCT (mod((s_w_id * s_i_id),10000))) AS supplier_cnt
FROM
stock,
item
WHERE i_id = s_i_id
AND i_data NOT LIKE 'zz%'
AND (mod((s_w_id * s_i_id),10000) NOT IN
(SELECT su_suppkey
FROM supplier
WHERE su_comment LIKE '%bad%'))
GROUP BY
i_name,
substr(i_data, 1, 3),
i_price
ORDER BY supplier_cnt DESC;
--Q17
SELECT
sum(ol_amount) / 2.0 AS avg_yearly
FROM
order_line,
(SELECT
i_id,
avg(ol_quantity) AS a
FROM
item,
order_line
WHERE i_data LIKE '%b'
AND ol_i_id = i_id
GROUP BY i_id) t
WHERE ol_i_id = t.i_id;
-- this filter was at the end causing the dataset to be empty. it should not have any
-- influence on how the query gets planned so I removed the clause
--AND ol_quantity < t.a;
-- Query 18
SELECT
c_last,
c_id o_id,
o_entry_d,
o_ol_cnt,
sum(ol_amount)
FROM
customer,
oorder,
order_line
WHERE c_id = o_c_id
AND c_w_id = o_w_id
AND c_d_id = o_d_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_o_id = o_id
GROUP BY
o_id,
o_w_id,
o_d_id,
c_id,
c_last,
o_entry_d,
o_ol_cnt
HAVING sum(ol_amount) > 5 -- was 200, but thats too big for the dataset
ORDER BY
sum(ol_amount) DESC,
o_entry_d;
-- Query 19
SELECT
sum(ol_amount) AS revenue
FROM
order_line,
item
WHERE ( ol_i_id = i_id
AND i_data LIKE '%a'
AND ol_quantity >= 1
AND ol_quantity <= 10
AND i_price BETWEEN 1 AND 400000
AND ol_w_id IN (1,2,3))
OR ( ol_i_id = i_id
AND i_data LIKE '%b'
AND ol_quantity >= 1
AND ol_quantity <= 10
AND i_price BETWEEN 1 AND 400000
AND ol_w_id IN (1,2,4))
OR ( ol_i_id = i_id
AND i_data LIKE '%c'
AND ol_quantity >= 1
AND ol_quantity <= 10
AND i_price BETWEEN 1 AND 400000
AND ol_w_id IN (1,5,3));
-- Query 20
SELECT
su_name,
su_address
FROM
supplier,
nation
WHERE su_suppkey in
(SELECT
mod(s_i_id * s_w_id, 10000)
FROM
stock,
order_line
WHERE s_i_id IN
(SELECT i_id
FROM item
WHERE i_data LIKE 'co%')
AND ol_i_id = s_i_id
AND ol_delivery_d > '2008-05-23 12:00:00' -- was 2010, but our order is in 2008
GROUP BY s_i_id, s_w_id, s_quantity
HAVING 2*s_quantity > sum(ol_quantity))
AND su_nationkey = n_nationkey
AND n_name = 'Germany'
ORDER BY su_name;
-- Query 21
-- DATA SET DOES NOT COVER THIS QUERY
SELECT
su_name,
count(*) AS numwait
FROM
supplier,
order_line l1,
oorder,
stock,
nation
WHERE ol_o_id = o_id
AND ol_w_id = o_w_id
AND ol_d_id = o_d_id
AND ol_w_id = s_w_id
AND ol_i_id = s_i_id
AND mod((s_w_id * s_i_id),10000) = su_suppkey
AND l1.ol_delivery_d > o_entry_d
AND NOT exists (SELECT *
FROM order_line l2
WHERE l2.ol_o_id = l1.ol_o_id
AND l2.ol_w_id = l1.ol_w_id
AND l2.ol_d_id = l1.ol_d_id
AND l2.ol_delivery_d > l1.ol_delivery_d)
AND su_nationkey = n_nationkey
AND n_name = 'Germany'
GROUP BY su_name
ORDER BY
numwait desc,
su_name;
-- Query 22
-- DATA SET DOES NOT COVER THIS QUERY
SELECT
substr(c_state,1,1) AS country,
count(*) AS numcust,
sum(c_balance) AS totacctbal
FROM customer
WHERE substr(c_phone,1,1) in ('1','2','3','4','5','6','7')
AND c_balance > (SELECT avg(c_BALANCE)
FROM customer
WHERE c_balance > 0.00
AND substr(c_phone,1,1) in ('1','2','3','4','5','6','7'))
AND NOT exists (SELECT *
FROM oorder
WHERE o_c_id = c_id
AND o_w_id = c_w_id
AND o_d_id = c_d_id)
GROUP BY substr(c_state,1,1)
ORDER BY substr(c_state,1,1);
SET client_min_messages TO WARNING;
DROP SCHEMA chbenchmark_all_queries CASCADE;

View File

@ -0,0 +1,41 @@
SET citus.next_shard_id TO 1670000;
CREATE SCHEMA expression_reference_join;
SET search_path TO expression_reference_join;
SET citus.shard_count TO 4;
SET citus.enable_repartition_joins TO on;
CREATE TABLE ref (a int, b int);
CREATE TABLE test (x int, y int);
INSERT INTO ref VALUES
(2,2),
(4,4);
INSERT INTO test VALUES
(1,2),
(2,2);
SELECT create_reference_table('ref');
SELECT create_distributed_table('test', 'x');
-- PR 3180 implements expressions in join clauses to reference tables to support CHbenCHmark queries 7/8/9
-- plannable as a repartition + reference join, now with an expression in the join clause
SELECT *
FROM
test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable
ref a
WHERE t2.y * 2 = a.a
ORDER BY 1,2,3;
-- The join clause is wider than it used to be, causing this query to be recognized by the LogicalPlanner as a repartition join.
-- Unplannable query due to a three-way join which causes no valid path (besides the cartesian product) to be found
SELECT *
FROM
test t1 JOIN test t2 USING (y), -- causes repartition, which makes this not routable or pushdownable
ref a,
ref b
WHERE t2.y - a.a - b.b = 0
ORDER BY 1,2,3;
SET client_min_messages TO WARNING;
DROP SCHEMA expression_reference_join CASCADE;