Merge pull request #1852 from citusdata/group_by_on_function

Treat recurring tuples as reference tables for GROUP BY checks
2017-12-13 16:37:31 +03:00 · 2017-12-13 16:37:31 +03:00 · b5784ca03a
parent f0851257fa 86b2d9420c
commit b5784ca03a
3 changed files with 358 additions and 30 deletions
--- a/src/backend/distributed/planner/multi_logical_planner.c
+++ b/src/backend/distributed/planner/multi_logical_planner.c
@ -109,7 +109,8 @@ static bool ExtractSetOperationStatmentWalker(Node *node, List **setOperationLis
 static DeferredErrorMessage * DeferErrorIfUnsupportedTableCombination(Query *queryTree);
 static bool WindowPartitionOnDistributionColumn(Query *query);
 static bool AllTargetExpressionsAreColumnReferences(List *targetEntryList);
-static bool RangeTableListContainsOnlyReferenceTables(List *rangeTableList);
+static bool FindNodeCheckInRangeTableList(List *rtable, bool (*check)(Node *));
+static bool IsDistributedTableRTE(Node *node);
 static FieldSelect * CompositeFieldRecursive(Expr *expression, Query *query);
 static bool FullCompositeFieldList(List *compositeFieldList);
 static MultiNode * MultiNodeTree(Query *queryTree);
@ -1510,12 +1511,12 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList)

 	/*
 	 * We could still behave as if the target list is on partition column if
-	 * all range table entries are reference tables and all target expressions
-	 * are column references to the given query level.
+	 * all range table entries are reference tables or intermediate results,
+	 * and all target expressions are column references to the given query level.
 	 */
 	if (!targetListOnPartitionColumn)
 	{
-		if (RangeTableListContainsOnlyReferenceTables(query->rtable) &&
+		if (!FindNodeCheckInRangeTableList(query->rtable, IsDistributedTableRTE) &&
 			AllTargetExpressionsAreColumnReferences(targetEntryList))
 		{
 			targetListOnPartitionColumn = true;
@ -1580,40 +1581,51 @@ AllTargetExpressionsAreColumnReferences(List *targetEntryList)


 /*
- * RangeTableListContainsOnlyReferenceTables returns true if all range table
- * entries are reference tables.
+ * FindNodeCheckInRangeTableList finds a node for which the check
+ * function returns true.
 *
- * The function returns false for range table entries that are not relations.
- *
- * Note that the function doesn't recurse into subqueries, returns false when
- * a subquery is found.
+ * FindNodeCheckInRangeTableList relies on FindNodeCheck() but only
+ * considers the range table entries.
 */
 static bool
-RangeTableListContainsOnlyReferenceTables(List *rangeTableList)
+FindNodeCheckInRangeTableList(List *rtable, bool (*check)(Node *))
 {
-	ListCell *rangeTableCell = NULL;
-	foreach(rangeTableCell, rangeTableList)
-	{
-		RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
+	return range_table_walker(rtable, FindNodeCheck, check, QTW_EXAMINE_RTES);
+}

-		if (rangeTableEntry->rtekind == RTE_RELATION)
-		{
-			Oid relationId = rangeTableEntry->relid;

-			if (!IsDistributedTable(relationId))
+/*
+ * IsDistributedTableRTE gets a node and returns true if the node
+ * is a range table relation entry that points to a distributed
+ * relation (i.e., excluding reference tables).
+ */
+static bool
+IsDistributedTableRTE(Node *node)
+{
+	RangeTblEntry *rangeTableEntry = NULL;
+	Oid relationId = InvalidOid;
+
+	if (node == NULL)
 	{
 		return false;
 	}

-			if (PartitionMethod(relationId) != DISTRIBUTE_BY_NONE)
+	if (!IsA(node, RangeTblEntry))
 	{
 		return false;
 	}
-		}
-		else
+
+	rangeTableEntry = (RangeTblEntry *) node;
+	if (rangeTableEntry->rtekind != RTE_RELATION)
 	{
 		return false;
 	}
+
+	relationId = rangeTableEntry->relid;
+	if (!IsDistributedTable(relationId) ||
+		PartitionMethod(relationId) == DISTRIBUTE_BY_NONE)
+	{
+		return false;
 	}

 	return true;
--- a/src/test/regress/expected/multi_subquery_complex_reference_clause.out
+++ b/src/test/regress/expected/multi_subquery_complex_reference_clause.out
@ -1361,6 +1361,186 @@ ORDER BY 1 DESC
 LIMIT 4;
 ERROR:  cannot push down this subquery
 DETAIL:  Distinct on columns without partition column is currently unsupported
+-- test the read_intermediate_result() for GROUP BYs
+BEGIN;
+ 
+SELECT broadcast_intermediate_result('squares', 'SELECT s, s*s FROM generate_series(1,200) s');
+ broadcast_intermediate_result 
+-------------------------------
+                           200
+(1 row)
+
+-- single appereance of read_intermediate_result
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(res.val) as mx 
+FROM 
+  read_intermediate_result('squares', 'binary') AS res (val int, val_square int) 
+GROUP BY res.val_square) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ user_id 
+---------
+       1
+       2
+       3
+       4
+       5
+(5 rows)
+
+-- similar to the above, with DISTINCT on intermediate result
+SELECT DISTINCT user_id
+FROM users_table
+JOIN
+  (SELECT DISTINCT res.val AS mx
+   FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) squares ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ user_id 
+---------
+       1
+       2
+       3
+       4
+       5
+(5 rows)
+
+-- single appereance of read_intermediate_result but inside a subquery
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN (
+  SELECT *,random() FROM (SELECT 
+    max(res.val) as mx 
+  FROM 
+      (SELECT val, val_square FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) res 
+  GROUP BY res.val_square) foo)
+squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ user_id 
+---------
+       1
+       2
+       3
+       4
+       5
+(5 rows)
+
+-- multiple read_intermediate_results in the same subquery is OK
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(res.val) as mx 
+FROM 
+  read_intermediate_result('squares', 'binary') AS res (val int, val_square int),
+  read_intermediate_result('squares', 'binary') AS res2 (val int, val_square int) 
+WHERE res.val = res2.val_square
+GROUP BY res2.val_square) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ user_id 
+---------
+       1
+       4
+(2 rows)
+
+-- mixed recurring tuples should be supported
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(res.val) as mx 
+FROM 
+  read_intermediate_result('squares', 'binary') AS res (val int, val_square int), 
+  generate_series(0, 10) i
+  WHERE
+  res.val = i
+  GROUP BY 
+    i) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ user_id 
+---------
+       1
+       2
+       3
+       4
+       5
+(5 rows)
+
+-- should error out since there is a distributed table and 
+-- there are no columns on the GROUP BY from the distributed table
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_reference_table
+JOIN 
+  (SELECT 
+    max(val_square) as mx 
+  FROM 
+    read_intermediate_result('squares', 'binary') AS res (val int, val_square int), events_table 
+  WHERE 
+    events_table.user_id = res.val GROUP BY res.val) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ERROR:  cannot push down this subquery
+DETAIL:  Group by list without partition column is currently unsupported
+ROLLBACK;
+-- should work since we're using an immutable function as recurring tuple
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(i+5)as mx 
+FROM 
+  generate_series(0, 10) as i GROUP BY i) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ user_id 
+---------
+       5
+       6
+(2 rows)
+
+-- should not work since we're 
+-- using an immutable function as recurring tuple
+-- along with a distributed table, where GROUP BY is 
+-- on the recurring tuple
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_reference_table
+JOIN 
+  (SELECT 
+    max(i+5)as mx 
+  FROM 
+     generate_series(0, 10) as i, events_table 
+  WHERE 
+    events_table.user_id = i GROUP BY i) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+ERROR:  cannot push down this subquery
+DETAIL:  Group by list without partition column is currently unsupported
 DROP TABLE user_buy_test_table;
 DROP TABLE users_ref_test_table;
 DROP TABLE users_return_test_table;
--- a/src/test/regress/sql/multi_subquery_complex_reference_clause.sql
+++ b/src/test/regress/sql/multi_subquery_complex_reference_clause.sql
@ -1089,6 +1089,142 @@ SELECT * FROM
 ORDER BY 1 DESC 
 LIMIT 4;

+
+
+-- test the read_intermediate_result() for GROUP BYs
+BEGIN;
+ 
+SELECT broadcast_intermediate_result('squares', 'SELECT s, s*s FROM generate_series(1,200) s');
+
+-- single appereance of read_intermediate_result
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(res.val) as mx 
+FROM 
+  read_intermediate_result('squares', 'binary') AS res (val int, val_square int) 
+GROUP BY res.val_square) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+-- similar to the above, with DISTINCT on intermediate result
+SELECT DISTINCT user_id
+FROM users_table
+JOIN
+  (SELECT DISTINCT res.val AS mx
+   FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) squares ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+-- single appereance of read_intermediate_result but inside a subquery
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN (
+  SELECT *,random() FROM (SELECT 
+    max(res.val) as mx 
+  FROM 
+      (SELECT val, val_square FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) res 
+  GROUP BY res.val_square) foo)
+squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+-- multiple read_intermediate_results in the same subquery is OK
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(res.val) as mx 
+FROM 
+  read_intermediate_result('squares', 'binary') AS res (val int, val_square int),
+  read_intermediate_result('squares', 'binary') AS res2 (val int, val_square int) 
+WHERE res.val = res2.val_square
+GROUP BY res2.val_square) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+-- mixed recurring tuples should be supported
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(res.val) as mx 
+FROM 
+  read_intermediate_result('squares', 'binary') AS res (val int, val_square int), 
+  generate_series(0, 10) i
+  WHERE
+  res.val = i
+  GROUP BY 
+    i) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+-- should error out since there is a distributed table and 
+-- there are no columns on the GROUP BY from the distributed table
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_reference_table
+JOIN 
+  (SELECT 
+    max(val_square) as mx 
+  FROM 
+    read_intermediate_result('squares', 'binary') AS res (val int, val_square int), events_table 
+  WHERE 
+    events_table.user_id = res.val GROUP BY res.val) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+ROLLBACK;
+
+-- should work since we're using an immutable function as recurring tuple
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_table 
+JOIN 
+(SELECT 
+  max(i+5)as mx 
+FROM 
+  generate_series(0, 10) as i GROUP BY i) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
+
+-- should not work since we're 
+-- using an immutable function as recurring tuple
+-- along with a distributed table, where GROUP BY is 
+-- on the recurring tuple
+SELECT 
+  DISTINCT user_id 
+FROM 
+  users_reference_table
+JOIN 
+  (SELECT 
+    max(i+5)as mx 
+  FROM 
+     generate_series(0, 10) as i, events_table 
+  WHERE 
+    events_table.user_id = i GROUP BY i) squares
+ ON (mx = user_id)
+ORDER BY 1
+LIMIT 5;
+
 DROP TABLE user_buy_test_table;
 DROP TABLE users_ref_test_table;
 DROP TABLE users_return_test_table;