Handle extra columns added in ExpandWorkerTargetEntry() in repartitioned INSERT/SELECT

pull/3376/head
Hadi Moshayedi 2020-01-12 17:56:49 -08:00
parent 89463f9760
commit 42c3c03b85
5 changed files with 136 additions and 0 deletions

View File

@ -78,6 +78,7 @@ static int PartitionColumnIndex(List *insertTargetList, Var *partitionColumn);
static bool IsRedistributablePlan(Plan *selectPlan, bool hasReturning);
static Expr * CastExpr(Expr *expr, Oid sourceType, Oid targetType, Oid targetCollation,
int targetTypeMod);
static void WrapTaskListForProjection(List *taskList, List *projectedTargetEntries);
/*
@ -233,6 +234,19 @@ CoordinatorInsertSelectExecScanInternal(CustomScanState *node)
partitionColumnIndex, quote_literal_cstr(
partitionColumnName))));
/*
* ExpandWorkerTargetEntry() can add additional columns to the worker
* query. Modify the task queries to only select columns we need.
*/
int requiredColumnCount = list_length(insertTargetList);
List *jobTargetList = distSelectJob->jobQuery->targetList;
if (list_length(jobTargetList) > requiredColumnCount)
{
List *projectedTargetEntries = ListTake(jobTargetList,
requiredColumnCount);
WrapTaskListForProjection(distSelectTaskList, projectedTargetEntries);
}
List **redistributedResults = RedistributeTaskListResults(distResultPrefix,
distSelectTaskList,
partitionColumnIndex,
@ -1038,3 +1052,41 @@ IsRedistributablePlan(Plan *selectPlan, bool hasReturning)
return true;
}
/*
 * WrapTaskListForProjection rewrites the queryString of every task in
 * taskList so that the original query becomes a subquery from which only
 * the columns named by projectedTargetEntries are selected. The tasks are
 * modified in place.
 */
static void
WrapTaskListForProjection(List *taskList, List *projectedTargetEntries)
{
	/* the column list is identical for all tasks, so build it only once */
	StringInfo columnList = makeStringInfo();
	bool needsSeparator = false;

	TargetEntry *projectedEntry = NULL;
	foreach_ptr(projectedEntry, projectedTargetEntries)
	{
		if (needsSeparator)
		{
			appendStringInfoChar(columnList, ',');
		}
		needsSeparator = true;

		Assert(projectedEntry->resname != NULL);
		appendStringInfoString(columnList,
							   quote_identifier(projectedEntry->resname));
	}

	Task *currentTask = NULL;
	foreach_ptr(currentTask, taskList)
	{
		Assert(currentTask->queryString != NULL);

		/* the previous query text is embedded as a subquery of the new one */
		StringInfo projectionQuery = makeStringInfo();
		appendStringInfo(projectionQuery, "SELECT %s FROM (%s) subquery",
						 columnList->data,
						 currentTask->queryString);

		currentTask->queryString = projectionQuery->data;
	}
}

View File

@ -176,3 +176,29 @@ StringJoin(List *stringList, char delimiter)
return joinedString->data;
}
/*
 * ListTake returns a new list containing the first size elements of the
 * given list. If size is greater than the list's length, all elements of the
 * list are returned. If size is zero or negative, NIL is returned. The
 * elements themselves are not copied; the result points to the same objects
 * as the input list. This is modeled after the "take" function used in some
 * Scheme implementations.
 */
List *
ListTake(List *pointerList, int size)
{
	List *result = NIL;
	int takenCount = 0;
	ListCell *pointerCell = NULL;

	foreach(pointerCell, pointerList)
	{
		/*
		 * Check the bound before appending so that a non-positive size
		 * yields an empty result instead of a single element.
		 */
		if (takenCount >= size)
		{
			break;
		}

		result = lappend(result, lfirst(pointerCell));
		takenCount++;
	}

	return result;
}

View File

@ -73,5 +73,6 @@ extern ArrayType * DatumArrayToArrayType(Datum *datumArray, int datumCount,
Oid datumTypeId);
extern HTAB * ListToHashSet(List *pointerList, Size keySize, bool isStringList);
extern char * StringJoin(List *stringList, char delimiter);
extern List * ListTake(List *pointerList, int size);
#endif /* CITUS_LISTUTILS_H */

View File

@ -268,6 +268,42 @@ SELECT * FROM target_table ORDER BY a;
-1 | {1,2,3}
(4 rows)
--
-- worker queries have more columns than necessary. ExpandWorkerTargetEntry() might
-- add additional columns to the target list.
--
TRUNCATE target_table;
\set VERBOSITY TERSE
-- first verify that the SELECT query below fetches 3 projected columns from workers
SET citus.log_remote_commands TO true; SET client_min_messages TO DEBUG;
CREATE TABLE results AS SELECT max(-a), array_agg(mapped_key) FROM source_table GROUP BY a;
DEBUG: Router planner cannot handle multi-shard select queries
DEBUG: building index "pg_toast_xxxxx_index" on table "pg_toast_xxxxx" serially
LOG: issuing SELECT max((OPERATOR(pg_catalog.-) a)) AS max, array_agg(mapped_key) AS array_agg, a AS worker_column_3 FROM insert_select_repartition.source_table_4213601 source_table WHERE true GROUP BY a
LOG: issuing SELECT max((OPERATOR(pg_catalog.-) a)) AS max, array_agg(mapped_key) AS array_agg, a AS worker_column_3 FROM insert_select_repartition.source_table_4213602 source_table WHERE true GROUP BY a
LOG: issuing SELECT max((OPERATOR(pg_catalog.-) a)) AS max, array_agg(mapped_key) AS array_agg, a AS worker_column_3 FROM insert_select_repartition.source_table_4213603 source_table WHERE true GROUP BY a
RESET citus.log_remote_commands; RESET client_min_messages;
DROP TABLE results;
-- now verify that we don't write the extra columns to the intermediate result files and
-- insertion to the target works fine.
SET client_min_messages TO DEBUG2;
INSERT INTO target_table SELECT max(-a), array_agg(mapped_key) FROM source_table GROUP BY a;
DEBUG: cannot perform distributed INSERT INTO ... SELECT because the partition columns in the source table and subquery do not match
DEBUG: Router planner cannot handle multi-shard select queries
DEBUG: performing repartitioned INSERT ... SELECT
DEBUG: partitioning SELECT query by column index 0 with name 'max'
DEBUG: distributed statement: INSERT INTO insert_select_repartition.target_table_4213604 AS citus_table_alias (a, b) SELECT max, array_agg FROM read_intermediate_results('{repartitioned_results_from_4213602_to_0,repartitioned_results_from_4213603_to_0}'::text[], 'binary'::citus_copy_format) intermediate_result(max integer, array_agg integer[])
DEBUG: distributed statement: INSERT INTO insert_select_repartition.target_table_4213605 AS citus_table_alias (a, b) SELECT max, array_agg FROM read_intermediate_results('{repartitioned_results_from_4213601_to_1}'::text[], 'binary'::citus_copy_format) intermediate_result(max integer, array_agg integer[])
RESET client_min_messages;
SELECT * FROM target_table ORDER BY a;
a | b
---------------------------------------------------------------------
-4 | {-4}
-3 | {-3}
-2 | {-2}
-1 | {-1}
(4 rows)
DROP TABLE source_table, target_table;
SET client_min_messages TO WARNING;
DROP SCHEMA insert_select_repartition CASCADE;

View File

@ -123,6 +123,27 @@ RESET client_min_messages;
SELECT * FROM target_table ORDER BY a;
--
-- worker queries have more columns than necessary. ExpandWorkerTargetEntry() might
-- add additional columns to the target list.
--
TRUNCATE target_table;
\set VERBOSITY TERSE
-- first verify that the SELECT query below fetches 3 projected columns from workers
SET citus.log_remote_commands TO true; SET client_min_messages TO DEBUG;
CREATE TABLE results AS SELECT max(-a), array_agg(mapped_key) FROM source_table GROUP BY a;
RESET citus.log_remote_commands; RESET client_min_messages;
DROP TABLE results;
-- now verify that we don't write the extra columns to the intermediate result files and
-- insertion to the target works fine.
SET client_min_messages TO DEBUG2;
INSERT INTO target_table SELECT max(-a), array_agg(mapped_key) FROM source_table GROUP BY a;
RESET client_min_messages;
SELECT * FROM target_table ORDER BY a;
DROP TABLE source_table, target_table;
SET client_min_messages TO WARNING;