Planner: lift volatile target‑list items in `WrapSubquery` to coordinator (prevents sequence‑leap in distributed `INSERT … SELECT`) (#7976)

This PR fixes #7784 and refactors the `WrapSubquery(Query *subquery)` function to improve clarity and correctness when handling volatile expressions in subqueries during Citus insert-select rewriting. ### Background The `WrapSubquery` function rewrites a query of the form: ```sql INSERT INTO target_table SELECT ... FROM ... ``` ...by wrapping the `SELECT` in a subquery: ```sql SELECT <outer-TL> FROM ( <subquery with volatile expressions replaced with NULL> ) citus_insert_select_subquery ``` This transformation allows: * **Volatile expressions** (e.g., `nextval`, `now`) **not used in `GROUP BY` or `ORDER BY`** to be evaluated **exactly once on the coordinator**. * **Stable/immutable or sort-relevant expressions** to remain in the worker-executed subquery. * Placeholder `NULL`s to maintain column alignment in the inner subquery. ### Fix Details * Restructured the code into labeled logical sections: 1. Build wrapper query (`SELECT … FROM (subquery)`) 2. Rewrite target lists with volatility analysis 3. Assign and return updated query trees * Preserved existing behavior, focusing on clarity and maintainability. ### How the new code handles volatile items stage | what we look for | what we do | why -- | -- | -- | -- scan target list once | 1. `expr_is_volatile(te->expr)` 2. `te->ressortgroupref != 0` (is the column used in GROUP BY / ORDER BY?) | decide whether to hoist or keep | we must not hoist an expression the inner query still needs for sorting/grouping, otherwise its `SortGroupClause` breaks volatile & not used in sort/group | deep‑copy the expression into the outer target list | executes once on the coordinator | | leave a typed `NULL `placeholder (visible, not `resjunk`) in the inner target list | keeps column numbering stable for helpers that already ran (reorder, cast); the worker sends a cheap constant | stable / immutable, or volatile but used in sort/group | keep the original expression in the inner list; outer list references it via a `Var `| workers can evaluate it safely and, if needed, the inner ORDER BY still works | ### Example Given this query: ```sql INSERT INTO t SELECT nextval('s'), 42 FROM generate_series(1, 2); ``` The planner rewrites it as: ```sql SELECT nextval('s'), col2 FROM (SELECT NULL::bigint AS col1, 42 AS col2 FROM generate_series(1, 2)) citus_insert_select_subquery; ``` This ensures `nextval('s')` is evaluated only once per row on the **coordinator**, not on each worker node, preserving correct sequence semantics. #### **Outer‑Var guard (`FindReferencedTableColumn`)** Because `WrapSubquery` adds an extra query level, lots of Vars that the old code never expected become “outer” Vars; without teaching `FindReferencedTableColumn` to climb that extra level reliably, Citus would intermittently reject valid foreign keys and even hit asserts. * Re‑implemented the outer‑Var guard so that the function: * **Walks deterministically up the query stack** when `skipOuterVars = false` (default for FK / UNION checks). A new while‑loop copies — rather than truncates — `parentQueryList` on each hop, eliminating list‑aliasing that made *issue 5248* fail intermittently in parallel regressions. * Handles multi‑level `varlevelsup` in a single loop; never mutates the caller’s list in place.
2025-05-06 17:45:49 +03:00 · 2025-05-06 17:45:49 +03:00 · a4040ba5da
parent d4dd44e715
commit a4040ba5da
4 changed files with 415 additions and 66 deletions
--- a/src/backend/distributed/planner/insert_select_planner.c
+++ b/src/backend/distributed/planner/insert_select_planner.c
@ -1247,7 +1247,7 @@ InsertPartitionColumnMatchesSelect(Query *query, RangeTblEntry *insertRte,
 		RangeTblEntry *subqueryPartitionColumnRelationIdRTE = NULL;
 		List *parentQueryList = list_make2(query, subquery);
-		bool skipOuterVars = true;
+		bool skipOuterVars = false;
 		FindReferencedTableColumn(selectTargetExpr,
 								  parentQueryList, subquery,
 								  &subqueryPartitionColumn,
@ -1543,71 +1543,147 @@ InsertSelectResultIdPrefix(uint64 planId)
 /*
- * WrapSubquery wraps the given query as a subquery in a newly constructed
+ * Return true if the expression tree can change value within a single scan
- * "SELECT * FROM (...subquery...) citus_insert_select_subquery" query.
+ * (i.e. the planner must treat it as VOLATILE).
 * We just delegate to PostgreSQL’s helper.
 */
 static inline bool
 expr_is_volatile(Node *node)
 {
 	/* contain_volatile_functions() also returns true for set-returning
 	 * volatile functions and for nextval()/currval(). */
 	return contain_volatile_functions(node);
 }
 /*
 * WrapSubquery
 *
 * Build a wrapper query:
 *
 *     SELECT <outer-TL>
 *       FROM ( <subquery with any volatile items stripped> )
 *            citus_insert_select_subquery
 *
 * Purpose:
 *   - Preserve column numbering while lifting volatile expressions to the coordinator.
 *   - Volatile (non-deterministic) expressions not used in GROUP BY / ORDER BY
 *     are lifted to the outer SELECT to ensure they are evaluated only once.
 *   - Stable/immutable expressions or volatile ones required by GROUP BY / ORDER BY
 *     stay in the subquery and are accessed via Vars in the outer SELECT.
 */
 Query *
 WrapSubquery(Query *subquery)
 {
 	/*
 	 * 1. Build the wrapper skeleton: SELECT ... FROM (subquery) alias
 	 */
 	ParseState *pstate = make_parsestate(NULL);
 	List *newTargetList = NIL;
 	Query *outerQuery = makeNode(Query);
 	outerQuery->commandType = CMD_SELECT;
-	/* create range table entries */
+	Alias *alias = makeAlias("citus_insert_select_subquery", NIL);
-	Alias *selectAlias = makeAlias("citus_insert_select_subquery", NIL);
+	RangeTblEntry *rte_subq =
-	RangeTblEntry *newRangeTableEntry = RangeTableEntryFromNSItem(
+		RangeTableEntryFromNSItem(
-		addRangeTableEntryForSubquery(
+			addRangeTableEntryForSubquery(pstate,
-			pstate, subquery,
+										  subquery,    /* still points to original subquery */
-			selectAlias, false, true));
+										  alias,
-	outerQuery->rtable = list_make1(newRangeTableEntry);
+										  false,       /* not LATERAL */
 										  true));      /* in FROM clause */
 	outerQuery->rtable = list_make1(rte_subq);
 #if PG_VERSION_NUM >= PG_VERSION_16
-	/*
+	/* Ensure RTE_SUBQUERY has proper permission handling */
-	 * This part of the code is more of a sanity check for readability,
+	Assert(rte_subq->rtekind == RTE_SUBQUERY &&
-	 * it doesn't really do anything.
+		   rte_subq->perminfoindex == 0);
 	 * addRangeTableEntryForSubquery doesn't add permission info
 	 * because the range table is set to be RTE_SUBQUERY.
 	 * Hence we should also have no perminfos here.
 	 */
 	Assert(newRangeTableEntry->rtekind == RTE_SUBQUERY &&
 		   newRangeTableEntry->perminfoindex == 0);
 	outerQuery->rteperminfos = NIL;
 #endif
-	/* set the FROM expression to the subquery */
+	RangeTblRef *rtref = makeNode(RangeTblRef);
-	RangeTblRef *newRangeTableRef = makeNode(RangeTblRef);
+	rtref->rtindex = 1;  /* Only one RTE, so index is 1 */
-	newRangeTableRef->rtindex = 1;
+	outerQuery->jointree = makeFromExpr(list_make1(rtref), NULL);
 	outerQuery->jointree = makeFromExpr(list_make1(newRangeTableRef), NULL);
-	/* create a target list that matches the SELECT */
+	/*
-	TargetEntry *selectTargetEntry = NULL;
+	 * 2. Create new target lists for inner (worker) and outer (coordinator)
-	foreach_declared_ptr(selectTargetEntry, subquery->targetList)
+	 */
-	{
+	List *newInnerTL = NIL;
-		/* exactly 1 entry in FROM */
+	List *newOuterTL = NIL;
-		int indexInRangeTable = 1;
+	int nextResno = 1;
-		if (selectTargetEntry->resjunk)
+	TargetEntry *te = NULL;
 	foreach_declared_ptr(te, subquery->targetList)
 	{
 		if (te->resjunk)
 		{
 			/* Keep resjunk entries only in subquery (not in outer query) */
 			newInnerTL = lappend(newInnerTL, te);
 			continue;
 		}
-		Var *newSelectVar = makeVar(indexInRangeTable, selectTargetEntry->resno,
+		bool isVolatile = expr_is_volatile((Node *) te->expr);
-									exprType((Node *) selectTargetEntry->expr),
+		bool usedInSort = (te->ressortgroupref != 0);
 									exprTypmod((Node *) selectTargetEntry->expr),
 									exprCollation((Node *) selectTargetEntry->expr), 0);
-		TargetEntry *newSelectTargetEntry = makeTargetEntry((Expr *) newSelectVar,
+		if (isVolatile && !usedInSort)
-															selectTargetEntry->resno,
+		{
-															selectTargetEntry->resname,
+			/*
-															selectTargetEntry->resjunk);
+			 * Lift volatile expression to outer query so it's evaluated once.
 			 * In inner query, place a NULL of the same type to preserve column position.
 			 */
 			TargetEntry *outerTE =
 				makeTargetEntry(copyObject(te->expr),
 								list_length(newOuterTL) + 1,
 								te->resname,
 								false);
 			newOuterTL = lappend(newOuterTL, outerTE);
-		newTargetList = lappend(newTargetList, newSelectTargetEntry);
+			Const *nullConst = makeNullConst(exprType((Node *) te->expr),
 											 exprTypmod((Node *) te->expr),
 											 exprCollation((Node *) te->expr));
 			TargetEntry *placeholder =
 				makeTargetEntry((Expr *) nullConst,
 								nextResno++,          /* preserve column position */
 								te->resname,
 								false);               /* visible, not resjunk */
 			newInnerTL = lappend(newInnerTL, placeholder);
 		}
 		else
 		{
 			/*
 			 * Either:
 			 *   - expression is stable or immutable, or
 			 *   - volatile but needed for sorting or grouping
 			 *
 			 * In both cases, keep it in subquery and reference it using a Var.
 			 */
 			TargetEntry *innerTE = te;          /* reuse original node */
 			innerTE->resno = nextResno++;
 			newInnerTL = lappend(newInnerTL, innerTE);
 			Var *v = makeVar(/* subquery reference index is 1 */
 				rtref->rtindex,     /* same as 1, but self‑documenting */
 				innerTE->resno,
 				exprType((Node *) innerTE->expr),
 				exprTypmod((Node *) innerTE->expr),
 				exprCollation((Node *) innerTE->expr),
 				0);
 			TargetEntry *outerTE =
 				makeTargetEntry((Expr *) v,
 								list_length(newOuterTL) + 1,
 								innerTE->resname,
 								false);
 			newOuterTL = lappend(newOuterTL, outerTE);
 		}
 	}
-	outerQuery->targetList = newTargetList;
+	/*
 	 * 3. Assign target lists and return the wrapper query
 	 */
 	subquery->targetList = newInnerTL;
 	outerQuery->targetList = newOuterTL;
 	return outerQuery;
 }
--- a/src/backend/distributed/planner/multi_logical_optimizer.c
+++ b/src/backend/distributed/planner/multi_logical_optimizer.c
@ -4478,46 +4478,43 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query *
 		return;
 	}
-
+	/* Walk up varlevelsup as many times as needed */
-	if (candidateColumn->varlevelsup > 0)
+	while (candidateColumn->varlevelsup > 0)
 	{
 		/* Caller asked us to ignore any outer Vars → just bail out */
 		if (skipOuterVars)
 		{
 			/*
 			 * we don't want to process outer vars, so we return early.
 			 */
 			return;
 		}
-		/*
+		/* Locate the parent query that owns this Var */
-		 * We currently don't support finding partition keys in the subqueries
+		int parentIdx =
-		 * that reference outer subqueries. For example, in correlated
+			list_length(parentQueryList) - candidateColumn->varlevelsup;
-		 * subqueries in WHERE clause, we don't support use of partition keys
+		if (!IsIndexInRange(parentQueryList, parentIdx))
 		 * in the subquery that is referred from the outer query.
 		 */
 		int parentQueryIndex = list_length(parentQueryList) -
 							   candidateColumn->varlevelsup;
 		if (!(IsIndexInRange(parentQueryList, parentQueryIndex)))
 		{
-			return;
+			return;                             /* malformed tree */
 		}
-		/*
+		/* Work on a fresh copy of the Var with varlevelsup reset */
 		 * Before we recurse into the query tree, we should update the candidateColumn and we use copy of it.
 		 * As we get the query from varlevelsup up, we reset the varlevelsup.
 		 */
 		candidateColumn = copyObject(candidateColumn);
 		candidateColumn->varlevelsup = 0;
 		/*
-		 * We should be careful about these fields because they need to
+		 * Make a *completely private* copy of parentQueryList for the
-		 * be updated correctly based on ctelevelsup and varlevelsup.
+		 * next recursion step.  We copy the whole list and then truncate
 		 * so every recursive branch owns its own list cells.
 		 */
-		query = list_nth(parentQueryList, parentQueryIndex);
+		List *newParent =
-		parentQueryList = list_truncate(parentQueryList, parentQueryIndex);
+			list_copy(parentQueryList);         /* duplicates every cell  */
 		newParent = list_truncate(newParent, parentIdx);
 		query = list_nth(parentQueryList, parentIdx);
 		parentQueryList = newParent;            /* hand private copy down */
 		/* Loop again if still pointing to an outer level */
 	}
 	if (candidateColumn->varattno == InvalidAttrNumber)
 	{
 		/*
--- a/src/test/regress/expected/multi_insert_select.out
+++ b/src/test/regress/expected/multi_insert_select.out
@ -3492,5 +3492,150 @@ $$);
   Task Count: 1
 (2 rows)
 ---------------------------------------------------------------------
 -- Regression Test Script for Issue #7784
 -- This script tests INSERT ... SELECT with a CTE for:
 --  1. Schema based sharding.
 --  2. A distributed table.
 ---------------------------------------------------------------------
 -- Enable schema-based sharding
 SET citus.enable_schema_based_sharding TO ON;
 -- Create a table for schema based sharding
 CREATE TABLE version_sch_based (
    id bigserial NOT NULL,
    description varchar(255),
    PRIMARY KEY (id)
 );
 -- Insert an initial row.
 INSERT INTO version_sch_based (description) VALUES ('Version 1');
 -- Duplicate the row using a CTE and INSERT ... SELECT.
 WITH v AS (
    SELECT * FROM version_sch_based WHERE description = 'Version 1'
 )
 INSERT INTO version_sch_based (description)
 SELECT description FROM v;
 -- Expected output:
 --   id | description
 --  ----+-------------
 --    1 | Version 1
 --    2 | Version 1
 -- Query the table and order by id for consistency.
 SELECT * FROM version_sch_based ORDER BY id;
 id | description
 ---------------------------------------------------------------------
  1 | Version 1
  2 | Version 1
 (2 rows)
 ---------------------------------------------------------------------
 -- Case 2: Distributed Table Scenario
 ---------------------------------------------------------------------
 SET citus.enable_schema_based_sharding TO OFF;
 -- Create a table for the distributed test.
 CREATE TABLE version_dist (
    id bigserial NOT NULL,
    description varchar(255),
    PRIMARY KEY (id)
 );
 -- Register the table as distributed using the 'id' column as the distribution key.
 SELECT create_distributed_table('version_dist', 'id');
 create_distributed_table
 ---------------------------------------------------------------------
 (1 row)
 -- Insert an initial row.
 INSERT INTO version_dist (description) VALUES ('Version 1');
 -- Duplicate the row using a CTE and INSERT ... SELECT.
 WITH v AS (
    SELECT * FROM version_dist WHERE description = 'Version 1'
 )
 INSERT INTO version_dist (description)
 SELECT description FROM v;
 -- Expected output:
 --   id | description
 --  ----+-------------
 --    1 | Version 1
 --    2 | Version 1
 -- Query the table and order by id for consistency.
 SELECT * FROM version_dist ORDER BY id;
 id | description
 ---------------------------------------------------------------------
  1 | Version 1
  2 | Version 1
 (2 rows)
 ---------------------------------------------------------------------
 -- Case 3: Distributed INSERT … SELECT with nextval()
 -- Verifies that nextval() is evaluated on the coordinator only.
 ---------------------------------------------------------------------
 -- A fresh sequence for clarity
 CREATE SEQUENCE seq_nextval_test START 100;
 -- Table with DEFAULT nextval()
 CREATE TABLE version_dist_seq (
    id  bigint DEFAULT nextval('seq_nextval_test'),
    description text,
    PRIMARY KEY (id)
 );
 SELECT create_distributed_table('version_dist_seq', 'id');
 create_distributed_table
 ---------------------------------------------------------------------
 (1 row)
 -- Seed one row (id = 100)
 INSERT INTO version_dist_seq (description) VALUES ('row‑0');
 -- CTE duplication – should produce **exactly one** new sequence value (id = 101)
 WITH v AS (
    SELECT * FROM version_dist_seq WHERE description = 'row‑0'
 )
 INSERT INTO version_dist_seq (description)
 SELECT description FROM v;
 -- Expected: ids are 100 and 101 (no gaps, no duplicates)
 SELECT id, description FROM version_dist_seq ORDER BY id;
 id  | description
 ---------------------------------------------------------------------
 100 | row‑0
 101 | row‑0
 (2 rows)
 ---------------------------------------------------------------------
 -- Case 4: UNION ALL + nextval() in distributed INSERT … SELECT
 ---------------------------------------------------------------------
 CREATE SEQUENCE seq_union_test START 200;
 CREATE TABLE version_dist_union (
    id  bigint DEFAULT nextval('seq_union_test'),
    val int,
    PRIMARY KEY (id)
 );
 SELECT create_distributed_table('version_dist_union', 'id');
 create_distributed_table
 ---------------------------------------------------------------------
 (1 row)
 -- Seed rows
 INSERT INTO version_dist_union (val) VALUES (1), (2);
 -- UNION ALL duplication; each leg returns two rows -> four inserts total
 WITH src AS (
    SELECT val FROM version_dist_union
    UNION ALL
    SELECT val FROM version_dist_union
 )
 INSERT INTO version_dist_union(val)
 SELECT val FROM src;
 -- Expected IDs: 200,201,202,203,204,205
 SELECT id, val FROM version_dist_union ORDER BY id;
 id  | val
 ---------------------------------------------------------------------
 200 |   1
 201 |   2
 202 |   1
 203 |   2
 204 |   1
 205 |   2
 (6 rows)
 -- End of Issue #7784
 SET client_min_messages TO ERROR;
 DROP SCHEMA multi_insert_select CASCADE;
--- a/src/test/regress/sql/multi_insert_select.sql
+++ b/src/test/regress/sql/multi_insert_select.sql
@ -2452,5 +2452,136 @@ SELECT coordinator_plan($$
  SELECT id FROM dist_table_5 JOIN cte_1 USING(id);
 $$);
 -------------------------------
 -- Regression Test Script for Issue #7784
 -- This script tests INSERT ... SELECT with a CTE for:
 --  1. Schema based sharding.
 --  2. A distributed table.
 -------------------------------
 -- Enable schema-based sharding
 SET citus.enable_schema_based_sharding TO ON;
 -- Create a table for schema based sharding
 CREATE TABLE version_sch_based (
    id bigserial NOT NULL,
    description varchar(255),
    PRIMARY KEY (id)
 );
 -- Insert an initial row.
 INSERT INTO version_sch_based (description) VALUES ('Version 1');
 -- Duplicate the row using a CTE and INSERT ... SELECT.
 WITH v AS (
    SELECT * FROM version_sch_based WHERE description = 'Version 1'
 )
 INSERT INTO version_sch_based (description)
 SELECT description FROM v;
 -- Expected output:
 --   id | description
 --  ----+-------------
 --    1 | Version 1
 --    2 | Version 1
 -- Query the table and order by id for consistency.
 SELECT * FROM version_sch_based ORDER BY id;
 --------------------------------------------------
 -- Case 2: Distributed Table Scenario
 --------------------------------------------------
 SET citus.enable_schema_based_sharding TO OFF;
 -- Create a table for the distributed test.
 CREATE TABLE version_dist (
    id bigserial NOT NULL,
    description varchar(255),
    PRIMARY KEY (id)
 );
 -- Register the table as distributed using the 'id' column as the distribution key.
 SELECT create_distributed_table('version_dist', 'id');
 -- Insert an initial row.
 INSERT INTO version_dist (description) VALUES ('Version 1');
 -- Duplicate the row using a CTE and INSERT ... SELECT.
 WITH v AS (
    SELECT * FROM version_dist WHERE description = 'Version 1'
 )
 INSERT INTO version_dist (description)
 SELECT description FROM v;
 -- Expected output:
 --   id | description
 --  ----+-------------
 --    1 | Version 1
 --    2 | Version 1
 -- Query the table and order by id for consistency.
 SELECT * FROM version_dist ORDER BY id;
 -------------------------------
 -- Case 3: Distributed INSERT … SELECT with nextval()
 -- Verifies that nextval() is evaluated on the coordinator only.
 -------------------------------
 -- A fresh sequence for clarity
 CREATE SEQUENCE seq_nextval_test START 100;
 -- Table with DEFAULT nextval()
 CREATE TABLE version_dist_seq (
    id  bigint DEFAULT nextval('seq_nextval_test'),
    description text,
    PRIMARY KEY (id)
 );
 SELECT create_distributed_table('version_dist_seq', 'id');
 -- Seed one row (id = 100)
 INSERT INTO version_dist_seq (description) VALUES ('row‑0');
 -- CTE duplication – should produce **exactly one** new sequence value (id = 101)
 WITH v AS (
    SELECT * FROM version_dist_seq WHERE description = 'row‑0'
 )
 INSERT INTO version_dist_seq (description)
 SELECT description FROM v;
 -- Expected: ids are 100 and 101 (no gaps, no duplicates)
 SELECT id, description FROM version_dist_seq ORDER BY id;
 -------------------------------
 -- Case 4: UNION ALL + nextval() in distributed INSERT … SELECT
 -------------------------------
 CREATE SEQUENCE seq_union_test START 200;
 CREATE TABLE version_dist_union (
    id  bigint DEFAULT nextval('seq_union_test'),
    val int,
    PRIMARY KEY (id)
 );
 SELECT create_distributed_table('version_dist_union', 'id');
 -- Seed rows
 INSERT INTO version_dist_union (val) VALUES (1), (2);
 -- UNION ALL duplication; each leg returns two rows -> four inserts total
 WITH src AS (
    SELECT val FROM version_dist_union
    UNION ALL
    SELECT val FROM version_dist_union
 )
 INSERT INTO version_dist_union(val)
 SELECT val FROM src;
 -- Expected IDs: 200,201,202,203,204,205
 SELECT id, val FROM version_dist_union ORDER BY id;
 -- End of Issue #7784
 SET client_min_messages TO ERROR;
 DROP SCHEMA multi_insert_select CASCADE;