Fix CTE traversal for outer Vars in FindReferencedTableColumn (remove assert; correct parentQueryList handling) (#8106)

fixes #8105 

This change lets `FindReferencedTableColumn()` correctly resolve columns
through a CTE even when the expression comes from an outer query level
(`varlevelsup > 0`, `skipOuterVars = false`). Before, we hit an
`Assert(skipOuterVars)` in this path.

**Problem**

* Hitting a CTE after walking outer Vars triggered
`Assert(skipOuterVars)`.
* Cause: we modified `parentQueryList` in place and didn’t rebuild the
correct parent chain before recursing into the CTE, so the path was
considered unsafe.

**Fix**

* Remove the `Assert(skipOuterVars)` in the `RTE_CTE` branch.
* Find the CTE’s owning level via `ctelevelsup` and compute
`cteParentListIndex`.
* Rebuild a private parent list for recursion: `list_copy` →
`list_truncate` → `lappend(current query)`.
* Add a bounds check before indexing the CTE’s `targetList`.

**Why it works**


```diff
-parentQueryList = lappend(parentQueryList, query);
-FindReferencedTableColumn(targetEntry->expr, parentQueryList,
-                          cteQuery, column, rteContainingReferencedColumn,
-                          skipOuterVars);
+    /* hand a private, bounded parent list to the recursion */
+    List *newParent = list_copy(parentQueryList);
+    newParent = list_truncate(newParent, cteParentListIndex + 1);
+    newParent = lappend(newParent, query);
+
+    FindReferencedTableColumn(targetEntry->expr,
+                              newParent,
+                              cteQuery,
+                              column,
+                              rteContainingReferencedColumn,
+                              skipOuterVars);
+}


```
**Before:** We changed `parentQueryList` in place (`parentQueryList =
lappend(...)`) and didn’t trim it to the CTE’s owner level.

**After:** We copy the list, trim it to the CTE’s owner level, then
append the current query. This keeps the parent list accurate for the
current recursion and safe when following outer Vars.


**Example: Nested subquery referencing the CTE (two levels down)**

```
WITH c AS MATERIALIZED (SELECT user_id FROM raw_events_first)
SELECT 1
FROM raw_events_first t
WHERE EXISTS (
  SELECT 1
  FROM (SELECT user_id FROM c) c2
  WHERE c2.user_id = t.user_id
);
```

Levels:
Q0 = top SELECT
Q1 = EXISTS subquery
Q2 = inner (SELECT user_id FROM c)

When resolving c2.user_id inside Q2:

- parentQueryList is [Q0, Q1, Q2].
- `ctelevelsup`: 2


`cteParentListIndex = length(parentQueryList) - ctelevelsup - 1`

- Recurse into the CTE’s query with [Q0, Q2].


**Tests (added in `multi_insert_select`)**

* **T1:** Correlated subquery that references a CTE (one level down) 
Verifies that resolving through `RTE_CTE` after following an outer `Var`
succeeds, row count matches source table.
* **T2:** Nested subquery that references a CTE (two levels down) 
Exercises deeper recursion and confirms identical to T1.
* **T3:** Scalar subquery in a target list that reads from the outer CTE
Checks expected row count and that no NULLs are inserted.

These tests cover the cases that previously hit `Assert(skipOuterVars)`
and confirm CTE references while following outer Vars.
pull/8108/head^2
Mehmet YILMAZ 2025-08-12 11:49:50 +03:00 committed by GitHub
parent 71d6328378
commit a6161f5a21
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 303 additions and 10 deletions

View File

@ -4583,11 +4583,10 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query *
else if (rangeTableEntry->rtekind == RTE_CTE) else if (rangeTableEntry->rtekind == RTE_CTE)
{ {
/* /*
* When outerVars are considered, we modify parentQueryList, so this * Resolve through a CTE even when skipOuterVars == false.
* logic might need to change when we support outervars in CTEs. * Maintain the invariant that each recursion level owns a private,
* correctly-bounded copy of parentQueryList.
*/ */
Assert(skipOuterVars);
int cteParentListIndex = list_length(parentQueryList) - int cteParentListIndex = list_length(parentQueryList) -
rangeTableEntry->ctelevelsup - 1; rangeTableEntry->ctelevelsup - 1;
Query *cteParentQuery = NULL; Query *cteParentQuery = NULL;
@ -4618,14 +4617,34 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query *
if (cte != NULL) if (cte != NULL)
{ {
Query *cteQuery = (Query *) cte->ctequery; Query *cteQuery = (Query *) cte->ctequery;
List *targetEntryList = cteQuery->targetList;
AttrNumber targetEntryIndex = candidateColumn->varattno - 1; AttrNumber targetEntryIndex = candidateColumn->varattno - 1;
TargetEntry *targetEntry = list_nth(targetEntryList, targetEntryIndex);
parentQueryList = lappend(parentQueryList, query); if (targetEntryIndex >= 0 &&
FindReferencedTableColumn(targetEntry->expr, parentQueryList, targetEntryIndex < list_length(cteQuery->targetList))
cteQuery, column, rteContainingReferencedColumn, {
skipOuterVars); TargetEntry *targetEntry =
list_nth(cteQuery->targetList, targetEntryIndex);
/* Build a private, bounded parentQueryList before recursing into the CTE.
* Invariant: list is [top current], owned by this call (no aliasing).
* For RTE_CTE:
* owner_idx = list_length(parentQueryList) - rangeTableEntry->ctelevelsup - 1;
* newParent = lappend(list_truncate(list_copy(parentQueryList), owner_idx + 1), query);
* Example (Q0 owns CTE; were in Q2 via nested subquery):
* parent=[Q0,Q1,Q2], ctelevelsup=2 owner_idx=0 newParent=[Q0,Q2].
* Keeps outer-Var level math correct without mutating the callers list.
*/
List *newParent = list_copy(parentQueryList);
newParent = list_truncate(newParent, cteParentListIndex + 1);
newParent = lappend(newParent, query);
FindReferencedTableColumn(targetEntry->expr,
newParent,
cteQuery,
column,
rteContainingReferencedColumn,
skipOuterVars);
}
} }
} }
} }

View File

@ -3637,5 +3637,157 @@ SELECT id, val FROM version_dist_union ORDER BY id;
(6 rows) (6 rows)
-- End of Issue #7784 -- End of Issue #7784
-- PR #8106 — CTE traversal works when following outer Vars
-- This script exercises three shapes:
-- T1) CTE referenced inside a correlated subquery (one level down)
-- T2) CTE referenced inside a nested subquery (two levels down)
-- T3) Subquery targetlist uses a scalar sublink into the outer CTE
CREATE SCHEMA pr8106_cte_outervar;
SET search_path = pr8106_cte_outervar, public;
-- Base tables for the tests
DROP TABLE IF EXISTS raw_events_first CASCADE;
NOTICE: table "raw_events_first" does not exist, skipping
DROP TABLE IF EXISTS agg_events CASCADE;
NOTICE: table "agg_events" does not exist, skipping
CREATE TABLE raw_events_first(
user_id int,
value_1 int
);
CREATE TABLE agg_events(
user_id int,
value_1_agg int
);
-- Distribute and colocate (distribution key = user_id)
SELECT create_distributed_table('raw_events_first', 'user_id');
create_distributed_table
---------------------------------------------------------------------
(1 row)
SELECT create_distributed_table('agg_events', 'user_id');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- Seed data (duplicates on some user_ids; some NULL value_1s)
INSERT INTO raw_events_first(user_id, value_1) VALUES
(1, 10), (1, 20), (1, NULL),
(2, NULL),
(3, 30),
(4, NULL),
(5, 50), (5, NULL),
(6, NULL);
---------------------------------------------------------------------
-- T1) CTE referenced inside a correlated subquery (one level down)
---------------------------------------------------------------------
TRUNCATE agg_events;
WITH c AS MATERIALIZED (
SELECT user_id FROM raw_events_first
)
INSERT INTO agg_events (user_id)
SELECT t.user_id
FROM raw_events_first t
WHERE EXISTS (SELECT 1 FROM c WHERE c.user_id = t.user_id);
-- Expect one insert per row in raw_events_first (EXISTS always true per user_id)
SELECT 't1_count_matches' AS test,
(SELECT count(*) FROM agg_events) =
(SELECT count(*) FROM raw_events_first) AS ok;
test | ok
---------------------------------------------------------------------
t1_count_matches | t
(1 row)
-- Spot-check: how many rows were inserted
SELECT 't1_rows' AS test, count(*) AS rows FROM agg_events;
test | rows
---------------------------------------------------------------------
t1_rows | 9
(1 row)
---------------------------------------------------------------------
-- T2) CTE referenced inside a nested subquery (two levels down)
---------------------------------------------------------------------
TRUNCATE agg_events;
WITH c AS MATERIALIZED (
SELECT user_id FROM raw_events_first
)
INSERT INTO agg_events (user_id)
SELECT t.user_id
FROM raw_events_first t
WHERE EXISTS (
SELECT 1
FROM (SELECT user_id FROM c) c2
WHERE c2.user_id = t.user_id
);
-- Same cardinality expectation as T1
SELECT 't2_count_matches' AS test,
(SELECT count(*) FROM agg_events) =
(SELECT count(*) FROM raw_events_first) AS ok;
test | ok
---------------------------------------------------------------------
t2_count_matches | t
(1 row)
SELECT 't2_rows' AS test, count(*) AS rows FROM agg_events;
test | rows
---------------------------------------------------------------------
t2_rows | 9
(1 row)
---------------------------------------------------------------------
-- T3) Subquery targetlist uses a scalar sublink into the outer CTE
-- (use MAX() to keep scalar subquery single-row)
---------------------------------------------------------------------
TRUNCATE agg_events;
WITH c AS (SELECT user_id, value_1 FROM raw_events_first)
INSERT INTO agg_events (user_id, value_1_agg)
SELECT d.user_id, d.value_1_agg
FROM (
SELECT t.user_id,
(SELECT max(c.value_1) FROM c WHERE c.user_id = t.user_id) AS value_1_agg
FROM raw_events_first t
) AS d
WHERE d.value_1_agg IS NOT NULL;
-- Expect one insert per row in raw_events_first whose user_id has at least one non-NULL value_1
SELECT 't3_count_matches' AS test,
(SELECT count(*) FROM agg_events) =
(
SELECT count(*)
FROM raw_events_first t
WHERE EXISTS (
SELECT 1 FROM raw_events_first c
WHERE c.user_id = t.user_id AND c.value_1 IS NOT NULL
)
) AS ok;
test | ok
---------------------------------------------------------------------
t3_count_matches | t
(1 row)
-- Also verify no NULLs were inserted into value_1_agg
SELECT 't3_no_null_value_1_agg' AS test,
NOT EXISTS (SELECT 1 FROM agg_events WHERE value_1_agg IS NULL) AS ok;
test | ok
---------------------------------------------------------------------
t3_no_null_value_1_agg | t
(1 row)
-- Deterministic sample of results
SELECT 't3_sample' AS test, user_id, value_1_agg
FROM agg_events
ORDER BY user_id
LIMIT 5;
test | user_id | value_1_agg
---------------------------------------------------------------------
t3_sample | 1 | 20
t3_sample | 1 | 20
t3_sample | 1 | 20
t3_sample | 3 | 30
t3_sample | 5 | 50
(5 rows)
-- End of PR #8106 — CTE traversal works when following outer Vars
SET client_min_messages TO ERROR; SET client_min_messages TO ERROR;
DROP SCHEMA pr8106_cte_outervar CASCADE;
DROP SCHEMA multi_insert_select CASCADE; DROP SCHEMA multi_insert_select CASCADE;

View File

@ -2583,5 +2583,127 @@ SELECT id, val FROM version_dist_union ORDER BY id;
-- End of Issue #7784 -- End of Issue #7784
-- PR #8106 — CTE traversal works when following outer Vars
-- This script exercises three shapes:
-- T1) CTE referenced inside a correlated subquery (one level down)
-- T2) CTE referenced inside a nested subquery (two levels down)
-- T3) Subquery targetlist uses a scalar sublink into the outer CTE
CREATE SCHEMA pr8106_cte_outervar;
SET search_path = pr8106_cte_outervar, public;
-- Base tables for the tests
DROP TABLE IF EXISTS raw_events_first CASCADE;
DROP TABLE IF EXISTS agg_events CASCADE;
CREATE TABLE raw_events_first(
user_id int,
value_1 int
);
CREATE TABLE agg_events(
user_id int,
value_1_agg int
);
-- Distribute and colocate (distribution key = user_id)
SELECT create_distributed_table('raw_events_first', 'user_id');
SELECT create_distributed_table('agg_events', 'user_id');
-- Seed data (duplicates on some user_ids; some NULL value_1s)
INSERT INTO raw_events_first(user_id, value_1) VALUES
(1, 10), (1, 20), (1, NULL),
(2, NULL),
(3, 30),
(4, NULL),
(5, 50), (5, NULL),
(6, NULL);
----------------------------------------------------------------------
-- T1) CTE referenced inside a correlated subquery (one level down)
----------------------------------------------------------------------
TRUNCATE agg_events;
WITH c AS MATERIALIZED (
SELECT user_id FROM raw_events_first
)
INSERT INTO agg_events (user_id)
SELECT t.user_id
FROM raw_events_first t
WHERE EXISTS (SELECT 1 FROM c WHERE c.user_id = t.user_id);
-- Expect one insert per row in raw_events_first (EXISTS always true per user_id)
SELECT 't1_count_matches' AS test,
(SELECT count(*) FROM agg_events) =
(SELECT count(*) FROM raw_events_first) AS ok;
-- Spot-check: how many rows were inserted
SELECT 't1_rows' AS test, count(*) AS rows FROM agg_events;
----------------------------------------------------------------------
-- T2) CTE referenced inside a nested subquery (two levels down)
----------------------------------------------------------------------
TRUNCATE agg_events;
WITH c AS MATERIALIZED (
SELECT user_id FROM raw_events_first
)
INSERT INTO agg_events (user_id)
SELECT t.user_id
FROM raw_events_first t
WHERE EXISTS (
SELECT 1
FROM (SELECT user_id FROM c) c2
WHERE c2.user_id = t.user_id
);
-- Same cardinality expectation as T1
SELECT 't2_count_matches' AS test,
(SELECT count(*) FROM agg_events) =
(SELECT count(*) FROM raw_events_first) AS ok;
SELECT 't2_rows' AS test, count(*) AS rows FROM agg_events;
----------------------------------------------------------------------
-- T3) Subquery targetlist uses a scalar sublink into the outer CTE
-- (use MAX() to keep scalar subquery single-row)
----------------------------------------------------------------------
TRUNCATE agg_events;
WITH c AS (SELECT user_id, value_1 FROM raw_events_first)
INSERT INTO agg_events (user_id, value_1_agg)
SELECT d.user_id, d.value_1_agg
FROM (
SELECT t.user_id,
(SELECT max(c.value_1) FROM c WHERE c.user_id = t.user_id) AS value_1_agg
FROM raw_events_first t
) AS d
WHERE d.value_1_agg IS NOT NULL;
-- Expect one insert per row in raw_events_first whose user_id has at least one non-NULL value_1
SELECT 't3_count_matches' AS test,
(SELECT count(*) FROM agg_events) =
(
SELECT count(*)
FROM raw_events_first t
WHERE EXISTS (
SELECT 1 FROM raw_events_first c
WHERE c.user_id = t.user_id AND c.value_1 IS NOT NULL
)
) AS ok;
-- Also verify no NULLs were inserted into value_1_agg
SELECT 't3_no_null_value_1_agg' AS test,
NOT EXISTS (SELECT 1 FROM agg_events WHERE value_1_agg IS NULL) AS ok;
-- Deterministic sample of results
SELECT 't3_sample' AS test, user_id, value_1_agg
FROM agg_events
ORDER BY user_id
LIMIT 5;
-- End of PR #8106 — CTE traversal works when following outer Vars
SET client_min_messages TO ERROR; SET client_min_messages TO ERROR;
DROP SCHEMA pr8106_cte_outervar CASCADE;
DROP SCHEMA multi_insert_select CASCADE; DROP SCHEMA multi_insert_select CASCADE;