diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index c4c11f4eb..97ea3381c 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -4560,11 +4560,10 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query * else if (rangeTableEntry->rtekind == RTE_CTE) { /* - * When outerVars are considered, we modify parentQueryList, so this - * logic might need to change when we support outervars in CTEs. + * Resolve through a CTE even when skipOuterVars == false. + * Maintain the invariant that each recursion level owns a private, + * correctly-bounded copy of parentQueryList. */ - Assert(skipOuterVars); - int cteParentListIndex = list_length(parentQueryList) - rangeTableEntry->ctelevelsup - 1; Query *cteParentQuery = NULL; @@ -4595,14 +4594,34 @@ FindReferencedTableColumn(Expr *columnExpression, List *parentQueryList, Query * if (cte != NULL) { Query *cteQuery = (Query *) cte->ctequery; - List *targetEntryList = cteQuery->targetList; AttrNumber targetEntryIndex = candidateColumn->varattno - 1; - TargetEntry *targetEntry = list_nth(targetEntryList, targetEntryIndex); - parentQueryList = lappend(parentQueryList, query); - FindReferencedTableColumn(targetEntry->expr, parentQueryList, - cteQuery, column, rteContainingReferencedColumn, - skipOuterVars); + if (targetEntryIndex >= 0 && + targetEntryIndex < list_length(cteQuery->targetList)) + { + TargetEntry *targetEntry = + list_nth(cteQuery->targetList, targetEntryIndex); + + /* Build a private, bounded parentQueryList before recursing into the CTE. + * Invariant: list is [top … current], owned by this call (no aliasing). + * For RTE_CTE: + * owner_idx = list_length(parentQueryList) - rangeTableEntry->ctelevelsup - 1; + * newParent = lappend(list_truncate(list_copy(parentQueryList), owner_idx + 1), query); + * Example (Q0 owns CTE; we’re in Q2 via nested subquery): + * parent=[Q0,Q1,Q2], ctelevelsup=2 ⇒ owner_idx=0 ⇒ newParent=[Q0,Q2]. + * Keeps outer-Var level math correct without mutating the caller’s list. + */ + List *newParent = list_copy(parentQueryList); + newParent = list_truncate(newParent, cteParentListIndex + 1); + newParent = lappend(newParent, query); + + FindReferencedTableColumn(targetEntry->expr, + newParent, + cteQuery, + column, + rteContainingReferencedColumn, + skipOuterVars); + } } } } diff --git a/src/test/regress/expected/multi_insert_select.out b/src/test/regress/expected/multi_insert_select.out index 00e335a82..76538f22d 100644 --- a/src/test/regress/expected/multi_insert_select.out +++ b/src/test/regress/expected/multi_insert_select.out @@ -3637,5 +3637,157 @@ SELECT id, val FROM version_dist_union ORDER BY id; (6 rows) -- End of Issue #7784 +-- PR #8106 — CTE traversal works when following outer Vars +-- This script exercises three shapes: +-- T1) CTE referenced inside a correlated subquery (one level down) +-- T2) CTE referenced inside a nested subquery (two levels down) +-- T3) Subquery targetlist uses a scalar sublink into the outer CTE +CREATE SCHEMA pr8106_cte_outervar; +SET search_path = pr8106_cte_outervar, public; +-- Base tables for the tests +DROP TABLE IF EXISTS raw_events_first CASCADE; +NOTICE: table "raw_events_first" does not exist, skipping +DROP TABLE IF EXISTS agg_events CASCADE; +NOTICE: table "agg_events" does not exist, skipping +CREATE TABLE raw_events_first( + user_id int, + value_1 int +); +CREATE TABLE agg_events( + user_id int, + value_1_agg int +); +-- Distribute and colocate (distribution key = user_id) +SELECT create_distributed_table('raw_events_first', 'user_id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('agg_events', 'user_id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Seed data (duplicates on some user_ids; some NULL value_1’s) +INSERT INTO raw_events_first(user_id, value_1) VALUES + (1, 10), (1, 20), (1, NULL), + (2, NULL), + (3, 30), + (4, NULL), + (5, 50), (5, NULL), + (6, NULL); +--------------------------------------------------------------------- +-- T1) CTE referenced inside a correlated subquery (one level down) +--------------------------------------------------------------------- +TRUNCATE agg_events; +WITH c AS MATERIALIZED ( + SELECT user_id FROM raw_events_first +) +INSERT INTO agg_events (user_id) +SELECT t.user_id +FROM raw_events_first t +WHERE EXISTS (SELECT 1 FROM c WHERE c.user_id = t.user_id); +-- Expect one insert per row in raw_events_first (EXISTS always true per user_id) +SELECT 't1_count_matches' AS test, + (SELECT count(*) FROM agg_events) = + (SELECT count(*) FROM raw_events_first) AS ok; + test | ok +--------------------------------------------------------------------- + t1_count_matches | t +(1 row) + +-- Spot-check: how many rows were inserted +SELECT 't1_rows' AS test, count(*) AS rows FROM agg_events; + test | rows +--------------------------------------------------------------------- + t1_rows | 9 +(1 row) + +--------------------------------------------------------------------- +-- T2) CTE referenced inside a nested subquery (two levels down) +--------------------------------------------------------------------- +TRUNCATE agg_events; +WITH c AS MATERIALIZED ( + SELECT user_id FROM raw_events_first +) +INSERT INTO agg_events (user_id) +SELECT t.user_id +FROM raw_events_first t +WHERE EXISTS ( + SELECT 1 + FROM (SELECT user_id FROM c) c2 + WHERE c2.user_id = t.user_id +); +-- Same cardinality expectation as T1 +SELECT 't2_count_matches' AS test, + (SELECT count(*) FROM agg_events) = + (SELECT count(*) FROM raw_events_first) AS ok; + test | ok +--------------------------------------------------------------------- + t2_count_matches | t +(1 row) + +SELECT 't2_rows' AS test, count(*) AS rows FROM agg_events; + test | rows +--------------------------------------------------------------------- + t2_rows | 9 +(1 row) + +--------------------------------------------------------------------- +-- T3) Subquery targetlist uses a scalar sublink into the outer CTE +-- (use MAX() to keep scalar subquery single-row) +--------------------------------------------------------------------- +TRUNCATE agg_events; +WITH c AS (SELECT user_id, value_1 FROM raw_events_first) +INSERT INTO agg_events (user_id, value_1_agg) +SELECT d.user_id, d.value_1_agg +FROM ( + SELECT t.user_id, + (SELECT max(c.value_1) FROM c WHERE c.user_id = t.user_id) AS value_1_agg + FROM raw_events_first t +) AS d +WHERE d.value_1_agg IS NOT NULL; +-- Expect one insert per row in raw_events_first whose user_id has at least one non-NULL value_1 +SELECT 't3_count_matches' AS test, + (SELECT count(*) FROM agg_events) = + ( + SELECT count(*) + FROM raw_events_first t + WHERE EXISTS ( + SELECT 1 FROM raw_events_first c + WHERE c.user_id = t.user_id AND c.value_1 IS NOT NULL + ) + ) AS ok; + test | ok +--------------------------------------------------------------------- + t3_count_matches | t +(1 row) + +-- Also verify no NULLs were inserted into value_1_agg +SELECT 't3_no_null_value_1_agg' AS test, + NOT EXISTS (SELECT 1 FROM agg_events WHERE value_1_agg IS NULL) AS ok; + test | ok +--------------------------------------------------------------------- + t3_no_null_value_1_agg | t +(1 row) + +-- Deterministic sample of results +SELECT 't3_sample' AS test, user_id, value_1_agg +FROM agg_events +ORDER BY user_id +LIMIT 5; + test | user_id | value_1_agg +--------------------------------------------------------------------- + t3_sample | 1 | 20 + t3_sample | 1 | 20 + t3_sample | 1 | 20 + t3_sample | 3 | 30 + t3_sample | 5 | 50 +(5 rows) + +-- End of PR #8106 — CTE traversal works when following outer Vars SET client_min_messages TO ERROR; +DROP SCHEMA pr8106_cte_outervar CASCADE; DROP SCHEMA multi_insert_select CASCADE; diff --git a/src/test/regress/sql/multi_insert_select.sql b/src/test/regress/sql/multi_insert_select.sql index eabadda7c..19ae70abc 100644 --- a/src/test/regress/sql/multi_insert_select.sql +++ b/src/test/regress/sql/multi_insert_select.sql @@ -2583,5 +2583,127 @@ SELECT id, val FROM version_dist_union ORDER BY id; -- End of Issue #7784 +-- PR #8106 — CTE traversal works when following outer Vars +-- This script exercises three shapes: +-- T1) CTE referenced inside a correlated subquery (one level down) +-- T2) CTE referenced inside a nested subquery (two levels down) +-- T3) Subquery targetlist uses a scalar sublink into the outer CTE + +CREATE SCHEMA pr8106_cte_outervar; +SET search_path = pr8106_cte_outervar, public; + +-- Base tables for the tests +DROP TABLE IF EXISTS raw_events_first CASCADE; +DROP TABLE IF EXISTS agg_events CASCADE; + +CREATE TABLE raw_events_first( + user_id int, + value_1 int +); + +CREATE TABLE agg_events( + user_id int, + value_1_agg int +); + +-- Distribute and colocate (distribution key = user_id) +SELECT create_distributed_table('raw_events_first', 'user_id'); +SELECT create_distributed_table('agg_events', 'user_id'); + +-- Seed data (duplicates on some user_ids; some NULL value_1’s) +INSERT INTO raw_events_first(user_id, value_1) VALUES + (1, 10), (1, 20), (1, NULL), + (2, NULL), + (3, 30), + (4, NULL), + (5, 50), (5, NULL), + (6, NULL); + +---------------------------------------------------------------------- +-- T1) CTE referenced inside a correlated subquery (one level down) +---------------------------------------------------------------------- +TRUNCATE agg_events; + +WITH c AS MATERIALIZED ( + SELECT user_id FROM raw_events_first +) +INSERT INTO agg_events (user_id) +SELECT t.user_id +FROM raw_events_first t +WHERE EXISTS (SELECT 1 FROM c WHERE c.user_id = t.user_id); + +-- Expect one insert per row in raw_events_first (EXISTS always true per user_id) +SELECT 't1_count_matches' AS test, + (SELECT count(*) FROM agg_events) = + (SELECT count(*) FROM raw_events_first) AS ok; + +-- Spot-check: how many rows were inserted +SELECT 't1_rows' AS test, count(*) AS rows FROM agg_events; + +---------------------------------------------------------------------- +-- T2) CTE referenced inside a nested subquery (two levels down) +---------------------------------------------------------------------- +TRUNCATE agg_events; + +WITH c AS MATERIALIZED ( + SELECT user_id FROM raw_events_first +) +INSERT INTO agg_events (user_id) +SELECT t.user_id +FROM raw_events_first t +WHERE EXISTS ( + SELECT 1 + FROM (SELECT user_id FROM c) c2 + WHERE c2.user_id = t.user_id +); + +-- Same cardinality expectation as T1 +SELECT 't2_count_matches' AS test, + (SELECT count(*) FROM agg_events) = + (SELECT count(*) FROM raw_events_first) AS ok; + +SELECT 't2_rows' AS test, count(*) AS rows FROM agg_events; + +---------------------------------------------------------------------- +-- T3) Subquery targetlist uses a scalar sublink into the outer CTE +-- (use MAX() to keep scalar subquery single-row) +---------------------------------------------------------------------- +TRUNCATE agg_events; + +WITH c AS (SELECT user_id, value_1 FROM raw_events_first) +INSERT INTO agg_events (user_id, value_1_agg) +SELECT d.user_id, d.value_1_agg +FROM ( + SELECT t.user_id, + (SELECT max(c.value_1) FROM c WHERE c.user_id = t.user_id) AS value_1_agg + FROM raw_events_first t +) AS d +WHERE d.value_1_agg IS NOT NULL; + +-- Expect one insert per row in raw_events_first whose user_id has at least one non-NULL value_1 +SELECT 't3_count_matches' AS test, + (SELECT count(*) FROM agg_events) = + ( + SELECT count(*) + FROM raw_events_first t + WHERE EXISTS ( + SELECT 1 FROM raw_events_first c + WHERE c.user_id = t.user_id AND c.value_1 IS NOT NULL + ) + ) AS ok; + +-- Also verify no NULLs were inserted into value_1_agg +SELECT 't3_no_null_value_1_agg' AS test, + NOT EXISTS (SELECT 1 FROM agg_events WHERE value_1_agg IS NULL) AS ok; + +-- Deterministic sample of results +SELECT 't3_sample' AS test, user_id, value_1_agg +FROM agg_events +ORDER BY user_id +LIMIT 5; + +-- End of PR #8106 — CTE traversal works when following outer Vars + SET client_min_messages TO ERROR; +DROP SCHEMA pr8106_cte_outervar CASCADE; DROP SCHEMA multi_insert_select CASCADE;