From 91cae1fb29fbe8047920e4991f04415ab370f062 Mon Sep 17 00:00:00 2001 From: Colm Date: Wed, 27 Aug 2025 13:32:02 +0100 Subject: [PATCH] Fix bug in redundant WHERE clause detection. (#8162) Need to also check Postgres plan's rangetables for relations used in Initplans. DESCRIPTION: Fix a bug in redundant WHERE clause detection; we need to additionally check the Postgres plan's range tables for the presence of citus tables, to account for relations that are referenced from scalar subqueries. There is a fundamental flaw in 4139370, the assumption that, after Postgres planning has completed, all tables used in a query can be obtained by walking the query tree. This is not the case for scalar subqueries, which will be referenced by `PARAM` nodes. The fix adds an additional check of the Postgres plan range tables; if there is at least one citus table in there we do not need to change the needs distributed planning flag. Fixes #8159 --- .../distributed/planner/distributed_planner.c | 38 +++++++++++----- src/test/regress/citus_tests/run_test.py | 3 ++ .../regress/expected/subquery_in_where.out | 43 +++++++++++++++++++ src/test/regress/sql/subquery_in_where.sql | 32 ++++++++++++++ 4 files changed, 105 insertions(+), 11 deletions(-) diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index 098aaeb13..a15c3eede 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -141,10 +141,9 @@ static RouterPlanType GetRouterPlanType(Query *query, bool hasUnresolvedParams); static void ConcatenateRTablesAndPerminfos(PlannedStmt *mainPlan, PlannedStmt *concatPlan); -static bool CheckPostPlanDistribution(bool isDistributedQuery, - Query *origQuery, - List *rangeTableList, - Query *plannedQuery); +static bool CheckPostPlanDistribution(DistributedPlanningContext *planContext, + bool isDistributedQuery, + List *rangeTableList); /* Distributed planner hook */ PlannedStmt * @@ -265,10 +264,9 @@ distributed_planner(Query *parse, planContext.plan = standard_planner(planContext.query, NULL, planContext.cursorOptions, planContext.boundParams); - needsDistributedPlanning = CheckPostPlanDistribution(needsDistributedPlanning, - planContext.originalQuery, - rangeTableList, - planContext.query); + needsDistributedPlanning = CheckPostPlanDistribution(&planContext, + needsDistributedPlanning, + rangeTableList); if (needsDistributedPlanning) { @@ -2740,12 +2738,13 @@ WarnIfListHasForeignDistributedTable(List *rangeTableList) static bool -CheckPostPlanDistribution(bool isDistributedQuery, - Query *origQuery, List *rangeTableList, - Query *plannedQuery) +CheckPostPlanDistribution(DistributedPlanningContext *planContext, bool + isDistributedQuery, List *rangeTableList) { if (isDistributedQuery) { + Query *origQuery = planContext->originalQuery; + Query *plannedQuery = planContext->query; Node *origQuals = origQuery->jointree->quals; Node *plannedQuals = plannedQuery->jointree->quals; @@ -2764,6 +2763,23 @@ CheckPostPlanDistribution(bool isDistributedQuery, */ if (origQuals != NULL && plannedQuals == NULL) { + bool planHasDistTable = ListContainsDistributedTableRTE( + planContext->plan->rtable, NULL); + + /* + * If the Postgres plan has a distributed table, we know for sure that + * the query requires distributed planning. + */ + if (planHasDistTable) + { + return true; + } + + /* + * Otherwise, if the query has less range table entries after Postgres, + * planning, we should re-evaluate the distribution of the query. Postgres + * may have optimized away all citus tables, per issues 7782, 7783. + */ List *rtesPostPlan = ExtractRangeTableEntryList(plannedQuery); if (list_length(rtesPostPlan) < list_length(rangeTableList)) { diff --git a/src/test/regress/citus_tests/run_test.py b/src/test/regress/citus_tests/run_test.py index c13f2a6d5..e5a567af8 100755 --- a/src/test/regress/citus_tests/run_test.py +++ b/src/test/regress/citus_tests/run_test.py @@ -262,6 +262,9 @@ DEPS = { "multi_subquery_in_where_reference_clause": TestDeps( "minimal_schedule", ["multi_behavioral_analytics_create_table"] ), + "subquery_in_where": TestDeps( + "minimal_schedule", ["multi_behavioral_analytics_create_table"] + ), } diff --git a/src/test/regress/expected/subquery_in_where.out b/src/test/regress/expected/subquery_in_where.out index 901954265..361415800 100644 --- a/src/test/regress/expected/subquery_in_where.out +++ b/src/test/regress/expected/subquery_in_where.out @@ -1253,10 +1253,53 @@ SELECT vkey, pkey FROM t3; --------------------------------------------------------------------- (0 rows) +-- Redundant WHERE clause with distributed parititioned table +CREATE TABLE a (a int); +INSERT INTO a VALUES (1); +-- populated distributed partitioned table +create table partitioned_table (a INT UNIQUE) PARTITION BY RANGE(a); +CREATE TABLE par_1 PARTITION OF partitioned_table FOR VALUES FROM (1) TO (41); +CREATE TABLE par_2 PARTITION OF partitioned_table FOR VALUES FROM (41) TO (81); +CREATE TABLE par_3 PARTITION OF partitioned_table FOR VALUES FROM (81) TO (121); +CREATE TABLE par_4 PARTITION OF partitioned_table FOR VALUES FROM (121) TO (161); +SELECT create_distributed_table('partitioned_table', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +insert into partitioned_table(a) select i from generate_series(1,160) i; +-- test citus table in init plan +-- with redundant WHERE clause +SELECT CASE WHEN EXISTS ( + SELECT * FROM partitioned_table + ) THEN 1 ELSE 0 END AS table_non_empty +FROM a +WHERE true; + table_non_empty +--------------------------------------------------------------------- + 1 +(1 row) + +-- test citus table in init plan +-- with redundant WHERE clause involving +-- a citus table +SELECT CASE WHEN EXISTS ( + SELECT * FROM partitioned_table + ) THEN 1 ELSE 0 END AS table_non_empty +FROM a +WHERE true OR NOT EXISTS (SELECT 1 FROM t1); + table_non_empty +--------------------------------------------------------------------- + 1 +(1 row) + DROP TABLE local_table; DROP TABLE t0; DROP TABLE t1; DROP TABLE t3; DROP TABLE t7; +DROP TABLE a; +DROP TABLE partitioned_table CASCADE; DROP SCHEMA subquery_in_where CASCADE; SET search_path TO public; diff --git a/src/test/regress/sql/subquery_in_where.sql b/src/test/regress/sql/subquery_in_where.sql index ebdb60890..3a07cdc75 100644 --- a/src/test/regress/sql/subquery_in_where.sql +++ b/src/test/regress/sql/subquery_in_where.sql @@ -929,10 +929,42 @@ where TRUE or (((t3.vkey) >= (select -- Distributed table t3 is now empty SELECT vkey, pkey FROM t3; +-- Redundant WHERE clause with distributed parititioned table +CREATE TABLE a (a int); +INSERT INTO a VALUES (1); + +-- populated distributed partitioned table +create table partitioned_table (a INT UNIQUE) PARTITION BY RANGE(a); +CREATE TABLE par_1 PARTITION OF partitioned_table FOR VALUES FROM (1) TO (41); +CREATE TABLE par_2 PARTITION OF partitioned_table FOR VALUES FROM (41) TO (81); +CREATE TABLE par_3 PARTITION OF partitioned_table FOR VALUES FROM (81) TO (121); +CREATE TABLE par_4 PARTITION OF partitioned_table FOR VALUES FROM (121) TO (161); +SELECT create_distributed_table('partitioned_table', 'a'); +insert into partitioned_table(a) select i from generate_series(1,160) i; + +-- test citus table in init plan +-- with redundant WHERE clause +SELECT CASE WHEN EXISTS ( + SELECT * FROM partitioned_table + ) THEN 1 ELSE 0 END AS table_non_empty +FROM a +WHERE true; + +-- test citus table in init plan +-- with redundant WHERE clause involving +-- a citus table +SELECT CASE WHEN EXISTS ( + SELECT * FROM partitioned_table + ) THEN 1 ELSE 0 END AS table_non_empty +FROM a +WHERE true OR NOT EXISTS (SELECT 1 FROM t1); + DROP TABLE local_table; DROP TABLE t0; DROP TABLE t1; DROP TABLE t3; DROP TABLE t7; +DROP TABLE a; +DROP TABLE partitioned_table CASCADE; DROP SCHEMA subquery_in_where CASCADE; SET search_path TO public;