From 20eec72e4f54b67648c2f359c6eaca102aa35dcd Mon Sep 17 00:00:00 2001 From: Mehmet Yilmaz Date: Thu, 7 Nov 2024 12:30:29 +0000 Subject: [PATCH] dml_recursive update --- src/test/regress/expected/dml_recursive.out | 6 + src/test/regress/expected/dml_recursive_0.out | 394 ++++++++++++++++++ src/test/regress/sql/dml_recursive.sql | 6 + 3 files changed, 406 insertions(+) create mode 100644 src/test/regress/expected/dml_recursive_0.out diff --git a/src/test/regress/expected/dml_recursive.out b/src/test/regress/expected/dml_recursive.out index cc4058def..3f72b00f1 100644 --- a/src/test/regress/expected/dml_recursive.out +++ b/src/test/regress/expected/dml_recursive.out @@ -1,3 +1,9 @@ +-- Two alternative test outputs: +-- dml_recursive.out for PG16 and before +-- dml_recursive_0.out for PG17 +-- related commit +-- PostgreSQL 17 includes an enhancement that allows the optimizer to transform correlated IN subqueries into more efficient join operations. +-- https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=9f1337639 CREATE SCHEMA recursive_dml_queries; SET search_path TO recursive_dml_queries, public; SET citus.next_shard_id TO 2370000; diff --git a/src/test/regress/expected/dml_recursive_0.out b/src/test/regress/expected/dml_recursive_0.out new file mode 100644 index 000000000..de33a9175 --- /dev/null +++ b/src/test/regress/expected/dml_recursive_0.out @@ -0,0 +1,394 @@ +-- Two alternative test outputs: +-- dml_recursive.out for PG16 and before +-- dml_recursive_0.out for PG17 +-- related commit +-- PostgreSQL 17 includes an enhancement that allows the optimizer to transform correlated IN subqueries into more efficient join operations. +-- https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=9f1337639 +CREATE SCHEMA recursive_dml_queries; +SET search_path TO recursive_dml_queries, public; +SET citus.next_shard_id TO 2370000; +CREATE TABLE recursive_dml_queries.distributed_table (tenant_id text, dept int, info jsonb); +SELECT create_distributed_table('distributed_table', 'tenant_id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE recursive_dml_queries.second_distributed_table (tenant_id text, dept int, info jsonb); +SELECT create_distributed_table('second_distributed_table', 'tenant_id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE recursive_dml_queries.reference_table (id text, name text); +SELECT create_reference_table('reference_table'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE recursive_dml_queries.local_table (id text, name text); +INSERT INTO distributed_table SELECT i::text, i % 10, row_to_json(row(i, i*i)) FROM generate_series (0, 100) i; +INSERT INTO second_distributed_table SELECT i::text, i % 10, row_to_json(row(i, i*i)) FROM generate_series (0, 100) i; +INSERT INTO reference_table SELECT i::text, 'user_' || i FROM generate_series (0, 100) i; +INSERT INTO local_table SELECT i::text, 'user_' || i FROM generate_series (0, 100) i; +CREATE VIEW tenant_ids AS + SELECT + tenant_id, name + FROM + distributed_table, reference_table + WHERE + distributed_table.dept::text = reference_table.id + ORDER BY 2 DESC, 1 DESC; +SET client_min_messages TO DEBUG1; +-- the subquery foo is recursively planned +UPDATE + reference_table +SET + name = 'new_' || name +FROM +( + SELECT + avg(second_distributed_table.tenant_id::int) as avg_tenant_id + FROM + second_distributed_table +) as foo +WHERE + foo.avg_tenant_id::int::text = reference_table.id +RETURNING + reference_table.name; +DEBUG: generating subplan XXX_1 for subquery SELECT avg((tenant_id)::integer) AS avg_tenant_id FROM recursive_dml_queries.second_distributed_table +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.reference_table SET name = ('new_'::text OPERATOR(pg_catalog.||) reference_table.name) FROM (SELECT intermediate_result.avg_tenant_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(avg_tenant_id numeric)) foo WHERE (((foo.avg_tenant_id)::integer)::text OPERATOR(pg_catalog.=) reference_table.id) RETURNING reference_table.name + name +--------------------------------------------------------------------- + new_user_50 +(1 row) + +-- the subquery foo is recursively planned +-- but note that the subquery foo itself is pushdownable +UPDATE + second_distributed_table +SET + dept = foo.max_dept * 2 +FROM +( + SELECT DISTINCT ON (tenant_id) tenant_id, max(dept) as max_dept FROM + ( + SELECT + second_distributed_table.dept, second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + ) foo_inner + GROUP BY + tenant_id + ORDER BY 1 DESC +) as foo +WHERE + foo.tenant_id != second_distributed_table.tenant_id + AND second_distributed_table.dept IN (2) +RETURNING + second_distributed_table.tenant_id, second_distributed_table.dept; +DEBUG: generating subplan XXX_1 for subquery SELECT DISTINCT ON (tenant_id) tenant_id, max(dept) AS max_dept FROM (SELECT second_distributed_table.dept, second_distributed_table.tenant_id FROM recursive_dml_queries.second_distributed_table, recursive_dml_queries.distributed_table WHERE (distributed_table.tenant_id OPERATOR(pg_catalog.=) second_distributed_table.tenant_id)) foo_inner GROUP BY tenant_id ORDER BY tenant_id DESC +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.second_distributed_table SET dept = (foo.max_dept OPERATOR(pg_catalog.*) 2) FROM (SELECT intermediate_result.tenant_id, intermediate_result.max_dept FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(tenant_id text, max_dept integer)) foo WHERE ((foo.tenant_id OPERATOR(pg_catalog.<>) second_distributed_table.tenant_id) AND (second_distributed_table.dept OPERATOR(pg_catalog.=) 2)) RETURNING second_distributed_table.tenant_id, second_distributed_table.dept + tenant_id | dept +--------------------------------------------------------------------- + 12 | 18 + 2 | 18 + 22 | 18 + 32 | 18 + 42 | 18 + 52 | 18 + 62 | 18 + 72 | 18 + 82 | 18 + 92 | 18 +(10 rows) + +-- the subquery foo is recursively planned +-- and foo itself is a non colocated subquery and recursively planned +UPDATE + second_distributed_table +SET + dept = foo.tenant_id::int / 4 +FROM +( + SELECT DISTINCT foo_inner_1.tenant_id FROM + ( + SELECT + second_distributed_table.dept, second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + AND + second_distributed_table.dept IN (3,4) + ) foo_inner_1, + ( + SELECT + second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + AND + second_distributed_table.dept IN (4,5) + )foo_inner_2 + WHERE foo_inner_1.tenant_id != foo_inner_2.tenant_id +) as foo +WHERE + foo.tenant_id != second_distributed_table.tenant_id + AND second_distributed_table.dept IN (3); +DEBUG: generating subplan XXX_1 for subquery SELECT second_distributed_table.tenant_id FROM recursive_dml_queries.second_distributed_table, recursive_dml_queries.distributed_table WHERE ((distributed_table.tenant_id OPERATOR(pg_catalog.=) second_distributed_table.tenant_id) AND (second_distributed_table.dept OPERATOR(pg_catalog.=) ANY (ARRAY[4, 5]))) +DEBUG: generating subplan XXX_2 for subquery SELECT DISTINCT foo_inner_1.tenant_id FROM (SELECT second_distributed_table.dept, second_distributed_table.tenant_id FROM recursive_dml_queries.second_distributed_table, recursive_dml_queries.distributed_table WHERE ((distributed_table.tenant_id OPERATOR(pg_catalog.=) second_distributed_table.tenant_id) AND (second_distributed_table.dept OPERATOR(pg_catalog.=) ANY (ARRAY[3, 4])))) foo_inner_1, (SELECT intermediate_result.tenant_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(tenant_id text)) foo_inner_2 WHERE (foo_inner_1.tenant_id OPERATOR(pg_catalog.<>) foo_inner_2.tenant_id) +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.second_distributed_table SET dept = ((foo.tenant_id)::integer OPERATOR(pg_catalog./) 4) FROM (SELECT intermediate_result.tenant_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(tenant_id text)) foo WHERE ((foo.tenant_id OPERATOR(pg_catalog.<>) second_distributed_table.tenant_id) AND (second_distributed_table.dept OPERATOR(pg_catalog.=) 3)) +-- we currently do not allow local tables in modification queries +UPDATE + distributed_table +SET + dept = avg_tenant_id::int +FROM +( + SELECT + avg(local_table.id::int) as avg_tenant_id + FROM + local_table +) as foo +WHERE + foo.avg_tenant_id::int::text = distributed_table.tenant_id +RETURNING + distributed_table.*; +DEBUG: generating subplan XXX_1 for subquery SELECT avg((id)::integer) AS avg_tenant_id FROM recursive_dml_queries.local_table +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.distributed_table SET dept = (foo.avg_tenant_id)::integer FROM (SELECT intermediate_result.avg_tenant_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(avg_tenant_id numeric)) foo WHERE (((foo.avg_tenant_id)::integer)::text OPERATOR(pg_catalog.=) distributed_table.tenant_id) RETURNING distributed_table.tenant_id, distributed_table.dept, distributed_table.info + tenant_id | dept | info +--------------------------------------------------------------------- + 50 | 50 | {"f1": 50, "f2": 2500} +(1 row) + +-- we currently do not allow views in modification queries +UPDATE + distributed_table +SET + dept = avg_tenant_id::int +FROM +( + SELECT + avg(tenant_id::int) as avg_tenant_id + FROM + tenant_ids +) as foo +WHERE + foo.avg_tenant_id::int::text = distributed_table.tenant_id +RETURNING + distributed_table.*; +DEBUG: generating subplan XXX_1 for subquery SELECT avg((tenant_id)::integer) AS avg_tenant_id FROM (SELECT distributed_table.tenant_id, reference_table.name FROM recursive_dml_queries.distributed_table, recursive_dml_queries.reference_table WHERE ((distributed_table.dept)::text OPERATOR(pg_catalog.=) reference_table.id) ORDER BY reference_table.name DESC, distributed_table.tenant_id DESC) tenant_ids +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.distributed_table SET dept = (foo.avg_tenant_id)::integer FROM (SELECT intermediate_result.avg_tenant_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(avg_tenant_id numeric)) foo WHERE (((foo.avg_tenant_id)::integer)::text OPERATOR(pg_catalog.=) distributed_table.tenant_id) RETURNING distributed_table.tenant_id, distributed_table.dept, distributed_table.info + tenant_id | dept | info +--------------------------------------------------------------------- + 50 | 50 | {"f1": 50, "f2": 2500} +(1 row) + +-- there is a lateral join (e.g., correlated subquery) thus the subqueries cannot be +-- recursively planned, however it can be planned using the repartition planner +SET citus.enable_repartition_joins to on; +SELECT DISTINCT foo_inner_1.tenant_id FROM +( + SELECT + second_distributed_table.dept, second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + AND + second_distributed_table.dept IN (3,4) +) +foo_inner_1 JOIN LATERAL +( + SELECT + second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + AND foo_inner_1.dept = second_distributed_table.dept + AND + second_distributed_table.dept IN (4,5) +) foo_inner_2 +ON (foo_inner_2.tenant_id != foo_inner_1.tenant_id) +ORDER BY foo_inner_1.tenant_id; + tenant_id +--------------------------------------------------------------------- + 14 + 24 + 34 + 4 + 44 + 54 + 64 + 74 + 84 + 94 +(10 rows) + +RESET citus.enable_repartition_joins; +-- there is a lateral join (e.g., correlated subquery) thus the subqueries cannot be +-- recursively planned, this one can not be planned by the repartition planner +-- because of the IN query on a non unique column +UPDATE + second_distributed_table +SET + dept = foo.tenant_id::int / 4 +FROM +( + SELECT DISTINCT foo_inner_1.tenant_id FROM + ( + SELECT + second_distributed_table.dept, second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + AND + second_distributed_table.dept IN (select dept from second_distributed_table)) + foo_inner_1 JOIN LATERAL + ( + SELECT + second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table + WHERE + distributed_table.tenant_id = second_distributed_table.tenant_id + AND foo_inner_1.dept = second_distributed_table.dept + AND + second_distributed_table.dept IN (4,5) + ) foo_inner_2 + ON (foo_inner_2.tenant_id != foo_inner_1.tenant_id) + ) as foo +RETURNING *; +DEBUG: generating subplan XXX_1 for subquery SELECT dept FROM recursive_dml_queries.second_distributed_table +ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns +-- again a correlated subquery +-- this time distribution key eq. exists +-- however recursive planning is prevented due to correlated subqueries +UPDATE + second_distributed_table +SET + dept = foo.tenant_id::int / 4 +FROM +( + SELECT baz.tenant_id FROM + ( + SELECT + second_distributed_table.dept, second_distributed_table.tenant_id + FROM + second_distributed_table, distributed_table as d1 + WHERE + d1.tenant_id = second_distributed_table.tenant_id + AND + second_distributed_table.dept IN (3,4) + AND + second_distributed_table.tenant_id IN + ( + SELECT s2.tenant_id + FROM second_distributed_table as s2 + GROUP BY d1.tenant_id, s2.tenant_id + ) + ) as baz + ) as foo WHERE second_distributed_table.tenant_id = foo.tenant_id +RETURNING *; + tenant_id | dept | info | tenant_id +--------------------------------------------------------------------- + 14 | 3 | {"f1": 14, "f2": 196} | 14 + 23 | 5 | {"f1": 23, "f2": 529} | 23 + 24 | 6 | {"f1": 24, "f2": 576} | 24 + 3 | 0 | {"f1": 3, "f2": 9} | 3 + 33 | 8 | {"f1": 33, "f2": 1089} | 33 + 34 | 8 | {"f1": 34, "f2": 1156} | 34 + 4 | 1 | {"f1": 4, "f2": 16} | 4 + 43 | 10 | {"f1": 43, "f2": 1849} | 43 + 44 | 11 | {"f1": 44, "f2": 1936} | 44 + 53 | 13 | {"f1": 53, "f2": 2809} | 53 + 54 | 13 | {"f1": 54, "f2": 2916} | 54 + 63 | 15 | {"f1": 63, "f2": 3969} | 63 + 64 | 16 | {"f1": 64, "f2": 4096} | 64 + 73 | 18 | {"f1": 73, "f2": 5329} | 73 + 74 | 18 | {"f1": 74, "f2": 5476} | 74 + 83 | 20 | {"f1": 83, "f2": 6889} | 83 + 84 | 21 | {"f1": 84, "f2": 7056} | 84 + 93 | 23 | {"f1": 93, "f2": 8649} | 93 + 94 | 23 | {"f1": 94, "f2": 8836} | 94 +(19 rows) + +-- we don't support subqueries/CTEs inside VALUES +INSERT INTO + second_distributed_table (tenant_id, dept) +VALUES ('3', (WITH vals AS (SELECT 3) select * from vals)); +DEBUG: CTE vals is going to be inlined via distributed planning +DEBUG: generating subplan XXX_1 for CTE vals: SELECT 3 +DEBUG: Plan XXX query after replacing subqueries and CTEs: INSERT INTO recursive_dml_queries.second_distributed_table (tenant_id, dept) VALUES ('3'::text, (SELECT vals."?column?" FROM (SELECT intermediate_result."?column?" FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result("?column?" integer)) vals("?column?"))) +ERROR: subqueries are not supported within INSERT queries +HINT: Try rewriting your queries with 'INSERT INTO ... SELECT' syntax. +INSERT INTO + second_distributed_table (tenant_id, dept) +VALUES ('3', (SELECT 3)); +ERROR: subqueries are not supported within INSERT queries +HINT: Try rewriting your queries with 'INSERT INTO ... SELECT' syntax. +-- DML with an unreferenced SELECT CTE +WITH cte_1 AS ( + WITH cte_2 AS ( + SELECT tenant_id as cte2_id + FROM second_distributed_table + WHERE dept >= 2 + ) + UPDATE distributed_table + SET dept = 10 + RETURNING * +) +UPDATE distributed_table +SET dept = 5 +FROM cte_1 +WHERE distributed_table.tenant_id < cte_1.tenant_id; +DEBUG: generating subplan XXX_1 for CTE cte_1: WITH cte_2 AS (SELECT second_distributed_table.tenant_id AS cte2_id FROM recursive_dml_queries.second_distributed_table WHERE (second_distributed_table.dept OPERATOR(pg_catalog.>=) 2)) UPDATE recursive_dml_queries.distributed_table SET dept = 10 RETURNING tenant_id, dept, info +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.distributed_table SET dept = 5 FROM (SELECT intermediate_result.tenant_id, intermediate_result.dept, intermediate_result.info FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(tenant_id text, dept integer, info jsonb)) cte_1 WHERE (distributed_table.tenant_id OPERATOR(pg_catalog.<) cte_1.tenant_id) +WITH cte_1 AS ( + WITH cte_2 AS ( + SELECT tenant_id as cte2_id + FROM second_distributed_table + WHERE dept >= 2 + ) + UPDATE distributed_table + SET dept = 10 + RETURNING * +) +UPDATE distributed_table +SET dept = 5 +FROM cte_1 +WHERE distributed_table.tenant_id < cte_1.tenant_id; +DEBUG: generating subplan XXX_1 for CTE cte_1: WITH cte_2 AS (SELECT second_distributed_table.tenant_id AS cte2_id FROM recursive_dml_queries.second_distributed_table WHERE (second_distributed_table.dept OPERATOR(pg_catalog.>=) 2)) UPDATE recursive_dml_queries.distributed_table SET dept = 10 RETURNING tenant_id, dept, info +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.distributed_table SET dept = 5 FROM (SELECT intermediate_result.tenant_id, intermediate_result.dept, intermediate_result.info FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(tenant_id text, dept integer, info jsonb)) cte_1 WHERE (distributed_table.tenant_id OPERATOR(pg_catalog.<) cte_1.tenant_id) +-- we support updating local table with a join with +-- distributed tables, though as the local table +-- is target here, distributed table is recursively +-- planned +UPDATE + local_table +SET + id = 'citus_test' +FROM + distributed_table +WHERE + distributed_table.tenant_id = local_table.id; +DEBUG: Wrapping relation "distributed_table" to a subquery +DEBUG: generating subplan XXX_1 for subquery SELECT tenant_id FROM recursive_dml_queries.distributed_table WHERE true +DEBUG: Plan XXX query after replacing subqueries and CTEs: UPDATE recursive_dml_queries.local_table SET id = 'citus_test'::text FROM (SELECT distributed_table_1.tenant_id, NULL::integer AS dept, NULL::jsonb AS info FROM (SELECT intermediate_result.tenant_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(tenant_id text)) distributed_table_1) distributed_table WHERE (distributed_table.tenant_id OPERATOR(pg_catalog.=) local_table.id) +RESET client_min_messages; +DROP SCHEMA recursive_dml_queries CASCADE; +NOTICE: drop cascades to 6 other objects +DETAIL: drop cascades to table distributed_table +drop cascades to table second_distributed_table +drop cascades to table reference_table +drop cascades to table reference_table_2370008 +drop cascades to table local_table +drop cascades to view tenant_ids diff --git a/src/test/regress/sql/dml_recursive.sql b/src/test/regress/sql/dml_recursive.sql index 89e654b6c..0bc99607d 100644 --- a/src/test/regress/sql/dml_recursive.sql +++ b/src/test/regress/sql/dml_recursive.sql @@ -1,3 +1,9 @@ +-- Two alternative test outputs: +-- dml_recursive.out for PG16 and before +-- dml_recursive_0.out for PG17 +-- related commit +-- PostgreSQL 17 includes an enhancement that allows the optimizer to transform correlated IN subqueries into more efficient join operations. +-- https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=9f1337639 CREATE SCHEMA recursive_dml_queries; SET search_path TO recursive_dml_queries, public; SET citus.next_shard_id TO 2370000;