From 98dcbeb3042e902896c796be3b9414a3f98416d7 Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Wed, 31 Aug 2022 10:48:01 +0300 Subject: [PATCH 01/16] Specifies that our CustomScan providers support projections (#6244) Before, this was the default mode for CustomScan providers. Now, the default is to assume that they can't project. This causes performance penalties due to adding unnecessary Result nodes. Hence we use the newly added flag, CUSTOMPATH_SUPPORT_PROJECTION to get it back to how it was. In PG15 support branch we created explain functions to ignore the new Result nodes, so we undo that in this commit. Relevant PG commit: 955b3e0f9269639fb916cee3dea37aee50b82df0 --- src/backend/columnar/columnar_customscan.c | 12 + .../planner/combine_query_planner.c | 6 + .../distributed/planner/distributed_planner.c | 7 + .../expected/columnar_chunk_filtering.out | 8 +- .../expected/columnar_citus_integration.out | 4 +- .../expected/insert_select_repartition.out | 8 +- .../expected/insert_select_repartition_0.out | 8 +- .../expected/multi_select_distinct.out | 68 +- .../expected/multi_select_distinct_0.out | 1548 +++++++++++++++++ .../regress/expected/multi_test_helpers.out | 34 - .../regress/expected/window_functions.out | 4 +- .../regress/expected/window_functions_0.out | 4 +- .../regress/sql/columnar_chunk_filtering.sql | 4 - .../sql/columnar_citus_integration.sql | 2 - .../regress/sql/insert_select_repartition.sql | 4 - .../regress/sql/multi_select_distinct.sql | 16 +- src/test/regress/sql/multi_test_helpers.sql | 36 - src/test/regress/sql/window_functions.sql | 2 - 18 files changed, 1616 insertions(+), 159 deletions(-) create mode 100644 src/test/regress/expected/multi_select_distinct_0.out diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c index 98c13e2a7..74c50e4f6 100644 --- a/src/backend/columnar/columnar_customscan.c +++ b/src/backend/columnar/columnar_customscan.c @@ -1303,6 +1303,12 @@ AddColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte, cpath->methods = &ColumnarScanPathMethods; +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* necessary to avoid extra Result node in PG15 */ + cpath->flags = CUSTOMPATH_SUPPORT_PROJECTION; +#endif + /* * populate generic path information */ @@ -1545,6 +1551,12 @@ ColumnarScanPath_PlanCustomPath(PlannerInfo *root, cscan->scan.plan.targetlist = list_copy(tlist); cscan->scan.scanrelid = best_path->path.parent->relid; +#if (PG_VERSION_NUM >= 150000) + + /* necessary to avoid extra Result node in PG15 */ + cscan->flags = CUSTOMPATH_SUPPORT_PROJECTION; +#endif + return (Plan *) cscan; } diff --git a/src/backend/distributed/planner/combine_query_planner.c b/src/backend/distributed/planner/combine_query_planner.c index f67f71b53..0a871f3e6 100644 --- a/src/backend/distributed/planner/combine_query_planner.c +++ b/src/backend/distributed/planner/combine_query_planner.c @@ -136,6 +136,12 @@ CreateCitusCustomScanPath(PlannerInfo *root, RelOptInfo *relOptInfo, path->custom_path.path.pathtarget = relOptInfo->reltarget; path->custom_path.path.parent = relOptInfo; +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* necessary to avoid extra Result node in PG15 */ + path->custom_path.flags = CUSTOMPATH_SUPPORT_PROJECTION; +#endif + /* * The 100k rows we put on the cost of the path is kind of arbitrary and could be * improved in accuracy to produce better plans. 
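The C hunks above and the distributed_planner.c hunk that follows all apply the same pattern described in the commit message: on PG15 and later, a CustomScan provider must declare CUSTOMPATH_SUPPORT_PROJECTION, otherwise the planner wraps the scan in an extra Result node just to evaluate the target list. A minimal sketch of that pattern is shown below. It is illustrative only and not part of the patch: the helper name is hypothetical, the CustomPath/CustomScan structs and CUSTOMPATH_* flags come from PostgreSQL's custom scan provider API (nodes/extensible.h), and a raw 150000 version check is used instead of Citus' PG_VERSION_15 macro so the sketch stays self-contained.

#include "postgres.h"
#include "nodes/extensible.h"	/* CustomPath, CustomScan, CUSTOMPATH_* flags */

/*
 * Hypothetical helper, not part of this patch: mark a custom path and/or its
 * generated plan node as projection-capable on PG15+, preserving any flags
 * (e.g. CUSTOMPATH_SUPPORT_BACKWARD_SCAN) that were already set.
 */
static void
MarkCustomScanProjectionCapable(CustomPath *cpath, CustomScan *cscan)
{
#if PG_VERSION_NUM >= 150000
	if (cpath != NULL)
	{
		cpath->flags |= CUSTOMPATH_SUPPORT_PROJECTION;
	}

	if (cscan != NULL)
	{
		cscan->flags |= CUSTOMPATH_SUPPORT_PROJECTION;
	}
#else
	/* before PG15 the planner assumed custom scans could project; nothing to do */
	(void) cpath;
	(void) cscan;
#endif
}

The patch itself assigns the flag value directly at each call site (FinalizePlan() combines it with CUSTOMPATH_SUPPORT_BACKWARD_SCAN), which appears equivalent here because those paths set no other flags; OR-ing, as in the sketch, is simply the more defensive variant.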
diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index 35664d7c7..17816a3b4 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -1369,7 +1369,14 @@ FinalizePlan(PlannedStmt *localPlan, DistributedPlan *distributedPlan) Node *distributedPlanData = (Node *) distributedPlan; customScan->custom_private = list_make1(distributedPlanData); + +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* necessary to avoid extra Result node in PG15 */ + customScan->flags = CUSTOMPATH_SUPPORT_BACKWARD_SCAN | CUSTOMPATH_SUPPORT_PROJECTION; +#else customScan->flags = CUSTOMPATH_SUPPORT_BACKWARD_SCAN; +#endif /* * Fast path queries cannot have any subplans by definition, so skip diff --git a/src/test/regress/expected/columnar_chunk_filtering.out b/src/test/regress/expected/columnar_chunk_filtering.out index 292d5fb1a..0d0534ccc 100644 --- a/src/test/regress/expected/columnar_chunk_filtering.out +++ b/src/test/regress/expected/columnar_chunk_filtering.out @@ -264,21 +264,17 @@ EXPLAIN (analyze on, costs off, timing off, summary off) Columnar Projected Columns: a (9 rows) -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT y, * FROM another_columnar_table; -$Q$); - plan_without_arrows + QUERY PLAN --------------------------------------------------------------------- Custom Scan (ColumnarScan) on another_columnar_table Columnar Projected Columns: x, y (2 rows) -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT *, x FROM another_columnar_table; -$Q$); - plan_without_arrows + QUERY PLAN --------------------------------------------------------------------- Custom Scan (ColumnarScan) on another_columnar_table Columnar Projected Columns: x, y diff --git a/src/test/regress/expected/columnar_citus_integration.out b/src/test/regress/expected/columnar_citus_integration.out index 8beb09edf..fb7d9201e 100644 --- a/src/test/regress/expected/columnar_citus_integration.out +++ b/src/test/regress/expected/columnar_citus_integration.out @@ -958,15 +958,13 @@ SELECT * FROM weird_col_explain; (7 rows) \set VERBOSITY terse -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS OFF, SUMMARY OFF) SELECT *, "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" FROM weird_col_explain WHERE "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" * 2 > "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'!"; -$Q$); NOTICE: identifier "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'!" will be truncated to "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'" - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus Adaptive) Task Count: 4 diff --git a/src/test/regress/expected/insert_select_repartition.out b/src/test/regress/expected/insert_select_repartition.out index 913419072..5d9ddca6a 100644 --- a/src/test/regress/expected/insert_select_repartition.out +++ b/src/test/regress/expected/insert_select_repartition.out @@ -1261,10 +1261,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_sequences select y, x from table_with_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... 
SELECT) INSERT/SELECT method: pull to coordinator @@ -1289,10 +1287,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_user_sequences select y, x from table_with_user_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... SELECT) INSERT/SELECT method: pull to coordinator diff --git a/src/test/regress/expected/insert_select_repartition_0.out b/src/test/regress/expected/insert_select_repartition_0.out index 31377ef16..2eea30bdf 100644 --- a/src/test/regress/expected/insert_select_repartition_0.out +++ b/src/test/regress/expected/insert_select_repartition_0.out @@ -1261,10 +1261,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_sequences select y, x from table_with_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... SELECT) INSERT/SELECT method: pull to coordinator @@ -1289,10 +1287,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_user_sequences select y, x from table_with_user_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... SELECT) INSERT/SELECT method: pull to coordinator diff --git a/src/test/regress/expected/multi_select_distinct.out b/src/test/regress/expected/multi_select_distinct.out index 1112124ae..d281ad4b4 100644 --- a/src/test/regress/expected/multi_select_distinct.out +++ b/src/test/regress/expected/multi_select_distinct.out @@ -3,6 +3,13 @@ -- -- Tests select distinct, and select distinct on features. -- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 15 AS server_version_ge_15; + server_version_ge_15 +--------------------------------------------------------------------- + t +(1 row) + ANALYZE lineitem_hash_part; -- function calls are supported SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; @@ -306,14 +313,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique -- instead of aggregate plan node to handle distinct. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Unique -> Sort @@ -382,15 +387,13 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. Similar to the explain of -- the query above. 
SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -440,12 +443,13 @@ EXPLAIN (COSTS FALSE) GROUP BY l_suppkey, l_linenumber ORDER BY 1,2 LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Unique - -> Sort - Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.l_suppkey, (pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)) -> HashAggregate Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 -> Custom Scan (Citus Adaptive) @@ -456,20 +460,18 @@ EXPLAIN (COSTS FALSE) -> HashAggregate Group Key: l_suppkey, l_linenumber -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) +(15 rows) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1,2 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -539,15 +541,13 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique to -- handle distinct on. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ON (l_suppkey) avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY l_suppkey,1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -595,12 +595,13 @@ EXPLAIN (COSTS FALSE) GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Unique - -> Sort - Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> HashAggregate + Group Key: (sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision) -> HashAggregate Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 -> Custom Scan (Citus Adaptive) @@ -611,20 +612,18 @@ EXPLAIN (COSTS FALSE) -> HashAggregate Group Key: l_suppkey, l_linenumber -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) +(15 rows) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. 
SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT avg(ceil(l_partkey / 2)) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -672,12 +671,13 @@ EXPLAIN (COSTS FALSE) GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Unique - -> Sort - Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> HashAggregate + Group Key: ((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint)) -> HashAggregate Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 -> Custom Scan (Citus Adaptive) @@ -688,20 +688,18 @@ EXPLAIN (COSTS FALSE) -> HashAggregate Group Key: l_suppkey, l_linenumber -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) +(15 rows) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -910,14 +908,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count FROM lineitem_hash_part GROUP BY l_suppkey ORDER BY 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Unique -> Sort diff --git a/src/test/regress/expected/multi_select_distinct_0.out b/src/test/regress/expected/multi_select_distinct_0.out new file mode 100644 index 000000000..69e90b7a0 --- /dev/null +++ b/src/test/regress/expected/multi_select_distinct_0.out @@ -0,0 +1,1548 @@ +-- +-- MULTI_SELECT_DISTINCT +-- +-- Tests select distinct, and select distinct on features. 
+-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 15 AS server_version_ge_15; + server_version_ge_15 +--------------------------------------------------------------------- + f +(1 row) + +ANALYZE lineitem_hash_part; +-- function calls are supported +SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; + l_orderkey | now +--------------------------------------------------------------------- +(0 rows) + +SELECT DISTINCT l_orderkey, avg(l_linenumber) +FROM lineitem_hash_part +GROUP BY l_orderkey +HAVING avg(l_linenumber) = (select avg(distinct l_linenumber)) +LIMIT 10; +ERROR: Subqueries in HAVING cannot refer to outer query +SELECT DISTINCT l_orderkey +FROM lineitem_hash_part +GROUP BY l_orderkey +HAVING (select avg(distinct l_linenumber) = l_orderkey) +LIMIT 10; +ERROR: Subqueries in HAVING cannot refer to outer query +SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3; + l_partkey | ?column? +--------------------------------------------------------------------- + 199973 | 1 + 199946 | 1 + 199943 | 1 +(3 rows) + +-- const expressions are supported +SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +-- non const expressions are also supported +SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 1 | 2133 + 1 | 15636 + 1 | 24028 + 1 | 63701 + 1 | 67311 +(5 rows) + +-- column expressions are supported +SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5; + l_orderkey | ?column? 
+--------------------------------------------------------------------- + 32 | COLLECT CODAIR + 39 | COLLECT CODAIR + 66 | COLLECT CODAIR + 70 | COLLECT CODAIR + 98 | COLLECT CODAIR +(5 rows) + +-- function calls with const input are supported +SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5; + l_orderkey | strpos +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- function calls with non-const input are supported +SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I') + FROM lineitem_hash_part + WHERE strpos(l_shipmode, 'I') > 1 + ORDER BY 2, 1 + LIMIT 5; + l_orderkey | strpos +--------------------------------------------------------------------- + 1 | 2 + 3 | 2 + 5 | 2 + 32 | 2 + 33 | 2 +(5 rows) + +-- row types are supported +SELECT DISTINCT (l_orderkey, l_partkey) AS pair FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + pair +--------------------------------------------------------------------- + (1,2132) + (1,15635) + (1,24027) + (1,63700) + (1,67310) +(5 rows) + +-- distinct on partition column +-- verify counts match with respect to count(distinct) +CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part; +SELECT COUNT(*) FROM temp_orderkeys; + count +--------------------------------------------------------------------- + 2985 +(1 row) + +SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part; + count +--------------------------------------------------------------------- + 2985 +(1 row) + +SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1; + l_orderkey +--------------------------------------------------------------------- + 1 + 3 + 32 + 35 + 39 + 65 + 129 + 130 + 134 + 164 + 194 + 228 + 261 + 290 + 320 + 321 + 354 + 418 +(18 rows) + +-- distinct on non-partition column +SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1; + l_partkey +--------------------------------------------------------------------- + 79251 + 94780 + 139636 + 145243 + 151894 + 157238 + 163073 + 182052 +(8 rows) + +SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC; + l_shipmode +--------------------------------------------------------------------- + TRUCK + SHIP + REG AIR + RAIL + MAIL + FOB + AIR +(7 rows) + +-- distinct with multiple columns +SELECT DISTINCT l_orderkey, o_orderdate + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE l_orderkey < 10 + ORDER BY l_orderkey; + l_orderkey | o_orderdate +--------------------------------------------------------------------- + 1 | 01-02-1996 + 2 | 12-01-1996 + 3 | 10-14-1993 + 4 | 10-11-1995 + 5 | 07-30-1994 + 6 | 02-21-1992 + 7 | 01-10-1996 +(7 rows) + +-- distinct on partition column with aggregate +-- this is the same as the one without distinct due to group by +SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + l_orderkey | count +--------------------------------------------------------------------- + 7 | 7 + 68 | 7 + 129 | 7 + 164 | 7 + 194 | 7 + 1 | 6 + 3 | 6 + 32 | 6 + 35 | 6 + 39 | 6 + 67 | 6 + 69 | 6 + 70 | 6 + 71 | 6 + 134 | 6 + 135 | 6 + 163 | 6 + 192 | 6 + 197 | 6 +(19 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + 
ORDER BY 2 DESC, 1; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count DESC, remote_scan.l_orderkey + -> HashAggregate + Group Key: remote_scan.count, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 200) +(14 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.count DESC, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 200) +(13 rows) + +SET enable_hashagg TO on; +-- distinct on aggregate of group by columns, we try to check whether we handle +-- queries which does not have any group by column in distinct columns properly. +SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + count +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 +(4 rows) + +-- explain the query to see actual plan. We expect to see Aggregate node having +-- group by key on count(*) column, since columns in the Group By doesn't guarantee +-- the uniqueness of the result. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique +-- instead of aggregate plan node to handle distinct. 
+SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> GroupAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +SET enable_hashagg TO on; +-- Now we have only part of group clause columns in distinct, yet it is still not +-- enough to use Group By columns to guarantee uniqueness of result list. +SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + l_suppkey | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 7 | 1 + 10 | 1 + 12 | 1 + 13 | 1 + 14 | 1 +(10 rows) + +-- explain the query to see actual plan. Similar to the explain of the query above. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. Similar to the explain of +-- the query above. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- Similar to the above query, not with count but avg. Only difference with the +-- above query is that, we create run two aggregate functions in workers. 
+SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + l_suppkey | avg +--------------------------------------------------------------------- + 1 | 190000.000000000000 + 2 | 172450.000000000000 + 3 | 112469.000000000000 + 3 | 134976.000000000000 + 4 | 112470.000000000000 + 4 | 142461.000000000000 + 5 | 182450.000000000000 + 7 | 137493.000000000000 + 10 | 150009.000000000000 + 12 | 17510.0000000000000000 +(10 rows) + +-- explain the query to see actual plan. Similar to the explain of the query above. +-- Only aggregate functions will be changed. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- Similar to the above query but with distinct on +SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + avg +--------------------------------------------------------------------- + 190000.000000000000 + 172450.000000000000 + 112469.000000000000 + 112470.000000000000 + 182450.000000000000 + 137493.000000000000 + 150009.000000000000 + 17510.0000000000000000 + 87504.000000000000 + 77506.000000000000 +(10 rows) + +-- explain the query to see actual plan. We expect to see sort+unique to handle +-- distinct on. 
+EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_3, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to +-- handle distinct on. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_3, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct with expression and aggregation +SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + avg +--------------------------------------------------------------------- + 9 + 39 + 74 + 87 + 89 + 91 + 97 + 102 + 111 + 122 +(10 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. 
+SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- expression among aggregations. +SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + dis +--------------------------------------------------------------------- + 2 + 3 + 4 + 5 + 6 + 8 + 11 + 13 + 14 + 15 +(10 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct on all columns, note Group By columns guarantees uniqueness of the +-- result list. 
+SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey | l_suppkey | l_linenumber | l_quantity | l_extendedprice | l_discount | l_tax | l_returnflag | l_linestatus | l_shipdate | l_commitdate | l_receiptdate | l_shipinstruct | l_shipmode | l_comment +--------------------------------------------------------------------- + 1 | 2132 | 4633 | 4 | 28.00 | 28955.64 | 0.09 | 0.06 | N | O | 04-21-1996 | 03-30-1996 | 05-16-1996 | NONE | AIR | lites. fluffily even de + 1 | 15635 | 638 | 6 | 32.00 | 49620.16 | 0.07 | 0.02 | N | O | 01-30-1996 | 02-07-1996 | 02-03-1996 | DELIVER IN PERSON | MAIL | arefully slyly ex + 1 | 24027 | 1534 | 5 | 24.00 | 22824.48 | 0.10 | 0.04 | N | O | 03-30-1996 | 03-14-1996 | 04-01-1996 | NONE | FOB | pending foxes. slyly re + 1 | 63700 | 3701 | 3 | 8.00 | 13309.60 | 0.10 | 0.02 | N | O | 01-29-1996 | 03-05-1996 | 01-31-1996 | TAKE BACK RETURN | REG AIR | riously. regular, express dep + 1 | 67310 | 7311 | 2 | 36.00 | 45983.16 | 0.09 | 0.06 | N | O | 04-12-1996 | 02-28-1996 | 04-20-1996 | TAKE BACK RETURN | MAIL | ly final dependencies: slyly bold + 1 | 155190 | 7706 | 1 | 17.00 | 21168.23 | 0.04 | 0.02 | N | O | 03-13-1996 | 02-12-1996 | 03-22-1996 | DELIVER IN PERSON | TRUCK | egular courts above the + 2 | 106170 | 1191 | 1 | 38.00 | 44694.46 | 0.00 | 0.05 | N | O | 01-28-1997 | 01-14-1997 | 02-02-1997 | TAKE BACK RETURN | RAIL | ven requests. deposits breach a + 3 | 4297 | 1798 | 1 | 45.00 | 54058.05 | 0.06 | 0.00 | R | F | 02-02-1994 | 01-04-1994 | 02-23-1994 | NONE | AIR | ongside of the furiously brave acco + 3 | 19036 | 6540 | 2 | 49.00 | 46796.47 | 0.10 | 0.00 | R | F | 11-09-1993 | 12-20-1993 | 11-24-1993 | TAKE BACK RETURN | RAIL | unusual accounts. eve + 3 | 29380 | 1883 | 4 | 2.00 | 2618.76 | 0.01 | 0.06 | A | F | 12-04-1993 | 01-07-1994 | 01-01-1994 | NONE | TRUCK | y. fluffily pending d +(10 rows) + +-- explain the query to see actual plan. We expect to see only one aggregation +-- node since group by columns guarantees the uniqueness. +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(7 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see only one +-- aggregation node since group by columns guarantees the uniqueness. 
+SET enable_hashagg TO off; +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(6 rows) + +SET enable_hashagg TO on; +-- distinct on count distinct +SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + count | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 2 | 2 + 3 | 1 + 3 | 2 + 3 | 3 + 4 | 1 + 4 | 2 + 4 | 3 + 4 | 4 + 5 | 2 + 5 | 3 + 5 | 4 + 5 | 5 + 6 | 2 + 6 | 3 + 6 | 4 + 6 | 5 + 6 | 6 + 7 | 2 + 7 | 3 + 7 | 4 + 7 | 5 + 7 | 6 + 7 | 7 +(25 rows) + +-- explain the query to see actual plan. We expect to see aggregation plan for +-- the outer distinct. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count, remote_scan.count_1 + -> HashAggregate + Group Key: remote_scan.count, remote_scan.count_1 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort + unique +-- plans for the outer distinct. 
+SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.count, remote_scan.count_1 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +SET enable_hashagg TO on; +-- distinct on aggregation with filter and expression +SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + count +--------------------------------------------------------------------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> HashAggregate + Group Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> GroupAggregate + Group Key: remote_scan.worker_column_2 + -> Sort + Sort Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +SET enable_hashagg TO on; +-- explain the query to see actual plan with array_agg aggregation. +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.array_length + -> HashAggregate + Group Key: remote_scan.array_length, remote_scan.array_agg + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(7 rows) + +-- check the plan if the hash aggreate is disabled. 
+SET enable_hashagg TO off; +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.array_length, remote_scan.array_agg + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(6 rows) + +SET enable_hashagg TO on; +-- distinct on non-partition column with aggregate +-- this is the same as non-distinct version due to group by +SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + l_partkey | count +--------------------------------------------------------------------- + 1051 | 3 + 1927 | 3 + 6983 | 3 + 15283 | 3 + 87761 | 3 + 136884 | 3 + 149926 | 3 + 160895 | 3 + 177771 | 3 + 188804 | 3 + 199146 | 3 +(11 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_partkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.l_partkey + Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 2) + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_partkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- distinct on non-partition column and avg +SELECT DISTINCT l_partkey, avg(l_linenumber) + FROM lineitem_hash_part + WHERE l_partkey < 500 + GROUP BY 1 + HAVING avg(l_linenumber) > 2 + ORDER BY 1; + l_partkey | avg +--------------------------------------------------------------------- + 18 | 7.0000000000000000 + 79 | 6.0000000000000000 + 149 | 4.5000000000000000 + 175 | 5.0000000000000000 + 179 | 6.0000000000000000 + 182 | 3.0000000000000000 + 222 | 4.0000000000000000 + 278 | 3.0000000000000000 + 299 | 7.0000000000000000 + 308 | 7.0000000000000000 + 309 | 5.0000000000000000 + 321 | 3.0000000000000000 + 337 | 6.0000000000000000 + 364 | 3.0000000000000000 + 403 | 4.0000000000000000 +(15 rows) + +-- distinct on multiple non-partition columns +SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + l_partkey | l_suppkey +--------------------------------------------------------------------- + 2132 | 4633 + 4297 | 1798 + 37531 | 35 + 44161 | 6666 + 44706 | 4707 + 67831 | 5350 + 85811 | 8320 + 94368 | 6878 + 108338 | 849 + 108570 | 8571 + 137267 | 4807 + 137469 | 9983 + 173489 | 3490 + 196156 | 1195 + 197921 | 441 +(15 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> HashAggregate + Group Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Unique + -> Sort + Sort Key: l_partkey, l_suppkey + -> Seq 
Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: ((l_orderkey < 100) AND (l_shipmode = 'AIR'::bpchar)) +(14 rows) + +-- distinct on partition column +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1, 2, 3; + l_orderkey | l_partkey | l_suppkey +--------------------------------------------------------------------- + 1 | 2132 | 4633 + 2 | 106170 | 1191 + 3 | 4297 | 1798 + 4 | 88035 | 5560 + 5 | 37531 | 35 + 6 | 139636 | 2150 + 7 | 79251 | 1759 + 32 | 2743 | 7744 + 33 | 33918 | 3919 + 34 | 88362 | 871 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Unique + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 35) +(13 rows) + +-- distinct on non-partition column +-- note order by is required here +-- otherwise query results will be different since +-- distinct on clause is on non-partition column +SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + l_partkey | l_orderkey +--------------------------------------------------------------------- + 18 | 12005 + 79 | 5121 + 91 | 2883 + 149 | 807 + 175 | 4102 + 179 | 2117 + 182 | 548 + 195 | 2528 + 204 | 10048 + 222 | 9413 + 245 | 9446 + 278 | 1287 + 299 | 1122 + 308 | 11137 + 309 | 2374 + 318 | 321 + 321 | 5984 + 337 | 10403 + 350 | 13698 + 358 | 4323 +(20 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: l_partkey, l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- distinct on with joins +-- each customer's first order key +SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + o_custkey | l_orderkey +--------------------------------------------------------------------- + 1 | 9154 + 2 | 10563 + 4 | 320 + 5 | 11682 + 7 | 10402 + 8 | 102 + 10 | 1602 + 11 | 12800 + 13 | 994 + 14 | 11011 +(10 rows) + +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- explain without order by +-- notice master plan has order by on distinct on column +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT 
DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- each customer's each order's first l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20 + ORDER BY 1,2,3; + o_custkey | l_orderkey | l_linenumber | l_partkey +--------------------------------------------------------------------- + 1 | 9154 | 1 | 86513 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 1 | 147459 + 4 | 320 | 1 | 4415 + 4 | 739 | 1 | 84489 + 4 | 10688 | 1 | 45037 + 4 | 10788 | 1 | 50814 + 4 | 13728 | 1 | 86216 + 5 | 11682 | 1 | 31634 + 5 | 11746 | 1 | 180724 + 5 | 14308 | 1 | 157430 + 7 | 10402 | 1 | 53661 + 7 | 13031 | 1 | 112161 + 7 | 14145 | 1 | 138729 + 7 | 14404 | 1 | 143034 + 8 | 102 | 1 | 88914 + 8 | 164 | 1 | 91309 + 8 | 13601 | 1 | 40504 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 1 | 86241 + 10 | 11431 | 1 | 62112 + 10 | 13124 | 1 | 29414 + 11 | 12800 | 1 | 152806 + 13 | 994 | 1 | 64486 + 13 | 1603 | 1 | 38191 + 13 | 4704 | 1 | 77934 + 13 | 9927 | 1 | 875 + 14 | 11011 | 1 | 172485 + 17 | 896 | 1 | 38675 + 17 | 5507 | 1 | 9600 + 19 | 353 | 1 | 119305 + 19 | 1504 | 1 | 81389 + 19 | 1669 | 1 | 78373 + 19 | 5893 | 1 | 133707 + 19 | 9954 | 1 | 92138 + 19 | 14885 | 1 | 36154 +(36 rows) + +-- explain without order by +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- each customer's each order's last l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2,3 DESC; + o_custkey | l_orderkey | l_linenumber | l_partkey +--------------------------------------------------------------------- + 1 | 9154 | 7 | 173448 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 4 | 110741 + 4 | 320 | 2 | 192158 + 4 | 739 | 5 | 187523 + 4 | 10688 | 2 | 132574 + 4 | 10788 | 4 | 196473 + 4 | 13728 | 3 | 12450 + 5 | 11682 | 3 | 177152 + 5 | 11746 | 7 | 193807 + 5 | 14308 | 3 | 140916 + 7 | 10402 | 2 | 64514 + 7 | 13031 | 6 | 7761 + 7 | 14145 | 6 | 130723 + 7 | 14404 | 7 | 35349 + 8 | 102 | 4 | 61158 + 8 | 164 | 7 | 3037 + 8 | 13601 | 5 | 12470 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 5 | 135675 + 10 | 11431 | 7 | 8563 + 10 | 13124 | 3 | 67055 + 11 | 12800 | 5 | 179110 + 13 | 994 | 4 | 130471 + 13 | 1603 | 2 | 65209 + 13 | 4704 | 3 | 63081 + 13 | 9927 | 6 | 119356 + 14 | 11011 | 7 | 95939 +(28 rows) + +-- subqueries +SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 1 | 2132 + 1 | 15635 + 1 | 24027 + 1 | 63700 + 1 | 67310 + 1 | 155190 + 2 | 106170 + 3 | 4297 + 3 | 
19036 + 3 | 29380 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: l_orderkey, l_partkey + -> HashAggregate + Group Key: l_orderkey, l_partkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | cnt +--------------------------------------------------------------------- + 1 | 6 + 2 | 1 + 3 | 6 + 4 | 1 + 5 | 3 + 6 | 1 + 7 | 7 + 32 | 6 + 33 | 4 + 34 | 3 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.cnt + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.cnt + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: lineitem_hash_part.l_orderkey, (count(*)) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey, count(*) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(18 rows) + +-- distinct on partition column +-- random() is added to inner query to prevent flattening +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 1 | 2132 + 2 | 106170 + 3 | 4297 + 4 | 88035 + 5 | 37531 + 6 | 139636 + 7 | 79251 + 32 | 2743 + 33 | 33918 + 34 | 88362 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_orderkey, q.l_partkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +-- distinct on non-partition column +SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 12005 | 18 + 5121 | 79 + 2883 | 91 + 807 | 149 + 4102 | 175 + 2117 | 179 + 
548 | 182 + 2528 | 195 + 10048 | 204 + 9413 | 222 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_partkey, q.l_orderkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + diff --git a/src/test/regress/expected/multi_test_helpers.out b/src/test/regress/expected/multi_test_helpers.out index b02cd6cd4..640a8d9ee 100644 --- a/src/test/regress/expected/multi_test_helpers.out +++ b/src/test/regress/expected/multi_test_helpers.out @@ -17,15 +17,10 @@ BEGIN END; $$LANGUAGE plpgsql; -- Create a function to ignore worker plans in explain output --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') - THEN - CONTINUE; - END IF; RETURN next; IF query_plan LIKE '%Task Count:%' THEN @@ -36,16 +31,12 @@ BEGIN END; $$ language plpgsql; -- Create a function to ignore worker plans in explain output -- It also shows task count for plan and subplans --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan_with_subplans(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ DECLARE task_count_line_reached boolean := false; BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; IF NOT task_count_line_reached THEN RETURN next; END IF; @@ -59,19 +50,6 @@ BEGIN END LOOP; RETURN; END; $$ language plpgsql; --- Create a function to ignore "-> Result" lines for PG15 support --- In PG15 there are some extra "-> Result" lines -CREATE OR REPLACE FUNCTION plan_without_result_lines(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - RETURN next; - END LOOP; - RETURN; -END; $$ language plpgsql; -- Create a function to normalize Memory Usage, Buckets, Batches CREATE OR REPLACE FUNCTION plan_normalize_memory(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ @@ -81,18 +59,6 @@ BEGIN RETURN NEXT; END LOOP; END; $$ language plpgsql; --- Create a function to remove arrows from the explain plan -CREATE OR REPLACE FUNCTION plan_without_arrows(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - query_plan := regexp_replace(query_plan, '( )*-> (.*)', '\2', 'g'); - RETURN NEXT; - END LOOP; -END; $$ language plpgsql; -- helper function that returns true if output of given explain has "is not null" (case in-sensitive) CREATE OR REPLACE FUNCTION explain_has_is_not_null(explain_command text) RETURNS BOOLEAN AS $$ diff --git 
a/src/test/regress/expected/window_functions.out b/src/test/regress/expected/window_functions.out index 2e88f5b51..6f30a49e3 100644 --- a/src/test/regress/expected/window_functions.out +++ b/src/test/regress/expected/window_functions.out @@ -1491,12 +1491,10 @@ LIMIT 5; (17 rows) -- Grouping can be pushed down with aggregates even when window function can't -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT user_id, count(value_1), stddev(value_1), count(user_id) OVER (PARTITION BY random()) FROM users_table GROUP BY user_id HAVING avg(value_1) > 2 LIMIT 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> WindowAgg diff --git a/src/test/regress/expected/window_functions_0.out b/src/test/regress/expected/window_functions_0.out index c9442c7b5..c5a132301 100644 --- a/src/test/regress/expected/window_functions_0.out +++ b/src/test/regress/expected/window_functions_0.out @@ -1495,12 +1495,10 @@ LIMIT 5; (18 rows) -- Grouping can be pushed down with aggregates even when window function can't -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT user_id, count(value_1), stddev(value_1), count(user_id) OVER (PARTITION BY random()) FROM users_table GROUP BY user_id HAVING avg(value_1) > 2 LIMIT 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> WindowAgg diff --git a/src/test/regress/sql/columnar_chunk_filtering.sql b/src/test/regress/sql/columnar_chunk_filtering.sql index 335401a20..b8b2b411d 100644 --- a/src/test/regress/sql/columnar_chunk_filtering.sql +++ b/src/test/regress/sql/columnar_chunk_filtering.sql @@ -130,15 +130,11 @@ INSERT INTO another_columnar_table SELECT generate_series(0,5); EXPLAIN (analyze on, costs off, timing off, summary off) SELECT a, y FROM multi_column_chunk_filtering, another_columnar_table WHERE x > 1; -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT y, * FROM another_columnar_table; -$Q$); -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT *, x FROM another_columnar_table; -$Q$); EXPLAIN (costs off, timing off, summary off) SELECT y, another_columnar_table FROM another_columnar_table; diff --git a/src/test/regress/sql/columnar_citus_integration.sql b/src/test/regress/sql/columnar_citus_integration.sql index 566c3a9f6..514508795 100644 --- a/src/test/regress/sql/columnar_citus_integration.sql +++ b/src/test/regress/sql/columnar_citus_integration.sql @@ -429,13 +429,11 @@ EXPLAIN (COSTS OFF, SUMMARY OFF) SELECT * FROM weird_col_explain; \set VERBOSITY terse -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS OFF, SUMMARY OFF) SELECT *, "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" FROM weird_col_explain WHERE "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" * 2 > "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'!"; -$Q$); \set VERBOSITY default -- should not project any columns diff --git a/src/test/regress/sql/insert_select_repartition.sql b/src/test/regress/sql/insert_select_repartition.sql index ee6065b88..94a16fed0 100644 --- a/src/test/regress/sql/insert_select_repartition.sql +++ b/src/test/regress/sql/insert_select_repartition.sql @@ -635,9 +635,7 @@ DO UPDATE SET create table table_with_sequences (x int, y int, z bigserial); insert into table_with_sequences values (1,1); select create_distributed_table('table_with_sequences','x'); -SELECT 
public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_sequences select y, x from table_with_sequences; -$Q$); -- verify that we don't report repartitioned insert/select for tables -- with user-defined sequences. @@ -645,9 +643,7 @@ CREATE SEQUENCE user_defined_sequence; create table table_with_user_sequences (x int, y int, z bigint default nextval('user_defined_sequence')); insert into table_with_user_sequences values (1,1); select create_distributed_table('table_with_user_sequences','x'); -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_user_sequences select y, x from table_with_user_sequences; -$Q$); -- clean-up SET client_min_messages TO WARNING; diff --git a/src/test/regress/sql/multi_select_distinct.sql b/src/test/regress/sql/multi_select_distinct.sql index 75dd99da0..a2ee189b0 100644 --- a/src/test/regress/sql/multi_select_distinct.sql +++ b/src/test/regress/sql/multi_select_distinct.sql @@ -3,6 +3,8 @@ -- -- Tests select distinct, and select distinct on features. -- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 15 AS server_version_ge_15; ANALYZE lineitem_hash_part; @@ -113,13 +115,11 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique -- instead of aggregate plan node to handle distinct. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1; -$Q$); SET enable_hashagg TO on; @@ -142,14 +142,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. Similar to the explain of -- the query above. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -173,14 +171,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1,2 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -203,14 +199,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique to -- handle distinct on. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ON (l_suppkey) avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY l_suppkey,1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -232,14 +226,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT avg(ceil(l_partkey / 2)) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -261,14 +253,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. 
SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -345,13 +335,11 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count FROM lineitem_hash_part GROUP BY l_suppkey ORDER BY 1; -$Q$); SET enable_hashagg TO on; diff --git a/src/test/regress/sql/multi_test_helpers.sql b/src/test/regress/sql/multi_test_helpers.sql index b5d4b9cd9..51cb2b129 100644 --- a/src/test/regress/sql/multi_test_helpers.sql +++ b/src/test/regress/sql/multi_test_helpers.sql @@ -20,15 +20,10 @@ END; $$LANGUAGE plpgsql; -- Create a function to ignore worker plans in explain output --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') - THEN - CONTINUE; - END IF; RETURN next; IF query_plan LIKE '%Task Count:%' THEN @@ -40,16 +35,12 @@ END; $$ language plpgsql; -- Create a function to ignore worker plans in explain output -- It also shows task count for plan and subplans --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan_with_subplans(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ DECLARE task_count_line_reached boolean := false; BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; IF NOT task_count_line_reached THEN RETURN next; END IF; @@ -64,20 +55,6 @@ BEGIN RETURN; END; $$ language plpgsql; --- Create a function to ignore "-> Result" lines for PG15 support --- In PG15 there are some extra "-> Result" lines -CREATE OR REPLACE FUNCTION plan_without_result_lines(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - RETURN next; - END LOOP; - RETURN; -END; $$ language plpgsql; - -- Create a function to normalize Memory Usage, Buckets, Batches CREATE OR REPLACE FUNCTION plan_normalize_memory(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ @@ -88,19 +65,6 @@ BEGIN END LOOP; END; $$ language plpgsql; --- Create a function to remove arrows from the explain plan -CREATE OR REPLACE FUNCTION plan_without_arrows(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - query_plan := regexp_replace(query_plan, '( )*-> (.*)', '\2', 'g'); - RETURN NEXT; - END LOOP; -END; $$ language plpgsql; - -- helper function that returns true if output of given explain has "is not null" (case in-sensitive) CREATE OR REPLACE FUNCTION explain_has_is_not_null(explain_command text) RETURNS BOOLEAN AS $$ diff --git a/src/test/regress/sql/window_functions.sql b/src/test/regress/sql/window_functions.sql index 77f353efb..de936c95c 100644 --- a/src/test/regress/sql/window_functions.sql +++ 
b/src/test/regress/sql/window_functions.sql
@@ -576,11 +576,9 @@ ORDER BY user_id, avg(value_1) DESC
 LIMIT 5;
 
 -- Grouping can be pushed down with aggregates even when window function can't
-SELECT public.plan_without_result_lines($Q$
 EXPLAIN (COSTS FALSE)
 SELECT user_id, count(value_1), stddev(value_1), count(user_id) OVER (PARTITION BY random())
 FROM users_table GROUP BY user_id HAVING avg(value_1) > 2 LIMIT 1;
-$Q$);
 
 -- Window function with inlined CTE
 WITH cte as (

From 8bb082e77d8bf13ed92c5e1726e7a0e4321223e4 Mon Sep 17 00:00:00 2001
From: Jelte Fennema
Date: Wed, 31 Aug 2022 13:55:47 +0200
Subject: [PATCH 02/16] Fix reporting of progress on waiting and moved shards (#6274)

In commit 31faa88a4e I removed some features of the rebalance progress
monitor. I did this because the plan was to remove the foreground shard
rebalancer later in the PR that would add the background shard rebalancer.
So, I didn't want to spend time fixing something that we would throw away
anyway. As it turns out we're not removing the foreground shard rebalancer
after all, so it made sense to fix the stuff that I broke. This PR does
that. For the most part this commit reverts the changes in commit
31faa88a4e. It's not a full revert though, because it keeps the improved
tests and the changes to `citus_move_shard_placement`.
---
 .../distributed/operations/repair_shards.c | 30 ++--
 .../distributed/operations/shard_rebalancer.c | 130 ++++++++++++++++--
 .../distributed/progress/multi_progress.c | 8 +-
 src/include/distributed/shard_rebalancer.h | 6 +-
 .../isolation_shard_rebalancer_progress.out | 16 ++-
 5 files changed, 161 insertions(+), 29 deletions(-)

diff --git a/src/backend/distributed/operations/repair_shards.c b/src/backend/distributed/operations/repair_shards.c
index 193797384..4388b86fd 100644
--- a/src/backend/distributed/operations/repair_shards.c
+++ b/src/backend/distributed/operations/repair_shards.c
@@ -21,6 +21,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_enum.h"
 #include "distributed/adaptive_executor.h"
+#include "distributed/backend_data.h"
 #include "distributed/citus_ruleutils.h"
 #include "distributed/colocation_utils.h"
 #include "distributed/commands.h"
@@ -397,15 +398,28 @@ citus_move_shard_placement(PG_FUNCTION_ARGS)
 targetNodeName, targetNodePort);
 
- WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort);
- WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort);
+ /*
+ * We want to be able to track progress of shard moves using
+ * get_rebalancer_progress. If this move is initiated by the rebalancer,
+ * then the rebalancer call has already set up the shared memory that is
+ * used to do that. But if citus_move_shard_placement is called directly by
+ * the user (or through any other mechanism), then the shared memory is not
+ * set up yet. In that case we do it here.
+ */ + if (!IsRebalancerInternalBackend()) + { + WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort); + WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort); - PlacementUpdateEvent *placementUpdateEvent = palloc0(sizeof(PlacementUpdateEvent)); - placementUpdateEvent->updateType = PLACEMENT_UPDATE_MOVE; - placementUpdateEvent->shardId = shardId; - placementUpdateEvent->sourceNode = sourceNode; - placementUpdateEvent->targetNode = targetNode; - SetupRebalanceMonitor(list_make1(placementUpdateEvent), relationId); + PlacementUpdateEvent *placementUpdateEvent = palloc0( + sizeof(PlacementUpdateEvent)); + placementUpdateEvent->updateType = PLACEMENT_UPDATE_MOVE; + placementUpdateEvent->shardId = shardId; + placementUpdateEvent->sourceNode = sourceNode; + placementUpdateEvent->targetNode = targetNode; + SetupRebalanceMonitor(list_make1(placementUpdateEvent), relationId, + REBALANCE_PROGRESS_MOVING); + } /* * At this point of the shard moves, we don't need to block the writes to diff --git a/src/backend/distributed/operations/shard_rebalancer.c b/src/backend/distributed/operations/shard_rebalancer.c index ad7cccb0e..c6079eb83 100644 --- a/src/backend/distributed/operations/shard_rebalancer.c +++ b/src/backend/distributed/operations/shard_rebalancer.c @@ -201,6 +201,8 @@ static int PlacementsHashCompare(const void *lhsKey, const void *rhsKey, Size ke static uint32 PlacementsHashHashCode(const void *key, Size keySize); static bool WorkerNodeListContains(List *workerNodeList, const char *workerName, uint32 workerPort); +static void UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, + int sourcePort, uint64 progress); static bool IsPlacementOnWorkerNode(ShardPlacement *placement, WorkerNode *workerNode); static NodeFillState * FindFillStateForPlacement(RebalanceState *state, ShardPlacement *placement); @@ -258,6 +260,7 @@ PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check); bool RunningUnderIsolationTest = false; int MaxRebalancerLoggedIgnoredMoves = 5; + #ifdef USE_ASSERT_CHECKING /* @@ -763,7 +766,9 @@ ExecutePlacementUpdates(List *placementUpdateList, Oid shardReplicationModeOid, * dsm handle so that it can be used for updating the progress and cleaning things up. 
*/ void -SetupRebalanceMonitor(List *placementUpdateList, Oid relationId) +SetupRebalanceMonitor(List *placementUpdateList, + Oid relationId, + uint64 initialProgressState) { List *colocatedUpdateList = GetColocatedRebalanceSteps(placementUpdateList); ListCell *colocatedUpdateCell = NULL; @@ -787,7 +792,7 @@ SetupRebalanceMonitor(List *placementUpdateList, Oid relationId) event->shardId = colocatedUpdate->shardId; event->sourcePort = colocatedUpdate->sourceNode->workerPort; event->targetPort = colocatedUpdate->targetNode->workerPort; - pg_atomic_init_u64(&event->progress, REBALANCE_PROGRESS_MOVING); + pg_atomic_init_u64(&event->progress, initialProgressState); eventIndex++; } @@ -1186,34 +1191,63 @@ BuildShardSizesHash(ProgressMonitorData *monitor, HTAB *shardStatistics) PlacementUpdateEventProgress *step = placementUpdateEvents + eventIndex; uint64 shardId = step->shardId; + uint64 shardSize = 0; + uint64 backupShardSize = 0; + uint64 progress = pg_atomic_read_u64(&step->progress); - uint64 shardSize = WorkerShardSize(shardStatistics, step->sourceName, - step->sourcePort, shardId); + uint64 sourceSize = WorkerShardSize(shardStatistics, step->sourceName, + step->sourcePort, shardId); + uint64 targetSize = WorkerShardSize(shardStatistics, step->targetName, + step->targetPort, shardId); + + if (progress == REBALANCE_PROGRESS_WAITING || + progress == REBALANCE_PROGRESS_MOVING) + { + /* + * If we are not done with the move, the correct shard size is the + * size on the source. + */ + shardSize = sourceSize; + backupShardSize = targetSize; + } + else if (progress == REBALANCE_PROGRESS_MOVED) + { + /* + * If we are done with the move, the correct shard size is the size + * on the target + */ + shardSize = targetSize; + backupShardSize = sourceSize; + } if (shardSize == 0) { - /* - * It's possible that we are reading the sizes after the move has - * already fininshed. This means that the shards on the source - * might have already been deleted. In that case we instead report - * the size on the target as the shard size, since that is now the - * only existing shard. - */ - shardSize = WorkerShardSize(shardStatistics, step->targetName, - step->targetPort, shardId); - if (shardSize == 0) + if (backupShardSize == 0) { /* * We don't have any useful shard size. This can happen when a * shard is moved multiple times and it is not present on * either of these nodes. Probably the shard is on a worker - * related to the next move. In the weird case that this shard + * related to another event. In the weird case that this shard * is on the nodes and actually is size 0, we will have no * entry in the hashmap. When fetching from it we always * default to 0 if no entry is found, so that's fine. */ continue; } + + /* + * Because of the way we fetch shard sizes they are from a slightly + * earlier moment than the progress state we just read from shared + * memory. Usually this is no problem, but there exist some race + * conditions where this matters. For example, for very quick moves + * it is possible that even though a step is now reported as MOVED, + * when we read the shard sizes the move had not even started yet. + * This in turn can mean that the target size is 0 while the source + * size is not. We try to handle such rare edge cases by falling + * back on the other shard size if that one is not 0. 
+ */ + shardSize = backupShardSize; } @@ -1427,6 +1461,15 @@ GetMovedShardIdsByWorker(PlacementUpdateEventProgress *steps, int stepCount, AddToWorkerShardIdSet(shardsByWorker, step->sourceName, step->sourcePort, step->shardId); + if (pg_atomic_read_u64(&step->progress) == REBALANCE_PROGRESS_WAITING) + { + /* + * shard move has not started so we don't need target stats for + * this shard + */ + continue; + } + AddToWorkerShardIdSet(shardsByWorker, step->targetName, step->targetPort, step->shardId); } @@ -1559,6 +1602,8 @@ RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid) * This uses the first relationId from the list, it's only used for display * purposes so it does not really matter which to show */ + SetupRebalanceMonitor(placementUpdateList, linitial_oid(options->relationIdList), + REBALANCE_PROGRESS_WAITING); ExecutePlacementUpdates(placementUpdateList, shardReplicationModeOid, "Moving"); FinalizeCurrentProgressMonitor(); } @@ -1635,11 +1680,21 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent, errmsg("only moving or copying shards is supported"))); } + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_MOVING); + /* * In case of failure, we throw an error such that rebalance_table_shards * fails early. */ ExecuteRebalancerCommandInSeparateTransaction(placementUpdateCommand->data); + + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_MOVED); } @@ -2700,6 +2755,51 @@ WorkerNodeListContains(List *workerNodeList, const char *workerName, uint32 work } +/* + * UpdateColocatedShardPlacementProgress updates the progress of the given placement, + * along with its colocated placements, to the given state. + */ +static void +UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, int sourcePort, + uint64 progress) +{ + ProgressMonitorData *header = GetCurrentProgressMonitor(); + + if (header != NULL) + { + PlacementUpdateEventProgress *steps = ProgressMonitorSteps(header); + ListCell *colocatedShardIntervalCell = NULL; + + ShardInterval *shardInterval = LoadShardInterval(shardId); + List *colocatedShardIntervalList = ColocatedShardIntervalList(shardInterval); + + for (int moveIndex = 0; moveIndex < header->stepCount; moveIndex++) + { + PlacementUpdateEventProgress *step = steps + moveIndex; + uint64 currentShardId = step->shardId; + bool colocatedShard = false; + + foreach(colocatedShardIntervalCell, colocatedShardIntervalList) + { + ShardInterval *candidateShard = lfirst(colocatedShardIntervalCell); + if (candidateShard->shardId == currentShardId) + { + colocatedShard = true; + break; + } + } + + if (colocatedShard && + strcmp(step->sourceName, sourceName) == 0 && + step->sourcePort == sourcePort) + { + pg_atomic_write_u64(&step->progress, progress); + } + } + } +} + + /* * pg_dist_rebalance_strategy_enterprise_check is a now removed function, but * to avoid issues during upgrades a C stub is kept. diff --git a/src/backend/distributed/progress/multi_progress.c b/src/backend/distributed/progress/multi_progress.c index 9b9c2faa6..657f3356c 100644 --- a/src/backend/distributed/progress/multi_progress.c +++ b/src/backend/distributed/progress/multi_progress.c @@ -109,11 +109,17 @@ GetCurrentProgressMonitor(void) /* * FinalizeCurrentProgressMonitor releases the dynamic memory segment of the current * progress monitoring data structure and removes the process from - * pg_stat_get_progress_info() output. 
+ * pg_stat_get_progress_info() output. If there's no such dynamic memory + * segment this is a no-op. */ void FinalizeCurrentProgressMonitor(void) { + if (currentProgressDSMHandle == DSM_HANDLE_INVALID) + { + return; + } + dsm_segment *dsmSegment = dsm_find_mapping(currentProgressDSMHandle); if (dsmSegment != NULL) diff --git a/src/include/distributed/shard_rebalancer.h b/src/include/distributed/shard_rebalancer.h index d21894ffc..36c38ffff 100644 --- a/src/include/distributed/shard_rebalancer.h +++ b/src/include/distributed/shard_rebalancer.h @@ -73,7 +73,9 @@ /* *INDENT-ON* */ #define REBALANCE_ACTIVITY_MAGIC_NUMBER 1337 +#define REBALANCE_PROGRESS_WAITING 0 #define REBALANCE_PROGRESS_MOVING 1 +#define REBALANCE_PROGRESS_MOVED 2 /* Enumeration that defines different placement update types */ typedef enum @@ -193,7 +195,9 @@ extern List * ReplicationPlacementUpdates(List *workerNodeList, List *shardPlace extern void ExecuteRebalancerCommandInSeparateTransaction(char *command); extern void AcquirePlacementColocationLock(Oid relationId, int lockMode, const char *operationName); -extern void SetupRebalanceMonitor(List *placementUpdateList, Oid relationId); +extern void SetupRebalanceMonitor(List *placementUpdateList, + Oid relationId, + uint64 initialProgressState); #endif /* SHARD_REBALANCER_H */ diff --git a/src/test/regress/expected/isolation_shard_rebalancer_progress.out b/src/test/regress/expected/isolation_shard_rebalancer_progress.out index 000542bca..731f72c14 100644 --- a/src/test/regress/expected/isolation_shard_rebalancer_progress.out +++ b/src/test/regress/expected/isolation_shard_rebalancer_progress.out @@ -35,7 +35,9 @@ table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname --------------------------------------------------------------------- colocated1|1500001| 49152|localhost | 57637| 49152|localhost | 57638| 0| 1 colocated2|1500005| 376832|localhost | 57637| 376832|localhost | 57638| 0| 1 -(2 rows) +colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 0 +colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 0 +(4 rows) step s2-unlock-1-start: ROLLBACK; @@ -112,9 +114,11 @@ step s7-get-progress: table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname|targetport|target_shard_size|progress --------------------------------------------------------------------- +colocated1|1500001| 73728|localhost | 57637| 0|localhost | 57638| 73728| 2 +colocated2|1500005| 401408|localhost | 57637| 0|localhost | 57638| 401408| 2 colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 1 colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 1 -(2 rows) +(4 rows) step s3-unlock-2-start: ROLLBACK; @@ -205,7 +209,9 @@ table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname --------------------------------------------------------------------- colocated1|1500001| 49152|localhost | 57637| 49152|localhost | 57638| 73728| 1 colocated2|1500005| 376832|localhost | 57637| 376832|localhost | 57638| 401408| 1 -(2 rows) +colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 0 +colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 0 +(4 rows) step s7-release-lock: COMMIT; @@ -288,7 +294,9 @@ table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname --------------------------------------------------------------------- colocated1|1500001| 49152|localhost | 57637| 49152|localhost | 57638| 8192| 1 
colocated2|1500005| 376832|localhost | 57637| 376832|localhost | 57638| 8192| 1 -(2 rows) +colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 0 +colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 0 +(4 rows) step s6-release-advisory-lock: SELECT pg_advisory_unlock(44000, 55152); From c14bf3a6606a4c1ccbdea6d9c05b652d41fd167c Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Wed, 31 Aug 2022 14:09:39 +0200 Subject: [PATCH 03/16] Add a job to CI to check tests for flakyness (#6276) We have lots of flaky tests in CI and most of these random failures are very hard/impossible to reproduce locally. This adds a job definition to CI that allows adding a temporary job to rerun the same test in CI a lot of times. This will very often reproduce the random failures. If you then try to change the test or code to fix the random failure, you can confirm that it's indeed fixed by using this job. A future improvement to this job would be to run it (or a variant of it) automatically for every newly added test, and maybe even changed tests. This is not implemented in this PR. An example of this job running can be found here: https://app.circleci.com/pipelines/github/citusdata/citus/26682/workflows/a2638385-35bc-443c-badc-7713a8101313 --- .circleci/config.yml | 115 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index b846264e8..bab2abdff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,6 +22,14 @@ parameters: style_checker_tools_version: type: string default: '0.8.18' + flaky_test: + type: string + default: '' + flaky_test_make: + type: string + default: check-minimal + + jobs: build: description: Build the citus extension @@ -529,9 +537,116 @@ jobs: name: install dependencies and run ch_benchmark tests no_output_timeout: 20m + test-flakyness: + description: Runs a test multiple times to see if it's flaky + parallelism: 32 + parameters: + pg_major: + description: 'postgres major version' + type: integer + image: + description: 'docker image to use as for the tests' + type: string + default: citus/exttester + image_tag: + description: 'docker image tag to use' + type: string + make: + description: 'make target' + type: string + default: check-minimal + test: + description: 'the test that should be run multiple times' + type: string + runs: + description: 'number of times that the test should be run in total' + type: integer + default: 1600 + docker: + - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' + working_directory: /home/circleci/project + resource_class: small + steps: + - checkout + - attach_workspace: + at: . + - run: + name: 'Install Extension' + command: | + tar xfv "${CIRCLE_WORKING_DIRECTORY}/install-${PG_MAJOR}.tar" --directory / + - run: + name: 'Configure' + command: | + chown -R circleci . 
+ gosu circleci ./configure --without-pg-version-check + - run: + name: 'Enable core dumps' + command: | + ulimit -c unlimited + - run: + name: 'Run minimal tests' + command: | + gosu circleci make -C src/test/regress << parameters.make >> EXTRA_TESTS="$(for i in $(seq << parameters.runs >> | circleci tests split); do echo -n '<< parameters.test >> ' ; done)" + no_output_timeout: 2m + - run: + name: 'Regressions' + command: | + if [ -f "src/test/regress/regression.diffs" ]; then + cat src/test/regress/regression.diffs + exit 1 + fi + when: on_fail + - run: + name: 'Copy coredumps' + command: | + mkdir -p /tmp/core_dumps + if ls core.* 1> /dev/null 2>&1; then + cp core.* /tmp/core_dumps + fi + when: on_fail + - store_artifacts: + name: 'Save regressions' + path: src/test/regress/regression.diffs + - store_artifacts: + name: 'Save mitmproxy output (failure test specific)' + path: src/test/regress/proxy.output + - store_artifacts: + name: 'Save results' + path: src/test/regress/results/ + - store_artifacts: + name: 'Save core dumps' + path: /tmp/core_dumps + - store_artifacts: + name: 'Save coordinator log' + path: src/test/regress/tmp_check/master/log + - store_artifacts: + name: 'Save worker1 log' + path: src/test/regress/tmp_check/worker.57637/log + - store_artifacts: + name: 'Save worker2 log' + path: src/test/regress/tmp_check/worker.57638/log + workflows: version: 2 + flaky_test_debugging: + when: << pipeline.parameters.flaky_test >> + jobs: + - build: + name: build-flaky-15 + pg_major: 15 + image_tag: '<< pipeline.parameters.pg15_version >>' + + - test-flakyness: + name: 'test-15_flaky' + pg_major: 15 + image_tag: '<< pipeline.parameters.pg15_version >>' + requires: [build-flaky-15] + make: '<< pipeline.parameters.flaky_test_make >>' + test: '<< pipeline.parameters.flaky_test >>' + build_and_test: + when: + not: << pipeline.parameters.flaky_test >> jobs: - build: name: build-13 From 317dda6af1e8b6c7bfa1f02f22436a146cd72df3 Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Thu, 1 Sep 2022 11:56:31 +0300 Subject: [PATCH 04/16] Use RelationGetPrimaryKeyIndex for citus catalog tables (#6262) pg_dist_node and pg_dist_colocation have a primary key index, not a replica identity index. Citus catalog tables are created in public schema, which has replica identity index by default as primary key index. Later the citus catalog tables are moved to pg_catalog schema. During pg_upgrade, all tables are recreated, and given that pg_dist_colocation is found in pg_catalog schema, it is recreated in that schema, and when it is recreated it doesn't have a replica identity index, because catalog tables have no replica identity. Further action: Do we even need to acquire this lock on the primary key index? Postgres doesn't acquire such locks on indexes before deleting catalog tuples. Also, catalog tuples don't have replica identities by definition. --- src/backend/distributed/metadata/node_metadata.c | 11 +++++++++-- src/backend/distributed/utils/colocation_utils.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/backend/distributed/metadata/node_metadata.c b/src/backend/distributed/metadata/node_metadata.c index 3f729eccf..f3c61b64c 100644 --- a/src/backend/distributed/metadata/node_metadata.c +++ b/src/backend/distributed/metadata/node_metadata.c @@ -2677,9 +2677,16 @@ DeleteNodeRow(char *nodeName, int32 nodePort) /* * simple_heap_delete() expects that the caller has at least an - * AccessShareLock on replica identity index. 
+ * AccessShareLock on primary key index.
+ *
+ * XXX: This does not seem required, do we really need to acquire this lock?
+ * Postgres doesn't acquire such locks on indexes before deleting catalog tuples.
+ * Linking here the reasons we added this lock acquirement:
+ * https://github.com/citusdata/citus/pull/2851#discussion_r306569462
+ * https://github.com/citusdata/citus/pull/2855#discussion_r313628554
+ * https://github.com/citusdata/citus/issues/1890
 */
- Relation replicaIndex = index_open(RelationGetReplicaIndex(pgDistNode),
+ Relation replicaIndex = index_open(RelationGetPrimaryKeyIndex(pgDistNode),
 AccessShareLock);
 
 ScanKeyInit(&scanKey[0], Anum_pg_dist_node_nodename,
diff --git a/src/backend/distributed/utils/colocation_utils.c b/src/backend/distributed/utils/colocation_utils.c
index bb9488af1..a5a32db0e 100644
--- a/src/backend/distributed/utils/colocation_utils.c
+++ b/src/backend/distributed/utils/colocation_utils.c
@@ -1303,10 +1303,17 @@ DeleteColocationGroupLocally(uint32 colocationId)
 {
 /*
 * simple_heap_delete() expects that the caller has at least an
- * AccessShareLock on replica identity index.
+ * AccessShareLock on primary key index.
+ *
+ * XXX: This does not seem required, do we really need to acquire this lock?
+ * Postgres doesn't acquire such locks on indexes before deleting catalog tuples.
+ * Linking here the reasons we added this lock acquirement:
+ * https://github.com/citusdata/citus/pull/2851#discussion_r306569462
+ * https://github.com/citusdata/citus/pull/2855#discussion_r313628554
+ * https://github.com/citusdata/citus/issues/1890
 */
 Relation replicaIndex =
- index_open(RelationGetReplicaIndex(pgDistColocation),
+ index_open(RelationGetPrimaryKeyIndex(pgDistColocation),
 AccessShareLock);
 
 simple_heap_delete(pgDistColocation, &(heapTuple->t_self));

From 9e2b96caa56e6a9486254ae945a3db73707a3440 Mon Sep 17 00:00:00 2001
From: Naisila Puka <37271756+naisila@users.noreply.github.com>
Date: Thu, 1 Sep 2022 12:32:44 +0300
Subject: [PATCH 05/16] Add pg14->pg15 upgrade test for dist. triggers on part. tables (#6265)

Pre-PG15, renaming the parent trigger on a partitioned table doesn't
recurse to renaming the child triggers on the partitions. In PG15,
renaming a trigger on a partitioned table recurses to renaming the
triggers on the partitions as well.

Add an upgrade test to make sure we are not breaking anything with
distributed triggers on distributed partitioned tables.
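
For illustration, the behavior difference that the upgrade test exercises
boils down to the minimal sketch below. The table, partition and trigger
names are the ones created by upgrade_distributed_triggers_before; nothing
beyond what the test itself runs is introduced here:

    ALTER TRIGGER record_sale_trigger ON sale
        RENAME TO renamed_record_sale_trigger;

    -- PG14: only the parent's trigger on "sale" gets the new name; the
    -- child triggers on sale_newyork and sale_california keep the old one.
    -- PG15: the child triggers on the partitions are renamed as well.
    SELECT tgrelid::regclass, tgname
    FROM pg_trigger
    WHERE tgname LIKE '%record_sale_trigger%'
    ORDER BY 1, 2;
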
Relevant PG commit: 80ba4bb383538a2ee846fece6a7b8da9518b6866 --- src/test/regress/after_pg_upgrade_schedule | 2 +- src/test/regress/before_pg_upgrade_schedule | 1 + .../upgrade_distributed_triggers_after.out | 226 ++++++++++++++++ .../upgrade_distributed_triggers_after_0.out | 16 ++ .../upgrade_distributed_triggers_before.out | 243 ++++++++++++++++++ .../upgrade_distributed_triggers_before_0.out | 14 + .../upgrade_distributed_triggers_after.sql | 72 ++++++ .../upgrade_distributed_triggers_before.sql | 116 +++++++++ 8 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_after.out create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_after_0.out create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_before.out create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_before_0.out create mode 100644 src/test/regress/sql/upgrade_distributed_triggers_after.sql create mode 100644 src/test/regress/sql/upgrade_distributed_triggers_before.sql diff --git a/src/test/regress/after_pg_upgrade_schedule b/src/test/regress/after_pg_upgrade_schedule index 52b88d71d..6b8c3973f 100644 --- a/src/test/regress/after_pg_upgrade_schedule +++ b/src/test/regress/after_pg_upgrade_schedule @@ -1,4 +1,4 @@ -test: upgrade_basic_after upgrade_type_after upgrade_ref2ref_after upgrade_distributed_function_after upgrade_rebalance_strategy_after upgrade_list_citus_objects upgrade_autoconverted_after upgrade_citus_stat_activity upgrade_citus_locks +test: upgrade_basic_after upgrade_type_after upgrade_ref2ref_after upgrade_distributed_function_after upgrade_rebalance_strategy_after upgrade_list_citus_objects upgrade_autoconverted_after upgrade_citus_stat_activity upgrade_citus_locks upgrade_distributed_triggers_after # This attempts dropping citus extension (and rollbacks), so please do # not run in parallel with any other tests. diff --git a/src/test/regress/before_pg_upgrade_schedule b/src/test/regress/before_pg_upgrade_schedule index 93bcce368..671d6fa6f 100644 --- a/src/test/regress/before_pg_upgrade_schedule +++ b/src/test/regress/before_pg_upgrade_schedule @@ -8,6 +8,7 @@ test: upgrade_distributed_function_before upgrade_rebalance_strategy_before test: upgrade_autoconverted_before test: upgrade_citus_stat_activity test: upgrade_citus_locks +test: upgrade_distributed_triggers_before # upgrade_columnar_before renames public schema to citus_schema, so let's # run this test as the last one. diff --git a/src/test/regress/expected/upgrade_distributed_triggers_after.out b/src/test/regress/expected/upgrade_distributed_triggers_after.out new file mode 100644 index 000000000..591f33401 --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_after.out @@ -0,0 +1,226 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_AFTER +-- +-- In PG15, Renaming triggers on partitioned tables +-- recurses to renaming the triggers on the partitions as well. 
+-- Relevant PG commit: +-- 80ba4bb383538a2ee846fece6a7b8da9518b6866 +-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 15 AS server_version_eq_15 +\gset +\if :server_version_eq_15 +\else +\q +\endif +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,"ALTER SYSTEM") + (localhost,10202,t,"ALTER SYSTEM") +(2 rows) + +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,t) + (localhost,10202,t,t) +(2 rows) + +-- after PG15 upgrade, all child triggers have the same name with the parent triggers +-- check that the workers are also updated +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_renamed_trigger | sale_newyork | O + another_renamed_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + renamed_record_sale_trigger | sale_newyork | O + renamed_record_sale_trigger | sale_california | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | renamed_record_sale_trigger + sale_california | renamed_record_sale_trigger + sale_newyork | renamed_record_sale_trigger +(3 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,9) + (localhost,10202,t,9) +(2 rows) + +-- create another partition to verify that all is safe and sound +CREATE TABLE sale_alabama PARTITION OF sale FOR VALUES IN ('AL'); +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_renamed_trigger | sale_newyork | O + another_renamed_trigger | sale_california | O + another_renamed_trigger | sale_alabama | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + renamed_record_sale_trigger | sale | O + renamed_record_sale_trigger | sale_newyork | O + 
renamed_record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(16 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | renamed_record_sale_trigger + sale_alabama | renamed_record_sale_trigger + sale_california | renamed_record_sale_trigger + sale_newyork | renamed_record_sale_trigger +(4 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,12) + (localhost,10202,t,12) +(2 rows) + +-- drop a trigger to verify that all is safe and sound +DROP TRIGGER another_renamed_trigger ON sale; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + renamed_record_sale_trigger | sale | O + renamed_record_sale_trigger | sale_newyork | O + renamed_record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(12 rows) + +-- rename a trigger - note that it also renames the triggers on the partitions +ALTER TRIGGER "renamed_record_sale_trigger" ON "sale" RENAME TO "final_renamed_record_sale_trigger"; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + final_renamed_record_sale_trigger | sale | O + final_renamed_record_sale_trigger | sale_newyork | O + final_renamed_record_sale_trigger | sale_california | O + final_renamed_record_sale_trigger | sale_alabama | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(12 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname 
+--------------------------------------------------------------------- + sale | final_renamed_record_sale_trigger + sale_alabama | final_renamed_record_sale_trigger + sale_california | final_renamed_record_sale_trigger + sale_newyork | final_renamed_record_sale_trigger +(4 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,12) + (localhost,10202,t,12) +(2 rows) + +DROP TRIGGER final_renamed_record_sale_trigger ON sale; +-- create another trigger and rename it +CREATE TRIGGER yet_another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +ALTER TRIGGER "yet_another_trigger" ON "sale" RENAME TO "renamed_yet_another_trigger"; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + renamed_yet_another_trigger | sale | O + renamed_yet_another_trigger | sale_newyork | O + renamed_yet_another_trigger | sale_california | O + renamed_yet_another_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(12 rows) + +DROP SCHEMA upgrade_distributed_triggers CASCADE; +NOTICE: drop cascades to 4 other objects +DETAIL: drop cascades to function record_sale() +drop cascades to table sale +drop cascades to table record_sale +drop cascades to view sale_triggers diff --git a/src/test/regress/expected/upgrade_distributed_triggers_after_0.out b/src/test/regress/expected/upgrade_distributed_triggers_after_0.out new file mode 100644 index 000000000..2b1f5cac1 --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_after_0.out @@ -0,0 +1,16 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_AFTER +-- +-- In PG15, Renaming triggers on partitioned tables +-- recurses to renaming the triggers on the partitions as well. +-- Relevant PG commit: +-- 80ba4bb383538a2ee846fece6a7b8da9518b6866 +-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 15 AS server_version_eq_15 +\gset +\if :server_version_eq_15 +\else +\q diff --git a/src/test/regress/expected/upgrade_distributed_triggers_before.out b/src/test/regress/expected/upgrade_distributed_triggers_before.out new file mode 100644 index 000000000..5e369c84c --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_before.out @@ -0,0 +1,243 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_BEFORE +-- +-- PRE PG15, Renaming the parent triggers on partitioned tables doesn't +-- recurse to renaming the child triggers on the partitions as well. 
+-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 14 AS server_version_eq_14 +\gset +\if :server_version_eq_14 +\else +\q +\endif +CREATE SCHEMA upgrade_distributed_triggers; +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,"ALTER SYSTEM") + (localhost,10202,t,"ALTER SYSTEM") +(2 rows) + +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,t) + (localhost,10202,t,t) +(2 rows) + +CREATE TABLE sale( + sale_date date not null, + state_code text, + product_sku text, + units integer) + PARTITION BY list (state_code); +ALTER TABLE sale ADD CONSTRAINT sale_pk PRIMARY KEY (state_code, sale_date); +CREATE TABLE sale_newyork PARTITION OF sale FOR VALUES IN ('NY'); +CREATE TABLE record_sale( + operation_type text not null, + product_sku text, + state_code text, + units integer, + PRIMARY KEY(state_code, product_sku, operation_type, units)); +SELECT create_distributed_table('sale', 'state_code'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE sale_california PARTITION OF sale FOR VALUES IN ('CA'); +SELECT create_distributed_table('record_sale', 'state_code', colocate_with := 'sale'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE OR REPLACE FUNCTION record_sale() +RETURNS trigger +AS $$ +BEGIN + INSERT INTO upgrade_distributed_triggers.record_sale(operation_type, product_sku, state_code, units) + VALUES (TG_OP, NEW.product_sku, NEW.state_code, NEW.units); + RETURN NULL; +END; +$$ LANGUAGE plpgsql; +-- will rename this trigger +CREATE TRIGGER record_sale_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +-- will rename this trigger +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +-- won't rename this trigger +CREATE TRIGGER not_renamed_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +-- Trigger function should appear on workers +SELECT proname from pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc; + proname +--------------------------------------------------------------------- + record_sale +(1 row) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,1) + (localhost,10202,t,1) +(2 rows) + +-- Trigger should appear on workers +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | record_sale_trigger + sale_california | record_sale_trigger + sale_newyork | record_sale_trigger +(3 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger 
WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,9) + (localhost,10202,t,9) +(2 rows) + +CREATE VIEW sale_triggers AS + SELECT tgname, tgrelid::regclass, tgenabled + FROM pg_trigger + WHERE tgrelid::regclass::text like 'sale%' + ORDER BY 1, 2; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_trigger | sale | O + another_trigger | sale_newyork | O + another_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + +-- rename the triggers - note that it doesn't rename the +-- triggers on the partitions +ALTER TRIGGER record_sale_trigger ON sale RENAME TO renamed_record_sale_trigger; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale_california | record_sale_trigger + sale_newyork | record_sale_trigger +(2 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | renamed_record_sale_trigger +(1 row) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,6) + (localhost,10202,t,6) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,3) + (localhost,10202,t,3) +(2 rows) + +ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_trigger | sale_newyork | O + another_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + +-- although the child triggers haven't been renamed to +-- another_renamed_trigger, they are dropped when the parent is dropped +DROP TRIGGER another_renamed_trigger ON sale; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + 
truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(9 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_renamed_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_renamed_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_trigger | sale_newyork | O + another_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + diff --git a/src/test/regress/expected/upgrade_distributed_triggers_before_0.out b/src/test/regress/expected/upgrade_distributed_triggers_before_0.out new file mode 100644 index 000000000..34a7cedf2 --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_before_0.out @@ -0,0 +1,14 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_BEFORE +-- +-- PRE PG15, Renaming the parent triggers on partitioned tables doesn't +-- recurse to renaming the child triggers on the partitions as well. +-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 14 AS server_version_eq_14 +\gset +\if :server_version_eq_14 +\else +\q diff --git a/src/test/regress/sql/upgrade_distributed_triggers_after.sql b/src/test/regress/sql/upgrade_distributed_triggers_after.sql new file mode 100644 index 000000000..bcbd7432e --- /dev/null +++ b/src/test/regress/sql/upgrade_distributed_triggers_after.sql @@ -0,0 +1,72 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_AFTER +-- +-- In PG15, Renaming triggers on partitioned tables +-- recurses to renaming the triggers on the partitions as well. 
+-- Relevant PG commit: +-- 80ba4bb383538a2ee846fece6a7b8da9518b6866 +-- +-- this test is relevant only for pg14-15 upgrade +-- + +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 15 AS server_version_eq_15 +\gset +\if :server_version_eq_15 +\else +\q +\endif + +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; + +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + +-- after PG15 upgrade, all child triggers have the same name with the parent triggers +-- check that the workers are also updated +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + +-- create another partition to verify that all is safe and sound +CREATE TABLE sale_alabama PARTITION OF sale FOR VALUES IN ('AL'); + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + +-- drop a trigger to verify that all is safe and sound +DROP TRIGGER another_renamed_trigger ON sale; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- rename a trigger - note that it also renames the triggers on the partitions +ALTER TRIGGER "renamed_record_sale_trigger" ON "sale" RENAME TO "final_renamed_record_sale_trigger"; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%';$$); + +DROP TRIGGER final_renamed_record_sale_trigger ON sale; + +-- create another trigger and rename it +CREATE TRIGGER yet_another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +ALTER TRIGGER "yet_another_trigger" ON "sale" RENAME TO "renamed_yet_another_trigger"; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +DROP SCHEMA upgrade_distributed_triggers CASCADE; diff --git a/src/test/regress/sql/upgrade_distributed_triggers_before.sql b/src/test/regress/sql/upgrade_distributed_triggers_before.sql new file mode 100644 index 000000000..a1d4c9db3 --- /dev/null +++ b/src/test/regress/sql/upgrade_distributed_triggers_before.sql @@ -0,0 +1,116 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_BEFORE +-- +-- PRE PG15, Renaming the parent triggers on 
partitioned tables doesn't +-- recurse to renaming the child triggers on the partitions as well. +-- +-- this test is relevant only for pg14-15 upgrade +-- + +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 14 AS server_version_eq_14 +\gset +\if :server_version_eq_14 +\else +\q +\endif + +CREATE SCHEMA upgrade_distributed_triggers; +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; + +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + +CREATE TABLE sale( + sale_date date not null, + state_code text, + product_sku text, + units integer) + PARTITION BY list (state_code); + +ALTER TABLE sale ADD CONSTRAINT sale_pk PRIMARY KEY (state_code, sale_date); + +CREATE TABLE sale_newyork PARTITION OF sale FOR VALUES IN ('NY'); + +CREATE TABLE record_sale( + operation_type text not null, + product_sku text, + state_code text, + units integer, + PRIMARY KEY(state_code, product_sku, operation_type, units)); + +SELECT create_distributed_table('sale', 'state_code'); +CREATE TABLE sale_california PARTITION OF sale FOR VALUES IN ('CA'); +SELECT create_distributed_table('record_sale', 'state_code', colocate_with := 'sale'); + +CREATE OR REPLACE FUNCTION record_sale() +RETURNS trigger +AS $$ +BEGIN + INSERT INTO upgrade_distributed_triggers.record_sale(operation_type, product_sku, state_code, units) + VALUES (TG_OP, NEW.product_sku, NEW.state_code, NEW.units); + RETURN NULL; +END; +$$ LANGUAGE plpgsql; + +-- will rename this trigger +CREATE TRIGGER record_sale_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +-- will rename this trigger +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +-- won't rename this trigger +CREATE TRIGGER not_renamed_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +-- Trigger function should appear on workers +SELECT proname from pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc$$); + +-- Trigger should appear on workers +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + +CREATE VIEW sale_triggers AS + SELECT tgname, tgrelid::regclass, tgenabled + FROM pg_trigger + WHERE tgrelid::regclass::text like 'sale%' + ORDER BY 1, 2; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- rename the triggers - note that it doesn't rename the +-- triggers on the partitions +ALTER TRIGGER record_sale_trigger ON sale RENAME TO renamed_record_sale_trigger; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + +ALTER TRIGGER 
another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- although the child triggers haven't been renamed to +-- another_renamed_trigger, they are dropped when the parent is dropped +DROP TRIGGER another_renamed_trigger ON sale; +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_renamed_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_renamed_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger%';$$); + +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; From 432f399a5d27ea91652157b9e520afe6a28a229a Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Thu, 1 Sep 2022 14:26:43 +0200 Subject: [PATCH 06/16] Allow citus_internal application_name with additional suffix (#6282) Co-authored-by: Marco Slot --- .../distributed/transaction/backend_data.c | 16 ++++++++-------- .../regress/expected/metadata_sync_helpers.out | 16 ++++++++++++++++ src/test/regress/sql/metadata_sync_helpers.sql | 7 +++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 34d90969b..670469cd7 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -1064,16 +1064,16 @@ ExtractGlobalPID(char *applicationName) return INVALID_CITUS_INTERNAL_BACKEND_GPID; } - /* are the remaining characters of the application name numbers */ - uint64 numberOfRemainingChars = strlen(applicationNameCopy) - prefixLength; - if (numberOfRemainingChars <= 0 || - !strisdigit_s(applicationNameCopy + prefixLength, numberOfRemainingChars)) - { - return INVALID_CITUS_INTERNAL_BACKEND_GPID; - } - char *globalPIDString = &applicationNameCopy[prefixLength]; uint64 globalPID = strtoul(globalPIDString, NULL, 10); + if (globalPID == 0) + { + /* + * INVALID_CITUS_INTERNAL_BACKEND_GPID is 0, but just to be explicit + * about how we handle strtoul errors. + */ + return INVALID_CITUS_INTERNAL_BACKEND_GPID; + } return globalPID; } diff --git a/src/test/regress/expected/metadata_sync_helpers.out b/src/test/regress/expected/metadata_sync_helpers.out index 9800f5031..1939fcf1c 100644 --- a/src/test/regress/expected/metadata_sync_helpers.out +++ b/src/test/regress/expected/metadata_sync_helpers.out @@ -108,6 +108,22 @@ BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; SET application_name to 'citus_internal gpid=not a correct gpid'; SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); ERROR: This is an internal Citus function can only be used in a distributed transaction +ROLLBACK; +-- application_name with suffix is ok (e.g. 
pgbouncer might add this) +BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; + SELECT assign_distributed_transaction_id(0, 8, '2021-07-09 15:41:55.542377+02'); + assign_distributed_transaction_id +--------------------------------------------------------------------- + +(1 row) + + SET application_name to 'citus_internal gpid=10000000001 - from 10.12.14.16:10370'; + SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); + citus_internal_add_partition_metadata +--------------------------------------------------------------------- + +(1 row) + ROLLBACK; -- application_name with empty gpid BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; diff --git a/src/test/regress/sql/metadata_sync_helpers.sql b/src/test/regress/sql/metadata_sync_helpers.sql index 1f0ad09d0..ca32c5f16 100644 --- a/src/test/regress/sql/metadata_sync_helpers.sql +++ b/src/test/regress/sql/metadata_sync_helpers.sql @@ -75,6 +75,13 @@ BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); ROLLBACK; +-- application_name with suffix is ok (e.g. pgbouncer might add this) +BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; + SELECT assign_distributed_transaction_id(0, 8, '2021-07-09 15:41:55.542377+02'); + SET application_name to 'citus_internal gpid=10000000001 - from 10.12.14.16:10370'; + SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); +ROLLBACK; + -- application_name with empty gpid BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; SELECT assign_distributed_transaction_id(0, 8, '2021-07-09 15:41:55.542377+02'); From 7c8cc7fc6190a3109ada25fec8537daeb97f8f80 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Fri, 2 Sep 2022 10:12:07 +0300 Subject: [PATCH 07/16] Fix flakiness for view tests (#6284) --- .../expected/citus_local_tables_mx.out | 49 +++++++++++++------ .../regress/sql/citus_local_tables_mx.sql | 16 ++++-- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/src/test/regress/expected/citus_local_tables_mx.out b/src/test/regress/expected/citus_local_tables_mx.out index 6fcad3612..3177ed9e9 100644 --- a/src/test/regress/expected/citus_local_tables_mx.out +++ b/src/test/regress/expected/citus_local_tables_mx.out @@ -887,22 +887,39 @@ ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); ERROR: Citus can not handle circular dependencies between distributed objects -- drop the view&matview with circular dependency DROP VIEW v103 CASCADE; -SET client_min_messages TO DEBUG1; -- now it should successfully add to metadata and create the views on workers ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); -DEBUG: executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v100 (a) AS SELECT loc_tb.a - FROM citus_local_tables_mx.loc_tb; ALTER VIEW citus_local_tables_mx.v100 OWNER TO postgres" -DEBUG: "view v100" has dependency to "table loc_tb" that is not in Citus' metadata -DEBUG: executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v101 (a) AS SELECT loc_tb.a - FROM (citus_local_tables_mx.loc_tb - JOIN citus_local_tables_mx.ref_tb USING (a)); ALTER VIEW citus_local_tables_mx.v101 OWNER TO postgres" -DEBUG: "view v101" has dependency to "table loc_tb" that is not in Citus' metadata -DEBUG: executing "CREATE MATERIALIZED VIEW citus_local_tables_mx.matview_101 USING heap AS SELECT loc_tb.a - FROM citus_local_tables_mx.loc_tb;ALTER MATERIALIZED VIEW citus_local_tables_mx.matview_101 OWNER TO postgres" -DEBUG: 
executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v102 (a) AS SELECT v101.a - FROM citus_local_tables_mx.v101; ALTER VIEW citus_local_tables_mx.v102 OWNER TO postgres" -DEBUG: "view v102" has dependency to "table loc_tb" that is not in Citus' metadata -DEBUG: validating foreign key constraint "fkey_xxxxxxx" +-- verify the views are created on workers +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v100$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,t) + (localhost,57638,t,t) +(2 rows) + +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v101$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,t) + (localhost,57638,t,t) +(2 rows) + +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v102$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,t) + (localhost,57638,t,t) +(2 rows) + +CREATE TABLE loc_tb_2 (a int); +CREATE VIEW v104 AS SELECT * from loc_tb_2; +SET client_min_messages TO DEBUG1; +-- verify the CREATE command for the view is generated correctly +ALTER TABLE loc_tb_2 ADD CONSTRAINT fkey_2 FOREIGN KEY (a) references ref_tb(a); +DEBUG: executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v104 (a) AS SELECT loc_tb_2.a + FROM citus_local_tables_mx.loc_tb_2; ALTER VIEW citus_local_tables_mx.v104 OWNER TO postgres" +DEBUG: "view v104" has dependency to "table loc_tb_2" that is not in Citus' metadata +DEBUG: validating foreign key constraint "fkey_2_1330083" SET client_min_messages TO WARNING; -- works fine select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100, citus_local_tables_mx.v101, citus_local_tables_mx.v102$$); @@ -922,14 +939,14 @@ select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100$ (localhost,57638,f,"ERROR: relation ""citus_local_tables_mx.v100"" does not exist") (2 rows) - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); run_command_on_workers --------------------------------------------------------------------- (localhost,57637,f,"ERROR: relation ""citus_local_tables_mx.v101"" does not exist") (localhost,57638,f,"ERROR: relation ""citus_local_tables_mx.v101"" does not exist") (2 rows) - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); run_command_on_workers --------------------------------------------------------------------- (localhost,57637,f,"ERROR: relation ""citus_local_tables_mx.v102"" does not exist") diff --git a/src/test/regress/sql/citus_local_tables_mx.sql b/src/test/regress/sql/citus_local_tables_mx.sql index 2a2fb70d3..86307cacb 100644 --- a/src/test/regress/sql/citus_local_tables_mx.sql +++ b/src/test/regress/sql/citus_local_tables_mx.sql @@ -465,9 +465,19 @@ ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); -- drop the view&matview with circular dependency DROP VIEW v103 CASCADE; -SET client_min_messages TO DEBUG1; -- now it should successfully add to metadata and create the views on workers ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); +-- verify the views are created on workers +select run_command_on_workers($$SELECT 
count(*)=0 from citus_local_tables_mx.v100$$); +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v101$$); +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v102$$); + +CREATE TABLE loc_tb_2 (a int); +CREATE VIEW v104 AS SELECT * from loc_tb_2; + +SET client_min_messages TO DEBUG1; +-- verify the CREATE command for the view is generated correctly +ALTER TABLE loc_tb_2 ADD CONSTRAINT fkey_2 FOREIGN KEY (a) references ref_tb(a); SET client_min_messages TO WARNING; -- works fine @@ -477,8 +487,8 @@ select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100, ALTER TABLE loc_tb DROP CONSTRAINT fkey; -- fails because fkey is dropped and table is converted to local table select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100$$); - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); INSERT INTO loc_tb VALUES (1), (2); -- test a matview with columnar From 1c5b8588fe6702a7f2169e2b90a810b7c0246f36 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Fri, 2 Sep 2022 14:23:47 +0200 Subject: [PATCH 08/16] Address race condition in InitializeBackendData (#6285) Sometimes in CI our isolation_citus_dist_activity test fails randomly like this: ```diff step s2-view-dist: SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+-------------------------+-------------------+---------------+----------+--------+---------- INSERT INTO test_table VALUES (100, 100); |localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -(1 row) + + SELECT coalesce(to_jsonb(array_agg(csa_from_one_node.*)), '[{}]'::JSONB) + FROM ( + SELECT global_pid, worker_query AS is_worker_query, pg_stat_activity.* FROM + pg_stat_activity LEFT JOIN get_all_active_transactions() ON process_id = pid + ) AS csa_from_one_node; + |localhost | 57636|active | | |postgres|regression +(2 rows) step s3-view-worker: ``` Source: https://app.circleci.com/pipelines/github/citusdata/citus/26692/workflows/3406e4b4-b686-4667-bec6-8253ee0809b1/jobs/765119 I intended to fix this with #6263, but the fix turned out to be insufficient. This PR tries to address the issue by setting distributedCommandOriginator correctly in more situations. However, even with this change it's still possible to reproduce the flaky test in CI. In any case this should fix at least some instances of this issue. 
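As background, the application_name convention this relies on can be illustrated with a short standalone SQL sketch. This is only an illustration of the lenient gpid parsing behavior (the real logic lives in the C function ExtractGlobalPID changed in the earlier patch, which uses strtoul); the sample application_name values come from the regression tests, everything else here is illustrative:

```sql
-- take the digits right after the 'citus_internal gpid=' prefix and ignore
-- any trailing suffix; anything that does not match maps to 0, i.e. an
-- external client backend that originates distributed commands
SELECT app_name,
       COALESCE(substring(app_name FROM '^citus_internal gpid=(\d+)')::bigint, 0) AS gpid
FROM (VALUES
        ('citus_internal gpid=10000000001 - from 10.12.14.16:10370'),
        ('psql')
     ) AS t(app_name);
```

A connection pooler such as pgbouncer may append a suffix after the number, which is why the parse stops at the first non-digit instead of requiring an all-digit remainder.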
In passing this changes the isolation_citus_dist_activity test to allow running it multiple times in a row. --- src/backend/distributed/shared_library_init.c | 14 +++- .../distributed/transaction/backend_data.c | 55 +++++++------- src/include/distributed/backend_data.h | 6 +- .../isolation_citus_dist_activity.out | 72 +++++++++---------- .../spec/isolation_citus_dist_activity.spec | 2 +- 5 files changed, 79 insertions(+), 70 deletions(-) diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 4874a4042..a71467ad6 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -2643,10 +2643,17 @@ StatisticsCollectionGucCheckHook(bool *newval, void **extra, GucSource source) static void CitusAuthHook(Port *port, int status) { - uint64 gpid = ExtractGlobalPID(port->application_name); + /* + * We determine the backend type here because other calls in this hook rely + * on it, both IsExternalClientBackend and InitializeBackendData. These + * calls would normally initialize its value based on the application_name + * global, but this global is not set yet at this point in the connection + * initialization. So here we determine it based on the value from Port. + */ + DetermineCitusBackendType(port->application_name); /* external connections to not have a GPID immediately */ - if (gpid == INVALID_CITUS_INTERNAL_BACKEND_GPID) + if (IsExternalClientBackend()) { /* * We raise the shared connection counter pre-emptively. As a result, we may @@ -2698,7 +2705,8 @@ CitusAuthHook(Port *port, int status) * replication connection. A replication connection backend will never call * StartupCitusBackend, which normally sets up the global PID. */ - InitializeBackendData(gpid); + InitializeBackendData(port->application_name); + /* let other authentication hooks to kick in first */ if (original_client_auth_hook) diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 670469cd7..e641b7a9b 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -105,9 +105,6 @@ static BackendData *MyBackendData = NULL; static CitusBackendType CurrentBackendType = CITUS_BACKEND_NOT_ASSIGNED; -static void DetermineCitusBackendType(void); - - PG_FUNCTION_INFO_V1(assign_distributed_transaction_id); PG_FUNCTION_INFO_V1(get_current_transaction_id); PG_FUNCTION_INFO_V1(get_global_active_transactions); @@ -681,7 +678,7 @@ TotalProcCount(void) * initialized this backend can be correctly shown in citus_lock_waits. 
*/ void -InitializeBackendData(uint64 globalPID) +InitializeBackendData(const char *applicationName) { if (MyBackendData != NULL) { @@ -693,6 +690,8 @@ InitializeBackendData(uint64 globalPID) return; } + uint64 gpid = ExtractGlobalPID(applicationName); + MyBackendData = &backendManagementShmemData->backends[MyProc->pgprocno]; Assert(MyBackendData); @@ -704,20 +703,8 @@ InitializeBackendData(uint64 globalPID) UnSetGlobalPID(); SpinLockAcquire(&MyBackendData->mutex); - - /* - * Use the given globalPID to initialize - */ - if (globalPID == INVALID_CITUS_INTERNAL_BACKEND_GPID) - { - MyBackendData->distributedCommandOriginator = - true; - } - else - { - MyBackendData->globalPID = globalPID; - MyBackendData->distributedCommandOriginator = false; - } + MyBackendData->distributedCommandOriginator = IsExternalClientBackend(); + MyBackendData->globalPID = gpid; SpinLockRelease(&MyBackendData->mutex); /* @@ -1045,7 +1032,7 @@ citus_pid_for_gpid(PG_FUNCTION_ARGS) * if the application name is not compatible with Citus' application names returns 0. */ uint64 -ExtractGlobalPID(char *applicationName) +ExtractGlobalPID(const char *applicationName) { /* does application name exist */ if (!applicationName) @@ -1371,7 +1358,7 @@ IsRebalancerInternalBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { - DetermineCitusBackendType(); + DetermineCitusBackendType(application_name); } return CurrentBackendType == CITUS_REBALANCER_BACKEND; @@ -1387,7 +1374,7 @@ IsCitusInternalBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { - DetermineCitusBackendType(); + DetermineCitusBackendType(application_name); } return CurrentBackendType == CITUS_INTERNAL_BACKEND; @@ -1403,29 +1390,41 @@ IsCitusRunCommandBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { - DetermineCitusBackendType(); + DetermineCitusBackendType(application_name); } return CurrentBackendType == CITUS_RUN_COMMAND_BACKEND; } +bool +IsExternalClientBackend(void) +{ + if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) + { + DetermineCitusBackendType(application_name); + } + + return CurrentBackendType == EXTERNAL_CLIENT_BACKEND; +} + + /* * DetermineCitusBackendType determines the type of backend based on the application_name. 
*/ -static void -DetermineCitusBackendType(void) +void +DetermineCitusBackendType(const char *applicationName) { - if (ExtractGlobalPID(application_name) != INVALID_CITUS_INTERNAL_BACKEND_GPID) + if (ExtractGlobalPID(applicationName) != INVALID_CITUS_INTERNAL_BACKEND_GPID) { CurrentBackendType = CITUS_INTERNAL_BACKEND; } - else if (application_name && strcmp(application_name, CITUS_REBALANCER_NAME) == 0) + else if (applicationName && strcmp(applicationName, CITUS_REBALANCER_NAME) == 0) { CurrentBackendType = CITUS_REBALANCER_BACKEND; } - else if (application_name && - strcmp(application_name, CITUS_RUN_COMMAND_APPLICATION_NAME) == 0) + else if (applicationName && + strcmp(applicationName, CITUS_RUN_COMMAND_APPLICATION_NAME) == 0) { CurrentBackendType = CITUS_RUN_COMMAND_BACKEND; } diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h index 12dbfb6df..b0846c8a7 100644 --- a/src/include/distributed/backend_data.h +++ b/src/include/distributed/backend_data.h @@ -50,7 +50,7 @@ extern void BackendManagementShmemInit(void); extern size_t BackendManagementShmemSize(void); extern void InitializeBackendManagement(void); extern int TotalProcCount(void); -extern void InitializeBackendData(uint64 globalPID); +extern void InitializeBackendData(const char *applicationName); extern void LockBackendSharedMemory(LWLockMode lockMode); extern void UnlockBackendSharedMemory(void); extern void UnSetDistributedTransactionId(void); @@ -62,7 +62,7 @@ extern void SetBackendDataGlobalPID(uint64 globalPID); extern uint64 GetGlobalPID(void); extern void SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator); -extern uint64 ExtractGlobalPID(char *applicationName); +extern uint64 ExtractGlobalPID(const char *applicationName); extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk); extern int ExtractProcessIdFromGlobalPID(uint64 globalPID); extern void GetBackendDataForProc(PGPROC *proc, BackendData *result); @@ -74,9 +74,11 @@ extern LocalTransactionId GetMyProcLocalTransactionId(void); extern int GetExternalClientBackendCount(void); extern uint32 IncrementExternalClientBackendCounter(void); extern void DecrementExternalClientBackendCounter(void); +extern void DetermineCitusBackendType(const char *applicationName); extern bool IsCitusInternalBackend(void); extern bool IsRebalancerInternalBackend(void); extern bool IsCitusRunCommandBackend(void); +extern bool IsExternalClientBackend(void); extern void ResetCitusBackendType(void); #define INVALID_CITUS_INTERNAL_BACKEND_GPID 0 diff --git a/src/test/regress/expected/isolation_citus_dist_activity.out b/src/test/regress/expected/isolation_citus_dist_activity.out index 2f1810f7a..e674eed10 100644 --- a/src/test/regress/expected/isolation_citus_dist_activity.out +++ b/src/test/regress/expected/isolation_citus_dist_activity.out @@ -15,16 +15,16 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-alter-table: ALTER TABLE test_table ADD COLUMN x INT; step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -32,7 +32,7 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), 
('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -42,7 +42,7 @@ query |citus_nodename_for_nodeid|citus_n (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -61,13 +61,13 @@ SELECT worker_apply_shard_ddl_command (1300001, 'public', ' (4 rows) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; starting permutation: s1-cache-connections s1-begin s2-begin s3-begin s1-insert s2-sleep s2-view-dist s3-view-worker s2-rollback s1-commit s3-rollback @@ -85,16 +85,16 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-insert: - INSERT INTO test_table VALUES (100, 100); + INSERT INTO test_table VALUES (100, 100); step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -102,31 +102,31 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state 
|wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- - INSERT INTO test_table VALUES (100, 100); + INSERT INTO test_table VALUES (100, 100); |localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- -INSERT INTO public.test_table_1300008 (column1, column2) VALUES (100, 100)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +INSERT INTO public.test_table_1300003 (column1, column2) VALUES (100, 100)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (1 row) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; starting permutation: s1-cache-connections s1-begin s2-begin s3-begin s1-select s2-sleep s2-view-dist s3-view-worker s2-rollback s1-commit s3-rollback @@ -144,10 +144,10 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-select: SELECT count(*) FROM test_table; @@ -158,7 +158,7 @@ count (1 row) step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -166,7 +166,7 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -176,24 +176,24 @@ query |citus_nodename_for_nodeid|citus_nodeport_f (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), 
state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- -SELECT count(*) AS count FROM public.test_table_1300014 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -SELECT count(*) AS count FROM public.test_table_1300013 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -SELECT count(*) AS count FROM public.test_table_1300012 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -SELECT count(*) AS count FROM public.test_table_1300011 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300004 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300003 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300002 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300001 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (4 rows) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; starting permutation: s1-cache-connections s1-begin s2-begin s3-begin s1-select-router s2-sleep s2-view-dist s3-view-worker s2-rollback s1-commit s3-rollback @@ -211,10 +211,10 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-select-router: SELECT count(*) FROM test_table WHERE column1 = 55; @@ -225,7 +225,7 @@ count (1 row) step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -233,7 +233,7 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), 
('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -243,19 +243,19 @@ query |citus_nodename_for_node (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- -SELECT count(*) AS count FROM public.test_table_1300017 test_table WHERE (column1 OPERATOR(pg_catalog.=) 55)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300002 test_table WHERE (column1 OPERATOR(pg_catalog.=) 55)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (1 row) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; diff --git a/src/test/regress/spec/isolation_citus_dist_activity.spec b/src/test/regress/spec/isolation_citus_dist_activity.spec index 9970ced85..5d62adbb1 100644 --- a/src/test/regress/spec/isolation_citus_dist_activity.spec +++ b/src/test/regress/spec/isolation_citus_dist_activity.spec @@ -9,7 +9,7 @@ setup AS 'citus', $$test_assign_global_pid$$; SET citus.shard_replication_factor TO 1; SET citus.shard_count TO 4; - select setval('pg_dist_shardid_seq', GREATEST(1300000, nextval('pg_dist_shardid_seq'))); + ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1300001; CREATE TABLE test_table(column1 int, column2 int); SELECT create_distributed_table('test_table', 'column1'); From bd1383664890913a839fdda09dc0e0e6b886b55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96nder=20Kalac=C4=B1?= Date: Mon, 5 Sep 2022 18:47:41 +0300 Subject: [PATCH 09/16] Add citus.skip_advisory_lock_permission_checks (#6293) --- src/backend/distributed/shared_library_init.c | 14 +++++++ src/backend/distributed/utils/resource_lock.c | 7 +++- src/include/distributed/resource_lock.h | 2 + .../regress/expected/adv_lock_permission.out | 41 ++++++++++++++++++ src/test/regress/multi_schedule | 2 +- src/test/regress/sql/adv_lock_permission.sql | 42 +++++++++++++++++++ 6 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 src/test/regress/expected/adv_lock_permission.out create mode 100644 src/test/regress/sql/adv_lock_permission.sql diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index a71467ad6..343ac2f0b 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ 
-2076,6 +2076,20 @@ RegisterCitusConfigVariables(void) ShowShardsForAppNamePrefixesAssignHook, NULL); + DefineCustomBoolVariable( + "citus.skip_advisory_lock_permission_checks", + gettext_noop("Postgres would normally enforce some " + "ownership checks while acquiring locks. " + "When this setting is 'off', Citus skips" + "ownership checks on internal advisory " + "locks."), + NULL, + &SkipAdvisoryLockPermissionChecks, + false, + GUC_SUPERUSER_ONLY, + GUC_NO_SHOW_ALL, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.skip_jsonb_validation_in_copy", gettext_noop("Skip validation of JSONB columns on the coordinator during COPY " diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c index 35991ade1..a4187aa65 100644 --- a/src/backend/distributed/utils/resource_lock.c +++ b/src/backend/distributed/utils/resource_lock.c @@ -109,6 +109,8 @@ PG_FUNCTION_INFO_V1(lock_relation_if_exists); /* Config variable managed via guc.c */ bool EnableAcquiringUnsafeLockFromWorkers = false; +bool SkipAdvisoryLockPermissionChecks = false; + /* * lock_shard_metadata allows the shard distribution metadata to be locked @@ -248,7 +250,10 @@ lock_shard_resources(PG_FUNCTION_ARGS) continue; } - EnsureTablePermissions(relationId, aclMask); + if (!SkipAdvisoryLockPermissionChecks) + { + EnsureTablePermissions(relationId, aclMask); + } LockShardResource(shardId, lockMode); } diff --git a/src/include/distributed/resource_lock.h b/src/include/distributed/resource_lock.h index f2a331451..375f6abbf 100644 --- a/src/include/distributed/resource_lock.h +++ b/src/include/distributed/resource_lock.h @@ -149,6 +149,7 @@ enum DistLockConfigs DIST_LOCK_NOWAIT = 2 }; + /* Lock shard/relation metadata for safe modifications */ extern void LockShardDistributionMetadata(int64 shardId, LOCKMODE lockMode); extern void LockPlacementCleanup(void); @@ -194,5 +195,6 @@ extern void AcquireDistributedLockOnRelations(List *relationList, LOCKMODE lockM extern void PreprocessLockStatement(LockStmt *stmt, ProcessUtilityContext context); extern bool EnableAcquiringUnsafeLockFromWorkers; +extern bool SkipAdvisoryLockPermissionChecks; #endif /* RESOURCE_LOCK_H */ diff --git a/src/test/regress/expected/adv_lock_permission.out b/src/test/regress/expected/adv_lock_permission.out new file mode 100644 index 000000000..d034fbb67 --- /dev/null +++ b/src/test/regress/expected/adv_lock_permission.out @@ -0,0 +1,41 @@ +CREATE SCHEMA adv_lock_permission; +SET search_path to adv_lock_permission; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +CREATE ROLE user_1 WITH LOGIN; +CREATE TABLE reference_table_1 (A int); +SELECT create_reference_table('reference_table_1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE reference_table_2 (A int); +SELECT create_reference_table('reference_table_2'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +GRANT USAGE ON SCHEMA adv_lock_permission TO user_1; +GRANT SELECT ON reference_table_1 TO user_1; +GRANT INSERT, UPDATE ON reference_table_2 TO user_1; +SET ROLE user_1; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +SET search_path to adv_lock_permission; +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; +ERROR: permission denied for table 
reference_table_1 +CONTEXT: while executing command on localhost:xxxxx +SET ROLE postgres; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +-- change the role so that it can skip permission checks +ALTER ROLE user_1 SET citus.skip_advisory_lock_permission_checks TO on; +SET ROLE user_1; +SET citus.max_cached_conns_per_worker TO 0; +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; +SET ROLE postgres; +SET client_min_messages TO ERROR; +DROP SCHEMA adv_lock_permission CASCADE; +DROP ROLE user_1; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index a92da42fd..ef64e7657 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -86,7 +86,7 @@ test: multi_agg_type_conversion multi_count_type_conversion recursive_relation_p test: multi_partition_pruning single_hash_repartition_join unsupported_lateral_subqueries test: multi_join_pruning multi_hash_pruning intermediate_result_pruning test: multi_null_minmax_value_pruning cursors -test: modification_correctness +test: modification_correctness adv_lock_permission test: multi_query_directory_cleanup test: multi_task_assignment_policy multi_cross_shard test: multi_utility_statements diff --git a/src/test/regress/sql/adv_lock_permission.sql b/src/test/regress/sql/adv_lock_permission.sql new file mode 100644 index 000000000..2528fdc39 --- /dev/null +++ b/src/test/regress/sql/adv_lock_permission.sql @@ -0,0 +1,42 @@ +CREATE SCHEMA adv_lock_permission; +SET search_path to adv_lock_permission; + +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; + +CREATE ROLE user_1 WITH LOGIN; + +CREATE TABLE reference_table_1 (A int); +SELECT create_reference_table('reference_table_1'); + +CREATE TABLE reference_table_2 (A int); +SELECT create_reference_table('reference_table_2'); + +GRANT USAGE ON SCHEMA adv_lock_permission TO user_1; +GRANT SELECT ON reference_table_1 TO user_1; +GRANT INSERT, UPDATE ON reference_table_2 TO user_1; + +SET ROLE user_1; + +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +SET search_path to adv_lock_permission; + +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; + +SET ROLE postgres; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; + +-- change the role so that it can skip permission checks +ALTER ROLE user_1 SET citus.skip_advisory_lock_permission_checks TO on; + +SET ROLE user_1; + +SET citus.max_cached_conns_per_worker TO 0; +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; + +SET ROLE postgres; +SET client_min_messages TO ERROR; +DROP SCHEMA adv_lock_permission CASCADE; +DROP ROLE user_1; From e6b184593108affeff19a93a0751128c5ad804ce Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Mon, 5 Sep 2022 22:02:18 +0200 Subject: [PATCH 10/16] Change split logic to avoid EnsureReferenceTablesExistOnAllNodesExtended (#6208) Co-authored-by: Marco Slot --- .../distributed/operations/shard_split.c | 3 +- .../distributed/utils/reference_table_utils.c | 79 +++++++++++++++++++ .../distributed/reference_table_utils.h | 1 + .../multi_replicate_reference_table.out | 32 ++++++++ .../sql/multi_replicate_reference_table.sql | 15 ++++ 5 files changed, 129 insertions(+), 1 deletion(-) diff --git 
a/src/backend/distributed/operations/shard_split.c b/src/backend/distributed/operations/shard_split.c index bc4ad208b..f7116d565 100644 --- a/src/backend/distributed/operations/shard_split.c +++ b/src/backend/distributed/operations/shard_split.c @@ -501,6 +501,8 @@ SplitShard(SplitMode splitMode, List *workersForPlacementList = GetWorkerNodesFromWorkerIds(nodeIdsForPlacementList); + ErrorIfNotAllNodesHaveReferenceTableReplicas(workersForPlacementList); + List *sourceColocatedShardIntervalList = NIL; if (colocatedShardIntervalList == NIL) { @@ -517,7 +519,6 @@ SplitShard(SplitMode splitMode, if (splitMode == BLOCKING_SPLIT) { - EnsureReferenceTablesExistOnAllNodesExtended(TRANSFER_MODE_BLOCK_WRITES); BlockingShardSplit( splitOperation, splitWorkflowId, diff --git a/src/backend/distributed/utils/reference_table_utils.c b/src/backend/distributed/utils/reference_table_utils.c index 23e608a18..f4b2f4be3 100644 --- a/src/backend/distributed/utils/reference_table_utils.c +++ b/src/backend/distributed/utils/reference_table_utils.c @@ -50,6 +50,7 @@ static void ReplicateReferenceTableShardToNode(ShardInterval *shardInterval, int nodePort); static bool AnyRelationsModifiedInTransaction(List *relationIdList); static List * ReplicatedMetadataSyncedDistributedTableList(void); +static bool NodeHasAllReferenceTableReplicas(WorkerNode *workerNode); /* exports for SQL callable functions */ PG_FUNCTION_INFO_V1(upgrade_to_reference_table); @@ -688,3 +689,81 @@ ReplicateAllReferenceTablesToNode(WorkerNode *workerNode) } } } + + +/* + * ErrorIfNotAllNodesHaveReferenceTableReplicas throws an error when one of the + * nodes in the list does not have reference table replicas. + */ +void +ErrorIfNotAllNodesHaveReferenceTableReplicas(List *workerNodeList) +{ + WorkerNode *workerNode = NULL; + + foreach_ptr(workerNode, workerNodeList) + { + if (!NodeHasAllReferenceTableReplicas(workerNode)) + { + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("reference tables have not been replicated to " + "node %s:%d yet", + workerNode->workerName, + workerNode->workerPort), + errdetail("Reference tables are lazily replicated after " + "adding a node, but must exist before shards can " + "be created on that node."), + errhint("Run SELECT replicate_reference_tables(); to " + "ensure reference tables exist on all nodes."))); + } + } +} + + +/* + * NodeHasAllReferenceTablesReplicas returns whether the given worker node has reference + * table replicas. If there are no reference tables the function returns true. + * + * This function does not do any locking, so the situation could change immediately after, + * though we can only ever transition from false to true, so only "false" could be the + * incorrect answer. + * + * In the case where the function returns true because no reference tables exist + * on the node, a reference table could be created immediately after. However, the + * creation logic guarantees that this reference table will be created on all the + * nodes, so our answer was correct. 
+ */ +static bool +NodeHasAllReferenceTableReplicas(WorkerNode *workerNode) +{ + List *referenceTableIdList = CitusTableTypeIdList(REFERENCE_TABLE); + + if (list_length(referenceTableIdList) == 0) + { + /* no reference tables exist */ + return true; + } + + Oid referenceTableId = linitial_oid(referenceTableIdList); + List *shardIntervalList = LoadShardIntervalList(referenceTableId); + if (list_length(shardIntervalList) != 1) + { + /* check for corrupt metadata */ + ereport(ERROR, (errmsg("reference table \"%s\" can only have 1 shard", + get_rel_name(referenceTableId)))); + } + + ShardInterval *shardInterval = (ShardInterval *) linitial(shardIntervalList); + List *shardPlacementList = ActiveShardPlacementList(shardInterval->shardId); + + ShardPlacement *placement = NULL; + foreach_ptr(placement, shardPlacementList) + { + if (placement->groupId == workerNode->groupId) + { + /* our worker has a reference table placement */ + return true; + } + } + + return false; +} diff --git a/src/include/distributed/reference_table_utils.h b/src/include/distributed/reference_table_utils.h index a26f16630..80b282126 100644 --- a/src/include/distributed/reference_table_utils.h +++ b/src/include/distributed/reference_table_utils.h @@ -26,5 +26,6 @@ extern void DeleteAllReplicatedTablePlacementsFromNodeGroup(int32 groupId, bool localOnly); extern int CompareOids(const void *leftElement, const void *rightElement); extern void ReplicateAllReferenceTablesToNode(WorkerNode *workerNode); +extern void ErrorIfNotAllNodesHaveReferenceTableReplicas(List *workerNodeList); #endif /* REFERENCE_TABLE_UTILS_H_ */ diff --git a/src/test/regress/expected/multi_replicate_reference_table.out b/src/test/regress/expected/multi_replicate_reference_table.out index 2dd670108..afec8052b 100644 --- a/src/test/regress/expected/multi_replicate_reference_table.out +++ b/src/test/regress/expected/multi_replicate_reference_table.out @@ -1150,6 +1150,38 @@ CREATE TABLE test (x int, y int references ref(a)); SELECT create_distributed_table('test','x'); ERROR: canceling the transaction since it was involved in a distributed deadlock END; +-- verify the split fails if we still need to replicate reference tables +SELECT citus_remove_node('localhost', :worker_2_port); + citus_remove_node +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('test','x'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT citus_add_node('localhost', :worker_2_port); + citus_add_node +--------------------------------------------------------------------- + 1370022 +(1 row) + +SELECT + citus_split_shard_by_split_points(shardid, + ARRAY[(shardminvalue::int + (shardmaxvalue::int - shardminvalue::int)/2)::text], + ARRAY[nodeid, nodeid], + 'force_logical') +FROM + pg_dist_shard, pg_dist_node +WHERE + logicalrelid = 'replicate_reference_table.test'::regclass AND nodename = 'localhost' AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; +ERROR: reference tables have not been replicated to node localhost:xxxxx yet +DETAIL: Reference tables are lazily replicated after adding a node, but must exist before shards can be created on that node. +HINT: Run SELECT replicate_reference_tables(); to ensure reference tables exist on all nodes. 
-- test adding an invalid node while we have reference tables to replicate -- set client message level to ERROR and verbosity to terse to supporess -- OS-dependent host name resolution warnings diff --git a/src/test/regress/sql/multi_replicate_reference_table.sql b/src/test/regress/sql/multi_replicate_reference_table.sql index c34007ad7..172d08f35 100644 --- a/src/test/regress/sql/multi_replicate_reference_table.sql +++ b/src/test/regress/sql/multi_replicate_reference_table.sql @@ -700,6 +700,21 @@ CREATE TABLE test (x int, y int references ref(a)); SELECT create_distributed_table('test','x'); END; +-- verify the split fails if we still need to replicate reference tables +SELECT citus_remove_node('localhost', :worker_2_port); +SELECT create_distributed_table('test','x'); +SELECT citus_add_node('localhost', :worker_2_port); +SELECT + citus_split_shard_by_split_points(shardid, + ARRAY[(shardminvalue::int + (shardmaxvalue::int - shardminvalue::int)/2)::text], + ARRAY[nodeid, nodeid], + 'force_logical') +FROM + pg_dist_shard, pg_dist_node +WHERE + logicalrelid = 'replicate_reference_table.test'::regclass AND nodename = 'localhost' AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + -- test adding an invalid node while we have reference tables to replicate -- set client message level to ERROR and verbosity to terse to supporess -- OS-dependent host name resolution warnings From fd9b3f4ae9a1a79fa15849f8fd901d5a71203eb0 Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Tue, 6 Sep 2022 11:04:14 +0300 Subject: [PATCH 11/16] Add tests to make sure distributed clone trigger rename fails in PG15 (#6291) Relevant PG commit: 80ba4bb383538a2ee846fece6a7b8da9518b6866 --- src/test/regress/expected/pg15.out | 4 ++++ .../upgrade_distributed_triggers_after.out | 17 +++++++++++++++++ src/test/regress/sql/pg15.sql | 3 +++ .../sql/upgrade_distributed_triggers_after.sql | 5 +++++ 4 files changed, 29 insertions(+) diff --git a/src/test/regress/expected/pg15.out b/src/test/regress/expected/pg15.out index f59586b7a..1b214f4a9 100644 --- a/src/test/regress/expected/pg15.out +++ b/src/test/regress/expected/pg15.out @@ -168,6 +168,10 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; truncate_trigger_xxxxxxx | sale_newyork | O (6 rows) +-- test that we can't rename a distributed clone trigger +ALTER TRIGGER "new_record_sale_trigger" ON "pg15"."sale_newyork" RENAME TO "another_trigger_name"; +ERROR: cannot rename trigger "new_record_sale_trigger" on table "sale_newyork" +HINT: Rename trigger on partitioned table "sale" instead. -- -- In PG15, For GENERATED columns, all dependencies of the generation -- expression are recorded as NORMAL dependencies of the column itself. diff --git a/src/test/regress/expected/upgrade_distributed_triggers_after.out b/src/test/regress/expected/upgrade_distributed_triggers_after.out index 591f33401..2e3f7330b 100644 --- a/src/test/regress/expected/upgrade_distributed_triggers_after.out +++ b/src/test/regress/expected/upgrade_distributed_triggers_after.out @@ -218,6 +218,23 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; truncate_trigger_xxxxxxx | sale_alabama | O (12 rows) +-- after upgrade to PG15, test that we can't rename a distributed clone trigger +ALTER TRIGGER "renamed_yet_another_trigger" ON "sale_alabama" RENAME TO "another_trigger_name"; +ERROR: cannot rename trigger "renamed_yet_another_trigger" on table "sale_alabama" +HINT: Rename trigger on partitioned table "sale" instead. 
+SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%'; + count +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + DROP SCHEMA upgrade_distributed_triggers CASCADE; NOTICE: drop cascades to 4 other objects DETAIL: drop cascades to function record_sale() diff --git a/src/test/regress/sql/pg15.sql b/src/test/regress/sql/pg15.sql index cefc419e8..73cee8054 100644 --- a/src/test/regress/sql/pg15.sql +++ b/src/test/regress/sql/pg15.sql @@ -114,6 +114,9 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; ALTER TRIGGER "record_sale_trigger" ON "pg15"."sale" RENAME TO "new_record_sale_trigger"; SELECT * FROM sale_triggers ORDER BY 1, 2; +-- test that we can't rename a distributed clone trigger +ALTER TRIGGER "new_record_sale_trigger" ON "pg15"."sale_newyork" RENAME TO "another_trigger_name"; + -- -- In PG15, For GENERATED columns, all dependencies of the generation -- expression are recorded as NORMAL dependencies of the column itself. diff --git a/src/test/regress/sql/upgrade_distributed_triggers_after.sql b/src/test/regress/sql/upgrade_distributed_triggers_after.sql index bcbd7432e..681f1896b 100644 --- a/src/test/regress/sql/upgrade_distributed_triggers_after.sql +++ b/src/test/regress/sql/upgrade_distributed_triggers_after.sql @@ -69,4 +69,9 @@ ALTER TRIGGER "yet_another_trigger" ON "sale" RENAME TO "renamed_yet_another_tri SELECT * FROM sale_triggers ORDER BY 1, 2; +-- after upgrade to PG15, test that we can't rename a distributed clone trigger +ALTER TRIGGER "renamed_yet_another_trigger" ON "sale_alabama" RENAME TO "another_trigger_name"; +SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%'; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%';$$); + DROP SCHEMA upgrade_distributed_triggers CASCADE; From d7f41cacbe52c756279b32ae39c95f47b59261fc Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Tue, 6 Sep 2022 12:19:25 +0300 Subject: [PATCH 12/16] Prohibit renaming child trigger on distributed partition pre PG15 (#6290) Pre PG15, renaming child triggers on partitions is allowed. When creating a trigger on a distributed parent partitioned table, the triggers on the shards of the partitions have the same name as the triggers on the corresponding parent shards of the parent table. Therefore, they don't have the same appended shard id as the shard id of the partition. Hence, when trying to rename a child trigger on a partition of a distributed table, we can't correctly find the triggers on the shards of the partition in order to rename them, since we append a different shard id to the name of the trigger. Since we can't find the trigger, we get a misleading error claiming the trigger does not exist. In this commit we prohibit renaming child triggers on distributed partitions altogether. 
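To make the new behavior concrete, here is a minimal sketch of the command that is now rejected on PostgreSQL 14 and earlier; it simply mirrors the regression test added in this patch (the partitioned table sale, its partition sale_newyork, and the trigger another_trigger all come from the existing upgrade_distributed_triggers tests):

    -- sale is a distributed, partitioned table; another_trigger was created on sale,
    -- so its clone on the partition sale_newyork can no longer be renamed directly
    ALTER TRIGGER another_trigger ON sale_newyork RENAME TO another_renamed_trigger;
    -- ERROR: cannot rename child triggers on distributed partitions

On PG15 the same command is already rejected by PostgreSQL itself (with a hint to rename the trigger on the parent partitioned table instead), which is why the new check is only compiled for PG_VERSION_NUM < PG_VERSION_15.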
--- .../commands/distribute_object_ops.c | 2 +- src/backend/distributed/commands/trigger.c | 95 +++++++++++++++++++ src/include/distributed/commands.h | 2 + .../upgrade_distributed_triggers_before.out | 3 + .../upgrade_distributed_triggers_before.sql | 3 + 5 files changed, 104 insertions(+), 1 deletion(-) diff --git a/src/backend/distributed/commands/distribute_object_ops.c b/src/backend/distributed/commands/distribute_object_ops.c index 198d1df9a..74a7ce69b 100644 --- a/src/backend/distributed/commands/distribute_object_ops.c +++ b/src/backend/distributed/commands/distribute_object_ops.c @@ -1176,7 +1176,7 @@ static DistributeObjectOps View_Rename = { static DistributeObjectOps Trigger_Rename = { .deparse = NULL, .qualify = NULL, - .preprocess = NULL, + .preprocess = PreprocessAlterTriggerRenameStmt, .operationType = DIST_OPS_ALTER, .postprocess = PostprocessAlterTriggerRenameStmt, .address = NULL, diff --git a/src/backend/distributed/commands/trigger.c b/src/backend/distributed/commands/trigger.c index ffb0ab5ae..3bb2417e7 100644 --- a/src/backend/distributed/commands/trigger.c +++ b/src/backend/distributed/commands/trigger.c @@ -53,6 +53,9 @@ static void ExtractDropStmtTriggerAndRelationName(DropStmt *dropTriggerStmt, char **relationName); static void ErrorIfDropStmtDropsMultipleTriggers(DropStmt *dropTriggerStmt); static int16 GetTriggerTypeById(Oid triggerId); +#if (PG_VERSION_NUM < PG_VERSION_15) +static void ErrorOutIfCloneTrigger(Oid tgrelid, const char *tgname); +#endif /* GUC that overrides trigger checks for distributed tables and reference tables */ @@ -319,6 +322,40 @@ CreateTriggerEventExtendNames(CreateTrigStmt *createTriggerStmt, char *schemaNam } +/* + * PreprocessAlterTriggerRenameStmt is called before a ALTER TRIGGER RENAME + * command has been executed by standard process utility. This function errors + * out if we are trying to rename a child trigger on a partition of a distributed + * table. In PG15, this is not allowed anyway. + */ +List * +PreprocessAlterTriggerRenameStmt(Node *node, const char *queryString, + ProcessUtilityContext processUtilityContext) +{ +#if (PG_VERSION_NUM < PG_VERSION_15) + RenameStmt *renameTriggerStmt = castNode(RenameStmt, node); + Assert(renameTriggerStmt->renameType == OBJECT_TRIGGER); + + RangeVar *relation = renameTriggerStmt->relation; + + bool missingOk = false; + Oid relationId = RangeVarGetRelid(relation, ALTER_TRIGGER_LOCK_MODE, missingOk); + + if (!IsCitusTable(relationId)) + { + return NIL; + } + + EnsureCoordinator(); + ErrorOutForTriggerIfNotSupported(relationId); + + ErrorOutIfCloneTrigger(relationId, renameTriggerStmt->subname); +#endif + + return NIL; +} + + /* * PostprocessAlterTriggerRenameStmt is called after a ALTER TRIGGER RENAME * command has been executed by standard process utility. This function errors @@ -611,6 +648,64 @@ ErrorOutForTriggerIfNotSupported(Oid relationId) } +#if (PG_VERSION_NUM < PG_VERSION_15) + +/* + * ErrorOutIfCloneTrigger is a helper function to error + * out if we are trying to rename a child trigger on a + * partition of a distributed table. + * A lot of this code is borrowed from PG15 because + * renaming clone triggers isn't allowed in PG15 anymore. + */ +static void +ErrorOutIfCloneTrigger(Oid tgrelid, const char *tgname) +{ + HeapTuple tuple; + ScanKeyData key[2]; + + Relation tgrel = table_open(TriggerRelationId, RowExclusiveLock); + + /* + * Search for the trigger to modify. 
+ */ + ScanKeyInit(&key[0], + Anum_pg_trigger_tgrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(tgrelid)); + ScanKeyInit(&key[1], + Anum_pg_trigger_tgname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(tgname)); + SysScanDesc tgscan = systable_beginscan(tgrel, TriggerRelidNameIndexId, true, + NULL, 2, key); + + if (HeapTupleIsValid(tuple = systable_getnext(tgscan))) + { + Form_pg_trigger trigform = (Form_pg_trigger) GETSTRUCT(tuple); + + /* + * If the trigger descends from a trigger on a parent partitioned + * table, reject the rename. + * Appended shard ids to find the trigger on the partition's shards + * are not correct. Hence we would fail to find the trigger on the + * partition's shard. + */ + if (OidIsValid(trigform->tgparentid)) + { + ereport(ERROR, ( + errmsg( + "cannot rename child triggers on distributed partitions"))); + } + } + + systable_endscan(tgscan); + table_close(tgrel, RowExclusiveLock); +} + + +#endif + + /* * GetDropTriggerStmtRelation takes a DropStmt for a trigger object and returns * RangeVar for the relation that owns the trigger. diff --git a/src/include/distributed/commands.h b/src/include/distributed/commands.h index b7030adee..f660fe19d 100644 --- a/src/include/distributed/commands.h +++ b/src/include/distributed/commands.h @@ -667,6 +667,8 @@ extern List * CreateTriggerStmtObjectAddress(Node *node, bool missingOk, bool isPostprocess); extern void CreateTriggerEventExtendNames(CreateTrigStmt *createTriggerStmt, char *schemaName, uint64 shardId); +extern List * PreprocessAlterTriggerRenameStmt(Node *node, const char *queryString, + ProcessUtilityContext processUtilityContext); extern List * PostprocessAlterTriggerRenameStmt(Node *node, const char *queryString); extern void AlterTriggerRenameEventExtendNames(RenameStmt *renameTriggerStmt, char *schemaName, uint64 shardId); diff --git a/src/test/regress/expected/upgrade_distributed_triggers_before.out b/src/test/regress/expected/upgrade_distributed_triggers_before.out index 5e369c84c..d049c2b0a 100644 --- a/src/test/regress/expected/upgrade_distributed_triggers_before.out +++ b/src/test/regress/expected/upgrade_distributed_triggers_before.out @@ -241,3 +241,6 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; truncate_trigger_xxxxxxx | sale_california | O (12 rows) +-- check that we can't rename child triggers on partitions of distributed tables +ALTER TRIGGER another_trigger ON sale_newyork RENAME TO another_renamed_trigger; +ERROR: cannot rename child triggers on distributed partitions diff --git a/src/test/regress/sql/upgrade_distributed_triggers_before.sql b/src/test/regress/sql/upgrade_distributed_triggers_before.sql index a1d4c9db3..9e0e5b7bb 100644 --- a/src/test/regress/sql/upgrade_distributed_triggers_before.sql +++ b/src/test/regress/sql/upgrade_distributed_triggers_before.sql @@ -114,3 +114,6 @@ FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- check that we can't rename child triggers on partitions of distributed tables +ALTER TRIGGER another_trigger ON sale_newyork RENAME TO another_renamed_trigger; From 85b19c851accf35a86ae8bb63fdd9abfa0ef58b9 Mon Sep 17 00:00:00 2001 From: Hanefi Onaldi Date: Mon, 5 Sep 2022 16:05:46 +0300 Subject: [PATCH 13/16] Disallow distributing by numeric with negative scale PG15 allows numeric scale to be negative or greater than precision. 
This causes issues and we may end up routing queries to a wrong shard due to differing hash results after rounding. Formerly, when specifying NUMERIC(precision, scale), the scale had to be in the range [0, precision], which was per SQL spec. PG15 extends the range of allowed scales to [-1000, 1000]. A negative scale implies rounding before the decimal point. For example, a column might be declared with a scale of -3 to round values to the nearest thousand. Note that the display scale remains non-negative, so in this case the display scale will be zero, and all digits before the decimal point will be displayed. Relevant PG commit: 085f931f52494e1f304e35571924efa6fcdc2b44 --- .../commands/create_distributed_table.c | 75 +++++++++++++++++++ src/test/regress/expected/pg15.out | 31 +++++++- src/test/regress/sql/pg15.sql | 10 +++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/src/backend/distributed/commands/create_distributed_table.c b/src/backend/distributed/commands/create_distributed_table.c index fd35acce2..a3ff6c4cb 100644 --- a/src/backend/distributed/commands/create_distributed_table.c +++ b/src/backend/distributed/commands/create_distributed_table.c @@ -139,6 +139,14 @@ static Oid DropFKeysAndUndistributeTable(Oid relationId); static void DropFKeysRelationInvolvedWithTableType(Oid relationId, int tableTypeFlag); static void CopyLocalDataIntoShards(Oid relationId); static List * TupleDescColumnNameList(TupleDesc tupleDescriptor); + +#if (PG_VERSION_NUM >= PG_VERSION_15) +static bool DistributionColumnUsesNumericColumnNegativeScale(TupleDesc relationDesc, + Var *distributionColumn); +static int numeric_typmod_scale(int32 typmod); +static bool is_valid_numeric_typmod(int32 typmod); +#endif + static bool DistributionColumnUsesGeneratedStoredColumn(TupleDesc relationDesc, Var *distributionColumn); static bool CanUseExclusiveConnections(Oid relationId, bool localTableEmpty); @@ -1681,6 +1689,20 @@ EnsureRelationCanBeDistributed(Oid relationId, Var *distributionColumn, "AS (...) STORED."))); } +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* verify target relation is not distributed by a column of type numeric with negative scale */ + if (distributionMethod != DISTRIBUTE_BY_NONE && + DistributionColumnUsesNumericColumnNegativeScale(relationDesc, + distributionColumn)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot distribute relation: %s", relationName), + errdetail("Distribution column must not use numeric type " + "with negative scale"))); + } +#endif + /* check for support function needed by specified partition method */ if (distributionMethod == DISTRIBUTE_BY_HASH) { @@ -2401,6 +2423,59 @@ RelationUsesIdentityColumns(TupleDesc relationDesc) } +#if (PG_VERSION_NUM >= PG_VERSION_15) + +/* + * is_valid_numeric_typmod checks if the typmod value is valid + * + * Because of the offset, valid numeric typmods are at least VARHDRSZ + * + * Copied from PG. See numeric.c for understanding how this works. + */ +static bool +is_valid_numeric_typmod(int32 typmod) +{ + return typmod >= (int32) VARHDRSZ; +} + + +/* + * numeric_typmod_scale extracts the scale from a numeric typmod. + * + * Copied from PG. See numeric.c for understanding how this works. 
+ * + */ +static int +numeric_typmod_scale(int32 typmod) +{ + return (((typmod - VARHDRSZ) & 0x7ff) ^ 1024) - 1024; +} + + +/* + * DistributionColumnUsesNumericColumnNegativeScale returns whether a given relation uses + * numeric data type with negative scale on distribution column + */ +static bool +DistributionColumnUsesNumericColumnNegativeScale(TupleDesc relationDesc, + Var *distributionColumn) +{ + Form_pg_attribute attributeForm = TupleDescAttr(relationDesc, + distributionColumn->varattno - 1); + + if (attributeForm->atttypid == NUMERICOID && + is_valid_numeric_typmod(attributeForm->atttypmod) && + numeric_typmod_scale(attributeForm->atttypmod) < 0) + { + return true; + } + + return false; +} + + +#endif + /* * DistributionColumnUsesGeneratedStoredColumn returns whether a given relation uses * GENERATED ALWAYS AS (...) STORED on distribution column diff --git a/src/test/regress/expected/pg15.out b/src/test/regress/expected/pg15.out index 1b214f4a9..e4d24af35 100644 --- a/src/test/regress/expected/pg15.out +++ b/src/test/regress/expected/pg15.out @@ -392,9 +392,37 @@ ON (true) WHEN MATCHED THEN UPDATE SET x = (SELECT count(*) FROM tbl2); ERROR: MERGE command is not supported on Citus tables yet +-- test numeric types with negative scale +CREATE TABLE numeric_negative_scale(numeric_column numeric(3,-1), orig_value int); +INSERT into numeric_negative_scale SELECT x,x FROM generate_series(111, 115) x; +-- verify that we can not distribute by a column that has numeric type with negative scale +SELECT create_distributed_table('numeric_negative_scale','numeric_column'); +ERROR: cannot distribute relation: numeric_negative_scale +DETAIL: Distribution column must not use numeric type with negative scale +-- However, we can distribute by other columns +SELECT create_distributed_table('numeric_negative_scale','orig_value'); +NOTICE: Copying data from local table... +NOTICE: copying the data has completed +DETAIL: The local data in the table is no longer visible, but is still on disk. 
+HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$pg15.numeric_negative_scale$$) + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM numeric_negative_scale ORDER BY 1,2; + numeric_column | orig_value +--------------------------------------------------------------------- + 110 | 111 + 110 | 112 + 110 | 113 + 110 | 114 + 120 | 115 +(5 rows) + -- Clean up DROP SCHEMA pg15 CASCADE; -NOTICE: drop cascades to 9 other objects +NOTICE: drop cascades to 10 other objects DETAIL: drop cascades to collation german_phonebook_test drop cascades to collation default_provider drop cascades to table sale @@ -404,3 +432,4 @@ drop cascades to view sale_triggers drop cascades to table generated_stored_ref drop cascades to table tbl1 drop cascades to table tbl2 +drop cascades to table numeric_negative_scale diff --git a/src/test/regress/sql/pg15.sql b/src/test/regress/sql/pg15.sql index 73cee8054..189299bd8 100644 --- a/src/test/regress/sql/pg15.sql +++ b/src/test/regress/sql/pg15.sql @@ -245,5 +245,15 @@ ON (true) WHEN MATCHED THEN UPDATE SET x = (SELECT count(*) FROM tbl2); +-- test numeric types with negative scale +CREATE TABLE numeric_negative_scale(numeric_column numeric(3,-1), orig_value int); +INSERT into numeric_negative_scale SELECT x,x FROM generate_series(111, 115) x; +-- verify that we can not distribute by a column that has numeric type with negative scale +SELECT create_distributed_table('numeric_negative_scale','numeric_column'); +-- However, we can distribute by other columns +SELECT create_distributed_table('numeric_negative_scale','orig_value'); + +SELECT * FROM numeric_negative_scale ORDER BY 1,2; + -- Clean up DROP SCHEMA pg15 CASCADE; From 69726648ab190c65f487427737fb59b851e305d7 Mon Sep 17 00:00:00 2001 From: aykut-bozkurt <51649454+aykut-bozkurt@users.noreply.github.com> Date: Tue, 6 Sep 2022 16:29:14 +0300 Subject: [PATCH 14/16] verify shards if exists for insert, delete, update (#6280) Co-authored-by: Marco Slot --- .../distributed/executor/citus_custom_scan.c | 76 ++++ .../distributed/metadata/metadata_cache.c | 122 ++++--- .../planner/combine_query_planner.c | 3 +- .../distributed/planner/deparse_shard_query.c | 15 +- .../planner/multi_router_planner.c | 3 - src/backend/distributed/utils/shard_utils.c | 3 + .../distributed/combine_query_planner.h | 1 + src/include/distributed/metadata_cache.h | 2 +- .../isolation_blocking_shard_split.out | 38 +- .../isolation_non_blocking_shard_split.out | 6 +- .../expected/isolation_tenant_isolation.out | 263 ++++++++++---- ...isolation_tenant_isolation_nonblocking.out | 324 ++++++++++++++---- .../spec/isolation_tenant_isolation.spec | 9 + ...solation_tenant_isolation_nonblocking.spec | 9 + 14 files changed, 655 insertions(+), 219 deletions(-) diff --git a/src/backend/distributed/executor/citus_custom_scan.c b/src/backend/distributed/executor/citus_custom_scan.c index 7ce986203..5e4afd1a7 100644 --- a/src/backend/distributed/executor/citus_custom_scan.c +++ b/src/backend/distributed/executor/citus_custom_scan.c @@ -31,6 +31,7 @@ #include "distributed/multi_server_executor.h" #include "distributed/multi_router_planner.h" #include "distributed/query_stats.h" +#include "distributed/shard_utils.h" #include "distributed/subplan_execution.h" #include "distributed/worker_log_messages.h" #include "distributed/worker_protocol.h" @@ -67,6 +68,9 @@ static void CitusEndScan(CustomScanState *node); static void CitusReScan(CustomScanState *node); 
static void SetJobColocationId(Job *job); static void EnsureForceDelegationDistributionKey(Job *job); +static void EnsureAnchorShardsInJobExist(Job *job); +static bool AnchorShardsInTaskListExist(List *taskList); +static void TryToRerouteFastPathModifyQuery(Job *job); /* create custom scan methods for all executors */ @@ -406,6 +410,19 @@ CitusBeginModifyScan(CustomScanState *node, EState *estate, int eflags) /* prevent concurrent placement changes */ AcquireMetadataLocks(workerJob->taskList); + /* + * In case of a split, the shard might no longer be available. In that + * case try to reroute. We can only do this for fast path queries. + */ + if (currentPlan->fastPathRouterPlan && + !AnchorShardsInTaskListExist(workerJob->taskList)) + { + TryToRerouteFastPathModifyQuery(workerJob); + } + + /* ensure there is no invalid shard */ + EnsureAnchorShardsInJobExist(workerJob); + /* modify tasks are always assigned using first-replica policy */ workerJob->taskList = FirstReplicaAssignTaskList(workerJob->taskList); } @@ -440,6 +457,65 @@ CitusBeginModifyScan(CustomScanState *node, EState *estate, int eflags) } +/* + * TryToRerouteFastPathModifyQuery tries to reroute non-existent shards in given job if it finds any such shard, + * only for fastpath queries. + * + * Should only be called if the job belongs to a fastpath modify query + */ +static void +TryToRerouteFastPathModifyQuery(Job *job) +{ + if (job->jobQuery->commandType == CMD_INSERT) + { + RegenerateTaskListForInsert(job); + } + else + { + RegenerateTaskForFasthPathQuery(job); + RebuildQueryStrings(job); + } +} + + +/* + * EnsureAnchorShardsInJobExist ensures all shards are valid in job. + * If it finds a non-existent shard in given job, it throws an error. + */ +static void +EnsureAnchorShardsInJobExist(Job *job) +{ + if (!AnchorShardsInTaskListExist(job->taskList)) + { + ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("shard for the given value does not exist"), + errdetail( + "A concurrent shard split may have moved the data into a new set of shards."), + errhint("Retry the query."))); + } +} + + +/* + * AnchorShardsInTaskListExist checks whether all the anchor shards in the task list + * still exist. + */ +static bool +AnchorShardsInTaskListExist(List *taskList) +{ + Task *task = NULL; + foreach_ptr(task, taskList) + { + if (!ShardExists(task->anchorShardId)) + { + return false; + } + } + + return true; +} + + /* * ModifyJobNeedsEvaluation checks whether the functions and parameters in the job query * need to be evaluated before we can build task query strings. 
diff --git a/src/backend/distributed/metadata/metadata_cache.c b/src/backend/distributed/metadata/metadata_cache.c index afce210c8..7779c5e07 100644 --- a/src/backend/distributed/metadata/metadata_cache.c +++ b/src/backend/distributed/metadata/metadata_cache.c @@ -231,7 +231,7 @@ static ScanKeyData DistObjectScanKey[3]; /* local function forward declarations */ static HeapTuple PgDistPartitionTupleViaCatalog(Oid relationId); -static ShardIdCacheEntry * LookupShardIdCacheEntry(int64 shardId); +static ShardIdCacheEntry * LookupShardIdCacheEntry(int64 shardId, bool missingOk); static CitusTableCacheEntry * BuildCitusTableCacheEntry(Oid relationId); static void BuildCachedShardList(CitusTableCacheEntry *cacheEntry); static void PrepareWorkerNodeCache(void); @@ -280,10 +280,11 @@ static Oid LookupEnumValueId(Oid typeId, char *valueName); static void InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot); static void InvalidateDistTableCache(void); static void InvalidateDistObjectCache(void); -static void InitializeTableCacheEntry(int64 shardId); +static bool InitializeTableCacheEntry(int64 shardId, bool missingOk); static bool IsCitusTableTypeInternal(char partitionMethod, char replicationModel, CitusTableType tableType); -static bool RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry); +static bool RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry, bool + missingOk); static Oid DistAuthinfoRelationId(void); static Oid DistAuthinfoIndexId(void); @@ -781,7 +782,8 @@ CitusTableList(void) ShardInterval * LoadShardInterval(uint64 shardId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; @@ -798,13 +800,33 @@ LoadShardInterval(uint64 shardId) } +/* + * ShardExists returns whether given shard exists or not. It fails if missingOk is false + * and shard is not found. + */ +bool +ShardExists(uint64 shardId) +{ + bool missingOk = true; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); + + if (!shardIdEntry) + { + return false; + } + + return true; +} + + /* * RelationIdOfShard returns the relationId of the given shardId. 
*/ Oid RelationIdForShard(uint64 shardId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; return tableEntry->relationId; } @@ -817,7 +839,8 @@ RelationIdForShard(uint64 shardId) bool ReferenceTableShardId(uint64 shardId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; return IsCitusTableTypeCacheEntry(tableEntry, REFERENCE_TABLE); } @@ -835,7 +858,8 @@ DistributedTableShardId(uint64 shardId) return false; } - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; return IsCitusTableTypeCacheEntry(tableEntry, DISTRIBUTED_TABLE); } @@ -850,7 +874,8 @@ DistributedTableShardId(uint64 shardId) GroupShardPlacement * LoadGroupShardPlacement(uint64 shardId, uint64 placementId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; @@ -885,7 +910,8 @@ LoadGroupShardPlacement(uint64 shardId, uint64 placementId) ShardPlacement * LoadShardPlacement(uint64 shardId, uint64 placementId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; GroupShardPlacement *groupPlacement = LoadGroupShardPlacement(shardId, placementId); @@ -908,7 +934,8 @@ ShardPlacementOnGroupIncludingOrphanedPlacements(int32 groupId, uint64 shardId) { ShardPlacement *placementOnNode = NULL; - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; GroupShardPlacement *placementArray = @@ -1128,7 +1155,8 @@ ShardPlacementListIncludingOrphanedPlacements(uint64 shardId) { List *placementList = NIL; - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; @@ -1168,15 +1196,24 @@ ShardPlacementListIncludingOrphanedPlacements(uint64 shardId) * build the cache entry. Afterwards we know that the shard has to be in the * cache if it exists. If the shard does *not* exist, this function errors * (because LookupShardRelationFromCatalog errors out). + * + * If missingOk is true and the shard cannot be found, the function returns false. 
*/ -static void -InitializeTableCacheEntry(int64 shardId) +static bool +InitializeTableCacheEntry(int64 shardId, bool missingOk) { - bool missingOk = false; Oid relationId = LookupShardRelationFromCatalog(shardId, missingOk); + if (!OidIsValid(relationId)) + { + Assert(missingOk); + return false; + } + /* trigger building the cache for the shard id */ GetCitusTableCacheEntry(relationId); /* lgtm[cpp/return-value-ignored] */ + + return true; } @@ -1186,7 +1223,7 @@ InitializeTableCacheEntry(int64 shardId) * entry in the cache and false if it didn't. */ static bool -RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry) +RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry, bool missingOk) { /* * We might have some concurrent metadata changes. In order to get the changes, @@ -1198,7 +1235,8 @@ RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry) return false; } Oid oldRelationId = shardEntry->tableEntry->relationId; - Oid currentRelationId = LookupShardRelationFromCatalog(shardEntry->shardId, false); + Oid currentRelationId = LookupShardRelationFromCatalog(shardEntry->shardId, + missingOk); /* * The relation OID to which the shard belongs could have changed, @@ -1213,11 +1251,12 @@ RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry) /* - * LookupShardCacheEntry returns the cache entry belonging to a shard, or - * errors out if that shard is unknown. + * LookupShardCacheEntry returns the cache entry belonging to a shard. + * It errors out if that shard is unknown and missingOk is false. Else, + * it will return a NULL cache entry. */ static ShardIdCacheEntry * -LookupShardIdCacheEntry(int64 shardId) +LookupShardIdCacheEntry(int64 shardId, bool missingOk) { bool foundInCache = false; bool recheck = false; @@ -1231,12 +1270,16 @@ LookupShardIdCacheEntry(int64 shardId) if (!foundInCache) { - InitializeTableCacheEntry(shardId); + if (!InitializeTableCacheEntry(shardId, missingOk)) + { + return NULL; + } + recheck = true; } else { - recheck = RefreshTableCacheEntryIfInvalid(shardEntry); + recheck = RefreshTableCacheEntryIfInvalid(shardEntry, missingOk); } /* @@ -1250,7 +1293,8 @@ LookupShardIdCacheEntry(int64 shardId) if (!foundInCache) { - ereport(ERROR, (errmsg("could not find valid entry for shard " + int eflag = (missingOk) ? DEBUG1 : ERROR; + ereport(eflag, (errmsg("could not find valid entry for shard " UINT64_FORMAT, shardId))); } } @@ -4253,6 +4297,9 @@ InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot) { /* reload the metadata */ cacheSlot->citusTableMetadata->isValid = false; + + /* clean up ShardIdCacheHash */ + RemoveStaleShardIdCacheEntries(cacheSlot->citusTableMetadata); } } @@ -4653,37 +4700,6 @@ LookupShardRelationFromCatalog(int64 shardId, bool missingOk) } -/* - * ShardExists returns whether the given shard ID exists in pg_dist_shard. 
- */ -bool -ShardExists(int64 shardId) -{ - ScanKeyData scanKey[1]; - int scanKeyCount = 1; - Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock); - bool shardExists = false; - - ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid, - BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId)); - - SysScanDesc scanDescriptor = systable_beginscan(pgDistShard, - DistShardShardidIndexId(), true, - NULL, scanKeyCount, scanKey); - - HeapTuple heapTuple = systable_getnext(scanDescriptor); - if (HeapTupleIsValid(heapTuple)) - { - shardExists = true; - } - - systable_endscan(scanDescriptor); - table_close(pgDistShard, NoLock); - - return shardExists; -} - - /* * GetPartitionTypeInputInfo populates output parameters with the interval type * identifier and modifier for the specified partition key/method combination. diff --git a/src/backend/distributed/planner/combine_query_planner.c b/src/backend/distributed/planner/combine_query_planner.c index 0a871f3e6..1ada6bcc0 100644 --- a/src/backend/distributed/planner/combine_query_planner.c +++ b/src/backend/distributed/planner/combine_query_planner.c @@ -30,7 +30,6 @@ static List * RemoteScanTargetList(List *workerTargetList); static PlannedStmt * BuildSelectStatementViaStdPlanner(Query *combineQuery, List *remoteScanTargetList, CustomScan *remoteScan); -static bool FindCitusExtradataContainerRTE(Node *node, RangeTblEntry **result); static Plan * CitusCustomScanPathPlan(PlannerInfo *root, RelOptInfo *rel, struct CustomPath *best_path, List *tlist, @@ -323,7 +322,7 @@ BuildSelectStatementViaStdPlanner(Query *combineQuery, List *remoteScanTargetLis * Finds the rangetable entry in the query that refers to the citus_extradata_container * and stores the pointer in result. */ -static bool +bool FindCitusExtradataContainerRTE(Node *node, RangeTblEntry **result) { if (node == NULL) diff --git a/src/backend/distributed/planner/deparse_shard_query.c b/src/backend/distributed/planner/deparse_shard_query.c index 11338df65..1c5c62034 100644 --- a/src/backend/distributed/planner/deparse_shard_query.c +++ b/src/backend/distributed/planner/deparse_shard_query.c @@ -17,6 +17,7 @@ #include "catalog/pg_constraint.h" #include "distributed/citus_nodefuncs.h" #include "distributed/citus_ruleutils.h" +#include "distributed/combine_query_planner.h" #include "distributed/deparse_shard_query.h" #include "distributed/insert_select_planner.h" #include "distributed/listutils.h" @@ -79,12 +80,13 @@ RebuildQueryStrings(Job *workerJob) if (UpdateOrDeleteQuery(query)) { + List *relationShardList = task->relationShardList; + /* * For UPDATE and DELETE queries, we may have subqueries and joins, so * we use relation shard list to update shard names and call * pg_get_query_def() directly. 
*/ - List *relationShardList = task->relationShardList; UpdateRelationToShardNames((Node *) query, relationShardList); } else if (query->commandType == CMD_INSERT && task->modifyWithSubquery) @@ -229,7 +231,16 @@ UpdateRelationToShardNames(Node *node, List *relationShardList) RangeTblEntry *newRte = (RangeTblEntry *) node; - if (newRte->rtekind != RTE_RELATION) + if (newRte->rtekind == RTE_FUNCTION) + { + newRte = NULL; + if (!FindCitusExtradataContainerRTE(node, &newRte)) + { + /* only update function rtes containing citus_extradata_container */ + return false; + } + } + else if (newRte->rtekind != RTE_RELATION) { return false; } diff --git a/src/backend/distributed/planner/multi_router_planner.c b/src/backend/distributed/planner/multi_router_planner.c index 3193305a2..7335d1c2d 100644 --- a/src/backend/distributed/planner/multi_router_planner.c +++ b/src/backend/distributed/planner/multi_router_planner.c @@ -2677,9 +2677,6 @@ TargetShardIntervalForFastPathQuery(Query *query, bool *isMultiShardQuery, } /* we're only expecting single shard from a single table */ - Node *distKey PG_USED_FOR_ASSERTS_ONLY = NULL; - Assert(FastPathRouterQuery(query, &distKey) || !EnableFastPathRouterPlanner); - if (list_length(prunedShardIntervalList) > 1) { *isMultiShardQuery = true; diff --git a/src/backend/distributed/utils/shard_utils.c b/src/backend/distributed/utils/shard_utils.c index 7d5a63fa0..d6d41f192 100644 --- a/src/backend/distributed/utils/shard_utils.c +++ b/src/backend/distributed/utils/shard_utils.c @@ -16,7 +16,10 @@ #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" #include "distributed/coordinator_protocol.h" +#include "distributed/listutils.h" +#include "distributed/log_utils.h" #include "distributed/metadata_utility.h" +#include "distributed/multi_physical_planner.h" #include "distributed/relay_utility.h" #include "distributed/shard_utils.h" diff --git a/src/include/distributed/combine_query_planner.h b/src/include/distributed/combine_query_planner.h index c4702b781..710010913 100644 --- a/src/include/distributed/combine_query_planner.h +++ b/src/include/distributed/combine_query_planner.h @@ -27,6 +27,7 @@ extern Path * CreateCitusCustomScanPath(PlannerInfo *root, RelOptInfo *relOptInf CustomScan *remoteScan); extern PlannedStmt * PlanCombineQuery(struct DistributedPlan *distributedPlan, struct CustomScan *dataScan); +extern bool FindCitusExtradataContainerRTE(Node *node, RangeTblEntry **result); extern bool ReplaceCitusExtraDataContainer; extern CustomScan *ReplaceCitusExtraDataContainerWithCustomScan; diff --git a/src/include/distributed/metadata_cache.h b/src/include/distributed/metadata_cache.h index 1a5939879..2b55cf8d9 100644 --- a/src/include/distributed/metadata_cache.h +++ b/src/include/distributed/metadata_cache.h @@ -157,6 +157,7 @@ extern uint32 ColocationIdViaCatalog(Oid relationId); extern bool IsCitusLocalTableByDistParams(char partitionMethod, char replicationModel); extern List * CitusTableList(void); extern ShardInterval * LoadShardInterval(uint64 shardId); +extern bool ShardExists(uint64 shardId); extern Oid RelationIdForShard(uint64 shardId); extern bool ReferenceTableShardId(uint64 shardId); extern bool DistributedTableShardId(uint64 shardId); @@ -174,7 +175,6 @@ extern int32 GetLocalNodeId(void); extern void CitusTableCacheFlushInvalidatedEntries(void); extern Oid LookupShardRelationFromCatalog(int64 shardId, bool missing_ok); extern List * ShardPlacementListIncludingOrphanedPlacements(uint64 shardId); -extern bool ShardExists(int64 shardId); extern void 
CitusInvalidateRelcacheByRelid(Oid relationId); extern void CitusInvalidateRelcacheByShardId(int64 shardId); extern void InvalidateForeignKeyGraph(void); diff --git a/src/test/regress/expected/isolation_blocking_shard_split.out b/src/test/regress/expected/isolation_blocking_shard_split.out index d720f3a32..b5b7dc71b 100644 --- a/src/test/regress/expected/isolation_blocking_shard_split.out +++ b/src/test/regress/expected/isolation_blocking_shard_split.out @@ -56,7 +56,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -80,7 +79,7 @@ nodeport|shardid|success|result id|value --------------------------------------------------------------------- -123456789| 1 +123456789| 111 (1 row) @@ -140,7 +139,6 @@ step s2-commit: COMMIT; step s1-delete: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -158,14 +156,13 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 1 + 57637|1500003|t | 0 57638|1500004|t | 0 (3 rows) - id|value +id|value --------------------------------------------------------------------- -123456789| 1 -(1 row) +(0 rows) starting permutation: s1-load-cache s1-begin s1-select s2-begin s2-blocking-shard-split s1-insert s2-commit s1-commit s2-print-cluster @@ -221,7 +218,6 @@ get_shard_id_for_distribution_column 1500002 (1 row) -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -239,13 +235,14 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 0 + 57637|1500003|t | 1 57638|1500004|t | 0 (3 rows) -id|value + id|value --------------------------------------------------------------------- -(0 rows) +123456789| 1 +(1 row) starting permutation: s1-load-cache s1-begin s1-select s2-begin s2-blocking-shard-split s1-copy s2-commit s1-commit s2-print-cluster @@ -373,7 +370,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -397,7 +393,7 @@ nodeport|shardid|success|result id|value --------------------------------------------------------------------- -123456789| 1 +123456789| 111 (1 row) @@ -453,7 +449,6 @@ step s2-commit: COMMIT; step s1-delete: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -471,14 +466,13 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 1 + 57637|1500003|t | 0 57638|1500004|t | 0 (3 rows) - id|value +id|value --------------------------------------------------------------------- -123456789| 1 -(1 row) +(0 rows) starting permutation: s1-begin s1-select s2-begin s2-blocking-shard-split s1-insert s2-commit s1-commit s2-print-cluster @@ -530,7 +524,6 @@ get_shard_id_for_distribution_column 1500002 (1 row) -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -548,13 +541,14 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 0 + 57637|1500003|t | 1 57638|1500004|t | 0 (3 rows) -id|value + id|value --------------------------------------------------------------------- -(0 rows) +123456789| 1 +(1 row) starting permutation: s1-begin s1-select s2-begin s2-blocking-shard-split s1-copy s2-commit s1-commit s2-print-cluster diff --git a/src/test/regress/expected/isolation_non_blocking_shard_split.out b/src/test/regress/expected/isolation_non_blocking_shard_split.out index 24af03012..e52dff295 100644 --- a/src/test/regress/expected/isolation_non_blocking_shard_split.out +++ b/src/test/regress/expected/isolation_non_blocking_shard_split.out @@ -434,7 +434,6 @@ step s1-end: COMMIT; step s4-insert: <... completed> -ERROR: could not find valid entry for shard xxxxx step s4-end: COMMIT; @@ -452,13 +451,14 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57638|1500002|t | 0 - 57638|1500003|t | 1 + 57638|1500003|t | 2 (2 rows) id|value --------------------------------------------------------------------- + 900| 1 123456789| 1 -(1 row) +(2 rows) starting permutation: s2-print-cluster s3-acquire-advisory-lock s1-begin s2-begin s1-non-blocking-shard-split s2-insert s2-end s2-print-cluster s3-release-advisory-lock s1-end s2-print-cluster diff --git a/src/test/regress/expected/isolation_tenant_isolation.out b/src/test/regress/expected/isolation_tenant_isolation.out index f663aa05d..e2b2e4e43 100644 --- a/src/test/regress/expected/isolation_tenant_isolation.out +++ b/src/test/regress/expected/isolation_tenant_isolation.out @@ -44,7 +44,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -69,7 +68,7 @@ nodeport|shardid|success|result id|value --------------------------------------------------------------------- - 5| 10 + 5| 5 (1 row) @@ -117,7 +116,6 @@ step s2-commit: COMMIT; step s1-delete: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -135,11 +133,85 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500009|t | 0 - 57637|1500010|t | 1 + 57637|1500010|t | 0 57637|1500011|t | 0 57638|1500008|t | 0 (4 rows) +id|value +--------------------------------------------------------------------- +(0 rows) + + +starting permutation: s1-load-cache s1-insert s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s2-commit s1-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-load-cache: + TRUNCATE isolation_table; + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'block_writes'); + +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500016 +(1 row) + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s2-commit: + COMMIT; + +step s1-update-complex: <... completed> +ERROR: shard for the given value does not exist +step s1-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500015|t | 0 + 57637|1500016|t | 1 + 57637|1500017|t | 0 + 57638|1500014|t | 0 +(4 rows) + id|value --------------------------------------------------------------------- 5| 10 @@ -177,7 +249,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500016 + 1500022 (1 row) step s1-insert: @@ -187,7 +259,6 @@ step s2-commit: COMMIT; step s1-insert: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -204,15 +275,16 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500015|t | 0 - 57637|1500016|t | 0 - 57637|1500017|t | 0 - 57638|1500014|t | 0 + 57637|1500021|t | 0 + 57637|1500022|t | 1 + 57637|1500023|t | 0 + 57638|1500020|t | 0 (4 rows) id|value --------------------------------------------------------------------- -(0 rows) + 5| 10 +(1 row) starting permutation: s1-load-cache s1-begin s1-select s2-begin s2-isolate-tenant s1-copy s2-commit s1-commit s2-print-cluster @@ -246,7 +318,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500022 + 1500028 (1 row) step s1-copy: @@ -273,10 +345,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500021|t | 0 - 57637|1500022|t | 0 - 57637|1500023|t | 0 - 57638|1500020|t | 0 + 57637|1500027|t | 0 + 57637|1500028|t | 0 + 57637|1500029|t | 0 + 57638|1500026|t | 0 (4 rows) id|value @@ -315,7 +387,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500028 + 1500034 (1 row) step s1-ddl: @@ -341,10 +413,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500027|t | 0 - 57637|1500028|t | 0 - 57637|1500029|t | 0 - 57638|1500026|t | 0 + 57637|1500033|t | 0 + 57637|1500034|t | 0 + 57637|1500035|t | 0 + 57638|1500032|t | 0 (4 rows) id|value @@ -399,7 +471,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500034 + 1500040 (1 row) step s1-update: @@ -409,7 +481,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -426,15 +497,15 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500033|t | 0 - 57637|1500034|t | 1 - 57637|1500035|t | 0 - 57638|1500032|t | 0 + 57637|1500039|t | 0 + 57637|1500040|t | 1 + 57637|1500041|t | 0 + 57638|1500038|t | 0 (4 rows) id|value --------------------------------------------------------------------- - 5| 10 + 5| 5 (1 row) @@ -469,7 +540,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500040 + 1500046 (1 row) step s1-delete: @@ -479,7 +550,6 @@ step s2-commit: COMMIT; step s1-delete: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -496,10 +566,81 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500039|t | 0 - 57637|1500040|t | 1 - 57637|1500041|t | 0 - 57638|1500038|t | 0 + 57637|1500045|t | 0 + 57637|1500046|t | 0 + 57637|1500047|t | 0 + 57638|1500044|t | 0 +(4 rows) + +id|value +--------------------------------------------------------------------- +(0 rows) + + +starting permutation: s1-insert s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s2-commit s1-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'block_writes'); + +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500052 +(1 row) + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s2-commit: + COMMIT; + +step s1-update-complex: <... completed> +ERROR: shard for the given value does not exist +step s1-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500051|t | 0 + 57637|1500052|t | 1 + 57637|1500053|t | 0 + 57638|1500050|t | 0 (4 rows) id|value @@ -536,7 +677,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500046 + 1500058 (1 row) step s1-insert: @@ -546,7 +687,6 @@ step s2-commit: COMMIT; step s1-insert: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -563,15 +703,16 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500045|t | 0 - 57637|1500046|t | 0 - 57637|1500047|t | 0 - 57638|1500044|t | 0 + 57637|1500057|t | 0 + 57637|1500058|t | 1 + 57637|1500059|t | 0 + 57638|1500056|t | 0 (4 rows) id|value --------------------------------------------------------------------- -(0 rows) + 5| 10 +(1 row) starting permutation: s1-begin s1-select s2-begin s2-isolate-tenant s1-copy s2-commit s1-commit s2-print-cluster @@ -602,7 +743,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500052 + 1500064 (1 row) step s1-copy: @@ -629,10 +770,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500051|t | 0 - 57637|1500052|t | 0 - 57637|1500053|t | 0 - 57638|1500050|t | 0 + 57637|1500063|t | 0 + 57637|1500064|t | 0 + 57637|1500065|t | 0 + 57638|1500062|t | 0 (4 rows) id|value @@ -668,7 +809,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500058 + 1500070 (1 row) step s1-ddl: @@ -694,10 +835,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500057|t | 0 - 57637|1500058|t | 0 - 57637|1500059|t | 0 - 57638|1500056|t | 0 + 57637|1500069|t | 0 + 57637|1500070|t | 0 + 57637|1500071|t | 0 + 57638|1500068|t | 0 (4 rows) id|value @@ -744,7 +885,7 @@ step s1-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500064 + 1500076 (1 row) step s2-isolate-tenant: @@ -767,10 +908,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500061|t | 1 - 57638|1500063|t | 0 - 57638|1500064|t | 0 - 57638|1500065|t | 0 + 57637|1500073|t | 1 + 57638|1500075|t | 0 + 57638|1500076|t | 0 + 57638|1500077|t | 0 (4 rows) id|value @@ -799,7 +940,7 @@ step s1-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500070 + 1500082 (1 row) step s2-isolate-tenant: @@ -822,10 +963,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500067|t | 1 - 57638|1500069|t | 0 - 57638|1500070|t | 0 - 57638|1500071|t | 0 + 57637|1500079|t | 1 + 57638|1500081|t | 0 + 57638|1500082|t | 0 + 57638|1500083|t | 0 (4 rows) id|value diff --git a/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out b/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out index 7d8991615..17766f85e 100644 --- a/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out +++ b/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out @@ -59,7 +59,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500078 + 1500090 (1 row) step s2-commit: @@ -78,10 +78,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500077|t | 0 - 57637|1500078|t | 1 - 57637|1500079|t | 0 - 57638|1500074|t | 0 + 57637|1500089|t | 0 + 57637|1500090|t | 1 + 57637|1500091|t | 0 + 57638|1500086|t | 0 (4 rows) id|value @@ -149,7 +149,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500086 + 1500098 (1 row) step s2-commit: @@ -168,10 +168,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500085|t | 0 - 57637|1500086|t | 0 - 57637|1500087|t | 0 - 57638|1500082|t | 0 + 57637|1500097|t | 0 + 57637|1500098|t | 0 + 57637|1500099|t | 0 + 57638|1500094|t | 0 (4 rows) id|value @@ -179,6 +179,98 @@ id|value (0 rows) +starting permutation: s1-load-cache s1-insert s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s1-commit s3-release-advisory-lock s2-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-load-cache: + TRUNCATE isolation_table; + TRUNCATE isolation_table2; + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + INSERT INTO isolation_table2 VALUES (5, 10); + +step s3-acquire-advisory-lock: + SELECT pg_advisory_lock(44000, 55152); + +pg_advisory_lock +--------------------------------------------------------------------- + +(1 row) + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'force_logical'); + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s1-commit: + COMMIT; + +step s3-release-advisory-lock: + SELECT pg_advisory_unlock(44000, 55152); + +pg_advisory_unlock +--------------------------------------------------------------------- +t +(1 row) + +step s2-isolate-tenant: <... 
completed> +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500106 +(1 row) + +step s2-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500105|t | 0 + 57637|1500106|t | 1 + 57637|1500107|t | 0 + 57638|1500102|t | 0 +(4 rows) + +id|value +--------------------------------------------------------------------- + 5| 5 +(1 row) + + starting permutation: s1-load-cache s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-insert s1-commit s3-release-advisory-lock s2-commit s2-print-cluster create_distributed_table --------------------------------------------------------------------- @@ -235,7 +327,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500094 + 1500114 (1 row) step s2-commit: @@ -254,10 +346,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500093|t | 0 - 57637|1500094|t | 1 - 57637|1500095|t | 0 - 57638|1500090|t | 0 + 57637|1500113|t | 0 + 57637|1500114|t | 1 + 57637|1500115|t | 0 + 57638|1500110|t | 0 (4 rows) id|value @@ -321,7 +413,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500102 + 1500122 (1 row) step s2-commit: @@ -340,10 +432,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500101|t | 1 - 57637|1500102|t | 1 - 57637|1500103|t | 2 - 57638|1500098|t | 1 + 57637|1500121|t | 1 + 57637|1500122|t | 1 + 57637|1500123|t | 2 + 57638|1500118|t | 1 (4 rows) id|value @@ -411,7 +503,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500110 + 1500130 (1 row) step s2-commit: @@ -430,10 +522,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500109|t | 0 - 57637|1500110|t | 1 - 57637|1500111|t | 0 - 57638|1500106|t | 0 + 57637|1500129|t | 0 + 57637|1500130|t | 1 + 57637|1500131|t | 0 + 57638|1500126|t | 0 (4 rows) id|value @@ -497,7 +589,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500118 + 1500138 (1 row) step s2-commit: @@ -516,10 +608,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500117|t | 0 - 57637|1500118|t | 0 - 57637|1500119|t | 0 - 57638|1500114|t | 0 + 57637|1500137|t | 0 + 57637|1500138|t | 0 + 57637|1500139|t | 0 + 57638|1500134|t | 0 (4 rows) id|value @@ -527,6 +619,94 @@ id|value (0 rows) +starting permutation: s1-insert s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s1-commit s3-release-advisory-lock s2-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + INSERT INTO isolation_table2 VALUES (5, 10); + +step s3-acquire-advisory-lock: + SELECT pg_advisory_lock(44000, 55152); + +pg_advisory_lock +--------------------------------------------------------------------- + +(1 row) + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'force_logical'); + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s1-commit: + COMMIT; + +step s3-release-advisory-lock: + SELECT pg_advisory_unlock(44000, 55152); + +pg_advisory_unlock +--------------------------------------------------------------------- +t +(1 row) + +step s2-isolate-tenant: <... completed> +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500146 +(1 row) + +step s2-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500145|t | 0 + 57637|1500146|t | 1 + 57637|1500147|t | 0 + 57638|1500142|t | 0 +(4 rows) + +id|value +--------------------------------------------------------------------- + 5| 5 +(1 row) + + starting permutation: s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-insert s1-commit s3-release-advisory-lock s2-commit s2-print-cluster create_distributed_table --------------------------------------------------------------------- @@ -579,7 +759,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500126 + 1500154 (1 row) step s2-commit: @@ -598,10 +778,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500125|t | 0 - 57637|1500126|t | 1 - 57637|1500127|t | 0 - 57638|1500122|t | 0 + 57637|1500153|t | 0 + 57637|1500154|t | 1 + 57637|1500155|t | 0 + 57638|1500150|t | 0 (4 rows) id|value @@ -661,7 +841,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500134 + 1500162 (1 row) step s2-commit: @@ -680,10 +860,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500133|t | 1 - 57637|1500134|t | 1 - 57637|1500135|t | 2 - 57638|1500130|t | 1 + 57637|1500161|t | 1 + 57637|1500162|t | 1 + 57637|1500163|t | 2 + 57638|1500158|t | 1 (4 rows) id|value @@ -736,7 +916,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500142 + 1500170 (1 row) step s2-print-cluster: @@ -752,10 +932,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500141|t | 0 - 57637|1500142|t | 1 - 57637|1500143|t | 0 - 57638|1500138|t | 0 + 57637|1500169|t | 0 + 57637|1500170|t | 1 + 57637|1500171|t | 0 + 57638|1500166|t | 0 (4 rows) id|value @@ -804,7 +984,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500150 + 1500178 (1 row) step s2-print-cluster: @@ -820,10 +1000,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500149|t | 0 - 57637|1500150|t | 1 - 57637|1500151|t | 0 - 57638|1500146|t | 0 + 57637|1500177|t | 0 + 57637|1500178|t | 1 + 57637|1500179|t | 0 + 57638|1500174|t | 0 (4 rows) id|value @@ -872,7 +1052,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500158 + 1500186 (1 row) step s2-print-cluster: @@ -888,10 +1068,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500157|t | 0 - 57637|1500158|t | 1 - 57637|1500159|t | 0 - 57638|1500154|t | 0 + 57637|1500185|t | 0 + 57637|1500186|t | 1 + 57637|1500187|t | 0 + 57638|1500182|t | 0 (4 rows) id|value @@ -943,7 +1123,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500169 + 1500197 (1 row) step s2-commit: @@ -962,10 +1142,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500168|t | 0 - 57637|1500169|t | 1 - 57637|1500170|t | 0 - 57638|1500165|t | 0 + 57637|1500196|t | 0 + 57637|1500197|t | 1 + 57637|1500198|t | 0 + 57638|1500193|t | 0 (4 rows) id|value @@ -1004,7 +1184,7 @@ step s1-isolate-tenant-no-same-coloc-blocking: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500183 + 1500211 (1 row) step s3-release-advisory-lock: @@ -1018,7 +1198,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500180 + 1500208 (1 row) step s2-print-cluster: @@ -1034,10 +1214,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500179|t | 0 - 57637|1500180|t | 1 - 57637|1500181|t | 0 - 57638|1500176|t | 0 + 57637|1500207|t | 0 + 57637|1500208|t | 1 + 57637|1500209|t | 0 + 57638|1500204|t | 0 (4 rows) id|value @@ -1076,7 +1256,7 @@ step s1-isolate-tenant-no-same-coloc-blocking: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500194 + 1500222 (1 row) step s3-release-advisory-lock: @@ -1090,7 +1270,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500191 + 1500219 (1 row) step s2-print-cluster: @@ -1106,10 +1286,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500190|t | 0 - 57637|1500191|t | 1 - 57637|1500192|t | 0 - 57638|1500187|t | 0 + 57637|1500218|t | 0 + 57637|1500219|t | 1 + 57637|1500220|t | 0 + 57638|1500215|t | 0 (4 rows) id|value diff --git a/src/test/regress/spec/isolation_tenant_isolation.spec b/src/test/regress/spec/isolation_tenant_isolation.spec index 6b0fcd53e..5db86685e 100644 --- a/src/test/regress/spec/isolation_tenant_isolation.spec +++ b/src/test/regress/spec/isolation_tenant_isolation.spec @@ -44,6 +44,13 @@ step "s1-update" UPDATE isolation_table SET value = 5 WHERE id = 5; } +step "s1-update-complex" +{ + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); +} + step "s1-delete" { DELETE FROM isolation_table WHERE id = 5; @@ -119,6 +126,7 @@ step "s2-print-index-count" // we expect DML/DDL queries to fail because the shard they are waiting for is destroyed permutation "s1-load-cache" "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s2-commit" "s1-commit" "s2-print-cluster" +permutation "s1-load-cache" "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-ddl" "s2-commit" "s1-commit" "s2-print-cluster" "s2-print-index-count" @@ -127,6 +135,7 @@ permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant // the same tests without loading the cache at first permutation "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s2-commit" "s1-commit" "s2-print-cluster" +permutation "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s2-commit" "s1-commit" "s2-print-cluster" 
permutation "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-ddl" "s2-commit" "s1-commit" "s2-print-cluster" "s2-print-index-count" diff --git a/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec b/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec index e7395e631..8dd726392 100644 --- a/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec +++ b/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec @@ -51,6 +51,13 @@ step "s1-update" UPDATE isolation_table SET value = 5 WHERE id = 5; } +step "s1-update-complex" +{ + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); +} + step "s1-delete" { DELETE FROM isolation_table WHERE id = 5; @@ -148,12 +155,14 @@ step "s3-release-advisory-lock" // we expect DML queries of s2 to succeed without being blocked. permutation "s1-load-cache" "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" +permutation "s1-load-cache" "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-load-cache" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-load-cache" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" // the same tests without loading the cache at first permutation "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" +permutation "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" From 6f06ff78cc32cf67225d9d55e0679ce6e084c979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emel=20=C5=9Eim=C5=9Fek?= Date: Tue, 6 Sep 2022 16:46:41 +0300 Subject: [PATCH 15/16] Throw an error if there is a RangeTblEntry that is not assigned an RTE identity. (#6295) * Fix issue : 6109 Segfault or (assertion failure) is possible when using a SQL function * DESCRIPTION: Ensures disallowing the usage of SQL functions referencing to a distributed table and prevents a segfault. Using a SQL function may result in segmentation fault in some cases. 
This change fixes the issue by raising an error when such a SQL
function cannot be handled. Fixes #6109.

Co-authored-by: Emel Simsek
---
 .../distributed/planner/distributed_planner.c | 21 +++++++++++++++++--
 .../regress/expected/multi_sql_function.out   |  3 +++
 src/test/regress/sql/multi_sql_function.sql   |  3 +++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c
index 17816a3b4..3f431f514 100644
--- a/src/backend/distributed/planner/distributed_planner.c
+++ b/src/backend/distributed/planner/distributed_planner.c
@@ -559,9 +559,26 @@ int
 GetRTEIdentity(RangeTblEntry *rte)
 {
 	Assert(rte->rtekind == RTE_RELATION);
-	Assert(rte->values_lists != NIL);
+
+	/*
+	 * Since SQL functions might be in-lined by standard_planner,
+	 * we might miss assigning an RTE identity for RangeTblEntries
+	 * related to SQL functions. We already have checks in other
+	 * places to throw an error for SQL functions but they are not
+	 * sufficient due to function in-lining; so we capture such
+	 * cases and throw an error here.
+	 */
+	if (list_length(rte->values_lists) != 2)
+	{
+		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						errmsg("cannot perform distributed planning on this "
+							   "query because parameterized queries for SQL "
+							   "functions referencing distributed tables are "
+							   "not supported"),
+						errhint("Consider using PL/pgSQL functions instead.")));
+	}
+
 	Assert(IsA(rte->values_lists, IntList));
-	Assert(list_length(rte->values_lists) == 2);
 
 	return linitial_int(rte->values_lists);
 }
diff --git a/src/test/regress/expected/multi_sql_function.out b/src/test/regress/expected/multi_sql_function.out
index e1862c983..bcc4efcca 100644
--- a/src/test/regress/expected/multi_sql_function.out
+++ b/src/test/regress/expected/multi_sql_function.out
@@ -320,6 +320,9 @@ INSERT INTO test_parameterized_sql VALUES(1, 1);
 SELECT * FROM test_parameterized_sql_function(1);
 ERROR: cannot perform distributed planning on this query because parameterized queries for SQL functions referencing distributed tables are not supported
 HINT: Consider using PL/pgSQL functions instead.
+SELECT (SELECT 1 FROM test_parameterized_sql limit 1) FROM test_parameterized_sql_function(1);
+ERROR: cannot perform distributed planning on this query because parameterized queries for SQL functions referencing distributed tables are not supported
+HINT: Consider using PL/pgSQL functions instead.
 SELECT test_parameterized_sql_function_in_subquery_where(1);
 ERROR: could not create distributed plan
 DETAIL: Possibly this is caused by the use of parameters in SQL functions, which is not supported in Citus.
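
A minimal illustration of the new behavior, in the spirit of the regression test
above. The objects below (sql_func_demo and the two demo_payload functions) are
hypothetical and not part of this patch; this is a sketch of the expected outcome,
not an addition to the test suite.

-- a parameterized SQL function over a distributed table can be in-lined by
-- standard_planner, so its RangeTblEntry gets no RTE identity; Citus is now
-- expected to raise the ERROR/HINT shown above instead of asserting or crashing
CREATE TABLE sql_func_demo (id int, payload int);
SELECT create_distributed_table('sql_func_demo', 'id');

CREATE FUNCTION demo_payload_sql(p_id int) RETURNS int AS $fn$
    SELECT payload FROM sql_func_demo WHERE id = p_id;
$fn$ LANGUAGE sql;

SELECT * FROM demo_payload_sql(1);

-- the HINT's suggested workaround: a PL/pgSQL body is not in-lined, so the
-- parameterized query can be planned as a regular prepared statement
CREATE FUNCTION demo_payload_plpgsql(p_id int) RETURNS int AS $fn$
DECLARE
    result int;
BEGIN
    SELECT payload INTO result FROM sql_func_demo WHERE id = p_id;
    RETURN result;
END;
$fn$ LANGUAGE plpgsql;

SELECT demo_payload_plpgsql(1);
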
diff --git a/src/test/regress/sql/multi_sql_function.sql b/src/test/regress/sql/multi_sql_function.sql index 329d57996..1ef0ef40a 100644 --- a/src/test/regress/sql/multi_sql_function.sql +++ b/src/test/regress/sql/multi_sql_function.sql @@ -144,6 +144,9 @@ INSERT INTO test_parameterized_sql VALUES(1, 1); -- all of them should fail SELECT * FROM test_parameterized_sql_function(1); + +SELECT (SELECT 1 FROM test_parameterized_sql limit 1) FROM test_parameterized_sql_function(1); + SELECT test_parameterized_sql_function_in_subquery_where(1); -- postgres behaves slightly differently for the following From ac96370ddf06ad8c850f5a8d146a6b22f3ecc095 Mon Sep 17 00:00:00 2001 From: Gokhan Gulbiz Date: Tue, 6 Sep 2022 17:38:41 +0300 Subject: [PATCH 16/16] Use IsMultiStatementTransaction for SELECT .. FOR UPDATE queries (#6288) * Use IsMultiStatementTransaction instead of IsTransaction for row-locking operations. * Add regression test for SELECT..FOR UPDATE statement --- .../distributed/executor/adaptive_executor.c | 2 +- .../regress/expected/adaptive_executor.out | 33 ++++++++++++++++++- src/test/regress/sql/adaptive_executor.sql | 18 ++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index a0e16876a..948d65791 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -1421,7 +1421,7 @@ DistributedExecutionRequiresRollback(List *taskList) * Do not check SelectOpensTransactionBlock, always open transaction block * if SELECT FOR UPDATE is executed inside a distributed transaction. */ - return IsTransactionBlock(); + return IsMultiStatementTransaction(); } if (ReadOnlyTask(task->taskType)) diff --git a/src/test/regress/expected/adaptive_executor.out b/src/test/regress/expected/adaptive_executor.out index aeaa553f2..4372747d6 100644 --- a/src/test/regress/expected/adaptive_executor.out +++ b/src/test/regress/expected/adaptive_executor.out @@ -78,5 +78,36 @@ $$); (1 row) END; +CREATE OR REPLACE FUNCTION select_for_update() +RETURNS void +AS $$ +DECLARE + my int; +BEGIN + SELECT y INTO my FROM test WHERE x = 1 FOR UPDATE; +END; +$$ LANGUAGE plpgsql; +-- so that we can prove that we open a transaction block by logging below: +-- "NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;..." 
+SET citus.log_remote_commands TO on; +SELECT select_for_update(); +NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;SELECT assign_distributed_transaction_id(xx, xx, 'xxxxxxx'); +DETAIL: on server postgres@localhost:xxxxx connectionId: xxxxxxx +CONTEXT: SQL statement "SELECT y FROM test WHERE x = 1 FOR UPDATE" +PL/pgSQL function select_for_update() line XX at SQL statement +NOTICE: issuing SELECT y FROM adaptive_executor.test_801009000 test WHERE (x OPERATOR(pg_catalog.=) 1) FOR UPDATE OF test +DETAIL: on server postgres@localhost:xxxxx connectionId: xxxxxxx +CONTEXT: SQL statement "SELECT y FROM test WHERE x = 1 FOR UPDATE" +PL/pgSQL function select_for_update() line XX at SQL statement +NOTICE: issuing COMMIT +DETAIL: on server postgres@localhost:xxxxx connectionId: xxxxxxx + select_for_update +--------------------------------------------------------------------- + +(1 row) + +SET citus.log_remote_commands TO off; DROP SCHEMA adaptive_executor CASCADE; -NOTICE: drop cascades to table test +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table test +drop cascades to function select_for_update() diff --git a/src/test/regress/sql/adaptive_executor.sql b/src/test/regress/sql/adaptive_executor.sql index f7d6c6f1e..5dd14e6f8 100644 --- a/src/test/regress/sql/adaptive_executor.sql +++ b/src/test/regress/sql/adaptive_executor.sql @@ -40,4 +40,22 @@ SELECT sum(result::bigint) FROM run_command_on_workers($$ $$); END; +CREATE OR REPLACE FUNCTION select_for_update() +RETURNS void +AS $$ +DECLARE + my int; +BEGIN + SELECT y INTO my FROM test WHERE x = 1 FOR UPDATE; +END; +$$ LANGUAGE plpgsql; + +-- so that we can prove that we open a transaction block by logging below: +-- "NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;..." +SET citus.log_remote_commands TO on; + +SELECT select_for_update(); + +SET citus.log_remote_commands TO off; + DROP SCHEMA adaptive_executor CASCADE;
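
A hedged usage sketch of the SELECT ... FOR UPDATE change, for readers trying it
locally. The table lock_demo and the session flow below are assumptions, not part
of this patch; the expectation about the worker-side BEGIN mirrors the
adaptive_executor expected output above.

-- with citus.log_remote_commands on, both locking SELECTs below are expected to
-- be preceded by a worker-side BEGIN plus assign_distributed_transaction_id(),
-- so the row lock is held until the enclosing transaction commits
CREATE TABLE lock_demo (x int, y int);
SELECT create_distributed_table('lock_demo', 'x');
INSERT INTO lock_demo VALUES (1, 1);

SET citus.log_remote_commands TO on;

-- explicit multi-statement transaction block
BEGIN;
SELECT y FROM lock_demo WHERE x = 1 FOR UPDATE;
COMMIT;

-- the same locking SELECT issued from a PL/pgSQL function, as in the regression
-- test above; the expected output there shows that this case now also opens a
-- worker-side transaction block
CREATE OR REPLACE FUNCTION lock_demo_row() RETURNS void AS $fn$
DECLARE
    val int;
BEGIN
    SELECT y INTO val FROM lock_demo WHERE x = 1 FOR UPDATE;
END;
$fn$ LANGUAGE plpgsql;

SELECT lock_demo_row();

SET citus.log_remote_commands TO off;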