From 98dcbeb3042e902896c796be3b9414a3f98416d7 Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Wed, 31 Aug 2022 10:48:01 +0300 Subject: [PATCH 01/16] Specifies that our CustomScan providers support projections (#6244) Before, this was the default mode for CustomScan providers. Now, the default is to assume that they can't project. This causes performance penalties due to adding unnecessary Result nodes. Hence we use the newly added flag, CUSTOMPATH_SUPPORT_PROJECTION to get it back to how it was. In PG15 support branch we created explain functions to ignore the new Result nodes, so we undo that in this commit. Relevant PG commit: 955b3e0f9269639fb916cee3dea37aee50b82df0 --- src/backend/columnar/columnar_customscan.c | 12 + .../planner/combine_query_planner.c | 6 + .../distributed/planner/distributed_planner.c | 7 + .../expected/columnar_chunk_filtering.out | 8 +- .../expected/columnar_citus_integration.out | 4 +- .../expected/insert_select_repartition.out | 8 +- .../expected/insert_select_repartition_0.out | 8 +- .../expected/multi_select_distinct.out | 68 +- .../expected/multi_select_distinct_0.out | 1548 +++++++++++++++++ .../regress/expected/multi_test_helpers.out | 34 - .../regress/expected/window_functions.out | 4 +- .../regress/expected/window_functions_0.out | 4 +- .../regress/sql/columnar_chunk_filtering.sql | 4 - .../sql/columnar_citus_integration.sql | 2 - .../regress/sql/insert_select_repartition.sql | 4 - .../regress/sql/multi_select_distinct.sql | 16 +- src/test/regress/sql/multi_test_helpers.sql | 36 - src/test/regress/sql/window_functions.sql | 2 - 18 files changed, 1616 insertions(+), 159 deletions(-) create mode 100644 src/test/regress/expected/multi_select_distinct_0.out diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c index 98c13e2a7..74c50e4f6 100644 --- a/src/backend/columnar/columnar_customscan.c +++ b/src/backend/columnar/columnar_customscan.c @@ -1303,6 +1303,12 @@ AddColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte, cpath->methods = &ColumnarScanPathMethods; +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* necessary to avoid extra Result node in PG15 */ + cpath->flags = CUSTOMPATH_SUPPORT_PROJECTION; +#endif + /* * populate generic path information */ @@ -1545,6 +1551,12 @@ ColumnarScanPath_PlanCustomPath(PlannerInfo *root, cscan->scan.plan.targetlist = list_copy(tlist); cscan->scan.scanrelid = best_path->path.parent->relid; +#if (PG_VERSION_NUM >= 150000) + + /* necessary to avoid extra Result node in PG15 */ + cscan->flags = CUSTOMPATH_SUPPORT_PROJECTION; +#endif + return (Plan *) cscan; } diff --git a/src/backend/distributed/planner/combine_query_planner.c b/src/backend/distributed/planner/combine_query_planner.c index f67f71b53..0a871f3e6 100644 --- a/src/backend/distributed/planner/combine_query_planner.c +++ b/src/backend/distributed/planner/combine_query_planner.c @@ -136,6 +136,12 @@ CreateCitusCustomScanPath(PlannerInfo *root, RelOptInfo *relOptInfo, path->custom_path.path.pathtarget = relOptInfo->reltarget; path->custom_path.path.parent = relOptInfo; +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* necessary to avoid extra Result node in PG15 */ + path->custom_path.flags = CUSTOMPATH_SUPPORT_PROJECTION; +#endif + /* * The 100k rows we put on the cost of the path is kind of arbitrary and could be * improved in accuracy to produce better plans. 
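The C hunks above and the distributed_planner.c hunk that follows all apply the same pattern described in the commit message: on PG15 and later, a CustomScan provider must declare CUSTOMPATH_SUPPORT_PROJECTION, otherwise the planner wraps the scan in an extra Result node just to evaluate the target list. A minimal sketch of that pattern is shown below. It is illustrative only and not part of the patch: the helper name is hypothetical, the CustomPath/CustomScan structs and CUSTOMPATH_* flags come from PostgreSQL's custom scan provider API (nodes/extensible.h), and a raw 150000 version check is used instead of Citus' PG_VERSION_15 macro so the sketch stays self-contained.

#include "postgres.h"
#include "nodes/extensible.h"	/* CustomPath, CustomScan, CUSTOMPATH_* flags */

/*
 * Hypothetical helper, not part of this patch: mark a custom path and/or its
 * generated plan node as projection-capable on PG15+, preserving any flags
 * (e.g. CUSTOMPATH_SUPPORT_BACKWARD_SCAN) that were already set.
 */
static void
MarkCustomScanProjectionCapable(CustomPath *cpath, CustomScan *cscan)
{
#if PG_VERSION_NUM >= 150000
	if (cpath != NULL)
	{
		cpath->flags |= CUSTOMPATH_SUPPORT_PROJECTION;
	}

	if (cscan != NULL)
	{
		cscan->flags |= CUSTOMPATH_SUPPORT_PROJECTION;
	}
#else
	/* before PG15 the planner assumed custom scans could project; nothing to do */
	(void) cpath;
	(void) cscan;
#endif
}

The patch itself assigns the flag value directly at each call site (FinalizePlan() combines it with CUSTOMPATH_SUPPORT_BACKWARD_SCAN), which appears equivalent here because those paths set no other flags; OR-ing, as in the sketch, is simply the more defensive variant.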
diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index 35664d7c7..17816a3b4 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -1369,7 +1369,14 @@ FinalizePlan(PlannedStmt *localPlan, DistributedPlan *distributedPlan) Node *distributedPlanData = (Node *) distributedPlan; customScan->custom_private = list_make1(distributedPlanData); + +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* necessary to avoid extra Result node in PG15 */ + customScan->flags = CUSTOMPATH_SUPPORT_BACKWARD_SCAN | CUSTOMPATH_SUPPORT_PROJECTION; +#else customScan->flags = CUSTOMPATH_SUPPORT_BACKWARD_SCAN; +#endif /* * Fast path queries cannot have any subplans by definition, so skip diff --git a/src/test/regress/expected/columnar_chunk_filtering.out b/src/test/regress/expected/columnar_chunk_filtering.out index 292d5fb1a..0d0534ccc 100644 --- a/src/test/regress/expected/columnar_chunk_filtering.out +++ b/src/test/regress/expected/columnar_chunk_filtering.out @@ -264,21 +264,17 @@ EXPLAIN (analyze on, costs off, timing off, summary off) Columnar Projected Columns: a (9 rows) -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT y, * FROM another_columnar_table; -$Q$); - plan_without_arrows + QUERY PLAN --------------------------------------------------------------------- Custom Scan (ColumnarScan) on another_columnar_table Columnar Projected Columns: x, y (2 rows) -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT *, x FROM another_columnar_table; -$Q$); - plan_without_arrows + QUERY PLAN --------------------------------------------------------------------- Custom Scan (ColumnarScan) on another_columnar_table Columnar Projected Columns: x, y diff --git a/src/test/regress/expected/columnar_citus_integration.out b/src/test/regress/expected/columnar_citus_integration.out index 8beb09edf..fb7d9201e 100644 --- a/src/test/regress/expected/columnar_citus_integration.out +++ b/src/test/regress/expected/columnar_citus_integration.out @@ -958,15 +958,13 @@ SELECT * FROM weird_col_explain; (7 rows) \set VERBOSITY terse -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS OFF, SUMMARY OFF) SELECT *, "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" FROM weird_col_explain WHERE "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" * 2 > "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'!"; -$Q$); NOTICE: identifier "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'!" will be truncated to "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'" - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus Adaptive) Task Count: 4 diff --git a/src/test/regress/expected/insert_select_repartition.out b/src/test/regress/expected/insert_select_repartition.out index 913419072..5d9ddca6a 100644 --- a/src/test/regress/expected/insert_select_repartition.out +++ b/src/test/regress/expected/insert_select_repartition.out @@ -1261,10 +1261,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_sequences select y, x from table_with_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... 
SELECT) INSERT/SELECT method: pull to coordinator @@ -1289,10 +1287,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_user_sequences select y, x from table_with_user_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... SELECT) INSERT/SELECT method: pull to coordinator diff --git a/src/test/regress/expected/insert_select_repartition_0.out b/src/test/regress/expected/insert_select_repartition_0.out index 31377ef16..2eea30bdf 100644 --- a/src/test/regress/expected/insert_select_repartition_0.out +++ b/src/test/regress/expected/insert_select_repartition_0.out @@ -1261,10 +1261,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_sequences select y, x from table_with_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... SELECT) INSERT/SELECT method: pull to coordinator @@ -1289,10 +1287,8 @@ NOTICE: copying the data has completed (1 row) -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_user_sequences select y, x from table_with_user_sequences; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus INSERT ... SELECT) INSERT/SELECT method: pull to coordinator diff --git a/src/test/regress/expected/multi_select_distinct.out b/src/test/regress/expected/multi_select_distinct.out index 1112124ae..d281ad4b4 100644 --- a/src/test/regress/expected/multi_select_distinct.out +++ b/src/test/regress/expected/multi_select_distinct.out @@ -3,6 +3,13 @@ -- -- Tests select distinct, and select distinct on features. -- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 15 AS server_version_ge_15; + server_version_ge_15 +--------------------------------------------------------------------- + t +(1 row) + ANALYZE lineitem_hash_part; -- function calls are supported SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; @@ -306,14 +313,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique -- instead of aggregate plan node to handle distinct. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Unique -> Sort @@ -382,15 +387,13 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. Similar to the explain of -- the query above. 
SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -440,12 +443,13 @@ EXPLAIN (COSTS FALSE) GROUP BY l_suppkey, l_linenumber ORDER BY 1,2 LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Unique - -> Sort - Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.l_suppkey, (pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1)) -> HashAggregate Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 -> Custom Scan (Citus Adaptive) @@ -456,20 +460,18 @@ EXPLAIN (COSTS FALSE) -> HashAggregate Group Key: l_suppkey, l_linenumber -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) +(15 rows) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1,2 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -539,15 +541,13 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique to -- handle distinct on. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ON (l_suppkey) avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY l_suppkey,1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -595,12 +595,13 @@ EXPLAIN (COSTS FALSE) GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Unique - -> Sort - Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> HashAggregate + Group Key: (sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision) -> HashAggregate Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 -> Custom Scan (Citus Adaptive) @@ -611,20 +612,18 @@ EXPLAIN (COSTS FALSE) -> HashAggregate Group Key: l_suppkey, l_linenumber -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) +(15 rows) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. 
SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT avg(ceil(l_partkey / 2)) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -672,12 +671,13 @@ EXPLAIN (COSTS FALSE) GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Unique - -> Sort - Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> HashAggregate + Group Key: ((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint)) -> HashAggregate Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 -> Custom Scan (Citus Adaptive) @@ -688,20 +688,18 @@ EXPLAIN (COSTS FALSE) -> HashAggregate Group Key: l_suppkey, l_linenumber -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part -(14 rows) +(15 rows) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> Unique @@ -910,14 +908,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count FROM lineitem_hash_part GROUP BY l_suppkey ORDER BY 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Unique -> Sort diff --git a/src/test/regress/expected/multi_select_distinct_0.out b/src/test/regress/expected/multi_select_distinct_0.out new file mode 100644 index 000000000..69e90b7a0 --- /dev/null +++ b/src/test/regress/expected/multi_select_distinct_0.out @@ -0,0 +1,1548 @@ +-- +-- MULTI_SELECT_DISTINCT +-- +-- Tests select distinct, and select distinct on features. 
+-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 15 AS server_version_ge_15; + server_version_ge_15 +--------------------------------------------------------------------- + f +(1 row) + +ANALYZE lineitem_hash_part; +-- function calls are supported +SELECT DISTINCT l_orderkey, now() FROM lineitem_hash_part LIMIT 0; + l_orderkey | now +--------------------------------------------------------------------- +(0 rows) + +SELECT DISTINCT l_orderkey, avg(l_linenumber) +FROM lineitem_hash_part +GROUP BY l_orderkey +HAVING avg(l_linenumber) = (select avg(distinct l_linenumber)) +LIMIT 10; +ERROR: Subqueries in HAVING cannot refer to outer query +SELECT DISTINCT l_orderkey +FROM lineitem_hash_part +GROUP BY l_orderkey +HAVING (select avg(distinct l_linenumber) = l_orderkey) +LIMIT 10; +ERROR: Subqueries in HAVING cannot refer to outer query +SELECT DISTINCT l_partkey, 1 + (random() * 0)::int FROM lineitem_hash_part ORDER BY 1 DESC LIMIT 3; + l_partkey | ?column? +--------------------------------------------------------------------- + 199973 | 1 + 199946 | 1 + 199943 | 1 +(3 rows) + +-- const expressions are supported +SELECT DISTINCT l_orderkey, 1+1 FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +-- non const expressions are also supported +SELECT DISTINCT l_orderkey, l_partkey + 1 FROM lineitem_hash_part ORDER BY 1, 2 LIMIT 5; + l_orderkey | ?column? +--------------------------------------------------------------------- + 1 | 2133 + 1 | 15636 + 1 | 24028 + 1 | 63701 + 1 | 67311 +(5 rows) + +-- column expressions are supported +SELECT DISTINCT l_orderkey, l_shipinstruct || l_shipmode FROM lineitem_hash_part ORDER BY 2 , 1 LIMIT 5; + l_orderkey | ?column? 
+--------------------------------------------------------------------- + 32 | COLLECT CODAIR + 39 | COLLECT CODAIR + 66 | COLLECT CODAIR + 70 | COLLECT CODAIR + 98 | COLLECT CODAIR +(5 rows) + +-- function calls with const input are supported +SELECT DISTINCT l_orderkey, strpos('AIR', 'A') FROM lineitem_hash_part ORDER BY 1,2 LIMIT 5; + l_orderkey | strpos +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- function calls with non-const input are supported +SELECT DISTINCT l_orderkey, strpos(l_shipmode, 'I') + FROM lineitem_hash_part + WHERE strpos(l_shipmode, 'I') > 1 + ORDER BY 2, 1 + LIMIT 5; + l_orderkey | strpos +--------------------------------------------------------------------- + 1 | 2 + 3 | 2 + 5 | 2 + 32 | 2 + 33 | 2 +(5 rows) + +-- row types are supported +SELECT DISTINCT (l_orderkey, l_partkey) AS pair FROM lineitem_hash_part ORDER BY 1 LIMIT 5; + pair +--------------------------------------------------------------------- + (1,2132) + (1,15635) + (1,24027) + (1,63700) + (1,67310) +(5 rows) + +-- distinct on partition column +-- verify counts match with respect to count(distinct) +CREATE TEMP TABLE temp_orderkeys AS SELECT DISTINCT l_orderkey FROM lineitem_hash_part; +SELECT COUNT(*) FROM temp_orderkeys; + count +--------------------------------------------------------------------- + 2985 +(1 row) + +SELECT COUNT(DISTINCT l_orderkey) FROM lineitem_hash_part; + count +--------------------------------------------------------------------- + 2985 +(1 row) + +SELECT DISTINCT l_orderkey FROM lineitem_hash_part WHERE l_orderkey < 500 and l_partkey < 5000 order by 1; + l_orderkey +--------------------------------------------------------------------- + 1 + 3 + 32 + 35 + 39 + 65 + 129 + 130 + 134 + 164 + 194 + 228 + 261 + 290 + 320 + 321 + 354 + 418 +(18 rows) + +-- distinct on non-partition column +SELECT DISTINCT l_partkey FROM lineitem_hash_part WHERE l_orderkey > 5 and l_orderkey < 20 order by 1; + l_partkey +--------------------------------------------------------------------- + 79251 + 94780 + 139636 + 145243 + 151894 + 157238 + 163073 + 182052 +(8 rows) + +SELECT DISTINCT l_shipmode FROM lineitem_hash_part ORDER BY 1 DESC; + l_shipmode +--------------------------------------------------------------------- + TRUCK + SHIP + REG AIR + RAIL + MAIL + FOB + AIR +(7 rows) + +-- distinct with multiple columns +SELECT DISTINCT l_orderkey, o_orderdate + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE l_orderkey < 10 + ORDER BY l_orderkey; + l_orderkey | o_orderdate +--------------------------------------------------------------------- + 1 | 01-02-1996 + 2 | 12-01-1996 + 3 | 10-14-1993 + 4 | 10-11-1995 + 5 | 07-30-1994 + 6 | 02-21-1992 + 7 | 01-10-1996 +(7 rows) + +-- distinct on partition column with aggregate +-- this is the same as the one without distinct due to group by +SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + l_orderkey | count +--------------------------------------------------------------------- + 7 | 7 + 68 | 7 + 129 | 7 + 164 | 7 + 194 | 7 + 1 | 6 + 3 | 6 + 32 | 6 + 35 | 6 + 39 | 6 + 67 | 6 + 69 | 6 + 70 | 6 + 71 | 6 + 134 | 6 + 135 | 6 + 163 | 6 + 192 | 6 + 197 | 6 +(19 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + 
ORDER BY 2 DESC, 1; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count DESC, remote_scan.l_orderkey + -> HashAggregate + Group Key: remote_scan.count, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 200) +(14 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, count(*) + FROM lineitem_hash_part + WHERE l_orderkey < 200 + GROUP BY 1 + HAVING count(*) > 5 + ORDER BY 2 DESC, 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.count DESC, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_orderkey + Filter: (count(*) > 5) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 200) +(13 rows) + +SET enable_hashagg TO on; +-- distinct on aggregate of group by columns, we try to check whether we handle +-- queries which does not have any group by column in distinct columns properly. +SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + count +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 +(4 rows) + +-- explain the query to see actual plan. We expect to see Aggregate node having +-- group by key on count(*) column, since columns in the Group By doesn't guarantee +-- the uniqueness of the result. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique +-- instead of aggregate plan node to handle distinct. 
+SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> GroupAggregate + Group Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.worker_column_2, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +SET enable_hashagg TO on; +-- Now we have only part of group clause columns in distinct, yet it is still not +-- enough to use Group By columns to guarantee uniqueness of result list. +SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + l_suppkey | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 7 | 1 + 10 | 1 + 12 | 1 + 13 | 1 + 14 | 1 +(10 rows) + +-- explain the query to see actual plan. Similar to the explain of the query above. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. Similar to the explain of +-- the query above. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, count(*) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- Similar to the above query, not with count but avg. Only difference with the +-- above query is that, we create run two aggregate functions in workers. 
+SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + l_suppkey | avg +--------------------------------------------------------------------- + 1 | 190000.000000000000 + 2 | 172450.000000000000 + 3 | 112469.000000000000 + 3 | 134976.000000000000 + 4 | 112470.000000000000 + 4 | 142461.000000000000 + 5 | 182450.000000000000 + 7 | 137493.000000000000 + 10 | 150009.000000000000 + 12 | 17510.0000000000000000 +(10 rows) + +-- explain the query to see actual plan. Similar to the explain of the query above. +-- Only aggregate functions will be changed. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_suppkey, avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_suppkey, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> GroupAggregate + Group Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.l_suppkey, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- Similar to the above query but with distinct on +SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + avg +--------------------------------------------------------------------- + 190000.000000000000 + 172450.000000000000 + 112469.000000000000 + 112470.000000000000 + 182450.000000000000 + 137493.000000000000 + 150009.000000000000 + 17510.0000000000000000 + 87504.000000000000 + 77506.000000000000 +(10 rows) + +-- explain the query to see actual plan. We expect to see sort+unique to handle +-- distinct on. 
+EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_3, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort+unique to +-- handle distinct on. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_suppkey) avg(l_partkey) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY l_suppkey,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.worker_column_3, ((pg_catalog.sum(remote_scan.avg) / pg_catalog.sum(remote_scan.avg_1))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct with expression and aggregation +SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + avg +--------------------------------------------------------------------- + 9 + 39 + 74 + 87 + 89 + 91 + 97 + 102 + 111 + 122 +(10 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. 
+SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT avg(ceil(l_partkey / 2)) + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: ((sum(remote_scan.avg) / (pg_catalog.sum(remote_scan.avg_1))::double precision)) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- expression among aggregations. +SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + dis +--------------------------------------------------------------------- + 2 + 3 + 4 + 5 + 6 + 8 + 11 + 13 + 14 + 15 +(10 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> HashAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. This explain errors out due +-- to a bug right now, expectation must be corrected after fixing it. +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis + FROM lineitem_hash_part + GROUP BY l_suppkey, l_linenumber + ORDER BY 1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: (((pg_catalog.sum(remote_scan.dis))::bigint + COALESCE((pg_catalog.sum(remote_scan.dis_1))::bigint, '0'::bigint))) + -> GroupAggregate + Group Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Sort + Sort Key: remote_scan.worker_column_3, remote_scan.worker_column_4 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey, l_linenumber + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SET enable_hashagg TO on; +-- distinct on all columns, note Group By columns guarantees uniqueness of the +-- result list. 
+SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey | l_suppkey | l_linenumber | l_quantity | l_extendedprice | l_discount | l_tax | l_returnflag | l_linestatus | l_shipdate | l_commitdate | l_receiptdate | l_shipinstruct | l_shipmode | l_comment +--------------------------------------------------------------------- + 1 | 2132 | 4633 | 4 | 28.00 | 28955.64 | 0.09 | 0.06 | N | O | 04-21-1996 | 03-30-1996 | 05-16-1996 | NONE | AIR | lites. fluffily even de + 1 | 15635 | 638 | 6 | 32.00 | 49620.16 | 0.07 | 0.02 | N | O | 01-30-1996 | 02-07-1996 | 02-03-1996 | DELIVER IN PERSON | MAIL | arefully slyly ex + 1 | 24027 | 1534 | 5 | 24.00 | 22824.48 | 0.10 | 0.04 | N | O | 03-30-1996 | 03-14-1996 | 04-01-1996 | NONE | FOB | pending foxes. slyly re + 1 | 63700 | 3701 | 3 | 8.00 | 13309.60 | 0.10 | 0.02 | N | O | 01-29-1996 | 03-05-1996 | 01-31-1996 | TAKE BACK RETURN | REG AIR | riously. regular, express dep + 1 | 67310 | 7311 | 2 | 36.00 | 45983.16 | 0.09 | 0.06 | N | O | 04-12-1996 | 02-28-1996 | 04-20-1996 | TAKE BACK RETURN | MAIL | ly final dependencies: slyly bold + 1 | 155190 | 7706 | 1 | 17.00 | 21168.23 | 0.04 | 0.02 | N | O | 03-13-1996 | 02-12-1996 | 03-22-1996 | DELIVER IN PERSON | TRUCK | egular courts above the + 2 | 106170 | 1191 | 1 | 38.00 | 44694.46 | 0.00 | 0.05 | N | O | 01-28-1997 | 01-14-1997 | 02-02-1997 | TAKE BACK RETURN | RAIL | ven requests. deposits breach a + 3 | 4297 | 1798 | 1 | 45.00 | 54058.05 | 0.06 | 0.00 | R | F | 02-02-1994 | 01-04-1994 | 02-23-1994 | NONE | AIR | ongside of the furiously brave acco + 3 | 19036 | 6540 | 2 | 49.00 | 46796.47 | 0.10 | 0.00 | R | F | 11-09-1993 | 12-20-1993 | 11-24-1993 | TAKE BACK RETURN | RAIL | unusual accounts. eve + 3 | 29380 | 1883 | 4 | 2.00 | 2618.76 | 0.01 | 0.06 | A | F | 12-04-1993 | 01-07-1994 | 01-01-1994 | NONE | TRUCK | y. fluffily pending d +(10 rows) + +-- explain the query to see actual plan. We expect to see only one aggregation +-- node since group by columns guarantees the uniqueness. +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(7 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see only one +-- aggregation node since group by columns guarantees the uniqueness. 
+SET enable_hashagg TO off; +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT * + FROM lineitem_hash_part + GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 + ORDER BY 1,2 + LIMIT 10; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey, remote_scan.l_linenumber, remote_scan.l_quantity, remote_scan.l_extendedprice, remote_scan.l_discount, remote_scan.l_tax, remote_scan.l_returnflag, remote_scan.l_linestatus, remote_scan.l_shipdate, remote_scan.l_commitdate, remote_scan.l_receiptdate, remote_scan.l_shipinstruct, remote_scan.l_shipmode, remote_scan.l_comment + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(6 rows) + +SET enable_hashagg TO on; +-- distinct on count distinct +SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + count | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 2 | 2 + 3 | 1 + 3 | 2 + 3 | 3 + 4 | 1 + 4 | 2 + 4 | 3 + 4 | 4 + 5 | 2 + 5 | 3 + 5 | 4 + 5 | 5 + 6 | 2 + 6 | 3 + 6 | 4 + 6 | 5 + 6 | 6 + 7 | 2 + 7 | 3 + 7 | 4 + 7 | 5 + 7 | 6 + 7 | 7 +(25 rows) + +-- explain the query to see actual plan. We expect to see aggregation plan for +-- the outer distinct. +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count, remote_scan.count_1 + -> HashAggregate + Group Key: remote_scan.count, remote_scan.count_1 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- check the plan if the hash aggreate is disabled. We expect to see sort + unique +-- plans for the outer distinct. 
+SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT count(DISTINCT l_partkey), count(DISTINCT l_shipmode) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.count, remote_scan.count_1 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> GroupAggregate + Group Key: l_orderkey + -> Sort + Sort Key: l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +SET enable_hashagg TO on; +-- distinct on aggregation with filter and expression +SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + count +--------------------------------------------------------------------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> HashAggregate + Group Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(13 rows) + +-- check the plan if the hash aggreate is disabled +SET enable_hashagg TO off; +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count + FROM lineitem_hash_part + GROUP BY l_suppkey + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: (ceil(((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) / 2))::double precision)) + -> GroupAggregate + Group Key: remote_scan.worker_column_2 + -> Sort + Sort Key: remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(15 rows) + +SET enable_hashagg TO on; +-- explain the query to see actual plan with array_agg aggregation. +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.array_length + -> HashAggregate + Group Key: remote_scan.array_length, remote_scan.array_agg + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(7 rows) + +-- check the plan if the hash aggreate is disabled. 
+SET enable_hashagg TO off; +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT array_agg(l_linenumber), array_length(array_agg(l_linenumber), 1) + FROM lineitem_hash_part + GROUP BY l_orderkey + ORDER BY 2 + LIMIT 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.array_length, remote_scan.array_agg + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(6 rows) + +SET enable_hashagg TO on; +-- distinct on non-partition column with aggregate +-- this is the same as non-distinct version due to group by +SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + l_partkey | count +--------------------------------------------------------------------- + 1051 | 3 + 1927 | 3 + 6983 | 3 + 15283 | 3 + 87761 | 3 + 136884 | 3 + 149926 | 3 + 160895 | 3 + 177771 | 3 + 188804 | 3 + 199146 | 3 +(11 rows) + +-- explain the query to see actual plan +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, count(*) + FROM lineitem_hash_part + GROUP BY 1 + HAVING count(*) > 2 + ORDER BY 1; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_partkey, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.l_partkey + Filter: (COALESCE((pg_catalog.sum(remote_scan.worker_column_3))::bigint, '0'::bigint) > 2) + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: l_partkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- distinct on non-partition column and avg +SELECT DISTINCT l_partkey, avg(l_linenumber) + FROM lineitem_hash_part + WHERE l_partkey < 500 + GROUP BY 1 + HAVING avg(l_linenumber) > 2 + ORDER BY 1; + l_partkey | avg +--------------------------------------------------------------------- + 18 | 7.0000000000000000 + 79 | 6.0000000000000000 + 149 | 4.5000000000000000 + 175 | 5.0000000000000000 + 179 | 6.0000000000000000 + 182 | 3.0000000000000000 + 222 | 4.0000000000000000 + 278 | 3.0000000000000000 + 299 | 7.0000000000000000 + 308 | 7.0000000000000000 + 309 | 5.0000000000000000 + 321 | 3.0000000000000000 + 337 | 6.0000000000000000 + 364 | 3.0000000000000000 + 403 | 4.0000000000000000 +(15 rows) + +-- distinct on multiple non-partition columns +SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + l_partkey | l_suppkey +--------------------------------------------------------------------- + 2132 | 4633 + 4297 | 1798 + 37531 | 35 + 44161 | 6666 + 44706 | 4707 + 67831 | 5350 + 85811 | 8320 + 94368 | 6878 + 108338 | 849 + 108570 | 8571 + 137267 | 4807 + 137469 | 9983 + 173489 | 3490 + 196156 | 1195 + 197921 | 441 +(15 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_shipmode = 'AIR' AND l_orderkey < 100 + ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> HashAggregate + Group Key: remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Unique + -> Sort + Sort Key: l_partkey, l_suppkey + -> Seq 
Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: ((l_orderkey < 100) AND (l_shipmode = 'AIR'::bpchar)) +(14 rows) + +-- distinct on partition column +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1, 2, 3; + l_orderkey | l_partkey | l_suppkey +--------------------------------------------------------------------- + 1 | 2132 | 4633 + 2 | 106170 | 1191 + 3 | 4297 | 1798 + 4 | 88035 | 5560 + 5 | 37531 | 35 + 6 | 139636 | 2150 + 7 | 79251 | 1759 + 32 | 2743 | 7744 + 33 | 33918 | 3919 + 34 | 88362 | 871 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey, l_suppkey + FROM lineitem_hash_part + WHERE l_orderkey < 35 + ORDER BY 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey, remote_scan.l_suppkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Unique + -> Sort + Sort Key: l_orderkey, l_partkey, l_suppkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part + Filter: (l_orderkey < 35) +(13 rows) + +-- distinct on non-partition column +-- note order by is required here +-- otherwise query results will be different since +-- distinct on clause is on non-partition column +SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + l_partkey | l_orderkey +--------------------------------------------------------------------- + 18 | 12005 + 79 | 5121 + 91 | 2883 + 149 | 807 + 175 | 4102 + 179 | 2117 + 182 | 548 + 195 | 2528 + 204 | 10048 + 222 | 9413 + 245 | 9446 + 278 | 1287 + 299 | 1122 + 308 | 11137 + 309 | 2374 + 318 | 321 + 321 | 5984 + 337 | 10403 + 350 | 13698 + 358 | 4323 +(20 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_partkey, l_orderkey + FROM lineitem_hash_part + ORDER BY 1,2 + LIMIT 20; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: l_partkey, l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(14 rows) + +-- distinct on with joins +-- each customer's first order key +SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; + o_custkey | l_orderkey +--------------------------------------------------------------------- + 1 | 9154 + 2 | 10563 + 4 | 320 + 5 | 11682 + 7 | 10402 + 8 | 102 + 10 | 1602 + 11 | 12800 + 13 | 994 + 14 | 11011 +(10 rows) + +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- explain without order by +-- notice master plan has order by on distinct on column +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT 
DISTINCT ON (o_custkey) o_custkey, l_orderkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- each customer's each order's first l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20 + ORDER BY 1,2,3; + o_custkey | l_orderkey | l_linenumber | l_partkey +--------------------------------------------------------------------- + 1 | 9154 | 1 | 86513 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 1 | 147459 + 4 | 320 | 1 | 4415 + 4 | 739 | 1 | 84489 + 4 | 10688 | 1 | 45037 + 4 | 10788 | 1 | 50814 + 4 | 13728 | 1 | 86216 + 5 | 11682 | 1 | 31634 + 5 | 11746 | 1 | 180724 + 5 | 14308 | 1 | 157430 + 7 | 10402 | 1 | 53661 + 7 | 13031 | 1 | 112161 + 7 | 14145 | 1 | 138729 + 7 | 14404 | 1 | 143034 + 8 | 102 | 1 | 88914 + 8 | 164 | 1 | 91309 + 8 | 13601 | 1 | 40504 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 1 | 86241 + 10 | 11431 | 1 | 62112 + 10 | 13124 | 1 | 29414 + 11 | 12800 | 1 | 152806 + 13 | 994 | 1 | 64486 + 13 | 1603 | 1 | 38191 + 13 | 4704 | 1 | 77934 + 13 | 9927 | 1 | 875 + 14 | 11011 | 1 | 172485 + 17 | 896 | 1 | 38675 + 17 | 5507 | 1 | 9600 + 19 | 353 | 1 | 119305 + 19 | 1504 | 1 | 81389 + 19 | 1669 | 1 | 78373 + 19 | 5893 | 1 | 133707 + 19 | 9954 | 1 | 92138 + 19 | 14885 | 1 | 36154 +(36 rows) + +-- explain without order by +SELECT coordinator_plan($Q$ +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 20; +$Q$); + coordinator_plan +--------------------------------------------------------------------- + Unique + -> Sort + Sort Key: remote_scan.o_custkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 +(5 rows) + +-- each customer's each order's last l_partkey +SELECT DISTINCT ON (o_custkey, l_orderkey) o_custkey, l_orderkey, l_linenumber, l_partkey + FROM lineitem_hash_part JOIN orders_hash_part ON (l_orderkey = o_orderkey) + WHERE o_custkey < 15 + ORDER BY 1,2,3 DESC; + o_custkey | l_orderkey | l_linenumber | l_partkey +--------------------------------------------------------------------- + 1 | 9154 | 7 | 173448 + 1 | 14656 | 1 | 59539 + 2 | 10563 | 4 | 110741 + 4 | 320 | 2 | 192158 + 4 | 739 | 5 | 187523 + 4 | 10688 | 2 | 132574 + 4 | 10788 | 4 | 196473 + 4 | 13728 | 3 | 12450 + 5 | 11682 | 3 | 177152 + 5 | 11746 | 7 | 193807 + 5 | 14308 | 3 | 140916 + 7 | 10402 | 2 | 64514 + 7 | 13031 | 6 | 7761 + 7 | 14145 | 6 | 130723 + 7 | 14404 | 7 | 35349 + 8 | 102 | 4 | 61158 + 8 | 164 | 7 | 3037 + 8 | 13601 | 5 | 12470 + 10 | 1602 | 1 | 182806 + 10 | 9862 | 5 | 135675 + 10 | 11431 | 7 | 8563 + 10 | 13124 | 3 | 67055 + 11 | 12800 | 5 | 179110 + 13 | 994 | 4 | 130471 + 13 | 1603 | 2 | 65209 + 13 | 4704 | 3 | 63081 + 13 | 9927 | 6 | 119356 + 14 | 11011 | 7 | 95939 +(28 rows) + +-- subqueries +SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 1 | 2132 + 1 | 15635 + 1 | 24027 + 1 | 63700 + 1 | 67310 + 1 | 155190 + 2 | 106170 + 3 | 4297 + 3 | 
19036 + 3 | 29380 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey + FROM lineitem_hash_part + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: l_orderkey, l_partkey + -> HashAggregate + Group Key: l_orderkey, l_partkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + l_orderkey | cnt +--------------------------------------------------------------------- + 1 | 6 + 2 | 1 + 3 | 6 + 4 | 1 + 5 | 3 + 6 | 1 + 7 | 7 + 32 | 6 + 33 | 4 + 34 | 3 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT l_orderkey, cnt + FROM ( + SELECT l_orderkey, count(*) as cnt + FROM lineitem_hash_part + GROUP BY 1 + ) q + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.cnt + -> HashAggregate + Group Key: remote_scan.l_orderkey, remote_scan.cnt + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: lineitem_hash_part.l_orderkey, (count(*)) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey, count(*) + -> HashAggregate + Group Key: lineitem_hash_part.l_orderkey + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(18 rows) + +-- distinct on partition column +-- random() is added to inner query to prevent flattening +SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 1 | 2132 + 2 | 106170 + 3 | 4297 + 4 | 88035 + 5 | 37531 + 6 | 139636 + 7 | 79251 + 32 | 2743 + 33 | 33918 + 34 | 88362 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_orderkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 1,2 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_orderkey, remote_scan.l_partkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_orderkey, q.l_partkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + +-- distinct on non-partition column +SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + l_orderkey | l_partkey +--------------------------------------------------------------------- + 12005 | 18 + 5121 | 79 + 2883 | 91 + 807 | 149 + 4102 | 175 + 2117 | 179 + 
548 | 182 + 2528 | 195 + 10048 | 204 + 9413 | 222 +(10 rows) + +EXPLAIN (COSTS FALSE) + SELECT DISTINCT ON (l_partkey) l_orderkey, l_partkey + FROM ( + SELECT l_orderkey, l_partkey, (random()*10)::int + 2 as r + FROM lineitem_hash_part + ) q + WHERE r > 1 + ORDER BY 2,1 + LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Unique + -> Sort + Sort Key: remote_scan.l_partkey, remote_scan.l_orderkey + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Unique + -> Sort + Sort Key: q.l_partkey, q.l_orderkey + -> Subquery Scan on q + Filter: (q.r > 1) + -> Seq Scan on lineitem_hash_part_360041 lineitem_hash_part +(16 rows) + diff --git a/src/test/regress/expected/multi_test_helpers.out b/src/test/regress/expected/multi_test_helpers.out index b02cd6cd4..640a8d9ee 100644 --- a/src/test/regress/expected/multi_test_helpers.out +++ b/src/test/regress/expected/multi_test_helpers.out @@ -17,15 +17,10 @@ BEGIN END; $$LANGUAGE plpgsql; -- Create a function to ignore worker plans in explain output --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') - THEN - CONTINUE; - END IF; RETURN next; IF query_plan LIKE '%Task Count:%' THEN @@ -36,16 +31,12 @@ BEGIN END; $$ language plpgsql; -- Create a function to ignore worker plans in explain output -- It also shows task count for plan and subplans --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan_with_subplans(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ DECLARE task_count_line_reached boolean := false; BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; IF NOT task_count_line_reached THEN RETURN next; END IF; @@ -59,19 +50,6 @@ BEGIN END LOOP; RETURN; END; $$ language plpgsql; --- Create a function to ignore "-> Result" lines for PG15 support --- In PG15 there are some extra "-> Result" lines -CREATE OR REPLACE FUNCTION plan_without_result_lines(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - RETURN next; - END LOOP; - RETURN; -END; $$ language plpgsql; -- Create a function to normalize Memory Usage, Buckets, Batches CREATE OR REPLACE FUNCTION plan_normalize_memory(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ @@ -81,18 +59,6 @@ BEGIN RETURN NEXT; END LOOP; END; $$ language plpgsql; --- Create a function to remove arrows from the explain plan -CREATE OR REPLACE FUNCTION plan_without_arrows(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - query_plan := regexp_replace(query_plan, '( )*-> (.*)', '\2', 'g'); - RETURN NEXT; - END LOOP; -END; $$ language plpgsql; -- helper function that returns true if output of given explain has "is not null" (case in-sensitive) CREATE OR REPLACE FUNCTION explain_has_is_not_null(explain_command text) RETURNS BOOLEAN AS $$ diff --git 
a/src/test/regress/expected/window_functions.out b/src/test/regress/expected/window_functions.out index 2e88f5b51..6f30a49e3 100644 --- a/src/test/regress/expected/window_functions.out +++ b/src/test/regress/expected/window_functions.out @@ -1491,12 +1491,10 @@ LIMIT 5; (17 rows) -- Grouping can be pushed down with aggregates even when window function can't -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT user_id, count(value_1), stddev(value_1), count(user_id) OVER (PARTITION BY random()) FROM users_table GROUP BY user_id HAVING avg(value_1) > 2 LIMIT 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> WindowAgg diff --git a/src/test/regress/expected/window_functions_0.out b/src/test/regress/expected/window_functions_0.out index c9442c7b5..c5a132301 100644 --- a/src/test/regress/expected/window_functions_0.out +++ b/src/test/regress/expected/window_functions_0.out @@ -1495,12 +1495,10 @@ LIMIT 5; (18 rows) -- Grouping can be pushed down with aggregates even when window function can't -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT user_id, count(value_1), stddev(value_1), count(user_id) OVER (PARTITION BY random()) FROM users_table GROUP BY user_id HAVING avg(value_1) > 2 LIMIT 1; -$Q$); - plan_without_result_lines + QUERY PLAN --------------------------------------------------------------------- Limit -> WindowAgg diff --git a/src/test/regress/sql/columnar_chunk_filtering.sql b/src/test/regress/sql/columnar_chunk_filtering.sql index 335401a20..b8b2b411d 100644 --- a/src/test/regress/sql/columnar_chunk_filtering.sql +++ b/src/test/regress/sql/columnar_chunk_filtering.sql @@ -130,15 +130,11 @@ INSERT INTO another_columnar_table SELECT generate_series(0,5); EXPLAIN (analyze on, costs off, timing off, summary off) SELECT a, y FROM multi_column_chunk_filtering, another_columnar_table WHERE x > 1; -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT y, * FROM another_columnar_table; -$Q$); -SELECT plan_without_arrows($Q$ EXPLAIN (costs off, timing off, summary off) SELECT *, x FROM another_columnar_table; -$Q$); EXPLAIN (costs off, timing off, summary off) SELECT y, another_columnar_table FROM another_columnar_table; diff --git a/src/test/regress/sql/columnar_citus_integration.sql b/src/test/regress/sql/columnar_citus_integration.sql index 566c3a9f6..514508795 100644 --- a/src/test/regress/sql/columnar_citus_integration.sql +++ b/src/test/regress/sql/columnar_citus_integration.sql @@ -429,13 +429,11 @@ EXPLAIN (COSTS OFF, SUMMARY OFF) SELECT * FROM weird_col_explain; \set VERBOSITY terse -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS OFF, SUMMARY OFF) SELECT *, "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" FROM weird_col_explain WHERE "bbbbbbbbbbbbbbbbbbbbbbbbb\!bbbb'bbbbbbbbbbbbbbbbbbbbb''bbbbbbbb" * 2 > "aaaaaaaaaaaa$aaaaaa$$aaaaaaaaaaaaaaaaaaaaaaaaaaaaa'aaaaaaaa'$a'!"; -$Q$); \set VERBOSITY default -- should not project any columns diff --git a/src/test/regress/sql/insert_select_repartition.sql b/src/test/regress/sql/insert_select_repartition.sql index ee6065b88..94a16fed0 100644 --- a/src/test/regress/sql/insert_select_repartition.sql +++ b/src/test/regress/sql/insert_select_repartition.sql @@ -635,9 +635,7 @@ DO UPDATE SET create table table_with_sequences (x int, y int, z bigserial); insert into table_with_sequences values (1,1); select create_distributed_table('table_with_sequences','x'); -SELECT 
public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_sequences select y, x from table_with_sequences; -$Q$); -- verify that we don't report repartitioned insert/select for tables -- with user-defined sequences. @@ -645,9 +643,7 @@ CREATE SEQUENCE user_defined_sequence; create table table_with_user_sequences (x int, y int, z bigint default nextval('user_defined_sequence')); insert into table_with_user_sequences values (1,1); select create_distributed_table('table_with_user_sequences','x'); -SELECT public.plan_without_result_lines($Q$ explain (costs off) insert into table_with_user_sequences select y, x from table_with_user_sequences; -$Q$); -- clean-up SET client_min_messages TO WARNING; diff --git a/src/test/regress/sql/multi_select_distinct.sql b/src/test/regress/sql/multi_select_distinct.sql index 75dd99da0..a2ee189b0 100644 --- a/src/test/regress/sql/multi_select_distinct.sql +++ b/src/test/regress/sql/multi_select_distinct.sql @@ -3,6 +3,8 @@ -- -- Tests select distinct, and select distinct on features. -- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int >= 15 AS server_version_ge_15; ANALYZE lineitem_hash_part; @@ -113,13 +115,11 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique -- instead of aggregate plan node to handle distinct. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1; -$Q$); SET enable_hashagg TO on; @@ -142,14 +142,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. Similar to the explain of -- the query above. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, count(*) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -173,14 +171,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT l_suppkey, avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1,2 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -203,14 +199,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. We expect to see sort+unique to -- handle distinct on. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ON (l_suppkey) avg(l_partkey) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY l_suppkey,1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -232,14 +226,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT avg(ceil(l_partkey / 2)) FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -261,14 +253,12 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled. This explain errors out due -- to a bug right now, expectation must be corrected after fixing it. 
SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT sum(l_suppkey) + count(l_partkey) AS dis FROM lineitem_hash_part GROUP BY l_suppkey, l_linenumber ORDER BY 1 LIMIT 10; -$Q$); SET enable_hashagg TO on; @@ -345,13 +335,11 @@ EXPLAIN (COSTS FALSE) -- check the plan if the hash aggreate is disabled SET enable_hashagg TO off; -SELECT public.plan_without_result_lines($Q$ EXPLAIN (COSTS FALSE) SELECT DISTINCT ceil(count(case when l_partkey > 100000 THEN 1 ELSE 0 END) / 2) AS count FROM lineitem_hash_part GROUP BY l_suppkey ORDER BY 1; -$Q$); SET enable_hashagg TO on; diff --git a/src/test/regress/sql/multi_test_helpers.sql b/src/test/regress/sql/multi_test_helpers.sql index b5d4b9cd9..51cb2b129 100644 --- a/src/test/regress/sql/multi_test_helpers.sql +++ b/src/test/regress/sql/multi_test_helpers.sql @@ -20,15 +20,10 @@ END; $$LANGUAGE plpgsql; -- Create a function to ignore worker plans in explain output --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') - THEN - CONTINUE; - END IF; RETURN next; IF query_plan LIKE '%Task Count:%' THEN @@ -40,16 +35,12 @@ END; $$ language plpgsql; -- Create a function to ignore worker plans in explain output -- It also shows task count for plan and subplans --- Also remove extra "-> Result" lines for PG15 support CREATE OR REPLACE FUNCTION coordinator_plan_with_subplans(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ DECLARE task_count_line_reached boolean := false; BEGIN FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; IF NOT task_count_line_reached THEN RETURN next; END IF; @@ -64,20 +55,6 @@ BEGIN RETURN; END; $$ language plpgsql; --- Create a function to ignore "-> Result" lines for PG15 support --- In PG15 there are some extra "-> Result" lines -CREATE OR REPLACE FUNCTION plan_without_result_lines(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - RETURN next; - END LOOP; - RETURN; -END; $$ language plpgsql; - -- Create a function to normalize Memory Usage, Buckets, Batches CREATE OR REPLACE FUNCTION plan_normalize_memory(explain_command text, out query_plan text) RETURNS SETOF TEXT AS $$ @@ -88,19 +65,6 @@ BEGIN END LOOP; END; $$ language plpgsql; --- Create a function to remove arrows from the explain plan -CREATE OR REPLACE FUNCTION plan_without_arrows(explain_command text, out query_plan text) -RETURNS SETOF TEXT AS $$ -BEGIN - FOR query_plan IN execute explain_command LOOP - IF (query_plan LIKE '%-> Result%' OR query_plan = 'Result') THEN - CONTINUE; - END IF; - query_plan := regexp_replace(query_plan, '( )*-> (.*)', '\2', 'g'); - RETURN NEXT; - END LOOP; -END; $$ language plpgsql; - -- helper function that returns true if output of given explain has "is not null" (case in-sensitive) CREATE OR REPLACE FUNCTION explain_has_is_not_null(explain_command text) RETURNS BOOLEAN AS $$ diff --git a/src/test/regress/sql/window_functions.sql b/src/test/regress/sql/window_functions.sql index 77f353efb..de936c95c 100644 --- a/src/test/regress/sql/window_functions.sql +++ 
b/src/test/regress/sql/window_functions.sql
@@ -576,11 +576,9 @@ ORDER BY user_id, avg(value_1) DESC
 LIMIT 5;
 
 -- Grouping can be pushed down with aggregates even when window function can't
-SELECT public.plan_without_result_lines($Q$
 EXPLAIN (COSTS FALSE)
 SELECT user_id, count(value_1), stddev(value_1), count(user_id) OVER (PARTITION BY random())
 FROM users_table GROUP BY user_id HAVING avg(value_1) > 2 LIMIT 1;
-$Q$);
 
 -- Window function with inlined CTE
 WITH cte as (

From 8bb082e77d8bf13ed92c5e1726e7a0e4321223e4 Mon Sep 17 00:00:00 2001
From: Jelte Fennema
Date: Wed, 31 Aug 2022 13:55:47 +0200
Subject: [PATCH 02/16] Fix reporting of progress on waiting and moved shards (#6274)

In commit 31faa88a4e I removed some features of the rebalance progress
monitor. I did this because the plan was to remove the foreground shard
rebalancer later in the PR that would add the background shard rebalancer.
So, I didn't want to spend time fixing something that we would throw away
anyway. As it turns out we're not removing the foreground shard rebalancer
after all, so it made sense to fix the stuff that I broke. This PR does
that. For the most part this commit reverts the changes in commit
31faa88a4e. It's not a full revert though, because it keeps the improved
tests and the changes to `citus_move_shard_placement`.
---
 .../distributed/operations/repair_shards.c | 30 ++--
 .../distributed/operations/shard_rebalancer.c | 130 ++++++++++++++++--
 .../distributed/progress/multi_progress.c | 8 +-
 src/include/distributed/shard_rebalancer.h | 6 +-
 .../isolation_shard_rebalancer_progress.out | 16 ++-
 5 files changed, 161 insertions(+), 29 deletions(-)

diff --git a/src/backend/distributed/operations/repair_shards.c b/src/backend/distributed/operations/repair_shards.c
index 193797384..4388b86fd 100644
--- a/src/backend/distributed/operations/repair_shards.c
+++ b/src/backend/distributed/operations/repair_shards.c
@@ -21,6 +21,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_enum.h"
 #include "distributed/adaptive_executor.h"
+#include "distributed/backend_data.h"
 #include "distributed/citus_ruleutils.h"
 #include "distributed/colocation_utils.h"
 #include "distributed/commands.h"
@@ -397,15 +398,28 @@ citus_move_shard_placement(PG_FUNCTION_ARGS)
 targetNodeName, targetNodePort);
 
- WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort);
- WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort);
+ /*
+ * We want to be able to track progress of shard moves using
+ * get_rebalancer_progress. If this move is initiated by the rebalancer,
+ * then the rebalancer call has already set up the shared memory that is
+ * used to do that. But if citus_move_shard_placement is called directly by
+ * the user (or through any other mechanism), then the shared memory is not
+ * set up yet. In that case we do it here.
+ */ + if (!IsRebalancerInternalBackend()) + { + WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort); + WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort); - PlacementUpdateEvent *placementUpdateEvent = palloc0(sizeof(PlacementUpdateEvent)); - placementUpdateEvent->updateType = PLACEMENT_UPDATE_MOVE; - placementUpdateEvent->shardId = shardId; - placementUpdateEvent->sourceNode = sourceNode; - placementUpdateEvent->targetNode = targetNode; - SetupRebalanceMonitor(list_make1(placementUpdateEvent), relationId); + PlacementUpdateEvent *placementUpdateEvent = palloc0( + sizeof(PlacementUpdateEvent)); + placementUpdateEvent->updateType = PLACEMENT_UPDATE_MOVE; + placementUpdateEvent->shardId = shardId; + placementUpdateEvent->sourceNode = sourceNode; + placementUpdateEvent->targetNode = targetNode; + SetupRebalanceMonitor(list_make1(placementUpdateEvent), relationId, + REBALANCE_PROGRESS_MOVING); + } /* * At this point of the shard moves, we don't need to block the writes to diff --git a/src/backend/distributed/operations/shard_rebalancer.c b/src/backend/distributed/operations/shard_rebalancer.c index ad7cccb0e..c6079eb83 100644 --- a/src/backend/distributed/operations/shard_rebalancer.c +++ b/src/backend/distributed/operations/shard_rebalancer.c @@ -201,6 +201,8 @@ static int PlacementsHashCompare(const void *lhsKey, const void *rhsKey, Size ke static uint32 PlacementsHashHashCode(const void *key, Size keySize); static bool WorkerNodeListContains(List *workerNodeList, const char *workerName, uint32 workerPort); +static void UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, + int sourcePort, uint64 progress); static bool IsPlacementOnWorkerNode(ShardPlacement *placement, WorkerNode *workerNode); static NodeFillState * FindFillStateForPlacement(RebalanceState *state, ShardPlacement *placement); @@ -258,6 +260,7 @@ PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check); bool RunningUnderIsolationTest = false; int MaxRebalancerLoggedIgnoredMoves = 5; + #ifdef USE_ASSERT_CHECKING /* @@ -763,7 +766,9 @@ ExecutePlacementUpdates(List *placementUpdateList, Oid shardReplicationModeOid, * dsm handle so that it can be used for updating the progress and cleaning things up. 
*/ void -SetupRebalanceMonitor(List *placementUpdateList, Oid relationId) +SetupRebalanceMonitor(List *placementUpdateList, + Oid relationId, + uint64 initialProgressState) { List *colocatedUpdateList = GetColocatedRebalanceSteps(placementUpdateList); ListCell *colocatedUpdateCell = NULL; @@ -787,7 +792,7 @@ SetupRebalanceMonitor(List *placementUpdateList, Oid relationId) event->shardId = colocatedUpdate->shardId; event->sourcePort = colocatedUpdate->sourceNode->workerPort; event->targetPort = colocatedUpdate->targetNode->workerPort; - pg_atomic_init_u64(&event->progress, REBALANCE_PROGRESS_MOVING); + pg_atomic_init_u64(&event->progress, initialProgressState); eventIndex++; } @@ -1186,34 +1191,63 @@ BuildShardSizesHash(ProgressMonitorData *monitor, HTAB *shardStatistics) PlacementUpdateEventProgress *step = placementUpdateEvents + eventIndex; uint64 shardId = step->shardId; + uint64 shardSize = 0; + uint64 backupShardSize = 0; + uint64 progress = pg_atomic_read_u64(&step->progress); - uint64 shardSize = WorkerShardSize(shardStatistics, step->sourceName, - step->sourcePort, shardId); + uint64 sourceSize = WorkerShardSize(shardStatistics, step->sourceName, + step->sourcePort, shardId); + uint64 targetSize = WorkerShardSize(shardStatistics, step->targetName, + step->targetPort, shardId); + + if (progress == REBALANCE_PROGRESS_WAITING || + progress == REBALANCE_PROGRESS_MOVING) + { + /* + * If we are not done with the move, the correct shard size is the + * size on the source. + */ + shardSize = sourceSize; + backupShardSize = targetSize; + } + else if (progress == REBALANCE_PROGRESS_MOVED) + { + /* + * If we are done with the move, the correct shard size is the size + * on the target + */ + shardSize = targetSize; + backupShardSize = sourceSize; + } if (shardSize == 0) { - /* - * It's possible that we are reading the sizes after the move has - * already fininshed. This means that the shards on the source - * might have already been deleted. In that case we instead report - * the size on the target as the shard size, since that is now the - * only existing shard. - */ - shardSize = WorkerShardSize(shardStatistics, step->targetName, - step->targetPort, shardId); - if (shardSize == 0) + if (backupShardSize == 0) { /* * We don't have any useful shard size. This can happen when a * shard is moved multiple times and it is not present on * either of these nodes. Probably the shard is on a worker - * related to the next move. In the weird case that this shard + * related to another event. In the weird case that this shard * is on the nodes and actually is size 0, we will have no * entry in the hashmap. When fetching from it we always * default to 0 if no entry is found, so that's fine. */ continue; } + + /* + * Because of the way we fetch shard sizes they are from a slightly + * earlier moment than the progress state we just read from shared + * memory. Usually this is no problem, but there exist some race + * conditions where this matters. For example, for very quick moves + * it is possible that even though a step is now reported as MOVED, + * when we read the shard sizes the move had not even started yet. + * This in turn can mean that the target size is 0 while the source + * size is not. We try to handle such rare edge cases by falling + * back on the other shard size if that one is not 0. 
+ */ + shardSize = backupShardSize; } @@ -1427,6 +1461,15 @@ GetMovedShardIdsByWorker(PlacementUpdateEventProgress *steps, int stepCount, AddToWorkerShardIdSet(shardsByWorker, step->sourceName, step->sourcePort, step->shardId); + if (pg_atomic_read_u64(&step->progress) == REBALANCE_PROGRESS_WAITING) + { + /* + * shard move has not started so we don't need target stats for + * this shard + */ + continue; + } + AddToWorkerShardIdSet(shardsByWorker, step->targetName, step->targetPort, step->shardId); } @@ -1559,6 +1602,8 @@ RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid) * This uses the first relationId from the list, it's only used for display * purposes so it does not really matter which to show */ + SetupRebalanceMonitor(placementUpdateList, linitial_oid(options->relationIdList), + REBALANCE_PROGRESS_WAITING); ExecutePlacementUpdates(placementUpdateList, shardReplicationModeOid, "Moving"); FinalizeCurrentProgressMonitor(); } @@ -1635,11 +1680,21 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent, errmsg("only moving or copying shards is supported"))); } + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_MOVING); + /* * In case of failure, we throw an error such that rebalance_table_shards * fails early. */ ExecuteRebalancerCommandInSeparateTransaction(placementUpdateCommand->data); + + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_MOVED); } @@ -2700,6 +2755,51 @@ WorkerNodeListContains(List *workerNodeList, const char *workerName, uint32 work } +/* + * UpdateColocatedShardPlacementProgress updates the progress of the given placement, + * along with its colocated placements, to the given state. + */ +static void +UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, int sourcePort, + uint64 progress) +{ + ProgressMonitorData *header = GetCurrentProgressMonitor(); + + if (header != NULL) + { + PlacementUpdateEventProgress *steps = ProgressMonitorSteps(header); + ListCell *colocatedShardIntervalCell = NULL; + + ShardInterval *shardInterval = LoadShardInterval(shardId); + List *colocatedShardIntervalList = ColocatedShardIntervalList(shardInterval); + + for (int moveIndex = 0; moveIndex < header->stepCount; moveIndex++) + { + PlacementUpdateEventProgress *step = steps + moveIndex; + uint64 currentShardId = step->shardId; + bool colocatedShard = false; + + foreach(colocatedShardIntervalCell, colocatedShardIntervalList) + { + ShardInterval *candidateShard = lfirst(colocatedShardIntervalCell); + if (candidateShard->shardId == currentShardId) + { + colocatedShard = true; + break; + } + } + + if (colocatedShard && + strcmp(step->sourceName, sourceName) == 0 && + step->sourcePort == sourcePort) + { + pg_atomic_write_u64(&step->progress, progress); + } + } + } +} + + /* * pg_dist_rebalance_strategy_enterprise_check is a now removed function, but * to avoid issues during upgrades a C stub is kept. diff --git a/src/backend/distributed/progress/multi_progress.c b/src/backend/distributed/progress/multi_progress.c index 9b9c2faa6..657f3356c 100644 --- a/src/backend/distributed/progress/multi_progress.c +++ b/src/backend/distributed/progress/multi_progress.c @@ -109,11 +109,17 @@ GetCurrentProgressMonitor(void) /* * FinalizeCurrentProgressMonitor releases the dynamic memory segment of the current * progress monitoring data structure and removes the process from - * pg_stat_get_progress_info() output. 
+ * pg_stat_get_progress_info() output. If there's no such dynamic memory + * segment this is a no-op. */ void FinalizeCurrentProgressMonitor(void) { + if (currentProgressDSMHandle == DSM_HANDLE_INVALID) + { + return; + } + dsm_segment *dsmSegment = dsm_find_mapping(currentProgressDSMHandle); if (dsmSegment != NULL) diff --git a/src/include/distributed/shard_rebalancer.h b/src/include/distributed/shard_rebalancer.h index d21894ffc..36c38ffff 100644 --- a/src/include/distributed/shard_rebalancer.h +++ b/src/include/distributed/shard_rebalancer.h @@ -73,7 +73,9 @@ /* *INDENT-ON* */ #define REBALANCE_ACTIVITY_MAGIC_NUMBER 1337 +#define REBALANCE_PROGRESS_WAITING 0 #define REBALANCE_PROGRESS_MOVING 1 +#define REBALANCE_PROGRESS_MOVED 2 /* Enumeration that defines different placement update types */ typedef enum @@ -193,7 +195,9 @@ extern List * ReplicationPlacementUpdates(List *workerNodeList, List *shardPlace extern void ExecuteRebalancerCommandInSeparateTransaction(char *command); extern void AcquirePlacementColocationLock(Oid relationId, int lockMode, const char *operationName); -extern void SetupRebalanceMonitor(List *placementUpdateList, Oid relationId); +extern void SetupRebalanceMonitor(List *placementUpdateList, + Oid relationId, + uint64 initialProgressState); #endif /* SHARD_REBALANCER_H */ diff --git a/src/test/regress/expected/isolation_shard_rebalancer_progress.out b/src/test/regress/expected/isolation_shard_rebalancer_progress.out index 000542bca..731f72c14 100644 --- a/src/test/regress/expected/isolation_shard_rebalancer_progress.out +++ b/src/test/regress/expected/isolation_shard_rebalancer_progress.out @@ -35,7 +35,9 @@ table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname --------------------------------------------------------------------- colocated1|1500001| 49152|localhost | 57637| 49152|localhost | 57638| 0| 1 colocated2|1500005| 376832|localhost | 57637| 376832|localhost | 57638| 0| 1 -(2 rows) +colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 0 +colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 0 +(4 rows) step s2-unlock-1-start: ROLLBACK; @@ -112,9 +114,11 @@ step s7-get-progress: table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname|targetport|target_shard_size|progress --------------------------------------------------------------------- +colocated1|1500001| 73728|localhost | 57637| 0|localhost | 57638| 73728| 2 +colocated2|1500005| 401408|localhost | 57637| 0|localhost | 57638| 401408| 2 colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 1 colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 1 -(2 rows) +(4 rows) step s3-unlock-2-start: ROLLBACK; @@ -205,7 +209,9 @@ table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname --------------------------------------------------------------------- colocated1|1500001| 49152|localhost | 57637| 49152|localhost | 57638| 73728| 1 colocated2|1500005| 376832|localhost | 57637| 376832|localhost | 57638| 401408| 1 -(2 rows) +colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 0 +colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 0 +(4 rows) step s7-release-lock: COMMIT; @@ -288,7 +294,9 @@ table_name|shardid|shard_size|sourcename|sourceport|source_shard_size|targetname --------------------------------------------------------------------- colocated1|1500001| 49152|localhost | 57637| 49152|localhost | 57638| 8192| 1 
colocated2|1500005| 376832|localhost | 57637| 376832|localhost | 57638| 8192| 1 -(2 rows) +colocated1|1500002| 196608|localhost | 57637| 196608|localhost | 57638| 0| 0 +colocated2|1500006| 8192|localhost | 57637| 8192|localhost | 57638| 0| 0 +(4 rows) step s6-release-advisory-lock: SELECT pg_advisory_unlock(44000, 55152); From c14bf3a6606a4c1ccbdea6d9c05b652d41fd167c Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Wed, 31 Aug 2022 14:09:39 +0200 Subject: [PATCH 03/16] Add a job to CI to check tests for flakyness (#6276) We have lots of flaky tests in CI and most of these random failures are very hard/impossible to reproduce locally. This adds a job definition to CI that allows adding a temporary job to rerun the same test in CI a lot of times. This will very often reproduce the random failures. If you then try to change the test or code to fix the random failure, you can confirm that it's indeed fixed by using this job. A future improvement to this job would be to run it (or a variant of it) automatically for every newly added test, and maybe even changed tests. This is not implemented in this PR. An example of this job running can be found here: https://app.circleci.com/pipelines/github/citusdata/citus/26682/workflows/a2638385-35bc-443c-badc-7713a8101313 --- .circleci/config.yml | 115 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index b846264e8..bab2abdff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,6 +22,14 @@ parameters: style_checker_tools_version: type: string default: '0.8.18' + flaky_test: + type: string + default: '' + flaky_test_make: + type: string + default: check-minimal + + jobs: build: description: Build the citus extension @@ -529,9 +537,116 @@ jobs: name: install dependencies and run ch_benchmark tests no_output_timeout: 20m + test-flakyness: + description: Runs a test multiple times to see if it's flaky + parallelism: 32 + parameters: + pg_major: + description: 'postgres major version' + type: integer + image: + description: 'docker image to use as for the tests' + type: string + default: citus/exttester + image_tag: + description: 'docker image tag to use' + type: string + make: + description: 'make target' + type: string + default: check-minimal + test: + description: 'the test that should be run multiple times' + type: string + runs: + description: 'number of times that the test should be run in total' + type: integer + default: 1600 + docker: + - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' + working_directory: /home/circleci/project + resource_class: small + steps: + - checkout + - attach_workspace: + at: . + - run: + name: 'Install Extension' + command: | + tar xfv "${CIRCLE_WORKING_DIRECTORY}/install-${PG_MAJOR}.tar" --directory / + - run: + name: 'Configure' + command: | + chown -R circleci . 
+ gosu circleci ./configure --without-pg-version-check + - run: + name: 'Enable core dumps' + command: | + ulimit -c unlimited + - run: + name: 'Run minimal tests' + command: | + gosu circleci make -C src/test/regress << parameters.make >> EXTRA_TESTS="$(for i in $(seq << parameters.runs >> | circleci tests split); do echo -n '<< parameters.test >> ' ; done)" + no_output_timeout: 2m + - run: + name: 'Regressions' + command: | + if [ -f "src/test/regress/regression.diffs" ]; then + cat src/test/regress/regression.diffs + exit 1 + fi + when: on_fail + - run: + name: 'Copy coredumps' + command: | + mkdir -p /tmp/core_dumps + if ls core.* 1> /dev/null 2>&1; then + cp core.* /tmp/core_dumps + fi + when: on_fail + - store_artifacts: + name: 'Save regressions' + path: src/test/regress/regression.diffs + - store_artifacts: + name: 'Save mitmproxy output (failure test specific)' + path: src/test/regress/proxy.output + - store_artifacts: + name: 'Save results' + path: src/test/regress/results/ + - store_artifacts: + name: 'Save core dumps' + path: /tmp/core_dumps + - store_artifacts: + name: 'Save coordinator log' + path: src/test/regress/tmp_check/master/log + - store_artifacts: + name: 'Save worker1 log' + path: src/test/regress/tmp_check/worker.57637/log + - store_artifacts: + name: 'Save worker2 log' + path: src/test/regress/tmp_check/worker.57638/log + workflows: version: 2 + flaky_test_debugging: + when: << pipeline.parameters.flaky_test >> + jobs: + - build: + name: build-flaky-15 + pg_major: 15 + image_tag: '<< pipeline.parameters.pg15_version >>' + + - test-flakyness: + name: 'test-15_flaky' + pg_major: 15 + image_tag: '<< pipeline.parameters.pg15_version >>' + requires: [build-flaky-15] + make: '<< pipeline.parameters.flaky_test_make >>' + test: '<< pipeline.parameters.flaky_test >>' + build_and_test: + when: + not: << pipeline.parameters.flaky_test >> jobs: - build: name: build-13 From 317dda6af1e8b6c7bfa1f02f22436a146cd72df3 Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Thu, 1 Sep 2022 11:56:31 +0300 Subject: [PATCH 04/16] Use RelationGetPrimaryKeyIndex for citus catalog tables (#6262) pg_dist_node and pg_dist_colocation have a primary key index, not a replica identity index. Citus catalog tables are created in public schema, which has replica identity index by default as primary key index. Later the citus catalog tables are moved to pg_catalog schema. During pg_upgrade, all tables are recreated, and given that pg_dist_colocation is found in pg_catalog schema, it is recreated in that schema, and when it is recreated it doesn't have a replica identity index, because catalog tables have no replica identity. Further action: Do we even need to acquire this lock on the primary key index? Postgres doesn't acquire such locks on indexes before deleting catalog tuples. Also, catalog tuples don't have replica identities by definition. --- src/backend/distributed/metadata/node_metadata.c | 11 +++++++++-- src/backend/distributed/utils/colocation_utils.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/backend/distributed/metadata/node_metadata.c b/src/backend/distributed/metadata/node_metadata.c index 3f729eccf..f3c61b64c 100644 --- a/src/backend/distributed/metadata/node_metadata.c +++ b/src/backend/distributed/metadata/node_metadata.c @@ -2677,9 +2677,16 @@ DeleteNodeRow(char *nodeName, int32 nodePort) /* * simple_heap_delete() expects that the caller has at least an - * AccessShareLock on replica identity index. 
+ * AccessShareLock on primary key index.
+ *
+ * XXX: This does not seem required, do we really need to acquire this lock?
+ * Postgres doesn't acquire such locks on indexes before deleting catalog tuples.
+ * Linking here the reasons we added this lock acquirement:
+ * https://github.com/citusdata/citus/pull/2851#discussion_r306569462
+ * https://github.com/citusdata/citus/pull/2855#discussion_r313628554
+ * https://github.com/citusdata/citus/issues/1890
 */
- Relation replicaIndex = index_open(RelationGetReplicaIndex(pgDistNode),
+ Relation replicaIndex = index_open(RelationGetPrimaryKeyIndex(pgDistNode),
 AccessShareLock);
 
 ScanKeyInit(&scanKey[0], Anum_pg_dist_node_nodename,
diff --git a/src/backend/distributed/utils/colocation_utils.c b/src/backend/distributed/utils/colocation_utils.c
index bb9488af1..a5a32db0e 100644
--- a/src/backend/distributed/utils/colocation_utils.c
+++ b/src/backend/distributed/utils/colocation_utils.c
@@ -1303,10 +1303,17 @@ DeleteColocationGroupLocally(uint32 colocationId)
 {
 /*
 * simple_heap_delete() expects that the caller has at least an
- * AccessShareLock on replica identity index.
+ * AccessShareLock on primary key index.
+ *
+ * XXX: This does not seem required, do we really need to acquire this lock?
+ * Postgres doesn't acquire such locks on indexes before deleting catalog tuples.
+ * Linking here the reasons we added this lock acquirement:
+ * https://github.com/citusdata/citus/pull/2851#discussion_r306569462
+ * https://github.com/citusdata/citus/pull/2855#discussion_r313628554
+ * https://github.com/citusdata/citus/issues/1890
 */
 Relation replicaIndex =
- index_open(RelationGetReplicaIndex(pgDistColocation),
+ index_open(RelationGetPrimaryKeyIndex(pgDistColocation),
 AccessShareLock);
 
 simple_heap_delete(pgDistColocation, &(heapTuple->t_self));

From 9e2b96caa56e6a9486254ae945a3db73707a3440 Mon Sep 17 00:00:00 2001
From: Naisila Puka <37271756+naisila@users.noreply.github.com>
Date: Thu, 1 Sep 2022 12:32:44 +0300
Subject: [PATCH 05/16] Add pg14->pg15 upgrade test for dist. triggers on part. tables (#6265)

Pre-PG15, renaming the parent trigger on a partitioned table doesn't
recurse to renaming the child triggers on the partitions. In PG15,
renaming a trigger on a partitioned table recurses to renaming the
triggers on the partitions as well.

Add an upgrade test to make sure we are not breaking anything with
distributed triggers on distributed partitioned tables.
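
For illustration, the behavior difference that the upgrade test exercises
boils down to the minimal sketch below. The table, partition and trigger
names are the ones created by upgrade_distributed_triggers_before; nothing
beyond what the test itself runs is introduced here:

    ALTER TRIGGER record_sale_trigger ON sale
        RENAME TO renamed_record_sale_trigger;

    -- PG14: only the parent's trigger on "sale" gets the new name; the
    -- child triggers on sale_newyork and sale_california keep the old one.
    -- PG15: the child triggers on the partitions are renamed as well.
    SELECT tgrelid::regclass, tgname
    FROM pg_trigger
    WHERE tgname LIKE '%record_sale_trigger%'
    ORDER BY 1, 2;
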
Relevant PG commit: 80ba4bb383538a2ee846fece6a7b8da9518b6866 --- src/test/regress/after_pg_upgrade_schedule | 2 +- src/test/regress/before_pg_upgrade_schedule | 1 + .../upgrade_distributed_triggers_after.out | 226 ++++++++++++++++ .../upgrade_distributed_triggers_after_0.out | 16 ++ .../upgrade_distributed_triggers_before.out | 243 ++++++++++++++++++ .../upgrade_distributed_triggers_before_0.out | 14 + .../upgrade_distributed_triggers_after.sql | 72 ++++++ .../upgrade_distributed_triggers_before.sql | 116 +++++++++ 8 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_after.out create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_after_0.out create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_before.out create mode 100644 src/test/regress/expected/upgrade_distributed_triggers_before_0.out create mode 100644 src/test/regress/sql/upgrade_distributed_triggers_after.sql create mode 100644 src/test/regress/sql/upgrade_distributed_triggers_before.sql diff --git a/src/test/regress/after_pg_upgrade_schedule b/src/test/regress/after_pg_upgrade_schedule index 52b88d71d..6b8c3973f 100644 --- a/src/test/regress/after_pg_upgrade_schedule +++ b/src/test/regress/after_pg_upgrade_schedule @@ -1,4 +1,4 @@ -test: upgrade_basic_after upgrade_type_after upgrade_ref2ref_after upgrade_distributed_function_after upgrade_rebalance_strategy_after upgrade_list_citus_objects upgrade_autoconverted_after upgrade_citus_stat_activity upgrade_citus_locks +test: upgrade_basic_after upgrade_type_after upgrade_ref2ref_after upgrade_distributed_function_after upgrade_rebalance_strategy_after upgrade_list_citus_objects upgrade_autoconverted_after upgrade_citus_stat_activity upgrade_citus_locks upgrade_distributed_triggers_after # This attempts dropping citus extension (and rollbacks), so please do # not run in parallel with any other tests. diff --git a/src/test/regress/before_pg_upgrade_schedule b/src/test/regress/before_pg_upgrade_schedule index 93bcce368..671d6fa6f 100644 --- a/src/test/regress/before_pg_upgrade_schedule +++ b/src/test/regress/before_pg_upgrade_schedule @@ -8,6 +8,7 @@ test: upgrade_distributed_function_before upgrade_rebalance_strategy_before test: upgrade_autoconverted_before test: upgrade_citus_stat_activity test: upgrade_citus_locks +test: upgrade_distributed_triggers_before # upgrade_columnar_before renames public schema to citus_schema, so let's # run this test as the last one. diff --git a/src/test/regress/expected/upgrade_distributed_triggers_after.out b/src/test/regress/expected/upgrade_distributed_triggers_after.out new file mode 100644 index 000000000..591f33401 --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_after.out @@ -0,0 +1,226 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_AFTER +-- +-- In PG15, Renaming triggers on partitioned tables +-- recurses to renaming the triggers on the partitions as well. 
+-- Relevant PG commit: +-- 80ba4bb383538a2ee846fece6a7b8da9518b6866 +-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 15 AS server_version_eq_15 +\gset +\if :server_version_eq_15 +\else +\q +\endif +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,"ALTER SYSTEM") + (localhost,10202,t,"ALTER SYSTEM") +(2 rows) + +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,t) + (localhost,10202,t,t) +(2 rows) + +-- after PG15 upgrade, all child triggers have the same name with the parent triggers +-- check that the workers are also updated +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_renamed_trigger | sale_newyork | O + another_renamed_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + renamed_record_sale_trigger | sale_newyork | O + renamed_record_sale_trigger | sale_california | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | renamed_record_sale_trigger + sale_california | renamed_record_sale_trigger + sale_newyork | renamed_record_sale_trigger +(3 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,9) + (localhost,10202,t,9) +(2 rows) + +-- create another partition to verify that all is safe and sound +CREATE TABLE sale_alabama PARTITION OF sale FOR VALUES IN ('AL'); +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_renamed_trigger | sale_newyork | O + another_renamed_trigger | sale_california | O + another_renamed_trigger | sale_alabama | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + renamed_record_sale_trigger | sale | O + renamed_record_sale_trigger | sale_newyork | O + 
renamed_record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(16 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | renamed_record_sale_trigger + sale_alabama | renamed_record_sale_trigger + sale_california | renamed_record_sale_trigger + sale_newyork | renamed_record_sale_trigger +(4 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,12) + (localhost,10202,t,12) +(2 rows) + +-- drop a trigger to verify that all is safe and sound +DROP TRIGGER another_renamed_trigger ON sale; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + renamed_record_sale_trigger | sale | O + renamed_record_sale_trigger | sale_newyork | O + renamed_record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(12 rows) + +-- rename a trigger - note that it also renames the triggers on the partitions +ALTER TRIGGER "renamed_record_sale_trigger" ON "sale" RENAME TO "final_renamed_record_sale_trigger"; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + final_renamed_record_sale_trigger | sale | O + final_renamed_record_sale_trigger | sale_newyork | O + final_renamed_record_sale_trigger | sale_california | O + final_renamed_record_sale_trigger | sale_alabama | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(12 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname 
+--------------------------------------------------------------------- + sale | final_renamed_record_sale_trigger + sale_alabama | final_renamed_record_sale_trigger + sale_california | final_renamed_record_sale_trigger + sale_newyork | final_renamed_record_sale_trigger +(4 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,12) + (localhost,10202,t,12) +(2 rows) + +DROP TRIGGER final_renamed_record_sale_trigger ON sale; +-- create another trigger and rename it +CREATE TRIGGER yet_another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +ALTER TRIGGER "yet_another_trigger" ON "sale" RENAME TO "renamed_yet_another_trigger"; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + not_renamed_trigger | sale_alabama | O + renamed_yet_another_trigger | sale | O + renamed_yet_another_trigger | sale_newyork | O + renamed_yet_another_trigger | sale_california | O + renamed_yet_another_trigger | sale_alabama | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O + truncate_trigger_xxxxxxx | sale_alabama | O +(12 rows) + +DROP SCHEMA upgrade_distributed_triggers CASCADE; +NOTICE: drop cascades to 4 other objects +DETAIL: drop cascades to function record_sale() +drop cascades to table sale +drop cascades to table record_sale +drop cascades to view sale_triggers diff --git a/src/test/regress/expected/upgrade_distributed_triggers_after_0.out b/src/test/regress/expected/upgrade_distributed_triggers_after_0.out new file mode 100644 index 000000000..2b1f5cac1 --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_after_0.out @@ -0,0 +1,16 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_AFTER +-- +-- In PG15, Renaming triggers on partitioned tables +-- recurses to renaming the triggers on the partitions as well. +-- Relevant PG commit: +-- 80ba4bb383538a2ee846fece6a7b8da9518b6866 +-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 15 AS server_version_eq_15 +\gset +\if :server_version_eq_15 +\else +\q diff --git a/src/test/regress/expected/upgrade_distributed_triggers_before.out b/src/test/regress/expected/upgrade_distributed_triggers_before.out new file mode 100644 index 000000000..5e369c84c --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_before.out @@ -0,0 +1,243 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_BEFORE +-- +-- PRE PG15, Renaming the parent triggers on partitioned tables doesn't +-- recurse to renaming the child triggers on the partitions as well. 
+-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 14 AS server_version_eq_14 +\gset +\if :server_version_eq_14 +\else +\q +\endif +CREATE SCHEMA upgrade_distributed_triggers; +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,"ALTER SYSTEM") + (localhost,10202,t,"ALTER SYSTEM") +(2 rows) + +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,t) + (localhost,10202,t,t) +(2 rows) + +CREATE TABLE sale( + sale_date date not null, + state_code text, + product_sku text, + units integer) + PARTITION BY list (state_code); +ALTER TABLE sale ADD CONSTRAINT sale_pk PRIMARY KEY (state_code, sale_date); +CREATE TABLE sale_newyork PARTITION OF sale FOR VALUES IN ('NY'); +CREATE TABLE record_sale( + operation_type text not null, + product_sku text, + state_code text, + units integer, + PRIMARY KEY(state_code, product_sku, operation_type, units)); +SELECT create_distributed_table('sale', 'state_code'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE sale_california PARTITION OF sale FOR VALUES IN ('CA'); +SELECT create_distributed_table('record_sale', 'state_code', colocate_with := 'sale'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE OR REPLACE FUNCTION record_sale() +RETURNS trigger +AS $$ +BEGIN + INSERT INTO upgrade_distributed_triggers.record_sale(operation_type, product_sku, state_code, units) + VALUES (TG_OP, NEW.product_sku, NEW.state_code, NEW.units); + RETURN NULL; +END; +$$ LANGUAGE plpgsql; +-- will rename this trigger +CREATE TRIGGER record_sale_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +-- will rename this trigger +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +-- won't rename this trigger +CREATE TRIGGER not_renamed_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +-- Trigger function should appear on workers +SELECT proname from pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc; + proname +--------------------------------------------------------------------- + record_sale +(1 row) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,1) + (localhost,10202,t,1) +(2 rows) + +-- Trigger should appear on workers +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | record_sale_trigger + sale_california | record_sale_trigger + sale_newyork | record_sale_trigger +(3 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger 
WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,9) + (localhost,10202,t,9) +(2 rows) + +CREATE VIEW sale_triggers AS + SELECT tgname, tgrelid::regclass, tgenabled + FROM pg_trigger + WHERE tgrelid::regclass::text like 'sale%' + ORDER BY 1, 2; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_trigger | sale | O + another_trigger | sale_newyork | O + another_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + +-- rename the triggers - note that it doesn't rename the +-- triggers on the partitions +ALTER TRIGGER record_sale_trigger ON sale RENAME TO renamed_record_sale_trigger; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale_california | record_sale_trigger + sale_newyork | record_sale_trigger +(2 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- + sale | renamed_record_sale_trigger +(1 row) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,6) + (localhost,10202,t,6) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,3) + (localhost,10202,t,3) +(2 rows) + +ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_trigger | sale_newyork | O + another_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + +-- although the child triggers haven't been renamed to +-- another_renamed_trigger, they are dropped when the parent is dropped +DROP TRIGGER another_renamed_trigger ON sale; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + 
truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(9 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_renamed_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_trigger%' ORDER BY 1,2; + tgrelid | tgname +--------------------------------------------------------------------- +(0 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_renamed_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); +ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; + tgname | tgrelid | tgenabled +--------------------------------------------------------------------- + another_renamed_trigger | sale | O + another_trigger | sale_newyork | O + another_trigger | sale_california | O + not_renamed_trigger | sale | O + not_renamed_trigger | sale_newyork | O + not_renamed_trigger | sale_california | O + record_sale_trigger | sale_newyork | O + record_sale_trigger | sale_california | O + renamed_record_sale_trigger | sale | O + truncate_trigger_xxxxxxx | sale | O + truncate_trigger_xxxxxxx | sale_newyork | O + truncate_trigger_xxxxxxx | sale_california | O +(12 rows) + diff --git a/src/test/regress/expected/upgrade_distributed_triggers_before_0.out b/src/test/regress/expected/upgrade_distributed_triggers_before_0.out new file mode 100644 index 000000000..34a7cedf2 --- /dev/null +++ b/src/test/regress/expected/upgrade_distributed_triggers_before_0.out @@ -0,0 +1,14 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_BEFORE +-- +-- PRE PG15, Renaming the parent triggers on partitioned tables doesn't +-- recurse to renaming the child triggers on the partitions as well. +-- +-- this test is relevant only for pg14-15 upgrade +-- +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 14 AS server_version_eq_14 +\gset +\if :server_version_eq_14 +\else +\q diff --git a/src/test/regress/sql/upgrade_distributed_triggers_after.sql b/src/test/regress/sql/upgrade_distributed_triggers_after.sql new file mode 100644 index 000000000..bcbd7432e --- /dev/null +++ b/src/test/regress/sql/upgrade_distributed_triggers_after.sql @@ -0,0 +1,72 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_AFTER +-- +-- In PG15, Renaming triggers on partitioned tables +-- recurses to renaming the triggers on the partitions as well. 
+-- Relevant PG commit: +-- 80ba4bb383538a2ee846fece6a7b8da9518b6866 +-- +-- this test is relevant only for pg14-15 upgrade +-- + +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 15 AS server_version_eq_15 +\gset +\if :server_version_eq_15 +\else +\q +\endif + +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; + +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + +-- after PG15 upgrade, all child triggers have the same name with the parent triggers +-- check that the workers are also updated +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + +-- create another partition to verify that all is safe and sound +CREATE TABLE sale_alabama PARTITION OF sale FOR VALUES IN ('AL'); + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + +-- drop a trigger to verify that all is safe and sound +DROP TRIGGER another_renamed_trigger ON sale; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- rename a trigger - note that it also renames the triggers on the partitions +ALTER TRIGGER "renamed_record_sale_trigger" ON "sale" RENAME TO "final_renamed_record_sale_trigger"; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'final_renamed_record_sale_trigger%';$$); + +DROP TRIGGER final_renamed_record_sale_trigger ON sale; + +-- create another trigger and rename it +CREATE TRIGGER yet_another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +ALTER TRIGGER "yet_another_trigger" ON "sale" RENAME TO "renamed_yet_another_trigger"; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +DROP SCHEMA upgrade_distributed_triggers CASCADE; diff --git a/src/test/regress/sql/upgrade_distributed_triggers_before.sql b/src/test/regress/sql/upgrade_distributed_triggers_before.sql new file mode 100644 index 000000000..a1d4c9db3 --- /dev/null +++ b/src/test/regress/sql/upgrade_distributed_triggers_before.sql @@ -0,0 +1,116 @@ +-- +-- UPGRADE_DISTRIBUTED_TRIGGERS_BEFORE +-- +-- PRE PG15, Renaming the parent triggers on 
partitioned tables doesn't +-- recurse to renaming the child triggers on the partitions as well. +-- +-- this test is relevant only for pg14-15 upgrade +-- + +SHOW server_version \gset +SELECT substring(:'server_version', '\d+')::int = 14 AS server_version_eq_14 +\gset +\if :server_version_eq_14 +\else +\q +\endif + +CREATE SCHEMA upgrade_distributed_triggers; +SET search_path TO upgrade_distributed_triggers, public; +SET citus.shard_count TO 4; + +SET citus.enable_unsafe_triggers TO true; +SELECT run_command_on_workers('ALTER SYSTEM SET citus.enable_unsafe_triggers TO true;'); +SELECT run_command_on_workers('SELECT pg_reload_conf();'); + +CREATE TABLE sale( + sale_date date not null, + state_code text, + product_sku text, + units integer) + PARTITION BY list (state_code); + +ALTER TABLE sale ADD CONSTRAINT sale_pk PRIMARY KEY (state_code, sale_date); + +CREATE TABLE sale_newyork PARTITION OF sale FOR VALUES IN ('NY'); + +CREATE TABLE record_sale( + operation_type text not null, + product_sku text, + state_code text, + units integer, + PRIMARY KEY(state_code, product_sku, operation_type, units)); + +SELECT create_distributed_table('sale', 'state_code'); +CREATE TABLE sale_california PARTITION OF sale FOR VALUES IN ('CA'); +SELECT create_distributed_table('record_sale', 'state_code', colocate_with := 'sale'); + +CREATE OR REPLACE FUNCTION record_sale() +RETURNS trigger +AS $$ +BEGIN + INSERT INTO upgrade_distributed_triggers.record_sale(operation_type, product_sku, state_code, units) + VALUES (TG_OP, NEW.product_sku, NEW.state_code, NEW.units); + RETURN NULL; +END; +$$ LANGUAGE plpgsql; + +-- will rename this trigger +CREATE TRIGGER record_sale_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +-- will rename this trigger +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +-- won't rename this trigger +CREATE TRIGGER not_renamed_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +-- Trigger function should appear on workers +SELECT proname from pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_proc WHERE oid='upgrade_distributed_triggers.record_sale'::regproc$$); + +-- Trigger should appear on workers +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); + +CREATE VIEW sale_triggers AS + SELECT tgname, tgrelid::regclass, tgenabled + FROM pg_trigger + WHERE tgrelid::regclass::text like 'sale%' + ORDER BY 1, 2; + +SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- rename the triggers - note that it doesn't rename the +-- triggers on the partitions +ALTER TRIGGER record_sale_trigger ON sale RENAME TO renamed_record_sale_trigger; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'record_sale_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'record_sale_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'renamed_record_sale_trigger%';$$); + +ALTER TRIGGER 
another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- although the child triggers haven't been renamed to +-- another_renamed_trigger, they are dropped when the parent is dropped +DROP TRIGGER another_renamed_trigger ON sale; +SELECT * FROM sale_triggers ORDER BY 1, 2; + +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_renamed_trigger%' ORDER BY 1,2; +SELECT tgrelid::regclass::text, tgname FROM pg_trigger WHERE tgname like 'another_trigger%' ORDER BY 1,2; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_renamed_trigger%';$$); +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger%';$$); + +CREATE TRIGGER another_trigger +AFTER INSERT OR UPDATE OR DELETE ON sale +FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); + +ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; +SELECT * FROM sale_triggers ORDER BY 1, 2; From 432f399a5d27ea91652157b9e520afe6a28a229a Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Thu, 1 Sep 2022 14:26:43 +0200 Subject: [PATCH 06/16] Allow citus_internal application_name with additional suffix (#6282) Co-authored-by: Marco Slot --- .../distributed/transaction/backend_data.c | 16 ++++++++-------- .../regress/expected/metadata_sync_helpers.out | 16 ++++++++++++++++ src/test/regress/sql/metadata_sync_helpers.sql | 7 +++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 34d90969b..670469cd7 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -1064,16 +1064,16 @@ ExtractGlobalPID(char *applicationName) return INVALID_CITUS_INTERNAL_BACKEND_GPID; } - /* are the remaining characters of the application name numbers */ - uint64 numberOfRemainingChars = strlen(applicationNameCopy) - prefixLength; - if (numberOfRemainingChars <= 0 || - !strisdigit_s(applicationNameCopy + prefixLength, numberOfRemainingChars)) - { - return INVALID_CITUS_INTERNAL_BACKEND_GPID; - } - char *globalPIDString = &applicationNameCopy[prefixLength]; uint64 globalPID = strtoul(globalPIDString, NULL, 10); + if (globalPID == 0) + { + /* + * INVALID_CITUS_INTERNAL_BACKEND_GPID is 0, but just to be explicit + * about how we handle strtoul errors. + */ + return INVALID_CITUS_INTERNAL_BACKEND_GPID; + } return globalPID; } diff --git a/src/test/regress/expected/metadata_sync_helpers.out b/src/test/regress/expected/metadata_sync_helpers.out index 9800f5031..1939fcf1c 100644 --- a/src/test/regress/expected/metadata_sync_helpers.out +++ b/src/test/regress/expected/metadata_sync_helpers.out @@ -108,6 +108,22 @@ BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; SET application_name to 'citus_internal gpid=not a correct gpid'; SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); ERROR: This is an internal Citus function can only be used in a distributed transaction +ROLLBACK; +-- application_name with suffix is ok (e.g. 
pgbouncer might add this) +BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; + SELECT assign_distributed_transaction_id(0, 8, '2021-07-09 15:41:55.542377+02'); + assign_distributed_transaction_id +--------------------------------------------------------------------- + +(1 row) + + SET application_name to 'citus_internal gpid=10000000001 - from 10.12.14.16:10370'; + SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); + citus_internal_add_partition_metadata +--------------------------------------------------------------------- + +(1 row) + ROLLBACK; -- application_name with empty gpid BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; diff --git a/src/test/regress/sql/metadata_sync_helpers.sql b/src/test/regress/sql/metadata_sync_helpers.sql index 1f0ad09d0..ca32c5f16 100644 --- a/src/test/regress/sql/metadata_sync_helpers.sql +++ b/src/test/regress/sql/metadata_sync_helpers.sql @@ -75,6 +75,13 @@ BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); ROLLBACK; +-- application_name with suffix is ok (e.g. pgbouncer might add this) +BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; + SELECT assign_distributed_transaction_id(0, 8, '2021-07-09 15:41:55.542377+02'); + SET application_name to 'citus_internal gpid=10000000001 - from 10.12.14.16:10370'; + SELECT citus_internal_add_partition_metadata ('test_2'::regclass, 'h', 'col_1', 0, 's'); +ROLLBACK; + -- application_name with empty gpid BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED; SELECT assign_distributed_transaction_id(0, 8, '2021-07-09 15:41:55.542377+02'); From 7c8cc7fc6190a3109ada25fec8537daeb97f8f80 Mon Sep 17 00:00:00 2001 From: Ahmet Gedemenli Date: Fri, 2 Sep 2022 10:12:07 +0300 Subject: [PATCH 07/16] Fix flakiness for view tests (#6284) --- .../expected/citus_local_tables_mx.out | 49 +++++++++++++------ .../regress/sql/citus_local_tables_mx.sql | 16 ++++-- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/src/test/regress/expected/citus_local_tables_mx.out b/src/test/regress/expected/citus_local_tables_mx.out index 6fcad3612..3177ed9e9 100644 --- a/src/test/regress/expected/citus_local_tables_mx.out +++ b/src/test/regress/expected/citus_local_tables_mx.out @@ -887,22 +887,39 @@ ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); ERROR: Citus can not handle circular dependencies between distributed objects -- drop the view&matview with circular dependency DROP VIEW v103 CASCADE; -SET client_min_messages TO DEBUG1; -- now it should successfully add to metadata and create the views on workers ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); -DEBUG: executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v100 (a) AS SELECT loc_tb.a - FROM citus_local_tables_mx.loc_tb; ALTER VIEW citus_local_tables_mx.v100 OWNER TO postgres" -DEBUG: "view v100" has dependency to "table loc_tb" that is not in Citus' metadata -DEBUG: executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v101 (a) AS SELECT loc_tb.a - FROM (citus_local_tables_mx.loc_tb - JOIN citus_local_tables_mx.ref_tb USING (a)); ALTER VIEW citus_local_tables_mx.v101 OWNER TO postgres" -DEBUG: "view v101" has dependency to "table loc_tb" that is not in Citus' metadata -DEBUG: executing "CREATE MATERIALIZED VIEW citus_local_tables_mx.matview_101 USING heap AS SELECT loc_tb.a - FROM citus_local_tables_mx.loc_tb;ALTER MATERIALIZED VIEW citus_local_tables_mx.matview_101 OWNER TO postgres" -DEBUG: 
executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v102 (a) AS SELECT v101.a - FROM citus_local_tables_mx.v101; ALTER VIEW citus_local_tables_mx.v102 OWNER TO postgres" -DEBUG: "view v102" has dependency to "table loc_tb" that is not in Citus' metadata -DEBUG: validating foreign key constraint "fkey_xxxxxxx" +-- verify the views are created on workers +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v100$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,t) + (localhost,57638,t,t) +(2 rows) + +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v101$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,t) + (localhost,57638,t,t) +(2 rows) + +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v102$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,t) + (localhost,57638,t,t) +(2 rows) + +CREATE TABLE loc_tb_2 (a int); +CREATE VIEW v104 AS SELECT * from loc_tb_2; +SET client_min_messages TO DEBUG1; +-- verify the CREATE command for the view is generated correctly +ALTER TABLE loc_tb_2 ADD CONSTRAINT fkey_2 FOREIGN KEY (a) references ref_tb(a); +DEBUG: executing "CREATE OR REPLACE VIEW citus_local_tables_mx.v104 (a) AS SELECT loc_tb_2.a + FROM citus_local_tables_mx.loc_tb_2; ALTER VIEW citus_local_tables_mx.v104 OWNER TO postgres" +DEBUG: "view v104" has dependency to "table loc_tb_2" that is not in Citus' metadata +DEBUG: validating foreign key constraint "fkey_2_1330083" SET client_min_messages TO WARNING; -- works fine select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100, citus_local_tables_mx.v101, citus_local_tables_mx.v102$$); @@ -922,14 +939,14 @@ select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100$ (localhost,57638,f,"ERROR: relation ""citus_local_tables_mx.v100"" does not exist") (2 rows) - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); run_command_on_workers --------------------------------------------------------------------- (localhost,57637,f,"ERROR: relation ""citus_local_tables_mx.v101"" does not exist") (localhost,57638,f,"ERROR: relation ""citus_local_tables_mx.v101"" does not exist") (2 rows) - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); run_command_on_workers --------------------------------------------------------------------- (localhost,57637,f,"ERROR: relation ""citus_local_tables_mx.v102"" does not exist") diff --git a/src/test/regress/sql/citus_local_tables_mx.sql b/src/test/regress/sql/citus_local_tables_mx.sql index 2a2fb70d3..86307cacb 100644 --- a/src/test/regress/sql/citus_local_tables_mx.sql +++ b/src/test/regress/sql/citus_local_tables_mx.sql @@ -465,9 +465,19 @@ ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); -- drop the view&matview with circular dependency DROP VIEW v103 CASCADE; -SET client_min_messages TO DEBUG1; -- now it should successfully add to metadata and create the views on workers ALTER TABLE loc_tb ADD CONSTRAINT fkey FOREIGN KEY (a) references ref_tb(a); +-- verify the views are created on workers +select run_command_on_workers($$SELECT 
count(*)=0 from citus_local_tables_mx.v100$$); +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v101$$); +select run_command_on_workers($$SELECT count(*)=0 from citus_local_tables_mx.v102$$); + +CREATE TABLE loc_tb_2 (a int); +CREATE VIEW v104 AS SELECT * from loc_tb_2; + +SET client_min_messages TO DEBUG1; +-- verify the CREATE command for the view is generated correctly +ALTER TABLE loc_tb_2 ADD CONSTRAINT fkey_2 FOREIGN KEY (a) references ref_tb(a); SET client_min_messages TO WARNING; -- works fine @@ -477,8 +487,8 @@ select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100, ALTER TABLE loc_tb DROP CONSTRAINT fkey; -- fails because fkey is dropped and table is converted to local table select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v100$$); - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); - select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v101$$); +select run_command_on_workers($$SELECT count(*) from citus_local_tables_mx.v102$$); INSERT INTO loc_tb VALUES (1), (2); -- test a matview with columnar From 1c5b8588fe6702a7f2169e2b90a810b7c0246f36 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Fri, 2 Sep 2022 14:23:47 +0200 Subject: [PATCH 08/16] Address race condition in InitializeBackendData (#6285) Sometimes in CI our isolation_citus_dist_activity test fails randomly like this: ```diff step s2-view-dist: SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+-------------------------+-------------------+---------------+----------+--------+---------- INSERT INTO test_table VALUES (100, 100); |localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -(1 row) + + SELECT coalesce(to_jsonb(array_agg(csa_from_one_node.*)), '[{}]'::JSONB) + FROM ( + SELECT global_pid, worker_query AS is_worker_query, pg_stat_activity.* FROM + pg_stat_activity LEFT JOIN get_all_active_transactions() ON process_id = pid + ) AS csa_from_one_node; + |localhost | 57636|active | | |postgres|regression +(2 rows) step s3-view-worker: ``` Source: https://app.circleci.com/pipelines/github/citusdata/citus/26692/workflows/3406e4b4-b686-4667-bec6-8253ee0809b1/jobs/765119 I intended to fix this with #6263, but the fix turned out to be insufficient. This PR tries to address the issue by setting distributedCommandOriginator correctly in more situations. However, even with this change it's still possible to reproduce the flaky test in CI. In any case this should fix at least some instances of this issue. 
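As background, the application_name convention this relies on can be illustrated with a short standalone SQL sketch. This is only an illustration of the lenient gpid parsing behavior (the real logic lives in the C function ExtractGlobalPID changed in the earlier patch, which uses strtoul); the sample application_name values come from the regression tests, everything else here is illustrative:

```sql
-- take the digits right after the 'citus_internal gpid=' prefix and ignore
-- any trailing suffix; anything that does not match maps to 0, i.e. an
-- external client backend that originates distributed commands
SELECT app_name,
       COALESCE(substring(app_name FROM '^citus_internal gpid=(\d+)')::bigint, 0) AS gpid
FROM (VALUES
        ('citus_internal gpid=10000000001 - from 10.12.14.16:10370'),
        ('psql')
     ) AS t(app_name);
```

A connection pooler such as pgbouncer may append a suffix after the number, which is why the parse stops at the first non-digit instead of requiring an all-digit remainder.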
In passing this changes the isolation_citus_dist_activity test to allow running it multiple times in a row. --- src/backend/distributed/shared_library_init.c | 14 +++- .../distributed/transaction/backend_data.c | 55 +++++++------- src/include/distributed/backend_data.h | 6 +- .../isolation_citus_dist_activity.out | 72 +++++++++---------- .../spec/isolation_citus_dist_activity.spec | 2 +- 5 files changed, 79 insertions(+), 70 deletions(-) diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 4874a4042..a71467ad6 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -2643,10 +2643,17 @@ StatisticsCollectionGucCheckHook(bool *newval, void **extra, GucSource source) static void CitusAuthHook(Port *port, int status) { - uint64 gpid = ExtractGlobalPID(port->application_name); + /* + * We determine the backend type here because other calls in this hook rely + * on it, both IsExternalClientBackend and InitializeBackendData. These + * calls would normally initialize its value based on the application_name + * global, but this global is not set yet at this point in the connection + * initialization. So here we determine it based on the value from Port. + */ + DetermineCitusBackendType(port->application_name); /* external connections to not have a GPID immediately */ - if (gpid == INVALID_CITUS_INTERNAL_BACKEND_GPID) + if (IsExternalClientBackend()) { /* * We raise the shared connection counter pre-emptively. As a result, we may @@ -2698,7 +2705,8 @@ CitusAuthHook(Port *port, int status) * replication connection. A replication connection backend will never call * StartupCitusBackend, which normally sets up the global PID. */ - InitializeBackendData(gpid); + InitializeBackendData(port->application_name); + /* let other authentication hooks to kick in first */ if (original_client_auth_hook) diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index 670469cd7..e641b7a9b 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -105,9 +105,6 @@ static BackendData *MyBackendData = NULL; static CitusBackendType CurrentBackendType = CITUS_BACKEND_NOT_ASSIGNED; -static void DetermineCitusBackendType(void); - - PG_FUNCTION_INFO_V1(assign_distributed_transaction_id); PG_FUNCTION_INFO_V1(get_current_transaction_id); PG_FUNCTION_INFO_V1(get_global_active_transactions); @@ -681,7 +678,7 @@ TotalProcCount(void) * initialized this backend can be correctly shown in citus_lock_waits. 
*/ void -InitializeBackendData(uint64 globalPID) +InitializeBackendData(const char *applicationName) { if (MyBackendData != NULL) { @@ -693,6 +690,8 @@ InitializeBackendData(uint64 globalPID) return; } + uint64 gpid = ExtractGlobalPID(applicationName); + MyBackendData = &backendManagementShmemData->backends[MyProc->pgprocno]; Assert(MyBackendData); @@ -704,20 +703,8 @@ InitializeBackendData(uint64 globalPID) UnSetGlobalPID(); SpinLockAcquire(&MyBackendData->mutex); - - /* - * Use the given globalPID to initialize - */ - if (globalPID == INVALID_CITUS_INTERNAL_BACKEND_GPID) - { - MyBackendData->distributedCommandOriginator = - true; - } - else - { - MyBackendData->globalPID = globalPID; - MyBackendData->distributedCommandOriginator = false; - } + MyBackendData->distributedCommandOriginator = IsExternalClientBackend(); + MyBackendData->globalPID = gpid; SpinLockRelease(&MyBackendData->mutex); /* @@ -1045,7 +1032,7 @@ citus_pid_for_gpid(PG_FUNCTION_ARGS) * if the application name is not compatible with Citus' application names returns 0. */ uint64 -ExtractGlobalPID(char *applicationName) +ExtractGlobalPID(const char *applicationName) { /* does application name exist */ if (!applicationName) @@ -1371,7 +1358,7 @@ IsRebalancerInternalBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { - DetermineCitusBackendType(); + DetermineCitusBackendType(application_name); } return CurrentBackendType == CITUS_REBALANCER_BACKEND; @@ -1387,7 +1374,7 @@ IsCitusInternalBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { - DetermineCitusBackendType(); + DetermineCitusBackendType(application_name); } return CurrentBackendType == CITUS_INTERNAL_BACKEND; @@ -1403,29 +1390,41 @@ IsCitusRunCommandBackend(void) { if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) { - DetermineCitusBackendType(); + DetermineCitusBackendType(application_name); } return CurrentBackendType == CITUS_RUN_COMMAND_BACKEND; } +bool +IsExternalClientBackend(void) +{ + if (CurrentBackendType == CITUS_BACKEND_NOT_ASSIGNED) + { + DetermineCitusBackendType(application_name); + } + + return CurrentBackendType == EXTERNAL_CLIENT_BACKEND; +} + + /* * DetermineCitusBackendType determines the type of backend based on the application_name. 
*/ -static void -DetermineCitusBackendType(void) +void +DetermineCitusBackendType(const char *applicationName) { - if (ExtractGlobalPID(application_name) != INVALID_CITUS_INTERNAL_BACKEND_GPID) + if (ExtractGlobalPID(applicationName) != INVALID_CITUS_INTERNAL_BACKEND_GPID) { CurrentBackendType = CITUS_INTERNAL_BACKEND; } - else if (application_name && strcmp(application_name, CITUS_REBALANCER_NAME) == 0) + else if (applicationName && strcmp(applicationName, CITUS_REBALANCER_NAME) == 0) { CurrentBackendType = CITUS_REBALANCER_BACKEND; } - else if (application_name && - strcmp(application_name, CITUS_RUN_COMMAND_APPLICATION_NAME) == 0) + else if (applicationName && + strcmp(applicationName, CITUS_RUN_COMMAND_APPLICATION_NAME) == 0) { CurrentBackendType = CITUS_RUN_COMMAND_BACKEND; } diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h index 12dbfb6df..b0846c8a7 100644 --- a/src/include/distributed/backend_data.h +++ b/src/include/distributed/backend_data.h @@ -50,7 +50,7 @@ extern void BackendManagementShmemInit(void); extern size_t BackendManagementShmemSize(void); extern void InitializeBackendManagement(void); extern int TotalProcCount(void); -extern void InitializeBackendData(uint64 globalPID); +extern void InitializeBackendData(const char *applicationName); extern void LockBackendSharedMemory(LWLockMode lockMode); extern void UnlockBackendSharedMemory(void); extern void UnSetDistributedTransactionId(void); @@ -62,7 +62,7 @@ extern void SetBackendDataGlobalPID(uint64 globalPID); extern uint64 GetGlobalPID(void); extern void SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator); -extern uint64 ExtractGlobalPID(char *applicationName); +extern uint64 ExtractGlobalPID(const char *applicationName); extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk); extern int ExtractProcessIdFromGlobalPID(uint64 globalPID); extern void GetBackendDataForProc(PGPROC *proc, BackendData *result); @@ -74,9 +74,11 @@ extern LocalTransactionId GetMyProcLocalTransactionId(void); extern int GetExternalClientBackendCount(void); extern uint32 IncrementExternalClientBackendCounter(void); extern void DecrementExternalClientBackendCounter(void); +extern void DetermineCitusBackendType(const char *applicationName); extern bool IsCitusInternalBackend(void); extern bool IsRebalancerInternalBackend(void); extern bool IsCitusRunCommandBackend(void); +extern bool IsExternalClientBackend(void); extern void ResetCitusBackendType(void); #define INVALID_CITUS_INTERNAL_BACKEND_GPID 0 diff --git a/src/test/regress/expected/isolation_citus_dist_activity.out b/src/test/regress/expected/isolation_citus_dist_activity.out index 2f1810f7a..e674eed10 100644 --- a/src/test/regress/expected/isolation_citus_dist_activity.out +++ b/src/test/regress/expected/isolation_citus_dist_activity.out @@ -15,16 +15,16 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-alter-table: ALTER TABLE test_table ADD COLUMN x INT; step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -32,7 +32,7 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), 
('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -42,7 +42,7 @@ query |citus_nodename_for_nodeid|citus_n (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -61,13 +61,13 @@ SELECT worker_apply_shard_ddl_command (1300001, 'public', ' (4 rows) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; starting permutation: s1-cache-connections s1-begin s2-begin s3-begin s1-insert s2-sleep s2-view-dist s3-view-worker s2-rollback s1-commit s3-rollback @@ -85,16 +85,16 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-insert: - INSERT INTO test_table VALUES (100, 100); + INSERT INTO test_table VALUES (100, 100); step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -102,31 +102,31 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state 
|wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- - INSERT INTO test_table VALUES (100, 100); + INSERT INTO test_table VALUES (100, 100); |localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- -INSERT INTO public.test_table_1300008 (column1, column2) VALUES (100, 100)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +INSERT INTO public.test_table_1300003 (column1, column2) VALUES (100, 100)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (1 row) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; starting permutation: s1-cache-connections s1-begin s2-begin s3-begin s1-select s2-sleep s2-view-dist s3-view-worker s2-rollback s1-commit s3-rollback @@ -144,10 +144,10 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-select: SELECT count(*) FROM test_table; @@ -158,7 +158,7 @@ count (1 row) step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -166,7 +166,7 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -176,24 +176,24 @@ query |citus_nodename_for_nodeid|citus_nodeport_f (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), 
state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- -SELECT count(*) AS count FROM public.test_table_1300014 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -SELECT count(*) AS count FROM public.test_table_1300013 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -SELECT count(*) AS count FROM public.test_table_1300012 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression -SELECT count(*) AS count FROM public.test_table_1300011 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300004 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300003 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300002 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300001 test_table WHERE true|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (4 rows) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; starting permutation: s1-cache-connections s1-begin s2-begin s3-begin s1-select-router s2-sleep s2-view-dist s3-view-worker s2-rollback s1-commit s3-rollback @@ -211,10 +211,10 @@ step s1-begin: BEGIN; step s2-begin: - BEGIN; + BEGIN; step s3-begin: - BEGIN; + BEGIN; step s1-select-router: SELECT count(*) FROM test_table WHERE column1 = 55; @@ -225,7 +225,7 @@ count (1 row) step s2-sleep: - SELECT pg_sleep(0.5); + SELECT pg_sleep(0.5); pg_sleep --------------------------------------------------------------------- @@ -233,7 +233,7 @@ pg_sleep (1 row) step s2-view-dist: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), ('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_dist_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%BEGIN%'), 
('%pg_catalog.pg_isolation_test_session_is_blocked%'), ('%citus_add_node%')) AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- @@ -243,19 +243,19 @@ query |citus_nodename_for_node (1 row) step s3-view-worker: - SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; + SELECT query, citus_nodename_for_nodeid(citus_nodeid_for_gpid(global_pid)), citus_nodeport_for_nodeid(citus_nodeid_for_gpid(global_pid)), state, wait_event_type, wait_event, usename, datname FROM citus_stat_activity WHERE query NOT ILIKE ALL(VALUES('%pg_prepared_xacts%'), ('%COMMIT%'), ('%csa_from_one_node%')) AND is_worker_query = true AND backend_type = 'client backend' ORDER BY query DESC; query |citus_nodename_for_nodeid|citus_nodeport_for_nodeid|state |wait_event_type|wait_event|usename |datname --------------------------------------------------------------------- -SELECT count(*) AS count FROM public.test_table_1300017 test_table WHERE (column1 OPERATOR(pg_catalog.=) 55)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression +SELECT count(*) AS count FROM public.test_table_1300002 test_table WHERE (column1 OPERATOR(pg_catalog.=) 55)|localhost | 57636|idle in transaction|Client |ClientRead|postgres|regression (1 row) step s2-rollback: - ROLLBACK; + ROLLBACK; step s1-commit: COMMIT; step s3-rollback: - ROLLBACK; + ROLLBACK; diff --git a/src/test/regress/spec/isolation_citus_dist_activity.spec b/src/test/regress/spec/isolation_citus_dist_activity.spec index 9970ced85..5d62adbb1 100644 --- a/src/test/regress/spec/isolation_citus_dist_activity.spec +++ b/src/test/regress/spec/isolation_citus_dist_activity.spec @@ -9,7 +9,7 @@ setup AS 'citus', $$test_assign_global_pid$$; SET citus.shard_replication_factor TO 1; SET citus.shard_count TO 4; - select setval('pg_dist_shardid_seq', GREATEST(1300000, nextval('pg_dist_shardid_seq'))); + ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1300001; CREATE TABLE test_table(column1 int, column2 int); SELECT create_distributed_table('test_table', 'column1'); From bd1383664890913a839fdda09dc0e0e6b886b55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96nder=20Kalac=C4=B1?= Date: Mon, 5 Sep 2022 18:47:41 +0300 Subject: [PATCH 09/16] Add citus.skip_advisory_lock_permission_checks (#6293) --- src/backend/distributed/shared_library_init.c | 14 +++++++ src/backend/distributed/utils/resource_lock.c | 7 +++- src/include/distributed/resource_lock.h | 2 + .../regress/expected/adv_lock_permission.out | 41 ++++++++++++++++++ src/test/regress/multi_schedule | 2 +- src/test/regress/sql/adv_lock_permission.sql | 42 +++++++++++++++++++ 6 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 src/test/regress/expected/adv_lock_permission.out create mode 100644 src/test/regress/sql/adv_lock_permission.sql diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index a71467ad6..343ac2f0b 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ 
-2076,6 +2076,20 @@ RegisterCitusConfigVariables(void) ShowShardsForAppNamePrefixesAssignHook, NULL); + DefineCustomBoolVariable( + "citus.skip_advisory_lock_permission_checks", + gettext_noop("Postgres would normally enforce some " + "ownership checks while acquiring locks. " + "When this setting is 'off', Citus skips" + "ownership checks on internal advisory " + "locks."), + NULL, + &SkipAdvisoryLockPermissionChecks, + false, + GUC_SUPERUSER_ONLY, + GUC_NO_SHOW_ALL, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.skip_jsonb_validation_in_copy", gettext_noop("Skip validation of JSONB columns on the coordinator during COPY " diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c index 35991ade1..a4187aa65 100644 --- a/src/backend/distributed/utils/resource_lock.c +++ b/src/backend/distributed/utils/resource_lock.c @@ -109,6 +109,8 @@ PG_FUNCTION_INFO_V1(lock_relation_if_exists); /* Config variable managed via guc.c */ bool EnableAcquiringUnsafeLockFromWorkers = false; +bool SkipAdvisoryLockPermissionChecks = false; + /* * lock_shard_metadata allows the shard distribution metadata to be locked @@ -248,7 +250,10 @@ lock_shard_resources(PG_FUNCTION_ARGS) continue; } - EnsureTablePermissions(relationId, aclMask); + if (!SkipAdvisoryLockPermissionChecks) + { + EnsureTablePermissions(relationId, aclMask); + } LockShardResource(shardId, lockMode); } diff --git a/src/include/distributed/resource_lock.h b/src/include/distributed/resource_lock.h index f2a331451..375f6abbf 100644 --- a/src/include/distributed/resource_lock.h +++ b/src/include/distributed/resource_lock.h @@ -149,6 +149,7 @@ enum DistLockConfigs DIST_LOCK_NOWAIT = 2 }; + /* Lock shard/relation metadata for safe modifications */ extern void LockShardDistributionMetadata(int64 shardId, LOCKMODE lockMode); extern void LockPlacementCleanup(void); @@ -194,5 +195,6 @@ extern void AcquireDistributedLockOnRelations(List *relationList, LOCKMODE lockM extern void PreprocessLockStatement(LockStmt *stmt, ProcessUtilityContext context); extern bool EnableAcquiringUnsafeLockFromWorkers; +extern bool SkipAdvisoryLockPermissionChecks; #endif /* RESOURCE_LOCK_H */ diff --git a/src/test/regress/expected/adv_lock_permission.out b/src/test/regress/expected/adv_lock_permission.out new file mode 100644 index 000000000..d034fbb67 --- /dev/null +++ b/src/test/regress/expected/adv_lock_permission.out @@ -0,0 +1,41 @@ +CREATE SCHEMA adv_lock_permission; +SET search_path to adv_lock_permission; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +CREATE ROLE user_1 WITH LOGIN; +CREATE TABLE reference_table_1 (A int); +SELECT create_reference_table('reference_table_1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE reference_table_2 (A int); +SELECT create_reference_table('reference_table_2'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +GRANT USAGE ON SCHEMA adv_lock_permission TO user_1; +GRANT SELECT ON reference_table_1 TO user_1; +GRANT INSERT, UPDATE ON reference_table_2 TO user_1; +SET ROLE user_1; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +SET search_path to adv_lock_permission; +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; +ERROR: permission denied for table 
reference_table_1 +CONTEXT: while executing command on localhost:xxxxx +SET ROLE postgres; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +-- change the role so that it can skip permission checks +ALTER ROLE user_1 SET citus.skip_advisory_lock_permission_checks TO on; +SET ROLE user_1; +SET citus.max_cached_conns_per_worker TO 0; +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; +SET ROLE postgres; +SET client_min_messages TO ERROR; +DROP SCHEMA adv_lock_permission CASCADE; +DROP ROLE user_1; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index a92da42fd..ef64e7657 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -86,7 +86,7 @@ test: multi_agg_type_conversion multi_count_type_conversion recursive_relation_p test: multi_partition_pruning single_hash_repartition_join unsupported_lateral_subqueries test: multi_join_pruning multi_hash_pruning intermediate_result_pruning test: multi_null_minmax_value_pruning cursors -test: modification_correctness +test: modification_correctness adv_lock_permission test: multi_query_directory_cleanup test: multi_task_assignment_policy multi_cross_shard test: multi_utility_statements diff --git a/src/test/regress/sql/adv_lock_permission.sql b/src/test/regress/sql/adv_lock_permission.sql new file mode 100644 index 000000000..2528fdc39 --- /dev/null +++ b/src/test/regress/sql/adv_lock_permission.sql @@ -0,0 +1,42 @@ +CREATE SCHEMA adv_lock_permission; +SET search_path to adv_lock_permission; + +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; + +CREATE ROLE user_1 WITH LOGIN; + +CREATE TABLE reference_table_1 (A int); +SELECT create_reference_table('reference_table_1'); + +CREATE TABLE reference_table_2 (A int); +SELECT create_reference_table('reference_table_2'); + +GRANT USAGE ON SCHEMA adv_lock_permission TO user_1; +GRANT SELECT ON reference_table_1 TO user_1; +GRANT INSERT, UPDATE ON reference_table_2 TO user_1; + +SET ROLE user_1; + +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; +SET search_path to adv_lock_permission; + +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; + +SET ROLE postgres; +-- do not cache any connections, we change some settings and don't want old ones cached +SET citus.max_cached_conns_per_worker TO 0; + +-- change the role so that it can skip permission checks +ALTER ROLE user_1 SET citus.skip_advisory_lock_permission_checks TO on; + +SET ROLE user_1; + +SET citus.max_cached_conns_per_worker TO 0; +INSERT INTO reference_table_2 SELECT * FROM reference_table_1; + +SET ROLE postgres; +SET client_min_messages TO ERROR; +DROP SCHEMA adv_lock_permission CASCADE; +DROP ROLE user_1; From e6b184593108affeff19a93a0751128c5ad804ce Mon Sep 17 00:00:00 2001 From: Marco Slot Date: Mon, 5 Sep 2022 22:02:18 +0200 Subject: [PATCH 10/16] Change split logic to avoid EnsureReferenceTablesExistOnAllNodesExtended (#6208) Co-authored-by: Marco Slot --- .../distributed/operations/shard_split.c | 3 +- .../distributed/utils/reference_table_utils.c | 79 +++++++++++++++++++ .../distributed/reference_table_utils.h | 1 + .../multi_replicate_reference_table.out | 32 ++++++++ .../sql/multi_replicate_reference_table.sql | 15 ++++ 5 files changed, 129 insertions(+), 1 deletion(-) diff --git 
a/src/backend/distributed/operations/shard_split.c b/src/backend/distributed/operations/shard_split.c index bc4ad208b..f7116d565 100644 --- a/src/backend/distributed/operations/shard_split.c +++ b/src/backend/distributed/operations/shard_split.c @@ -501,6 +501,8 @@ SplitShard(SplitMode splitMode, List *workersForPlacementList = GetWorkerNodesFromWorkerIds(nodeIdsForPlacementList); + ErrorIfNotAllNodesHaveReferenceTableReplicas(workersForPlacementList); + List *sourceColocatedShardIntervalList = NIL; if (colocatedShardIntervalList == NIL) { @@ -517,7 +519,6 @@ SplitShard(SplitMode splitMode, if (splitMode == BLOCKING_SPLIT) { - EnsureReferenceTablesExistOnAllNodesExtended(TRANSFER_MODE_BLOCK_WRITES); BlockingShardSplit( splitOperation, splitWorkflowId, diff --git a/src/backend/distributed/utils/reference_table_utils.c b/src/backend/distributed/utils/reference_table_utils.c index 23e608a18..f4b2f4be3 100644 --- a/src/backend/distributed/utils/reference_table_utils.c +++ b/src/backend/distributed/utils/reference_table_utils.c @@ -50,6 +50,7 @@ static void ReplicateReferenceTableShardToNode(ShardInterval *shardInterval, int nodePort); static bool AnyRelationsModifiedInTransaction(List *relationIdList); static List * ReplicatedMetadataSyncedDistributedTableList(void); +static bool NodeHasAllReferenceTableReplicas(WorkerNode *workerNode); /* exports for SQL callable functions */ PG_FUNCTION_INFO_V1(upgrade_to_reference_table); @@ -688,3 +689,81 @@ ReplicateAllReferenceTablesToNode(WorkerNode *workerNode) } } } + + +/* + * ErrorIfNotAllNodesHaveReferenceTableReplicas throws an error when one of the + * nodes in the list does not have reference table replicas. + */ +void +ErrorIfNotAllNodesHaveReferenceTableReplicas(List *workerNodeList) +{ + WorkerNode *workerNode = NULL; + + foreach_ptr(workerNode, workerNodeList) + { + if (!NodeHasAllReferenceTableReplicas(workerNode)) + { + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("reference tables have not been replicated to " + "node %s:%d yet", + workerNode->workerName, + workerNode->workerPort), + errdetail("Reference tables are lazily replicated after " + "adding a node, but must exist before shards can " + "be created on that node."), + errhint("Run SELECT replicate_reference_tables(); to " + "ensure reference tables exist on all nodes."))); + } + } +} + + +/* + * NodeHasAllReferenceTablesReplicas returns whether the given worker node has reference + * table replicas. If there are no reference tables the function returns true. + * + * This function does not do any locking, so the situation could change immediately after, + * though we can only ever transition from false to true, so only "false" could be the + * incorrect answer. + * + * In the case where the function returns true because no reference tables exist + * on the node, a reference table could be created immediately after. However, the + * creation logic guarantees that this reference table will be created on all the + * nodes, so our answer was correct. 
+ */ +static bool +NodeHasAllReferenceTableReplicas(WorkerNode *workerNode) +{ + List *referenceTableIdList = CitusTableTypeIdList(REFERENCE_TABLE); + + if (list_length(referenceTableIdList) == 0) + { + /* no reference tables exist */ + return true; + } + + Oid referenceTableId = linitial_oid(referenceTableIdList); + List *shardIntervalList = LoadShardIntervalList(referenceTableId); + if (list_length(shardIntervalList) != 1) + { + /* check for corrupt metadata */ + ereport(ERROR, (errmsg("reference table \"%s\" can only have 1 shard", + get_rel_name(referenceTableId)))); + } + + ShardInterval *shardInterval = (ShardInterval *) linitial(shardIntervalList); + List *shardPlacementList = ActiveShardPlacementList(shardInterval->shardId); + + ShardPlacement *placement = NULL; + foreach_ptr(placement, shardPlacementList) + { + if (placement->groupId == workerNode->groupId) + { + /* our worker has a reference table placement */ + return true; + } + } + + return false; +} diff --git a/src/include/distributed/reference_table_utils.h b/src/include/distributed/reference_table_utils.h index a26f16630..80b282126 100644 --- a/src/include/distributed/reference_table_utils.h +++ b/src/include/distributed/reference_table_utils.h @@ -26,5 +26,6 @@ extern void DeleteAllReplicatedTablePlacementsFromNodeGroup(int32 groupId, bool localOnly); extern int CompareOids(const void *leftElement, const void *rightElement); extern void ReplicateAllReferenceTablesToNode(WorkerNode *workerNode); +extern void ErrorIfNotAllNodesHaveReferenceTableReplicas(List *workerNodeList); #endif /* REFERENCE_TABLE_UTILS_H_ */ diff --git a/src/test/regress/expected/multi_replicate_reference_table.out b/src/test/regress/expected/multi_replicate_reference_table.out index 2dd670108..afec8052b 100644 --- a/src/test/regress/expected/multi_replicate_reference_table.out +++ b/src/test/regress/expected/multi_replicate_reference_table.out @@ -1150,6 +1150,38 @@ CREATE TABLE test (x int, y int references ref(a)); SELECT create_distributed_table('test','x'); ERROR: canceling the transaction since it was involved in a distributed deadlock END; +-- verify the split fails if we still need to replicate reference tables +SELECT citus_remove_node('localhost', :worker_2_port); + citus_remove_node +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('test','x'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT citus_add_node('localhost', :worker_2_port); + citus_add_node +--------------------------------------------------------------------- + 1370022 +(1 row) + +SELECT + citus_split_shard_by_split_points(shardid, + ARRAY[(shardminvalue::int + (shardmaxvalue::int - shardminvalue::int)/2)::text], + ARRAY[nodeid, nodeid], + 'force_logical') +FROM + pg_dist_shard, pg_dist_node +WHERE + logicalrelid = 'replicate_reference_table.test'::regclass AND nodename = 'localhost' AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; +ERROR: reference tables have not been replicated to node localhost:xxxxx yet +DETAIL: Reference tables are lazily replicated after adding a node, but must exist before shards can be created on that node. +HINT: Run SELECT replicate_reference_tables(); to ensure reference tables exist on all nodes. 
-- test adding an invalid node while we have reference tables to replicate -- set client message level to ERROR and verbosity to terse to supporess -- OS-dependent host name resolution warnings diff --git a/src/test/regress/sql/multi_replicate_reference_table.sql b/src/test/regress/sql/multi_replicate_reference_table.sql index c34007ad7..172d08f35 100644 --- a/src/test/regress/sql/multi_replicate_reference_table.sql +++ b/src/test/regress/sql/multi_replicate_reference_table.sql @@ -700,6 +700,21 @@ CREATE TABLE test (x int, y int references ref(a)); SELECT create_distributed_table('test','x'); END; +-- verify the split fails if we still need to replicate reference tables +SELECT citus_remove_node('localhost', :worker_2_port); +SELECT create_distributed_table('test','x'); +SELECT citus_add_node('localhost', :worker_2_port); +SELECT + citus_split_shard_by_split_points(shardid, + ARRAY[(shardminvalue::int + (shardmaxvalue::int - shardminvalue::int)/2)::text], + ARRAY[nodeid, nodeid], + 'force_logical') +FROM + pg_dist_shard, pg_dist_node +WHERE + logicalrelid = 'replicate_reference_table.test'::regclass AND nodename = 'localhost' AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + -- test adding an invalid node while we have reference tables to replicate -- set client message level to ERROR and verbosity to terse to supporess -- OS-dependent host name resolution warnings From fd9b3f4ae9a1a79fa15849f8fd901d5a71203eb0 Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Tue, 6 Sep 2022 11:04:14 +0300 Subject: [PATCH 11/16] Add tests to make sure distributed clone trigger rename fails in PG15 (#6291) Relevant PG commit: 80ba4bb383538a2ee846fece6a7b8da9518b6866 --- src/test/regress/expected/pg15.out | 4 ++++ .../upgrade_distributed_triggers_after.out | 17 +++++++++++++++++ src/test/regress/sql/pg15.sql | 3 +++ .../sql/upgrade_distributed_triggers_after.sql | 5 +++++ 4 files changed, 29 insertions(+) diff --git a/src/test/regress/expected/pg15.out b/src/test/regress/expected/pg15.out index f59586b7a..1b214f4a9 100644 --- a/src/test/regress/expected/pg15.out +++ b/src/test/regress/expected/pg15.out @@ -168,6 +168,10 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; truncate_trigger_xxxxxxx | sale_newyork | O (6 rows) +-- test that we can't rename a distributed clone trigger +ALTER TRIGGER "new_record_sale_trigger" ON "pg15"."sale_newyork" RENAME TO "another_trigger_name"; +ERROR: cannot rename trigger "new_record_sale_trigger" on table "sale_newyork" +HINT: Rename trigger on partitioned table "sale" instead. -- -- In PG15, For GENERATED columns, all dependencies of the generation -- expression are recorded as NORMAL dependencies of the column itself. diff --git a/src/test/regress/expected/upgrade_distributed_triggers_after.out b/src/test/regress/expected/upgrade_distributed_triggers_after.out index 591f33401..2e3f7330b 100644 --- a/src/test/regress/expected/upgrade_distributed_triggers_after.out +++ b/src/test/regress/expected/upgrade_distributed_triggers_after.out @@ -218,6 +218,23 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; truncate_trigger_xxxxxxx | sale_alabama | O (12 rows) +-- after upgrade to PG15, test that we can't rename a distributed clone trigger +ALTER TRIGGER "renamed_yet_another_trigger" ON "sale_alabama" RENAME TO "another_trigger_name"; +ERROR: cannot rename trigger "renamed_yet_another_trigger" on table "sale_alabama" +HINT: Rename trigger on partitioned table "sale" instead. 
+SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%'; + count +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%';$$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,10201,t,0) + (localhost,10202,t,0) +(2 rows) + DROP SCHEMA upgrade_distributed_triggers CASCADE; NOTICE: drop cascades to 4 other objects DETAIL: drop cascades to function record_sale() diff --git a/src/test/regress/sql/pg15.sql b/src/test/regress/sql/pg15.sql index cefc419e8..73cee8054 100644 --- a/src/test/regress/sql/pg15.sql +++ b/src/test/regress/sql/pg15.sql @@ -114,6 +114,9 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; ALTER TRIGGER "record_sale_trigger" ON "pg15"."sale" RENAME TO "new_record_sale_trigger"; SELECT * FROM sale_triggers ORDER BY 1, 2; +-- test that we can't rename a distributed clone trigger +ALTER TRIGGER "new_record_sale_trigger" ON "pg15"."sale_newyork" RENAME TO "another_trigger_name"; + -- -- In PG15, For GENERATED columns, all dependencies of the generation -- expression are recorded as NORMAL dependencies of the column itself. diff --git a/src/test/regress/sql/upgrade_distributed_triggers_after.sql b/src/test/regress/sql/upgrade_distributed_triggers_after.sql index bcbd7432e..681f1896b 100644 --- a/src/test/regress/sql/upgrade_distributed_triggers_after.sql +++ b/src/test/regress/sql/upgrade_distributed_triggers_after.sql @@ -69,4 +69,9 @@ ALTER TRIGGER "yet_another_trigger" ON "sale" RENAME TO "renamed_yet_another_tri SELECT * FROM sale_triggers ORDER BY 1, 2; +-- after upgrade to PG15, test that we can't rename a distributed clone trigger +ALTER TRIGGER "renamed_yet_another_trigger" ON "sale_alabama" RENAME TO "another_trigger_name"; +SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%'; +SELECT run_command_on_workers($$SELECT count(*) FROM pg_trigger WHERE tgname like 'another_trigger_name%';$$); + DROP SCHEMA upgrade_distributed_triggers CASCADE; From d7f41cacbe52c756279b32ae39c95f47b59261fc Mon Sep 17 00:00:00 2001 From: Naisila Puka <37271756+naisila@users.noreply.github.com> Date: Tue, 6 Sep 2022 12:19:25 +0300 Subject: [PATCH 12/16] Prohibit renaming child trigger on distributed partition pre PG15 (#6290) Pre PG15, renaming child triggers on partitions is allowed. When creating a trigger on a distributed parent partitioned table, the triggers on the shards of the partitions have the same name as the triggers on the corresponding parent shards of the parent table. Therefore, they don't have the same appended shard id as the shard id of the partition. Hence, when trying to rename a child trigger on a partition of a distributed table, we can't correctly find the triggers on the shards of the partition in order to rename them, since we append a different shard id to the name of the trigger. Since we can't find the trigger, we get a misleading error claiming the trigger does not exist. In this commit we prohibit renaming child triggers on distributed partitions altogether. 
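To make the new behavior concrete, here is a minimal sketch of the command that is now rejected on PostgreSQL 14 and earlier; it simply mirrors the regression test added in this patch (the partitioned table sale, its partition sale_newyork, and the trigger another_trigger all come from the existing upgrade_distributed_triggers tests):

    -- sale is a distributed, partitioned table; another_trigger was created on sale,
    -- so its clone on the partition sale_newyork can no longer be renamed directly
    ALTER TRIGGER another_trigger ON sale_newyork RENAME TO another_renamed_trigger;
    -- ERROR: cannot rename child triggers on distributed partitions

On PG15 the same command is already rejected by PostgreSQL itself (with a hint to rename the trigger on the parent partitioned table instead), which is why the new check is only compiled for PG_VERSION_NUM < PG_VERSION_15.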
--- .../commands/distribute_object_ops.c | 2 +- src/backend/distributed/commands/trigger.c | 95 +++++++++++++++++++ src/include/distributed/commands.h | 2 + .../upgrade_distributed_triggers_before.out | 3 + .../upgrade_distributed_triggers_before.sql | 3 + 5 files changed, 104 insertions(+), 1 deletion(-) diff --git a/src/backend/distributed/commands/distribute_object_ops.c b/src/backend/distributed/commands/distribute_object_ops.c index 198d1df9a..74a7ce69b 100644 --- a/src/backend/distributed/commands/distribute_object_ops.c +++ b/src/backend/distributed/commands/distribute_object_ops.c @@ -1176,7 +1176,7 @@ static DistributeObjectOps View_Rename = { static DistributeObjectOps Trigger_Rename = { .deparse = NULL, .qualify = NULL, - .preprocess = NULL, + .preprocess = PreprocessAlterTriggerRenameStmt, .operationType = DIST_OPS_ALTER, .postprocess = PostprocessAlterTriggerRenameStmt, .address = NULL, diff --git a/src/backend/distributed/commands/trigger.c b/src/backend/distributed/commands/trigger.c index ffb0ab5ae..3bb2417e7 100644 --- a/src/backend/distributed/commands/trigger.c +++ b/src/backend/distributed/commands/trigger.c @@ -53,6 +53,9 @@ static void ExtractDropStmtTriggerAndRelationName(DropStmt *dropTriggerStmt, char **relationName); static void ErrorIfDropStmtDropsMultipleTriggers(DropStmt *dropTriggerStmt); static int16 GetTriggerTypeById(Oid triggerId); +#if (PG_VERSION_NUM < PG_VERSION_15) +static void ErrorOutIfCloneTrigger(Oid tgrelid, const char *tgname); +#endif /* GUC that overrides trigger checks for distributed tables and reference tables */ @@ -319,6 +322,40 @@ CreateTriggerEventExtendNames(CreateTrigStmt *createTriggerStmt, char *schemaNam } +/* + * PreprocessAlterTriggerRenameStmt is called before a ALTER TRIGGER RENAME + * command has been executed by standard process utility. This function errors + * out if we are trying to rename a child trigger on a partition of a distributed + * table. In PG15, this is not allowed anyway. + */ +List * +PreprocessAlterTriggerRenameStmt(Node *node, const char *queryString, + ProcessUtilityContext processUtilityContext) +{ +#if (PG_VERSION_NUM < PG_VERSION_15) + RenameStmt *renameTriggerStmt = castNode(RenameStmt, node); + Assert(renameTriggerStmt->renameType == OBJECT_TRIGGER); + + RangeVar *relation = renameTriggerStmt->relation; + + bool missingOk = false; + Oid relationId = RangeVarGetRelid(relation, ALTER_TRIGGER_LOCK_MODE, missingOk); + + if (!IsCitusTable(relationId)) + { + return NIL; + } + + EnsureCoordinator(); + ErrorOutForTriggerIfNotSupported(relationId); + + ErrorOutIfCloneTrigger(relationId, renameTriggerStmt->subname); +#endif + + return NIL; +} + + /* * PostprocessAlterTriggerRenameStmt is called after a ALTER TRIGGER RENAME * command has been executed by standard process utility. This function errors @@ -611,6 +648,64 @@ ErrorOutForTriggerIfNotSupported(Oid relationId) } +#if (PG_VERSION_NUM < PG_VERSION_15) + +/* + * ErrorOutIfCloneTrigger is a helper function to error + * out if we are trying to rename a child trigger on a + * partition of a distributed table. + * A lot of this code is borrowed from PG15 because + * renaming clone triggers isn't allowed in PG15 anymore. + */ +static void +ErrorOutIfCloneTrigger(Oid tgrelid, const char *tgname) +{ + HeapTuple tuple; + ScanKeyData key[2]; + + Relation tgrel = table_open(TriggerRelationId, RowExclusiveLock); + + /* + * Search for the trigger to modify. 
+ */ + ScanKeyInit(&key[0], + Anum_pg_trigger_tgrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(tgrelid)); + ScanKeyInit(&key[1], + Anum_pg_trigger_tgname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(tgname)); + SysScanDesc tgscan = systable_beginscan(tgrel, TriggerRelidNameIndexId, true, + NULL, 2, key); + + if (HeapTupleIsValid(tuple = systable_getnext(tgscan))) + { + Form_pg_trigger trigform = (Form_pg_trigger) GETSTRUCT(tuple); + + /* + * If the trigger descends from a trigger on a parent partitioned + * table, reject the rename. + * Appended shard ids to find the trigger on the partition's shards + * are not correct. Hence we would fail to find the trigger on the + * partition's shard. + */ + if (OidIsValid(trigform->tgparentid)) + { + ereport(ERROR, ( + errmsg( + "cannot rename child triggers on distributed partitions"))); + } + } + + systable_endscan(tgscan); + table_close(tgrel, RowExclusiveLock); +} + + +#endif + + /* * GetDropTriggerStmtRelation takes a DropStmt for a trigger object and returns * RangeVar for the relation that owns the trigger. diff --git a/src/include/distributed/commands.h b/src/include/distributed/commands.h index b7030adee..f660fe19d 100644 --- a/src/include/distributed/commands.h +++ b/src/include/distributed/commands.h @@ -667,6 +667,8 @@ extern List * CreateTriggerStmtObjectAddress(Node *node, bool missingOk, bool isPostprocess); extern void CreateTriggerEventExtendNames(CreateTrigStmt *createTriggerStmt, char *schemaName, uint64 shardId); +extern List * PreprocessAlterTriggerRenameStmt(Node *node, const char *queryString, + ProcessUtilityContext processUtilityContext); extern List * PostprocessAlterTriggerRenameStmt(Node *node, const char *queryString); extern void AlterTriggerRenameEventExtendNames(RenameStmt *renameTriggerStmt, char *schemaName, uint64 shardId); diff --git a/src/test/regress/expected/upgrade_distributed_triggers_before.out b/src/test/regress/expected/upgrade_distributed_triggers_before.out index 5e369c84c..d049c2b0a 100644 --- a/src/test/regress/expected/upgrade_distributed_triggers_before.out +++ b/src/test/regress/expected/upgrade_distributed_triggers_before.out @@ -241,3 +241,6 @@ SELECT * FROM sale_triggers ORDER BY 1, 2; truncate_trigger_xxxxxxx | sale_california | O (12 rows) +-- check that we can't rename child triggers on partitions of distributed tables +ALTER TRIGGER another_trigger ON sale_newyork RENAME TO another_renamed_trigger; +ERROR: cannot rename child triggers on distributed partitions diff --git a/src/test/regress/sql/upgrade_distributed_triggers_before.sql b/src/test/regress/sql/upgrade_distributed_triggers_before.sql index a1d4c9db3..9e0e5b7bb 100644 --- a/src/test/regress/sql/upgrade_distributed_triggers_before.sql +++ b/src/test/regress/sql/upgrade_distributed_triggers_before.sql @@ -114,3 +114,6 @@ FOR EACH ROW EXECUTE FUNCTION upgrade_distributed_triggers.record_sale(); ALTER TRIGGER another_trigger ON sale RENAME TO another_renamed_trigger; SELECT * FROM sale_triggers ORDER BY 1, 2; + +-- check that we can't rename child triggers on partitions of distributed tables +ALTER TRIGGER another_trigger ON sale_newyork RENAME TO another_renamed_trigger; From 85b19c851accf35a86ae8bb63fdd9abfa0ef58b9 Mon Sep 17 00:00:00 2001 From: Hanefi Onaldi Date: Mon, 5 Sep 2022 16:05:46 +0300 Subject: [PATCH 13/16] Disallow distributing by numeric with negative scale PG15 allows numeric scale to be negative or greater than precision. 
This causes issues and we may end up routing queries to a wrong shard due to differing hash results after rounding. Formerly, when specifying NUMERIC(precision, scale), the scale had to be in the range [0, precision], which was per SQL spec. PG15 extends the range of allowed scales to [-1000, 1000]. A negative scale implies rounding before the decimal point. For example, a column might be declared with a scale of -3 to round values to the nearest thousand. Note that the display scale remains non-negative, so in this case the display scale will be zero, and all digits before the decimal point will be displayed. Relevant PG commit: 085f931f52494e1f304e35571924efa6fcdc2b44 --- .../commands/create_distributed_table.c | 75 +++++++++++++++++++ src/test/regress/expected/pg15.out | 31 +++++++- src/test/regress/sql/pg15.sql | 10 +++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/src/backend/distributed/commands/create_distributed_table.c b/src/backend/distributed/commands/create_distributed_table.c index fd35acce2..a3ff6c4cb 100644 --- a/src/backend/distributed/commands/create_distributed_table.c +++ b/src/backend/distributed/commands/create_distributed_table.c @@ -139,6 +139,14 @@ static Oid DropFKeysAndUndistributeTable(Oid relationId); static void DropFKeysRelationInvolvedWithTableType(Oid relationId, int tableTypeFlag); static void CopyLocalDataIntoShards(Oid relationId); static List * TupleDescColumnNameList(TupleDesc tupleDescriptor); + +#if (PG_VERSION_NUM >= PG_VERSION_15) +static bool DistributionColumnUsesNumericColumnNegativeScale(TupleDesc relationDesc, + Var *distributionColumn); +static int numeric_typmod_scale(int32 typmod); +static bool is_valid_numeric_typmod(int32 typmod); +#endif + static bool DistributionColumnUsesGeneratedStoredColumn(TupleDesc relationDesc, Var *distributionColumn); static bool CanUseExclusiveConnections(Oid relationId, bool localTableEmpty); @@ -1681,6 +1689,20 @@ EnsureRelationCanBeDistributed(Oid relationId, Var *distributionColumn, "AS (...) STORED."))); } +#if (PG_VERSION_NUM >= PG_VERSION_15) + + /* verify target relation is not distributed by a column of type numeric with negative scale */ + if (distributionMethod != DISTRIBUTE_BY_NONE && + DistributionColumnUsesNumericColumnNegativeScale(relationDesc, + distributionColumn)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot distribute relation: %s", relationName), + errdetail("Distribution column must not use numeric type " + "with negative scale"))); + } +#endif + /* check for support function needed by specified partition method */ if (distributionMethod == DISTRIBUTE_BY_HASH) { @@ -2401,6 +2423,59 @@ RelationUsesIdentityColumns(TupleDesc relationDesc) } +#if (PG_VERSION_NUM >= PG_VERSION_15) + +/* + * is_valid_numeric_typmod checks if the typmod value is valid + * + * Because of the offset, valid numeric typmods are at least VARHDRSZ + * + * Copied from PG. See numeric.c for understanding how this works. + */ +static bool +is_valid_numeric_typmod(int32 typmod) +{ + return typmod >= (int32) VARHDRSZ; +} + + +/* + * numeric_typmod_scale extracts the scale from a numeric typmod. + * + * Copied from PG. See numeric.c for understanding how this works. 
+ * + */ +static int +numeric_typmod_scale(int32 typmod) +{ + return (((typmod - VARHDRSZ) & 0x7ff) ^ 1024) - 1024; +} + + +/* + * DistributionColumnUsesNumericColumnNegativeScale returns whether a given relation uses + * numeric data type with negative scale on distribution column + */ +static bool +DistributionColumnUsesNumericColumnNegativeScale(TupleDesc relationDesc, + Var *distributionColumn) +{ + Form_pg_attribute attributeForm = TupleDescAttr(relationDesc, + distributionColumn->varattno - 1); + + if (attributeForm->atttypid == NUMERICOID && + is_valid_numeric_typmod(attributeForm->atttypmod) && + numeric_typmod_scale(attributeForm->atttypmod) < 0) + { + return true; + } + + return false; +} + + +#endif + /* * DistributionColumnUsesGeneratedStoredColumn returns whether a given relation uses * GENERATED ALWAYS AS (...) STORED on distribution column diff --git a/src/test/regress/expected/pg15.out b/src/test/regress/expected/pg15.out index 1b214f4a9..e4d24af35 100644 --- a/src/test/regress/expected/pg15.out +++ b/src/test/regress/expected/pg15.out @@ -392,9 +392,37 @@ ON (true) WHEN MATCHED THEN UPDATE SET x = (SELECT count(*) FROM tbl2); ERROR: MERGE command is not supported on Citus tables yet +-- test numeric types with negative scale +CREATE TABLE numeric_negative_scale(numeric_column numeric(3,-1), orig_value int); +INSERT into numeric_negative_scale SELECT x,x FROM generate_series(111, 115) x; +-- verify that we can not distribute by a column that has numeric type with negative scale +SELECT create_distributed_table('numeric_negative_scale','numeric_column'); +ERROR: cannot distribute relation: numeric_negative_scale +DETAIL: Distribution column must not use numeric type with negative scale +-- However, we can distribute by other columns +SELECT create_distributed_table('numeric_negative_scale','orig_value'); +NOTICE: Copying data from local table... +NOTICE: copying the data has completed +DETAIL: The local data in the table is no longer visible, but is still on disk. 
+HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$pg15.numeric_negative_scale$$) + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM numeric_negative_scale ORDER BY 1,2; + numeric_column | orig_value +--------------------------------------------------------------------- + 110 | 111 + 110 | 112 + 110 | 113 + 110 | 114 + 120 | 115 +(5 rows) + -- Clean up DROP SCHEMA pg15 CASCADE; -NOTICE: drop cascades to 9 other objects +NOTICE: drop cascades to 10 other objects DETAIL: drop cascades to collation german_phonebook_test drop cascades to collation default_provider drop cascades to table sale @@ -404,3 +432,4 @@ drop cascades to view sale_triggers drop cascades to table generated_stored_ref drop cascades to table tbl1 drop cascades to table tbl2 +drop cascades to table numeric_negative_scale diff --git a/src/test/regress/sql/pg15.sql b/src/test/regress/sql/pg15.sql index 73cee8054..189299bd8 100644 --- a/src/test/regress/sql/pg15.sql +++ b/src/test/regress/sql/pg15.sql @@ -245,5 +245,15 @@ ON (true) WHEN MATCHED THEN UPDATE SET x = (SELECT count(*) FROM tbl2); +-- test numeric types with negative scale +CREATE TABLE numeric_negative_scale(numeric_column numeric(3,-1), orig_value int); +INSERT into numeric_negative_scale SELECT x,x FROM generate_series(111, 115) x; +-- verify that we can not distribute by a column that has numeric type with negative scale +SELECT create_distributed_table('numeric_negative_scale','numeric_column'); +-- However, we can distribute by other columns +SELECT create_distributed_table('numeric_negative_scale','orig_value'); + +SELECT * FROM numeric_negative_scale ORDER BY 1,2; + -- Clean up DROP SCHEMA pg15 CASCADE; From 69726648ab190c65f487427737fb59b851e305d7 Mon Sep 17 00:00:00 2001 From: aykut-bozkurt <51649454+aykut-bozkurt@users.noreply.github.com> Date: Tue, 6 Sep 2022 16:29:14 +0300 Subject: [PATCH 14/16] verify shards if exists for insert, delete, update (#6280) Co-authored-by: Marco Slot --- .../distributed/executor/citus_custom_scan.c | 76 ++++ .../distributed/metadata/metadata_cache.c | 122 ++++--- .../planner/combine_query_planner.c | 3 +- .../distributed/planner/deparse_shard_query.c | 15 +- .../planner/multi_router_planner.c | 3 - src/backend/distributed/utils/shard_utils.c | 3 + .../distributed/combine_query_planner.h | 1 + src/include/distributed/metadata_cache.h | 2 +- .../isolation_blocking_shard_split.out | 38 +- .../isolation_non_blocking_shard_split.out | 6 +- .../expected/isolation_tenant_isolation.out | 263 ++++++++++---- ...isolation_tenant_isolation_nonblocking.out | 324 ++++++++++++++---- .../spec/isolation_tenant_isolation.spec | 9 + ...solation_tenant_isolation_nonblocking.spec | 9 + 14 files changed, 655 insertions(+), 219 deletions(-) diff --git a/src/backend/distributed/executor/citus_custom_scan.c b/src/backend/distributed/executor/citus_custom_scan.c index 7ce986203..5e4afd1a7 100644 --- a/src/backend/distributed/executor/citus_custom_scan.c +++ b/src/backend/distributed/executor/citus_custom_scan.c @@ -31,6 +31,7 @@ #include "distributed/multi_server_executor.h" #include "distributed/multi_router_planner.h" #include "distributed/query_stats.h" +#include "distributed/shard_utils.h" #include "distributed/subplan_execution.h" #include "distributed/worker_log_messages.h" #include "distributed/worker_protocol.h" @@ -67,6 +68,9 @@ static void CitusEndScan(CustomScanState *node); static void CitusReScan(CustomScanState *node); 
static void SetJobColocationId(Job *job); static void EnsureForceDelegationDistributionKey(Job *job); +static void EnsureAnchorShardsInJobExist(Job *job); +static bool AnchorShardsInTaskListExist(List *taskList); +static void TryToRerouteFastPathModifyQuery(Job *job); /* create custom scan methods for all executors */ @@ -406,6 +410,19 @@ CitusBeginModifyScan(CustomScanState *node, EState *estate, int eflags) /* prevent concurrent placement changes */ AcquireMetadataLocks(workerJob->taskList); + /* + * In case of a split, the shard might no longer be available. In that + * case try to reroute. We can only do this for fast path queries. + */ + if (currentPlan->fastPathRouterPlan && + !AnchorShardsInTaskListExist(workerJob->taskList)) + { + TryToRerouteFastPathModifyQuery(workerJob); + } + + /* ensure there is no invalid shard */ + EnsureAnchorShardsInJobExist(workerJob); + /* modify tasks are always assigned using first-replica policy */ workerJob->taskList = FirstReplicaAssignTaskList(workerJob->taskList); } @@ -440,6 +457,65 @@ CitusBeginModifyScan(CustomScanState *node, EState *estate, int eflags) } +/* + * TryToRerouteFastPathModifyQuery tries to reroute non-existent shards in given job if it finds any such shard, + * only for fastpath queries. + * + * Should only be called if the job belongs to a fastpath modify query + */ +static void +TryToRerouteFastPathModifyQuery(Job *job) +{ + if (job->jobQuery->commandType == CMD_INSERT) + { + RegenerateTaskListForInsert(job); + } + else + { + RegenerateTaskForFasthPathQuery(job); + RebuildQueryStrings(job); + } +} + + +/* + * EnsureAnchorShardsInJobExist ensures all shards are valid in job. + * If it finds a non-existent shard in given job, it throws an error. + */ +static void +EnsureAnchorShardsInJobExist(Job *job) +{ + if (!AnchorShardsInTaskListExist(job->taskList)) + { + ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("shard for the given value does not exist"), + errdetail( + "A concurrent shard split may have moved the data into a new set of shards."), + errhint("Retry the query."))); + } +} + + +/* + * AnchorShardsInTaskListExist checks whether all the anchor shards in the task list + * still exist. + */ +static bool +AnchorShardsInTaskListExist(List *taskList) +{ + Task *task = NULL; + foreach_ptr(task, taskList) + { + if (!ShardExists(task->anchorShardId)) + { + return false; + } + } + + return true; +} + + /* * ModifyJobNeedsEvaluation checks whether the functions and parameters in the job query * need to be evaluated before we can build task query strings. 
diff --git a/src/backend/distributed/metadata/metadata_cache.c b/src/backend/distributed/metadata/metadata_cache.c index afce210c8..7779c5e07 100644 --- a/src/backend/distributed/metadata/metadata_cache.c +++ b/src/backend/distributed/metadata/metadata_cache.c @@ -231,7 +231,7 @@ static ScanKeyData DistObjectScanKey[3]; /* local function forward declarations */ static HeapTuple PgDistPartitionTupleViaCatalog(Oid relationId); -static ShardIdCacheEntry * LookupShardIdCacheEntry(int64 shardId); +static ShardIdCacheEntry * LookupShardIdCacheEntry(int64 shardId, bool missingOk); static CitusTableCacheEntry * BuildCitusTableCacheEntry(Oid relationId); static void BuildCachedShardList(CitusTableCacheEntry *cacheEntry); static void PrepareWorkerNodeCache(void); @@ -280,10 +280,11 @@ static Oid LookupEnumValueId(Oid typeId, char *valueName); static void InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot); static void InvalidateDistTableCache(void); static void InvalidateDistObjectCache(void); -static void InitializeTableCacheEntry(int64 shardId); +static bool InitializeTableCacheEntry(int64 shardId, bool missingOk); static bool IsCitusTableTypeInternal(char partitionMethod, char replicationModel, CitusTableType tableType); -static bool RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry); +static bool RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry, bool + missingOk); static Oid DistAuthinfoRelationId(void); static Oid DistAuthinfoIndexId(void); @@ -781,7 +782,8 @@ CitusTableList(void) ShardInterval * LoadShardInterval(uint64 shardId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; @@ -798,13 +800,33 @@ LoadShardInterval(uint64 shardId) } +/* + * ShardExists returns whether given shard exists or not. It fails if missingOk is false + * and shard is not found. + */ +bool +ShardExists(uint64 shardId) +{ + bool missingOk = true; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); + + if (!shardIdEntry) + { + return false; + } + + return true; +} + + /* * RelationIdOfShard returns the relationId of the given shardId. 
*/ Oid RelationIdForShard(uint64 shardId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; return tableEntry->relationId; } @@ -817,7 +839,8 @@ RelationIdForShard(uint64 shardId) bool ReferenceTableShardId(uint64 shardId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; return IsCitusTableTypeCacheEntry(tableEntry, REFERENCE_TABLE); } @@ -835,7 +858,8 @@ DistributedTableShardId(uint64 shardId) return false; } - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; return IsCitusTableTypeCacheEntry(tableEntry, DISTRIBUTED_TABLE); } @@ -850,7 +874,8 @@ DistributedTableShardId(uint64 shardId) GroupShardPlacement * LoadGroupShardPlacement(uint64 shardId, uint64 placementId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; @@ -885,7 +910,8 @@ LoadGroupShardPlacement(uint64 shardId, uint64 placementId) ShardPlacement * LoadShardPlacement(uint64 shardId, uint64 placementId) { - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; GroupShardPlacement *groupPlacement = LoadGroupShardPlacement(shardId, placementId); @@ -908,7 +934,8 @@ ShardPlacementOnGroupIncludingOrphanedPlacements(int32 groupId, uint64 shardId) { ShardPlacement *placementOnNode = NULL; - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; GroupShardPlacement *placementArray = @@ -1128,7 +1155,8 @@ ShardPlacementListIncludingOrphanedPlacements(uint64 shardId) { List *placementList = NIL; - ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId); + bool missingOk = false; + ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk); CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry; int shardIndex = shardIdEntry->shardIndex; @@ -1168,15 +1196,24 @@ ShardPlacementListIncludingOrphanedPlacements(uint64 shardId) * build the cache entry. Afterwards we know that the shard has to be in the * cache if it exists. If the shard does *not* exist, this function errors * (because LookupShardRelationFromCatalog errors out). + * + * If missingOk is true and the shard cannot be found, the function returns false. 
*/ -static void -InitializeTableCacheEntry(int64 shardId) +static bool +InitializeTableCacheEntry(int64 shardId, bool missingOk) { - bool missingOk = false; Oid relationId = LookupShardRelationFromCatalog(shardId, missingOk); + if (!OidIsValid(relationId)) + { + Assert(missingOk); + return false; + } + /* trigger building the cache for the shard id */ GetCitusTableCacheEntry(relationId); /* lgtm[cpp/return-value-ignored] */ + + return true; } @@ -1186,7 +1223,7 @@ InitializeTableCacheEntry(int64 shardId) * entry in the cache and false if it didn't. */ static bool -RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry) +RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry, bool missingOk) { /* * We might have some concurrent metadata changes. In order to get the changes, @@ -1198,7 +1235,8 @@ RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry) return false; } Oid oldRelationId = shardEntry->tableEntry->relationId; - Oid currentRelationId = LookupShardRelationFromCatalog(shardEntry->shardId, false); + Oid currentRelationId = LookupShardRelationFromCatalog(shardEntry->shardId, + missingOk); /* * The relation OID to which the shard belongs could have changed, @@ -1213,11 +1251,12 @@ RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry) /* - * LookupShardCacheEntry returns the cache entry belonging to a shard, or - * errors out if that shard is unknown. + * LookupShardCacheEntry returns the cache entry belonging to a shard. + * It errors out if that shard is unknown and missingOk is false. Else, + * it will return a NULL cache entry. */ static ShardIdCacheEntry * -LookupShardIdCacheEntry(int64 shardId) +LookupShardIdCacheEntry(int64 shardId, bool missingOk) { bool foundInCache = false; bool recheck = false; @@ -1231,12 +1270,16 @@ LookupShardIdCacheEntry(int64 shardId) if (!foundInCache) { - InitializeTableCacheEntry(shardId); + if (!InitializeTableCacheEntry(shardId, missingOk)) + { + return NULL; + } + recheck = true; } else { - recheck = RefreshTableCacheEntryIfInvalid(shardEntry); + recheck = RefreshTableCacheEntryIfInvalid(shardEntry, missingOk); } /* @@ -1250,7 +1293,8 @@ LookupShardIdCacheEntry(int64 shardId) if (!foundInCache) { - ereport(ERROR, (errmsg("could not find valid entry for shard " + int eflag = (missingOk) ? DEBUG1 : ERROR; + ereport(eflag, (errmsg("could not find valid entry for shard " UINT64_FORMAT, shardId))); } } @@ -4253,6 +4297,9 @@ InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot) { /* reload the metadata */ cacheSlot->citusTableMetadata->isValid = false; + + /* clean up ShardIdCacheHash */ + RemoveStaleShardIdCacheEntries(cacheSlot->citusTableMetadata); } } @@ -4653,37 +4700,6 @@ LookupShardRelationFromCatalog(int64 shardId, bool missingOk) } -/* - * ShardExists returns whether the given shard ID exists in pg_dist_shard. 
- */ -bool -ShardExists(int64 shardId) -{ - ScanKeyData scanKey[1]; - int scanKeyCount = 1; - Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock); - bool shardExists = false; - - ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid, - BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId)); - - SysScanDesc scanDescriptor = systable_beginscan(pgDistShard, - DistShardShardidIndexId(), true, - NULL, scanKeyCount, scanKey); - - HeapTuple heapTuple = systable_getnext(scanDescriptor); - if (HeapTupleIsValid(heapTuple)) - { - shardExists = true; - } - - systable_endscan(scanDescriptor); - table_close(pgDistShard, NoLock); - - return shardExists; -} - - /* * GetPartitionTypeInputInfo populates output parameters with the interval type * identifier and modifier for the specified partition key/method combination. diff --git a/src/backend/distributed/planner/combine_query_planner.c b/src/backend/distributed/planner/combine_query_planner.c index 0a871f3e6..1ada6bcc0 100644 --- a/src/backend/distributed/planner/combine_query_planner.c +++ b/src/backend/distributed/planner/combine_query_planner.c @@ -30,7 +30,6 @@ static List * RemoteScanTargetList(List *workerTargetList); static PlannedStmt * BuildSelectStatementViaStdPlanner(Query *combineQuery, List *remoteScanTargetList, CustomScan *remoteScan); -static bool FindCitusExtradataContainerRTE(Node *node, RangeTblEntry **result); static Plan * CitusCustomScanPathPlan(PlannerInfo *root, RelOptInfo *rel, struct CustomPath *best_path, List *tlist, @@ -323,7 +322,7 @@ BuildSelectStatementViaStdPlanner(Query *combineQuery, List *remoteScanTargetLis * Finds the rangetable entry in the query that refers to the citus_extradata_container * and stores the pointer in result. */ -static bool +bool FindCitusExtradataContainerRTE(Node *node, RangeTblEntry **result) { if (node == NULL) diff --git a/src/backend/distributed/planner/deparse_shard_query.c b/src/backend/distributed/planner/deparse_shard_query.c index 11338df65..1c5c62034 100644 --- a/src/backend/distributed/planner/deparse_shard_query.c +++ b/src/backend/distributed/planner/deparse_shard_query.c @@ -17,6 +17,7 @@ #include "catalog/pg_constraint.h" #include "distributed/citus_nodefuncs.h" #include "distributed/citus_ruleutils.h" +#include "distributed/combine_query_planner.h" #include "distributed/deparse_shard_query.h" #include "distributed/insert_select_planner.h" #include "distributed/listutils.h" @@ -79,12 +80,13 @@ RebuildQueryStrings(Job *workerJob) if (UpdateOrDeleteQuery(query)) { + List *relationShardList = task->relationShardList; + /* * For UPDATE and DELETE queries, we may have subqueries and joins, so * we use relation shard list to update shard names and call * pg_get_query_def() directly. 
*/ - List *relationShardList = task->relationShardList; UpdateRelationToShardNames((Node *) query, relationShardList); } else if (query->commandType == CMD_INSERT && task->modifyWithSubquery) @@ -229,7 +231,16 @@ UpdateRelationToShardNames(Node *node, List *relationShardList) RangeTblEntry *newRte = (RangeTblEntry *) node; - if (newRte->rtekind != RTE_RELATION) + if (newRte->rtekind == RTE_FUNCTION) + { + newRte = NULL; + if (!FindCitusExtradataContainerRTE(node, &newRte)) + { + /* only update function rtes containing citus_extradata_container */ + return false; + } + } + else if (newRte->rtekind != RTE_RELATION) { return false; } diff --git a/src/backend/distributed/planner/multi_router_planner.c b/src/backend/distributed/planner/multi_router_planner.c index 3193305a2..7335d1c2d 100644 --- a/src/backend/distributed/planner/multi_router_planner.c +++ b/src/backend/distributed/planner/multi_router_planner.c @@ -2677,9 +2677,6 @@ TargetShardIntervalForFastPathQuery(Query *query, bool *isMultiShardQuery, } /* we're only expecting single shard from a single table */ - Node *distKey PG_USED_FOR_ASSERTS_ONLY = NULL; - Assert(FastPathRouterQuery(query, &distKey) || !EnableFastPathRouterPlanner); - if (list_length(prunedShardIntervalList) > 1) { *isMultiShardQuery = true; diff --git a/src/backend/distributed/utils/shard_utils.c b/src/backend/distributed/utils/shard_utils.c index 7d5a63fa0..d6d41f192 100644 --- a/src/backend/distributed/utils/shard_utils.c +++ b/src/backend/distributed/utils/shard_utils.c @@ -16,7 +16,10 @@ #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" #include "distributed/coordinator_protocol.h" +#include "distributed/listutils.h" +#include "distributed/log_utils.h" #include "distributed/metadata_utility.h" +#include "distributed/multi_physical_planner.h" #include "distributed/relay_utility.h" #include "distributed/shard_utils.h" diff --git a/src/include/distributed/combine_query_planner.h b/src/include/distributed/combine_query_planner.h index c4702b781..710010913 100644 --- a/src/include/distributed/combine_query_planner.h +++ b/src/include/distributed/combine_query_planner.h @@ -27,6 +27,7 @@ extern Path * CreateCitusCustomScanPath(PlannerInfo *root, RelOptInfo *relOptInf CustomScan *remoteScan); extern PlannedStmt * PlanCombineQuery(struct DistributedPlan *distributedPlan, struct CustomScan *dataScan); +extern bool FindCitusExtradataContainerRTE(Node *node, RangeTblEntry **result); extern bool ReplaceCitusExtraDataContainer; extern CustomScan *ReplaceCitusExtraDataContainerWithCustomScan; diff --git a/src/include/distributed/metadata_cache.h b/src/include/distributed/metadata_cache.h index 1a5939879..2b55cf8d9 100644 --- a/src/include/distributed/metadata_cache.h +++ b/src/include/distributed/metadata_cache.h @@ -157,6 +157,7 @@ extern uint32 ColocationIdViaCatalog(Oid relationId); extern bool IsCitusLocalTableByDistParams(char partitionMethod, char replicationModel); extern List * CitusTableList(void); extern ShardInterval * LoadShardInterval(uint64 shardId); +extern bool ShardExists(uint64 shardId); extern Oid RelationIdForShard(uint64 shardId); extern bool ReferenceTableShardId(uint64 shardId); extern bool DistributedTableShardId(uint64 shardId); @@ -174,7 +175,6 @@ extern int32 GetLocalNodeId(void); extern void CitusTableCacheFlushInvalidatedEntries(void); extern Oid LookupShardRelationFromCatalog(int64 shardId, bool missing_ok); extern List * ShardPlacementListIncludingOrphanedPlacements(uint64 shardId); -extern bool ShardExists(int64 shardId); extern void 
CitusInvalidateRelcacheByRelid(Oid relationId); extern void CitusInvalidateRelcacheByShardId(int64 shardId); extern void InvalidateForeignKeyGraph(void); diff --git a/src/test/regress/expected/isolation_blocking_shard_split.out b/src/test/regress/expected/isolation_blocking_shard_split.out index d720f3a32..b5b7dc71b 100644 --- a/src/test/regress/expected/isolation_blocking_shard_split.out +++ b/src/test/regress/expected/isolation_blocking_shard_split.out @@ -56,7 +56,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -80,7 +79,7 @@ nodeport|shardid|success|result id|value --------------------------------------------------------------------- -123456789| 1 +123456789| 111 (1 row) @@ -140,7 +139,6 @@ step s2-commit: COMMIT; step s1-delete: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -158,14 +156,13 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 1 + 57637|1500003|t | 0 57638|1500004|t | 0 (3 rows) - id|value +id|value --------------------------------------------------------------------- -123456789| 1 -(1 row) +(0 rows) starting permutation: s1-load-cache s1-begin s1-select s2-begin s2-blocking-shard-split s1-insert s2-commit s1-commit s2-print-cluster @@ -221,7 +218,6 @@ get_shard_id_for_distribution_column 1500002 (1 row) -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -239,13 +235,14 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 0 + 57637|1500003|t | 1 57638|1500004|t | 0 (3 rows) -id|value + id|value --------------------------------------------------------------------- -(0 rows) +123456789| 1 +(1 row) starting permutation: s1-load-cache s1-begin s1-select s2-begin s2-blocking-shard-split s1-copy s2-commit s1-commit s2-print-cluster @@ -373,7 +370,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -397,7 +393,7 @@ nodeport|shardid|success|result id|value --------------------------------------------------------------------- -123456789| 1 +123456789| 111 (1 row) @@ -453,7 +449,6 @@ step s2-commit: COMMIT; step s1-delete: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -471,14 +466,13 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 1 + 57637|1500003|t | 0 57638|1500004|t | 0 (3 rows) - id|value +id|value --------------------------------------------------------------------- -123456789| 1 -(1 row) +(0 rows) starting permutation: s1-begin s1-select s2-begin s2-blocking-shard-split s1-insert s2-commit s1-commit s2-print-cluster @@ -530,7 +524,6 @@ get_shard_id_for_distribution_column 1500002 (1 row) -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -548,13 +541,14 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500001|t | 0 - 57637|1500003|t | 0 + 57637|1500003|t | 1 57638|1500004|t | 0 (3 rows) -id|value + id|value --------------------------------------------------------------------- -(0 rows) +123456789| 1 +(1 row) starting permutation: s1-begin s1-select s2-begin s2-blocking-shard-split s1-copy s2-commit s1-commit s2-print-cluster diff --git a/src/test/regress/expected/isolation_non_blocking_shard_split.out b/src/test/regress/expected/isolation_non_blocking_shard_split.out index 24af03012..e52dff295 100644 --- a/src/test/regress/expected/isolation_non_blocking_shard_split.out +++ b/src/test/regress/expected/isolation_non_blocking_shard_split.out @@ -434,7 +434,6 @@ step s1-end: COMMIT; step s4-insert: <... completed> -ERROR: could not find valid entry for shard xxxxx step s4-end: COMMIT; @@ -452,13 +451,14 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57638|1500002|t | 0 - 57638|1500003|t | 1 + 57638|1500003|t | 2 (2 rows) id|value --------------------------------------------------------------------- + 900| 1 123456789| 1 -(1 row) +(2 rows) starting permutation: s2-print-cluster s3-acquire-advisory-lock s1-begin s2-begin s1-non-blocking-shard-split s2-insert s2-end s2-print-cluster s3-release-advisory-lock s1-end s2-print-cluster diff --git a/src/test/regress/expected/isolation_tenant_isolation.out b/src/test/regress/expected/isolation_tenant_isolation.out index f663aa05d..e2b2e4e43 100644 --- a/src/test/regress/expected/isolation_tenant_isolation.out +++ b/src/test/regress/expected/isolation_tenant_isolation.out @@ -44,7 +44,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -69,7 +68,7 @@ nodeport|shardid|success|result id|value --------------------------------------------------------------------- - 5| 10 + 5| 5 (1 row) @@ -117,7 +116,6 @@ step s2-commit: COMMIT; step s1-delete: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -135,11 +133,85 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- 57637|1500009|t | 0 - 57637|1500010|t | 1 + 57637|1500010|t | 0 57637|1500011|t | 0 57638|1500008|t | 0 (4 rows) +id|value +--------------------------------------------------------------------- +(0 rows) + + +starting permutation: s1-load-cache s1-insert s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s2-commit s1-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-load-cache: + TRUNCATE isolation_table; + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'block_writes'); + +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500016 +(1 row) + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s2-commit: + COMMIT; + +step s1-update-complex: <... completed> +ERROR: shard for the given value does not exist +step s1-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500015|t | 0 + 57637|1500016|t | 1 + 57637|1500017|t | 0 + 57638|1500014|t | 0 +(4 rows) + id|value --------------------------------------------------------------------- 5| 10 @@ -177,7 +249,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500016 + 1500022 (1 row) step s1-insert: @@ -187,7 +259,6 @@ step s2-commit: COMMIT; step s1-insert: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -204,15 +275,16 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500015|t | 0 - 57637|1500016|t | 0 - 57637|1500017|t | 0 - 57638|1500014|t | 0 + 57637|1500021|t | 0 + 57637|1500022|t | 1 + 57637|1500023|t | 0 + 57638|1500020|t | 0 (4 rows) id|value --------------------------------------------------------------------- -(0 rows) + 5| 10 +(1 row) starting permutation: s1-load-cache s1-begin s1-select s2-begin s2-isolate-tenant s1-copy s2-commit s1-commit s2-print-cluster @@ -246,7 +318,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500022 + 1500028 (1 row) step s1-copy: @@ -273,10 +345,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500021|t | 0 - 57637|1500022|t | 0 - 57637|1500023|t | 0 - 57638|1500020|t | 0 + 57637|1500027|t | 0 + 57637|1500028|t | 0 + 57637|1500029|t | 0 + 57638|1500026|t | 0 (4 rows) id|value @@ -315,7 +387,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500028 + 1500034 (1 row) step s1-ddl: @@ -341,10 +413,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500027|t | 0 - 57637|1500028|t | 0 - 57637|1500029|t | 0 - 57638|1500026|t | 0 + 57637|1500033|t | 0 + 57637|1500034|t | 0 + 57637|1500035|t | 0 + 57638|1500032|t | 0 (4 rows) id|value @@ -399,7 +471,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500034 + 1500040 (1 row) step s1-update: @@ -409,7 +481,6 @@ step s2-commit: COMMIT; step s1-update: <... completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -426,15 +497,15 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500033|t | 0 - 57637|1500034|t | 1 - 57637|1500035|t | 0 - 57638|1500032|t | 0 + 57637|1500039|t | 0 + 57637|1500040|t | 1 + 57637|1500041|t | 0 + 57638|1500038|t | 0 (4 rows) id|value --------------------------------------------------------------------- - 5| 10 + 5| 5 (1 row) @@ -469,7 +540,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500040 + 1500046 (1 row) step s1-delete: @@ -479,7 +550,6 @@ step s2-commit: COMMIT; step s1-delete: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -496,10 +566,81 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500039|t | 0 - 57637|1500040|t | 1 - 57637|1500041|t | 0 - 57638|1500038|t | 0 + 57637|1500045|t | 0 + 57637|1500046|t | 0 + 57637|1500047|t | 0 + 57638|1500044|t | 0 +(4 rows) + +id|value +--------------------------------------------------------------------- +(0 rows) + + +starting permutation: s1-insert s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s2-commit s1-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'block_writes'); + +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500052 +(1 row) + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s2-commit: + COMMIT; + +step s1-update-complex: <... completed> +ERROR: shard for the given value does not exist +step s1-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500051|t | 0 + 57637|1500052|t | 1 + 57637|1500053|t | 0 + 57638|1500050|t | 0 (4 rows) id|value @@ -536,7 +677,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500046 + 1500058 (1 row) step s1-insert: @@ -546,7 +687,6 @@ step s2-commit: COMMIT; step s1-insert: <... 
completed> -ERROR: could not find valid entry for shard xxxxx step s1-commit: COMMIT; @@ -563,15 +703,16 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500045|t | 0 - 57637|1500046|t | 0 - 57637|1500047|t | 0 - 57638|1500044|t | 0 + 57637|1500057|t | 0 + 57637|1500058|t | 1 + 57637|1500059|t | 0 + 57638|1500056|t | 0 (4 rows) id|value --------------------------------------------------------------------- -(0 rows) + 5| 10 +(1 row) starting permutation: s1-begin s1-select s2-begin s2-isolate-tenant s1-copy s2-commit s1-commit s2-print-cluster @@ -602,7 +743,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500052 + 1500064 (1 row) step s1-copy: @@ -629,10 +770,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500051|t | 0 - 57637|1500052|t | 0 - 57637|1500053|t | 0 - 57638|1500050|t | 0 + 57637|1500063|t | 0 + 57637|1500064|t | 0 + 57637|1500065|t | 0 + 57638|1500062|t | 0 (4 rows) id|value @@ -668,7 +809,7 @@ step s2-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500058 + 1500070 (1 row) step s1-ddl: @@ -694,10 +835,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500057|t | 0 - 57637|1500058|t | 0 - 57637|1500059|t | 0 - 57638|1500056|t | 0 + 57637|1500069|t | 0 + 57637|1500070|t | 0 + 57637|1500071|t | 0 + 57638|1500068|t | 0 (4 rows) id|value @@ -744,7 +885,7 @@ step s1-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500064 + 1500076 (1 row) step s2-isolate-tenant: @@ -767,10 +908,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500061|t | 1 - 57638|1500063|t | 0 - 57638|1500064|t | 0 - 57638|1500065|t | 0 + 57637|1500073|t | 1 + 57638|1500075|t | 0 + 57638|1500076|t | 0 + 57638|1500077|t | 0 (4 rows) id|value @@ -799,7 +940,7 @@ step s1-isolate-tenant: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500070 + 1500082 (1 row) step s2-isolate-tenant: @@ -822,10 +963,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500067|t | 1 - 57638|1500069|t | 0 - 57638|1500070|t | 0 - 57638|1500071|t | 0 + 57637|1500079|t | 1 + 57638|1500081|t | 0 + 57638|1500082|t | 0 + 57638|1500083|t | 0 (4 rows) id|value diff --git a/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out b/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out index 7d8991615..17766f85e 100644 --- a/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out +++ b/src/test/regress/expected/isolation_tenant_isolation_nonblocking.out @@ -59,7 +59,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500078 + 1500090 (1 row) step s2-commit: @@ -78,10 +78,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500077|t | 0 - 57637|1500078|t | 1 - 57637|1500079|t | 0 - 57638|1500074|t | 0 + 57637|1500089|t | 0 + 57637|1500090|t | 1 + 57637|1500091|t | 0 + 57638|1500086|t | 0 (4 rows) id|value @@ -149,7 +149,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500086 + 1500098 (1 row) step s2-commit: @@ -168,10 +168,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500085|t | 0 - 57637|1500086|t | 0 - 57637|1500087|t | 0 - 57638|1500082|t | 0 + 57637|1500097|t | 0 + 57637|1500098|t | 0 + 57637|1500099|t | 0 + 57638|1500094|t | 0 (4 rows) id|value @@ -179,6 +179,98 @@ id|value (0 rows) +starting permutation: s1-load-cache s1-insert s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s1-commit s3-release-advisory-lock s2-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-load-cache: + TRUNCATE isolation_table; + TRUNCATE isolation_table2; + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + INSERT INTO isolation_table2 VALUES (5, 10); + +step s3-acquire-advisory-lock: + SELECT pg_advisory_lock(44000, 55152); + +pg_advisory_lock +--------------------------------------------------------------------- + +(1 row) + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'force_logical'); + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s1-commit: + COMMIT; + +step s3-release-advisory-lock: + SELECT pg_advisory_unlock(44000, 55152); + +pg_advisory_unlock +--------------------------------------------------------------------- +t +(1 row) + +step s2-isolate-tenant: <... 
completed> +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500106 +(1 row) + +step s2-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500105|t | 0 + 57637|1500106|t | 1 + 57637|1500107|t | 0 + 57638|1500102|t | 0 +(4 rows) + +id|value +--------------------------------------------------------------------- + 5| 5 +(1 row) + + starting permutation: s1-load-cache s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-insert s1-commit s3-release-advisory-lock s2-commit s2-print-cluster create_distributed_table --------------------------------------------------------------------- @@ -235,7 +327,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500094 + 1500114 (1 row) step s2-commit: @@ -254,10 +346,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500093|t | 0 - 57637|1500094|t | 1 - 57637|1500095|t | 0 - 57638|1500090|t | 0 + 57637|1500113|t | 0 + 57637|1500114|t | 1 + 57637|1500115|t | 0 + 57638|1500110|t | 0 (4 rows) id|value @@ -321,7 +413,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500102 + 1500122 (1 row) step s2-commit: @@ -340,10 +432,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500101|t | 1 - 57637|1500102|t | 1 - 57637|1500103|t | 2 - 57638|1500098|t | 1 + 57637|1500121|t | 1 + 57637|1500122|t | 1 + 57637|1500123|t | 2 + 57638|1500118|t | 1 (4 rows) id|value @@ -411,7 +503,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500110 + 1500130 (1 row) step s2-commit: @@ -430,10 +522,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500109|t | 0 - 57637|1500110|t | 1 - 57637|1500111|t | 0 - 57638|1500106|t | 0 + 57637|1500129|t | 0 + 57637|1500130|t | 1 + 57637|1500131|t | 0 + 57638|1500126|t | 0 (4 rows) id|value @@ -497,7 +589,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500118 + 1500138 (1 row) step s2-commit: @@ -516,10 +608,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500117|t | 0 - 57637|1500118|t | 0 - 57637|1500119|t | 0 - 57638|1500114|t | 0 + 57637|1500137|t | 0 + 57637|1500138|t | 0 + 57637|1500139|t | 0 + 57638|1500134|t | 0 (4 rows) id|value @@ -527,6 +619,94 @@ id|value (0 rows) +starting permutation: s1-insert s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-update-complex s1-commit s3-release-advisory-lock s2-commit s2-print-cluster +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-insert: + INSERT INTO isolation_table VALUES (5, 10); + INSERT INTO isolation_table2 VALUES (5, 10); + +step s3-acquire-advisory-lock: + SELECT pg_advisory_lock(44000, 55152); + +pg_advisory_lock +--------------------------------------------------------------------- + +(1 row) + +step s1-begin: + BEGIN; + -- the tests are written with the logic where single shard SELECTs + -- do not to open transaction blocks + SET citus.select_opens_transaction_block TO false; + +step s1-select: + SELECT count(*) FROM isolation_table WHERE id = 5; + +count +--------------------------------------------------------------------- + 1 +(1 row) + +step s2-begin: + BEGIN; + +step s2-isolate-tenant: + SELECT isolate_tenant_to_new_shard('isolation_table', 5, shard_transfer_mode => 'force_logical'); + +step s1-update-complex: + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); + +step s1-commit: + COMMIT; + +step s3-release-advisory-lock: + SELECT pg_advisory_unlock(44000, 55152); + +pg_advisory_unlock +--------------------------------------------------------------------- +t +(1 row) + +step s2-isolate-tenant: <... completed> +isolate_tenant_to_new_shard +--------------------------------------------------------------------- + 1500146 +(1 row) + +step s2-commit: + COMMIT; + +step s2-print-cluster: + -- row count per shard + SELECT + nodeport, shardid, success, result + FROM + run_command_on_placements('isolation_table', 'select count(*) from %s') + ORDER BY + nodeport, shardid; + -- rows + SELECT id, value FROM isolation_table ORDER BY id, value; + +nodeport|shardid|success|result +--------------------------------------------------------------------- + 57637|1500145|t | 0 + 57637|1500146|t | 1 + 57637|1500147|t | 0 + 57638|1500142|t | 0 +(4 rows) + +id|value +--------------------------------------------------------------------- + 5| 5 +(1 row) + + starting permutation: s3-acquire-advisory-lock s1-begin s1-select s2-begin s2-isolate-tenant s1-insert s1-commit s3-release-advisory-lock s2-commit s2-print-cluster create_distributed_table --------------------------------------------------------------------- @@ -579,7 +759,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500126 + 1500154 (1 row) step s2-commit: @@ -598,10 +778,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500125|t | 0 - 57637|1500126|t | 1 - 57637|1500127|t | 0 - 57638|1500122|t | 0 + 57637|1500153|t | 0 + 57637|1500154|t | 1 + 57637|1500155|t | 0 + 57638|1500150|t | 0 (4 rows) id|value @@ -661,7 +841,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500134 + 1500162 (1 row) step s2-commit: @@ -680,10 +860,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500133|t | 1 - 57637|1500134|t | 1 - 57637|1500135|t | 2 - 57638|1500130|t | 1 + 57637|1500161|t | 1 + 57637|1500162|t | 1 + 57637|1500163|t | 2 + 57638|1500158|t | 1 (4 rows) id|value @@ -736,7 +916,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500142 + 1500170 (1 row) step s2-print-cluster: @@ -752,10 +932,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500141|t | 0 - 57637|1500142|t | 1 - 57637|1500143|t | 0 - 57638|1500138|t | 0 + 57637|1500169|t | 0 + 57637|1500170|t | 1 + 57637|1500171|t | 0 + 57638|1500166|t | 0 (4 rows) id|value @@ -804,7 +984,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500150 + 1500178 (1 row) step s2-print-cluster: @@ -820,10 +1000,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500149|t | 0 - 57637|1500150|t | 1 - 57637|1500151|t | 0 - 57638|1500146|t | 0 + 57637|1500177|t | 0 + 57637|1500178|t | 1 + 57637|1500179|t | 0 + 57638|1500174|t | 0 (4 rows) id|value @@ -872,7 +1052,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500158 + 1500186 (1 row) step s2-print-cluster: @@ -888,10 +1068,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500157|t | 0 - 57637|1500158|t | 1 - 57637|1500159|t | 0 - 57638|1500154|t | 0 + 57637|1500185|t | 0 + 57637|1500186|t | 1 + 57637|1500187|t | 0 + 57638|1500182|t | 0 (4 rows) id|value @@ -943,7 +1123,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500169 + 1500197 (1 row) step s2-commit: @@ -962,10 +1142,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500168|t | 0 - 57637|1500169|t | 1 - 57637|1500170|t | 0 - 57638|1500165|t | 0 + 57637|1500196|t | 0 + 57637|1500197|t | 1 + 57637|1500198|t | 0 + 57638|1500193|t | 0 (4 rows) id|value @@ -1004,7 +1184,7 @@ step s1-isolate-tenant-no-same-coloc-blocking: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500183 + 1500211 (1 row) step s3-release-advisory-lock: @@ -1018,7 +1198,7 @@ t step s2-isolate-tenant: <... 
completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500180 + 1500208 (1 row) step s2-print-cluster: @@ -1034,10 +1214,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500179|t | 0 - 57637|1500180|t | 1 - 57637|1500181|t | 0 - 57638|1500176|t | 0 + 57637|1500207|t | 0 + 57637|1500208|t | 1 + 57637|1500209|t | 0 + 57638|1500204|t | 0 (4 rows) id|value @@ -1076,7 +1256,7 @@ step s1-isolate-tenant-no-same-coloc-blocking: isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500194 + 1500222 (1 row) step s3-release-advisory-lock: @@ -1090,7 +1270,7 @@ t step s2-isolate-tenant: <... completed> isolate_tenant_to_new_shard --------------------------------------------------------------------- - 1500191 + 1500219 (1 row) step s2-print-cluster: @@ -1106,10 +1286,10 @@ step s2-print-cluster: nodeport|shardid|success|result --------------------------------------------------------------------- - 57637|1500190|t | 0 - 57637|1500191|t | 1 - 57637|1500192|t | 0 - 57638|1500187|t | 0 + 57637|1500218|t | 0 + 57637|1500219|t | 1 + 57637|1500220|t | 0 + 57638|1500215|t | 0 (4 rows) id|value diff --git a/src/test/regress/spec/isolation_tenant_isolation.spec b/src/test/regress/spec/isolation_tenant_isolation.spec index 6b0fcd53e..5db86685e 100644 --- a/src/test/regress/spec/isolation_tenant_isolation.spec +++ b/src/test/regress/spec/isolation_tenant_isolation.spec @@ -44,6 +44,13 @@ step "s1-update" UPDATE isolation_table SET value = 5 WHERE id = 5; } +step "s1-update-complex" +{ + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); +} + step "s1-delete" { DELETE FROM isolation_table WHERE id = 5; @@ -119,6 +126,7 @@ step "s2-print-index-count" // we expect DML/DDL queries to fail because the shard they are waiting for is destroyed permutation "s1-load-cache" "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s2-commit" "s1-commit" "s2-print-cluster" +permutation "s1-load-cache" "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-ddl" "s2-commit" "s1-commit" "s2-print-cluster" "s2-print-index-count" @@ -127,6 +135,7 @@ permutation "s1-load-cache" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant // the same tests without loading the cache at first permutation "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s2-commit" "s1-commit" "s2-print-cluster" +permutation "s1-insert" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s2-commit" "s1-commit" "s2-print-cluster" 
permutation "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s2-commit" "s1-commit" "s2-print-cluster" permutation "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-ddl" "s2-commit" "s1-commit" "s2-print-cluster" "s2-print-index-count" diff --git a/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec b/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec index e7395e631..8dd726392 100644 --- a/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec +++ b/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec @@ -51,6 +51,13 @@ step "s1-update" UPDATE isolation_table SET value = 5 WHERE id = 5; } +step "s1-update-complex" +{ + UPDATE isolation_table SET value = 5 WHERE id IN ( + SELECT max(id) FROM isolation_table + ); +} + step "s1-delete" { DELETE FROM isolation_table WHERE id = 5; @@ -148,12 +155,14 @@ step "s3-release-advisory-lock" // we expect DML queries of s2 to succeed without being blocked. permutation "s1-load-cache" "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-load-cache" "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" +permutation "s1-load-cache" "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-load-cache" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-load-cache" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" // the same tests without loading the cache at first permutation "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-delete" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" +permutation "s1-insert" "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-update-complex" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-insert" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" permutation "s3-acquire-advisory-lock" "s1-begin" "s1-select" "s2-begin" "s2-isolate-tenant" "s1-copy" "s1-commit" "s3-release-advisory-lock" "s2-commit" "s2-print-cluster" From 6f06ff78cc32cf67225d9d55e0679ce6e084c979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emel=20=C5=9Eim=C5=9Fek?= Date: Tue, 6 Sep 2022 16:46:41 +0300 Subject: [PATCH 15/16] Throw an error if there is a RangeTblEntry that is not assigned an RTE identity. (#6295) * Fix issue : 6109 Segfault or (assertion failure) is possible when using a SQL function * DESCRIPTION: Ensures disallowing the usage of SQL functions referencing to a distributed table and prevents a segfault. Using a SQL function may result in segmentation fault in some cases. 
This change fixes the issue by raising an error when such a SQL
function cannot be handled. Fixes #6109.

Co-authored-by: Emel Simsek
---
 .../distributed/planner/distributed_planner.c | 21 +++++++++++++++++--
 .../regress/expected/multi_sql_function.out   |  3 +++
 src/test/regress/sql/multi_sql_function.sql   |  3 +++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c
index 17816a3b4..3f431f514 100644
--- a/src/backend/distributed/planner/distributed_planner.c
+++ b/src/backend/distributed/planner/distributed_planner.c
@@ -559,9 +559,26 @@ int
 GetRTEIdentity(RangeTblEntry *rte)
 {
 	Assert(rte->rtekind == RTE_RELATION);
-	Assert(rte->values_lists != NIL);
+
+	/*
+	 * Since SQL functions might be in-lined by standard_planner,
+	 * we might miss assigning an RTE identity for RangeTblEntries
+	 * related to SQL functions. We already have checks in other
+	 * places to throw an error for SQL functions but they are not
+	 * sufficient due to function in-lining; so we capture such
+	 * cases and throw an error here.
+	 */
+	if (list_length(rte->values_lists) != 2)
+	{
+		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						errmsg("cannot perform distributed planning on this "
+							   "query because parameterized queries for SQL "
+							   "functions referencing distributed tables are "
+							   "not supported"),
+						errhint("Consider using PL/pgSQL functions instead.")));
+	}
+
 	Assert(IsA(rte->values_lists, IntList));
-	Assert(list_length(rte->values_lists) == 2);
 
 	return linitial_int(rte->values_lists);
 }
diff --git a/src/test/regress/expected/multi_sql_function.out b/src/test/regress/expected/multi_sql_function.out
index e1862c983..bcc4efcca 100644
--- a/src/test/regress/expected/multi_sql_function.out
+++ b/src/test/regress/expected/multi_sql_function.out
@@ -320,6 +320,9 @@ INSERT INTO test_parameterized_sql VALUES(1, 1);
 SELECT * FROM test_parameterized_sql_function(1);
 ERROR: cannot perform distributed planning on this query because parameterized queries for SQL functions referencing distributed tables are not supported
 HINT: Consider using PL/pgSQL functions instead.
+SELECT (SELECT 1 FROM test_parameterized_sql limit 1) FROM test_parameterized_sql_function(1);
+ERROR: cannot perform distributed planning on this query because parameterized queries for SQL functions referencing distributed tables are not supported
+HINT: Consider using PL/pgSQL functions instead.
 SELECT test_parameterized_sql_function_in_subquery_where(1);
 ERROR: could not create distributed plan
 DETAIL: Possibly this is caused by the use of parameters in SQL functions, which is not supported in Citus.
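
A minimal illustration of the new behavior, in the spirit of the regression test
above. The objects below (sql_func_demo and the two demo_payload functions) are
hypothetical and not part of this patch; this is a sketch of the expected outcome,
not an addition to the test suite.

-- a parameterized SQL function over a distributed table can be in-lined by
-- standard_planner, so its RangeTblEntry gets no RTE identity; Citus is now
-- expected to raise the ERROR/HINT shown above instead of asserting or crashing
CREATE TABLE sql_func_demo (id int, payload int);
SELECT create_distributed_table('sql_func_demo', 'id');

CREATE FUNCTION demo_payload_sql(p_id int) RETURNS int AS $fn$
    SELECT payload FROM sql_func_demo WHERE id = p_id;
$fn$ LANGUAGE sql;

SELECT * FROM demo_payload_sql(1);

-- the HINT's suggested workaround: a PL/pgSQL body is not in-lined, so the
-- parameterized query can be planned as a regular prepared statement
CREATE FUNCTION demo_payload_plpgsql(p_id int) RETURNS int AS $fn$
DECLARE
    result int;
BEGIN
    SELECT payload INTO result FROM sql_func_demo WHERE id = p_id;
    RETURN result;
END;
$fn$ LANGUAGE plpgsql;

SELECT demo_payload_plpgsql(1);
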
diff --git a/src/test/regress/sql/multi_sql_function.sql b/src/test/regress/sql/multi_sql_function.sql index 329d57996..1ef0ef40a 100644 --- a/src/test/regress/sql/multi_sql_function.sql +++ b/src/test/regress/sql/multi_sql_function.sql @@ -144,6 +144,9 @@ INSERT INTO test_parameterized_sql VALUES(1, 1); -- all of them should fail SELECT * FROM test_parameterized_sql_function(1); + +SELECT (SELECT 1 FROM test_parameterized_sql limit 1) FROM test_parameterized_sql_function(1); + SELECT test_parameterized_sql_function_in_subquery_where(1); -- postgres behaves slightly differently for the following From ac96370ddf06ad8c850f5a8d146a6b22f3ecc095 Mon Sep 17 00:00:00 2001 From: Gokhan Gulbiz Date: Tue, 6 Sep 2022 17:38:41 +0300 Subject: [PATCH 16/16] Use IsMultiStatementTransaction for SELECT .. FOR UPDATE queries (#6288) * Use IsMultiStatementTransaction instead of IsTransaction for row-locking operations. * Add regression test for SELECT..FOR UPDATE statement --- .../distributed/executor/adaptive_executor.c | 2 +- .../regress/expected/adaptive_executor.out | 33 ++++++++++++++++++- src/test/regress/sql/adaptive_executor.sql | 18 ++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index a0e16876a..948d65791 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -1421,7 +1421,7 @@ DistributedExecutionRequiresRollback(List *taskList) * Do not check SelectOpensTransactionBlock, always open transaction block * if SELECT FOR UPDATE is executed inside a distributed transaction. */ - return IsTransactionBlock(); + return IsMultiStatementTransaction(); } if (ReadOnlyTask(task->taskType)) diff --git a/src/test/regress/expected/adaptive_executor.out b/src/test/regress/expected/adaptive_executor.out index aeaa553f2..4372747d6 100644 --- a/src/test/regress/expected/adaptive_executor.out +++ b/src/test/regress/expected/adaptive_executor.out @@ -78,5 +78,36 @@ $$); (1 row) END; +CREATE OR REPLACE FUNCTION select_for_update() +RETURNS void +AS $$ +DECLARE + my int; +BEGIN + SELECT y INTO my FROM test WHERE x = 1 FOR UPDATE; +END; +$$ LANGUAGE plpgsql; +-- so that we can prove that we open a transaction block by logging below: +-- "NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;..." 
+SET citus.log_remote_commands TO on; +SELECT select_for_update(); +NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;SELECT assign_distributed_transaction_id(xx, xx, 'xxxxxxx'); +DETAIL: on server postgres@localhost:xxxxx connectionId: xxxxxxx +CONTEXT: SQL statement "SELECT y FROM test WHERE x = 1 FOR UPDATE" +PL/pgSQL function select_for_update() line XX at SQL statement +NOTICE: issuing SELECT y FROM adaptive_executor.test_801009000 test WHERE (x OPERATOR(pg_catalog.=) 1) FOR UPDATE OF test +DETAIL: on server postgres@localhost:xxxxx connectionId: xxxxxxx +CONTEXT: SQL statement "SELECT y FROM test WHERE x = 1 FOR UPDATE" +PL/pgSQL function select_for_update() line XX at SQL statement +NOTICE: issuing COMMIT +DETAIL: on server postgres@localhost:xxxxx connectionId: xxxxxxx + select_for_update +--------------------------------------------------------------------- + +(1 row) + +SET citus.log_remote_commands TO off; DROP SCHEMA adaptive_executor CASCADE; -NOTICE: drop cascades to table test +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table test +drop cascades to function select_for_update() diff --git a/src/test/regress/sql/adaptive_executor.sql b/src/test/regress/sql/adaptive_executor.sql index f7d6c6f1e..5dd14e6f8 100644 --- a/src/test/regress/sql/adaptive_executor.sql +++ b/src/test/regress/sql/adaptive_executor.sql @@ -40,4 +40,22 @@ SELECT sum(result::bigint) FROM run_command_on_workers($$ $$); END; +CREATE OR REPLACE FUNCTION select_for_update() +RETURNS void +AS $$ +DECLARE + my int; +BEGIN + SELECT y INTO my FROM test WHERE x = 1 FOR UPDATE; +END; +$$ LANGUAGE plpgsql; + +-- so that we can prove that we open a transaction block by logging below: +-- "NOTICE: issuing BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;..." +SET citus.log_remote_commands TO on; + +SELECT select_for_update(); + +SET citus.log_remote_commands TO off; + DROP SCHEMA adaptive_executor CASCADE;
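
A hedged usage sketch of the SELECT ... FOR UPDATE change, for readers trying it
locally. The table lock_demo and the session flow below are assumptions, not part
of this patch; the expectation about the worker-side BEGIN mirrors the
adaptive_executor expected output above.

-- with citus.log_remote_commands on, both locking SELECTs below are expected to
-- be preceded by a worker-side BEGIN plus assign_distributed_transaction_id(),
-- so the row lock is held until the enclosing transaction commits
CREATE TABLE lock_demo (x int, y int);
SELECT create_distributed_table('lock_demo', 'x');
INSERT INTO lock_demo VALUES (1, 1);

SET citus.log_remote_commands TO on;

-- explicit multi-statement transaction block
BEGIN;
SELECT y FROM lock_demo WHERE x = 1 FOR UPDATE;
COMMIT;

-- the same locking SELECT issued from a PL/pgSQL function, as in the regression
-- test above; the expected output there shows that this case now also opens a
-- worker-side transaction block
CREATE OR REPLACE FUNCTION lock_demo_row() RETURNS void AS $fn$
DECLARE
    val int;
BEGIN
    SELECT y INTO val FROM lock_demo WHERE x = 1 FOR UPDATE;
END;
$fn$ LANGUAGE plpgsql;

SELECT lock_demo_row();

SET citus.log_remote_commands TO off;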