From 6277ffd69eb7ca79e12a5dc655e9d424dbd2b349 Mon Sep 17 00:00:00 2001
From: Jelte Fennema <jelte.fennema@microsoft.com>
Date: Wed, 12 Oct 2022 16:35:09 +0200
Subject: [PATCH] Reduce isolation flakyness by improving blocked process
 detection (#6405)

Sometimes our CI randomly fails on a test in a way similar to this:
```diff
 step s2-drop:
     DROP TABLE cancel_table;
-
+ <waiting ...>
+step s2-drop: <... completed>

 starting permutation: s1-timeout s1-begin s1-sleep10000 s1-rollback s1-reset s1-drop
```
Source:
https://app.circleci.com/pipelines/github/citusdata/citus/26524/workflows/5415b84f-13a3-482f-bef9-648314c79a67/jobs/756377

I tried to fix that already in #6252 by disabling the maintenance daemon
during isolation tests. But it seems that hasn't fixed all cases of
these errors. This is another attempt at fixing these issues that seems
to have better results.

What it does is that it starts using the pInterestingPids parameter that
citus_isolation_test_session_is_blocked receives. With this change we
start filter out block-edges that are not caused by any of these pids.

In passing this change also makes it possible to run
`isolation_create_distributed_table_concurrently` with
`check-isolation-base`
---
 .../11.2-1.sql                                | 45 +++++++++++++++++++
 .../latest.sql                                | 12 +++++
 ...create_distributed_table_concurrently.spec |  1 +
 3 files changed, 58 insertions(+)
 create mode 100644 src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/11.2-1.sql
diff --git a/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/11.2-1.sql b/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/11.2-1.sql
new file mode 100644
index 000000000..ff0983910
--- /dev/null
+++ b/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/11.2-1.sql
@@ -0,0 +1,45 @@
+CREATE OR REPLACE FUNCTION pg_catalog.citus_isolation_test_session_is_blocked(pBlockedPid integer, pInterestingPids integer[])
+RETURNS boolean AS $$
+  DECLARE
+    mBlockedGlobalPid int8;
+    workerProcessId integer := current_setting('citus.isolation_test_session_remote_process_id');
+    coordinatorProcessId integer := current_setting('citus.isolation_test_session_process_id');
+  BEGIN
+    IF pg_catalog.old_pg_isolation_test_session_is_blocked(pBlockedPid, pInterestingPids) THEN
+      RETURN true;
+    END IF;
+
+    -- pg says we're not blocked locally; check whether we're blocked globally.
+    -- Note that worker process may be blocked or waiting for a lock. So we need to
+    -- get transaction number for both of them. Following IF provides the transaction
+    -- number when the worker process waiting for other session.
+    IF EXISTS (SELECT 1 FROM get_global_active_transactions()
+               WHERE process_id = workerProcessId AND pBlockedPid = coordinatorProcessId) THEN
+      SELECT global_pid INTO mBlockedGlobalPid FROM get_global_active_transactions()
+      WHERE process_id = workerProcessId AND pBlockedPid = coordinatorProcessId;
+    ELSE
+      -- Check whether transactions initiated from the coordinator get locked
+      SELECT global_pid INTO mBlockedGlobalPid
+        FROM get_all_active_transactions() WHERE process_id = pBlockedPid;
+    END IF;
+
+    -- We convert the blocking_global_pid to a regular pid and only look at
+    -- blocks caused by the interesting pids, or the workerProcessPid. If we
+    -- don't do that we might find unrelated blocks caused by some random
+    -- other processes that are not involved in this isolation test. Because we
+    -- run our isolation tests on a single physical machine, the PID part of
+    -- the GPID is known to be unique within the whole cluster.
+    RETURN EXISTS (
+      SELECT 1 FROM citus_internal_global_blocked_processes()
+        WHERE waiting_global_pid = mBlockedGlobalPid
+        AND (
+          citus_pid_for_gpid(blocking_global_pid) in (
+              select * from unnest(pInterestingPids)
+          )
+          OR citus_pid_for_gpid(blocking_global_pid) = workerProcessId
+        )
+    );
+  END;
+$$ LANGUAGE plpgsql;
+
+REVOKE ALL ON FUNCTION citus_isolation_test_session_is_blocked(integer,integer[]) FROM PUBLIC;
diff --git a/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/latest.sql b/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/latest.sql
index 52174271b..ff0983910 100644
--- a/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/latest.sql
+++ b/src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/latest.sql
@@ -23,9 +23,21 @@ RETURNS boolean AS $$
         FROM get_all_active_transactions() WHERE process_id = pBlockedPid;
     END IF;
 
+    -- We convert the blocking_global_pid to a regular pid and only look at
+    -- blocks caused by the interesting pids, or the workerProcessPid. If we
+    -- don't do that we might find unrelated blocks caused by some random
+    -- other processes that are not involved in this isolation test. Because we
+    -- run our isolation tests on a single physical machine, the PID part of
+    -- the GPID is known to be unique within the whole cluster.
     RETURN EXISTS (
       SELECT 1 FROM citus_internal_global_blocked_processes()
         WHERE waiting_global_pid = mBlockedGlobalPid
+        AND (
+          citus_pid_for_gpid(blocking_global_pid) in (
+              select * from unnest(pInterestingPids)
+          )
+          OR citus_pid_for_gpid(blocking_global_pid) = workerProcessId
+        )
     );
   END;
 $$ LANGUAGE plpgsql;
diff --git a/src/test/regress/spec/isolation_create_distributed_table_concurrently.spec b/src/test/regress/spec/isolation_create_distributed_table_concurrently.spec
index 7bd305a93..f6a83d309 100644
--- a/src/test/regress/spec/isolation_create_distributed_table_concurrently.spec
+++ b/src/test/regress/spec/isolation_create_distributed_table_concurrently.spec
@@ -1,5 +1,6 @@
 setup
 {
+	select setval('pg_dist_shardid_seq', GREATEST(1400292, nextval('pg_dist_shardid_seq')-1));
 	-- make sure coordinator is in metadata
 	SELECT citus_set_coordinator_host('localhost', 57636);
   	CREATE TABLE table_1(id int PRIMARY KEY);