Reduce isolation flakiness by improving blocked process detection (#6405)

Sometimes our CI randomly fails on a test in a way similar to this:
```diff
 step s2-drop:
     DROP TABLE cancel_table;
-
+ <waiting ...>
+step s2-drop: <... completed>

 starting permutation: s1-timeout s1-begin s1-sleep10000 s1-rollback s1-reset s1-drop
```
Source:
https://app.circleci.com/pipelines/github/citusdata/citus/26524/workflows/5415b84f-13a3-482f-bef9-648314c79a67/jobs/756377

I tried to fix that already in #6252 by disabling the maintenance daemon
during isolation tests. But it seems that hasn't fixed all cases of
these errors. This is another attempt at fixing these issues that seems
to have better results.

What it does is that it starts using the pInterestingPids parameter that
citus_isolation_test_session_is_blocked receives. With this change we
start filtering out block-edges that are not caused by any of these pids.

In passing this change also makes it possible to run 
`isolation_create_distributed_table_concurrently` with
`check-isolation-base`
pull/6425/head
Jelte Fennema 2022-10-12 16:35:09 +02:00 committed by GitHub
parent ec3eebbaf6
commit 6277ffd69e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 58 additions and 0 deletions

View File

@ -0,0 +1,45 @@
-- citus_isolation_test_session_is_blocked decides whether the session with
-- backend pid pBlockedPid should be reported as blocked.
--
-- Parameters:
--   pBlockedPid      - pid of the session that is being checked.
--   pInterestingPids - pids of the sessions that take part in the test; only
--                      blocks caused by these pids (or by the configured
--                      remote worker process) are reported by the global check.
--
-- Returns true when old_pg_isolation_test_session_is_blocked() already reports
-- a local block, or when citus_internal_global_blocked_processes() contains an
-- edge from the session's global pid to one of the relevant pids. Returns
-- false otherwise.
--
-- Reads the settings citus.isolation_test_session_remote_process_id and
-- citus.isolation_test_session_process_id.
CREATE OR REPLACE FUNCTION pg_catalog.citus_isolation_test_session_is_blocked(pBlockedPid integer, pInterestingPids integer[])
RETURNS boolean AS $$
  DECLARE
    mBlockedGlobalPid int8;
    workerProcessId integer := current_setting('citus.isolation_test_session_remote_process_id');
    coordinatorProcessId integer := current_setting('citus.isolation_test_session_process_id');
  BEGIN
    IF pg_catalog.old_pg_isolation_test_session_is_blocked(pBlockedPid, pInterestingPids) THEN
      RETURN true;
    END IF;

    -- pg says we're not blocked locally; check whether we're blocked globally.
    -- Note that the worker process may be blocked or waiting for a lock, so we
    -- first try to resolve the global pid through the worker process. This
    -- only applies when the session being checked is the coordinator session.
    --
    -- The lookup is done with a single query and FOUND instead of an EXISTS
    -- probe followed by the same query again: that way the transaction list
    -- is fetched only once, and a row that disappears between two calls can
    -- no longer leave mBlockedGlobalPid NULL without trying the fallback.
    SELECT global_pid INTO mBlockedGlobalPid
      FROM get_global_active_transactions()
      WHERE process_id = workerProcessId
        AND pBlockedPid = coordinatorProcessId;

    IF NOT FOUND THEN
      -- Check whether transactions initiated from the coordinator get locked
      SELECT global_pid INTO mBlockedGlobalPid
        FROM get_all_active_transactions()
        WHERE process_id = pBlockedPid;
    END IF;

    -- We convert the blocking_global_pid to a regular pid and only look at
    -- blocks caused by the interesting pids, or the workerProcessId. If we
    -- don't do that we might find unrelated blocks caused by some random
    -- other processes that are not involved in this isolation test. Because we
    -- run our isolation tests on a single physical machine, the PID part of
    -- the GPID is known to be unique within the whole cluster.
    RETURN EXISTS (
      SELECT 1
        FROM citus_internal_global_blocked_processes()
        WHERE waiting_global_pid = mBlockedGlobalPid
          AND (
            citus_pid_for_gpid(blocking_global_pid) = ANY (pInterestingPids)
            OR citus_pid_for_gpid(blocking_global_pid) = workerProcessId
          )
    );
  END;
$$ LANGUAGE plpgsql;
-- Revoke all privileges on the test helper from PUBLIC.
REVOKE ALL ON FUNCTION citus_isolation_test_session_is_blocked(integer,integer[]) FROM PUBLIC;

View File

@ -23,9 +23,21 @@ RETURNS boolean AS $$
FROM get_all_active_transactions() WHERE process_id = pBlockedPid;
END IF;
-- We convert the blocking_global_pid to a regular pid and only look at
-- blocks caused by the interesting pids, or the workerProcessPid. If we
-- don't do that we might find unrelated blocks caused by some random
-- other processes that are not involved in this isolation test. Because we
-- run our isolation tests on a single physical machine, the PID part of
-- the GPID is known to be unique within the whole cluster.
RETURN EXISTS (
SELECT 1 FROM citus_internal_global_blocked_processes()
WHERE waiting_global_pid = mBlockedGlobalPid
AND (
citus_pid_for_gpid(blocking_global_pid) in (
select * from unnest(pInterestingPids)
)
OR citus_pid_for_gpid(blocking_global_pid) = workerProcessId
)
);
END;
$$ LANGUAGE plpgsql;

View File

@ -1,5 +1,6 @@
setup
{
select setval('pg_dist_shardid_seq', GREATEST(1400292, nextval('pg_dist_shardid_seq')-1));
-- make sure coordinator is in metadata
SELECT citus_set_coordinator_host('localhost', 57636);
CREATE TABLE table_1(id int PRIMARY KEY);