mirror of https://github.com/citusdata/citus.git
Reduce isolation flakiness by improving blocked process detection (#6405)
Sometimes our CI randomly fails on a test in a way similar to this: ```diff step s2-drop: DROP TABLE cancel_table; - + <waiting ...> +step s2-drop: <... completed> starting permutation: s1-timeout s1-begin s1-sleep10000 s1-rollback s1-reset s1-drop ``` Source: https://app.circleci.com/pipelines/github/citusdata/citus/26524/workflows/5415b84f-13a3-482f-bef9-648314c79a67/jobs/756377 I tried to fix that already in #6252 by disabling the maintenance daemon during isolation tests, but it seems that hasn't fixed all cases of these errors. This is another attempt at fixing these issues, and it seems to have better results. It starts using the pInterestingPids parameter that citus_isolation_test_session_is_blocked receives. With this change we start filtering out block-edges that are not caused by any of these pids. In passing, this change also makes it possible to run `isolation_create_distributed_table_concurrently` with `check-isolation-base`. pull/6425/head
parent
ec3eebbaf6
commit
6277ffd69e
45
src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/11.2-1.sql
generated
Normal file
45
src/backend/distributed/sql/udfs/citus_isolation_test_session_is_blocked/11.2-1.sql
generated
Normal file
|
@ -0,0 +1,45 @@
|
|||
-- citus_isolation_test_session_is_blocked reports whether the session with
-- process id pBlockedPid is currently blocked.
--
-- Parameters:
--   pBlockedPid      - process id of the session to check.
--   pInterestingPids - process ids whose locks count as a relevant cause of
--                      blocking; blocks caused by other processes are ignored
--                      in the global check.
--
-- Returns true when old_pg_isolation_test_session_is_blocked() already reports
-- a local block, or when citus_internal_global_blocked_processes() contains a
-- block edge on the session's global pid that is caused by one of the
-- interesting pids or by the remote worker process.
CREATE OR REPLACE FUNCTION pg_catalog.citus_isolation_test_session_is_blocked(pBlockedPid integer, pInterestingPids integer[])
RETURNS boolean AS $$
  DECLARE
    mBlockedGlobalPid int8;
    -- set to true only when the worker-side lookup below yields a row
    mFoundOnWorker boolean := false;
    workerProcessId integer := current_setting('citus.isolation_test_session_remote_process_id');
    coordinatorProcessId integer := current_setting('citus.isolation_test_session_process_id');
  BEGIN
    IF pg_catalog.old_pg_isolation_test_session_is_blocked(pBlockedPid, pInterestingPids) THEN
      RETURN true;
    END IF;

    -- pg says we're not blocked locally; check whether we're blocked globally.
    -- Note that the worker process may be blocked or waiting for a lock, so we
    -- need the global pid for both cases. The following lookup provides it
    -- when the worker process is the one waiting for another session.
    --
    -- The lookup is issued once and its FOUND status is captured, so that the
    -- "does a row exist" decision and the fetched value come from the same
    -- call rather than from two separate calls that could disagree.
    IF pBlockedPid = coordinatorProcessId THEN
      SELECT global_pid INTO mBlockedGlobalPid
        FROM get_global_active_transactions()
        WHERE process_id = workerProcessId;
      mFoundOnWorker := FOUND;
    END IF;

    IF NOT mFoundOnWorker THEN
      -- Check whether transactions initiated from the coordinator get locked
      SELECT global_pid INTO mBlockedGlobalPid
        FROM get_all_active_transactions()
        WHERE process_id = pBlockedPid;
    END IF;

    -- We convert the blocking_global_pid to a regular pid and only look at
    -- blocks caused by the interesting pids, or the workerProcessId. If we
    -- don't do that we might find unrelated blocks caused by some random
    -- other processes that are not involved in this isolation test. Because we
    -- run our isolation tests on a single physical machine, the PID part of
    -- the GPID is known to be unique within the whole cluster.
    RETURN EXISTS (
      SELECT 1
        FROM citus_internal_global_blocked_processes()
        WHERE waiting_global_pid = mBlockedGlobalPid
          AND (
            citus_pid_for_gpid(blocking_global_pid) = ANY (pInterestingPids)
            OR citus_pid_for_gpid(blocking_global_pid) = workerProcessId
          )
    );
  END;
$$ LANGUAGE plpgsql;

REVOKE ALL ON FUNCTION citus_isolation_test_session_is_blocked(integer,integer[]) FROM PUBLIC;
|
|
@ -23,9 +23,21 @@ RETURNS boolean AS $$
|
|||
FROM get_all_active_transactions() WHERE process_id = pBlockedPid;
|
||||
END IF;
|
||||
|
||||
-- We convert the blocking_global_pid to a regular pid and only look at
|
||||
-- blocks caused by the interesting pids, or the workerProcessPid. If we
|
||||
-- don't do that we might find unrelated blocks caused by some random
|
||||
-- other processes that are not involved in this isolation test. Because we
|
||||
-- run our isolation tests on a single physical machine, the PID part of
|
||||
-- the GPID is known to be unique within the whole cluster.
|
||||
RETURN EXISTS (
|
||||
SELECT 1 FROM citus_internal_global_blocked_processes()
|
||||
WHERE waiting_global_pid = mBlockedGlobalPid
|
||||
AND (
|
||||
citus_pid_for_gpid(blocking_global_pid) in (
|
||||
select * from unnest(pInterestingPids)
|
||||
)
|
||||
OR citus_pid_for_gpid(blocking_global_pid) = workerProcessId
|
||||
)
|
||||
);
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
setup
|
||||
{
|
||||
select setval('pg_dist_shardid_seq', GREATEST(1400292, nextval('pg_dist_shardid_seq')-1));
|
||||
-- make sure coordinator is in metadata
|
||||
SELECT citus_set_coordinator_host('localhost', 57636);
|
||||
CREATE TABLE table_1(id int PRIMARY KEY);
|
||||
|
|
Loading…
Reference in New Issue