From 1dd775fae8eeb07a8d01e73cfaa11aa8e0457c00 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Tue, 23 Aug 2022 17:37:31 +0200 Subject: [PATCH] Speed up logical replication tests to fix flakyness (#6229) The isolation_tenant_isolation_nonblocking test would sometimes randomly fail in CI, because we have a runtime limit of 2 minutes per test. ``` test isolation_tenant_isolation_nonblocking ... make: *** [Makefile:171: check-enterprise-isolation] Terminated Too long with no output (exceeded 2m0s): context deadline exceeded ``` One solution would obviously be to increase the timeout, but instead I spent some time increasing the speed of our tests by tweaking some timings. On my local machine the time it took to run the isolation_tenant_isolation_nonblocking test went from 75s to 15s. So now we should easily stay within the 2-minute per-test limit. I also checked if the new settings improved other logical replication tests, but the impact differs wildly per test. One other example of a test that runs much quicker due to the change is isolation_non_blocking_shard_split_fkey. But the shard move tests I tried are impacted much less. 
Example of failed tests: https://app.circleci.com/pipelines/github/citusdata/citus/26373/workflows/4fa660e4-63c8-4844-bef8-70a7bea902b7/jobs/748199 --- src/test/regress/pg_regress_multi.pl | 4 ++-- .../regress/spec/isolation_tenant_isolation_nonblocking.spec | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/regress/pg_regress_multi.pl b/src/test/regress/pg_regress_multi.pl index 580c4b73e..01e36a6d2 100755 --- a/src/test/regress/pg_regress_multi.pl +++ b/src/test/regress/pg_regress_multi.pl @@ -451,12 +451,12 @@ push(@pgOptions, "wal_level='logical'"); # Faster logical replication status update so tests with logical replication # run faster -push(@pgOptions, "wal_receiver_status_interval=1"); +push(@pgOptions, "wal_receiver_status_interval=0"); # Faster logical replication apply worker launch so tests with logical # replication run faster. This is used in ApplyLauncherMain in # src/backend/replication/logical/launcher.c. -push(@pgOptions, "wal_retrieve_retry_interval=1000"); +push(@pgOptions, "wal_retrieve_retry_interval=250"); push(@pgOptions, "max_logical_replication_workers=50"); push(@pgOptions, "max_wal_senders=50"); diff --git a/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec b/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec index a1356ed17..3b22024f4 100644 --- a/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec +++ b/src/test/regress/spec/isolation_tenant_isolation_nonblocking.spec @@ -3,8 +3,8 @@ setup SET citus.shard_count to 2; SET citus.shard_replication_factor to 1; SELECT setval('pg_dist_shardid_seq', - CASE WHEN nextval('pg_dist_shardid_seq') > 1599999 OR nextval('pg_dist_shardid_seq') < 1500000 - THEN 1500000 + CASE WHEN nextval('pg_dist_shardid_seq') > 1599999 OR nextval('pg_dist_shardid_seq') < 1500072 + THEN 1500072 ELSE nextval('pg_dist_shardid_seq')-2 END);