From 2bccb5815770b67cf646a7dc5bb9539d5a29c010 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 13:12:20 +0100
Subject: [PATCH 1/6] Run github actions on main (#7292)

We want the nice-looking green checkmark on our main branch too. This PR
includes running on pushes to release branches too, but that won't come
into effect until we have release branches with this workflow file.
---
 .github/workflows/build_and_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d285e4f50..d900fe867 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -10,6 +10,10 @@ on:
       required: false
       default: false
       type: boolean
+  push:
+    branches:
+      - "main"
+      - "release-*"
   pull_request:
     types: [opened, reopened,synchronize]
 jobs:

From c83c5567028d2035651c39f737ac5a944a70db16 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 14:44:45 +0100
Subject: [PATCH 2/6] Fix flaky isolation_master_update_node (#7303)

Sometimes in CI isolation_master_update_node fails like this:

```diff
------------------
(1 row)
step s2-abort: ABORT;
step s1-abort: ABORT;
FATAL: terminating connection due to administrator command
FATAL: terminating connection due to administrator command
SSL connection has been closed unexpectedly
+server closed the connection unexpectedly
master_remove_node
------------------
```

This just seems like a random extra error line. The only way to reasonably
fix this is by adding an extra output file, so that's what this PR does.
---
 .../isolation_master_update_node_1.out | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 src/test/regress/expected/isolation_master_update_node_1.out

diff --git a/src/test/regress/expected/isolation_master_update_node_1.out b/src/test/regress/expected/isolation_master_update_node_1.out
new file mode 100644
index 000000000..474956629
--- /dev/null
+++ b/src/test/regress/expected/isolation_master_update_node_1.out
@@ -0,0 +1,68 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1-begin s1-insert s2-begin s2-update-node-1 s1-abort s2-abort
+create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+step s1-begin: BEGIN;
+step s1-insert: INSERT INTO t1 SELECT generate_series(1, 100);
+step s2-begin: BEGIN;
+step s2-update-node-1:
+ -- update a specific node by address
+ SELECT master_update_node(nodeid, 'localhost', nodeport + 10)
+ FROM pg_dist_node
+ WHERE nodename = 'localhost'
+ AND nodeport = 57637;
+
+step s1-abort: ABORT;
+step s2-update-node-1: <... completed>
+master_update_node
+---------------------------------------------------------------------
+
+(1 row)
+
+step s2-abort: ABORT;
+master_remove_node
+---------------------------------------------------------------------
+
+
+(2 rows)
+
+
+starting permutation: s1-begin s1-insert s2-begin s2-update-node-1-force s2-abort s1-abort
+create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+step s1-begin: BEGIN;
+step s1-insert: INSERT INTO t1 SELECT generate_series(1, 100);
+step s2-begin: BEGIN;
+step s2-update-node-1-force:
+ -- update a specific node by address (force)
+ SELECT master_update_node(nodeid, 'localhost', nodeport + 10, force => true, lock_cooldown => 100)
+ FROM pg_dist_node
+ WHERE nodename = 'localhost'
+ AND nodeport = 57637;
+
+step s2-update-node-1-force: <... completed>
+master_update_node
+---------------------------------------------------------------------
+
+(1 row)
+
+step s2-abort: ABORT;
+step s1-abort: ABORT;
+FATAL: terminating connection due to administrator command
+FATAL: terminating connection due to administrator command
+SSL connection has been closed unexpectedly
+server closed the connection unexpectedly
+
+master_remove_node
+---------------------------------------------------------------------
+
+
+(2 rows)
+

From c9f2fc892d4ce01a4bc23beb508e2ff03f08a774 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 15:08:51 +0100
Subject: [PATCH 3/6] Fix flaky failure_split_cleanup (#7299)

Sometimes failure_split_cleanup failed in CI like this:

```diff
ERROR: server closed the connection unexpectedly
CONTEXT: while executing command on localhost:9060
    SELECT operation_id, object_type, object_name, node_group_id, policy_type
    FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
 operation_id | object_type | object_name | node_group_id | policy_type
--------------+-------------+-----------------------------------------------------------+---------------+-------------
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0
- 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 2 | 0
+ 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981003 | 2 | 1
 777 | 4 | citus_shard_split_publication_1_10_777 | 2 | 0
(5 rows)
-- we need to allow connection so that we can connect to proxy
```

Source: https://github.com/citusdata/citus/actions/runs/6717642291/attempts/1#summary-18256014949

It's the common problem of a missing column in the ORDER BY clause. This PR
fixes that by adding node_group_id to the query in question.
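For context (illustration only, not part of the patch): the flakiness comes from ties in the sort key. Two cleanup records share the same object_name (table_to_split_8981002) but have different node_group_id values, so PostgreSQL may return them in either order. Adding the tie-breaker column makes the regression output deterministic, roughly like this:

```sql
-- Sketch of the idea, mirroring the query used in the test.
-- Ties on object_name leave the relative order of those rows unspecified:
SELECT operation_id, object_type, object_name, node_group_id, policy_type
  FROM pg_dist_cleanup
 WHERE operation_id = 777
 ORDER BY object_name;              -- nondeterministic for duplicate names

-- Adding node_group_id as a tie-breaker fully determines the row order:
SELECT operation_id, object_type, object_name, node_group_id, policy_type
  FROM pg_dist_cleanup
 WHERE operation_id = 777
 ORDER BY object_name, node_group_id;
```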
---
 .../regress/expected/failure_split_cleanup.out | 18 +++++++++---------
 src/test/regress/sql/failure_split_cleanup.sql | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/test/regress/expected/failure_split_cleanup.out b/src/test/regress/expected/failure_split_cleanup.out
index fe646587c..d81335325 100644
--- a/src/test/regress/expected/failure_split_cleanup.out
+++ b/src/test/regress/expected/failure_split_cleanup.out
@@ -277,12 +277,12 @@ CONTEXT: while executing command on localhost:xxxxx
 ERROR: connection not open
 CONTEXT: while executing command on localhost:xxxxx
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0
- 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 2 | 0
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1
+ 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 2 | 0
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981003 | 2 | 1
 777 | 4 | citus_shard_split_publication_xxxxxxx_xxxxxxx_xxxxxxx | 2 | 0
 777 | 4 | citus_shard_split_publication_xxxxxxx_xxxxxxx_xxxxxxx | 2 | 0
@@ -336,7 +336,7 @@ CONTEXT: while executing command on localhost:xxxxx
 (1 row)
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
(0 rows)
@@ -388,7 +388,7 @@ CONTEXT: while executing command on localhost:xxxxx
 ERROR: connection not open
 CONTEXT: while executing command on localhost:xxxxx
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0
@@ -455,7 +455,7 @@ CONTEXT: while executing command on localhost:xxxxx
 (1 row)
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
(0 rows)
@@ -507,7 +507,7 @@ CONTEXT: while executing command on localhost:xxxxx
 ERROR: connection not open
 CONTEXT: while executing command on localhost:xxxxx
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981000 | 1 | 0
@@ -574,7 +574,7 @@ CONTEXT: while executing command on localhost:xxxxx
 (1 row)
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
(0 rows)
@@ -634,7 +634,7 @@ WARNING: connection to the remote node localhost:xxxxx failed with the followin
 ERROR: connection not open
 CONTEXT: while executing command on localhost:xxxxx
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
 777 | 1 | citus_failure_split_cleanup_schema.table_to_split_8981002 | 1 | 1
@@ -701,7 +701,7 @@ CONTEXT: while executing command on localhost:xxxxx
 (1 row)
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 operation_id | object_type | object_name | node_group_id | policy_type
---------------------------------------------------------------------
(0 rows)

diff --git a/src/test/regress/sql/failure_split_cleanup.sql b/src/test/regress/sql/failure_split_cleanup.sql
index 1b85d3d17..9dfbb245e 100644
--- a/src/test/regress/sql/failure_split_cleanup.sql
+++ b/src/test/regress/sql/failure_split_cleanup.sql
@@ -136,7 +136,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 ARRAY[:worker_1_node, :worker_2_node],
 'force_logical');
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 -- we need to allow connection so that we can connect to proxy
 SELECT citus.mitmproxy('conn.allow()');
@@ -155,7 +155,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 \c - postgres - :master_port
 SELECT public.wait_for_resource_cleanup();
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 \c - - - :worker_2_proxy_port
 SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog;
@@ -182,7 +182,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 ARRAY[:worker_1_node, :worker_2_node],
 'force_logical');
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 -- we need to allow connection so that we can connect to proxy
 SELECT citus.mitmproxy('conn.allow()');
@@ -201,7 +201,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 \c - postgres - :master_port
 SELECT public.wait_for_resource_cleanup();
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 \c - - - :worker_2_proxy_port
 SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog;
@@ -228,7 +228,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 ARRAY[:worker_1_node, :worker_2_node],
 'force_logical');
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 -- we need to allow connection so that we can connect to proxy
 SELECT citus.mitmproxy('conn.allow()');
@@ -247,7 +247,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 \c - postgres - :master_port
 SELECT public.wait_for_resource_cleanup();
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 \c - - - :worker_2_proxy_port
 SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog;
@@ -275,7 +275,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 'force_logical');
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 SELECT relname FROM pg_class where relname LIKE '%table_to_split_%' AND relkind = 'r' order by relname;
 -- we need to allow connection so that we can connect to proxy
 SELECT citus.mitmproxy('conn.allow()');
@@ -295,7 +295,7 @@ SELECT create_distributed_table('table_to_split', 'id');
 \c - postgres - :master_port
 SELECT public.wait_for_resource_cleanup();
 SELECT operation_id, object_type, object_name, node_group_id, policy_type
- FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name;
+ FROM pg_dist_cleanup where operation_id = 777 ORDER BY object_name, node_group_id;
 \c - - - :worker_2_proxy_port
 SET search_path TO "citus_failure_split_cleanup_schema", public, pg_catalog;

From 5903196020ed3f444d70d76dca889bab35c756c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCrkan=20=C4=B0ndibay?=
Date: Wed, 1 Nov 2023 18:52:22 +0300
Subject: [PATCH 4/6] Removes use-base-schedule flag from CI (#7301)

Normally, tests that are written to be independent of other tests can use
the minimal schedules, and they should. However, in our CI settings the
base schedule was being used, which may cause unnecessary dependencies and
hence unrelated errors that developers don't see in their local
environment. With this change, the default becomes the minimal schedule,
so that tests are free of unnecessary dependencies.
---
 .github/workflows/build_and_test.yml       | 2 +-
 .github/workflows/flaky_test_debugging.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d900fe867..804cd0bb9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -505,7 +505,7 @@ jobs:
           for test in "${tests_array[@]}"
           do
             test_name=$(echo "$test" | sed -r "s/.+\/(.+)\..+/\1/")
-            gosu circleci src/test/regress/citus_tests/run_test.py $test_name --repeat ${{ env.runs }} --use-base-schedule --use-whole-schedule-line
+            gosu circleci src/test/regress/citus_tests/run_test.py $test_name --repeat ${{ env.runs }} --use-whole-schedule-line
           done
         shell: bash
       - uses: "./.github/actions/save_logs_and_results"

diff --git a/.github/workflows/flaky_test_debugging.yml b/.github/workflows/flaky_test_debugging.yml
index a666c1cd5..a744edc3b 100644
--- a/.github/workflows/flaky_test_debugging.yml
+++ b/.github/workflows/flaky_test_debugging.yml
@@ -71,7 +71,7 @@ jobs:
       - uses: "./.github/actions/setup_extension"
       - name: Run minimal tests
         run: |-
-          gosu circleci src/test/regress/citus_tests/run_test.py ${{ env.test }} --repeat ${{ env.runs }} --use-base-schedule --use-whole-schedule-line
+          gosu circleci src/test/regress/citus_tests/run_test.py ${{ env.test }} --repeat ${{ env.runs }} --use-whole-schedule-line
         shell: bash
       - uses: "./.github/actions/save_logs_and_results"
         if: always()

From e3c93c303dec623f6e196ca8f2ca1d1a20c51e6c Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio
Date: Wed, 1 Nov 2023 17:21:12 +0100
Subject: [PATCH 5/6] Fix flaky citus_non_blocking_split_shard_cleanup (#7311)

Sometimes in CI citus_non_blocking_split_shard_cleanup failed like this:

```diff
--- /__w/citus/citus/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out.modified 2023-11-01 15:07:14.280551207 +0000
+++ /__w/citus/citus/src/test/regress/results/citus_non_blocking_split_shard_cleanup.out.modified 2023-11-01 15:07:14.292551358 +0000
@@ -106,21 +106,22 @@
-----------------------------------
(1 row)
\c - - - :worker_2_port
SET search_path TO "citus_split_test_schema";
-- Replication slots should be cleaned up
SELECT slot_name FROM pg_replication_slots;
 slot_name
---------------------------------
-(0 rows)
+ citus_shard_split_slot_19_10_17
+(1 row)
-- Publications should be cleanedup
SELECT count(*) FROM pg_publication;
 count
```

It's expected that the replication slot is sometimes not cleaned up yet if
we don't wait until resource cleanup completes. This PR makes the test wait
for that cleanup.
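As an illustrative sketch (not part of the patch itself), the idea is simply to block on Citus's deferred resource cleanup before inspecting the worker, using the same regression-test helper that appears in the diff below:

```sql
-- Wait until background cleanup of shard-split resources has finished,
-- so the replication-slot check can no longer race with it.
SELECT public.wait_for_resource_cleanup();

\c - - - :worker_2_port
-- With the wait in place, this is expected to return 0 rows.
SELECT slot_name FROM pg_replication_slots;
```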
---
 .../expected/citus_non_blocking_split_shard_cleanup.out    | 6 ++++++
 .../regress/sql/citus_non_blocking_split_shard_cleanup.sql | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out b/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out
index e2685c2d7..a559ec442 100644
--- a/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out
+++ b/src/test/regress/expected/citus_non_blocking_split_shard_cleanup.out
@@ -107,6 +107,12 @@ SELECT pg_catalog.citus_split_shard_by_split_points(
 (1 row)
+SELECT public.wait_for_resource_cleanup();
+ wait_for_resource_cleanup
+---------------------------------------------------------------------
+
+(1 row)
+
 \c - - - :worker_2_port
 SET search_path TO "citus_split_test_schema";
 -- Replication slots should be cleaned up

diff --git a/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql b/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql
index ba3f95215..480d81b88 100644
--- a/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql
+++ b/src/test/regress/sql/citus_non_blocking_split_shard_cleanup.sql
@@ -79,6 +79,8 @@ SELECT pg_catalog.citus_split_shard_by_split_points(
 ARRAY[:worker_2_node, :worker_2_node, :worker_2_node],
 'force_logical');
+SELECT public.wait_for_resource_cleanup();
+
 \c - - - :worker_2_port
 SET search_path TO "citus_split_test_schema";
 -- Replication slots should be cleaned up

From 2cf4c0402319a9616e4d0feb4d9273757b3c1eaf Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Thu, 2 Nov 2023 01:59:41 +0300
Subject: [PATCH 6/6] Fix flaky global_cancel.sql test (#7316)

---
 src/test/regress/expected/global_cancel.out | 10 ++++++++--
 src/test/regress/sql/global_cancel.sql      |  6 ++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/test/regress/expected/global_cancel.out b/src/test/regress/expected/global_cancel.out
index 5adeef3c8..e5ce4fbc6 100644
--- a/src/test/regress/expected/global_cancel.out
+++ b/src/test/regress/expected/global_cancel.out
@@ -9,9 +9,14 @@ SELECT 1 FROM master_add_node('localhost', :master_port, groupid => 0);
 RESET client_min_messages;
 -- Kill maintenance daemon so it gets restarted and gets a gpid containing our
 -- nodeid
-SELECT pg_terminate_backend(pid)
+SELECT COUNT(pg_terminate_backend(pid)) >= 0
 FROM pg_stat_activity
-WHERE application_name = 'Citus Maintenance Daemon' \gset
+WHERE application_name = 'Citus Maintenance Daemon';
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
 -- reconnect to make sure we get a session with the gpid containing our nodeid
 \c - - - -
 CREATE SCHEMA global_cancel;
@@ -77,6 +82,7 @@ ERROR: must be a superuser to terminate superuser process
 SELECT pg_cancel_backend(citus_backend_gpid());
 ERROR: canceling statement due to user request
 \c - postgres - :master_port
+DROP USER global_cancel_user;
 SET client_min_messages TO DEBUG;
 -- 10000000000 is the node id multiplier for global pid
 SELECT pg_cancel_backend(10000000000 * citus_coordinator_nodeid() + 0);

diff --git a/src/test/regress/sql/global_cancel.sql b/src/test/regress/sql/global_cancel.sql
index 848c3b01a..12330baf2 100644
--- a/src/test/regress/sql/global_cancel.sql
+++ b/src/test/regress/sql/global_cancel.sql
@@ -5,9 +5,9 @@ RESET client_min_messages;
 -- Kill maintenance daemon so it gets restarted and gets a gpid containing our
 -- nodeid
-SELECT pg_terminate_backend(pid)
+SELECT COUNT(pg_terminate_backend(pid)) >= 0
 FROM pg_stat_activity
-WHERE application_name = 'Citus Maintenance Daemon' \gset
+WHERE application_name = 'Citus Maintenance Daemon';
 -- reconnect to make sure we get a session with the gpid containing our nodeid
 \c - - - -
@@ -58,6 +58,8 @@ SELECT pg_cancel_backend(citus_backend_gpid());
 \c - postgres - :master_port
+DROP USER global_cancel_user;
+
 SET client_min_messages TO DEBUG;
 -- 10000000000 is the node id multiplier for global pid