From 3d67ae649746c7a2a18cbeef129e19880ebbd0d6 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Fri, 19 Aug 2022 11:11:07 +0200 Subject: [PATCH] Fix flakyness in failure_insert_select_repartition (#6202) This fixes our most commonly randomly failing failure test. The failing diff is as follows: ```diff SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()'); mitmproxy ----------- (1 row) INSERT INTO target_table SELECT * FROM source_table; -ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open +ERROR: could not open file "base/pgsql_job_cache/10_0_40/repartitioned_results_20770193413_from_4213590_to_1.data": No such file or directory +CONTEXT: while executing command on localhost:9060 +while executing command on localhost:57637 SELECT * FROM target_table ORDER BY a; ``` As far as I can tell this is the cause of a race condition: After killing fetch_intermediate_results on worker 9060, the previously created data file gets cleaned up. The fetch_intermediate_results call that's sent to worker 57637 will be cancelled and rolled back soon because of the failure on the other connection. But if that fetch_intermediate_results call is able to connect to 9060 before it is cancelled, it won't find the file it's looking for there anymore. So while it's not the error we expect, it does indicate that we succeeded. To avoid this issue instead of killing the fetch_intermediate_results call directly, we kill the COPY command that it uses to do the fetch. This results in stable output as can be seen here, where 227 runs of failure_insert_select_repartition succeeded: https://app.circleci.com/pipelines/github/citusdata/citus/26168/workflows/9c64a3b6-f46c-4725-9fb4-8f6a2d00a023/jobs/739389 To be clear this changes the test to affects the opposite fetch_intermediate_results call. This kills the fetch_intermediate_results call of worker 57637, instead of killing the fetch_intermediate_results call on worker 9060. Example of failing test: https://app.circleci.com/pipelines/github/citusdata/citus/26147/workflows/780e95ea-264a-4c9f-ad2e-cf11449a795e/jobs/738467 (cherry picked from commit 8ce12eb51f374d85f521795ce5eda17822f852cb) --- .../failure_insert_select_repartition.out | 20 ++++++++++++++----- .../sql/failure_insert_select_repartition.sql | 12 ++++++++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/test/regress/expected/failure_insert_select_repartition.out b/src/test/regress/expected/failure_insert_select_repartition.out index c634cfff8..8ad740e45 100644 --- a/src/test/regress/expected/failure_insert_select_repartition.out +++ b/src/test/regress/expected/failure_insert_select_repartition.out @@ -89,36 +89,46 @@ SELECT * FROM target_table ORDER BY a; (10 rows) -- --- kill fetch_intermediate_results +-- kill the COPY command that's created by fetch_intermediate_results -- this fails the fetch into target, so source replication doesn't matter -- and both should fail +-- We don't kill the fetch_intermediate_results query directly, because that +-- resulted in randomly failing tests on CI. The reason for that is that there +-- is a race condition, where killing the fetch_intermediate_results query +-- removes the data files before the fetch_intermediate_results query from the +-- other node can read them. In theory a similar race condition still exists +-- when killing the COPY, but CI doesn't hit that race condition in practice. -- TRUNCATE target_table; -SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()'); +SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()'); mitmproxy --------------------------------------------------------------------- (1 row) INSERT INTO target_table SELECT * FROM source_table; -ERROR: connection to the remote node localhost:xxxxx failed with the following error: server closed the connection unexpectedly +ERROR: server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. +CONTEXT: while executing command on localhost:xxxxx +while executing command on localhost:xxxxx SELECT * FROM target_table ORDER BY a; a | b --------------------------------------------------------------------- (0 rows) -SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()'); +SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()'); mitmproxy --------------------------------------------------------------------- (1 row) INSERT INTO target_table SELECT * FROM replicated_source_table; -ERROR: connection to the remote node localhost:xxxxx failed with the following error: server closed the connection unexpectedly +ERROR: server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. +CONTEXT: while executing command on localhost:xxxxx +while executing command on localhost:xxxxx SELECT * FROM target_table ORDER BY a; a | b --------------------------------------------------------------------- diff --git a/src/test/regress/sql/failure_insert_select_repartition.sql b/src/test/regress/sql/failure_insert_select_repartition.sql index 0c65c2529..d61fbe9df 100644 --- a/src/test/regress/sql/failure_insert_select_repartition.sql +++ b/src/test/regress/sql/failure_insert_select_repartition.sql @@ -47,19 +47,25 @@ INSERT INTO target_table SELECT * FROM replicated_source_table; SELECT * FROM target_table ORDER BY a; -- --- kill fetch_intermediate_results +-- kill the COPY command that's created by fetch_intermediate_results -- this fails the fetch into target, so source replication doesn't matter -- and both should fail +-- We don't kill the fetch_intermediate_results query directly, because that +-- resulted in randomly failing tests on CI. The reason for that is that there +-- is a race condition, where killing the fetch_intermediate_results query +-- removes the data files before the fetch_intermediate_results query from the +-- other node can read them. In theory a similar race condition still exists +-- when killing the COPY, but CI doesn't hit that race condition in practice. -- TRUNCATE target_table; -SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()'); +SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()'); INSERT INTO target_table SELECT * FROM source_table; SELECT * FROM target_table ORDER BY a; -SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()'); +SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()'); INSERT INTO target_table SELECT * FROM replicated_source_table; SELECT * FROM target_table ORDER BY a;