mirror of https://github.com/citusdata/citus.git
Fix flakyness in failure_insert_select_repartition (#6202)
This fixes our most commonly randomly failing failure test. The failing
diff is as follows:
```diff
SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()');
mitmproxy
-----------
(1 row)
INSERT INTO target_table SELECT * FROM source_table;
-ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
+ERROR: could not open file "base/pgsql_job_cache/10_0_40/repartitioned_results_20770193413_from_4213590_to_1.data": No such file or directory
+CONTEXT: while executing command on localhost:9060
+while executing command on localhost:57637
SELECT * FROM target_table ORDER BY a;
```
As far as I can tell this is the cause of a race condition: After killing
fetch_intermediate_results on worker 9060, the previously created data
file gets cleaned up. The fetch_intermediate_results call that's sent
to worker 57637 will be cancelled and rolled back soon because of the
failure on the other connection. But if that fetch_intermediate_results
call is able to connect to 9060 before it is cancelled, it won't find
the file it's looking for there anymore. So while it's not the error we
expect, it does indicate that we succeeded.
To avoid this issue instead of killing the fetch_intermediate_results
call directly, we kill the COPY command that it uses to do the fetch.
This results in stable output as can be seen here, where 227 runs of
failure_insert_select_repartition succeeded:
https://app.circleci.com/pipelines/github/citusdata/citus/26168/workflows/9c64a3b6-f46c-4725-9fb4-8f6a2d00a023/jobs/739389
To be clear this changes the test to affects the opposite
fetch_intermediate_results call. This kills the fetch_intermediate_results
call of worker 57637, instead of killing the fetch_intermediate_results call
on worker 9060.
Example of failing test: https://app.circleci.com/pipelines/github/citusdata/citus/26147/workflows/780e95ea-264a-4c9f-ad2e-cf11449a795e/jobs/738467
(cherry picked from commit 8ce12eb51f
)
pull/6363/head
parent
173b7ed5ad
commit
3d67ae6497
|
@ -89,36 +89,46 @@ SELECT * FROM target_table ORDER BY a;
|
|||
(10 rows)
|
||||
|
||||
--
|
||||
-- kill fetch_intermediate_results
|
||||
-- kill the COPY command that's created by fetch_intermediate_results
|
||||
-- this fails the fetch into target, so source replication doesn't matter
|
||||
-- and both should fail
|
||||
-- We don't kill the fetch_intermediate_results query directly, because that
|
||||
-- resulted in randomly failing tests on CI. The reason for that is that there
|
||||
-- is a race condition, where killing the fetch_intermediate_results query
|
||||
-- removes the data files before the fetch_intermediate_results query from the
|
||||
-- other node can read them. In theory a similar race condition still exists
|
||||
-- when killing the COPY, but CI doesn't hit that race condition in practice.
|
||||
--
|
||||
TRUNCATE target_table;
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()');
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()');
|
||||
mitmproxy
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO target_table SELECT * FROM source_table;
|
||||
ERROR: connection to the remote node localhost:xxxxx failed with the following error: server closed the connection unexpectedly
|
||||
ERROR: server closed the connection unexpectedly
|
||||
This probably means the server terminated abnormally
|
||||
before or while processing the request.
|
||||
CONTEXT: while executing command on localhost:xxxxx
|
||||
while executing command on localhost:xxxxx
|
||||
SELECT * FROM target_table ORDER BY a;
|
||||
a | b
|
||||
---------------------------------------------------------------------
|
||||
(0 rows)
|
||||
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()');
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()');
|
||||
mitmproxy
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO target_table SELECT * FROM replicated_source_table;
|
||||
ERROR: connection to the remote node localhost:xxxxx failed with the following error: server closed the connection unexpectedly
|
||||
ERROR: server closed the connection unexpectedly
|
||||
This probably means the server terminated abnormally
|
||||
before or while processing the request.
|
||||
CONTEXT: while executing command on localhost:xxxxx
|
||||
while executing command on localhost:xxxxx
|
||||
SELECT * FROM target_table ORDER BY a;
|
||||
a | b
|
||||
---------------------------------------------------------------------
|
||||
|
|
|
@ -47,19 +47,25 @@ INSERT INTO target_table SELECT * FROM replicated_source_table;
|
|||
SELECT * FROM target_table ORDER BY a;
|
||||
|
||||
--
|
||||
-- kill fetch_intermediate_results
|
||||
-- kill the COPY command that's created by fetch_intermediate_results
|
||||
-- this fails the fetch into target, so source replication doesn't matter
|
||||
-- and both should fail
|
||||
-- We don't kill the fetch_intermediate_results query directly, because that
|
||||
-- resulted in randomly failing tests on CI. The reason for that is that there
|
||||
-- is a race condition, where killing the fetch_intermediate_results query
|
||||
-- removes the data files before the fetch_intermediate_results query from the
|
||||
-- other node can read them. In theory a similar race condition still exists
|
||||
-- when killing the COPY, but CI doesn't hit that race condition in practice.
|
||||
--
|
||||
|
||||
TRUNCATE target_table;
|
||||
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()');
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()');
|
||||
INSERT INTO target_table SELECT * FROM source_table;
|
||||
|
||||
SELECT * FROM target_table ORDER BY a;
|
||||
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="fetch_intermediate_results").kill()');
|
||||
SELECT citus.mitmproxy('conn.onQuery(query="COPY").kill()');
|
||||
INSERT INTO target_table SELECT * FROM replicated_source_table;
|
||||
|
||||
SELECT * FROM target_table ORDER BY a;
|
||||
|
|
Loading…
Reference in New Issue