Adds test failure_pg15.sql for duplicate error message cases

Each of the following tests: failure_ddl.sql, failure_truncate.sql,
failure_multi_dml.sql, failure_vacuum.sql has a part with
alternative output for PG15, resulting from the removal of duplicate
error messages
This test file has been created to avoid 4 alternative output files

Relevant PG commit:
618c16707a6d6e8f5c83ede2092975e4670201ad
naisila/failure_pg15
naisila 2022-08-07 22:29:25 +03:00
parent 8d087f3e63
commit 397ef6e4e1
12 changed files with 848 additions and 271 deletions

View File

@ -175,55 +175,29 @@ SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORD
(localhost,57637,100803,t,"{key,new_column,value}")
(4 rows)
-- the following tests rely the column not exists, so drop manually
ALTER TABLE test_table DROP COLUMN new_column;
-- but now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ALTER TABLE test_table ADD COLUMN new_column INT;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET client_min_messages TO ERROR;
SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
array_agg
---------------------------------------------------------------------
{key,new_column,value}
(1 row)
SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
run_command_on_placements
---------------------------------------------------------------------
(localhost,9060,100800,t,"{key,new_column,value}")
(localhost,9060,100802,t,"{key,new_column,value}")
(localhost,57637,100801,t,"{key,new_column,value}")
(localhost,57637,100803,t,"{key,new_column,value}")
(4 rows)
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
-- -- the following tests rely the column not exists, so drop manually
-- ALTER TABLE test_table DROP COLUMN new_column;
-- -- but now kill just after the worker sends response to
-- -- COMMIT command, so we'll have lots of warnings but the command
-- -- should have been committed both on the distributed table and the placements
-- SET client_min_messages TO WARNING;
-- SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
-- ALTER TABLE test_table ADD COLUMN new_column INT;
-- SELECT citus.mitmproxy('conn.allow()');
-- SET client_min_messages TO ERROR;
-- SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
-- SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
-- now cancel just after the worker sends response to
-- but Postgres doesn't accept interrupts during COMMIT and ROLLBACK
-- so should not cancel at all, so not an effective test but adding in
-- case Citus messes up this behaviour
SET client_min_messages TO ERROR;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").cancel(' || pg_backend_pid() || ')');
mitmproxy
---------------------------------------------------------------------

View File

@ -358,76 +358,44 @@ SELECT * FROM dml_test ORDER BY id ASC;
5 | Epsilon
(4 rows)
-- drop table and recreate with different replication/sharding
DROP TABLE dml_test;
SET citus.shard_count = 1;
SET citus.shard_replication_factor = 2; -- two placements
CREATE TABLE dml_test (id integer, name text);
SELECT create_distributed_table('dml_test', 'id');
create_distributed_table
---------------------------------------------------------------------
(1 row)
COPY dml_test FROM STDIN WITH CSV;
---- test multiple statements against a single shard, but with two placements
-- fail at PREPARED COMMIT as we use 2PC
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
BEGIN;
DELETE FROM dml_test WHERE id = 1;
DELETE FROM dml_test WHERE id = 2;
INSERT INTO dml_test VALUES (5, 'Epsilon');
UPDATE dml_test SET name = 'alpha' WHERE id = 1;
UPDATE dml_test SET name = 'gamma' WHERE id = 3;
COMMIT;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
-- all changes should be committed because we injected
-- the failure on the COMMIT time. And, we should not
-- mark any placements as INVALID
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT recover_prepared_transactions();
recover_prepared_transactions
---------------------------------------------------------------------
1
(1 row)
SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
shardid
---------------------------------------------------------------------
(0 rows)
SET citus.task_assignment_policy TO "round-robin";
SELECT * FROM dml_test ORDER BY id ASC;
id | name
---------------------------------------------------------------------
3 | gamma
4 | Delta
5 | Epsilon
(3 rows)
SELECT * FROM dml_test ORDER BY id ASC;
id | name
---------------------------------------------------------------------
3 | gamma
4 | Delta
5 | Epsilon
(3 rows)
RESET citus.task_assignment_policy;
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
-- -- drop table and recreate with different replication/sharding
-- DROP TABLE dml_test;
-- SET citus.shard_count = 1;
-- SET citus.shard_replication_factor = 2; -- two placements
-- CREATE TABLE dml_test (id integer, name text);
-- SELECT create_distributed_table('dml_test', 'id');
-- COPY dml_test FROM STDIN WITH CSV;
-- 1,Alpha
-- 2,Beta
-- 3,Gamma
-- 4,Delta
-- \.
-- -- test multiple statements against a single shard, but with two placements
-- -- fail at PREPARED COMMIT as we use 2PC
-- SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
-- BEGIN;
-- DELETE FROM dml_test WHERE id = 1;
-- DELETE FROM dml_test WHERE id = 2;
-- INSERT INTO dml_test VALUES (5, 'Epsilon');
-- UPDATE dml_test SET name = 'alpha' WHERE id = 1;
-- UPDATE dml_test SET name = 'gamma' WHERE id = 3;
-- COMMIT;
-- -- all changes should be committed because we injected
-- -- the failure on the COMMIT time. And, we should not
-- -- mark any placements as INVALID
-- SELECT citus.mitmproxy('conn.allow()');
-- SELECT recover_prepared_transactions();
-- SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
-- SET citus.task_assignment_policy TO "round-robin";
-- SELECT * FROM dml_test ORDER BY id ASC;
-- SELECT * FROM dml_test ORDER BY id ASC;
-- RESET citus.task_assignment_policy;
-- drop table and recreate as reference table
DROP TABLE dml_test;
SET citus.shard_count = 2;

View File

@ -0,0 +1,254 @@
--
-- FAILURE_PG15
--
-- Each of the following tests: failure_ddl.sql, failure_truncate.sql
-- failure_multi_dml.sql, failure_vacuum.sql
-- has a part with alternative output for PG15 resulting
-- from removal of duplicate error messages
-- Relevant PG commit: 618c16707a6d6e8f5c83ede2092975e4670201ad
-- This test file has been created to avoid 4 alternative output files
CREATE SCHEMA pg15_failure;
SET citus.force_max_query_parallelization TO ON;
SET search_path TO 'pg15_failure';
-- do not cache any connections
SET citus.max_cached_conns_per_worker TO 0;
-- we don't want to see the prepared transaction numbers in the warnings
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET citus.next_shard_id TO 100700;
-- we'll start with replication factor 1, 2PC and parallel mode
SET citus.shard_count = 4;
SET citus.shard_replication_factor = 1;
CREATE TABLE test_table (key int, value int);
SELECT create_distributed_table('test_table', 'key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- from failure_ddl.sql
-- but now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ALTER TABLE test_table ADD COLUMN new_column INT;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET client_min_messages TO ERROR;
SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
array_agg
---------------------------------------------------------------------
{key,new_column,value}
(1 row)
SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
run_command_on_placements
---------------------------------------------------------------------
(localhost,9060,100700,t,"{key,new_column,value}")
(localhost,9060,100702,t,"{key,new_column,value}")
(localhost,57637,100701,t,"{key,new_column,value}")
(localhost,57637,100703,t,"{key,new_column,value}")
(4 rows)
-- the following tests rely the column not exists, so drop manually
ALTER TABLE test_table DROP COLUMN new_column;
-- from failure_truncate.sql
CREATE VIEW unhealthy_shard_count AS
SELECT count(*)
FROM pg_dist_shard_placement pdsp
JOIN
pg_dist_shard pds
ON pdsp.shardid=pds.shardid
WHERE logicalrelid='pg15_failure.test_table'::regclass AND shardstate != 1;
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
SET client_min_messages TO WARNING;
-- now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
TRUNCATE test_table;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM unhealthy_shard_count;
count
---------------------------------------------------------------------
0
(1 row)
SELECT count(*) FROM test_table;
count
---------------------------------------------------------------------
0
(1 row)
-- from failure_multi_dml.sql
SET citus.shard_count = 1;
SET citus.shard_replication_factor = 2; -- two placements
CREATE TABLE dml_test (id integer, name text);
SELECT create_distributed_table('dml_test', 'id');
create_distributed_table
---------------------------------------------------------------------
(1 row)
COPY dml_test FROM STDIN WITH CSV;
---- test multiple statements against a single shard, but with two placements
-- fail at PREPARED COMMIT as we use 2PC
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
BEGIN;
DELETE FROM dml_test WHERE id = 1;
DELETE FROM dml_test WHERE id = 2;
INSERT INTO dml_test VALUES (5, 'Epsilon');
UPDATE dml_test SET name = 'alpha' WHERE id = 1;
UPDATE dml_test SET name = 'gamma' WHERE id = 3;
COMMIT;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
-- all changes should be committed because we injected
-- the failure on the COMMIT time. And, we should not
-- mark any placements as INVALID
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT recover_prepared_transactions();
recover_prepared_transactions
---------------------------------------------------------------------
1
(1 row)
SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
shardid
---------------------------------------------------------------------
(0 rows)
SET citus.task_assignment_policy TO "round-robin";
SELECT * FROM dml_test ORDER BY id ASC;
id | name
---------------------------------------------------------------------
3 | gamma
4 | Delta
5 | Epsilon
(3 rows)
SELECT * FROM dml_test ORDER BY id ASC;
id | name
---------------------------------------------------------------------
3 | gamma
4 | Delta
5 | Epsilon
(3 rows)
RESET citus.task_assignment_policy;
-- from failure_vacuum.sql
CREATE TABLE vacuum_test (key int, value int);
SELECT create_distributed_table('vacuum_test', 'key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
SELECT citus.clear_network_traffic();
clear_network_traffic
---------------------------------------------------------------------
(1 row)
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
VACUUM vacuum_test;
ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ANALYZE vacuum_test;
ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ANALYZE vacuum_test;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT recover_prepared_transactions();
recover_prepared_transactions
---------------------------------------------------------------------
1
(1 row)
-- ANALYZE transactions being critical is an open question, see #2430
-- show that we never mark as INVALID on COMMIT FAILURE
SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
shardid | shardstate
---------------------------------------------------------------------
(0 rows)
-- Clean up
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
DROP SCHEMA pg15_failure CASCADE;

View File

@ -0,0 +1,266 @@
--
-- FAILURE_PG15
--
-- Each of the following tests: failure_ddl.sql, failure_truncate.sql
-- failure_multi_dml.sql, failure_vacuum.sql
-- has a part with alternative output for PG15 resulting
-- from removal of duplicate error messages
-- Relevant PG commit: 618c16707a6d6e8f5c83ede2092975e4670201ad
-- This test file has been created to avoid 4 alternative output files
CREATE SCHEMA pg15_failure;
SET citus.force_max_query_parallelization TO ON;
SET search_path TO 'pg15_failure';
-- do not cache any connections
SET citus.max_cached_conns_per_worker TO 0;
-- we don't want to see the prepared transaction numbers in the warnings
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET citus.next_shard_id TO 100700;
-- we'll start with replication factor 1, 2PC and parallel mode
SET citus.shard_count = 4;
SET citus.shard_replication_factor = 1;
CREATE TABLE test_table (key int, value int);
SELECT create_distributed_table('test_table', 'key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- from failure_ddl.sql
-- but now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ALTER TABLE test_table ADD COLUMN new_column INT;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SET client_min_messages TO ERROR;
SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
array_agg
---------------------------------------------------------------------
{key,new_column,value}
(1 row)
SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
run_command_on_placements
---------------------------------------------------------------------
(localhost,9060,100700,t,"{key,new_column,value}")
(localhost,9060,100702,t,"{key,new_column,value}")
(localhost,57637,100701,t,"{key,new_column,value}")
(localhost,57637,100703,t,"{key,new_column,value}")
(4 rows)
-- the following tests rely the column not exists, so drop manually
ALTER TABLE test_table DROP COLUMN new_column;
-- from failure_truncate.sql
CREATE VIEW unhealthy_shard_count AS
SELECT count(*)
FROM pg_dist_shard_placement pdsp
JOIN
pg_dist_shard pds
ON pdsp.shardid=pds.shardid
WHERE logicalrelid='pg15_failure.test_table'::regclass AND shardstate != 1;
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
SET client_min_messages TO WARNING;
-- now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
TRUNCATE test_table;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM unhealthy_shard_count;
count
---------------------------------------------------------------------
0
(1 row)
SELECT count(*) FROM test_table;
count
---------------------------------------------------------------------
0
(1 row)
-- from failure_multi_dml.sql
SET citus.shard_count = 1;
SET citus.shard_replication_factor = 2; -- two placements
CREATE TABLE dml_test (id integer, name text);
SELECT create_distributed_table('dml_test', 'id');
create_distributed_table
---------------------------------------------------------------------
(1 row)
COPY dml_test FROM STDIN WITH CSV;
---- test multiple statements against a single shard, but with two placements
-- fail at PREPARED COMMIT as we use 2PC
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
BEGIN;
DELETE FROM dml_test WHERE id = 1;
DELETE FROM dml_test WHERE id = 2;
INSERT INTO dml_test VALUES (5, 'Epsilon');
UPDATE dml_test SET name = 'alpha' WHERE id = 1;
UPDATE dml_test SET name = 'gamma' WHERE id = 3;
COMMIT;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
-- all changes should be committed because we injected
-- the failure on the COMMIT time. And, we should not
-- mark any placements as INVALID
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT recover_prepared_transactions();
recover_prepared_transactions
---------------------------------------------------------------------
1
(1 row)
SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
shardid
---------------------------------------------------------------------
(0 rows)
SET citus.task_assignment_policy TO "round-robin";
SELECT * FROM dml_test ORDER BY id ASC;
id | name
---------------------------------------------------------------------
3 | gamma
4 | Delta
5 | Epsilon
(3 rows)
SELECT * FROM dml_test ORDER BY id ASC;
id | name
---------------------------------------------------------------------
3 | gamma
4 | Delta
5 | Epsilon
(3 rows)
RESET citus.task_assignment_policy;
-- from failure_vacuum.sql
CREATE TABLE vacuum_test (key int, value int);
SELECT create_distributed_table('vacuum_test', 'key');
create_distributed_table
---------------------------------------------------------------------
(1 row)
SELECT citus.clear_network_traffic();
clear_network_traffic
---------------------------------------------------------------------
(1 row)
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
VACUUM vacuum_test;
ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ANALYZE vacuum_test;
ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ANALYZE vacuum_test;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT recover_prepared_transactions();
recover_prepared_transactions
---------------------------------------------------------------------
1
(1 row)
-- ANALYZE transactions being critical is an open question, see #2430
-- show that we never mark as INVALID on COMMIT FAILURE
SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
shardid | shardstate
---------------------------------------------------------------------
(0 rows)
-- Clean up
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
DROP SCHEMA pg15_failure CASCADE;

View File

@ -266,47 +266,23 @@ SELECT count(*) FROM test_table;
-- refill the table
TRUNCATE test_table;
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
SET client_min_messages TO WARNING;
-- now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
TRUNCATE test_table;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT * FROM unhealthy_shard_count;
count
---------------------------------------------------------------------
0
(1 row)
SELECT count(*) FROM test_table;
count
---------------------------------------------------------------------
0
(1 row)
SET client_min_messages TO ERROR;
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
-- SET client_min_messages TO WARNING;
-- -- now kill just after the worker sends response to
-- -- COMMIT command, so we'll have lots of warnings but the command
-- -- should have been committed both on the distributed table and the placements
-- SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
-- TRUNCATE test_table;
-- SELECT citus.mitmproxy('conn.allow()');
-- SELECT * FROM unhealthy_shard_count;
-- SELECT count(*) FROM test_table;
-- SET client_min_messages TO ERROR;
-- INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
-- now cancel just after the worker sends response to
-- but Postgres doesn't accept interrupts during COMMIT and ROLLBACK
-- so should not cancel at all, so not an effective test but adding in

View File

@ -23,54 +23,24 @@ SELECT citus.clear_network_traffic();
(1 row)
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
VACUUM vacuum_test;
ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ANALYZE vacuum_test;
ERROR: connection to the remote node localhost:xxxxx failed with the following error: connection not open
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
mitmproxy
---------------------------------------------------------------------
(1 row)
ANALYZE vacuum_test;
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
WARNING: failed to commit transaction on localhost:xxxxx
WARNING: connection not open
CONTEXT: while executing command on localhost:xxxxx
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
---------------------------------------------------------------------
(1 row)
SELECT recover_prepared_transactions();
recover_prepared_transactions
---------------------------------------------------------------------
1
(1 row)
-- ANALYZE transactions being critical is an open question, see #2430
-- show that we never mark as INVALID on COMMIT FAILURE
SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
shardid | shardstate
---------------------------------------------------------------------
(0 rows)
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
-- SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
-- VACUUM vacuum_test;
-- SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
-- ANALYZE vacuum_test;
-- SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
-- ANALYZE vacuum_test;
-- SELECT citus.mitmproxy('conn.allow()');
-- SELECT recover_prepared_transactions();
-- -- ANALYZE transactions being critical is an open question, see #2430
-- -- show that we never mark as INVALID on COMMIT FAILURE
-- SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
-- shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
-- the same tests with cancel
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").cancel(' || pg_backend_pid() || ')');
mitmproxy

View File

@ -25,6 +25,7 @@ test: failure_cte_subquery
test: failure_insert_select_via_coordinator
test: failure_multi_dml
test: failure_vacuum
test: failure_pg15
test: failure_ref_tables
test: failure_insert_select_pushdown
test: failure_single_mod

View File

@ -83,26 +83,34 @@ SELECT citus.mitmproxy('conn.allow()');
SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
-- the following tests rely on the column not existing, so drop it manually
ALTER TABLE test_table DROP COLUMN new_column;
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
-- but now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
ALTER TABLE test_table ADD COLUMN new_column INT;
SELECT citus.mitmproxy('conn.allow()');
-- -- the following tests rely the column not exists, so drop manually
-- ALTER TABLE test_table DROP COLUMN new_column;
SET client_min_messages TO ERROR;
-- -- but now kill just after the worker sends response to
-- -- COMMIT command, so we'll have lots of warnings but the command
-- -- should have been committed both on the distributed table and the placements
-- SET client_min_messages TO WARNING;
-- SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
-- ALTER TABLE test_table ADD COLUMN new_column INT;
-- SELECT citus.mitmproxy('conn.allow()');
SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
-- SET client_min_messages TO ERROR;
-- SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
-- SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
-- now cancel just after the worker sends response to
-- but Postgres doesn't accept interrupts during COMMIT and ROLLBACK
-- so should not cancel at all, so not an effective test but adding in
-- case Citus messes up this behaviour
SET client_min_messages TO ERROR;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").cancel(' || pg_backend_pid() || ')');
ALTER TABLE test_table DROP COLUMN new_column;
SELECT citus.mitmproxy('conn.allow()');

View File

@ -192,46 +192,53 @@ COMMIT;
-- should see changes, because cancellation is ignored
SELECT * FROM dml_test ORDER BY id ASC;
-- drop table and recreate with different replication/sharding
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
DROP TABLE dml_test;
SET citus.shard_count = 1;
SET citus.shard_replication_factor = 2; -- two placements
-- -- drop table and recreate with different replication/sharding
CREATE TABLE dml_test (id integer, name text);
SELECT create_distributed_table('dml_test', 'id');
-- DROP TABLE dml_test;
-- SET citus.shard_count = 1;
-- SET citus.shard_replication_factor = 2; -- two placements
COPY dml_test FROM STDIN WITH CSV;
1,Alpha
2,Beta
3,Gamma
4,Delta
\.
-- CREATE TABLE dml_test (id integer, name text);
-- SELECT create_distributed_table('dml_test', 'id');
---- test multiple statements against a single shard, but with two placements
-- COPY dml_test FROM STDIN WITH CSV;
-- 1,Alpha
-- 2,Beta
-- 3,Gamma
-- 4,Delta
-- \.
-- fail at PREPARED COMMIT as we use 2PC
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
-- -- test multiple statements against a single shard, but with two placements
BEGIN;
DELETE FROM dml_test WHERE id = 1;
DELETE FROM dml_test WHERE id = 2;
INSERT INTO dml_test VALUES (5, 'Epsilon');
UPDATE dml_test SET name = 'alpha' WHERE id = 1;
UPDATE dml_test SET name = 'gamma' WHERE id = 3;
COMMIT;
-- -- fail at PREPARED COMMIT as we use 2PC
-- SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
-- all changes should be committed because we injected
-- the failure on the COMMIT time. And, we should not
-- mark any placements as INVALID
SELECT citus.mitmproxy('conn.allow()');
SELECT recover_prepared_transactions();
SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
-- BEGIN;
-- DELETE FROM dml_test WHERE id = 1;
-- DELETE FROM dml_test WHERE id = 2;
-- INSERT INTO dml_test VALUES (5, 'Epsilon');
-- UPDATE dml_test SET name = 'alpha' WHERE id = 1;
-- UPDATE dml_test SET name = 'gamma' WHERE id = 3;
-- COMMIT;
SET citus.task_assignment_policy TO "round-robin";
SELECT * FROM dml_test ORDER BY id ASC;
SELECT * FROM dml_test ORDER BY id ASC;
RESET citus.task_assignment_policy;
-- -- all changes should be committed because we injected
-- -- the failure on the COMMIT time. And, we should not
-- -- mark any placements as INVALID
-- SELECT citus.mitmproxy('conn.allow()');
-- SELECT recover_prepared_transactions();
-- SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
-- SET citus.task_assignment_policy TO "round-robin";
-- SELECT * FROM dml_test ORDER BY id ASC;
-- SELECT * FROM dml_test ORDER BY id ASC;
-- RESET citus.task_assignment_policy;
-- drop table and recreate as reference table
DROP TABLE dml_test;

View File

@ -0,0 +1,139 @@
--
-- FAILURE_PG15
--
-- Each of the following tests: failure_ddl.sql, failure_truncate.sql
-- failure_multi_dml.sql, failure_vacuum.sql
-- has a part with alternative output for PG15 resulting
-- from removal of duplicate error messages
-- Relevant PG commit: 618c16707a6d6e8f5c83ede2092975e4670201ad
-- This test file has been created to avoid 4 alternative output files
CREATE SCHEMA pg15_failure;
SET citus.force_max_query_parallelization TO ON;
SET search_path TO 'pg15_failure';
-- do not cache any connections
SET citus.max_cached_conns_per_worker TO 0;
-- we don't want to see the prepared transaction numbers in the warnings
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.allow()');
SET citus.next_shard_id TO 100700;
-- we'll start with replication factor 1, 2PC and parallel mode
SET citus.shard_count = 4;
SET citus.shard_replication_factor = 1;
CREATE TABLE test_table (key int, value int);
SELECT create_distributed_table('test_table', 'key');
-- from failure_ddl.sql
-- but now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SET client_min_messages TO WARNING;
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
ALTER TABLE test_table ADD COLUMN new_column INT;
SELECT citus.mitmproxy('conn.allow()');
SET client_min_messages TO ERROR;
SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass;
SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1;
-- the following tests rely on the column not existing, so drop it manually
ALTER TABLE test_table DROP COLUMN new_column;
-- from failure_truncate.sql
CREATE VIEW unhealthy_shard_count AS
SELECT count(*)
FROM pg_dist_shard_placement pdsp
JOIN
pg_dist_shard pds
ON pdsp.shardid=pds.shardid
WHERE logicalrelid='pg15_failure.test_table'::regclass AND shardstate != 1;
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
SET client_min_messages TO WARNING;
-- now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
TRUNCATE test_table;
SELECT citus.mitmproxy('conn.allow()');
SELECT * FROM unhealthy_shard_count;
SELECT count(*) FROM test_table;
-- from failure_multi_dml.sql
SET citus.shard_count = 1;
SET citus.shard_replication_factor = 2; -- two placements
CREATE TABLE dml_test (id integer, name text);
SELECT create_distributed_table('dml_test', 'id');
COPY dml_test FROM STDIN WITH CSV;
1,Alpha
2,Beta
3,Gamma
4,Delta
\.
---- test multiple statements against a single shard, but with two placements
-- fail at PREPARED COMMIT as we use 2PC
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
BEGIN;
DELETE FROM dml_test WHERE id = 1;
DELETE FROM dml_test WHERE id = 2;
INSERT INTO dml_test VALUES (5, 'Epsilon');
UPDATE dml_test SET name = 'alpha' WHERE id = 1;
UPDATE dml_test SET name = 'gamma' WHERE id = 3;
COMMIT;
-- all changes should be committed because we injected
-- the failure on the COMMIT time. And, we should not
-- mark any placements as INVALID
SELECT citus.mitmproxy('conn.allow()');
SELECT recover_prepared_transactions();
SELECT shardid FROM pg_dist_shard_placement WHERE shardstate = 3;
SET citus.task_assignment_policy TO "round-robin";
SELECT * FROM dml_test ORDER BY id ASC;
SELECT * FROM dml_test ORDER BY id ASC;
RESET citus.task_assignment_policy;
-- from failure_vacuum.sql
CREATE TABLE vacuum_test (key int, value int);
SELECT create_distributed_table('vacuum_test', 'key');
SELECT citus.clear_network_traffic();
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
VACUUM vacuum_test;
SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
ANALYZE vacuum_test;
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
ANALYZE vacuum_test;
SELECT citus.mitmproxy('conn.allow()');
SELECT recover_prepared_transactions();
-- ANALYZE transactions being critical is an open question, see #2430
-- show that we never mark as INVALID on COMMIT FAILURE
SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
-- Clean up
SELECT citus.mitmproxy('conn.allow()');
DROP SCHEMA pg15_failure CASCADE;

View File

@ -103,18 +103,25 @@ SELECT count(*) FROM test_table;
TRUNCATE test_table;
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
SET client_min_messages TO WARNING;
-- now kill just after the worker sends response to
-- COMMIT command, so we'll have lots of warnings but the command
-- should have been committed both on the distributed table and the placements
SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
TRUNCATE test_table;
SELECT citus.mitmproxy('conn.allow()');
SELECT * FROM unhealthy_shard_count;
SELECT count(*) FROM test_table;
SET client_min_messages TO ERROR;
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
-- SET client_min_messages TO WARNING;
-- -- now kill just after the worker sends response to
-- -- COMMIT command, so we'll have lots of warnings but the command
-- -- should have been committed both on the distributed table and the placements
-- SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()');
-- TRUNCATE test_table;
-- SELECT citus.mitmproxy('conn.allow()');
-- SELECT * FROM unhealthy_shard_count;
-- SELECT count(*) FROM test_table;
-- SET client_min_messages TO ERROR;
-- INSERT INTO test_table SELECT x,x FROM generate_series(1,20) as f(x);
-- now cancel just after the worker sends response to
-- but Postgres doesn't accept interrupts during COMMIT and ROLLBACK

View File

@ -14,22 +14,29 @@ SELECT create_distributed_table('vacuum_test', 'key');
SELECT citus.clear_network_traffic();
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
VACUUM vacuum_test;
-- Commenting out the following test since it has an output with no
-- duplicate error messages in PG15
-- To avoid adding alternative output file for this test, this
-- part is moved to failure_pg15.sql file.
-- Uncomment the following part when we drop support for PG14
-- and we delete failure_pg15.sql file.
SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
ANALYZE vacuum_test;
-- SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").kill()');
-- VACUUM vacuum_test;
SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
ANALYZE vacuum_test;
-- SELECT citus.mitmproxy('conn.onQuery(query="^ANALYZE").kill()');
-- ANALYZE vacuum_test;
SELECT citus.mitmproxy('conn.allow()');
SELECT recover_prepared_transactions();
-- SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()');
-- ANALYZE vacuum_test;
-- ANALYZE transactions being critical is an open question, see #2430
-- show that we never mark as INVALID on COMMIT FAILURE
SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
-- SELECT citus.mitmproxy('conn.allow()');
-- SELECT recover_prepared_transactions();
-- -- ANALYZE transactions being critical is an open question, see #2430
-- -- show that we never mark as INVALID on COMMIT FAILURE
-- SELECT shardid, shardstate FROM pg_dist_shard_placement where shardstate != 1 AND
-- shardid in ( SELECT shardid FROM pg_dist_shard WHERE logicalrelid = 'vacuum_test'::regclass);
-- the same tests with cancel
SELECT citus.mitmproxy('conn.onQuery(query="^VACUUM").cancel(' || pg_backend_pid() || ')');