From a446e71ee7454675defcb8ce7b99171c9c76bfda Mon Sep 17 00:00:00 2001 From: Onder Kalaci Date: Tue, 12 Jun 2018 11:25:23 +0300 Subject: [PATCH] Add failure testing for DDL commands This commit adds an extensive failure testing, which covers quite a bit of things and their combinations: - 1PC vs 2PC - Replication factor 1 and Replication factor 2 - Network failures and query cancellations - Sequential vs Parallel query execution mode --- src/test/regress/expected/failure_ddl.out | 1245 +++++++++++++++++++++ src/test/regress/failure_schedule | 3 + src/test/regress/sql/failure_ddl.sql | 432 +++++++ 3 files changed, 1680 insertions(+) create mode 100644 src/test/regress/expected/failure_ddl.out create mode 100644 src/test/regress/sql/failure_ddl.sql diff --git a/src/test/regress/expected/failure_ddl.out b/src/test/regress/expected/failure_ddl.out new file mode 100644 index 000000000..22995150e --- /dev/null +++ b/src/test/regress/expected/failure_ddl.out @@ -0,0 +1,1245 @@ +-- +-- Test DDL command propagation failures +-- Different dimensions we're testing: +-- Replication factor, 1PC-2PC, sequential-parallel modes +-- +CREATE SCHEMA ddl_failure; +SET search_path TO 'ddl_failure'; +-- we don't want to see the prepared transaction numbers in the warnings +SET client_min_messages TO ERROR; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SET citus.next_shard_id TO 100800; +-- we'll start with replication factor 1, 1PC and parallel mode +SET citus.multi_shard_commit_protocol TO '1pc'; +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 1; +CREATE TABLE test_table (key int, value int); +SELECT create_distributed_table('test_table', 'key'); + create_distributed_table +-------------------------- + +(1 row) + +-- in the first test, kill just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN 
new_column INT; +ERROR: connection error: localhost:57640 +DETAIL: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: failure on connection marked as essential: localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM 
public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- show that we've never commited the changes +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- show that we've never commited the changes +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- kill as soon as the coordinator sends COMMIT +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- since we've killed the connection just after +-- the coordinator sends the COMMIT, the command should be applied +-- to the distributed table and the shards on the other worker +-- however, there is no way to recover the failure on the shards +-- 
that live in the failed worker, since we're running 1PC +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,value}") + (localhost,57640,100803,t,"{key,value}") +(4 rows) + +-- manually drop & re-create the table for the next tests +DROP TABLE test_table; +SET citus.next_shard_id TO 100800; +SET citus.multi_shard_commit_protocol TO '1pc'; +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 1; +CREATE TABLE test_table (key int, value int); +SELECT create_distributed_table('test_table', 'key'); + create_distributed_table +-------------------------- + +(1 row) + +-- cancel as soon as the coordinator sends COMMIT +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- interrupts are held during COMMIT/ROLLBACK, so the command +-- should have been applied without any issues since cancel is ignored +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + 
(localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- the following tests rely the column not exists, so drop manually +ALTER TABLE test_table DROP COLUMN new_column; +-- but now kill just after the worker sends response to +-- COMMIT command, so we'll have lots of warnings but the command +-- should have been committed both on the distributed table and the placements +SET client_min_messages TO WARNING; +SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +WARNING: connection not open +CONTEXT: while executing command on localhost:57640 +WARNING: failed to commit critical transaction on localhost:57640, metadata is likely out of sync +WARNING: connection not open +CONTEXT: while executing command on localhost:57640 +WARNING: connection not open +CONTEXT: while executing command on localhost:57640 +WARNING: failed to commit critical transaction on localhost:57640, metadata is likely out of sync +WARNING: connection not open +CONTEXT: while executing command on localhost:57640 +WARNING: could not commit transaction for shard 100803 on any active node +WARNING: could not commit transaction for shard 100801 on any active node +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SET client_min_messages TO ERROR; +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + 
(localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- now cancel just after the worker sends response to +-- but Postgres doesn't accepts interrupts during COMMIT and ROLLBACK +-- so should not cancel at all, so not an effective test but adding in +-- case Citus messes up this behaviour +SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- the remaining tests rely on table having new_column +ALTER TABLE test_table ADD COLUMN new_column INT; +-- finally, test failing on ROLLBACK with 1PC +-- fail just after the coordinator sends the ROLLBACK +-- so the command can be rollbacked +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").kill()'); + mitmproxy +----------- + +(1 row) + +BEGIN; +SET LOCAL client_min_messages TO WARNING; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +WARNING: connection not open +CONTEXT: while executing command on localhost:57640 +WARNING: connection not open +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- now cancel just after the worker sends response to +-- but Postgres doesn't accepts interrupts during COMMIT and ROLLBACK +-- so should not cancel at all, so not an effective test but adding in +-- case Citus messes up this behaviour +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- but now kill just after the worker sends 
response to +-- ROLLBACK command, so we'll have lots of warnings but the command +-- should have been rollbacked both on the distributed table and the placements +SELECT citus.mitmproxy('conn.onCommandComplete(command="ROLLBACK").kill()'); + mitmproxy +----------- + +(1 row) + +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- now, lets test with 2PC +SET citus.multi_shard_commit_protocol TO '2pc'; +-- in the first test, kill just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: connection error: localhost:57640 +DETAIL: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. 
+SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +-- cancel just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: failure on connection marked as essential: localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +-- kill as soon as the 
coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +-- killing on PREPARE should be fine, everything should be rollbacked +SELECT citus.mitmproxy('conn.onCommandComplete(command="PREPARE TRANSACTION").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: connection not open +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where 
relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- we should be able to recover the transaction and +-- see that the command is rollbacked +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 2 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- cancelling on PREPARE should be fine, everything should be rollbacked +SELECT citus.mitmproxy('conn.onCommandComplete(command="PREPARE TRANSACTION").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + 
(localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- we should be able to recover the transaction and +-- see that the command is rollbacked +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 1 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- killing on command complete of COMMIT PREPARE, we should see that the command succeeds +-- and all the workers committed +SELECT citus.mitmproxy('conn.onCommandComplete(command="COMMIT PREPARED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +------------------------------------------ + (localhost,57637,100800,t,"{key,value}") + (localhost,57637,100802,t,"{key,value}") + (localhost,57640,100801,t,"{key,value}") + (localhost,57640,100803,t,"{key,value}") +(4 rows) + +-- we shouldn't have any prepared transactions in the workers +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 0 +(1 row) + +SELECT 
run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +------------------------------------------ + (localhost,57637,100800,t,"{key,value}") + (localhost,57637,100802,t,"{key,value}") + (localhost,57640,100801,t,"{key,value}") + (localhost,57640,100803,t,"{key,value}") +(4 rows) + +-- kill as soon as the coordinator sends COMMIT +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT PREPARED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- some of the placements would be missing the new column +-- since we've not commited the prepared transactions +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,value}") + (localhost,57640,100803,t,"{key,value}") +(4 rows) + +-- we should be able to recover the transaction and +-- see that the command is committed +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 2 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + 
(localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- finally, test failing on ROLLBACK with 2PC +-- fail just after the coordinator sends the ROLLBACK +-- so the command can be rollbacked +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").kill()'); + mitmproxy +----------- + +(1 row) + +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- ROLLBACK should have failed on the distributed table and the placements +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- but now kill just after the worker sends response to +-- ROLLBACK command, so we'll have lots of warnings but the command +-- should have been rollbacked both on the distributed table and the placements +SELECT citus.mitmproxy('conn.onCommandComplete(command="ROLLBACK").kill()'); + mitmproxy +----------- + +(1 row) + +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- make sure that the transaction is rollbacked +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 0 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT 
array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100800,t,"{key,new_column,value}") + (localhost,57637,100802,t,"{key,new_column,value}") + (localhost,57640,100801,t,"{key,new_column,value}") + (localhost,57640,100803,t,"{key,new_column,value}") +(4 rows) + +-- another set of tests with 2PC and replication factor = 2 +SET citus.multi_shard_commit_protocol TO '2pc'; +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 2; +-- re-create the table with replication factor 2 +DROP TABLE test_table; +CREATE TABLE test_table (key int, value int); +SELECT create_distributed_table('test_table', 'key'); + create_distributed_table +-------------------------- + +(1 row) + +-- in the first test, kill just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: connection error: localhost:57640 +DETAIL: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. 
+SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: failure on connection marked as essential: localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT 
citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- killing on PREPARE should be fine, everything should be rolled back +SELECT citus.mitmproxy('conn.onCommandComplete(command="PREPARE TRANSACTION").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: connection not open +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- we should be able to recover the transaction and +-- see that the command is rolled back on all workers +-- note that in this case recover_prepared_transactions() +-- sends ROLLBACK PREPARED to the workers given that +-- the transaction has not been committed on any placement yet +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 4 +(1 row) + 
+SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +------------------------------------------ + (localhost,57637,100804,t,"{key,value}") + (localhost,57637,100805,t,"{key,value}") + (localhost,57637,100806,t,"{key,value}") + (localhost,57637,100807,t,"{key,value}") + (localhost,57640,100804,t,"{key,value}") + (localhost,57640,100805,t,"{key,value}") + (localhost,57640,100806,t,"{key,value}") + (localhost,57640,100807,t,"{key,value}") +(8 rows) + +-- killing on command complete of COMMIT PREPARED, we should see that the command succeeds +-- and all the workers committed +SELECT citus.mitmproxy('conn.onCommandComplete(command="COMMIT PREPARED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------------------ + {key,new_column,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100804,t,"{key,new_column,value}") + (localhost,57637,100805,t,"{key,new_column,value}") + (localhost,57637,100806,t,"{key,new_column,value}") + (localhost,57637,100807,t,"{key,new_column,value}") + (localhost,57640,100804,t,"{key,new_column,value}") + (localhost,57640,100805,t,"{key,new_column,value}") + (localhost,57640,100806,t,"{key,new_column,value}") + (localhost,57640,100807,t,"{key,new_column,value}") +(8 rows) + +-- we shouldn't have any prepared transactions in the workers +SELECT recover_prepared_transactions(); + recover_prepared_transactions 
+------------------------------- + 0 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100804,t,"{key,new_column,value}") + (localhost,57637,100805,t,"{key,new_column,value}") + (localhost,57637,100806,t,"{key,new_column,value}") + (localhost,57637,100807,t,"{key,new_column,value}") + (localhost,57640,100804,t,"{key,new_column,value}") + (localhost,57640,100805,t,"{key,new_column,value}") + (localhost,57640,100806,t,"{key,new_column,value}") + (localhost,57640,100807,t,"{key,new_column,value}") +(8 rows) + +-- kill as soon as the coordinator sends COMMIT PREPARED +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT PREPARED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- the placements on the failed worker would still have the new column +-- since the prepared transactions there have not been committed yet +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +----------------------------------------------------- + (localhost,57637,100804,t,"{key,value}") + (localhost,57637,100805,t,"{key,value}") + (localhost,57637,100806,t,"{key,value}") + (localhost,57637,100807,t,"{key,value}") + (localhost,57640,100804,t,"{key,new_column,value}") + (localhost,57640,100805,t,"{key,new_column,value}") + (localhost,57640,100806,t,"{key,new_column,value}") + (localhost,57640,100807,t,"{key,new_column,value}") +(8 rows) + +-- we should be able to recover the 
transaction and +-- see that the command is committed +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 4 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +------------------------------------------ + (localhost,57637,100804,t,"{key,value}") + (localhost,57637,100805,t,"{key,value}") + (localhost,57637,100806,t,"{key,value}") + (localhost,57637,100807,t,"{key,value}") + (localhost,57640,100804,t,"{key,value}") + (localhost,57640,100805,t,"{key,value}") + (localhost,57640,100806,t,"{key,value}") + (localhost,57640,100807,t,"{key,value}") +(8 rows) + +-- finally, test failing on ROLLBACK with 2PC +-- fail just after the coordinator sends the ROLLBACK +-- so the command can be rolled back +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").kill()'); + mitmproxy +----------- + +(1 row) + +BEGIN; +ALTER TABLE test_table ADD COLUMN new_column INT; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- the change should not be visible on the distributed table or the placements +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +------------------------------------------ + (localhost,57637,100804,t,"{key,value}") + (localhost,57637,100805,t,"{key,value}") + (localhost,57637,100806,t,"{key,value}") + (localhost,57637,100807,t,"{key,value}") + (localhost,57640,100804,t,"{key,value}") + (localhost,57640,100805,t,"{key,value}") + (localhost,57640,100806,t,"{key,value}") + (localhost,57640,100807,t,"{key,value}") +(8 rows) + 
+-- but now kill just after the worker sends response to +-- ROLLBACK command, so we'll have lots of warnings but the command +-- should have been rolled back both on the distributed table and the placements +SELECT citus.mitmproxy('conn.onCommandComplete(command="ROLLBACK").kill()'); + mitmproxy +----------- + +(1 row) + +BEGIN; +ALTER TABLE test_table ADD COLUMN new_column INT; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- make sure that the transaction is rolled back +SELECT recover_prepared_transactions(); + recover_prepared_transactions +------------------------------- + 0 +(1 row) + +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + run_command_on_placements +------------------------------------------ + (localhost,57637,100804,t,"{key,value}") + (localhost,57637,100805,t,"{key,value}") + (localhost,57637,100806,t,"{key,value}") + (localhost,57637,100807,t,"{key,value}") + (localhost,57640,100804,t,"{key,value}") + (localhost,57640,100805,t,"{key,value}") + (localhost,57640,100806,t,"{key,value}") + (localhost,57640,100807,t,"{key,value}") +(8 rows) + +-- now do some tests with sequential mode +SET citus.multi_shard_modify_mode TO 'sequential'; +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: failure on connection marked as essential: localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN 
TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + array_agg +------------- + {key,value} +(1 row) + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command for the 2nd time +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").after(2).kill()'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. 
+CONTEXT: while executing command on localhost:57640 +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command for the 2nd time +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").after(2).cancel(' || pg_backend_pid() || ')'); + mitmproxy +----------- + +(1 row) + +ALTER TABLE test_table ADD COLUMN new_column INT; +ERROR: canceling statement due to user request +SELECT citus.mitmproxy('conn.allow()'); + mitmproxy +----------- + +(1 row) + +SET search_path TO 'public'; +DROP SCHEMA ddl_failure CASCADE; diff --git a/src/test/regress/failure_schedule b/src/test/regress/failure_schedule index 3b5e61bea..1b96e6657 100644 --- a/src/test/regress/failure_schedule +++ b/src/test/regress/failure_schedule @@ -3,3 +3,6 @@ test: failure_test_helpers # this should only be run by pg_regress_multi, you don't need it test: failure_setup +test: multi_test_helpers + +test: failure_ddl diff --git a/src/test/regress/sql/failure_ddl.sql b/src/test/regress/sql/failure_ddl.sql new file mode 100644 index 000000000..3d91db333 --- /dev/null +++ b/src/test/regress/sql/failure_ddl.sql @@ -0,0 +1,432 @@ +-- +-- Test DDL command propagation failures +-- Different dimensions we're testing: +-- Replication factor, 1PC-2PC, sequential-parallel modes +-- + + +CREATE SCHEMA ddl_failure; + +SET search_path TO 'ddl_failure'; + +-- we don't want to see the prepared transaction numbers in the warnings +SET client_min_messages TO ERROR; + +SELECT citus.mitmproxy('conn.allow()'); + +SET citus.next_shard_id TO 100800; + +-- we'll start with replication factor 1, 1PC and parallel mode +SET citus.multi_shard_commit_protocol TO '1pc'; +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 1; + +CREATE TABLE test_table (key int, value int); +SELECT create_distributed_table('test_table', 'key'); + +-- in the first test, kill just in the first +-- response we get from the worker 
+SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- show that we've never commited the changes +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE 
test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- show that we've never commited the changes +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends COMMIT +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- since we've killed the connection just after +-- the coordinator sends the COMMIT, the command should be applied +-- to the distributed table and the shards on the other worker +-- however, there is no way to recover the failure on the shards +-- that live in the failed worker, since we're running 1PC +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- manually drop & re-create the table for the next tests +DROP TABLE test_table; +SET citus.next_shard_id TO 100800; +SET citus.multi_shard_commit_protocol TO '1pc'; +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 1; + +CREATE TABLE test_table (key int, value int); +SELECT create_distributed_table('test_table', 'key'); + +-- cancel as soon as the coordinator sends COMMIT +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- interrupts are held during COMMIT/ROLLBACK, so the command +-- should have been applied without any issues since cancel is ignored +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where 
relid = '%s'::regclass;$$) ORDER BY 1; + +-- the following tests rely the column not exists, so drop manually +ALTER TABLE test_table DROP COLUMN new_column; + +-- but now kill just after the worker sends response to +-- COMMIT command, so we'll have lots of warnings but the command +-- should have been committed both on the distributed table and the placements +SET client_min_messages TO WARNING; +SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +SET client_min_messages TO ERROR; + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- now cancel just after the worker sends response to +-- but Postgres doesn't accepts interrupts during COMMIT and ROLLBACK +-- so should not cancel at all, so not an effective test but adding in +-- case Citus messes up this behaviour +SELECT citus.mitmproxy('conn.onCommandComplete(command="^COMMIT").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); + +-- the remaining tests rely on table having new_column +ALTER TABLE test_table ADD COLUMN new_column INT; + +-- finally, test failing on ROLLBACK with 1PC + +-- fail just after the coordinator sends the ROLLBACK +-- so the command can be rollbacked +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").kill()'); +BEGIN; +SET LOCAL client_min_messages TO WARNING; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +-- now cancel just after the worker sends response to +-- but Postgres doesn't accepts interrupts during COMMIT and ROLLBACK +-- so should not cancel at all, so not an effective test but adding in +-- 
case Citus messes up this behaviour +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").cancel(' || pg_backend_pid() || ')'); +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +-- but now kill just after the worker sends response to +-- ROLLBACK command, so we'll have lots of warnings but the command +-- should have been rollbacked both on the distributed table and the placements +SELECT citus.mitmproxy('conn.onCommandComplete(command="ROLLBACK").kill()'); +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- now, lets test with 2PC +SET citus.multi_shard_commit_protocol TO '2pc'; + +-- in the first test, kill just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM 
public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + + +-- killing on PREPARE should be fine, everything should be rollbacked +SELECT citus.mitmproxy('conn.onCommandComplete(command="PREPARE TRANSACTION").kill()'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- we should be able to recover the transaction and +-- see that the command is rollbacked +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where 
relid = '%s'::regclass;$$) ORDER BY 1; + + +-- cancelling on PREPARE should be fine, everything should be rollbacked +SELECT citus.mitmproxy('conn.onCommandComplete(command="PREPARE TRANSACTION").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- we should be able to recover the transaction and +-- see that the command is rollbacked +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- killing on command complete of COMMIT PREPARE, we should see that the command succeeds +-- and all the workers committed +SELECT citus.mitmproxy('conn.onCommandComplete(command="COMMIT PREPARED").kill()'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- we shouldn't have any prepared transactions in the workers +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- kill as soon as the coordinator sends COMMIT +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT PREPARED").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- some of the 
placements would be missing the new column +-- since we've not commited the prepared transactions +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- we should be able to recover the transaction and +-- see that the command is committed +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- finally, test failing on ROLLBACK with 2PC + +-- fail just after the coordinator sends the ROLLBACK +-- so the command can be rollbacked +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").kill()'); +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +-- ROLLBACK should have failed on the distributed table and the placements +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- but now kill just after the worker sends response to +-- ROLLBACK command, so we'll have lots of warnings but the command +-- should have been rollbacked both on the distributed table and the placements +SELECT citus.mitmproxy('conn.onCommandComplete(command="ROLLBACK").kill()'); +BEGIN; +ALTER TABLE test_table DROP COLUMN new_column; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +-- make sure that the transaction is rollbacked +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) 
ORDER BY 1; + + +-- another set of tests with 2PC and replication factor = 2 +SET citus.multi_shard_commit_protocol TO '2pc'; +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 2; + +-- re-create the table with replication factor 2 +DROP TABLE test_table; +CREATE TABLE test_table (key int, value int); +SELECT create_distributed_table('test_table', 'key'); + +-- in the first test, kill just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel just in the first +-- response we get from the worker +SELECT citus.mitmproxy('conn.onAuthenticationOk().cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); 
+ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- killing on PREPARE should be fine, everything should be rolled back +SELECT citus.mitmproxy('conn.onCommandComplete(command="PREPARE TRANSACTION").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- we should be able to recover the transaction and +-- see that the command is rolled back on all workers +-- note that in this case recover_prepared_transactions() +-- sends ROLLBACK PREPARED to the workers given that +-- the transaction has not been committed on any placement yet +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- killing on command complete of COMMIT PREPARED, we should see that the command succeeds +-- and all the workers committed +SELECT citus.mitmproxy('conn.onCommandComplete(command="COMMIT PREPARED").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- we shouldn't have any prepared transactions in the workers 
+SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- kill as soon as the coordinator sends COMMIT PREPARED +SELECT citus.mitmproxy('conn.onQuery(query="^COMMIT PREPARED").kill()'); +ALTER TABLE test_table DROP COLUMN new_column; +SELECT citus.mitmproxy('conn.allow()'); + +-- the placements on the failed worker would still have the new column +-- since the prepared transactions there have not been committed yet +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- we should be able to recover the transaction and +-- see that the command is committed +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- finally, test failing on ROLLBACK with 2PC + +-- fail just after the coordinator sends the ROLLBACK +-- so the command can be rolled back +SELECT citus.mitmproxy('conn.onQuery(query="ROLLBACK").kill()'); +BEGIN; +ALTER TABLE test_table ADD COLUMN new_column INT; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +-- the change should not be visible on the distributed table or the placements +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- but now kill just after the worker sends response to +-- ROLLBACK command, so we'll have lots of warnings but the command +-- should have been rolled back both on the distributed table and the 
placements +SELECT citus.mitmproxy('conn.onCommandComplete(command="ROLLBACK").kill()'); +BEGIN; +ALTER TABLE test_table ADD COLUMN new_column INT; +ROLLBACK; +SELECT citus.mitmproxy('conn.allow()'); + +-- make sure that the transaction is rolled back +SELECT recover_prepared_transactions(); +SELECT run_command_on_placements('test_table', $$SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = '%s'::regclass;$$) ORDER BY 1; + +-- now do some tests with sequential mode +SET citus.multi_shard_modify_mode TO 'sequential'; + +-- kill as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- cancel as soon as the coordinator sends begin +SELECT citus.mitmproxy('conn.onQuery(query="^BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED").cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); +SELECT array_agg(name::text ORDER BY name::text) FROM public.table_attrs where relid = 'test_table'::regclass; + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- kill as soon as the coordinator sends worker_apply_shard_ddl_command for the 2nd time +SELECT citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").after(2).kill()'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +-- cancel as soon as the coordinator sends worker_apply_shard_ddl_command for the 2nd time +SELECT 
citus.mitmproxy('conn.onQuery(query="worker_apply_shard_ddl_command").after(2).cancel(' || pg_backend_pid() || ')'); +ALTER TABLE test_table ADD COLUMN new_column INT; +SELECT citus.mitmproxy('conn.allow()'); + +SET search_path TO 'public'; +DROP SCHEMA ddl_failure CASCADE;