From 02fa942be15f68489f9d834ba58a4dce47ccc391 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Tue, 30 Jun 2020 11:31:46 +0200 Subject: [PATCH] Fix assertion error when rolling back to savepoint (#3868) It was possible to get an assertion error, if a DML command was cancelled that opened a connection and then "ROLLBACK TO SAVEPOINT" was used to continue the transaction. The reason for this was that canceling the transaction might leave the `claimedExclusively` flag on for (some of) it's connections. This caused an assertion failure because `CanUseExistingConnection` would return false and a new connection would be opened, and then there would be two connections doing DML for the same placement. Which is disallowed. That this situation caused an assertion failure instead of an error, means that without asserts this could possibly result in some visibility bugs, similar to the ones described https://github.com/citusdata/citus/issues/3867 --- .../transaction/remote_transaction.c | 16 +++++++++++ .../transaction/transaction_management.c | 4 +-- .../expected/rollback_to_savepoint.out | 27 +++++++++++++++++++ src/test/regress/multi_schedule | 4 +-- src/test/regress/multi_schedule_hyperscale | 1 + .../regress/sql/rollback_to_savepoint.sql | 23 ++++++++++++++++ 6 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 src/test/regress/expected/rollback_to_savepoint.out create mode 100644 src/test/regress/sql/rollback_to_savepoint.sql diff --git a/src/backend/distributed/transaction/remote_transaction.c b/src/backend/distributed/transaction/remote_transaction.c index 2ffe1c45c..4dcf3957d 100644 --- a/src/backend/distributed/transaction/remote_transaction.c +++ b/src/backend/distributed/transaction/remote_transaction.c @@ -1119,6 +1119,22 @@ CoordinatedRemoteTransactionsSavepointRollback(SubTransactionId subId) } FinishRemoteTransactionSavepointRollback(connection, subId); + + /* + * We unclaim the connection now so it can be used again when + * continuing after the ROLLBACK TO SAVEPOINT. + * XXX: We do not undo our hadDML/hadDDL flags. This could result in + * some queries not being allowed on Citus that would actually be fine + * to execute. Changing this would require us to keep track for each + * savepoint which placement connections had DDL/DML executed at that + * point and if they were already. We also do not call + * ResetShardPlacementAssociation. This might result in suboptimal + * parallelism, because of placement associations that are not really + * necessary anymore because of ROLLBACK TO SAVEPOINT. To change this + * we would need to keep track of when a connection becomes associated + * to a placement. + */ + UnclaimConnection(connection); } } diff --git a/src/backend/distributed/transaction/transaction_management.c b/src/backend/distributed/transaction/transaction_management.c index 67fa8f30d..cfd35fbe6 100644 --- a/src/backend/distributed/transaction/transaction_management.c +++ b/src/backend/distributed/transaction/transaction_management.c @@ -488,8 +488,8 @@ ResetShardPlacementTransactionState(void) /* - * Subtransaction callback - currently only used to remember whether a - * savepoint has been rolled back, as we don't support that. + * CoordinatedSubTransactionCallback is the callback used to implement + * distributed ROLLBACK TO SAVEPOINT. */ static void CoordinatedSubTransactionCallback(SubXactEvent event, SubTransactionId subId, diff --git a/src/test/regress/expected/rollback_to_savepoint.out b/src/test/regress/expected/rollback_to_savepoint.out new file mode 100644 index 000000000..b5b631463 --- /dev/null +++ b/src/test/regress/expected/rollback_to_savepoint.out @@ -0,0 +1,27 @@ +-- Regression test for this issue: +-- https://github.com/citusdata/citus/issues/3622 +SET citus.shard_count = 4; +SET citus.next_shard_id TO 1954000; +CREATE SCHEMA rollback_to_savepoint; +SET search_path TO rollback_to_savepoint; +CREATE TABLE t(a int); +SELECT create_distributed_table('t', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- This timeout is chosen such that the INSERT with +-- generate_series(1, 100000000) is cancelled at the right time to trigger the +-- bug +SET statement_timeout = '2s'; +BEGIN; +INSERT INTO t VALUES (4); +SAVEPOINT s1; +INSERT INTO t SELECT i FROM generate_series(1, 10000000) i; +ERROR: canceling statement due to statement timeout +ROLLBACK TO SAVEPOINT s1; +INSERT INTO t SELECT i FROM generate_series(1, 100) i; +ROLLBACK; +DROP SCHEMA rollback_to_savepoint CASCADE; +NOTICE: drop cascades to table t diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 99d1e9daf..915a8f7a2 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -84,8 +84,8 @@ test: subquery_prepared_statements pg12 cte_inline # ---------- # Miscellaneous tests to check our query planning behavior # ---------- -test: multi_deparse_shard_query multi_distributed_transaction_id multi_real_time_transaction intermediate_results limit_intermediate_size -test: multi_explain hyperscale_tutorial partitioned_intermediate_results distributed_intermediate_results +test: multi_deparse_shard_query multi_distributed_transaction_id intermediate_results limit_intermediate_size rollback_to_savepoint +test: multi_explain hyperscale_tutorial partitioned_intermediate_results distributed_intermediate_results multi_real_time_transaction test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql test: sql_procedure multi_function_in_join row_types materialized_view diff --git a/src/test/regress/multi_schedule_hyperscale b/src/test/regress/multi_schedule_hyperscale index 3898fbbbc..46f7ec041 100644 --- a/src/test/regress/multi_schedule_hyperscale +++ b/src/test/regress/multi_schedule_hyperscale @@ -64,6 +64,7 @@ test: multi_basic_queries multi_complex_expressions multi_subquery_complex_queri test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_sql_function test: multi_function_in_join row_types test: multi_subquery_in_where_reference_clause full_join adaptive_executor propagate_set_commands +test: rollback_to_savepoint test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc test: multi_limit_clause_approximate multi_single_relation_subquery set_role_in_transaction test: multi_select_for_update diff --git a/src/test/regress/sql/rollback_to_savepoint.sql b/src/test/regress/sql/rollback_to_savepoint.sql new file mode 100644 index 000000000..48ef29a5c --- /dev/null +++ b/src/test/regress/sql/rollback_to_savepoint.sql @@ -0,0 +1,23 @@ +-- Regression test for this issue: +-- https://github.com/citusdata/citus/issues/3622 +SET citus.shard_count = 4; +SET citus.next_shard_id TO 1954000; +CREATE SCHEMA rollback_to_savepoint; +SET search_path TO rollback_to_savepoint; + +CREATE TABLE t(a int); +SELECT create_distributed_table('t', 'a'); + +-- This timeout is chosen such that the INSERT with +-- generate_series(1, 100000000) is cancelled at the right time to trigger the +-- bug +SET statement_timeout = '2s'; +BEGIN; +INSERT INTO t VALUES (4); +SAVEPOINT s1; +INSERT INTO t SELECT i FROM generate_series(1, 10000000) i; +ROLLBACK TO SAVEPOINT s1; +INSERT INTO t SELECT i FROM generate_series(1, 100) i; +ROLLBACK; + +DROP SCHEMA rollback_to_savepoint CASCADE;