From 392c5e2c34c7c79c51ebada82fd9c444700de322 Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Tue, 30 Jun 2020 14:57:46 +0200 Subject: [PATCH] Fix wrong cancellation message about distributed deadlocks (#3956) --- src/backend/distributed/shared_library_init.c | 6 ++- .../distributed/transaction/backend_data.c | 11 +++++- src/include/distributed/backend_data.h | 2 +- .../statement_cancel_error_message.out | 39 +++++++++++++++++++ src/test/regress/multi_schedule | 2 +- .../sql/statement_cancel_error_message.sql | 30 ++++++++++++++ 6 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 src/test/regress/expected/statement_cancel_error_message.out create mode 100644 src/test/regress/sql/statement_cancel_error_message.sql diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index d1853d676..e4cd7b789 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -355,10 +355,12 @@ multi_log_hook(ErrorData *edata) { /* * Show the user a meaningful error message when a backend is cancelled - * by the distributed deadlock detection. + * by the distributed deadlock detection. Also reset the state for this, + * since the next cancellation of the backend might have another reason. 
*/ + bool clearState = true; if (edata->elevel == ERROR && edata->sqlerrcode == ERRCODE_QUERY_CANCELED && - MyBackendGotCancelledDueToDeadlock()) + MyBackendGotCancelledDueToDeadlock(clearState)) { edata->sqlerrcode = ERRCODE_T_R_DEADLOCK_DETECTED; edata->message = "canceling the transaction since it was " diff --git a/src/backend/distributed/transaction/backend_data.c b/src/backend/distributed/transaction/backend_data.c index afb6bfe5c..f49449251 100644 --- a/src/backend/distributed/transaction/backend_data.c +++ b/src/backend/distributed/transaction/backend_data.c @@ -645,6 +645,7 @@ UnSetDistributedTransactionId(void) MyBackendData->databaseId = 0; MyBackendData->userId = 0; + MyBackendData->cancelledDueToDeadlock = false; MyBackendData->transactionId.initiatorNodeIdentifier = 0; MyBackendData->transactionId.transactionOriginator = false; MyBackendData->transactionId.transactionNumber = 0; @@ -855,9 +856,13 @@ CancelTransactionDueToDeadlock(PGPROC *proc) * MyBackendGotCancelledDueToDeadlock returns whether the current distributed * transaction was cancelled due to a deadlock. If the backend is not in a * distributed transaction, the function returns false. + * We keep some session level state to keep track of whether we were cancelled + * because of a distributed deadlock. When clearState is true, this function + * also resets that state. So after calling this function with clearState true, + * a second call would always return false. 
*/ bool -MyBackendGotCancelledDueToDeadlock(void) +MyBackendGotCancelledDueToDeadlock(bool clearState) { bool cancelledDueToDeadlock = false; @@ -873,6 +878,10 @@ MyBackendGotCancelledDueToDeadlock(void) { cancelledDueToDeadlock = MyBackendData->cancelledDueToDeadlock; } + if (clearState) + { + MyBackendData->cancelledDueToDeadlock = false; + } SpinLockRelease(&MyBackendData->mutex); diff --git a/src/include/distributed/backend_data.h b/src/include/distributed/backend_data.h index 19cff6eb4..ff2f926f7 100644 --- a/src/include/distributed/backend_data.h +++ b/src/include/distributed/backend_data.h @@ -65,7 +65,7 @@ extern void AssignDistributedTransactionId(void); extern void MarkCitusInitiatedCoordinatorBackend(void); extern void GetBackendDataForProc(PGPROC *proc, BackendData *result); extern void CancelTransactionDueToDeadlock(PGPROC *proc); -extern bool MyBackendGotCancelledDueToDeadlock(void); +extern bool MyBackendGotCancelledDueToDeadlock(bool clearState); extern List * ActiveDistributedTransactionNumbers(void); LocalTransactionId GetMyProcLocalTransactionId(void); diff --git a/src/test/regress/expected/statement_cancel_error_message.out b/src/test/regress/expected/statement_cancel_error_message.out new file mode 100644 index 000000000..cb75bcb0c --- /dev/null +++ b/src/test/regress/expected/statement_cancel_error_message.out @@ -0,0 +1,39 @@ +--- Regression test for this issue: +--- https://github.com/citusdata/citus/issues/3954 +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 1; +SET citus.next_shard_id TO 4954000; +CREATE SCHEMA wrong_cancel_message; +SET search_path TO wrong_cancel_message; +SET citus.force_max_query_parallelization TO ON; +CREATE TABLE t_lock(id int); +SELECT create_distributed_table('t_lock','id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO t_lock VALUES (1), (2), (3), (4); +BEGIN; +SELECT id, pg_advisory_lock(15) FROM t_lock; +ERROR: 
canceling the transaction since it was involved in a distributed deadlock +-- ERROR: canceling the transaction since it was involved in a distributed deadlock +-- This one is expected, with force_max_query_parallelization the connections will deadlock themselves +ROLLBACK; +CREATE TABLE t_unrelated(a int); +SELECT create_distributed_table('t_unrelated', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +BEGIN; +SET LOCAL statement_timeout = '1ms'; +-- Ignore WARNING about non closed temporary file +SET LOCAL client_min_messages to ERROR; +INSERT INTO t_unrelated SELECT i FROM generate_series(1, 10) i; +ERROR: canceling statement due to statement timeout +ROLLBACK; +\set VERBOSITY terse +DROP SCHEMA wrong_cancel_message CASCADE; +NOTICE: drop cascades to 2 other objects diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 915a8f7a2..d9dc2c4b2 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -90,7 +90,7 @@ test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquer test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql test: sql_procedure multi_function_in_join row_types materialized_view test: multi_subquery_in_where_reference_clause full_join adaptive_executor propagate_set_commands -test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc +test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc statement_cancel_error_message test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql set_role_in_transaction test: multi_reference_table multi_select_for_update relation_access_tracking test: custom_aggregate_support aggregate_support tdigest_aggregate_support diff --git 
a/src/test/regress/sql/statement_cancel_error_message.sql b/src/test/regress/sql/statement_cancel_error_message.sql new file mode 100644 index 000000000..aa0b778b3 --- /dev/null +++ b/src/test/regress/sql/statement_cancel_error_message.sql @@ -0,0 +1,30 @@ +--- Regression test for this issue: +--- https://github.com/citusdata/citus/issues/3954 +SET citus.shard_count = 4; +SET citus.shard_replication_factor = 1; +SET citus.next_shard_id TO 4954000; +CREATE SCHEMA wrong_cancel_message; +SET search_path TO wrong_cancel_message; + +SET citus.force_max_query_parallelization TO ON; +CREATE TABLE t_lock(id int); +SELECT create_distributed_table('t_lock','id'); +INSERT INTO t_lock VALUES (1), (2), (3), (4); +BEGIN; +SELECT id, pg_advisory_lock(15) FROM t_lock; +-- ERROR: canceling the transaction since it was involved in a distributed deadlock +-- This one is expected, with force_max_query_parallelization the connections will deadlock themselves +ROLLBACK; + +CREATE TABLE t_unrelated(a int); +SELECT create_distributed_table('t_unrelated', 'a'); + +BEGIN; +SET LOCAL statement_timeout = '1ms'; +-- Ignore WARNING about non closed temporary file +SET LOCAL client_min_messages to ERROR; +INSERT INTO t_unrelated SELECT i FROM generate_series(1, 10) i; +ROLLBACK; + +\set VERBOSITY terse +DROP SCHEMA wrong_cancel_message CASCADE;