Fix wrong cancellation message about distributed deadlocks (#3956)

pull/3961/head
Jelte Fennema 2020-06-30 14:57:46 +02:00 committed by GitHub
parent 634d6cf9d7
commit 392c5e2c34
6 changed files with 85 additions and 5 deletions

View File

@@ -355,10 +355,12 @@ multi_log_hook(ErrorData *edata)
 {
 	/*
 	 * Show the user a meaningful error message when a backend is cancelled
-	 * by the distributed deadlock detection.
+	 * by the distributed deadlock detection. Also reset that state, since the
+	 * next cancellation of the backend might have a different reason.
 	 */
+	bool clearState = true;
 	if (edata->elevel == ERROR && edata->sqlerrcode == ERRCODE_QUERY_CANCELED &&
-		MyBackendGotCancelledDueToDeadlock())
+		MyBackendGotCancelledDueToDeadlock(clearState))
 	{
 		edata->sqlerrcode = ERRCODE_T_R_DEADLOCK_DETECTED;
 		edata->message = "canceling the transaction since it was "
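
For context: multi_log_hook is installed as PostgreSQL's emit_log_hook, which is
called with the ErrorData before an error is reported, so an extension can rewrite
the error code and message. Below is a minimal, self-contained sketch of that hook
pattern; it is not the actual Citus code, and BackendWasCancelledByDeadlockDetector()
is a hypothetical stand-in for MyBackendGotCancelledDueToDeadlock().

#include "postgres.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

/* Hypothetical stand-in for MyBackendGotCancelledDueToDeadlock(). */
extern bool BackendWasCancelledByDeadlockDetector(bool clearState);

static emit_log_hook_type prev_log_hook = NULL;

static void
example_log_hook(ErrorData *edata)
{
	/*
	 * Read and clear the flag, so a later, unrelated cancellation of this
	 * backend is not mislabelled as a deadlock.
	 */
	bool clearState = true;

	if (edata->elevel == ERROR &&
		edata->sqlerrcode == ERRCODE_QUERY_CANCELED &&
		BackendWasCancelledByDeadlockDetector(clearState))
	{
		edata->sqlerrcode = ERRCODE_T_R_DEADLOCK_DETECTED;
		edata->message = "canceling the transaction since it was "
						 "involved in a distributed deadlock";
	}

	/* Chain to any previously installed hook. */
	if (prev_log_hook != NULL)
	{
		prev_log_hook(edata);
	}
}

void
_PG_init(void)
{
	prev_log_hook = emit_log_hook;
	emit_log_hook = example_log_hook;
}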

View File

@@ -645,6 +645,7 @@ UnSetDistributedTransactionId(void)
 		MyBackendData->databaseId = 0;
 		MyBackendData->userId = 0;
+		MyBackendData->cancelledDueToDeadlock = false;
 		MyBackendData->transactionId.initiatorNodeIdentifier = 0;
 		MyBackendData->transactionId.transactionOriginator = false;
 		MyBackendData->transactionId.transactionNumber = 0;
@@ -855,9 +856,13 @@ CancelTransactionDueToDeadlock(PGPROC *proc)
  * MyBackendGotCancelledDueToDeadlock returns whether the current distributed
  * transaction was cancelled due to a deadlock. If the backend is not in a
  * distributed transaction, the function returns false.
+ * We keep some session-level state to track whether we were cancelled
+ * because of a distributed deadlock. When clearState is true, this
+ * function also resets that state, so after calling it with clearState
+ * true, a second call would always return false.
  */
 bool
-MyBackendGotCancelledDueToDeadlock(void)
+MyBackendGotCancelledDueToDeadlock(bool clearState)
 {
 	bool cancelledDueToDeadlock = false;
@@ -873,6 +878,10 @@ MyBackendGotCancelledDueToDeadlock(void)
 	{
 		cancelledDueToDeadlock = MyBackendData->cancelledDueToDeadlock;
 	}
+	if (clearState)
+	{
+		MyBackendData->cancelledDueToDeadlock = false;
+	}
 	SpinLockRelease(&MyBackendData->mutex);
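
The hunk above is a test-and-clear: the flag is read and, when clearState is true,
reset inside the same spinlock-protected critical section, so the check cannot race
with the deadlock detector setting the flag from another backend. A self-contained
sketch of the pattern, using a hypothetical ExampleBackendState struct in place of
Citus's shared BackendData:

#include "postgres.h"
#include "storage/spin.h"

/* Hypothetical stand-in for the relevant fields of Citus's BackendData. */
typedef struct ExampleBackendState
{
	slock_t mutex;               /* protects the flag below */
	bool cancelledDueToDeadlock; /* set by the deadlock detector */
} ExampleBackendState;

/*
 * Return whether the flag was set. When clearState is true, also reset it
 * within the same critical section, so a second call with clearState true
 * always returns false.
 */
static bool
GotCancelledDueToDeadlock(ExampleBackendState *state, bool clearState)
{
	bool cancelledDueToDeadlock;

	SpinLockAcquire(&state->mutex);
	cancelledDueToDeadlock = state->cancelledDueToDeadlock;
	if (clearState)
	{
		state->cancelledDueToDeadlock = false;
	}
	SpinLockRelease(&state->mutex);

	return cancelledDueToDeadlock;
}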

View File

@@ -65,7 +65,7 @@ extern void AssignDistributedTransactionId(void);
 extern void MarkCitusInitiatedCoordinatorBackend(void);
 extern void GetBackendDataForProc(PGPROC *proc, BackendData *result);
 extern void CancelTransactionDueToDeadlock(PGPROC *proc);
-extern bool MyBackendGotCancelledDueToDeadlock(void);
+extern bool MyBackendGotCancelledDueToDeadlock(bool clearState);
 extern List * ActiveDistributedTransactionNumbers(void);
 LocalTransactionId GetMyProcLocalTransactionId(void);

View File

@@ -0,0 +1,39 @@
+--- Regression test for this issue:
+--- https://github.com/citusdata/citus/issues/3954
+SET citus.shard_count = 4;
+SET citus.shard_replication_factor = 1;
+SET citus.next_shard_id TO 4954000;
+CREATE SCHEMA wrong_cancel_message;
+SET search_path TO wrong_cancel_message;
+SET citus.force_max_query_parallelization TO ON;
+CREATE TABLE t_lock(id int);
+SELECT create_distributed_table('t_lock','id');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+INSERT INTO t_lock VALUES (1), (2), (3), (4);
+BEGIN;
+SELECT id, pg_advisory_lock(15) FROM t_lock;
+ERROR: canceling the transaction since it was involved in a distributed deadlock
+-- ERROR: canceling the transaction since it was involved in a distributed deadlock
+-- This one is expected, with force_max_query_parallelization the connections will deadlock themselves
+ROLLBACK;
+CREATE TABLE t_unrelated(a int);
+SELECT create_distributed_table('t_unrelated', 'a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+BEGIN;
+SET LOCAL statement_timeout = '1ms';
+-- Ignore WARNING about non closed temporary file
+SET LOCAL client_min_messages to ERROR;
+INSERT INTO t_unrelated SELECT i FROM generate_series(1, 10) i;
+ERROR: canceling statement due to statement timeout
+ROLLBACK;
+\set VERBOSITY terse
+DROP SCHEMA wrong_cancel_message CASCADE;
+NOTICE: drop cascades to 2 other objects

View File

@@ -90,7 +90,7 @@ test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquer
 test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql
 test: sql_procedure multi_function_in_join row_types materialized_view
 test: multi_subquery_in_where_reference_clause full_join adaptive_executor propagate_set_commands
-test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc
+test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc statement_cancel_error_message
 test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql set_role_in_transaction
 test: multi_reference_table multi_select_for_update relation_access_tracking
 test: custom_aggregate_support aggregate_support tdigest_aggregate_support

View File

@@ -0,0 +1,30 @@
+--- Regression test for this issue:
+--- https://github.com/citusdata/citus/issues/3954
+SET citus.shard_count = 4;
+SET citus.shard_replication_factor = 1;
+SET citus.next_shard_id TO 4954000;
+CREATE SCHEMA wrong_cancel_message;
+SET search_path TO wrong_cancel_message;
+SET citus.force_max_query_parallelization TO ON;
+CREATE TABLE t_lock(id int);
+SELECT create_distributed_table('t_lock','id');
+INSERT INTO t_lock VALUES (1), (2), (3), (4);
+BEGIN;
+SELECT id, pg_advisory_lock(15) FROM t_lock;
+-- ERROR: canceling the transaction since it was involved in a distributed deadlock
+-- This one is expected, with force_max_query_parallelization the connections will deadlock themselves
+ROLLBACK;
+CREATE TABLE t_unrelated(a int);
+SELECT create_distributed_table('t_unrelated', 'a');
+BEGIN;
+SET LOCAL statement_timeout = '1ms';
+-- Ignore WARNING about non closed temporary file
+SET LOCAL client_min_messages to ERROR;
+INSERT INTO t_unrelated SELECT i FROM generate_series(1, 10) i;
+ROLLBACK;
+\set VERBOSITY terse
+DROP SCHEMA wrong_cancel_message CASCADE;