Fix wrong cancellation message about distributed deadlocks (#3956)

pull/3961/head
Jelte Fennema 2020-06-30 14:57:46 +02:00 committed by GitHub
parent 634d6cf9d7
commit 392c5e2c34
6 changed files with 85 additions and 5 deletions

View File

@@ -355,10 +355,12 @@ multi_log_hook(ErrorData *edata)
 {
 	/*
 	 * Show the user a meaningful error message when a backend is cancelled
-	 * by the distributed deadlock detection.
+	 * by the distributed deadlock detection. Also reset that state, since the
+	 * next cancellation of the backend might have a different reason.
 	 */
+	bool clearState = true;
 	if (edata->elevel == ERROR && edata->sqlerrcode == ERRCODE_QUERY_CANCELED &&
-		MyBackendGotCancelledDueToDeadlock())
+		MyBackendGotCancelledDueToDeadlock(clearState))
 	{
 		edata->sqlerrcode = ERRCODE_T_R_DEADLOCK_DETECTED;
 		edata->message = "canceling the transaction since it was "
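
For context: multi_log_hook is installed as PostgreSQL's emit_log_hook, which is
called with the ErrorData before an error is reported, so an extension can rewrite
the error code and message. Below is a minimal, self-contained sketch of that hook
pattern; it is not the actual Citus code, and BackendWasCancelledByDeadlockDetector()
is a hypothetical stand-in for MyBackendGotCancelledDueToDeadlock().

#include "postgres.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

/* Hypothetical stand-in for MyBackendGotCancelledDueToDeadlock(). */
extern bool BackendWasCancelledByDeadlockDetector(bool clearState);

static emit_log_hook_type prev_log_hook = NULL;

static void
example_log_hook(ErrorData *edata)
{
	/*
	 * Read and clear the flag, so a later, unrelated cancellation of this
	 * backend is not mislabelled as a deadlock.
	 */
	bool clearState = true;

	if (edata->elevel == ERROR &&
		edata->sqlerrcode == ERRCODE_QUERY_CANCELED &&
		BackendWasCancelledByDeadlockDetector(clearState))
	{
		edata->sqlerrcode = ERRCODE_T_R_DEADLOCK_DETECTED;
		edata->message = "canceling the transaction since it was "
						 "involved in a distributed deadlock";
	}

	/* Chain to any previously installed hook. */
	if (prev_log_hook != NULL)
	{
		prev_log_hook(edata);
	}
}

void
_PG_init(void)
{
	prev_log_hook = emit_log_hook;
	emit_log_hook = example_log_hook;
}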

View File

@@ -645,6 +645,7 @@ UnSetDistributedTransactionId(void)
 		MyBackendData->databaseId = 0;
 		MyBackendData->userId = 0;
+		MyBackendData->cancelledDueToDeadlock = false;
 		MyBackendData->transactionId.initiatorNodeIdentifier = 0;
 		MyBackendData->transactionId.transactionOriginator = false;
 		MyBackendData->transactionId.transactionNumber = 0;
@@ -855,9 +856,13 @@ CancelTransactionDueToDeadlock(PGPROC *proc)
  * MyBackendGotCancelledDueToDeadlock returns whether the current distributed
  * transaction was cancelled due to a deadlock. If the backend is not in a
  * distributed transaction, the function returns false.
+ * We keep some session-level state to track whether we were cancelled
+ * because of a distributed deadlock. When clearState is true, this
+ * function also resets that state, so after calling it with clearState
+ * true, a second call would always return false.
  */
 bool
-MyBackendGotCancelledDueToDeadlock(void)
+MyBackendGotCancelledDueToDeadlock(bool clearState)
 {
 	bool cancelledDueToDeadlock = false;
@@ -873,6 +878,10 @@ MyBackendGotCancelledDueToDeadlock(void)
 	{
 		cancelledDueToDeadlock = MyBackendData->cancelledDueToDeadlock;
 	}
+	if (clearState)
+	{
+		MyBackendData->cancelledDueToDeadlock = false;
+	}
 	SpinLockRelease(&MyBackendData->mutex);
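
The hunk above is a test-and-clear: the flag is read and, when clearState is true,
reset inside the same spinlock-protected critical section, so the check cannot race
with the deadlock detector setting the flag from another backend. A self-contained
sketch of the pattern, using a hypothetical ExampleBackendState struct in place of
Citus's shared BackendData:

#include "postgres.h"
#include "storage/spin.h"

/* Hypothetical stand-in for the relevant fields of Citus's BackendData. */
typedef struct ExampleBackendState
{
	slock_t mutex;               /* protects the flag below */
	bool cancelledDueToDeadlock; /* set by the deadlock detector */
} ExampleBackendState;

/*
 * Return whether the flag was set. When clearState is true, also reset it
 * within the same critical section, so a second call with clearState true
 * always returns false.
 */
static bool
GotCancelledDueToDeadlock(ExampleBackendState *state, bool clearState)
{
	bool cancelledDueToDeadlock;

	SpinLockAcquire(&state->mutex);
	cancelledDueToDeadlock = state->cancelledDueToDeadlock;
	if (clearState)
	{
		state->cancelledDueToDeadlock = false;
	}
	SpinLockRelease(&state->mutex);

	return cancelledDueToDeadlock;
}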

View File

@@ -65,7 +65,7 @@ extern void AssignDistributedTransactionId(void);
 extern void MarkCitusInitiatedCoordinatorBackend(void);
 extern void GetBackendDataForProc(PGPROC *proc, BackendData *result);
 extern void CancelTransactionDueToDeadlock(PGPROC *proc);
-extern bool MyBackendGotCancelledDueToDeadlock(void);
+extern bool MyBackendGotCancelledDueToDeadlock(bool clearState);
 extern List * ActiveDistributedTransactionNumbers(void);
 LocalTransactionId GetMyProcLocalTransactionId(void);

View File

@@ -0,0 +1,39 @@
+--- Regression test for this issue:
+--- https://github.com/citusdata/citus/issues/3954
+SET citus.shard_count = 4;
+SET citus.shard_replication_factor = 1;
+SET citus.next_shard_id TO 4954000;
+CREATE SCHEMA wrong_cancel_message;
+SET search_path TO wrong_cancel_message;
+SET citus.force_max_query_parallelization TO ON;
+CREATE TABLE t_lock(id int);
+SELECT create_distributed_table('t_lock','id');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+INSERT INTO t_lock VALUES (1), (2), (3), (4);
+BEGIN;
+SELECT id, pg_advisory_lock(15) FROM t_lock;
+ERROR: canceling the transaction since it was involved in a distributed deadlock
+-- ERROR: canceling the transaction since it was involved in a distributed deadlock
+-- This one is expected, with force_max_query_parallelization the connections will deadlock themselves
+ROLLBACK;
+CREATE TABLE t_unrelated(a int);
+SELECT create_distributed_table('t_unrelated', 'a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+BEGIN;
+SET LOCAL statement_timeout = '1ms';
+-- Ignore WARNING about non closed temporary file
+SET LOCAL client_min_messages to ERROR;
+INSERT INTO t_unrelated SELECT i FROM generate_series(1, 10) i;
+ERROR: canceling statement due to statement timeout
+ROLLBACK;
+\set VERBOSITY terse
+DROP SCHEMA wrong_cancel_message CASCADE;
+NOTICE: drop cascades to 2 other objects

View File

@@ -90,7 +90,7 @@ test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquer
 test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql
 test: sql_procedure multi_function_in_join row_types materialized_view
 test: multi_subquery_in_where_reference_clause full_join adaptive_executor propagate_set_commands
-test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc
+test: multi_subquery_union multi_subquery_in_where_clause multi_subquery_misc statement_cancel_error_message
 test: multi_agg_distinct multi_agg_approximate_distinct multi_limit_clause_approximate multi_outer_join_reference multi_single_relation_subquery multi_prepare_plsql set_role_in_transaction
 test: multi_reference_table multi_select_for_update relation_access_tracking
 test: custom_aggregate_support aggregate_support tdigest_aggregate_support

View File

@@ -0,0 +1,30 @@
+--- Regression test for this issue:
+--- https://github.com/citusdata/citus/issues/3954
+SET citus.shard_count = 4;
+SET citus.shard_replication_factor = 1;
+SET citus.next_shard_id TO 4954000;
+CREATE SCHEMA wrong_cancel_message;
+SET search_path TO wrong_cancel_message;
+SET citus.force_max_query_parallelization TO ON;
+CREATE TABLE t_lock(id int);
+SELECT create_distributed_table('t_lock','id');
+INSERT INTO t_lock VALUES (1), (2), (3), (4);
+BEGIN;
+SELECT id, pg_advisory_lock(15) FROM t_lock;
+-- ERROR: canceling the transaction since it was involved in a distributed deadlock
+-- This one is expected, with force_max_query_parallelization the connections will deadlock themselves
+ROLLBACK;
+CREATE TABLE t_unrelated(a int);
+SELECT create_distributed_table('t_unrelated', 'a');
+BEGIN;
+SET LOCAL statement_timeout = '1ms';
+-- Ignore WARNING about non closed temporary file
+SET LOCAL client_min_messages to ERROR;
+INSERT INTO t_unrelated SELECT i FROM generate_series(1, 10) i;
+ROLLBACK;
+\set VERBOSITY terse
+DROP SCHEMA wrong_cancel_message CASCADE;