Detect deadlocks in replicate_reference_tables()

pull/3764/head
Hadi Moshayedi 2020-04-15 10:58:23 -07:00
parent df9048ebaa
commit 59b9a4e5a1
5 changed files with 74 additions and 10 deletions

View File

@ -135,8 +135,17 @@ UseCoordinatedTransaction(void)
} }
CurrentCoordinatedTransactionState = COORD_TRANS_STARTED; CurrentCoordinatedTransactionState = COORD_TRANS_STARTED;
/*
* If assign_distributed_transaction_id() has been called, we should reuse
* that identifier so distributed deadlock detection works properly.
*/
DistributedTransactionId *transactionId = GetCurrentDistributedTransactionId();
if (transactionId->transactionNumber == 0)
{
AssignDistributedTransactionId(); AssignDistributedTransactionId();
} }
}
/* /*

View File

@ -210,11 +210,15 @@ EnsureReferenceTablesExistOnAllNodes(void)
if (PQstatus(connection->pgConn) == CONNECTION_OK) if (PQstatus(connection->pgConn) == CONNECTION_OK)
{ {
UseCoordinatedTransaction();
RemoteTransactionBegin(connection);
StringInfo placementCopyCommand = StringInfo placementCopyCommand =
CopyShardPlacementToWorkerNodeQuery(sourceShardPlacement, CopyShardPlacementToWorkerNodeQuery(sourceShardPlacement,
newWorkerNode, newWorkerNode,
TRANSFER_MODE_AUTOMATIC); TRANSFER_MODE_AUTOMATIC);
ExecuteCriticalRemoteCommand(connection, placementCopyCommand->data); ExecuteCriticalRemoteCommand(connection, placementCopyCommand->data);
RemoteTransactionCommit(connection);
} }
else else
{ {

View File

@ -28,11 +28,11 @@ step detector-dump-wait-edges:
waiting_transaction_numblocking_transaction_numblocking_transaction_waiting waiting_transaction_numblocking_transaction_numblocking_transaction_waiting
392 391 f 390 389 f
transactionnumberwaitingtransactionnumbers transactionnumberwaitingtransactionnumbers
391 389
392 391 390 389
step s1-abort: step s1-abort:
ABORT; ABORT;
@ -75,14 +75,14 @@ step detector-dump-wait-edges:
waiting_transaction_numblocking_transaction_numblocking_transaction_waiting waiting_transaction_numblocking_transaction_numblocking_transaction_waiting
396 395 f 394 393 f
397 395 f 395 393 f
397 396 t 395 394 t
transactionnumberwaitingtransactionnumbers transactionnumberwaitingtransactionnumbers
395 393
396 395 394 393
397 395,396 395 393,394
step s1-abort: step s1-abort:
ABORT; ABORT;

View File

@ -1014,6 +1014,38 @@ SELECT stop_metadata_sync_to_node('localhost', :worker_1_port);
(1 row) (1 row)
--
-- The following case used to get stuck on create_distributed_table() instead
-- of detecting the distributed deadlock.
--
SET citus.replicate_reference_tables_on_activate TO off;
SET citus.shard_replication_factor TO 1;
select master_remove_node('localhost', :worker_2_port);
master_remove_node
---------------------------------------------------------------------
(1 row)
CREATE TABLE ref (a int primary key, b int);
SELECT create_reference_table('ref');
create_reference_table
---------------------------------------------------------------------
(1 row)
CREATE TABLE test (x int, y int references ref(a));
select 1 FROM master_add_node('localhost', :worker_2_port);
?column?
---------------------------------------------------------------------
1
(1 row)
BEGIN;
DROP TABLE test;
CREATE TABLE test (x int, y int references ref(a));
SELECT create_distributed_table('test','x');
ERROR: canceling the transaction since it was involved in a distributed deadlock
END;
-- test adding an invalid node while we have reference tables to replicate -- test adding an invalid node while we have reference tables to replicate
-- set client message level to ERROR and verbosity to terse to suppress -- set client message level to ERROR and verbosity to terse to suppress
-- OS-dependent host name resolution warnings -- OS-dependent host name resolution warnings

View File

@ -634,6 +634,25 @@ SET search_path TO replicate_reference_table;
SELECT stop_metadata_sync_to_node('localhost', :worker_1_port); SELECT stop_metadata_sync_to_node('localhost', :worker_1_port);
--
-- The following case used to get stuck on create_distributed_table() instead
-- of detecting the distributed deadlock.
--
SET citus.replicate_reference_tables_on_activate TO off;
SET citus.shard_replication_factor TO 1;
select master_remove_node('localhost', :worker_2_port);
CREATE TABLE ref (a int primary key, b int);
SELECT create_reference_table('ref');
CREATE TABLE test (x int, y int references ref(a));
select 1 FROM master_add_node('localhost', :worker_2_port);
BEGIN;
DROP TABLE test;
CREATE TABLE test (x int, y int references ref(a));
SELECT create_distributed_table('test','x');
END;
-- test adding an invalid node while we have reference tables to replicate -- test adding an invalid node while we have reference tables to replicate
-- set client message level to ERROR and verbosity to terse to suppress -- set client message level to ERROR and verbosity to terse to suppress
-- OS-dependent host name resolution warnings -- OS-dependent host name resolution warnings