From 6be1bacddd4cafcbc4edc9125a26cac549fc519a Mon Sep 17 00:00:00 2001 From: Hadi Moshayedi Date: Fri, 16 Aug 2019 11:02:01 -0700 Subject: [PATCH] Fix distributed deadlock for TRUNCATE --- src/backend/distributed/commands/truncate.c | 23 ----------- .../distributed/master/master_repair_shards.c | 4 ++ .../utils/multi_partitioning_utils.c | 39 ++++++++++++++++++- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/backend/distributed/commands/truncate.c b/src/backend/distributed/commands/truncate.c index a1a2d1c5f..cca0f6a8b 100644 --- a/src/backend/distributed/commands/truncate.c +++ b/src/backend/distributed/commands/truncate.c @@ -48,32 +48,9 @@ static void AcquireDistributedLockOnRelations(List *relationIdList, LOCKMODE loc void ProcessTruncateStatement(TruncateStmt *truncateStatement) { - ListCell *rangeVarCell = NULL; - ListCell *relationCell = NULL; - List *lockedRelations = NIL; - - /* - * Lock relations while calling next three functions, because they expect - * relations to be locked. We release them then to avoid distributed - * deadlock in MX. - */ - foreach(rangeVarCell, truncateStatement->relations) - { - RangeVar *rangeVar = (RangeVar *) lfirst(rangeVarCell); - Relation relation = heap_openrv(rangeVar, AccessShareLock); - lockedRelations = lappend(lockedRelations, relation); - } - ErrorIfUnsupportedTruncateStmt(truncateStatement); EnsurePartitionTableNotReplicatedForTruncate(truncateStatement); ExecuteTruncateStmtSequentialIfNecessary(truncateStatement); - - foreach(relationCell, lockedRelations) - { - Relation relation = (Relation) lfirst(relationCell); - heap_close(relation, AccessShareLock); - } - LockTruncatedRelationMetadataInWorkers(truncateStatement); } diff --git a/src/backend/distributed/master/master_repair_shards.c b/src/backend/distributed/master/master_repair_shards.c index 11005fc21..444d1dc94 100644 --- a/src/backend/distributed/master/master_repair_shards.c +++ b/src/backend/distributed/master/master_repair_shards.c @@ -34,6 +34,7 @@ #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "storage/lock.h" +#include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/elog.h" #include "utils/errcodes.h" @@ -231,6 +232,9 @@ RepairShardPlacement(int64 shardId, char *sourceNodeName, int32 sourceNodePort, List *placementList = NIL; ShardPlacement *placement = NULL; + /* prevent table from being dropped */ + LockRelationOid(distributedTableId, AccessShareLock); + EnsureTableOwner(distributedTableId); /* diff --git a/src/backend/distributed/utils/multi_partitioning_utils.c b/src/backend/distributed/utils/multi_partitioning_utils.c index f5a2124a9..b505e9f0f 100644 --- a/src/backend/distributed/utils/multi_partitioning_utils.c +++ b/src/backend/distributed/utils/multi_partitioning_utils.c @@ -24,6 +24,7 @@ #include "distributed/shardinterval_utils.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" +#include "pgstat.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" @@ -32,6 +33,7 @@ static char * PartitionBound(Oid partitionId); +static Relation try_relation_open_nolock(Oid relationId); /* @@ -69,7 +71,7 @@ PartitionedTable(Oid relationId) bool PartitionedTableNoLock(Oid relationId) { - Relation rel = try_relation_open(relationId, NoLock); + Relation rel = try_relation_open_nolock(relationId); bool partitionedTable = false; /* don't error out for tables that are dropped */ @@ -122,7 +124,7 @@ PartitionTable(Oid relationId) bool PartitionTableNoLock(Oid relationId) { - Relation rel = try_relation_open(relationId, NoLock); + Relation rel = try_relation_open_nolock(relationId); bool partitionTable = false; /* don't error out for tables that are dropped */ @@ -140,6 +142,39 @@ PartitionTableNoLock(Oid relationId) } +/* + * try_relation_open_nolock opens a relation with given relationId without + * acquiring locks. PostgreSQL's try_relation_open() asserts that caller + * has already acquired a lock on the relation, which we don't always do. + * + * ATTENTION: + * 1. Sync this with try_relation_open(). It hasn't changed for 10 to 12 + * releases though. + * 2. We should remove this after we fix the locking/distributed deadlock + * issues with MX Truncate. See https://github.com/citusdata/citus/pull/2894 + * for more discussion. + */ +static Relation +try_relation_open_nolock(Oid relationId) +{ + Relation relation = NULL; + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId))) + { + return NULL; + } + + relation = RelationIdGetRelation(relationId); + if (!RelationIsValid(relation)) + { + return NULL; + } + + pgstat_initstats(relation); + + return relation; +} + + /* * IsChildTable returns true if the table is inherited. Note that * partition tables inherites by default. However, this function