citus/src/backend/distributed/operations/shard_cleaner.c

/*-------------------------------------------------------------------------
*
* shard_cleaner.c
* This implements the background process that cleans shards that are
* left around. Shards that are left around are marked as state 4
* (SHARD_STATE_TO_DELETE) in pg_dist_placement.
*
 * Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/xact.h"
#include "postmaster/postmaster.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/shard_cleaner.h"
#include "distributed/shard_rebalancer.h"
#include "distributed/remote_commands.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_transaction.h"


/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(citus_cleanup_orphaned_shards);
PG_FUNCTION_INFO_V1(isolation_cleanup_orphaned_shards);

static bool TryDropShard(GroupShardPlacement *placement);
static bool TryLockRelationAndPlacementCleanup(Oid relationId, LOCKMODE lockmode);


/*
 * citus_cleanup_orphaned_shards implements a user-facing UDF to delete
 * orphaned shards that are still hanging around in the system. These shards
 * are orphaned by previous actions that were not able to delete the
 * placements directly, e.g. a shard move or dropping a distributed table
 * while one of the data nodes was not online.
 *
 * This function iterates over placements whose shardstate is
 * SHARD_STATE_TO_DELETE (shardstate = 4), drops the corresponding tables from
 * the node and removes the placement information from the catalog.
 *
 * The function takes no arguments and runs cluster wide. It cannot be run
 * inside a transaction block, because that would keep the locks it takes held
 * for a long time; while those locks are held, the background daemon cannot
 * clean up orphaned shards.
 */
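/*
 * For reference, the cleanup can also be invoked manually from SQL. Judging
 * by the CALL statement used in DropOrphanedShardsInSeparateTransaction
 * below, the SQL-level object is assumed to be a procedure:
 *
 *   CALL citus_cleanup_orphaned_shards();
 */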
Datum
citus_cleanup_orphaned_shards(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
EnsureCoordinator();
PreventInTransactionBlock(true, "citus_cleanup_orphaned_shards");
bool waitForLocks = true;
int droppedShardCount = DropOrphanedShards(waitForLocks);
if (droppedShardCount > 0)
{
ereport(NOTICE, (errmsg("cleaned up %d orphaned shards", droppedShardCount)));
}
PG_RETURN_VOID();
}


/*
 * isolation_cleanup_orphaned_shards implements a test UDF that does the same
 * as citus_cleanup_orphaned_shards. The only difference is that this command
 * can be run inside transactions, which is needed to exercise the cleanup
 * from isolation tests.
 */
Datum
isolation_cleanup_orphaned_shards(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
EnsureCoordinator();
bool waitForLocks = true;
int droppedShardCount = DropOrphanedShards(waitForLocks);
if (droppedShardCount > 0)
{
ereport(NOTICE, (errmsg("cleaned up %d orphaned shards", droppedShardCount)));
}
PG_RETURN_VOID();
}


/*
 * DropOrphanedShardsInSeparateTransaction cleans up orphaned shards by
 * connecting to localhost. This is done so that the locks that
 * DropOrphanedShards takes are only held for a short time.
 */
void
DropOrphanedShardsInSeparateTransaction(void)
{
ExecuteCriticalCommandInSeparateTransaction("CALL citus_cleanup_orphaned_shards()");
}


/*
 * TryDropOrphanedShards is a wrapper around DropOrphanedShards that catches
 * any errors to make it safe to use in the maintenance daemon.
 *
 * If dropping the shards errors out, the error is demoted to a WARNING and 0
 * is returned; otherwise the number of dropped shards is returned.
 */
int
TryDropOrphanedShards(bool waitForLocks)
{
int droppedShardCount = 0;
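	/*
	 * Remember the caller's memory context: the error handler below switches
	 * back to it before copying the error data.
	 */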
MemoryContext savedContext = CurrentMemoryContext;
PG_TRY();
{
droppedShardCount = DropOrphanedShards(waitForLocks);
}
PG_CATCH();
{
MemoryContextSwitchTo(savedContext);
ErrorData *edata = CopyErrorData();
FlushErrorState();
/* rethrow as WARNING */
edata->elevel = WARNING;
ThrowErrorData(edata);
}
PG_END_TRY();
return droppedShardCount;
}


/*
 * DropOrphanedShards removes shards that were marked SHARD_STATE_TO_DELETE
 * before.
 *
 * It does so by trying to take an exclusive lock on the shard and its
 * colocated placements before removing them. If the lock cannot be obtained
 * the group is skipped and the remaining groups are processed; a skipped
 * group is removed in a later round, once no locks are held on those
 * placements anymore.
 *
 * Before doing any of this the function takes an exclusive PlacementCleanup
 * lock, to ensure that it is never run concurrently. Otherwise really bad
 * race conditions are possible, such as removing all placements of a shard.
 * waitForLocks indicates whether the function should wait for this lock: if
 * waitForLocks is false and the locks (including the one on
 * pg_dist_placement) cannot be acquired immediately, the function returns
 * without removing anything.
 */
int
DropOrphanedShards(bool waitForLocks)
{
int removedShardCount = 0;
ListCell *shardPlacementCell = NULL;
/*
* We should try to take the highest lock that we take
* later in this function for pg_dist_placement. We take RowExclusiveLock
* in DeleteShardPlacementRow.
*/
LOCKMODE lockmode = RowExclusiveLock;
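
	/* orphaned shard cleanup is only ever performed from the coordinator */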
if (!IsCoordinator())
{
return 0;
}
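
	/*
	 * Take the placement cleanup lock; if waitForLocks is false, bail out
	 * instead of waiting when the locks cannot be acquired right away.
	 */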
if (waitForLocks)
{
LockPlacementCleanup();
}
else
{
Oid distPlacementId = DistPlacementRelationId();
if (!TryLockRelationAndPlacementCleanup(distPlacementId, lockmode))
{
return 0;
}
}
int failedShardDropCount = 0;
List *shardPlacementList = AllShardPlacementsWithShardPlacementState(
SHARD_STATE_TO_DELETE);
foreach(shardPlacementCell, shardPlacementList)
{
GroupShardPlacement *placement = (GroupShardPlacement *) lfirst(
shardPlacementCell);
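
		/*
		 * Skip placements whose node group currently has no primary node, or
		 * whose shard no longer exists in the metadata.
		 */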
if (!PrimaryNodeForGroup(placement->groupId, NULL) ||
!ShardExists(placement->shardId))
{
continue;
}
if (TryDropShard(placement))
{
removedShardCount++;
}
else
{
failedShardDropCount++;
}
}
if (failedShardDropCount > 0)
{
ereport(WARNING, (errmsg("Failed to drop %d orphaned shards out of %d",
failedShardDropCount, list_length(shardPlacementList))));
}
return removedShardCount;
}


/*
 * TryLockRelationAndPlacementCleanup tries to lock the given relation and the
 * placement cleanup lock without waiting. If either lock cannot be acquired
 * it returns false.
 */
static bool
TryLockRelationAndPlacementCleanup(Oid relationId, LOCKMODE lockmode)
{
if (!ConditionalLockRelationOid(relationId, lockmode))
{
ereport(DEBUG1, (errmsg(
"could not acquire shard lock to cleanup placements")));
return false;
}
if (!TryLockPlacementCleanup())
{
ereport(DEBUG1, (errmsg("could not acquire lock to cleanup placements")));
return false;
}
return true;
}


/*
 * TryDropShard tries to drop the given shard placement and returns
 * true on success.
 */
static bool
TryDropShard(GroupShardPlacement *placement)
{
ShardPlacement *shardPlacement = LoadShardPlacement(placement->shardId,
placement->placementId);
ShardInterval *shardInterval = LoadShardInterval(shardPlacement->shardId);
ereport(LOG, (errmsg("dropping shard placement " INT64_FORMAT " of shard "
INT64_FORMAT " on %s:%d after it was moved away",
shardPlacement->placementId, shardPlacement->shardId,
shardPlacement->nodeName, shardPlacement->nodePort)));
/* prepare sql query to execute to drop the shard */
StringInfo dropQuery = makeStringInfo();
char *qualifiedTableName = ConstructQualifiedShardName(shardInterval);
appendStringInfo(dropQuery, DROP_REGULAR_TABLE_COMMAND, qualifiedTableName);
	/*
	 * We set a lock_timeout here so that if there are running queries on the
	 * shards, we do not block for more than 1s before giving up on the drop.
	 *
	 * The lock timeout also avoids getting stuck in a distributed deadlock,
	 * which can occur because we might be holding pg_dist_placement locks
	 * while also taking locks on the shard placements, and this code
	 * interrupts the distributed deadlock detector.
	 */
List *dropCommandList = list_make2("SET LOCAL lock_timeout TO '1s'",
dropQuery->data);
/* remove the shard from the node */
bool success =
SendOptionalCommandListToWorkerOutsideTransaction(shardPlacement->nodeName,
shardPlacement->nodePort,
NULL, dropCommandList);
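
	/*
	 * On failure we leave the placement row in place (still marked
	 * SHARD_STATE_TO_DELETE), so a later cleanup round retries the drop.
	 */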
if (success)
{
/* delete the actual placement */
DeleteShardPlacementRow(placement->placementId);
}
return success;
}