Merge pull request #4963 from citusdata/fill-rebalance-monitor-shardsize

Jelte Fennema 2021-05-20 16:51:08 +02:00 committed by GitHub
commit b25d3e83ef
19 changed files with 842 additions and 65 deletions

--- changed file ---

@@ -31,6 +31,7 @@
#include "distributed/enterprise.h"
#include "distributed/hash_helpers.h"
#include "distributed/listutils.h"
#include "distributed/lock_graph.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_client_executor.h"
@@ -139,11 +140,52 @@ typedef struct RebalanceContext
FmgrInfo shardAllowedOnNodeUDF;
} RebalanceContext;
/* WorkerHashKey contains hostname and port to be used as a key in a hash */
typedef struct WorkerHashKey
{
char hostname[MAX_NODE_LENGTH];
int port;
} WorkerHashKey;
/* WorkerShardIds represents a set of shardIds grouped by worker */
typedef struct WorkerShardIds
{
WorkerHashKey worker;
/* This is a uint64 hashset representing the shard ids for a specific worker */
HTAB *shardIds;
} WorkerShardIds;
/* ShardStatistics contains statistics about a shard */
typedef struct ShardStatistics
{
uint64 shardId;
/* The shard's size in bytes. */
uint64 totalSize;
} ShardStatistics;
/*
* WorkerShardStatistics represents a set of statistics about shards,
* grouped by worker.
*/
typedef struct WorkerShardStatistics
{
WorkerHashKey worker;
/*
* Statistics for each shard on this worker:
* key: shardId
* value: ShardStatistics
*/
HTAB *statistics;
} WorkerShardStatistics;
/* static declarations for main logic */
static int ShardActivePlacementCount(HTAB *activePlacementsHash, uint64 shardId,
List *activeWorkerNodeList);
static bool UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
static void UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
List *responsiveNodeList, Oid shardReplicationModeOid);
/* static declarations for main logic's utility functions */
@@ -192,6 +234,17 @@ static Form_pg_dist_rebalance_strategy GetRebalanceStrategy(Name name);
static void EnsureShardCostUDF(Oid functionOid);
static void EnsureNodeCapacityUDF(Oid functionOid);
static void EnsureShardAllowedOnNodeUDF(Oid functionOid);
static void ConflictShardPlacementUpdateOnlyWithIsolationTesting(uint64 shardId);
static HTAB * BuildWorkerShardStatisticsHash(PlacementUpdateEventProgress *steps,
int stepCount);
static HTAB * GetShardStatistics(MultiConnection *connection, HTAB *shardIds);
static HTAB * GetMovedShardIdsByWorker(PlacementUpdateEventProgress *steps,
int stepCount, bool fromSource);
static uint64 WorkerShardSize(HTAB *workerShardStatistics,
char *workerName, int workerPort, uint64 shardId);
static void AddToWorkerShardIdSet(HTAB *shardsByWorker, char *workerName, int workerPort,
uint64 shardId);
static HTAB * BuildShardSizesHash(ProgressMonitorData *monitor, HTAB *shardStatistics);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(rebalance_table_shards);
@@ -204,8 +257,17 @@ PG_FUNCTION_INFO_V1(citus_shard_cost_by_disk_size);
PG_FUNCTION_INFO_V1(citus_validate_rebalance_strategy_functions);
PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check);
bool RunningUnderIsolationTest = false;
int MaxRebalancerLoggedIgnoredMoves = 5;
/*
* This is a randomly generated, hardcoded number. It's used as the first
* part of the advisory lock identifier that's used during isolation tests.
* See the comments on ConflictShardPlacementUpdateOnlyWithIsolationTesting
* for more information.
*/
#define SHARD_PLACEMENT_UPDATE_ADVISORY_LOCK_FIRST_KEY 29279
#ifdef USE_ASSERT_CHECKING
@@ -464,8 +526,7 @@ NodeCapacity(WorkerNode *workerNode, void *voidContext)
static ShardCost
GetShardCost(uint64 shardId, void *voidContext)
{
ShardCost shardCost;
memset_struct_0(shardCost);
ShardCost shardCost = { 0 };
shardCost.shardId = shardId;
RebalanceContext *context = voidContext;
Datum shardCostDatum = FunctionCall1(&context->shardCostUDF, UInt64GetDatum(shardId));
@@ -698,8 +759,6 @@ SetupRebalanceMonitor(List *placementUpdateList, Oid relationId)
event->shardId = colocatedUpdate->shardId;
event->sourcePort = colocatedUpdate->sourceNode->workerPort;
event->targetPort = colocatedUpdate->targetNode->workerPort;
event->shardSize = ShardLength(colocatedUpdate->shardId);
eventIndex++;
}
}
@@ -1015,19 +1074,35 @@ get_rebalance_progress(PG_FUNCTION_ARGS)
List *rebalanceMonitorList = ProgressMonitorList(REBALANCE_ACTIVITY_MAGIC_NUMBER,
&segmentList);
foreach(rebalanceMonitorCell, rebalanceMonitorList)
{
ProgressMonitorData *monitor = lfirst(rebalanceMonitorCell);
PlacementUpdateEventProgress *placementUpdateEvents = monitor->steps;
HTAB *shardStatistics = BuildWorkerShardStatisticsHash(monitor->steps,
monitor->stepCount);
HTAB *shardSizes = BuildShardSizesHash(monitor, shardStatistics);
for (int eventIndex = 0; eventIndex < monitor->stepCount; eventIndex++)
{
PlacementUpdateEventProgress *step = placementUpdateEvents + eventIndex;
uint64 shardId = step->shardId;
ShardInterval *shardInterval = LoadShardInterval(shardId);
Datum values[9];
bool nulls[9];
uint64 sourceSize = WorkerShardSize(shardStatistics, step->sourceName,
step->sourcePort, shardId);
uint64 targetSize = WorkerShardSize(shardStatistics, step->targetName,
step->targetPort, shardId);
uint64 shardSize = 0;
ShardStatistics *shardSizesStat =
hash_search(shardSizes, &shardId, HASH_FIND, NULL);
if (shardSizesStat)
{
shardSize = shardSizesStat->totalSize;
}
Datum values[11];
bool nulls[11];
memset(values, 0, sizeof(values));
memset(nulls, 0, sizeof(nulls));
@@ -1035,12 +1110,14 @@ get_rebalance_progress(PG_FUNCTION_ARGS)
values[0] = monitor->processId;
values[1] = ObjectIdGetDatum(shardInterval->relationId);
values[2] = UInt64GetDatum(shardId);
values[3] = UInt64GetDatum(step->shardSize);
values[3] = UInt64GetDatum(shardSize);
values[4] = PointerGetDatum(cstring_to_text(step->sourceName));
values[5] = UInt32GetDatum(step->sourcePort);
values[6] = PointerGetDatum(cstring_to_text(step->targetName));
values[7] = UInt32GetDatum(step->targetPort);
values[8] = UInt64GetDatum(step->progress);
values[9] = UInt64GetDatum(sourceSize);
values[10] = UInt64GetDatum(targetSize);
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
}
@@ -1054,6 +1131,348 @@ get_rebalance_progress(PG_FUNCTION_ARGS)
}
/*
* BuildShardSizesHash creates a hash that maps a shardid to its full size
* within the cluster. It does this by using the rebalance progress monitor
* state to find the node the shard is currently on. It then looks up the shard
* size in the shardStatistics hashmap for this node.
*/
static HTAB *
BuildShardSizesHash(ProgressMonitorData *monitor, HTAB *shardStatistics)
{
HASHCTL info = {
.keysize = sizeof(uint64),
.entrysize = sizeof(ShardStatistics),
.hcxt = CurrentMemoryContext
};
HTAB *shardSizes = hash_create(
"ShardSizeHash", 32, &info,
HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
PlacementUpdateEventProgress *placementUpdateEvents = monitor->steps;
for (int eventIndex = 0; eventIndex < monitor->stepCount; eventIndex++)
{
PlacementUpdateEventProgress *step = placementUpdateEvents + eventIndex;
uint64 shardId = step->shardId;
uint64 shardSize = 0;
uint64 backupShardSize = 0;
uint64 sourceSize = WorkerShardSize(shardStatistics, step->sourceName,
step->sourcePort, shardId);
uint64 targetSize = WorkerShardSize(shardStatistics, step->targetName,
step->targetPort, shardId);
if (step->progress == REBALANCE_PROGRESS_WAITING ||
step->progress == REBALANCE_PROGRESS_MOVING)
{
/*
* If we are not done with the move, the correct shard size is the
* size on the source.
*/
shardSize = sourceSize;
backupShardSize = targetSize;
}
else if (step->progress == REBALANCE_PROGRESS_MOVED)
{
/*
* If we are done with the move, the correct shard size is the size
* on the target
*/
shardSize = targetSize;
backupShardSize = sourceSize;
}
if (shardSize == 0)
{
if (backupShardSize == 0)
{
/*
* We don't have any useful shard size. This can happen when a
* shard is moved multiple times and it is not present on
* either of these nodes. Probably the shard is on a worker
* related to another event. In the weird case that this shard
* is on the nodes and actually is size 0, we will have no
* entry in the hashmap. When fetching from it we always
* default to 0 if no entry is found, so that's fine.
*/
continue;
}
/*
* There exist some race conditions where it's possible that the state of
* the steps we see in shared memory is a bit behind what is actually going
* on. So it is possible that even though a step is reported as still being
* in the MOVING state, the shard move might have just completed. This in
* turn can mean
* that the source size is 0 while the target size is not. We try
* to handle such rare edge cases by falling back on the other
* shard size if that one is not 0.
*/
shardSize = backupShardSize;
}
ShardStatistics *currentWorkerStatistics =
hash_search(shardSizes, &shardId, HASH_ENTER, NULL);
currentWorkerStatistics->totalSize = shardSize;
}
return shardSizes;
}
/*
* WorkerShardSize returns the size of a shard in bytes on a worker, based on
* the workerShardStatisticsHash.
*/
static uint64
WorkerShardSize(HTAB *workerShardStatisticsHash, char *workerName, int workerPort,
uint64 shardId)
{
WorkerHashKey workerKey = { 0 };
strlcpy(workerKey.hostname, workerName, MAX_NODE_LENGTH);
workerKey.port = workerPort;
WorkerShardStatistics *workerStats =
hash_search(workerShardStatisticsHash, &workerKey, HASH_FIND, NULL);
if (!workerStats)
{
return 0;
}
ShardStatistics *shardStats =
hash_search(workerStats->statistics, &shardId, HASH_FIND, NULL);
if (!shardStats)
{
return 0;
}
return shardStats->totalSize;
}
/*
* BuildWorkerShardStatisticsHash returns a worker -> shard statistics hash,
* containing the sizes of the moved shards on both the source and the
* destination nodes of the moves.
*/
static HTAB *
BuildWorkerShardStatisticsHash(PlacementUpdateEventProgress *steps, int stepCount)
{
HTAB *shardsByWorker = GetMovedShardIdsByWorker(steps, stepCount, true);
HASHCTL info = {
.keysize = sizeof(WorkerHashKey),
.entrysize = sizeof(WorkerShardStatistics),
.hcxt = CurrentMemoryContext
};
HTAB *workerShardStatistics = hash_create("WorkerShardStatistics", 32, &info,
HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
WorkerShardIds *entry = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, shardsByWorker);
while ((entry = hash_seq_search(&status)) != NULL)
{
int connectionFlags = 0;
MultiConnection *connection = GetNodeConnection(connectionFlags,
entry->worker.hostname,
entry->worker.port);
HTAB *statistics =
GetShardStatistics(connection, entry->shardIds);
WorkerHashKey workerKey = { 0 };
strlcpy(workerKey.hostname, entry->worker.hostname, MAX_NODE_LENGTH);
workerKey.port = entry->worker.port;
WorkerShardStatistics *moveStat =
hash_search(workerShardStatistics, &entry->worker, HASH_ENTER, NULL);
moveStat->statistics = statistics;
}
return workerShardStatistics;
}
/*
* GetShardStatistics fetches the statistics for the given shard ids over the
* given connection. It returns a hashmap where the keys are the shard ids and
* the values are the statistics.
*/
static HTAB *
GetShardStatistics(MultiConnection *connection, HTAB *shardIds)
{
StringInfo query = makeStringInfo();
appendStringInfoString(
query,
"WITH shard_names (shard_id, schema_name, table_name) AS ((VALUES ");
bool isFirst = true;
uint64 *shardIdPtr = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, shardIds);
while ((shardIdPtr = hash_seq_search(&status)) != NULL)
{
uint64 shardId = *shardIdPtr;
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid relationId = shardInterval->relationId;
char *shardName = get_rel_name(relationId);
AppendShardIdToName(&shardName, shardId);
Oid schemaId = get_rel_namespace(relationId);
char *schemaName = get_namespace_name(schemaId);
if (!isFirst)
{
appendStringInfo(query, ", ");
}
appendStringInfo(query, "(" UINT64_FORMAT ",%s,%s)",
shardId,
quote_literal_cstr(schemaName),
quote_literal_cstr(shardName));
isFirst = false;
}
appendStringInfoString(query, "))");
appendStringInfoString(
query,
" SELECT shard_id, coalesce(pg_total_relation_size(tables.relid),0)"
/* for each shard in shardIds */
" FROM shard_names"
/* check if its name can be found in pg_class, if so return size */
" LEFT JOIN"
" (SELECT c.oid AS relid, c.relname, n.nspname"
" FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace) tables"
" ON tables.relname = shard_names.table_name AND"
" tables.nspname = shard_names.schema_name ");
PGresult *result = NULL;
int queryResult = ExecuteOptionalRemoteCommand(connection, query->data, &result);
if (queryResult != RESPONSE_OKAY)
{
ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("cannot get the size because of a connection error")));
}
int rowCount = PQntuples(result);
int colCount = PQnfields(result);
/* This is not expected to ever happen, but we check just to be sure */
if (colCount < 2)
{
ereport(ERROR, (errmsg("unexpected number of columns returned by: %s",
query->data)));
}
HASHCTL info = {
.keysize = sizeof(uint64),
.entrysize = sizeof(ShardStatistics),
.hcxt = CurrentMemoryContext
};
HTAB *shardStatistics = hash_create("ShardStatisticsHash", 32, &info,
HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
char *shardIdString = PQgetvalue(result, rowIndex, 0);
uint64 shardId = pg_strtouint64(shardIdString, NULL, 10);
char *sizeString = PQgetvalue(result, rowIndex, 1);
uint64 totalSize = pg_strtouint64(sizeString, NULL, 10);
ShardStatistics *statistics =
hash_search(shardStatistics, &shardId, HASH_ENTER, NULL);
statistics->totalSize = totalSize;
}
PQclear(result);
bool raiseErrors = true;
ClearResults(connection, raiseErrors);
return shardStatistics;
}
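
For illustration, with two hypothetical shards 102008 and 102009 of a table
public.companies (names invented here, not part of this commit), the query
built above comes out roughly as:

    WITH shard_names (shard_id, schema_name, table_name) AS ((VALUES
        (102008, 'public', 'companies_102008'),
        (102009, 'public', 'companies_102009')))
    SELECT shard_id, coalesce(pg_total_relation_size(tables.relid), 0)
    FROM shard_names
    LEFT JOIN
        (SELECT c.oid AS relid, c.relname, n.nspname
         FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace) tables
    ON tables.relname = shard_names.table_name AND
       tables.nspname = shard_names.schema_name;

A shard that is not (or no longer) present on the worker has no matching
pg_class row, so the LEFT JOIN plus coalesce reports its size as 0 instead
of dropping the row.
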
/*
* GetMovedShardIdsByWorker groups the shard ids in the provided steps by
* worker. It returns a hashmap that maps each worker to a set of these shard ids.
*/
static HTAB *
GetMovedShardIdsByWorker(PlacementUpdateEventProgress *steps, int stepCount,
bool fromSource)
{
HASHCTL info = {
.keysize = sizeof(WorkerHashKey),
.entrysize = sizeof(WorkerShardIds),
.hcxt = CurrentMemoryContext
};
HTAB *shardsByWorker = hash_create("GetRebalanceStepsByWorker", 32, &info,
HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
for (int stepIndex = 0; stepIndex < stepCount; stepIndex++)
{
PlacementUpdateEventProgress *step = &(steps[stepIndex]);
AddToWorkerShardIdSet(shardsByWorker, step->sourceName, step->sourcePort,
step->shardId);
if (step->progress == REBALANCE_PROGRESS_WAITING)
{
/*
* shard move has not started so we don't need target stats for
* this shard
*/
continue;
}
AddToWorkerShardIdSet(shardsByWorker, step->targetName, step->targetPort,
step->shardId);
}
return shardsByWorker;
}
/*
* AddToWorkerShardIdSet adds the shard id to the shard id set for the
* specified worker in the shardsByWorker hashmap.
*/
static void
AddToWorkerShardIdSet(HTAB *shardsByWorker, char *workerName, int workerPort,
uint64 shardId)
{
WorkerHashKey workerKey = { 0 };
strlcpy(workerKey.hostname, workerName, MAX_NODE_LENGTH);
workerKey.port = workerPort;
bool isFound = false;
WorkerShardIds *workerShardIds =
hash_search(shardsByWorker, &workerKey, HASH_ENTER, &isFound);
if (!isFound)
{
HASHCTL info = {
.keysize = sizeof(uint64),
.entrysize = sizeof(uint64),
.hcxt = CurrentMemoryContext
};
workerShardIds->shardIds = hash_create(
"WorkerShardIdsSet", 32, &info,
HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
}
hash_search(workerShardIds->shardIds, &shardId, HASH_ENTER, NULL);
}
/*
* NonColocatedDistRelationIdList returns a list of distributed table oids, one
* for each existing colocation group.
@@ -1151,11 +1570,47 @@ RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid)
}
/*
* ConflictShardPlacementUpdateOnlyWithIsolationTesting is only useful for
* testing and should not be called by any code-path except for
* UpdateShardPlacement().
*
* To be able to test the rebalance monitor functionality correctly, we need
* to be able to pause the rebalancer at a specific point in time. We cannot
* do this by blocking the shard move itself in some way (e.g. by calling
* truncate on the distributed table), because we do the shard move in a
* newly opened connection. That causes our isolation tester block detection
* to not realise that the rebalance_table_shards call is blocked.
*
* So instead, before opening a connection we take an advisory lock that's
* based on the shard id (shard id mod 1000). By taking the same advisory
* lock in a different session we can block the rebalancer in a way that the
* isolation tester's block detection is able to see.
*/
static void
ConflictShardPlacementUpdateOnlyWithIsolationTesting(uint64 shardId)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
if (RunningUnderIsolationTest)
{
/* we've picked a random lock */
SET_LOCKTAG_ADVISORY(tag, MyDatabaseId,
SHARD_PLACEMENT_UPDATE_ADVISORY_LOCK_FIRST_KEY,
shardId % 1000, 2);
(void) LockAcquire(&tag, ExclusiveLock, sessionLock, dontWait);
}
}
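
Because the final SET_LOCKTAG_ADVISORY field of 2 matches the locktag used
by the two-key int4 form of PostgreSQL's advisory lock functions, a test
session can create this conflict with plain SQL (a sketch; with the spec's
shard ids, shard 1500001 maps to key 1500001 % 1000 = 1):

    -- block any shard move whose shard id modulo 1000 equals 1
    SELECT pg_advisory_lock(29279, 1);
    -- ... observe get_rebalance_progress() from another session ...
    SELECT pg_advisory_unlock(29279, 1);

The new isolation spec below uses exactly this pattern in its s2-lock-1 and
s2-lock-2 steps.
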
/*
* UpdateShardPlacement copies or moves a shard placement by calling
* the corresponding functions in Citus in a subtransaction.
*/
static bool
static void
UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
List *responsiveNodeList, Oid shardReplicationModeOid)
{
@@ -1177,13 +1632,9 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
targetNode->workerPort);
if (!targetResponsive)
{
ereport(WARNING, (errmsg("%s:%d is not responsive", targetNode->workerName,
targetNode->workerPort)));
UpdateColocatedShardPlacementProgress(shardId,
sourceNode->workerName,
sourceNode->workerPort,
REBALANCE_PROGRESS_ERROR);
return false;
ereport(ERROR, (errmsg("target node %s:%d is not responsive",
targetNode->workerName,
targetNode->workerPort)));
}
/* if source node is not responsive, don't continue */
@@ -1192,13 +1643,9 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
sourceNode->workerPort);
if (!sourceResponsive)
{
ereport(WARNING, (errmsg("%s:%d is not responsive", sourceNode->workerName,
sourceNode->workerPort)));
UpdateColocatedShardPlacementProgress(shardId,
sourceNode->workerName,
sourceNode->workerPort,
REBALANCE_PROGRESS_ERROR);
return false;
ereport(ERROR, (errmsg("source node %s:%d is not responsive",
sourceNode->workerName,
sourceNode->workerPort)));
}
if (updateType == PLACEMENT_UPDATE_MOVE)
@@ -1235,6 +1682,7 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
sourceNode->workerPort,
REBALANCE_PROGRESS_MOVING);
ConflictShardPlacementUpdateOnlyWithIsolationTesting(shardId);
int connectionFlag = FORCE_NEW_CONNECTION;
MultiConnection *connection = GetNodeConnection(connectionFlag, LocalHostName,
PostPortNumber);
@@ -1250,8 +1698,6 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
sourceNode->workerPort,
REBALANCE_PROGRESS_MOVED);
CloseConnection(connection);
return true;
}
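
With this change, an unresponsive source or target node aborts the whole
rebalance with an ERROR instead of emitting a WARNING, marking the step as
REBALANCE_PROGRESS_ERROR, and skipping the move. Against a cluster with an
unreachable target it now fails along these lines (a sketch based on the
updated regression output below):

    SELECT rebalance_table_shards('rebalance_test_table',
                                  shard_transfer_mode := 'block_writes');
    -- ERROR:  target node localhost:xxxxx is not responsive
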

--- changed file ---

@@ -1499,6 +1499,19 @@ RegisterCitusConfigVariables(void)
GUC_SUPERUSER_ONLY,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citus.running_under_isolation_test",
gettext_noop(
"Only useful for testing purposes, when set to true, Citus does some "
"tricks to implement useful isolation tests with rebalancing. Should "
"never be set to true on production systems "),
gettext_noop("for details of the tricks implemented, refer to the source code"),
&RunningUnderIsolationTest,
false,
PGC_SUSET,
GUC_SUPERUSER_ONLY | GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
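
Since the GUC is PGC_SUSET, a superuser test session could enable it with a
plain SET (a sketch; the regression harness change further down does the
equivalent through a server option instead):

    SET citus.running_under_isolation_test = true;
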
DefineCustomBoolVariable(
"citus.select_opens_transaction_block",
gettext_noop("Open transaction blocks for SELECT commands"),

--- changed file ---

@@ -22,3 +22,5 @@ ON CONFLICT DO NOTHING;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy ADD COLUMN improvement_threshold float4 NOT NULL default 0;
UPDATE pg_catalog.pg_dist_rebalance_strategy SET improvement_threshold = 0.5 WHERE name = 'by_disk_size';
#include "udfs/get_rebalance_progress/10.1-1.sql"

--- changed file ---

@@ -83,24 +83,7 @@ DROP EXTENSION IF EXISTS shard_rebalancer;
#include "udfs/get_rebalance_table_shards_plan/9.0-1.sql"
#include "udfs/replicate_table_shards/9.0-1.sql"
#include "udfs/rebalance_table_shards/9.0-1.sql"
-- get_rebalance_progress returns the list of shard placement move operations along with
-- their progressions for ongoing rebalance operations.
--
CREATE OR REPLACE FUNCTION get_rebalance_progress()
RETURNS TABLE(sessionid integer,
table_name regclass,
shardid bigint,
shard_size bigint,
sourcename text,
sourceport int,
targetname text,
targetport int,
progress bigint)
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
COMMENT ON FUNCTION get_rebalance_progress()
IS 'provides progress information about the ongoing rebalance operations';
#include "udfs/get_rebalance_progress/9.0-1.sql"
DROP FUNCTION master_add_node(text, integer, integer, noderole, name);
CREATE FUNCTION master_add_node(nodename text,

--- changed file ---

@@ -36,3 +36,8 @@ DROP FUNCTION pg_catalog.citus_add_rebalance_strategy;
#include "../udfs/citus_add_rebalance_strategy/9.2-1.sql"
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy DROP COLUMN improvement_threshold;
-- the migration for get_rebalance_progress from 9.0-1 was the first one,
-- so it doesn't have a DROP. This is why we DROP manually here.
DROP FUNCTION pg_catalog.get_rebalance_progress;
#include "../udfs/get_rebalance_progress/9.0-1.sql"

--- changed file ---

@@ -0,0 +1,18 @@
DROP FUNCTION pg_catalog.get_rebalance_progress();
CREATE OR REPLACE FUNCTION pg_catalog.get_rebalance_progress()
RETURNS TABLE(sessionid integer,
table_name regclass,
shardid bigint,
shard_size bigint,
sourcename text,
sourceport int,
targetname text,
targetport int,
progress bigint,
source_shard_size bigint,
target_shard_size bigint)
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
COMMENT ON FUNCTION pg_catalog.get_rebalance_progress()
IS 'provides progress information about the ongoing rebalance operations';
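
With the two new columns in place, a monitoring session might watch a
running rebalance with something like this (a sketch, not part of this
commit):

    SELECT table_name, shardid,
           pg_size_pretty(shard_size)        AS shard_size,
           sourcename, sourceport,
           pg_size_pretty(source_shard_size) AS source_size,
           targetname, targetport,
           pg_size_pretty(target_shard_size) AS target_size,
           progress
    FROM get_rebalance_progress();

source_shard_size and target_shard_size report the size of the shard on the
source and target node respectively, as the isolation test output further
down illustrates.
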

--- changed file ---

@@ -0,0 +1,17 @@
-- get_rebalance_progress returns the list of shard placement move operations along with
-- their progressions for ongoing rebalance operations.
--
CREATE OR REPLACE FUNCTION pg_catalog.get_rebalance_progress()
RETURNS TABLE(sessionid integer,
table_name regclass,
shardid bigint,
shard_size bigint,
sourcename text,
sourceport int,
targetname text,
targetport int,
progress bigint)
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
COMMENT ON FUNCTION pg_catalog.get_rebalance_progress()
IS 'provides progress information about the ongoing rebalance operations';

--- changed file ---

@@ -0,0 +1,18 @@
DROP FUNCTION pg_catalog.get_rebalance_progress();
CREATE OR REPLACE FUNCTION pg_catalog.get_rebalance_progress()
RETURNS TABLE(sessionid integer,
table_name regclass,
shardid bigint,
shard_size bigint,
sourcename text,
sourceport int,
targetname text,
targetport int,
progress bigint,
source_shard_size bigint,
target_shard_size bigint)
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
COMMENT ON FUNCTION pg_catalog.get_rebalance_progress()
IS 'provides progress information about the ongoing rebalance operations';

--- changed file ---

@@ -105,7 +105,6 @@ typedef struct PlacementUpdateEventProgress
int sourcePort;
char targetName[255];
int targetPort;
uint64 shardSize;
uint64 progress;
} PlacementUpdateEventProgress;
@@ -172,6 +171,7 @@ typedef struct RebalancePlanFunctions
} RebalancePlanFunctions;
extern int MaxRebalancerLoggedIgnoredMoves;
extern bool RunningUnderIsolationTest;
/* External function declarations */
extern Datum shard_placement_rebalance_array(PG_FUNCTION_ARGS);

--- changed file ---

@@ -286,7 +286,7 @@ rebalance_table_shards
step s2-rebalance-all:
select rebalance_table_shards();
ERROR: could not acquire the lock required to rebalance public.distributed_transaction_id_table
ERROR: could not acquire the lock required to rebalance public.colocated1
step s1-commit:
COMMIT;
@@ -308,7 +308,7 @@ replicate_table_shards
step s2-rebalance-all:
select rebalance_table_shards();
ERROR: could not acquire the lock required to rebalance public.distributed_transaction_id_table
ERROR: could not acquire the lock required to rebalance public.colocated1
step s1-commit:
COMMIT;
@@ -374,7 +374,7 @@ rebalance_table_shards
step s2-drain:
select master_drain_node('localhost', 57638);
ERROR: could not acquire the lock required to move public.distributed_transaction_id_table
ERROR: could not acquire the lock required to move public.colocated1
step s1-commit:
COMMIT;
@@ -396,7 +396,7 @@ replicate_table_shards
step s2-drain:
select master_drain_node('localhost', 57638);
ERROR: could not acquire the lock required to move public.distributed_transaction_id_table
ERROR: could not acquire the lock required to move public.colocated1
step s1-commit:
COMMIT;

--- changed file ---

@@ -0,0 +1,127 @@
Parsed test spec with 3 sessions
starting permutation: s2-lock-1 s2-lock-2 s1-rebalance-c1 s3-progress s2-unlock-1 s3-progress s2-unlock-2 s3-progress s1-commit s3-progress
master_set_node_property
step s2-lock-1:
SELECT pg_advisory_lock(29279, 1);
pg_advisory_lock
step s2-lock-2:
SELECT pg_advisory_lock(29279, 2);
pg_advisory_lock
step s1-rebalance-c1:
BEGIN;
SELECT * FROM get_rebalance_table_shards_plan('colocated1');
SELECT rebalance_table_shards('colocated1', shard_transfer_mode:='block_writes');
<waiting ...>
step s3-progress:
set client_min_messages=NOTICE;
SELECT
table_name,
shardid,
shard_size,
sourcename,
sourceport,
source_shard_size,
targetname,
targetport,
target_shard_size,
progress
FROM get_rebalance_progress();
table_name shardid shard_size sourcename sourceport source_shard_size targetname targetport target_shard_size progress
colocated1 1500001 49152 localhost 57637 49152 localhost 57638 0 1
colocated2 1500005 376832 localhost 57637 376832 localhost 57638 0 1
colocated1 1500002 196608 localhost 57637 196608 localhost 57638 0 0
colocated2 1500006 8192 localhost 57637 8192 localhost 57638 0 0
step s2-unlock-1:
SELECT pg_advisory_unlock(29279, 1);
pg_advisory_unlock
t
step s3-progress:
set client_min_messages=NOTICE;
SELECT
table_name,
shardid,
shard_size,
sourcename,
sourceport,
source_shard_size,
targetname,
targetport,
target_shard_size,
progress
FROM get_rebalance_progress();
table_name shardid shard_size sourcename sourceport source_shard_size targetname targetport target_shard_size progress
colocated1 1500001 49152 localhost 57637 0 localhost 57638 49152 2
colocated2 1500005 376832 localhost 57637 0 localhost 57638 376832 2
colocated1 1500002 196608 localhost 57637 196608 localhost 57638 0 1
colocated2 1500006 8192 localhost 57637 8192 localhost 57638 0 1
step s2-unlock-2:
SELECT pg_advisory_unlock(29279, 2);
pg_advisory_unlock
t
step s1-rebalance-c1: <... completed>
table_name shardid shard_size sourcename sourceport targetname targetport
colocated1 1500001 0 localhost 57637 localhost 57638
colocated2 1500005 0 localhost 57637 localhost 57638
colocated1 1500002 0 localhost 57637 localhost 57638
colocated2 1500006 0 localhost 57637 localhost 57638
rebalance_table_shards
step s3-progress:
set client_min_messages=NOTICE;
SELECT
table_name,
shardid,
shard_size,
sourcename,
sourceport,
source_shard_size,
targetname,
targetport,
target_shard_size,
progress
FROM get_rebalance_progress();
table_name shardid shard_size sourcename sourceport source_shard_size targetname targetport target_shard_size progress
step s1-commit:
COMMIT;
step s3-progress:
set client_min_messages=NOTICE;
SELECT
table_name,
shardid,
shard_size,
sourcename,
sourceport,
source_shard_size,
targetname,
targetport,
target_shard_size,
progress
FROM get_rebalance_progress();
table_name shardid shard_size sourcename sourceport source_shard_size targetname targetport target_shard_size progress
restore_isolation_tester_func

--- changed file ---

@@ -564,20 +564,22 @@ SELECT * FROM print_extension_changes();
-- Snapshot of state at 10.1-1
ALTER EXTENSION citus UPDATE TO '10.1-1';
SELECT * FROM print_extension_changes();
previous_object | current_object
---------------------------------------------------------------------
function citus_add_rebalance_strategy(name,regproc,regproc,regproc,real,real) void |
function citus_internal.columnar_ensure_objects_exist() void |
function create_distributed_table(regclass,text,citus.distribution_type,text) void |
function get_rebalance_progress() TABLE(sessionid integer, table_name regclass, shardid bigint, shard_size bigint, sourcename text, sourceport integer, targetname text, targetport integer, progress bigint) |
function get_rebalance_table_shards_plan(regclass,real,integer,bigint[],boolean,name) TABLE(table_name regclass, shardid bigint, shard_size bigint, sourcename text, sourceport integer, targetname text, targetport integer) |
| function citus_add_rebalance_strategy(name,regproc,regproc,regproc,real,real,real) void
| function citus_local_disk_space_stats() record
| function create_distributed_table(regclass,text,citus.distribution_type,text,integer) void
| function get_rebalance_progress() TABLE(sessionid integer, table_name regclass, shardid bigint, shard_size bigint, sourcename text, sourceport integer, targetname text, targetport integer, progress bigint, source_shard_size bigint, target_shard_size bigint)
| function get_rebalance_table_shards_plan(regclass,real,integer,bigint[],boolean,name,real) TABLE(table_name regclass, shardid bigint, shard_size bigint, sourcename text, sourceport integer, targetname text, targetport integer)
| function worker_partitioned_relation_size(regclass) bigint
| function worker_partitioned_relation_total_size(regclass) bigint
| function worker_partitioned_table_size(regclass) bigint
(11 rows)
(13 rows)
DROP TABLE prev_objects, extension_diff;
-- show running version

--- changed file ---

@@ -181,6 +181,9 @@ RESET citus.shard_count;
RESET citus.shard_replication_factor;
RESET citus.replication_model;
-- Create a user to test multiuser usage of rebalancer functions
-- We explicitly don't create this user on the worker nodes yet, so we can
-- test some more error handling. We create it there later.
SET citus.enable_create_role_propagation TO OFF;
CREATE USER testrole;
NOTICE: not propagating CREATE ROLE/USER commands to worker nodes
HINT: Connect to worker nodes directly to manually create all necessary users and roles.
@@ -624,17 +627,48 @@ SELECT * FROM table_placements_per_node;
-- Check that max_shard_moves limits number of move operations
-- First check that we error if not table owner
-- Turn on NOTICE messages
SET ROLE testrole;
-- Make sure that rebalance is stopped if source or target nodes are
-- unresponsive.
SELECT rebalance_table_shards('rebalance_test_table',
shard_transfer_mode:='block_writes');
ERROR: target node localhost:xxxxx is not responsive
\c - - - :worker_1_port
SET citus.enable_create_role_propagation TO OFF;
CREATE USER testrole;
NOTICE: not propagating CREATE ROLE/USER commands to worker nodes
HINT: Connect to worker nodes directly to manually create all necessary users and roles.
GRANT ALL ON SCHEMA public TO testrole;
\c - - - :master_port
SET client_min_messages TO WARNING;
SET ROLE testrole;
SELECT rebalance_table_shards('rebalance_test_table',
threshold := 0, max_shard_moves := 1,
shard_transfer_mode:='block_writes');
WARNING: localhost:xxxxx is not responsive
rebalance_table_shards
---------------------------------------------------------------------
(1 row)
ERROR: source node localhost:xxxxx is not responsive
\c - - - :worker_2_port
SET citus.enable_create_role_propagation TO OFF;
CREATE USER testrole;
NOTICE: not propagating CREATE ROLE/USER commands to worker nodes
HINT: Connect to worker nodes directly to manually create all necessary users and roles.
GRANT ALL ON SCHEMA public TO testrole;
\c - - - :master_port
SET client_min_messages TO WARNING;
SET citus.next_shard_id TO 123010;
SET ROLE testrole;
SELECT rebalance_table_shards('rebalance_test_table',
shard_transfer_mode:='block_writes');
ERROR: must be owner of table rebalance_test_table
CONTEXT: while executing command on localhost:xxxxx
RESET ROLE;
-- Confirm no moves took place at all during these errors
SELECT * FROM table_placements_per_node;
nodeport | logicalrelid | count
---------------------------------------------------------------------
57637 | rebalance_test_table | 1
57638 | rebalance_test_table | 5
(2 rows)
SELECT rebalance_table_shards('rebalance_test_table',
threshold := 0, max_shard_moves := 1,
shard_transfer_mode:='block_writes');
@@ -1002,7 +1036,7 @@ SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', rebala
-- Check that we can call this function
SELECT * FROM get_rebalance_progress();
sessionid | table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport | progress
sessionid | table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport | progress | source_shard_size | target_shard_size
---------------------------------------------------------------------
(0 rows)
@@ -1015,7 +1049,7 @@ SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0,
-- Check that we can call this function without a crash
SELECT * FROM get_rebalance_progress();
sessionid | table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport | progress
sessionid | table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport | progress | source_shard_size | target_shard_size
---------------------------------------------------------------------
(0 rows)
@@ -1314,8 +1348,6 @@ RESET search_path;
DROP SCHEMA test_schema_support CASCADE;
\set VERBOSITY default
REVOKE ALL ON SCHEMA public FROM testrole;
ERROR: role "testrole" does not exist
CONTEXT: while executing command on localhost:xxxxx
DROP USER testrole;
-- Test costs
set citus.shard_count = 4;

--- changed file ---

@@ -73,6 +73,7 @@ test: isolation_blocking_move_single_shard_commands_on_mx
test: isolation_blocking_move_multi_shard_commands_on_mx
test: isolation_shard_rebalancer
test: isolation_rebalancer_deferred_drop
test: isolation_shard_rebalancer_progress
# MX tests
test: isolation_reference_on_mx

--- changed file ---

@@ -499,6 +499,8 @@ if($isolationtester)
push(@pgOptions, "citus.metadata_sync_interval=1000");
push(@pgOptions, "citus.metadata_sync_retry_interval=100");
push(@pgOptions, "client_min_messages='warning'"); # pg12 introduced notice showing during isolation tests
push(@pgOptions, "citus.running_under_isolation_test=true");
}
# Add externally added options last, so they overwrite the default ones above

--- changed file ---

@@ -3,9 +3,9 @@ setup
SELECT 1 FROM master_add_node('localhost', 57637);
SELECT 1 FROM master_add_node('localhost', 57638);
CREATE TABLE colocated1 (test_id integer NOT NULL, data text);
SELECT create_distributed_table('colocated1', 'test_id', 'hash');
SELECT create_distributed_table('colocated1', 'test_id', 'hash', 'none');
CREATE TABLE colocated2 (test_id integer NOT NULL, data text);
SELECT create_distributed_table('colocated2', 'test_id', 'hash');
SELECT create_distributed_table('colocated2', 'test_id', 'hash', 'colocated1');
CREATE TABLE non_colocated (test_id integer NOT NULL, data text);
SELECT create_distributed_table('non_colocated', 'test_id', 'hash', 'none');
}

--- changed file ---

@@ -0,0 +1,84 @@
setup
{
SELECT citus_internal.replace_isolation_tester_func();
SELECT citus_internal.refresh_isolation_tester_prepared_statement();
select setval('pg_dist_shardid_seq', GREATEST(1500000, nextval('pg_dist_shardid_seq')));
SET citus.shard_count TO 4;
SET citus.shard_replication_factor TO 1;
SELECT 1 FROM master_add_node('localhost', 57637);
SELECT master_set_node_property('localhost', 57638, 'shouldhaveshards', false);
CREATE TABLE colocated1 (test_id integer NOT NULL, data text);
SELECT create_distributed_table('colocated1', 'test_id', 'hash', 'none');
CREATE TABLE colocated2 (test_id integer NOT NULL, data text);
SELECT create_distributed_table('colocated2', 'test_id', 'hash', 'colocated1');
-- 1 and 3 are chosen so they go to shards 1 and 2
INSERT INTO colocated1(test_id) SELECT 1 from generate_series(0, 1000) i;
INSERT INTO colocated2(test_id) SELECT 1 from generate_series(0, 10000) i;
INSERT INTO colocated1(test_id) SELECT 3 from generate_series(0, 5000) i;
select * from pg_dist_placement;
SELECT master_set_node_property('localhost', 57638, 'shouldhaveshards', true);
}
teardown
{
DROP TABLE colocated2;
DROP TABLE colocated1;
SELECT citus_internal.restore_isolation_tester_func();
}
session "s1"
step "s1-rebalance-c1"
{
BEGIN;
SELECT * FROM get_rebalance_table_shards_plan('colocated1');
SELECT rebalance_table_shards('colocated1', shard_transfer_mode:='block_writes');
}
step "s1-commit"
{
COMMIT;
}
session "s2"
step "s2-lock-1"
{
SELECT pg_advisory_lock(29279, 1);
}
step "s2-lock-2"
{
SELECT pg_advisory_lock(29279, 2);
}
step "s2-unlock-1"
{
SELECT pg_advisory_unlock(29279, 1);
}
step "s2-unlock-2"
{
SELECT pg_advisory_unlock(29279, 2);
}
session "s3"
step "s3-progress"
{
set client_min_messages=NOTICE;
SELECT
table_name,
shardid,
shard_size,
sourcename,
sourceport,
source_shard_size,
targetname,
targetport,
target_shard_size,
progress
FROM get_rebalance_progress();
}
permutation "s2-lock-1" "s2-lock-2" "s1-rebalance-c1" "s3-progress" "s2-unlock-1" "s3-progress" "s2-unlock-2" "s3-progress" "s1-commit" "s3-progress"

--- changed file ---

@@ -22,6 +22,7 @@ teardown
SELECT set_max_shared_pool_size(100);
DROP FUNCTION wake_up_connection_pool_waiters();
DROP FUNCTION set_max_shared_pool_size(int);
DROP TABLE test;
}
session "s1"

--- changed file ---

@@ -80,6 +80,9 @@ RESET citus.shard_replication_factor;
RESET citus.replication_model;
-- Create a user to test multiuser usage of rebalancer functions
-- We explicitly don't create this user on the worker nodes yet, so we can
-- test some more error handling. We create it there later.
SET citus.enable_create_role_propagation TO OFF;
CREATE USER testrole;
GRANT ALL ON SCHEMA public TO testrole;
@@ -436,11 +439,34 @@ SELECT * FROM table_placements_per_node;
-- Check that max_shard_moves limits number of move operations
-- First check that we error if not table owner
-- Turn on NOTICE messages
SET ROLE testrole;
-- Make sure that rebalance is stopped if source or target nodes are
-- unresponsive.
SELECT rebalance_table_shards('rebalance_test_table',
shard_transfer_mode:='block_writes');
\c - - - :worker_1_port
SET citus.enable_create_role_propagation TO OFF;
CREATE USER testrole;
GRANT ALL ON SCHEMA public TO testrole;
\c - - - :master_port
SET client_min_messages TO WARNING;
SET ROLE testrole;
SELECT rebalance_table_shards('rebalance_test_table',
shard_transfer_mode:='block_writes');
\c - - - :worker_2_port
SET citus.enable_create_role_propagation TO OFF;
CREATE USER testrole;
GRANT ALL ON SCHEMA public TO testrole;
\c - - - :master_port
SET client_min_messages TO WARNING;
SET citus.next_shard_id TO 123010;
SET ROLE testrole;
SELECT rebalance_table_shards('rebalance_test_table',
threshold := 0, max_shard_moves := 1,
shard_transfer_mode:='block_writes');
RESET ROLE;
-- Confirm no moves took place at all during these errors
SELECT * FROM table_placements_per_node;
SELECT rebalance_table_shards('rebalance_test_table',
threshold := 0, max_shard_moves := 1,