diff --git a/.circleci/config.yml b/.circleci/config.yml index ecf5032a9..7b2e56ffe 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -407,6 +407,12 @@ workflows: image_tag: '11.9' make: check-worker requires: [build-11] + - test-citus: + name: 'test-11_check-operations' + pg_major: 11 + image_tag: '11.9' + make: check-operations + requires: [build-11] - test-citus: name: 'test-11_check-follower-cluster' pg_major: 11 @@ -451,6 +457,12 @@ workflows: image_tag: '12.4' make: check-worker requires: [build-12] + - test-citus: + name: 'test-12_check-operations' + pg_major: 12 + image_tag: '12.4' + make: check-operations + requires: [build-12] - test-citus: name: 'test-12_check-follower-cluster' pg_major: 12 @@ -507,6 +519,12 @@ workflows: image_tag: '13.0' make: check-worker requires: [build-13] + - test-citus: + name: 'test-13_check-operations' + pg_major: 13 + image_tag: '13.0' + make: check-operations + requires: [build-13] - test-citus: name: 'test-13_check-follower-cluster' pg_major: 13 diff --git a/src/backend/distributed/commands/foreign_constraint.c b/src/backend/distributed/commands/foreign_constraint.c index 966b8d795..371cd0514 100644 --- a/src/backend/distributed/commands/foreign_constraint.c +++ b/src/backend/distributed/commands/foreign_constraint.c @@ -15,6 +15,8 @@ #include "distributed/pg_version_constants.h" #include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/xact.h" #include "catalog/namespace.h" #include "catalog/pg_constraint.h" #if (PG_VERSION_NUM >= PG_VERSION_12) @@ -23,13 +25,16 @@ #include "catalog/pg_type.h" #include "distributed/colocation_utils.h" #include "distributed/commands.h" +#include "distributed/coordinator_protocol.h" #include "distributed/listutils.h" #include "distributed/coordinator_protocol.h" #include "distributed/multi_join_order.h" #include "distributed/namespace_utils.h" #include "distributed/reference_table_utils.h" #include "distributed/version_compat.h" +#include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/relcache.h" @@ -69,6 +74,8 @@ static List * GetForeignKeyIdsForColumn(char *columnName, Oid relationId, static List * GetForeignConstraintCommandsInternal(Oid relationId, int flags); static Oid get_relation_constraint_oid_compat(HeapTuple heapTuple); static bool IsTableTypeIncluded(Oid relationId, int flags); +static void UpdateConstraintIsValid(Oid constraintId, bool isValid); + /* * ConstraintIsAForeignKeyToReferenceTable checks if the given constraint is a @@ -1015,3 +1022,199 @@ IsTableTypeIncluded(Oid relationId, int flags) } return false; } + + +/* + * GetForeignConstraintCommandsToReferenceTable takes in a shardInterval, and + * returns the list of commands that are required to create the foreign + * constraints for that shardInterval. + * + * The function does the following hack: + * - Create the foreign constraints as INVALID on the shards + * - Manually update pg_constraint to mark the same foreign + * constraints as VALID + * + * We implement the above hack because we aim to skip the validation phase + * of foreign keys to reference tables. The validation is pretty costly and + * given that the source placements already valid, the validation in the + * target nodes is useless. + * + * The function does not apply the same logic for the already invalid foreign + * constraints. 
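+ *
+ * As a rough sketch (hypothetical shard ids, schema, and constraint names, and
+ * assuming WORKER_APPLY_INTER_SHARD_DDL_COMMAND expands to the
+ * worker_apply_inter_shard_ddl_command() UDF), the commands generated for one
+ * foreign key look like:
+ *
+ *   SELECT worker_apply_inter_shard_ddl_command(102010, 'public', 102008, 'public',
+ *       'ALTER TABLE public.orders ADD CONSTRAINT orders_ref_fkey
+ *        FOREIGN KEY (ref_id) REFERENCES public.reference_table (id) NOT VALID');
+ *   UPDATE pg_constraint SET convalidated = true
+ *       WHERE conrelid = 'public.orders_102010'::regclass
+ *         AND conname = 'orders_ref_fkey_102010';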
+ */ +List * +GetForeignConstraintCommandsToReferenceTable(ShardInterval *shardInterval) +{ + ScanKeyData scanKey[1]; + int scanKeyCount = 1; + uint64 shardId = shardInterval->shardId; + Oid relationId = shardInterval->relationId; + + List *commandList = NIL; + + /* + * Set search_path to NIL so that all objects outside of pg_catalog will be + * schema-prefixed. pg_catalog will be added automatically when we call + * PushOverrideSearchPath(), since we set addCatalog to true; + */ + OverrideSearchPath *overridePath = GetOverrideSearchPath(CurrentMemoryContext); + overridePath->schemas = NIL; + overridePath->addCatalog = true; + PushOverrideSearchPath(overridePath); + + /* open system catalog and scan all constraints that belong to this table */ + Relation pgConstraint = table_open(ConstraintRelationId, AccessShareLock); + ScanKeyInit(&scanKey[0], Anum_pg_constraint_conrelid, BTEqualStrategyNumber, F_OIDEQ, + relationId); + + SysScanDesc scanDescriptor = systable_beginscan(pgConstraint, + ConstraintRelidTypidNameIndexId, + true, NULL, scanKeyCount, scanKey); + + HeapTuple heapTuple = systable_getnext(scanDescriptor); + while (HeapTupleIsValid(heapTuple)) + { + Form_pg_constraint constraintForm = (Form_pg_constraint) GETSTRUCT(heapTuple); + char *constraintDefinition = NULL; + + + if (constraintForm->contype != CONSTRAINT_FOREIGN) + { + heapTuple = systable_getnext(scanDescriptor); + continue; + } + + Oid referencedRelationId = constraintForm->confrelid; + if (PartitionMethod(referencedRelationId) != DISTRIBUTE_BY_NONE) + { + heapTuple = systable_getnext(scanDescriptor); + continue; + } + + Oid constraintId = get_relation_constraint_oid(relationId, + constraintForm->conname.data, + true); + + int64 referencedShardId = GetFirstShardId(referencedRelationId); + Oid referencedSchemaId = get_rel_namespace(referencedRelationId); + char *referencedSchemaName = get_namespace_name(referencedSchemaId); + char *escapedReferencedSchemaName = quote_literal_cstr(referencedSchemaName); + + Oid schemaId = get_rel_namespace(relationId); + char *schemaName = get_namespace_name(schemaId); + char *escapedSchemaName = quote_literal_cstr(schemaName); + + /* + * We're first marking the constraint's valid field as invalid + * and get the constraint definition. Later, we mark the constraint + * as valid back with directly updating to pg_constraint. 
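+		 *
+		 * The toggle matters because, while convalidated is false, the
+		 * definition produced by pg_get_constraintdef_command() carries a
+		 * trailing NOT VALID, e.g. (sketch):
+		 *
+		 *   ... FOREIGN KEY (ref_id) REFERENCES reference_table (id) NOT VALID
+		 *
+		 * so applying that command on the worker skips the validation scan.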
+		 */
+		if (constraintForm->convalidated == true)
+		{
+			UpdateConstraintIsValid(constraintId, false);
+			constraintDefinition = pg_get_constraintdef_command(constraintId);
+			UpdateConstraintIsValid(constraintId, true);
+		}
+		else
+		{
+			/* if the constraint is not valid, simply do nothing special */
+			constraintDefinition = pg_get_constraintdef_command(constraintId);
+		}
+
+		StringInfo applyForeignConstraintCommand = makeStringInfo();
+		appendStringInfo(applyForeignConstraintCommand,
+						 WORKER_APPLY_INTER_SHARD_DDL_COMMAND, shardId,
+						 escapedSchemaName, referencedShardId,
+						 escapedReferencedSchemaName,
+						 quote_literal_cstr(constraintDefinition));
+		commandList = lappend(commandList, applyForeignConstraintCommand->data);
+
+		/* mark the constraint as valid again on the shard */
+		if (constraintForm->convalidated == true)
+		{
+			StringInfo markConstraintValid = makeStringInfo();
+			char *qualifiedReferencingShardName =
+				ConstructQualifiedShardName(shardInterval);
+
+			char *shardConstraintName = pstrdup(constraintForm->conname.data);
+			AppendShardIdToName(&shardConstraintName, shardId);
+
+			appendStringInfo(markConstraintValid,
+							 "UPDATE pg_constraint SET convalidated = true WHERE "
+							 "conrelid = %s::regclass AND conname = '%s'",
+							 quote_literal_cstr(qualifiedReferencingShardName),
+							 shardConstraintName);
+			commandList = lappend(commandList, markConstraintValid->data);
+		}
+
+		heapTuple = systable_getnext(scanDescriptor);
+	}
+
+	/* clean up scan and close system catalog */
+	systable_endscan(scanDescriptor);
+	table_close(pgConstraint, AccessShareLock);
+
+	/* revert back to original search_path */
+	PopOverrideSearchPath();
+
+	return commandList;
+}
+
+
+/*
+ * UpdateConstraintIsValid is a utility function that sets
+ * pg_constraint.convalidated to the given isValid value for the given
+ * constraintId.
+ *
+ * This function should be used with caution because incorrect use could
+ * lead to data inconsistencies.
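+ *
+ * In SQL terms the update amounts to roughly (sketch):
+ *
+ *   UPDATE pg_constraint SET convalidated = <isValid> WHERE oid = <constraintId>;
+ *
+ * except that it goes through the catalog APIs together with a cache
+ * invalidation and a CommandCounterIncrement().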
+ */ +static void +UpdateConstraintIsValid(Oid constraintId, bool isValid) +{ + HeapTuple heapTuple = NULL; + SysScanDesc scanDescriptor; + ScanKeyData scankey[1]; + Relation pgConstraint = table_open(ConstraintRelationId, AccessShareLock); + TupleDesc tupleDescriptor = RelationGetDescr(pgConstraint); + Datum values[Natts_pg_constraint]; + bool isnull[Natts_pg_constraint]; + bool replace[Natts_pg_constraint]; + + ScanKeyInit(&scankey[0], +#if PG_VERSION_NUM >= 120000 + Anum_pg_constraint_oid, +#else + ObjectIdAttributeNumber, +#endif + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(constraintId)); + + scanDescriptor = systable_beginscan(pgConstraint, + ConstraintOidIndexId, + true, + NULL, + 1, + scankey); + heapTuple = systable_getnext(scanDescriptor); + if (!HeapTupleIsValid(heapTuple)) + { + elog(ERROR, "could not find tuple for constraint %u", constraintId); + } + + memset(replace, 0, sizeof(replace)); + + values[Anum_pg_constraint_convalidated - 1] = BoolGetDatum(isValid); + isnull[Anum_pg_constraint_convalidated - 1] = false; + replace[Anum_pg_constraint_convalidated - 1] = true; + + heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull, replace); + + CatalogTupleUpdate(pgConstraint, &heapTuple->t_self, heapTuple); + + CacheInvalidateHeapTuple(pgConstraint, heapTuple, NULL); + CommandCounterIncrement(); + + systable_endscan(scanDescriptor); + table_close(pgConstraint, NoLock); +} diff --git a/src/backend/distributed/metadata/metadata_cache.c b/src/backend/distributed/metadata/metadata_cache.c index 324a5ac38..1699202e8 100644 --- a/src/backend/distributed/metadata/metadata_cache.c +++ b/src/backend/distributed/metadata/metadata_cache.c @@ -132,6 +132,7 @@ typedef struct MetadataCacheData bool extensionLoaded; Oid distShardRelationId; Oid distPlacementRelationId; + Oid distRebalanceStrategyRelationId; Oid distNodeRelationId; Oid distNodeNodeIdIndexId; Oid distLocalGroupRelationId; @@ -2061,6 +2062,17 @@ DistLocalGroupIdRelationId(void) } +/* return oid of pg_dist_rebalance_strategy relation */ +Oid +DistRebalanceStrategyRelationId(void) +{ + CachedRelationLookup("pg_dist_rebalance_strategy", + &MetadataCache.distRebalanceStrategyRelationId); + + return MetadataCache.distRebalanceStrategyRelationId; +} + + /* return the oid of citus namespace */ Oid CitusCatalogNamespaceId(void) @@ -3939,6 +3951,37 @@ LookupShardRelationFromCatalog(int64 shardId, bool missingOk) } +/* + * ShardExists returns whether the given shard ID exists in pg_dist_shard. + */ +bool +ShardExists(int64 shardId) +{ + ScanKeyData scanKey[1]; + int scanKeyCount = 1; + Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock); + bool shardExists = false; + + ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid, + BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId)); + + SysScanDesc scanDescriptor = systable_beginscan(pgDistShard, + DistShardShardidIndexId(), true, + NULL, scanKeyCount, scanKey); + + HeapTuple heapTuple = systable_getnext(scanDescriptor); + if (HeapTupleIsValid(heapTuple)) + { + shardExists = true; + } + + systable_endscan(scanDescriptor); + table_close(pgDistShard, NoLock); + + return shardExists; +} + + /* * GetPartitionTypeInputInfo populates output parameters with the interval type * identifier and modifier for the specified partition key/method combination. 
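The new ShardExists() lookup is, in SQL terms, roughly equivalent to the following
(a sketch with a hypothetical shard id):

    SELECT EXISTS (SELECT 1 FROM pg_dist_shard WHERE shardid = 102008);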
diff --git a/src/backend/distributed/metadata/metadata_utility.c b/src/backend/distributed/metadata/metadata_utility.c index af1563129..d30b33546 100644 --- a/src/backend/distributed/metadata/metadata_utility.c +++ b/src/backend/distributed/metadata/metadata_utility.c @@ -902,6 +902,46 @@ AllShardPlacementsOnNodeGroup(int32 groupId) } +/* + * AllShardPlacementsWithShardPlacementState finds shard placements with the given + * shardState from system catalogs, converts these placements to their in-memory + * representation, and returns the converted shard placements in a new list. + */ +List * +AllShardPlacementsWithShardPlacementState(ShardState shardState) +{ + List *shardPlacementList = NIL; + ScanKeyData scanKey[1]; + int scanKeyCount = 1; + + Relation pgPlacement = table_open(DistPlacementRelationId(), AccessShareLock); + + ScanKeyInit(&scanKey[0], Anum_pg_dist_placement_shardstate, + BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(shardState)); + + SysScanDesc scanDescriptor = systable_beginscan(pgPlacement, InvalidOid, false, + NULL, scanKeyCount, scanKey); + + HeapTuple heapTuple = systable_getnext(scanDescriptor); + while (HeapTupleIsValid(heapTuple)) + { + TupleDesc tupleDescriptor = RelationGetDescr(pgPlacement); + + GroupShardPlacement *placement = + TupleToGroupShardPlacement(tupleDescriptor, heapTuple); + + shardPlacementList = lappend(shardPlacementList, placement); + + heapTuple = systable_getnext(scanDescriptor); + } + + systable_endscan(scanDescriptor); + table_close(pgPlacement, NoLock); + + return shardPlacementList; +} + + /* * TupleToGroupShardPlacement takes in a heap tuple from pg_dist_placement, * and converts this tuple to in-memory struct. The function assumes the diff --git a/src/backend/distributed/operations/repair_shards.c b/src/backend/distributed/operations/repair_shards.c index 75d12b530..56edc95ff 100644 --- a/src/backend/distributed/operations/repair_shards.c +++ b/src/backend/distributed/operations/repair_shards.c @@ -11,18 +11,21 @@ */ #include "postgres.h" -#include "c.h" #include "fmgr.h" #include "miscadmin.h" #include +#include "access/htup_details.h" #include "catalog/pg_class.h" +#include "catalog/pg_enum.h" +#include "distributed/citus_ruleutils.h" #include "distributed/colocation_utils.h" #include "distributed/commands.h" #include "distributed/connection_management.h" #include "distributed/distributed_planner.h" #include "distributed/listutils.h" +#include "distributed/shard_cleaner.h" #include "distributed/coordinator_protocol.h" #include "distributed/metadata_cache.h" #include "distributed/metadata_sync.h" @@ -35,6 +38,7 @@ #include "distributed/worker_transaction.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" +#include "storage/lmgr.h" #include "storage/lock.h" #include "storage/lmgr.h" #include "utils/builtins.h" @@ -42,6 +46,8 @@ #include "utils/errcodes.h" #include "utils/lsyscache.h" #include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/syscache.h" /* local function forward declarations */ static void ErrorIfTableCannotBeReplicated(Oid relationId); @@ -65,15 +71,27 @@ static void EnsureShardCanBeCopied(int64 shardId, const char *sourceNodeName, int32 sourceNodePort, const char *targetNodeName, int32 targetNodePort); static List * RecreateTableDDLCommandList(Oid relationId); -static List * WorkerApplyShardDDLCommandList(List *ddlCommandList, int64 shardId); static void EnsureTableListOwner(List *tableIdList); static void EnsureTableListSuitableForReplication(List *tableIdList); +static void 
DropColocatedShardPlacement(ShardInterval *shardInterval, char *nodeName, + int32 nodePort); +static void MarkForDropColocatedShardPlacement(ShardInterval *shardInterval, + char *nodeName, int32 nodePort); +static void UpdateColocatedShardPlacementMetadataOnWorkers(int64 shardId, + char *sourceNodeName, + int32 sourceNodePort, + char *targetNodeName, + int32 targetNodePort); + /* declarations for dynamic loading */ PG_FUNCTION_INFO_V1(master_copy_shard_placement); PG_FUNCTION_INFO_V1(master_move_shard_placement); +bool DeferShardDeleteOnMove = false; + + /* * master_copy_shard_placement implements a user-facing UDF to repair data from * a healthy (source) node to an inactive (target) node. To accomplish this it @@ -98,16 +116,15 @@ master_copy_shard_placement(PG_FUNCTION_ARGS) char *sourceNodeName = text_to_cstring(sourceNodeNameText); char *targetNodeName = text_to_cstring(targetNodeNameText); - CheckCitusVersion(ERROR); EnsureCoordinator(); + CheckCitusVersion(ERROR); char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid); if (shardReplicationMode == TRANSFER_MODE_FORCE_LOGICAL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("using logical replication in " - "master_copy_shard_placement() requires Citus " - "Enterprise"))); + errmsg("the force_logical transfer mode is currently " + "unsupported"))); } ShardInterval *shardInterval = LoadShardInterval(shardId); @@ -131,19 +148,155 @@ master_copy_shard_placement(PG_FUNCTION_ARGS) /* * master_move_shard_placement moves given shard (and its co-located shards) from one - * node to the other node. + * node to the other node. To accomplish this it entirely recreates the table structure + * before copying all data. + * + * After that, there are two different paths. First one is blocking shard move in the + * sense that during shard move all modifications are paused to the shard. The second + * one relies on logical replication meaning that the writes blocked only for a very + * short duration almost only when the metadata is actually being updated. This option + * is currently only available in Citus Enterprise. + * + * After successful move operation, shards in the source node gets deleted. If the move + * fails at any point, this function throws an error, leaving the cluster without doing + * any changes in source node or target node. 
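+ *
+ * A typical invocation looks like this (hypothetical shard id, hosts, and ports;
+ * 'block_writes' is assumed to be an accepted citus.shard_transfer_mode value):
+ *
+ *   SELECT master_move_shard_placement(102008, 'worker-1', 5432,
+ *                                      'worker-2', 5432, 'block_writes');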
*/ Datum master_move_shard_placement(PG_FUNCTION_ARGS) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("master_move_shard_placement() is only supported on " - "Citus Enterprise"))); + int64 shardId = PG_GETARG_INT64(0); + char *sourceNodeName = text_to_cstring(PG_GETARG_TEXT_P(1)); + int32 sourceNodePort = PG_GETARG_INT32(2); + char *targetNodeName = text_to_cstring(PG_GETARG_TEXT_P(3)); + int32 targetNodePort = PG_GETARG_INT32(4); + Oid shardReplicationModeOid = PG_GETARG_OID(5); + + + ListCell *colocatedTableCell = NULL; + ListCell *colocatedShardCell = NULL; + + + CheckCitusVersion(ERROR); + EnsureCoordinator(); + + Oid relationId = RelationIdForShard(shardId); + ErrorIfMoveCitusLocalTable(relationId); + + ShardInterval *shardInterval = LoadShardInterval(shardId); + Oid distributedTableId = shardInterval->relationId; + + List *colocatedTableList = ColocatedTableList(distributedTableId); + List *colocatedShardList = ColocatedShardIntervalList(shardInterval); + + foreach(colocatedTableCell, colocatedTableList) + { + Oid colocatedTableId = lfirst_oid(colocatedTableCell); + char relationKind = '\0'; + + /* check that user has owner rights in all co-located tables */ + EnsureTableOwner(colocatedTableId); + + /* + * Block concurrent DDL / TRUNCATE commands on the relation. Similarly, + * block concurrent master_move_shard_placement() on any shard of + * the same relation. This is OK for now since we're executing shard + * moves sequentially anyway. + */ + LockRelationOid(colocatedTableId, ShareUpdateExclusiveLock); + + relationKind = get_rel_relkind(colocatedTableId); + if (relationKind == RELKIND_FOREIGN_TABLE) + { + char *relationName = get_rel_name(colocatedTableId); + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot repair shard"), + errdetail("Table %s is a foreign table. Repairing " + "shards backed by foreign tables is " + "not supported.", relationName))); + } + } + + /* we sort colocatedShardList so that lock operations will not cause any deadlocks */ + colocatedShardList = SortList(colocatedShardList, CompareShardIntervalsById); + foreach(colocatedShardCell, colocatedShardList) + { + ShardInterval *colocatedShard = (ShardInterval *) lfirst(colocatedShardCell); + uint64 colocatedShardId = colocatedShard->shardId; + + EnsureShardCanBeCopied(colocatedShardId, sourceNodeName, sourceNodePort, + targetNodeName, targetNodePort); + } + + char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid); + if (shardReplicationMode == TRANSFER_MODE_FORCE_LOGICAL) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("the force_logical transfer mode is currently " + "unsupported"))); + } + + BlockWritesToShardList(colocatedShardList); + + /* + * CopyColocatedShardPlacement function copies given shard with its co-located + * shards. 
+ */ + CopyShardTables(colocatedShardList, sourceNodeName, sourceNodePort, targetNodeName, + targetNodePort); + + ShardInterval *colocatedShard = NULL; + foreach_ptr(colocatedShard, colocatedShardList) + { + uint64 colocatedShardId = colocatedShard->shardId; + uint32 groupId = GroupForNode(targetNodeName, targetNodePort); + uint64 placementId = GetNextPlacementId(); + + InsertShardPlacementRow(colocatedShardId, placementId, + SHARD_STATE_ACTIVE, ShardLength(colocatedShardId), + groupId); + } + + /* since this is move operation, we remove shards from source node after copy */ + if (DeferShardDeleteOnMove) + { + MarkForDropColocatedShardPlacement(shardInterval, sourceNodeName, sourceNodePort); + } + else + { + DropColocatedShardPlacement(shardInterval, sourceNodeName, sourceNodePort); + } + + UpdateColocatedShardPlacementMetadataOnWorkers(shardId, sourceNodeName, + sourceNodePort, targetNodeName, + targetNodePort); + + PG_RETURN_VOID(); } /* - * BlockWritesToShardList blocks writes to all shards in the given shard + * ErrorIfMoveCitusLocalTable is a helper function for rebalance_table_shards + * and master_move_shard_placement udf's to error out if relation with relationId + * is a citus local table. + */ +void +ErrorIfMoveCitusLocalTable(Oid relationId) +{ + if (!IsCitusTableType(relationId, CITUS_LOCAL_TABLE)) + { + return; + } + + char *qualifiedRelationName = generate_qualified_relation_name(relationId); + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("table %s is a citus local table, moving shard of " + "a citus local table is currently not supported", + qualifiedRelationName))); +} + + +/* + * BlockWritesToColocatedShardList blocks writes to all shards in the given shard * list. The function assumes that all the shards in the list are colocated. */ void @@ -185,8 +338,11 @@ BlockWritesToShardList(List *shardList) /* * ErrorIfTableCannotBeReplicated function errors out if the given table is not suitable - * for its shard being replicated. Shard replications is not allowed only for MX tables, - * since RF=1 is a must MX tables. + * for its shard being replicated. There are 2 cases in which shard replication is not + * allowed: + * + * 1) MX tables, since RF=1 is a must MX tables + * 2) Reference tables, since the shard should already exist in all workers */ static void ErrorIfTableCannotBeReplicated(Oid relationId) @@ -336,6 +492,7 @@ RepairShardPlacement(int64 shardId, const char *sourceNodeName, int32 sourceNode /* we generate necessary commands to recreate the shard in target node */ List *ddlCommandList = CopyShardCommandList(shardInterval, sourceNodeName, sourceNodePort, includeData); + List *foreignConstraintCommandList = CopyShardForeignConstraintCommandList( shardInterval); ddlCommandList = list_concat(ddlCommandList, foreignConstraintCommandList); @@ -502,15 +659,7 @@ EnsureTableListSuitableForReplication(List *tableIdList) IsCitusTableType(tableId, DISTRIBUTED_TABLE)) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create foreign key constraint"), - errdetail("This shard has foreign constraints on it. " - "Citus currently supports " - "foreign key constraints only for " - "\"citus.shard_replication_factor = 1\"."), - errhint("Please change \"citus.shard_replication_factor to " - "1\". 
To learn more about using foreign keys with " - "other replication factors, please contact us at " - "https://citusdata.com/about/contact_us."))); + errmsg("cannot replicate shards with foreign keys"))); } } } @@ -553,21 +702,12 @@ CopyShardTables(List *shardIntervalList, char *sourceNodeName, int32 sourceNodeP * * Iterate through the colocated shards and create the foreign constraints and * attach child tables to their parents in a partitioning hierarchy. - * - * Note: After implementing foreign constraints from distributed to reference - * tables, we have decided to not create foreign constraints from hash - * distributed to reference tables at this stage for nonblocking rebalancer. - * We just create the co-located ones here. We add the foreign constraints - * from hash distributed to reference tables after being completely done with - * the copy procedure inside LogicallyReplicateShards. The reason is that, - * the reference tables have placements in both source and target workers and - * the copied shard would get updated twice because of a cascading DML coming - * from both of the placements. */ foreach_ptr(shardInterval, shardIntervalList) { List *shardForeignConstraintCommandList = NIL; List *referenceTableForeignConstraintList = NIL; + char *tableOwner = TableOwner(shardInterval->relationId); CopyShardForeignConstraintCommandListGrouped(shardInterval, @@ -988,12 +1128,109 @@ RecreateTableDDLCommandList(Oid relationId) } +/* + * DropColocatedShardPlacement deletes the shard placement metadata for the given shard + * placement from the pg_dist_placement, and then it drops the shard table + * from the given node. The function does this for all colocated placements. + */ +static void +DropColocatedShardPlacement(ShardInterval *shardInterval, char *nodeName, int32 nodePort) +{ + List *colocatedShardList = ColocatedShardIntervalList(shardInterval); + ListCell *colocatedShardCell = NULL; + + foreach(colocatedShardCell, colocatedShardList) + { + ShardInterval *colocatedShard = (ShardInterval *) lfirst(colocatedShardCell); + char *qualifiedTableName = ConstructQualifiedShardName(colocatedShard); + StringInfo dropQuery = makeStringInfo(); + uint64 shardId = colocatedShard->shardId; + List *shardPlacementList = ShardPlacementList(shardId); + ShardPlacement *placement = + SearchShardPlacementInListOrError(shardPlacementList, nodeName, nodePort); + + appendStringInfo(dropQuery, DROP_REGULAR_TABLE_COMMAND, qualifiedTableName); + + DeleteShardPlacementRow(placement->placementId); + SendCommandToWorker(nodeName, nodePort, dropQuery->data); + } +} + + +/* + * MarkForDropColocatedShardPlacement marks the shard placement metadata for the given + * shard placement to be deleted in pg_dist_placement. The function does this for all + * colocated placements. 
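+ *
+ * In catalog terms this amounts to roughly (sketch, per colocated placement):
+ *
+ *   UPDATE pg_dist_placement SET shardstate = 4 WHERE placementid = <placement id>;
+ *
+ * where 4 is SHARD_STATE_TO_DELETE, later picked up by the shard cleaner.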
+ */ +static void +MarkForDropColocatedShardPlacement(ShardInterval *shardInterval, char *nodeName, int32 + nodePort) +{ + List *colocatedShardList = ColocatedShardIntervalList(shardInterval); + ListCell *colocatedShardCell = NULL; + + foreach(colocatedShardCell, colocatedShardList) + { + ShardInterval *colocatedShard = (ShardInterval *) lfirst(colocatedShardCell); + uint64 shardId = colocatedShard->shardId; + List *shardPlacementList = ShardPlacementList(shardId); + ShardPlacement *placement = + SearchShardPlacementInListOrError(shardPlacementList, nodeName, nodePort); + + UpdateShardPlacementState(placement->placementId, SHARD_STATE_TO_DELETE); + } +} + + +/* + * UpdateColocatedShardPlacementMetadataOnWorkers updates the metadata about the + * placements of the given shard and its colocated shards by changing the nodename and + * nodeport of the shards from the source nodename/port to target nodename/port. + * + * Note that the function does nothing if the given shard belongs to a non-mx table. + */ +static void +UpdateColocatedShardPlacementMetadataOnWorkers(int64 shardId, + char *sourceNodeName, int32 sourceNodePort, + char *targetNodeName, int32 targetNodePort) +{ + ShardInterval *shardInterval = LoadShardInterval(shardId); + ListCell *colocatedShardCell = NULL; + bool shouldSyncMetadata = ShouldSyncTableMetadata(shardInterval->relationId); + + if (!shouldSyncMetadata) + { + return; + } + + List *colocatedShardList = ColocatedShardIntervalList(shardInterval); + + /* iterate through the colocated shards and copy each */ + foreach(colocatedShardCell, colocatedShardList) + { + ShardInterval *colocatedShard = (ShardInterval *) lfirst(colocatedShardCell); + StringInfo updateCommand = makeStringInfo(); + + appendStringInfo(updateCommand, "UPDATE pg_dist_shard_placement " + "SET nodename=%s, nodeport=%d WHERE " + "shardid=%lu AND nodename=%s AND nodeport=%d", + quote_literal_cstr(targetNodeName), + targetNodePort, + colocatedShard->shardId, + quote_literal_cstr(sourceNodeName), + sourceNodePort); + + SendCommandToWorkersWithMetadata(updateCommand->data); + } +} + + /* * WorkerApplyShardDDLCommandList wraps all DDL commands in ddlCommandList * in a call to worker_apply_shard_ddl_command to apply the DDL command to * the shard specified by shardId. */ -static List * +List * WorkerApplyShardDDLCommandList(List *ddlCommandList, int64 shardId) { List *applyDDLCommandList = NIL; diff --git a/src/backend/distributed/operations/shard_cleaner.c b/src/backend/distributed/operations/shard_cleaner.c new file mode 100644 index 000000000..f8e8d851b --- /dev/null +++ b/src/backend/distributed/operations/shard_cleaner.c @@ -0,0 +1,144 @@ +/*------------------------------------------------------------------------- + * + * shard_cleaner.c + * This implements the background process that cleans shards that are + * left around. Shards that are left around are marked as state 4 + * (SHARD_STATE_TO_DELETE) in pg_dist_placement. + * + * Copyright (c), Citus Data, Inc. 
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+
+#include "distributed/coordinator_protocol.h"
+#include "distributed/metadata_cache.h"
+#include "distributed/shard_cleaner.h"
+#include "distributed/worker_transaction.h"
+
+
+/* declarations for dynamic loading */
+PG_FUNCTION_INFO_V1(master_defer_delete_shards);
+
+
+static int DropMarkedShards(void);
+
+
+/*
+ * master_defer_delete_shards implements a user-facing UDF to delete orphaned
+ * shards that are still hanging around in the system. These shards are
+ * orphaned by previous actions that were not able to directly delete the
+ * placements, e.g. a shard move or dropping a distributed table while one of
+ * the data nodes was not online.
+ *
+ * This function iterates through placements where shardstate is
+ * SHARD_STATE_TO_DELETE (shardstate = 4), drops the corresponding tables from
+ * the node and removes the placement information from the catalog.
+ *
+ * The function takes no arguments and runs cluster-wide.
+ */
+Datum
+master_defer_delete_shards(PG_FUNCTION_ARGS)
+{
+	CheckCitusVersion(ERROR);
+	EnsureCoordinator();
+
+	int droppedShardCount = DropMarkedShards();
+
+	PG_RETURN_INT32(droppedShardCount);
+}
+
+
+/*
+ * TryDropMarkedShards is a wrapper around DropMarkedShards that catches
+ * any errors to make it safe to use in the maintenance daemon.
+ */
+int
+TryDropMarkedShards(void)
+{
+	int droppedShardCount = 0;
+	MemoryContext savedContext = CurrentMemoryContext;
+
+	PG_TRY();
+	{
+		droppedShardCount = DropMarkedShards();
+	}
+	PG_CATCH();
+	{
+		MemoryContextSwitchTo(savedContext);
+		ErrorData *edata = CopyErrorData();
+		FlushErrorState();
+
+		/* rethrow as WARNING */
+		edata->elevel = WARNING;
+		ThrowErrorData(edata);
+	}
+	PG_END_TRY();
+
+	return droppedShardCount;
+}
+
+
+/*
+ * DropMarkedShards removes shards that were marked SHARD_STATE_TO_DELETE before.
+ *
+ * It does so by taking an exclusive lock on the shard and its colocated
+ * placements before removing them. If the lock cannot be obtained, it skips
+ * that group and continues with the others. A skipped group is removed at a
+ * later time, once no locks are held on those placements anymore.
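+ *
+ * Besides the maintenance daemon, the same cleanup can be triggered manually
+ * through the user-facing UDF, e.g.:
+ *
+ *   SELECT master_defer_delete_shards();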
+ */ +static int +DropMarkedShards(void) +{ + int removedShardCount = 0; + ListCell *shardPlacementCell = NULL; + + if (!IsCoordinator()) + { + return removedShardCount; + } + + List *shardPlacementList = AllShardPlacementsWithShardPlacementState( + SHARD_STATE_TO_DELETE); + foreach(shardPlacementCell, shardPlacementList) + { + GroupShardPlacement *placement = (GroupShardPlacement *) lfirst( + shardPlacementCell); + + if (!PrimaryNodeForGroup(placement->groupId, NULL) || + !ShardExists(placement->shardId)) + { + continue; + } + + ShardPlacement *shardPlacement = LoadShardPlacement(placement->shardId, + placement->placementId); + ShardInterval *shardInterval = LoadShardInterval(shardPlacement->shardId); + + ereport(LOG, (errmsg("dropping shard placement " INT64_FORMAT " of shard " + INT64_FORMAT " on %s:%d after it was moved away", + shardPlacement->placementId, shardPlacement->shardId, + shardPlacement->nodeName, shardPlacement->nodePort))); + + /* prepare sql query to execute to drop the shard */ + StringInfo dropQuery = makeStringInfo(); + char *qualifiedTableName = ConstructQualifiedShardName(shardInterval); + appendStringInfo(dropQuery, DROP_REGULAR_TABLE_COMMAND, qualifiedTableName); + + List *dropCommandList = list_make2("SET LOCAL lock_timeout TO '1s'", + dropQuery->data); + + /* remove the shard from the node and the placement information */ + SendCommandListToWorkerInSingleTransaction(shardPlacement->nodeName, + shardPlacement->nodePort, + NULL, dropCommandList); + + DeleteShardPlacementRow(placement->placementId); + + removedShardCount++; + } + + return removedShardCount; +} diff --git a/src/backend/distributed/operations/shard_rebalancer.c b/src/backend/distributed/operations/shard_rebalancer.c index dda7a849f..58f435218 100644 --- a/src/backend/distributed/operations/shard_rebalancer.c +++ b/src/backend/distributed/operations/shard_rebalancer.c @@ -6,32 +6,2164 @@ * * Copyright (c) Citus Data, Inc. 
* - * $Id$ - * *------------------------------------------------------------------------- */ + #include "postgres.h" +#include "libpq-fe.h" + +#include + +#include "distributed/pg_version_constants.h" #include "access/htup_details.h" +#include "access/genam.h" #include "catalog/pg_type.h" #include "catalog/pg_proc.h" +#include "commands/dbcommands.h" +#include "commands/sequence.h" +#include "distributed/argutils.h" +#include "distributed/citus_safe_lib.h" +#include "distributed/citus_ruleutils.h" +#include "distributed/colocation_utils.h" +#include "distributed/connection_management.h" #include "distributed/enterprise.h" +#include "distributed/hash_helpers.h" +#include "distributed/intermediate_result_pruning.h" +#include "distributed/listutils.h" +#include "distributed/coordinator_protocol.h" +#include "distributed/metadata_cache.h" +#include "distributed/multi_client_executor.h" +#include "distributed/multi_progress.h" +#include "distributed/multi_server_executor.h" +#include "distributed/pg_dist_rebalance_strategy.h" +#include "distributed/reference_table_utils.h" +#include "distributed/remote_commands.h" +#include "distributed/resource_lock.h" +#include "distributed/shard_rebalancer.h" +#include "distributed/tuplestore.h" +#include "distributed/worker_protocol.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/int8.h" +#include "utils/json.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" #include "utils/syscache.h" +#if PG_VERSION_NUM >= PG_VERSION_13 +#include "common/hashfn.h" +#endif + +/* RebalanceOptions are the options used to control the rebalance algorithm */ +typedef struct RebalanceOptions +{ + List *relationIdList; + float4 threshold; + int32 maxShardMoves; + ArrayType *excludedShardArray; + bool drainOnly; + Form_pg_dist_rebalance_strategy rebalanceStrategy; +} RebalanceOptions; + + +/* + * RebalanceState is used to keep the internal state of the rebalance + * algorithm in one place. 
+ */ +typedef struct RebalanceState +{ + HTAB *placementsHash; + List *placementUpdateList; + RebalancePlanFunctions *functions; + List *fillStateListDesc; + List *fillStateListAsc; + List *disallowedPlacementList; + float4 totalCost; + float4 totalCapacity; +} RebalanceState; + + +/* RebalanceContext stores the context for the function callbacks */ +typedef struct RebalanceContext +{ + FmgrInfo shardCostUDF; + FmgrInfo nodeCapacityUDF; + FmgrInfo shardAllowedOnNodeUDF; +} RebalanceContext; + + +/* static declarations for main logic */ +static int ShardActivePlacementCount(HTAB *activePlacementsHash, uint64 shardId, + List *activeWorkerNodeList); +static bool UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent, + List *responsiveNodeList, Oid shardReplicationModeOid); + +/* static declarations for main logic's utility functions */ +static HTAB * ActivePlacementsHash(List *shardPlacementList); +static bool PlacementsHashFind(HTAB *placementsHash, uint64 shardId, + WorkerNode *workerNode); +static void PlacementsHashEnter(HTAB *placementsHash, uint64 shardId, + WorkerNode *workerNode); +static void PlacementsHashRemove(HTAB *placementsHash, uint64 shardId, + WorkerNode *workerNode); +static int PlacementsHashCompare(const void *lhsKey, const void *rhsKey, Size keySize); +static uint32 PlacementsHashHashCode(const void *key, Size keySize); +static bool WorkerNodeListContains(List *workerNodeList, const char *workerName, + uint32 workerPort); +static void UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, + int sourcePort, uint64 progress); +static bool IsPlacementOnWorkerNode(ShardPlacement *placement, WorkerNode *workerNode); +static NodeFillState * FindFillStateForPlacement(RebalanceState *state, + ShardPlacement *placement); +static RebalanceState * InitRebalanceState(List *workerNodeList, List *shardPlacementList, + RebalancePlanFunctions *functions); +static void MoveShardsAwayFromDisallowedNodes(RebalanceState *state); +static bool FindAndMoveShardCost(float4 utilizationLowerBound, + float4 utilizationUpperBound, + RebalanceState *state); +static NodeFillState * FindAllowedTargetFillState(RebalanceState *state, uint64 shardId); +static void MoveShardCost(NodeFillState *sourceFillState, NodeFillState *targetFillState, + ShardCost *shardCost, RebalanceState *state); +static int CompareNodeFillStateAsc(const void *void1, const void *void2); +static int CompareNodeFillStateDesc(const void *void1, const void *void2); +static int CompareShardCostAsc(const void *void1, const void *void2); +static int CompareShardCostDesc(const void *void1, const void *void2); +static int CompareDisallowedPlacementAsc(const void *void1, const void *void2); +static int CompareDisallowedPlacementDesc(const void *void1, const void *void2); +static bool ShardAllowedOnNode(uint64 shardId, WorkerNode *workerNode, void *context); +static float4 NodeCapacity(WorkerNode *workerNode, void *context); +static ShardCost GetShardCost(uint64 shardId, void *context); +static List * NonColocatedDistRelationIdList(void); +static void RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid); +static void AcquireColocationLock(Oid relationId, const char *operationName); +static void ExecutePlacementUpdates(List *placementUpdateList, Oid + shardReplicationModeOid, char *noticeOperation); +static float4 CalculateUtilization(float4 totalCost, float4 capacity); +static Form_pg_dist_rebalance_strategy GetRebalanceStrategy(Name name); static void EnsureShardCostUDF(Oid functionOid); 
static void EnsureNodeCapacityUDF(Oid functionOid); static void EnsureShardAllowedOnNodeUDF(Oid functionOid); -NOT_SUPPORTED_IN_COMMUNITY(rebalance_table_shards); -NOT_SUPPORTED_IN_COMMUNITY(replicate_table_shards); -NOT_SUPPORTED_IN_COMMUNITY(get_rebalance_table_shards_plan); -NOT_SUPPORTED_IN_COMMUNITY(get_rebalance_progress); -NOT_SUPPORTED_IN_COMMUNITY(master_drain_node); -NOT_SUPPORTED_IN_COMMUNITY(citus_shard_cost_by_disk_size); -PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check); + +/* declarations for dynamic loading */ +PG_FUNCTION_INFO_V1(rebalance_table_shards); +PG_FUNCTION_INFO_V1(replicate_table_shards); +PG_FUNCTION_INFO_V1(get_rebalance_table_shards_plan); +PG_FUNCTION_INFO_V1(get_rebalance_progress); +PG_FUNCTION_INFO_V1(master_drain_node); +PG_FUNCTION_INFO_V1(citus_shard_cost_by_disk_size); PG_FUNCTION_INFO_V1(citus_validate_rebalance_strategy_functions); +PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check); + + +#ifdef USE_ASSERT_CHECKING + +/* + * Check that all the invariants of the state hold. + */ +static void +CheckRebalanceStateInvariants(const RebalanceState *state) +{ + NodeFillState *fillState = NULL; + NodeFillState *prevFillState = NULL; + int fillStateIndex = 0; + int fillStateLength = list_length(state->fillStateListAsc); + + Assert(state != NULL); + Assert(list_length(state->fillStateListAsc) == list_length(state->fillStateListDesc)); + foreach_ptr(fillState, state->fillStateListAsc) + { + float4 totalCost = 0; + ShardCost *shardCost = NULL; + ShardCost *prevShardCost = NULL; + if (prevFillState != NULL) + { + /* Check that the previous fill state is more empty than this one */ + bool higherUtilization = fillState->utilization > prevFillState->utilization; + bool sameUtilization = fillState->utilization == prevFillState->utilization; + bool lowerOrSameCapacity = fillState->capacity <= prevFillState->capacity; + Assert(higherUtilization || (sameUtilization && lowerOrSameCapacity)); + } + + /* Check that fillStateListDesc is the reversed version of fillStateListAsc */ + Assert(list_nth(state->fillStateListDesc, fillStateLength - fillStateIndex - 1) == + fillState); + + + foreach_ptr(shardCost, fillState->shardCostListDesc) + { + if (prevShardCost != NULL) + { + /* Check that shard costs are sorted in descending order */ + Assert(shardCost->cost <= prevShardCost->cost); + } + totalCost += shardCost->cost; + } + + /* Check that utilization field is up to date. */ + Assert(fillState->utilization == CalculateUtilization(fillState->totalCost, + fillState->capacity)); + + /* + * Check that fillState->totalCost is within 0.1% difference of + * sum(fillState->shardCostListDesc->cost) + * We cannot compare exactly, because these numbers are floats and + * fillState->totalCost is modified by doing + and - on it. So instead + * we check that the numbers are roughly the same. + */ + float4 absoluteDifferenceBetweenTotalCosts = + fabsf(fillState->totalCost - totalCost); + float4 maximumAbsoluteValueOfTotalCosts = + fmaxf(fabsf(fillState->totalCost), fabsf(totalCost)); + Assert(absoluteDifferenceBetweenTotalCosts <= maximumAbsoluteValueOfTotalCosts / + 1000); + + prevFillState = fillState; + fillStateIndex++; + } +} + + +#else +#define CheckRebalanceStateInvariants(l) ((void) 0) +#endif /* USE_ASSERT_CHECKING */ + +/* + * BigIntArrayDatumContains checks if the array contains the given number. 
+ */ +static bool +BigIntArrayDatumContains(Datum *array, int arrayLength, uint64 toFind) +{ + for (int i = 0; i < arrayLength; i++) + { + if (DatumGetInt64(array[i]) == toFind) + { + return true; + } + } + return false; +} + + +/* + * FullShardPlacementList returns a List containing all the shard placements of + * a specific table (excluding the excludedShardArray) + */ +static List * +FullShardPlacementList(Oid relationId, ArrayType *excludedShardArray) +{ + List *shardPlacementList = NIL; + CitusTableCacheEntry *citusTableCacheEntry = GetCitusTableCacheEntry(relationId); + int shardIntervalArrayLength = citusTableCacheEntry->shardIntervalArrayLength; + int excludedShardIdCount = ArrayObjectCount(excludedShardArray); + Datum *excludedShardArrayDatum = DeconstructArrayObject(excludedShardArray); + + for (int shardIndex = 0; shardIndex < shardIntervalArrayLength; shardIndex++) + { + ShardInterval *shardInterval = + citusTableCacheEntry->sortedShardIntervalArray[shardIndex]; + GroupShardPlacement *placementArray = + citusTableCacheEntry->arrayOfPlacementArrays[shardIndex]; + int numberOfPlacements = + citusTableCacheEntry->arrayOfPlacementArrayLengths[shardIndex]; + + if (BigIntArrayDatumContains(excludedShardArrayDatum, excludedShardIdCount, + shardInterval->shardId)) + { + continue; + } + + for (int placementIndex = 0; placementIndex < numberOfPlacements; + placementIndex++) + { + GroupShardPlacement *groupPlacement = &placementArray[placementIndex]; + WorkerNode *worker = LookupNodeForGroup(groupPlacement->groupId); + ShardPlacement *placement = CitusMakeNode(ShardPlacement); + placement->shardId = groupPlacement->shardId; + placement->shardLength = groupPlacement->shardLength; + placement->shardState = groupPlacement->shardState; + placement->nodeName = pstrdup(worker->workerName); + placement->nodePort = worker->workerPort; + placement->placementId = groupPlacement->placementId; + + shardPlacementList = lappend(shardPlacementList, placement); + } + } + return SortList(shardPlacementList, CompareShardPlacements); +} + + +/* + * SortedActiveWorkers returns all the active workers like + * ActiveReadableNodeList, but sorted. + */ +static List * +SortedActiveWorkers() +{ + List *activeWorkerList = ActiveReadableNodeList(); + return SortList(activeWorkerList, CompareWorkerNodes); +} + + +/* + * GetRebalanceSteps returns a List of PlacementUpdateEvents that are needed to + * rebalance a list of tables. 
+ */ +static List * +GetRebalanceSteps(RebalanceOptions *options) +{ + EnsureShardCostUDF(options->rebalanceStrategy->shardCostFunction); + EnsureNodeCapacityUDF(options->rebalanceStrategy->nodeCapacityFunction); + EnsureShardAllowedOnNodeUDF(options->rebalanceStrategy->shardAllowedOnNodeFunction); + + RebalanceContext context; + memset(&context, 0, sizeof(RebalanceContext)); + fmgr_info(options->rebalanceStrategy->shardCostFunction, &context.shardCostUDF); + fmgr_info(options->rebalanceStrategy->nodeCapacityFunction, &context.nodeCapacityUDF); + fmgr_info(options->rebalanceStrategy->shardAllowedOnNodeFunction, + &context.shardAllowedOnNodeUDF); + + RebalancePlanFunctions rebalancePlanFunctions = { + .shardAllowedOnNode = ShardAllowedOnNode, + .nodeCapacity = NodeCapacity, + .shardCost = GetShardCost, + .context = &context, + }; + + /* sort the lists to make the function more deterministic */ + List *activeWorkerList = SortedActiveWorkers(); + List *shardPlacementListList = NIL; + + Oid relationId = InvalidOid; + foreach_oid(relationId, options->relationIdList) + { + List *shardPlacementList = FullShardPlacementList(relationId, + options->excludedShardArray); + shardPlacementListList = lappend(shardPlacementListList, shardPlacementList); + } + + if (options->threshold < options->rebalanceStrategy->minimumThreshold) + { + ereport(WARNING, (errmsg( + "the given threshold is lower than the minimum " + "threshold allowed by the rebalance strategy, " + "using the minimum allowed threshold instead" + ), + errdetail("Using threshold of %.2f", + options->rebalanceStrategy->minimumThreshold + ) + )); + options->threshold = options->rebalanceStrategy->minimumThreshold; + } + + return RebalancePlacementUpdates(activeWorkerList, + shardPlacementListList, + options->threshold, + options->maxShardMoves, + options->drainOnly, + &rebalancePlanFunctions); +} + + +/* + * ShardAllowedOnNode determines if shard is allowed on a specific worker node. + */ +static bool +ShardAllowedOnNode(uint64 shardId, WorkerNode *workerNode, void *voidContext) +{ + if (!workerNode->shouldHaveShards) + { + return false; + } + + RebalanceContext *context = voidContext; + Datum allowed = FunctionCall2(&context->shardAllowedOnNodeUDF, shardId, + workerNode->nodeId); + return DatumGetBool(allowed); +} + + +/* + * NodeCapacity returns the relative capacity of a node. A node with capacity 2 + * can contain twice as many shards as a node with capacity 1. The actual + * capacity can be a number grounded in reality, like the disk size, number of + * cores, but it doesn't have to be. + */ +static float4 +NodeCapacity(WorkerNode *workerNode, void *voidContext) +{ + if (!workerNode->shouldHaveShards) + { + return 0; + } + + RebalanceContext *context = voidContext; + Datum capacity = FunctionCall1(&context->nodeCapacityUDF, workerNode->nodeId); + return DatumGetFloat4(capacity); +} + + +/* + * GetShardCost returns the cost of the given shard. A shard with cost 2 will + * be weighted as heavily as two shards with cost 1. This cost number can be a + * number grounded in reality, like the shard size on disk, but it doesn't have + * to be. 
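+ *
+ * The cost is obtained by calling the rebalance strategy's shard cost UDF,
+ * which has the same SQL shape as the built-in citus_shard_cost_by_disk_size
+ * below. A hypothetical custom cost function could look like:
+ *
+ *   CREATE FUNCTION my_constant_shard_cost(shardid bigint)
+ *       RETURNS float4 AS $$ SELECT 1.0::float4 $$ LANGUAGE sql;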
+ */ +static ShardCost +GetShardCost(uint64 shardId, void *voidContext) +{ + ShardCost shardCost; + memset_struct_0(shardCost); + shardCost.shardId = shardId; + RebalanceContext *context = voidContext; + Datum shardCostDatum = FunctionCall1(&context->shardCostUDF, UInt64GetDatum(shardId)); + shardCost.cost = DatumGetFloat4(shardCostDatum); + return shardCost; +} + + +/* + * citus_shard_cost_by_disk_size gets the cost for a shard based on the disk + * size of the shard on a worker. The worker to check the disk size is + * determined by choosing the first active placement for the shard. The disk + * size is calculated using pg_total_relation_size, so it includes indexes. + * + * SQL signature: + * citus_shard_cost_by_disk_size(shardid bigint) returns float4 + */ +Datum +citus_shard_cost_by_disk_size(PG_FUNCTION_ARGS) +{ + uint64 shardId = PG_GETARG_INT64(0); + bool missingOk = false; + ShardPlacement *shardPlacement = ActiveShardPlacement(shardId, missingOk); + char *workerNodeName = shardPlacement->nodeName; + uint32 workerNodePort = shardPlacement->nodePort; + uint32 connectionFlag = 0; + PGresult *result = NULL; + bool raiseErrors = true; + char *sizeQuery = PG_TOTAL_RELATION_SIZE_FUNCTION; + ShardInterval *shardInterval = LoadShardInterval(shardId); + List *colocatedShardList = ColocatedShardIntervalList(shardInterval); + StringInfo tableSizeQuery = GenerateSizeQueryOnMultiplePlacements(colocatedShardList, + sizeQuery); + + MultiConnection *connection = GetNodeConnection(connectionFlag, workerNodeName, + workerNodePort); + int queryResult = ExecuteOptionalRemoteCommand(connection, tableSizeQuery->data, + &result); + + if (queryResult != RESPONSE_OKAY) + { + ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("cannot get the size because of a connection error"))); + } + + List *sizeList = ReadFirstColumnAsText(result); + if (list_length(sizeList) != 1) + { + ereport(ERROR, (errmsg( + "received wrong number of rows from worker, expected 1 received %d", + list_length(sizeList)))); + } + + StringInfo tableSizeStringInfo = (StringInfo) linitial(sizeList); + char *tableSizeString = tableSizeStringInfo->data; + uint64 tableSize = SafeStringToUint64(tableSizeString); + + PQclear(result); + ClearResults(connection, raiseErrors); + if (tableSize <= 0) + { + PG_RETURN_FLOAT4(1); + } + + PG_RETURN_FLOAT4(tableSize); +} + + +/* + * GetColocatedRebalanceSteps takes a List of PlacementUpdateEvents and creates + * a new List of containing those and all the updates for colocated shards. 
+ */ +static List * +GetColocatedRebalanceSteps(List *placementUpdateList) +{ + ListCell *placementUpdateCell = NULL; + List *colocatedUpdateList = NIL; + + foreach(placementUpdateCell, placementUpdateList) + { + PlacementUpdateEvent *placementUpdate = lfirst(placementUpdateCell); + ShardInterval *shardInterval = LoadShardInterval(placementUpdate->shardId); + List *colocatedShardList = ColocatedShardIntervalList(shardInterval); + ListCell *colocatedShardCell = NULL; + + foreach(colocatedShardCell, colocatedShardList) + { + ShardInterval *colocatedShard = lfirst(colocatedShardCell); + PlacementUpdateEvent *colocatedUpdate = palloc0(sizeof(PlacementUpdateEvent)); + + colocatedUpdate->shardId = colocatedShard->shardId; + colocatedUpdate->sourceNode = placementUpdate->sourceNode; + colocatedUpdate->targetNode = placementUpdate->targetNode; + colocatedUpdate->updateType = placementUpdate->updateType; + + colocatedUpdateList = lappend(colocatedUpdateList, colocatedUpdate); + } + } + + return colocatedUpdateList; +} + + +/* + * AcquireColocationLock tries to acquire a lock for rebalance/replication. If + * this is it not possible it fails instantly because this means another + * rebalance/repliction is currently happening. This would really mess up + * planning. + */ +static void +AcquireColocationLock(Oid relationId, const char *operationName) +{ + uint32 lockId = relationId; + LOCKTAG tag; + + CitusTableCacheEntry *citusTableCacheEntry = GetCitusTableCacheEntry(relationId); + if (citusTableCacheEntry->colocationId != INVALID_COLOCATION_ID) + { + lockId = citusTableCacheEntry->colocationId; + } + + SET_LOCKTAG_REBALANCE_COLOCATION(tag, (int64) lockId); + + LockAcquireResult lockAcquired = LockAcquire(&tag, ExclusiveLock, false, true); + if (!lockAcquired) + { + ereport(ERROR, (errmsg("could not acquire the lock required to %s %s", + operationName, generate_qualified_relation_name( + relationId)))); + } +} + + +/* + * GetResponsiveWorkerList returns a List of workers that respond to new + * connection requests. + */ +static List * +GetResponsiveWorkerList() +{ + List *activeWorkerList = ActiveReadableNodeList(); + ListCell *activeWorkerCell = NULL; + List *responsiveWorkerList = NIL; + + foreach(activeWorkerCell, activeWorkerList) + { + WorkerNode *worker = lfirst(activeWorkerCell); + int connectionFlag = FORCE_NEW_CONNECTION; + + MultiConnection *connection = GetNodeConnection(connectionFlag, + worker->workerName, + worker->workerPort); + + if (connection != NULL && connection->pgConn != NULL) + { + if (PQstatus(connection->pgConn) == CONNECTION_OK) + { + responsiveWorkerList = lappend(responsiveWorkerList, worker); + } + + CloseConnection(connection); + } + } + return responsiveWorkerList; +} + + +/* + * ExecutePlacementUpdates copies or moves a shard placement by calling the + * corresponding functions in Citus in a separate subtransaction for each + * update. 
+ */ +static void +ExecutePlacementUpdates(List *placementUpdateList, Oid shardReplicationModeOid, + char *noticeOperation) +{ + List *responsiveWorkerList = GetResponsiveWorkerList(); + ListCell *placementUpdateCell = NULL; + + char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid); + if (shardReplicationMode == TRANSFER_MODE_FORCE_LOGICAL) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("the force_logical transfer mode is currently " + "unsupported"))); + } + + foreach(placementUpdateCell, placementUpdateList) + { + PlacementUpdateEvent *placementUpdate = lfirst(placementUpdateCell); + ereport(NOTICE, (errmsg( + "%s shard %lu from %s:%u to %s:%u ...", + noticeOperation, + placementUpdate->shardId, + placementUpdate->sourceNode->workerName, + placementUpdate->sourceNode->workerPort, + placementUpdate->targetNode->workerName, + placementUpdate->targetNode->workerPort + ))); + UpdateShardPlacement(placementUpdate, responsiveWorkerList, + shardReplicationModeOid); + } +} + + +/* + * SetupRebalanceMonitor initializes the dynamic shared memory required for storing the + * progress information of a rebalance process. The function takes a List of + * PlacementUpdateEvents for all shards that will be moved (including colocated + * ones) and the relation id of the target table. The dynamic shared memory + * portion consists of a RebalanceMonitorHeader and multiple + * PlacementUpdateEventProgress, one for each planned shard placement move. The + * dsm_handle of the created segment is savedin the progress of the current backend so + * that it can be read by external agents such as get_rebalance_progress function by + * calling pg_stat_get_progress_info UDF. Since currently only VACUUM commands are + * officially allowed as the command type, we describe ourselves as a VACUUM command and + * in order to distinguish a rebalancer progress from regular VACUUM progresses, we put + * a magic number to the first progress field as an indicator. Finally we return the + * dsm handle so that it can be used for updating the progress and cleaning things up. + */ +static void +SetupRebalanceMonitor(List *placementUpdateList, Oid relationId) +{ + List *colocatedUpdateList = GetColocatedRebalanceSteps(placementUpdateList); + ListCell *colocatedUpdateCell = NULL; + + ProgressMonitorData *monitor = CreateProgressMonitor(REBALANCE_ACTIVITY_MAGIC_NUMBER, + list_length(colocatedUpdateList), + sizeof( + PlacementUpdateEventProgress), + relationId); + PlacementUpdateEventProgress *rebalanceSteps = monitor->steps; + + int32 eventIndex = 0; + foreach(colocatedUpdateCell, colocatedUpdateList) + { + PlacementUpdateEvent *colocatedUpdate = lfirst(colocatedUpdateCell); + PlacementUpdateEventProgress *event = rebalanceSteps + eventIndex; + + strlcpy(event->sourceName, colocatedUpdate->sourceNode->workerName, 255); + strlcpy(event->targetName, colocatedUpdate->targetNode->workerName, 255); + + event->shardId = colocatedUpdate->shardId; + event->sourcePort = colocatedUpdate->sourceNode->workerPort; + event->targetPort = colocatedUpdate->targetNode->workerPort; + event->shardSize = ShardLength(colocatedUpdate->shardId); + + eventIndex++; + } +} + + +/* + * rebalance_table_shards rebalances the shards across the workers. 
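+ *
+ * A typical call (hypothetical table name; the remaining arguments are assumed
+ * to fall back to their SQL-level defaults) is:
+ *
+ *   SELECT rebalance_table_shards('events', shard_transfer_mode := 'block_writes');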
+ * + * SQL signature: + * + * rebalance_table_shards( + * relation regclass, + * threshold float4, + * max_shard_moves int, + * excluded_shard_list bigint[], + * shard_transfer_mode citus.shard_transfer_mode, + * drain_only boolean, + * rebalance_strategy name + * ) RETURNS VOID + */ +Datum +rebalance_table_shards(PG_FUNCTION_ARGS) +{ + List *relationIdList = NIL; + if (!PG_ARGISNULL(0)) + { + Oid relationId = PG_GETARG_OID(0); + ErrorIfMoveCitusLocalTable(relationId); + + relationIdList = list_make1_oid(relationId); + } + else + { + /* + * Note that we don't need to do any checks to error out for + * citus local tables here as NonColocatedDistRelationIdList + * already doesn't return non-distributed tables. + */ + relationIdList = NonColocatedDistRelationIdList(); + } + + PG_ENSURE_ARGNOTNULL(2, "max_shard_moves"); + PG_ENSURE_ARGNOTNULL(3, "excluded_shard_list"); + PG_ENSURE_ARGNOTNULL(4, "shard_transfer_mode"); + PG_ENSURE_ARGNOTNULL(5, "drain_only"); + + Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy( + PG_GETARG_NAME_OR_NULL(6)); + RebalanceOptions options = { + .relationIdList = relationIdList, + .threshold = PG_GETARG_FLOAT4_OR_DEFAULT(1, strategy->defaultThreshold), + .maxShardMoves = PG_GETARG_INT32(2), + .excludedShardArray = PG_GETARG_ARRAYTYPE_P(3), + .drainOnly = PG_GETARG_BOOL(5), + .rebalanceStrategy = strategy, + }; + Oid shardTransferModeOid = PG_GETARG_OID(4); + RebalanceTableShards(&options, shardTransferModeOid); + PG_RETURN_VOID(); +} + + +/* + * GetRebalanceStrategy returns the rebalance strategy from + * pg_dist_rebalance_strategy matching the given name. If name is NULL it + * returns the default rebalance strategy from pg_dist_rebalance_strategy. + */ +static Form_pg_dist_rebalance_strategy +GetRebalanceStrategy(Name name) +{ + Relation pgDistRebalanceStrategy = table_open(DistRebalanceStrategyRelationId(), + AccessShareLock); + + const int scanKeyCount = 1; + ScanKeyData scanKey[1]; + if (name == NULL) + { + /* WHERE default_strategy=true */ + ScanKeyInit(&scanKey[0], Anum_pg_dist_rebalance_strategy_default_strategy, + BTEqualStrategyNumber, F_BOOLEQ, BoolGetDatum(true)); + } + else + { + /* WHERE name=$name */ + ScanKeyInit(&scanKey[0], Anum_pg_dist_rebalance_strategy_name, + BTEqualStrategyNumber, F_NAMEEQ, NameGetDatum(name)); + } + SysScanDesc scanDescriptor = systable_beginscan(pgDistRebalanceStrategy, + InvalidOid, false, + NULL, scanKeyCount, scanKey); + + HeapTuple heapTuple = systable_getnext(scanDescriptor); + if (!HeapTupleIsValid(heapTuple)) + { + if (name == NULL) + { + ereport(ERROR, (errmsg( + "no rebalance_strategy was provided, but there is also no default strategy set"))); + } + ereport(ERROR, (errmsg("could not find rebalance strategy with name %s", + (char *) name))); + } + + Form_pg_dist_rebalance_strategy strategy = + (Form_pg_dist_rebalance_strategy) GETSTRUCT(heapTuple); + Form_pg_dist_rebalance_strategy strategy_copy = + palloc0(sizeof(FormData_pg_dist_rebalance_strategy)); + + /* Copy data over by dereferencing */ + *strategy_copy = *strategy; + + + systable_endscan(scanDescriptor); + table_close(pgDistRebalanceStrategy, NoLock); + + return strategy_copy; +} + + +/* + * master_drain_node drains a node by setting shouldhaveshards to false and + * running the rebalancer after in drain_only mode. 
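+ *
+ * For example (hypothetical host and port; passing NULL picks the default
+ * rebalance strategy, as handled by GetRebalanceStrategy):
+ *
+ *   SELECT master_drain_node('worker-1', 5432, 'block_writes', NULL);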
+ */ +Datum +master_drain_node(PG_FUNCTION_ARGS) +{ + PG_ENSURE_ARGNOTNULL(0, "nodename"); + PG_ENSURE_ARGNOTNULL(1, "nodeport"); + PG_ENSURE_ARGNOTNULL(2, "shard_transfer_mode"); + + text *nodeNameText = PG_GETARG_TEXT_P(0); + int32 nodePort = PG_GETARG_INT32(1); + Oid shardTransferModeOid = PG_GETARG_OID(2); + Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy( + PG_GETARG_NAME_OR_NULL(3)); + RebalanceOptions options = { + .relationIdList = NonColocatedDistRelationIdList(), + .threshold = strategy->defaultThreshold, + .maxShardMoves = 0, + .excludedShardArray = construct_empty_array(INT4OID), + .drainOnly = true, + .rebalanceStrategy = strategy, + }; + + char *nodeName = text_to_cstring(nodeNameText); + int connectionFlag = FORCE_NEW_CONNECTION; + MultiConnection *connection = GetNodeConnection(connectionFlag, LOCAL_HOST_NAME, + PostPortNumber); + + /* + * This is done in a separate session. This way it's not undone if the + * draining fails midway through. + */ + ExecuteCriticalRemoteCommand(connection, psprintf( + "SELECT master_set_node_property(%s, %i, 'shouldhaveshards', false)", + quote_literal_cstr(nodeName), nodePort)); + + RebalanceTableShards(&options, shardTransferModeOid); + + PG_RETURN_VOID(); +} + + +/* + * replicate_table_shards replicates under-replicated shards of the specified + * table. + */ +Datum +replicate_table_shards(PG_FUNCTION_ARGS) +{ + Oid relationId = PG_GETARG_OID(0); + uint32 shardReplicationFactor = PG_GETARG_INT32(1); + int32 maxShardCopies = PG_GETARG_INT32(2); + ArrayType *excludedShardArray = PG_GETARG_ARRAYTYPE_P(3); + Oid shardReplicationModeOid = PG_GETARG_OID(4); + + char transferMode = LookupShardTransferMode(shardReplicationModeOid); + EnsureReferenceTablesExistOnAllNodesExtended(transferMode); + + AcquireColocationLock(relationId, "replicate"); + + List *activeWorkerList = SortedActiveWorkers(); + List *shardPlacementList = FullShardPlacementList(relationId, excludedShardArray); + + List *placementUpdateList = ReplicationPlacementUpdates(activeWorkerList, + shardPlacementList, + shardReplicationFactor); + placementUpdateList = list_truncate(placementUpdateList, maxShardCopies); + + ExecutePlacementUpdates(placementUpdateList, shardReplicationModeOid, "Copying"); + + PG_RETURN_VOID(); +} + + +/* + * get_rebalance_table_shards_plan function calculates the shard move steps + * required for the rebalance operations including the ones for colocated + * tables. + * + * SQL signature: + * + * get_rebalance_table_shards_plan( + * relation regclass, + * threshold float4, + * max_shard_moves int, + * excluded_shard_list bigint[], + * drain_only boolean, + * rebalance_strategy name + * ) + */ +Datum +get_rebalance_table_shards_plan(PG_FUNCTION_ARGS) +{ + List *relationIdList = NIL; + if (!PG_ARGISNULL(0)) + { + Oid relationId = PG_GETARG_OID(0); + ErrorIfMoveCitusLocalTable(relationId); + + relationIdList = list_make1_oid(relationId); + } + else + { + /* + * Note that we don't need to do any checks to error out for + * citus local tables here as NonColocatedDistRelationIdList + * already doesn't return non-distributed tables. 
+ */ + relationIdList = NonColocatedDistRelationIdList(); + } + + PG_ENSURE_ARGNOTNULL(2, "max_shard_moves"); + PG_ENSURE_ARGNOTNULL(3, "excluded_shard_list"); + PG_ENSURE_ARGNOTNULL(4, "drain_only"); + + Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy( + PG_GETARG_NAME_OR_NULL(5)); + RebalanceOptions options = { + .relationIdList = relationIdList, + .threshold = PG_GETARG_FLOAT4_OR_DEFAULT(1, strategy->defaultThreshold), + .maxShardMoves = PG_GETARG_INT32(2), + .excludedShardArray = PG_GETARG_ARRAYTYPE_P(3), + .drainOnly = PG_GETARG_BOOL(4), + .rebalanceStrategy = strategy, + }; + + + List *placementUpdateList = GetRebalanceSteps(&options); + List *colocatedUpdateList = GetColocatedRebalanceSteps(placementUpdateList); + ListCell *colocatedUpdateCell = NULL; + + TupleDesc tupdesc; + Tuplestorestate *tupstore = SetupTuplestore(fcinfo, &tupdesc); + + foreach(colocatedUpdateCell, colocatedUpdateList) + { + PlacementUpdateEvent *colocatedUpdate = lfirst(colocatedUpdateCell); + Datum values[7]; + bool nulls[7]; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[0] = ObjectIdGetDatum(RelationIdForShard(colocatedUpdate->shardId)); + values[1] = UInt64GetDatum(colocatedUpdate->shardId); + values[2] = UInt64GetDatum(ShardLength(colocatedUpdate->shardId)); + values[3] = PointerGetDatum(cstring_to_text( + colocatedUpdate->sourceNode->workerName)); + values[4] = UInt32GetDatum(colocatedUpdate->sourceNode->workerPort); + values[5] = PointerGetDatum(cstring_to_text( + colocatedUpdate->targetNode->workerName)); + values[6] = UInt32GetDatum(colocatedUpdate->targetNode->workerPort); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + + +/* + * get_rebalance_progress collects information about the ongoing rebalance operations and + * returns the concatenated list of steps involved in the operations, along with their + * progress information. Currently the progress field can take 4 integer values + * (-1: error, 0: waiting, 1: moving, 2: moved). The progress field is of type bigint + * because we may implement a more granular, byte-level progress as a future improvement. 
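+ *
+ * Monitoring is typically a matter of polling this UDF from a separate
+ * session while a rebalance is running, e.g.:
+ *
+ *   SELECT * FROM get_rebalance_progress();
+ *
+ * and interpreting the progress column with the mapping above. The output
+ * column names come from the SQL-level function declaration.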
+ */ +Datum +get_rebalance_progress(PG_FUNCTION_ARGS) +{ + List *segmentList = NIL; + ListCell *rebalanceMonitorCell = NULL; + TupleDesc tupdesc; + Tuplestorestate *tupstore = SetupTuplestore(fcinfo, &tupdesc); + + /* get the addresses of all current rebalance monitors */ + List *rebalanceMonitorList = ProgressMonitorList(REBALANCE_ACTIVITY_MAGIC_NUMBER, + &segmentList); + + foreach(rebalanceMonitorCell, rebalanceMonitorList) + { + ProgressMonitorData *monitor = lfirst(rebalanceMonitorCell); + PlacementUpdateEventProgress *placementUpdateEvents = monitor->steps; + + for (int eventIndex = 0; eventIndex < monitor->stepCount; eventIndex++) + { + PlacementUpdateEventProgress *step = placementUpdateEvents + eventIndex; + uint64 shardId = step->shardId; + ShardInterval *shardInterval = LoadShardInterval(shardId); + + Datum values[9]; + bool nulls[9]; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[0] = monitor->processId; + values[1] = ObjectIdGetDatum(shardInterval->relationId); + values[2] = UInt64GetDatum(shardId); + values[3] = UInt64GetDatum(step->shardSize); + values[4] = PointerGetDatum(cstring_to_text(step->sourceName)); + values[5] = UInt32GetDatum(step->sourcePort); + values[6] = PointerGetDatum(cstring_to_text(step->targetName)); + values[7] = UInt32GetDatum(step->targetPort); + values[8] = UInt64GetDatum(step->progress); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + } + + tuplestore_donestoring(tupstore); + + DetachFromDSMSegments(segmentList); + + return (Datum) 0; +} + + +/* + * NonColocatedDistRelationIdList returns a list of distributed table oids, one + * for each existing colocation group. + */ +static List * +NonColocatedDistRelationIdList(void) +{ + List *relationIdList = NIL; + List *allCitusTablesList = CitusTableTypeIdList(ANY_CITUS_TABLE_TYPE); + Oid tableId = InvalidOid; + + /* allocate sufficient capacity for O(1) expected look-up time */ + int capacity = (int) (list_length(allCitusTablesList) / 0.75) + 1; + int flags = HASH_ELEM | HASH_CONTEXT | HASH_BLOBS; + HASHCTL info = { + .keysize = sizeof(Oid), + .entrysize = sizeof(Oid), + .hcxt = CurrentMemoryContext + }; + + HTAB *alreadySelectedColocationIds = hash_create("RebalanceColocationIdSet", + capacity, &info, flags); + foreach_oid(tableId, allCitusTablesList) + { + bool foundInSet = false; + CitusTableCacheEntry *citusTableCacheEntry = GetCitusTableCacheEntry( + tableId); + + if (!IsCitusTableTypeCacheEntry(citusTableCacheEntry, DISTRIBUTED_TABLE)) + { + /* + * We're only interested in distributed tables, should ignore + * reference tables and citus local tables. + */ + continue; + } + + if (citusTableCacheEntry->colocationId != INVALID_COLOCATION_ID) + { + hash_search(alreadySelectedColocationIds, + &citusTableCacheEntry->colocationId, HASH_ENTER, + &foundInSet); + if (foundInSet) + { + continue; + } + } + relationIdList = lappend_oid(relationIdList, tableId); + } + return relationIdList; +} + + +/* + * RebalanceTableShards rebalances the shards for the relations inside the + * relationIdList across the different workers. 
+ */ +static void +RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid) +{ + char transferMode = LookupShardTransferMode(shardReplicationModeOid); + EnsureReferenceTablesExistOnAllNodesExtended(transferMode); + + if (list_length(options->relationIdList) == 0) + { + return; + } + + Oid relationId = InvalidOid; + char *operationName = "rebalance"; + if (options->drainOnly) + { + operationName = "move"; + } + + foreach_oid(relationId, options->relationIdList) + { + AcquireColocationLock(relationId, operationName); + } + + List *placementUpdateList = GetRebalanceSteps(options); + + if (list_length(placementUpdateList) == 0) + { + return; + } + + /* + * This uses the first relationId from the list, it's only used for display + * purposes so it does not really matter which to show + */ + SetupRebalanceMonitor(placementUpdateList, linitial_oid(options->relationIdList)); + ExecutePlacementUpdates(placementUpdateList, shardReplicationModeOid, "Moving"); + FinalizeCurrentProgressMonitor(); +} + + +/* + * UpdateShardPlacement copies or moves a shard placement by calling + * the corresponding functions in Citus in a subtransaction. + */ +static bool +UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent, + List *responsiveNodeList, Oid shardReplicationModeOid) +{ + PlacementUpdateType updateType = placementUpdateEvent->updateType; + uint64 shardId = placementUpdateEvent->shardId; + WorkerNode *sourceNode = placementUpdateEvent->sourceNode; + WorkerNode *targetNode = placementUpdateEvent->targetNode; + const char *doRepair = "false"; + int connectionFlag = FORCE_NEW_CONNECTION; + + Datum shardTranferModeLabelDatum = + DirectFunctionCall1(enum_out, shardReplicationModeOid); + char *shardTranferModeLabel = DatumGetCString(shardTranferModeLabelDatum); + + StringInfo placementUpdateCommand = makeStringInfo(); + + /* if target node is not responsive, don't continue */ + bool targetResponsive = WorkerNodeListContains(responsiveNodeList, + targetNode->workerName, + targetNode->workerPort); + if (!targetResponsive) + { + ereport(WARNING, (errmsg("%s:%d is not responsive", targetNode->workerName, + targetNode->workerPort))); + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_ERROR); + return false; + } + + /* if source node is not responsive, don't continue */ + bool sourceResponsive = WorkerNodeListContains(responsiveNodeList, + sourceNode->workerName, + sourceNode->workerPort); + if (!sourceResponsive) + { + ereport(WARNING, (errmsg("%s:%d is not responsive", sourceNode->workerName, + sourceNode->workerPort))); + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_ERROR); + return false; + } + + if (updateType == PLACEMENT_UPDATE_MOVE) + { + appendStringInfo(placementUpdateCommand, + "SELECT master_move_shard_placement(%ld,%s,%u,%s,%u,%s)", + shardId, + quote_literal_cstr(sourceNode->workerName), + sourceNode->workerPort, + quote_literal_cstr(targetNode->workerName), + targetNode->workerPort, + quote_literal_cstr(shardTranferModeLabel)); + } + else if (updateType == PLACEMENT_UPDATE_COPY) + { + appendStringInfo(placementUpdateCommand, + "SELECT master_copy_shard_placement(%ld,%s,%u,%s,%u,%s,%s)", + shardId, + quote_literal_cstr(sourceNode->workerName), + sourceNode->workerPort, + quote_literal_cstr(targetNode->workerName), + targetNode->workerPort, + doRepair, + quote_literal_cstr(shardTranferModeLabel)); + } + else + { + ereport(ERROR, 
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only moving or copying shards is supported"))); + } + + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_MOVING); + + MultiConnection *connection = GetNodeConnection(connectionFlag, LOCAL_HOST_NAME, + PostPortNumber); + + /* + * In case of failure, we throw an error such that rebalance_table_shards + * fails early. + */ + ExecuteCriticalRemoteCommand(connection, placementUpdateCommand->data); + + UpdateColocatedShardPlacementProgress(shardId, + sourceNode->workerName, + sourceNode->workerPort, + REBALANCE_PROGRESS_MOVED); + + return true; +} + + +/* + * RebalancePlacementUpdates returns a list of placement updates which makes the + * cluster balanced. We move shards to these nodes until all nodes become utilized. + * We consider a node under-utilized if it has less than floor((1.0 - threshold) * + * placementCountAverage) shard placements. In each iteration we choose the node + * with maximum number of shard placements as the source, and we choose the node + * with minimum number of shard placements as the target. Then we choose a shard + * which is placed in the source node but not in the target node as the shard to + * move. + * + * The shardPlacementListList argument contains a list of lists of shard + * placements. Each of these lists are balanced independently. This is used to + * make sure different colocation groups are balanced separately, so each list + * contains the placements of a colocation group. + */ +List * +RebalancePlacementUpdates(List *workerNodeList, List *shardPlacementListList, + double threshold, + int32 maxShardMoves, + bool drainOnly, + RebalancePlanFunctions *functions) +{ + List *rebalanceStates = NIL; + RebalanceState *state = NULL; + List *shardPlacementList = NIL; + List *placementUpdateList = NIL; + + foreach_ptr(shardPlacementList, shardPlacementListList) + { + state = InitRebalanceState(workerNodeList, shardPlacementList, + functions); + rebalanceStates = lappend(rebalanceStates, state); + } + + foreach_ptr(state, rebalanceStates) + { + state->placementUpdateList = placementUpdateList; + MoveShardsAwayFromDisallowedNodes(state); + placementUpdateList = state->placementUpdateList; + } + + if (!drainOnly) + { + foreach_ptr(state, rebalanceStates) + { + state->placementUpdateList = placementUpdateList; + + /* calculate lower bound for placement count */ + float4 averageUtilization = (state->totalCost / state->totalCapacity); + float4 utilizationLowerBound = ((1.0 - threshold) * averageUtilization); + float4 utilizationUpperBound = ((1.0 + threshold) * averageUtilization); + + bool moreMovesAvailable = true; + while (list_length(state->placementUpdateList) < maxShardMoves && + moreMovesAvailable) + { + moreMovesAvailable = FindAndMoveShardCost(utilizationLowerBound, + utilizationUpperBound, + state); + } + placementUpdateList = state->placementUpdateList; + + if (moreMovesAvailable) + { + ereport(NOTICE, (errmsg( + "Stopped searching before we were out of moves. " + "Please rerun the rebalancer after it's finished " + "for a more optimal placement."))); + break; + } + } + } + + foreach_ptr(state, rebalanceStates) + { + hash_destroy(state->placementsHash); + } + + return placementUpdateList; +} + + +/* + * InitRebalanceState sets up a RebalanceState for it's arguments. The + * RebalanceState contains the information needed to calculate shard moves. 
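+ *
+ * As a small worked example of the bookkeeping done here: with two workers
+ * of capacity 1 and 3 and four shards of cost 1 that all start on the first
+ * worker, totalCost becomes 4, totalCapacity becomes 4, and the initial
+ * utilizations are 4/1 = 4.0 and 0/3 = 0.0 (a capacity of 0 would yield
+ * INFINITY, see CalculateUtilization below).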
+ */ +static RebalanceState * +InitRebalanceState(List *workerNodeList, List *shardPlacementList, + RebalancePlanFunctions *functions) +{ + ShardPlacement *placement = NULL; + HASH_SEQ_STATUS status; + WorkerNode *workerNode = NULL; + + RebalanceState *state = palloc0(sizeof(RebalanceState)); + state->functions = functions; + state->placementsHash = ActivePlacementsHash(shardPlacementList); + + /* create empty fill state for all of the worker nodes */ + foreach_ptr(workerNode, workerNodeList) + { + NodeFillState *fillState = palloc0(sizeof(NodeFillState)); + fillState->node = workerNode; + fillState->capacity = functions->nodeCapacity(workerNode, functions->context); + + /* + * Set the utilization here although the totalCost is not set yet. This is + * important to set the utilization to INFINITY when the capacity is 0. + */ + fillState->utilization = CalculateUtilization(fillState->totalCost, + fillState->capacity); + state->fillStateListAsc = lappend(state->fillStateListAsc, fillState); + state->fillStateListDesc = lappend(state->fillStateListDesc, fillState); + state->totalCapacity += fillState->capacity; + } + + /* Fill the fill states for all of the worker nodes based on the placements */ + foreach_htab(placement, &status, state->placementsHash) + { + ShardCost *shardCost = palloc0(sizeof(ShardCost)); + NodeFillState *fillState = FindFillStateForPlacement(state, placement); + + Assert(fillState != NULL); + + *shardCost = functions->shardCost(placement->shardId, functions->context); + + fillState->totalCost += shardCost->cost; + fillState->utilization = CalculateUtilization(fillState->totalCost, + fillState->capacity); + fillState->shardCostListDesc = lappend(fillState->shardCostListDesc, + shardCost); + fillState->shardCostListDesc = SortList(fillState->shardCostListDesc, + CompareShardCostDesc); + + state->totalCost += shardCost->cost; + + if (!functions->shardAllowedOnNode(placement->shardId, fillState->node, + functions->context)) + { + DisallowedPlacement *disallowed = palloc0(sizeof(DisallowedPlacement)); + disallowed->shardCost = shardCost; + disallowed->fillState = fillState; + state->disallowedPlacementList = lappend(state->disallowedPlacementList, + disallowed); + } + } + foreach_htab_cleanup(placement, &status); + + state->fillStateListAsc = SortList(state->fillStateListAsc, CompareNodeFillStateAsc); + state->fillStateListDesc = SortList(state->fillStateListDesc, + CompareNodeFillStateDesc); + CheckRebalanceStateInvariants(state); + + return state; +} + + +/* + * CalculateUtilization returns INFINITY when capacity is 0 and + * totalCost/capacity otherwise. + */ +static float4 +CalculateUtilization(float4 totalCost, float4 capacity) +{ + if (capacity <= 0) + { + return INFINITY; + } + return totalCost / capacity; +} + + +/* + * FindFillStateForPlacement finds the fillState for the workernode that + * matches the placement. + */ +static NodeFillState * +FindFillStateForPlacement(RebalanceState *state, ShardPlacement *placement) +{ + NodeFillState *fillState = NULL; + + /* Find the correct fill state to add the placement to and do that */ + foreach_ptr(fillState, state->fillStateListAsc) + { + if (IsPlacementOnWorkerNode(placement, fillState->node)) + { + return fillState; + } + } + return NULL; +} + + +/* + * IsPlacementOnWorkerNode checks if the shard placement is for to the given + * workenode. 
+ */ +static bool +IsPlacementOnWorkerNode(ShardPlacement *placement, WorkerNode *workerNode) +{ + if (strncmp(workerNode->workerName, placement->nodeName, WORKER_LENGTH) != 0) + { + return false; + } + return workerNode->workerPort == placement->nodePort; +} + + +/* + * CompareNodeFillStateAsc can be used to sort fill states from empty to full. + */ +static int +CompareNodeFillStateAsc(const void *void1, const void *void2) +{ + const NodeFillState *a = *((const NodeFillState **) void1); + const NodeFillState *b = *((const NodeFillState **) void2); + if (a->utilization < b->utilization) + { + return -1; + } + if (a->utilization > b->utilization) + { + return 1; + } + + /* + * If utilization prefer nodes with more capacity, since utilization will + * grow slower on those + */ + if (a->capacity > b->capacity) + { + return -1; + } + if (a->capacity < b->capacity) + { + return 1; + } + + /* Finally differentiate by node id */ + if (a->node->nodeId < b->node->nodeId) + { + return -1; + } + return a->node->nodeId > b->node->nodeId; +} + + +/* + * CompareNodeFillStateDesc can be used to sort fill states from full to empty. + */ +static int +CompareNodeFillStateDesc(const void *a, const void *b) +{ + return -CompareNodeFillStateAsc(a, b); +} + + +/* + * CompareShardCostAsc can be used to sort shard costs from low cost to high + * cost. + */ +static int +CompareShardCostAsc(const void *void1, const void *void2) +{ + const ShardCost *a = *((const ShardCost **) void1); + const ShardCost *b = *((const ShardCost **) void2); + if (a->cost < b->cost) + { + return -1; + } + if (a->cost > b->cost) + { + return 1; + } + + /* make compare function (more) stable for tests */ + if (a->shardId > b->shardId) + { + return -1; + } + return a->shardId < b->shardId; +} + + +/* + * CompareShardCostAsc can be used to sort shard costs from high cost to low + * cost. + */ +static int +CompareShardCostDesc(const void *a, const void *b) +{ + return -CompareShardCostAsc(a, b); +} + + +/* + * MoveShardsAwayFromDisallowedNodes returns a list of placement updates that + * move any shards that are not allowed on their current node to a node that + * they are allowed on. + */ +static void +MoveShardsAwayFromDisallowedNodes(RebalanceState *state) +{ + DisallowedPlacement *disallowedPlacement = NULL; + + state->disallowedPlacementList = SortList(state->disallowedPlacementList, + CompareDisallowedPlacementDesc); + + /* Move shards off of nodes they are not allowed on */ + foreach_ptr(disallowedPlacement, state->disallowedPlacementList) + { + NodeFillState *targetFillState = FindAllowedTargetFillState( + state, disallowedPlacement->shardCost->shardId); + if (targetFillState == NULL) + { + ereport(WARNING, (errmsg( + "Not allowed to move shard " UINT64_FORMAT + " anywhere from %s:%d", + disallowedPlacement->shardCost->shardId, + disallowedPlacement->fillState->node->workerName, + disallowedPlacement->fillState->node->workerPort + ))); + continue; + } + MoveShardCost(disallowedPlacement->fillState, + targetFillState, + disallowedPlacement->shardCost, + state); + } +} + + +/* + * CompareDisallowedPlacementAsc can be used to sort disallowed placements from + * low cost to high cost. 
+ */ +static int +CompareDisallowedPlacementAsc(const void *void1, const void *void2) +{ + const DisallowedPlacement *a = *((const DisallowedPlacement **) void1); + const DisallowedPlacement *b = *((const DisallowedPlacement **) void2); + return CompareShardCostAsc(&(a->shardCost), &(b->shardCost)); +} + + +/* + * CompareDisallowedPlacementAsc can be used to sort disallowed placements from + * low cost to high cost. + */ +static int +CompareDisallowedPlacementDesc(const void *a, const void *b) +{ + return -CompareDisallowedPlacementAsc(a, b); +} + + +/* + * FindAllowedTargetFillState finds the first fill state in fillStateListAsc + * where the shard can be moved to. + */ +static NodeFillState * +FindAllowedTargetFillState(RebalanceState *state, uint64 shardId) +{ + NodeFillState *targetFillState = NULL; + foreach_ptr(targetFillState, state->fillStateListAsc) + { + bool hasShard = PlacementsHashFind( + state->placementsHash, + shardId, + targetFillState->node); + if (!hasShard && state->functions->shardAllowedOnNode( + shardId, + targetFillState->node, + state->functions->context)) + { + return targetFillState; + } + } + return NULL; +} + + +/* + * MoveShardCost moves a shardcost from the source to the target fill states + * and updates the RebalanceState accordingly. What it does in detail is: + * 1. add a placement update to state->placementUpdateList + * 2. update state->placementsHash + * 3. update totalcost, utilization and shardCostListDesc in source and target + * 4. resort state->fillStateListAsc/Desc + */ +static void +MoveShardCost(NodeFillState *sourceFillState, + NodeFillState *targetFillState, + ShardCost *shardCost, + RebalanceState *state) +{ + uint64 shardIdToMove = shardCost->shardId; + + /* construct the placement update */ + PlacementUpdateEvent *placementUpdateEvent = palloc0(sizeof(PlacementUpdateEvent)); + placementUpdateEvent->updateType = PLACEMENT_UPDATE_MOVE; + placementUpdateEvent->shardId = shardIdToMove; + placementUpdateEvent->sourceNode = sourceFillState->node; + placementUpdateEvent->targetNode = targetFillState->node; + + /* record the placement update */ + state->placementUpdateList = lappend(state->placementUpdateList, + placementUpdateEvent); + + /* update the placements hash and the node shard lists */ + PlacementsHashRemove(state->placementsHash, shardIdToMove, sourceFillState->node); + PlacementsHashEnter(state->placementsHash, shardIdToMove, targetFillState->node); + + sourceFillState->totalCost -= shardCost->cost; + sourceFillState->utilization = CalculateUtilization(sourceFillState->totalCost, + sourceFillState->capacity); + sourceFillState->shardCostListDesc = list_delete_ptr( + sourceFillState->shardCostListDesc, + shardCost); + + targetFillState->totalCost += shardCost->cost; + targetFillState->utilization = CalculateUtilization(targetFillState->totalCost, + targetFillState->capacity); + targetFillState->shardCostListDesc = lappend(targetFillState->shardCostListDesc, + shardCost); + targetFillState->shardCostListDesc = SortList(targetFillState->shardCostListDesc, + CompareShardCostDesc); + + state->fillStateListAsc = SortList(state->fillStateListAsc, CompareNodeFillStateAsc); + state->fillStateListDesc = SortList(state->fillStateListDesc, + CompareNodeFillStateDesc); + CheckRebalanceStateInvariants(state); +} + + +/* + * FindAndMoveShardCost is the main rebalancing algorithm. This takes the + * current state and returns a list with a new move appended that improves the + * balance of shards. 
The algorithm is greedy and will use the first new move
+ * that improves the balance of shards. It finds nodes by trying to move a shard from the
+ * fullest node to the emptiest node. If no moves are possible it will try the
+ * second emptiest node until it has tried all of them. Then it will try the
+ * second fullest node. It returns true if it was able to find a move and
+ * false if it couldn't.
+ */
+static bool
+FindAndMoveShardCost(float4 utilizationLowerBound, float4 utilizationUpperBound,
+                     RebalanceState *state)
+{
+    NodeFillState *sourceFillState = NULL;
+    NodeFillState *targetFillState = NULL;
+
+    /*
+     * find a source node for the move, starting at the node with the highest
+     * utilization
+     */
+    foreach_ptr(sourceFillState, state->fillStateListDesc)
+    {
+        /* Don't move shards away from nodes that are already too empty; we're
+         * done searching */
+        if (sourceFillState->utilization <= utilizationLowerBound)
+        {
+            return false;
+        }
+
+        /* find a target node for the move, starting at the node with the
+         * lowest utilization */
+        foreach_ptr(targetFillState, state->fillStateListAsc)
+        {
+            ShardCost *shardCost = NULL;
+
+            /* Don't add more shards to nodes that are already at the upper
+             * bound. We should try the next source node now because further
+             * target nodes will also be above the upper bound */
+            if (targetFillState->utilization >= utilizationUpperBound)
+            {
+                break;
+            }
+
+            /* Don't move a shard between nodes that both have decent
+             * utilization. We should try the next source node now because
+             * further target nodes will also have decent utilization */
+            if (targetFillState->utilization >= utilizationLowerBound &&
+                sourceFillState->utilization <= utilizationUpperBound)
+            {
+                break;
+            }
+
+            /* find a shardcost that can be moved between the nodes and that
+             * makes the cost distribution more equal */
+            foreach_ptr(shardCost, sourceFillState->shardCostListDesc)
+            {
+                bool targetHasShard = PlacementsHashFind(state->placementsHash,
+                                                         shardCost->shardId,
+                                                         targetFillState->node);
+                float4 newTargetTotalCost = targetFillState->totalCost + shardCost->cost;
+                float4 newTargetUtilization = CalculateUtilization(
+                    newTargetTotalCost,
+                    targetFillState->capacity);
+                float4 newSourceTotalCost = sourceFillState->totalCost - shardCost->cost;
+                float4 newSourceUtilization = CalculateUtilization(
+                    newSourceTotalCost,
+                    sourceFillState->capacity);
+
+                /* Skip shards that are already on the target node */
+                if (targetHasShard)
+                {
+                    continue;
+                }
+
+                /* Skip shards that are not allowed on the target node */
+                if (!state->functions->shardAllowedOnNode(shardCost->shardId,
+                                                          targetFillState->node,
+                                                          state->functions->context))
+                {
+                    continue;
+                }
+
+                /*
+                 * Ensure that the cost distribution is actually better
+                 * after the move, i.e. the new highest utilization of
+                 * source and target is lower than the previous highest, or
+                 * the highest utilization is the same, but the lowest
+                 * increased.
+                 */
+                if (newTargetUtilization > sourceFillState->utilization)
+                {
+                    continue;
+                }
+                if (newTargetUtilization == sourceFillState->utilization &&
+                    newSourceUtilization <= targetFillState->utilization
+                    )
+                {
+                    /*
+                     * this can trigger when capacity of the nodes is not the
+                     * same. 
Example (also a test): + * - node with capacity 3 + * - node with capacity 1 + * - 3 shards with cost 1 + * Best distribution would be 2 shards on node with + * capacity 3 and one on node with capacity 1 + */ + continue; + } + MoveShardCost(sourceFillState, targetFillState, + shardCost, state); + return true; + } + } + } + return false; +} + + +/* + * ReplicationPlacementUpdates returns a list of placement updates which + * replicates shard placements that need re-replication. To do this, the + * function loops over the shard placements, and for each shard placement + * which needs to be re-replicated, it chooses an active worker node with + * smallest number of shards as the target node. + */ +List * +ReplicationPlacementUpdates(List *workerNodeList, List *shardPlacementList, + int shardReplicationFactor) +{ + List *placementUpdateList = NIL; + ListCell *shardPlacementCell = NULL; + uint32 workerNodeIndex = 0; + HTAB *placementsHash = ActivePlacementsHash(shardPlacementList); + uint32 workerNodeCount = list_length(workerNodeList); + + /* get number of shards per node */ + uint32 *shardCountArray = palloc0(workerNodeCount * sizeof(uint32)); + foreach(shardPlacementCell, shardPlacementList) + { + ShardPlacement *placement = lfirst(shardPlacementCell); + if (placement->shardState != SHARD_STATE_ACTIVE) + { + continue; + } + + for (workerNodeIndex = 0; workerNodeIndex < workerNodeCount; workerNodeIndex++) + { + WorkerNode *node = list_nth(workerNodeList, workerNodeIndex); + if (strncmp(node->workerName, placement->nodeName, WORKER_LENGTH) == 0 && + node->workerPort == placement->nodePort) + { + shardCountArray[workerNodeIndex]++; + break; + } + } + } + + foreach(shardPlacementCell, shardPlacementList) + { + WorkerNode *sourceNode = NULL; + WorkerNode *targetNode = NULL; + uint32 targetNodeShardCount = UINT_MAX; + uint32 targetNodeIndex = 0; + + ShardPlacement *placement = (ShardPlacement *) lfirst(shardPlacementCell); + uint64 shardId = placement->shardId; + + /* skip the shard placement if it has enough replications */ + int activePlacementCount = ShardActivePlacementCount(placementsHash, shardId, + workerNodeList); + if (activePlacementCount >= shardReplicationFactor) + { + continue; + } + + /* + * We can copy the shard from any active worker node that contains the + * shard. + */ + for (workerNodeIndex = 0; workerNodeIndex < workerNodeCount; workerNodeIndex++) + { + WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex); + + bool placementExists = PlacementsHashFind(placementsHash, shardId, + workerNode); + if (placementExists) + { + sourceNode = workerNode; + break; + } + } + + /* + * If we couldn't find any worker node which contains the shard, then + * all copies of the shard are list and we should error out. + */ + if (sourceNode == NULL) + { + ereport(ERROR, (errmsg("could not find a source for shard " UINT64_FORMAT, + shardId))); + } + + /* + * We can copy the shard to any worker node that doesn't contain the shard. + * Among such worker nodes, we choose the worker node with minimum shard + * count as the target. 
+ */ + for (workerNodeIndex = 0; workerNodeIndex < workerNodeCount; workerNodeIndex++) + { + WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex); + + if (!NodeCanHaveDistTablePlacements(workerNode)) + { + /* never replicate placements to nodes that should not have placements */ + continue; + } + + /* skip this node if it already contains the shard */ + bool placementExists = PlacementsHashFind(placementsHash, shardId, + workerNode); + if (placementExists) + { + continue; + } + + /* compare and change the target node */ + if (shardCountArray[workerNodeIndex] < targetNodeShardCount) + { + targetNode = workerNode; + targetNodeShardCount = shardCountArray[workerNodeIndex]; + targetNodeIndex = workerNodeIndex; + } + } + + /* + * If there is no worker node which doesn't contain the shard, then the + * shard replication factor is greater than number of worker nodes, and + * we should error out. + */ + if (targetNode == NULL) + { + ereport(ERROR, (errmsg("could not find a target for shard " UINT64_FORMAT, + shardId))); + } + + /* construct the placement update */ + PlacementUpdateEvent *placementUpdateEvent = palloc0( + sizeof(PlacementUpdateEvent)); + placementUpdateEvent->updateType = PLACEMENT_UPDATE_COPY; + placementUpdateEvent->shardId = shardId; + placementUpdateEvent->sourceNode = sourceNode; + placementUpdateEvent->targetNode = targetNode; + + /* record the placement update */ + placementUpdateList = lappend(placementUpdateList, placementUpdateEvent); + + /* update the placements hash and the shard count array */ + PlacementsHashEnter(placementsHash, shardId, targetNode); + shardCountArray[targetNodeIndex]++; + } + + hash_destroy(placementsHash); + + return placementUpdateList; +} + + +/* + * ShardActivePlacementCount returns the number of active placements for the + * given shard which are placed at the active worker nodes. + */ +static int +ShardActivePlacementCount(HTAB *activePlacementsHash, uint64 shardId, + List *activeWorkerNodeList) +{ + int shardActivePlacementCount = 0; + ListCell *workerNodeCell = NULL; + + foreach(workerNodeCell, activeWorkerNodeList) + { + WorkerNode *workerNode = lfirst(workerNodeCell); + bool placementExists = PlacementsHashFind(activePlacementsHash, shardId, + workerNode); + if (placementExists) + { + shardActivePlacementCount++; + } + } + + return shardActivePlacementCount; +} + + +/* + * ActivePlacementsHash creates and returns a hash set for the placements in + * the given list of shard placements which are in active state. 
+ */ +static HTAB * +ActivePlacementsHash(List *shardPlacementList) +{ + ListCell *shardPlacementCell = NULL; + HASHCTL info; + int shardPlacementCount = list_length(shardPlacementList); + + memset(&info, 0, sizeof(info)); + info.keysize = sizeof(ShardPlacement); + info.entrysize = sizeof(ShardPlacement); + info.hash = PlacementsHashHashCode; + info.match = PlacementsHashCompare; + int hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); + + HTAB *shardPlacementsHash = hash_create("ActivePlacements Hash", + shardPlacementCount, &info, hashFlags); + + foreach(shardPlacementCell, shardPlacementList) + { + ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell); + if (shardPlacement->shardState == SHARD_STATE_ACTIVE) + { + void *hashKey = (void *) shardPlacement; + hash_search(shardPlacementsHash, hashKey, HASH_ENTER, NULL); + } + } + + return shardPlacementsHash; +} + + +/* + * PlacementsHashFinds returns true if there exists a shard placement with the + * given workerNode and shard id in the given placements hash, otherwise it + * returns false. + */ +static bool +PlacementsHashFind(HTAB *placementsHash, uint64 shardId, WorkerNode *workerNode) +{ + bool placementFound = false; + + ShardPlacement shardPlacement; + memset(&shardPlacement, 0, sizeof(shardPlacement)); + + shardPlacement.shardId = shardId; + shardPlacement.nodeName = workerNode->workerName; + shardPlacement.nodePort = workerNode->workerPort; + + void *hashKey = (void *) (&shardPlacement); + hash_search(placementsHash, hashKey, HASH_FIND, &placementFound); + + return placementFound; +} + + +/* + * PlacementsHashEnter enters a shard placement for the given worker node and + * shard id to the given placements hash. + */ +static void +PlacementsHashEnter(HTAB *placementsHash, uint64 shardId, WorkerNode *workerNode) +{ + ShardPlacement shardPlacement; + memset(&shardPlacement, 0, sizeof(shardPlacement)); + + shardPlacement.shardId = shardId; + shardPlacement.nodeName = workerNode->workerName; + shardPlacement.nodePort = workerNode->workerPort; + + void *hashKey = (void *) (&shardPlacement); + hash_search(placementsHash, hashKey, HASH_ENTER, NULL); +} + + +/* + * PlacementsHashRemove removes the shard placement for the given worker node and + * shard id from the given placements hash. + */ +static void +PlacementsHashRemove(HTAB *placementsHash, uint64 shardId, WorkerNode *workerNode) +{ + ShardPlacement shardPlacement; + memset(&shardPlacement, 0, sizeof(shardPlacement)); + + shardPlacement.shardId = shardId; + shardPlacement.nodeName = workerNode->workerName; + shardPlacement.nodePort = workerNode->workerPort; + + void *hashKey = (void *) (&shardPlacement); + hash_search(placementsHash, hashKey, HASH_REMOVE, NULL); +} + + +/* + * ShardPlacementCompare compares two shard placements using shard id, node name, + * and node port number. 
+ */ +static int +PlacementsHashCompare(const void *lhsKey, const void *rhsKey, Size keySize) +{ + const ShardPlacement *placementLhs = (const ShardPlacement *) lhsKey; + const ShardPlacement *placementRhs = (const ShardPlacement *) rhsKey; + + int shardIdCompare = 0; + + /* first, compare by shard id */ + if (placementLhs->shardId < placementRhs->shardId) + { + shardIdCompare = -1; + } + else if (placementLhs->shardId > placementRhs->shardId) + { + shardIdCompare = 1; + } + + if (shardIdCompare != 0) + { + return shardIdCompare; + } + + /* then, compare by node name */ + int nodeNameCompare = strncmp(placementLhs->nodeName, placementRhs->nodeName, + WORKER_LENGTH); + if (nodeNameCompare != 0) + { + return nodeNameCompare; + } + + /* finally, compare by node port */ + int nodePortCompare = placementLhs->nodePort - placementRhs->nodePort; + return nodePortCompare; +} + + +/* + * ShardPlacementHashCode computes the hash code for a shard placement from the + * placement's shard id, node name, and node port number. + */ +static uint32 +PlacementsHashHashCode(const void *key, Size keySize) +{ + const ShardPlacement *placement = (const ShardPlacement *) key; + const uint64 *shardId = &(placement->shardId); + const char *nodeName = placement->nodeName; + const uint32 *nodePort = &(placement->nodePort); + + /* standard hash function outlined in Effective Java, Item 8 */ + uint32 result = 17; + result = 37 * result + tag_hash(shardId, sizeof(uint64)); + result = 37 * result + string_hash(nodeName, WORKER_LENGTH); + result = 37 * result + tag_hash(nodePort, sizeof(uint32)); + + return result; +} + + +/* WorkerNodeListContains checks if the worker node exists in the given list. */ +static bool +WorkerNodeListContains(List *workerNodeList, const char *workerName, uint32 workerPort) +{ + bool workerNodeListContains = false; + ListCell *workerNodeCell = NULL; + + foreach(workerNodeCell, workerNodeList) + { + WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell); + + if ((strncmp(workerNode->workerName, workerName, WORKER_LENGTH) == 0) && + (workerNode->workerPort == workerPort)) + { + workerNodeListContains = true; + break; + } + } + + return workerNodeListContains; +} + + +/* + * UpdateColocatedShardPlacementProgress updates the progress of the given placement, + * along with its colocated placements, to the given state. 
+ */ +static void +UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, int sourcePort, + uint64 progress) +{ + ProgressMonitorData *header = GetCurrentProgressMonitor(); + + if (header != NULL && header->steps != NULL) + { + PlacementUpdateEventProgress *steps = header->steps; + ListCell *colocatedShardIntervalCell = NULL; + + ShardInterval *shardInterval = LoadShardInterval(shardId); + List *colocatedShardIntervalList = ColocatedShardIntervalList(shardInterval); + + for (int moveIndex = 0; moveIndex < header->stepCount; moveIndex++) + { + PlacementUpdateEventProgress *step = steps + moveIndex; + uint64 currentShardId = step->shardId; + bool colocatedShard = false; + + foreach(colocatedShardIntervalCell, colocatedShardIntervalList) + { + ShardInterval *candidateShard = lfirst(colocatedShardIntervalCell); + if (candidateShard->shardId == currentShardId) + { + colocatedShard = true; + break; + } + } + + if (colocatedShard && + strcmp(step->sourceName, sourceName) == 0 && + step->sourcePort == sourcePort) + { + step->progress = progress; + } + } + } +} /* @@ -41,15 +2173,8 @@ PG_FUNCTION_INFO_V1(citus_validate_rebalance_strategy_functions); Datum pg_dist_rebalance_strategy_enterprise_check(PG_FUNCTION_ARGS) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot write to pg_dist_rebalance_strategy"), - errdetail( - "Citus Community Edition does not support the use of " - "custom rebalance strategies."), - errhint( - "To learn more about using advanced rebalancing schemes " - "with Citus, please contact us at " - "https://citusdata.com/about/contact_us"))); + /* This is Enterprise, so this check is a no-op */ + PG_RETURN_VOID(); } diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index d54548b4c..4cf84fe3d 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -60,6 +60,7 @@ #include "distributed/reference_table_utils.h" #include "distributed/relation_access_tracking.h" #include "distributed/run_from_same_connection.h" +#include "distributed/shard_cleaner.h" #include "distributed/shared_connection_stats.h" #include "distributed/query_pushdown_planning.h" #include "distributed/time_constants.h" @@ -890,6 +891,38 @@ RegisterCitusConfigVariables(void) GUC_UNIT_MS | GUC_NO_SHOW_ALL, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.defer_drop_after_shard_move", + gettext_noop("When enabled a shard move will mark old shards for deletion"), + gettext_noop("The deletion of a shard can sometimes run into a conflict with a " + "long running transactions on a the shard during the drop phase of " + "the shard move. This causes some moves to be rolled back after " + "resources have been spend on moving the shard. To prevent " + "conflicts this feature lets you skip the actual deletion till a " + "later point in time. When used one should set " + "citus.defer_shard_delete_interval to make sure defered deletions " + "will be executed"), + &DeferShardDeleteOnMove, + false, + PGC_USERSET, + 0, + NULL, NULL, NULL); + + DefineCustomIntVariable( + "citus.defer_shard_delete_interval", + gettext_noop("Sets the time to wait between background deletion for shards."), + gettext_noop("Shards that are marked for deferred deletion need to be deleted in " + "the background at a later time. This is done at a regular interval " + "configured here. 
The deletion is executed optimistically, it tries " + "to take a lock on a shard to clean, if the lock can't be acquired " + "the background worker moves on. When set to -1 this background " + "process is skipped."), + &DeferShardDeleteInterval, + -1, -1, 7 * 24 * 3600 * 1000, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.select_opens_transaction_block", gettext_noop("Open transaction blocks for SELECT commands"), diff --git a/src/backend/distributed/test/foreign_key_relationship_query.c b/src/backend/distributed/test/foreign_key_relationship_query.c index ea152acc2..a01785943 100644 --- a/src/backend/distributed/test/foreign_key_relationship_query.c +++ b/src/backend/distributed/test/foreign_key_relationship_query.c @@ -5,7 +5,7 @@ * This file contains UDFs for getting foreign constraint relationship between * distributed tables. * - * Copyright (c) Citus Data, Inc. + * Copyright (c), Citus Data, Inc. * *------------------------------------------------------------------------- */ @@ -15,6 +15,7 @@ #include "funcapi.h" #include "distributed/foreign_key_relationship.h" +#include "distributed/coordinator_protocol.h" #include "distributed/listutils.h" #include "distributed/metadata_cache.h" #include "distributed/tuplestore.h" diff --git a/src/backend/distributed/test/foreign_key_to_reference_table_rebalance.c b/src/backend/distributed/test/foreign_key_to_reference_table_rebalance.c new file mode 100644 index 000000000..4c5a21379 --- /dev/null +++ b/src/backend/distributed/test/foreign_key_to_reference_table_rebalance.c @@ -0,0 +1,87 @@ +/*------------------------------------------------------------------------- + * + * foreign_key_relationship_query.c + * + * This file contains UDFs for getting foreign constraint relationship between + * distributed tables. + * + * Copyright (c) 2018, Citus Data, Inc. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" + +#include "distributed/coordinator_protocol.h" +#include "distributed/listutils.h" +#include "distributed/metadata_cache.h" +#include "utils/builtins.h" + + +/* these functions are only exported in the regression tests */ +PG_FUNCTION_INFO_V1(get_foreign_key_to_reference_table_commands); + +/* + * get_foreign_key_to_reference_table_commands returns the list of commands + * for creating foreign keys to reference tables. 
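+ *
+ * A call from the regression tests could look like the following (the table
+ * name is only an example):
+ *
+ *   SELECT * FROM get_foreign_key_to_reference_table_commands(
+ *       'referencing_table'::regclass);
+ *
+ * which returns one text row per command generated for the table's first
+ * shard interval.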
+ */ +Datum +get_foreign_key_to_reference_table_commands(PG_FUNCTION_ARGS) +{ + FuncCallContext *functionContext = NULL; + ListCell *commandsCell = NULL; + + CheckCitusVersion(ERROR); + + /* for the first we call this UDF, we need to populate the result to return set */ + if (SRF_IS_FIRSTCALL()) + { + Oid relationId = PG_GETARG_OID(0); + + /* create a function context for cross-call persistence */ + functionContext = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + MemoryContext oldContext = MemoryContextSwitchTo( + functionContext->multi_call_memory_ctx); + + CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId); + ShardInterval *firstShardInterval = cacheEntry->sortedShardIntervalArray[0]; + ListCellAndListWrapper *wrapper = palloc0(sizeof(ListCellAndListWrapper)); + List *commandsList = + GetForeignConstraintCommandsToReferenceTable(firstShardInterval); + + commandsCell = list_head(commandsList); + wrapper->list = commandsList; + wrapper->listCell = commandsCell; + functionContext->user_fctx = wrapper; + MemoryContextSwitchTo(oldContext); + } + + /* + * On every call to this function, we get the current position in the + * statement list. We then iterate to the next position in the list and + * return the current statement, if we have not yet reached the end of + * list. + */ + functionContext = SRF_PERCALL_SETUP(); + + ListCellAndListWrapper *wrapper = + (ListCellAndListWrapper *) functionContext->user_fctx; + + if (wrapper->listCell != NULL) + { + char *command = (char *) lfirst(wrapper->listCell); + text *commandText = cstring_to_text(command); + + wrapper->listCell = lnext_compat(wrapper->list, wrapper->listCell); + + SRF_RETURN_NEXT(functionContext, PointerGetDatum(commandText)); + } + else + { + SRF_RETURN_DONE(functionContext); + } +} diff --git a/src/backend/distributed/test/shard_rebalancer.c b/src/backend/distributed/test/shard_rebalancer.c new file mode 100644 index 000000000..a402792f0 --- /dev/null +++ b/src/backend/distributed/test/shard_rebalancer.c @@ -0,0 +1,628 @@ +/*------------------------------------------------------------------------- + * + * test/shard_rebalancer.c + * + * This file contains functions used for unit testing the planning part of the + * shard rebalancer. + * + * Copyright (c) 2014-2019, Citus Data, Inc. 
+ * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "libpq-fe.h" + +#include "safe_lib.h" + +#include "catalog/pg_type.h" +#include "distributed/citus_safe_lib.h" +#include "distributed/citus_ruleutils.h" +#include "distributed/connection_management.h" +#include "distributed/listutils.h" +#include "distributed/multi_physical_planner.h" +#include "distributed/shard_rebalancer.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/int8.h" +#include "utils/json.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +/* static declarations for json conversion */ +static List * JsonArrayToShardPlacementTestInfoList( + ArrayType *shardPlacementJsonArrayObject); +static List * JsonArrayToWorkerTestInfoList(ArrayType *workerNodeJsonArrayObject); +static bool JsonFieldValueBool(Datum jsonDocument, const char *key); +static uint32 JsonFieldValueUInt32(Datum jsonDocument, const char *key); +static uint64 JsonFieldValueUInt64(Datum jsonDocument, const char *key); +static char * JsonFieldValueString(Datum jsonDocument, const char *key); +static ArrayType * PlacementUpdateListToJsonArray(List *placementUpdateList); +static bool ShardAllowedOnNode(uint64 shardId, WorkerNode *workerNode, void *context); +static float NodeCapacity(WorkerNode *workerNode, void *context); +static ShardCost GetShardCost(uint64 shardId, void *context); + + +PG_FUNCTION_INFO_V1(shard_placement_rebalance_array); +PG_FUNCTION_INFO_V1(shard_placement_replication_array); +PG_FUNCTION_INFO_V1(worker_node_responsive); + +typedef struct ShardPlacementTestInfo +{ + ShardPlacement *placement; + uint64 cost; + bool nextColocationGroup; +} ShardPlacementTestInfo; + +typedef struct WorkerTestInfo +{ + WorkerNode *node; + List *disallowedShardIds; + float capacity; +} WorkerTestInfo; + +typedef struct RebalancePlanContext +{ + List *workerTestInfoList; + List *shardPlacementTestInfoList; +} RebalancePlacementContext; + + +/* + * shard_placement_rebalance_array returns a list of operations which can make a + * cluster consisting of given shard placements and worker nodes balanced with + * respect to the given threshold. Threshold is a value between 0 and 1 which + * determines the evenness in shard distribution. When threshold is 0, then all + * nodes should have equal number of shards. As threshold increases, cluster's + * evenness requirements decrease, and we can rebalance the cluster using less + * operations. 
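+ *
+ * An illustrative invocation (all JSON key strings here are assumptions
+ * standing in for the FIELD_NAME_* constants and the optional keys parsed by
+ * the helpers below; the shapes of the documents are what matters):
+ *
+ *   SELECT unnest(shard_placement_rebalance_array(
+ *       ARRAY['{"worker_name": "a", "worker_port": 5432}',
+ *             '{"worker_name": "b", "worker_port": 5432, "capacity": 2}']::json[],
+ *       ARRAY['{"shardid": 1, "shardlength": 0, "shardstate": 1, "nodename": "a", "nodeport": 5432, "placementid": 1}',
+ *             '{"shardid": 2, "shardlength": 0, "shardstate": 1, "nodename": "a", "nodeport": 5432, "placementid": 2}']::json[],
+ *       0.1,   -- threshold
+ *       10,    -- max_shard_moves
+ *       false  -- drain_only
+ *   ));
+ *
+ * The return value is an array of json documents, one per planned placement
+ * update.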
+ */ +Datum +shard_placement_rebalance_array(PG_FUNCTION_ARGS) +{ + ArrayType *workerNodeJsonArray = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *shardPlacementJsonArray = PG_GETARG_ARRAYTYPE_P(1); + float threshold = PG_GETARG_FLOAT4(2); + int32 maxShardMoves = PG_GETARG_INT32(3); + bool drainOnly = PG_GETARG_BOOL(4); + + List *workerNodeList = NIL; + List *shardPlacementListList = NIL; + List *shardPlacementList = NIL; + WorkerTestInfo *workerTestInfo = NULL; + ShardPlacementTestInfo *shardPlacementTestInfo = NULL; + RebalancePlanFunctions rebalancePlanFunctions = { + .shardAllowedOnNode = ShardAllowedOnNode, + .nodeCapacity = NodeCapacity, + .shardCost = GetShardCost, + }; + RebalancePlacementContext context = { + .workerTestInfoList = NULL, + }; + + context.workerTestInfoList = JsonArrayToWorkerTestInfoList(workerNodeJsonArray); + context.shardPlacementTestInfoList = JsonArrayToShardPlacementTestInfoList( + shardPlacementJsonArray); + + /* we don't need original arrays any more, so we free them to save memory */ + pfree(workerNodeJsonArray); + pfree(shardPlacementJsonArray); + + /* map workerTestInfoList to a list of its WorkerNodes */ + foreach_ptr(workerTestInfo, context.workerTestInfoList) + { + workerNodeList = lappend(workerNodeList, workerTestInfo->node); + } + + /* map shardPlacementTestInfoList to a list of list of its ShardPlacements */ + foreach_ptr(shardPlacementTestInfo, context.shardPlacementTestInfoList) + { + if (shardPlacementTestInfo->nextColocationGroup) + { + shardPlacementList = SortList(shardPlacementList, CompareShardPlacements); + shardPlacementListList = lappend(shardPlacementListList, shardPlacementList); + shardPlacementList = NIL; + } + shardPlacementList = lappend(shardPlacementList, + shardPlacementTestInfo->placement); + } + shardPlacementList = SortList(shardPlacementList, CompareShardPlacements); + shardPlacementListList = lappend(shardPlacementListList, shardPlacementList); + + rebalancePlanFunctions.context = &context; + + /* sort the lists to make the function more deterministic */ + workerNodeList = SortList(workerNodeList, CompareWorkerNodes); + + List *placementUpdateList = RebalancePlacementUpdates(workerNodeList, + shardPlacementListList, + threshold, + maxShardMoves, + drainOnly, + &rebalancePlanFunctions); + ArrayType *placementUpdateJsonArray = PlacementUpdateListToJsonArray( + placementUpdateList); + + PG_RETURN_ARRAYTYPE_P(placementUpdateJsonArray); +} + + +/* + * ShardAllowedOnNode is the function that checks if shard is allowed to be on + * a worker when running the shard rebalancer unit tests. + */ +static bool +ShardAllowedOnNode(uint64 shardId, WorkerNode *workerNode, void *voidContext) +{ + RebalancePlacementContext *context = voidContext; + WorkerTestInfo *workerTestInfo = NULL; + uint64 *disallowedShardIdPtr = NULL; + foreach_ptr(workerTestInfo, context->workerTestInfoList) + { + if (workerTestInfo->node == workerNode) + { + break; + } + } + Assert(workerTestInfo != NULL); + + foreach_ptr(disallowedShardIdPtr, workerTestInfo->disallowedShardIds) + { + if (shardId == *disallowedShardIdPtr) + { + return false; + } + } + return true; +} + + +/* + * NodeCapacity is the function that gets the capacity of a worker when running + * the shard rebalancer unit tests. 
+ */ +static float +NodeCapacity(WorkerNode *workerNode, void *voidContext) +{ + RebalancePlacementContext *context = voidContext; + WorkerTestInfo *workerTestInfo = NULL; + foreach_ptr(workerTestInfo, context->workerTestInfoList) + { + if (workerTestInfo->node == workerNode) + { + break; + } + } + Assert(workerTestInfo != NULL); + return workerTestInfo->capacity; +} + + +/* + * GetShardCost is the function that gets the ShardCost of a shard when running + * the shard rebalancer unit tests. + */ +static ShardCost +GetShardCost(uint64 shardId, void *voidContext) +{ + RebalancePlacementContext *context = voidContext; + ShardCost shardCost; + memset_struct_0(shardCost); + shardCost.shardId = shardId; + + ShardPlacementTestInfo *shardPlacementTestInfo = NULL; + foreach_ptr(shardPlacementTestInfo, context->shardPlacementTestInfoList) + { + if (shardPlacementTestInfo->placement->shardId == shardId) + { + break; + } + } + Assert(shardPlacementTestInfo != NULL); + shardCost.cost = shardPlacementTestInfo->cost; + return shardCost; +} + + +/* + * shard_placement_replication_array returns a list of operations which will + * replicate under-replicated shards in a cluster consisting of given shard + * placements and worker nodes. A shard is under-replicated if it has less + * active placements than the given shard replication factor. + */ +Datum +shard_placement_replication_array(PG_FUNCTION_ARGS) +{ + ArrayType *workerNodeJsonArray = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *shardPlacementJsonArray = PG_GETARG_ARRAYTYPE_P(1); + uint32 shardReplicationFactor = PG_GETARG_INT32(2); + + List *workerNodeList = NIL; + List *shardPlacementList = NIL; + WorkerTestInfo *workerTestInfo = NULL; + ShardPlacementTestInfo *shardPlacementTestInfo = NULL; + + /* validate shard replication factor */ + if (shardReplicationFactor < SHARD_REPLICATION_FACTOR_MINIMUM || + shardReplicationFactor > SHARD_REPLICATION_FACTOR_MAXIMUM) + { + ereport(ERROR, (errmsg("invalid shard replication factor"), + errhint("Shard replication factor must be an integer " + "between %d and %d", SHARD_REPLICATION_FACTOR_MINIMUM, + SHARD_REPLICATION_FACTOR_MAXIMUM))); + } + + List *workerTestInfoList = JsonArrayToWorkerTestInfoList(workerNodeJsonArray); + List *shardPlacementTestInfoList = JsonArrayToShardPlacementTestInfoList( + shardPlacementJsonArray); + + /* we don't need original arrays any more, so we free them to save memory */ + pfree(workerNodeJsonArray); + pfree(shardPlacementJsonArray); + + foreach_ptr(workerTestInfo, workerTestInfoList) + { + workerNodeList = lappend(workerNodeList, workerTestInfo->node); + } + + foreach_ptr(shardPlacementTestInfo, shardPlacementTestInfoList) + { + shardPlacementList = lappend(shardPlacementList, + shardPlacementTestInfo->placement); + } + + /* sort the lists to make the function more deterministic */ + workerNodeList = SortList(workerNodeList, CompareWorkerNodes); + shardPlacementList = SortList(shardPlacementList, CompareShardPlacements); + + List *placementUpdateList = ReplicationPlacementUpdates(workerNodeList, + shardPlacementList, + shardReplicationFactor); + ArrayType *placementUpdateJsonArray = PlacementUpdateListToJsonArray( + placementUpdateList); + + PG_RETURN_ARRAYTYPE_P(placementUpdateJsonArray); +} + + +/* + * JsonArrayToShardPlacementTestInfoList converts the given shard placement json array + * to a list of ShardPlacement structs. 
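+ *
+ * Each array element is a json document carrying the fields read below; with
+ * assumed key spellings for the FIELD_NAME_* constants, a single element
+ * could look like:
+ *
+ *   {"shardid": 1, "shardlength": 0, "shardstate": 1, "nodename": "a",
+ *    "nodeport": 5432, "placementid": 1, "cost": 20}
+ *
+ * The "cost" and "next_colocation" keys are optional and default to 1 and
+ * false respectively, as handled further down.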
*/ +static List * +JsonArrayToShardPlacementTestInfoList(ArrayType *shardPlacementJsonArrayObject) +{ + List *shardPlacementTestInfoList = NIL; + Datum *shardPlacementJsonArray = NULL; + int placementCount = 0; + + /* + * Memory is not automatically freed when we call UDFs using DirectFunctionCall. + * We call these functions in functionCallContext, so we can free the memory + * once they return. + */ + MemoryContext functionCallContext = AllocSetContextCreate(CurrentMemoryContext, + "Function Call Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + deconstruct_array(shardPlacementJsonArrayObject, JSONOID, -1, false, 'i', + &shardPlacementJsonArray, NULL, &placementCount); + + for (int placementIndex = 0; placementIndex < placementCount; placementIndex++) + { + Datum placementJson = shardPlacementJsonArray[placementIndex]; + ShardPlacementTestInfo *placementTestInfo = palloc0( + sizeof(ShardPlacementTestInfo)); + + MemoryContext oldContext = MemoryContextSwitchTo(functionCallContext); + + uint64 shardId = JsonFieldValueUInt64(placementJson, FIELD_NAME_SHARD_ID); + uint64 shardLength = JsonFieldValueUInt64(placementJson, FIELD_NAME_SHARD_LENGTH); + int shardState = JsonFieldValueUInt32(placementJson, FIELD_NAME_SHARD_STATE); + char *nodeName = JsonFieldValueString(placementJson, FIELD_NAME_NODE_NAME); + int nodePort = JsonFieldValueUInt32(placementJson, FIELD_NAME_NODE_PORT); + uint64 placementId = JsonFieldValueUInt64(placementJson, FIELD_NAME_PLACEMENT_ID); + + MemoryContextSwitchTo(oldContext); + + placementTestInfo->placement = palloc0(sizeof(ShardPlacement)); + placementTestInfo->placement->shardId = shardId; + placementTestInfo->placement->shardLength = shardLength; + placementTestInfo->placement->shardState = shardState; + placementTestInfo->placement->nodeName = pstrdup(nodeName); + placementTestInfo->placement->nodePort = nodePort; + placementTestInfo->placement->placementId = placementId; + + /* + * We have copied whatever we needed from the UDF calls, so we can free + * the memory allocated by them. + */ + MemoryContextReset(functionCallContext); + + + shardPlacementTestInfoList = lappend(shardPlacementTestInfoList, + placementTestInfo); + + PG_TRY(); + { + placementTestInfo->cost = JsonFieldValueUInt64(placementJson, + "cost"); + } + PG_CATCH(); + { + /* Ignore errors about the key not being found; in that case the cost defaults to 1 */ + FlushErrorState(); + MemoryContextSwitchTo(oldContext); + placementTestInfo->cost = 1; + } + PG_END_TRY(); + + PG_TRY(); + { + placementTestInfo->nextColocationGroup = JsonFieldValueBool( + placementJson, "next_colocation"); + } + PG_CATCH(); + { + /* Ignore errors about the key not being found; in that case next_colocation stays false */ + FlushErrorState(); + MemoryContextSwitchTo(oldContext); + } + PG_END_TRY(); + } + + pfree(shardPlacementJsonArray); + + return shardPlacementTestInfoList; +} + + +/* + * JsonArrayToWorkerTestInfoList converts the given worker node json array to a list + * of WorkerTestInfo structs.
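+ * + * For illustration only (hypothetical test input): an array element such as + * '{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "4,6", "capacity": 2}' + * yields a WorkerTestInfo whose node refuses shards 4 and 6 and has capacity 2; both the + * "capacity" and "disallowed_shards" keys are optional and default to 1 and to allowing + * every shard, respectively.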
+ */ +static List * +JsonArrayToWorkerTestInfoList(ArrayType *workerNodeJsonArrayObject) +{ + List *workerTestInfoList = NIL; + Datum *workerNodeJsonArray = NULL; + int workerNodeCount = 0; + + deconstruct_array(workerNodeJsonArrayObject, JSONOID, -1, false, 'i', + &workerNodeJsonArray, NULL, &workerNodeCount); + + for (int workerNodeIndex = 0; workerNodeIndex < workerNodeCount; workerNodeIndex++) + { + Datum workerNodeJson = workerNodeJsonArray[workerNodeIndex]; + char *workerName = JsonFieldValueString(workerNodeJson, FIELD_NAME_WORKER_NAME); + uint32 workerPort = JsonFieldValueUInt32(workerNodeJson, + FIELD_NAME_WORKER_PORT); + List *disallowedShardIdList = NIL; + char *disallowedShardsString = NULL; + MemoryContext savedContext = CurrentMemoryContext; + + + WorkerTestInfo *workerTestInfo = palloc0(sizeof(WorkerTestInfo)); + WorkerNode *workerNode = palloc0(sizeof(WorkerNode)); + strncpy_s(workerNode->workerName, sizeof(workerNode->workerName), workerName, + WORKER_LENGTH); + workerNode->nodeId = workerNodeIndex; + workerNode->workerPort = workerPort; + workerNode->shouldHaveShards = true; + workerNode->nodeRole = PrimaryNodeRoleId(); + workerTestInfo->node = workerNode; + + PG_TRY(); + { + workerTestInfo->capacity = JsonFieldValueUInt64(workerNodeJson, + "capacity"); + } + PG_CATCH(); + { + /* Ignore errors about not being able to find the key in that case capacity is 1 */ + FlushErrorState(); + MemoryContextSwitchTo(savedContext); + workerTestInfo->capacity = 1; + } + PG_END_TRY(); + + + workerTestInfoList = lappend(workerTestInfoList, workerTestInfo); + PG_TRY(); + { + disallowedShardsString = JsonFieldValueString(workerNodeJson, + "disallowed_shards"); + } + PG_CATCH(); + { + /* Ignore errors about not being able to find the key in that case all shards are allowed */ + FlushErrorState(); + MemoryContextSwitchTo(savedContext); + disallowedShardsString = NULL; + } + PG_END_TRY(); + + if (disallowedShardsString == NULL) + { + continue; + } + + char *strtokPosition = NULL; + char *shardString = strtok_r(disallowedShardsString, ",", &strtokPosition); + while (shardString != NULL) + { + uint64 *shardInt = palloc0(sizeof(uint64)); + *shardInt = SafeStringToUint64(shardString); + disallowedShardIdList = lappend(disallowedShardIdList, shardInt); + shardString = strtok_r(NULL, ",", &strtokPosition); + } + workerTestInfo->disallowedShardIds = disallowedShardIdList; + } + + return workerTestInfoList; +} + + +/* + * JsonFieldValueBool gets the value of the given key in the given json + * document and returns it as a boolean. + */ +static bool +JsonFieldValueBool(Datum jsonDocument, const char *key) +{ + char *valueString = JsonFieldValueString(jsonDocument, key); + Datum valueBoolDatum = DirectFunctionCall1(boolin, CStringGetDatum(valueString)); + + return DatumGetBool(valueBoolDatum); +} + + +/* + * JsonFieldValueUInt32 gets the value of the given key in the given json + * document and returns it as an unsigned 32-bit integer. + */ +static uint32 +JsonFieldValueUInt32(Datum jsonDocument, const char *key) +{ + char *valueString = JsonFieldValueString(jsonDocument, key); + Datum valueInt4Datum = DirectFunctionCall1(int4in, CStringGetDatum(valueString)); + + uint32 valueUInt32 = DatumGetInt32(valueInt4Datum); + return valueUInt32; +} + + +/* + * JsonFieldValueUInt64 gets the value of the given key in the given json + * document and returns it as an unsigned 64-bit integer. 
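+ * + * For illustration only: given the json document '{"shardid": 102008}' and the key + * "shardid", this returns 102008. The value is parsed through int8in, so a non-numeric + * or out-of-range value raises an error rather than being silently ignored.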
*/ +static uint64 +JsonFieldValueUInt64(Datum jsonDocument, const char *key) +{ + char *valueString = JsonFieldValueString(jsonDocument, key); + Datum valueInt8Datum = DirectFunctionCall1(int8in, CStringGetDatum(valueString)); + + uint64 valueUInt64 = DatumGetInt64(valueInt8Datum); + return valueUInt64; +} + + +/* + * JsonFieldValueString gets the value of the given key in the given json + * document and returns it as a string. + */ +static char * +JsonFieldValueString(Datum jsonDocument, const char *key) +{ + Datum valueTextDatum = 0; + bool valueFetched = false; + Datum keyDatum = PointerGetDatum(cstring_to_text(key)); + + /* + * json_object_field_text can return NULL, but DirectFunctionCall2 raises + * cryptic errors when the function returns NULL. We catch this error and + * raise a more meaningful error. + */ + PG_TRY(); + { + valueTextDatum = DirectFunctionCall2(json_object_field_text, + jsonDocument, keyDatum); + valueFetched = true; + } + PG_CATCH(); + { + FlushErrorState(); + valueFetched = false; + } + PG_END_TRY(); + + if (!valueFetched) + { + ereport(ERROR, (errmsg("could not get value for '%s'", key))); + } + + char *valueString = text_to_cstring(DatumGetTextP(valueTextDatum)); + return valueString; +} + + +/* + * PlacementUpdateListToJsonArray converts the given list of placement update + * data to a json array. + */ +static ArrayType * +PlacementUpdateListToJsonArray(List *placementUpdateList) +{ + ListCell *placementUpdateCell = NULL; + int placementUpdateIndex = 0; + + int placementUpdateCount = list_length(placementUpdateList); + Datum *placementUpdateJsonArray = palloc0(placementUpdateCount * sizeof(Datum)); + + foreach(placementUpdateCell, placementUpdateList) + { + PlacementUpdateEvent *placementUpdateEvent = lfirst(placementUpdateCell); + WorkerNode *sourceNode = placementUpdateEvent->sourceNode; + WorkerNode *targetNode = placementUpdateEvent->targetNode; + + StringInfo escapedSourceName = makeStringInfo(); + escape_json(escapedSourceName, sourceNode->workerName); + + StringInfo escapedTargetName = makeStringInfo(); + escape_json(escapedTargetName, targetNode->workerName); + + StringInfo placementUpdateJsonString = makeStringInfo(); + appendStringInfo(placementUpdateJsonString, PLACEMENT_UPDATE_JSON_FORMAT, + placementUpdateEvent->updateType, placementUpdateEvent->shardId, + escapedSourceName->data, sourceNode->workerPort, + escapedTargetName->data, targetNode->workerPort); + + Datum placementUpdateStringDatum = CStringGetDatum( + placementUpdateJsonString->data); + Datum placementUpdateJsonDatum = DirectFunctionCall1(json_in, + placementUpdateStringDatum); + + placementUpdateJsonArray[placementUpdateIndex] = placementUpdateJsonDatum; + placementUpdateIndex++; + } + + ArrayType *placementUpdateObject = construct_array(placementUpdateJsonArray, + placementUpdateCount, JSONOID, + -1, false, 'i'); + + return placementUpdateObject; +} + + +/* + * worker_node_responsive returns true if the given worker node is responsive. + * Otherwise, it returns false.
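+ * + * For illustration only, and assuming the UDF is exposed under the same name at the SQL + * level: SELECT worker_node_responsive('localhost', 57637) returns true only when a new + * connection to that node can be established successfully (hostname and port here are + * placeholders).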
+ */ +Datum +worker_node_responsive(PG_FUNCTION_ARGS) +{ + text *workerNameText = PG_GETARG_TEXT_PP(0); + uint32 workerPort = PG_GETARG_INT32(1); + int connectionFlag = FORCE_NEW_CONNECTION; + + bool workerNodeResponsive = false; + const char *workerName = text_to_cstring(workerNameText); + + MultiConnection *connection = GetNodeConnection(connectionFlag, workerName, + workerPort); + + if (connection != NULL && connection->pgConn != NULL) + { + if (PQstatus(connection->pgConn) == CONNECTION_OK) + { + workerNodeResponsive = true; + } + + CloseConnection(connection); + } + + PG_RETURN_BOOL(workerNodeResponsive); +} diff --git a/src/backend/distributed/utils/hash_helpers.c b/src/backend/distributed/utils/hash_helpers.c index 0ed090dca..6bbf14938 100644 --- a/src/backend/distributed/utils/hash_helpers.c +++ b/src/backend/distributed/utils/hash_helpers.c @@ -32,3 +32,18 @@ hash_delete_all(HTAB *htab) Assert(found); } } + + +/* + * foreach_htab_cleanup cleans up the hash iteration state after the iteration + * is done. This is only needed when break statements are present in the + * foreach block. + */ +void +foreach_htab_cleanup(void *var, HASH_SEQ_STATUS *status) +{ + if ((var) != NULL) + { + hash_seq_term(status); + } +} diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 74ac7fbe5..9b329de12 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -38,6 +38,7 @@ #include "distributed/coordinator_protocol.h" #include "distributed/metadata_cache.h" #include "distributed/metadata_sync.h" +#include "distributed/shard_cleaner.h" #include "distributed/statistics_collection.h" #include "distributed/transaction_recovery.h" #include "distributed/version_compat.h" @@ -92,6 +93,7 @@ typedef struct MaintenanceDaemonDBData /* config variable for distributed deadlock detection timeout */ double DistributedDeadlockDetectionTimeoutFactor = 2.0; int Recover2PCInterval = 60000; +int DeferShardDeleteInterval = 60000; /* config variables for metadata sync timeout */ int MetadataSyncInterval = 60000; @@ -289,6 +291,7 @@ CitusMaintenanceDaemonMain(Datum main_arg) bool retryStatsCollection USED_WITH_LIBCURL_ONLY = false; ErrorContextCallback errorCallback; TimestampTz lastRecoveryTime = 0; + TimestampTz lastShardCleanTime = 0; TimestampTz nextMetadataSyncTime = 0; /* @@ -586,6 +589,45 @@ CitusMaintenanceDaemonMain(Datum main_arg) timeout = Min(timeout, deadlockTimeout); } + if (!RecoveryInProgress() && DeferShardDeleteInterval > 0 && + TimestampDifferenceExceeds(lastShardCleanTime, GetCurrentTimestamp(), + DeferShardDeleteInterval)) + { + int numberOfDroppedShards = 0; + + InvalidateMetadataSystemCache(); + StartTransactionCommand(); + + if (!LockCitusExtension()) + { + ereport(DEBUG1, (errmsg( + "could not lock the citus extension, skipping shard cleaning"))); + } + else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded()) + { + /* + * Record last shard clean time at start to ensure we run once per + * DeferShardDeleteInterval. + */ + lastShardCleanTime = GetCurrentTimestamp(); + + numberOfDroppedShards = TryDropMarkedShards(); + } + + CommitTransactionCommand(); + + if (numberOfDroppedShards > 0) + { + ereport(LOG, (errmsg("maintenance daemon dropped %d distributed " + "shards previously marked to be removed", + numberOfDroppedShards))); + } + + /* make sure we don't wait too long */ + timeout = Min(timeout, DeferShardDeleteInterval); + } + + /* * Wait until timeout, or until somebody wakes us up. 
Also cast the timeout to * integer where we've calculated it using double for not losing the precision. diff --git a/src/include/distributed/coordinator_protocol.h b/src/include/distributed/coordinator_protocol.h index f871db5b2..2eb955564 100644 --- a/src/include/distributed/coordinator_protocol.h +++ b/src/include/distributed/coordinator_protocol.h @@ -253,6 +253,11 @@ extern ShardPlacement * SearchShardPlacementInList(List *shardPlacementList, extern ShardPlacement * SearchShardPlacementInListOrError(List *shardPlacementList, const char *nodeName, uint32 nodePort); +extern void ErrorIfMoveCitusLocalTable(Oid relationId); extern char LookupShardTransferMode(Oid shardReplicationModeOid); +extern void BlockWritesToShardList(List *shardList); +extern List * WorkerApplyShardDDLCommandList(List *ddlCommandList, int64 shardId); +extern List * GetForeignConstraintCommandsToReferenceTable(ShardInterval *shardInterval); + #endif /* COORDINATOR_PROTOCOL_H */ diff --git a/src/include/distributed/hash_helpers.h b/src/include/distributed/hash_helpers.h index b25cd1cd8..5d329e423 100644 --- a/src/include/distributed/hash_helpers.h +++ b/src/include/distributed/hash_helpers.h @@ -48,4 +48,6 @@ extern void hash_delete_all(HTAB *htab); (var) != NULL; \ (var) = hash_seq_search(status)) +extern void foreach_htab_cleanup(void *var, HASH_SEQ_STATUS *status); + #endif diff --git a/src/include/distributed/metadata_cache.h b/src/include/distributed/metadata_cache.h index da5d45d84..ed2790654 100644 --- a/src/include/distributed/metadata_cache.h +++ b/src/include/distributed/metadata_cache.h @@ -159,6 +159,7 @@ extern int32 GetLocalGroupId(void); extern void CitusTableCacheFlushInvalidatedEntries(void); extern Oid LookupShardRelationFromCatalog(int64 shardId, bool missing_ok); extern List * ShardPlacementList(uint64 shardId); +extern bool ShardExists(int64 shardId); extern void CitusInvalidateRelcacheByRelid(Oid relationId); extern void CitusInvalidateRelcacheByShardId(int64 shardId); extern void InvalidateForeignKeyGraph(void); @@ -210,6 +211,7 @@ extern Oid DistPartitionRelationId(void); extern Oid DistShardRelationId(void); extern Oid DistPlacementRelationId(void); extern Oid DistNodeRelationId(void); +extern Oid DistRebalanceStrategyRelationId(void); extern Oid DistLocalGroupIdRelationId(void); extern Oid DistObjectRelationId(void); extern Oid DistEnabledCustomAggregatesId(void); diff --git a/src/include/distributed/metadata_utility.h b/src/include/distributed/metadata_utility.h index 32535421d..fab9d2f0d 100644 --- a/src/include/distributed/metadata_utility.h +++ b/src/include/distributed/metadata_utility.h @@ -110,6 +110,7 @@ extern List * ActiveShardPlacementList(uint64 shardId); extern ShardPlacement * ActiveShardPlacement(uint64 shardId, bool missingOk); extern List * BuildShardPlacementList(ShardInterval *shardInterval); extern List * AllShardPlacementsOnNodeGroup(int32 groupId); +extern List * AllShardPlacementsWithShardPlacementState(ShardState shardState); extern List * GroupShardPlacementsForTableOnGroup(Oid relationId, int32 groupId); extern StringInfo GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList, char *sizeQuery); diff --git a/src/include/distributed/shard_cleaner.h b/src/include/distributed/shard_cleaner.h new file mode 100644 index 000000000..caa739d7e --- /dev/null +++ b/src/include/distributed/shard_cleaner.h @@ -0,0 +1,20 @@ +/*------------------------------------------------------------------------- + * + * shard_cleaner.h + * Type and function declarations used in 
background shard cleaning + * + * Copyright (c) 2018, Citus Data, Inc. + * + *------------------------------------------------------------------------- + */ + +#ifndef CITUS_SHARD_CLEANER_H +#define CITUS_SHARD_CLEANER_H + +/* GUC to configure deferred shard deletion */ +extern int DeferShardDeleteInterval; +extern bool DeferShardDeleteOnMove; + +extern int TryDropMarkedShards(void); + +#endif /*CITUS_SHARD_CLEANER_H */ diff --git a/src/include/distributed/shard_rebalancer.h b/src/include/distributed/shard_rebalancer.h new file mode 100644 index 000000000..7e0716cb5 --- /dev/null +++ b/src/include/distributed/shard_rebalancer.h @@ -0,0 +1,159 @@ +/*------------------------------------------------------------------------- + * + * shard_rebalancer.h + * + * Type and function declarations for the shard rebalancer tool. + * + * Copyright (c), Citus Data, Inc. + * + *------------------------------------------------------------------------- + */ + +#ifndef SHARD_REBALANCER_H +#define SHARD_REBALANCER_H + +#include "postgres.h" + +#include "fmgr.h" +#include "nodes/pg_list.h" +#include "distributed/coordinator_protocol.h" +#include "distributed/worker_manager.h" + + +/* Limits for function parameters */ +#define SHARD_REPLICATION_FACTOR_MINIMUM 1 +#define SHARD_REPLICATION_FACTOR_MAXIMUM 100 + +/* Definitions for metadata update commands */ +#define INSERT_SHARD_PLACEMENT_COMMAND "INSERT INTO pg_dist_shard_placement VALUES(" \ + UINT64_FORMAT ", %d, " UINT64_FORMAT ", '%s', %d)" +#define DELETE_SHARD_PLACEMENT_COMMAND "DELETE FROM pg_dist_shard_placement WHERE " \ + "shardid=" UINT64_FORMAT \ + " AND nodename='%s' AND nodeport=%d" + +/* + * Definitions for shard placement json field names. These names should match + * the column names in pg_dist_shard_placement. + */ +#define FIELD_NAME_SHARD_ID "shardid" +#define FIELD_NAME_SHARD_LENGTH "shardlength" +#define FIELD_NAME_SHARD_STATE "shardstate" +#define FIELD_NAME_NODE_NAME "nodename" +#define FIELD_NAME_NODE_PORT "nodeport" +#define FIELD_NAME_PLACEMENT_ID "placementid" + +/* + * Definitions for worker node json field names. These names should match the + * column names in master_get_active_worker_nodes(). + */ +#define FIELD_NAME_WORKER_NAME "node_name" +#define FIELD_NAME_WORKER_PORT "node_port" + +/* Definitions for placement update json field names */ +#define FIELD_NAME_UPDATE_TYPE "updatetype" +#define FIELD_NAME_SOURCE_NAME "sourcename" +#define FIELD_NAME_SOURCE_PORT "sourceport" +#define FIELD_NAME_TARGET_NAME "targetname" +#define FIELD_NAME_TARGET_PORT "targetport" + +/* *INDENT-OFF* */ +/* Definition for format of placement update json document */ +#define PLACEMENT_UPDATE_JSON_FORMAT \ +"{"\ + "\"" FIELD_NAME_UPDATE_TYPE "\":%d,"\ + "\"" FIELD_NAME_SHARD_ID "\":" UINT64_FORMAT ","\ + "\"" FIELD_NAME_SOURCE_NAME "\":%s,"\ + "\"" FIELD_NAME_SOURCE_PORT "\":%d,"\ + "\"" FIELD_NAME_TARGET_NAME "\":%s,"\ + "\"" FIELD_NAME_TARGET_PORT "\":%d"\ +"}" + +/* *INDENT-ON* */ + +#define REBALANCE_ACTIVITY_MAGIC_NUMBER 1337 +#define REBALANCE_PROGRESS_ERROR -1 +#define REBALANCE_PROGRESS_WAITING 0 +#define REBALANCE_PROGRESS_MOVING 1 +#define REBALANCE_PROGRESS_MOVED 2 + +/* Enumeration that defines different placement update types */ +typedef enum +{ + PLACEMENT_UPDATE_INVALID_FIRST = 0, + PLACEMENT_UPDATE_MOVE = 1, + PLACEMENT_UPDATE_COPY = 2 +} PlacementUpdateType; + + +/* + * PlacementUpdateEvent represents a logical unit of work that copies or + * moves a shard placement. 
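+ * + * For illustration only (hypothetical values): serialized with the + * PLACEMENT_UPDATE_JSON_FORMAT defined above, a move of shard 4 from hostname1:5432 to + * hostname2:5432 becomes {"updatetype":1,"shardid":4,"sourcename":"hostname1", + * "sourceport":5432,"targetname":"hostname2","targetport":5432}.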
+ */ +typedef struct PlacementUpdateEvent +{ + PlacementUpdateType updateType; + uint64 shardId; + WorkerNode *sourceNode; + WorkerNode *targetNode; +} PlacementUpdateEvent; + + +typedef struct PlacementUpdateEventProgress +{ + uint64 shardId; + char sourceName[255]; + int sourcePort; + char targetName[255]; + int targetPort; + uint64 shardSize; + uint64 progress; +} PlacementUpdateEventProgress; + +typedef struct NodeFillState +{ + WorkerNode *node; + float4 capacity; + float4 totalCost; + float4 utilization; + List *shardCostListDesc; +} NodeFillState; + +typedef struct ShardCost +{ + uint64 shardId; + float4 cost; +} ShardCost; + +typedef struct DisallowedPlacement +{ + ShardCost *shardCost; + NodeFillState *fillState; +} DisallowedPlacement; + +typedef struct RebalancePlanFunctions +{ + bool (*shardAllowedOnNode)(uint64 shardId, WorkerNode *workerNode, void *context); + float4 (*nodeCapacity)(WorkerNode *workerNode, void *context); + ShardCost (*shardCost)(uint64 shardId, void *context); + void *context; +} RebalancePlanFunctions; + +/* External function declarations */ +extern Datum shard_placement_rebalance_array(PG_FUNCTION_ARGS); +extern Datum shard_placement_replication_array(PG_FUNCTION_ARGS); +extern Datum worker_node_responsive(PG_FUNCTION_ARGS); +extern Datum update_shard_placement(PG_FUNCTION_ARGS); +extern Datum init_rebalance_monitor(PG_FUNCTION_ARGS); +extern Datum finalize_rebalance_monitor(PG_FUNCTION_ARGS); +extern Datum get_rebalance_progress(PG_FUNCTION_ARGS); + +extern List * RebalancePlacementUpdates(List *workerNodeList, List *shardPlacementList, + double threshold, + int32 maxShardMoves, + bool drainOnly, + RebalancePlanFunctions *rebalancePlanFunctions); +extern List * ReplicationPlacementUpdates(List *workerNodeList, List *shardPlacementList, + int shardReplicationFactor); + + +#endif /* SHARD_REBALANCER_H */ diff --git a/src/test/regress/Makefile b/src/test/regress/Makefile index c9c171afc..d259da950 100644 --- a/src/test/regress/Makefile +++ b/src/test/regress/Makefile @@ -42,7 +42,7 @@ output_files := $(patsubst $(citus_abs_srcdir)/output/%.source,expected/%.out, $ # intermediate, for muscle memory backward compatibility. 
check: check-full # check-full triggers all tests that ought to be run routinely -check-full: check-multi check-multi-mx check-worker check-follower-cluster check-failure +check-full: check-multi check-multi-mx check-worker check-operations check-follower-cluster check-failure ISOLATION_DEPDIR=.deps/isolation @@ -161,6 +161,10 @@ check-follower-cluster: all $(pg_regress_multi_check) --load-extension=citus --follower-cluster \ -- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/multi_follower_schedule $(EXTRA_TESTS) +check-operations: all + $(pg_regress_multi_check) --load-extension=citus \ + -- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/operations_schedule $(EXTRA_TESTS) + check-columnar: $(pg_regress_multi_check) --load-extension=citus \ -- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/columnar_am_schedule $(EXTRA_TESTS) diff --git a/src/test/regress/expected/foreign_key_to_reference_shard_rebalance.out b/src/test/regress/expected/foreign_key_to_reference_shard_rebalance.out new file mode 100644 index 000000000..7bffe0b6f --- /dev/null +++ b/src/test/regress/expected/foreign_key_to_reference_shard_rebalance.out @@ -0,0 +1,193 @@ +-- +-- FOREIGN_KEY_TO_REFERENCE_SHARD_REBALANCE +-- +SET citus.next_shard_id TO 15000000; +CREATE SCHEMA fkey_to_reference_shard_rebalance; +SET search_path to fkey_to_reference_shard_rebalance; +SET citus.shard_replication_factor TO 1; +SET citus.shard_count to 8; +CREATE TYPE foreign_details AS (name text, relid text, refd_relid text); +CREATE VIEW table_fkeys_in_workers AS +SELECT +(json_populate_record(NULL::foreign_details, + json_array_elements_text((run_command_on_workers( $$ + SELECT + COALESCE(json_agg(row_to_json(d)), '[]'::json) + FROM + ( + SELECT + distinct name, + relid::regclass::text, + refd_relid::regclass::text + FROM + table_fkey_cols + ) + d $$ )).RESULT::json )::json )).* ; +-- check if master_move_shard_placement with logical replication creates the +-- foreign constraints properly after moving the shard +CREATE TABLE referenced_table(test_column int, test_column2 int UNIQUE, PRIMARY KEY(test_column)); +CREATE TABLE referencing_table(id int PRIMARY KEY, ref_id int, FOREIGN KEY (id) REFERENCES referenced_table(test_column) ON DELETE CASCADE); +CREATE TABLE referencing_table2(id int, ref_id int, FOREIGN KEY (ref_id) REFERENCES referenced_table(test_column2) ON DELETE CASCADE, FOREIGN KEY (id) REFERENCES referencing_table(id) ON DELETE CASCADE); +SELECT create_reference_table('referenced_table'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('referencing_table', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('referencing_table2', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO referenced_table SELECT i,i FROM generate_series (0, 100) i; +INSERT INTO referencing_table SELECT i,i FROM generate_series (0, 100) i; +INSERT INTO referencing_table2 SELECT i,i FROM generate_series (0, 100) i; +SELECT master_move_shard_placement(15000009, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM referencing_table2; + count +--------------------------------------------------------------------- + 101 +(1 row) + +SELECT * 
FROM table_fkeys_in_workers WHERE relid LIKE 'fkey_to_reference_shard_rebalance.%' AND refd_relid LIKE 'fkey_to_reference_shard_rebalance.%' ORDER BY 1,2,3; + name | relid | refd_relid +--------------------------------------------------------------------- + referencing_table2_id_fkey_15000009 | fkey_to_reference_shard_rebalance.referencing_table2_15000009 | fkey_to_reference_shard_rebalance.referencing_table_15000001 + referencing_table2_id_fkey_15000010 | fkey_to_reference_shard_rebalance.referencing_table2_15000010 | fkey_to_reference_shard_rebalance.referencing_table_15000002 + referencing_table2_id_fkey_15000011 | fkey_to_reference_shard_rebalance.referencing_table2_15000011 | fkey_to_reference_shard_rebalance.referencing_table_15000003 + referencing_table2_id_fkey_15000012 | fkey_to_reference_shard_rebalance.referencing_table2_15000012 | fkey_to_reference_shard_rebalance.referencing_table_15000004 + referencing_table2_id_fkey_15000013 | fkey_to_reference_shard_rebalance.referencing_table2_15000013 | fkey_to_reference_shard_rebalance.referencing_table_15000005 + referencing_table2_id_fkey_15000014 | fkey_to_reference_shard_rebalance.referencing_table2_15000014 | fkey_to_reference_shard_rebalance.referencing_table_15000006 + referencing_table2_id_fkey_15000015 | fkey_to_reference_shard_rebalance.referencing_table2_15000015 | fkey_to_reference_shard_rebalance.referencing_table_15000007 + referencing_table2_id_fkey_15000016 | fkey_to_reference_shard_rebalance.referencing_table2_15000016 | fkey_to_reference_shard_rebalance.referencing_table_15000008 + referencing_table2_ref_id_fkey_15000009 | fkey_to_reference_shard_rebalance.referencing_table2_15000009 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000010 | fkey_to_reference_shard_rebalance.referencing_table2_15000010 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000011 | fkey_to_reference_shard_rebalance.referencing_table2_15000011 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000012 | fkey_to_reference_shard_rebalance.referencing_table2_15000012 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000013 | fkey_to_reference_shard_rebalance.referencing_table2_15000013 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000014 | fkey_to_reference_shard_rebalance.referencing_table2_15000014 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000015 | fkey_to_reference_shard_rebalance.referencing_table2_15000015 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000016 | fkey_to_reference_shard_rebalance.referencing_table2_15000016 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000001 | fkey_to_reference_shard_rebalance.referencing_table_15000001 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000002 | fkey_to_reference_shard_rebalance.referencing_table_15000002 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000003 | fkey_to_reference_shard_rebalance.referencing_table_15000003 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000004 | fkey_to_reference_shard_rebalance.referencing_table_15000004 | 
fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000005 | fkey_to_reference_shard_rebalance.referencing_table_15000005 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000006 | fkey_to_reference_shard_rebalance.referencing_table_15000006 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000007 | fkey_to_reference_shard_rebalance.referencing_table_15000007 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000008 | fkey_to_reference_shard_rebalance.referencing_table_15000008 | fkey_to_reference_shard_rebalance.referenced_table_15000000 +(24 rows) + +SELECT master_move_shard_placement(15000009, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'block_writes'); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM referencing_table2; + count +--------------------------------------------------------------------- + 101 +(1 row) + +SELECT * FROM table_fkeys_in_workers WHERE relid LIKE 'fkey_to_reference_shard_rebalance.%' AND refd_relid LIKE 'fkey_to_reference_shard_rebalance.%' ORDER BY 1,2,3; + name | relid | refd_relid +--------------------------------------------------------------------- + referencing_table2_id_fkey_15000009 | fkey_to_reference_shard_rebalance.referencing_table2_15000009 | fkey_to_reference_shard_rebalance.referencing_table_15000001 + referencing_table2_id_fkey_15000010 | fkey_to_reference_shard_rebalance.referencing_table2_15000010 | fkey_to_reference_shard_rebalance.referencing_table_15000002 + referencing_table2_id_fkey_15000011 | fkey_to_reference_shard_rebalance.referencing_table2_15000011 | fkey_to_reference_shard_rebalance.referencing_table_15000003 + referencing_table2_id_fkey_15000012 | fkey_to_reference_shard_rebalance.referencing_table2_15000012 | fkey_to_reference_shard_rebalance.referencing_table_15000004 + referencing_table2_id_fkey_15000013 | fkey_to_reference_shard_rebalance.referencing_table2_15000013 | fkey_to_reference_shard_rebalance.referencing_table_15000005 + referencing_table2_id_fkey_15000014 | fkey_to_reference_shard_rebalance.referencing_table2_15000014 | fkey_to_reference_shard_rebalance.referencing_table_15000006 + referencing_table2_id_fkey_15000015 | fkey_to_reference_shard_rebalance.referencing_table2_15000015 | fkey_to_reference_shard_rebalance.referencing_table_15000007 + referencing_table2_id_fkey_15000016 | fkey_to_reference_shard_rebalance.referencing_table2_15000016 | fkey_to_reference_shard_rebalance.referencing_table_15000008 + referencing_table2_ref_id_fkey_15000009 | fkey_to_reference_shard_rebalance.referencing_table2_15000009 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000010 | fkey_to_reference_shard_rebalance.referencing_table2_15000010 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000011 | fkey_to_reference_shard_rebalance.referencing_table2_15000011 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000012 | fkey_to_reference_shard_rebalance.referencing_table2_15000012 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000013 | fkey_to_reference_shard_rebalance.referencing_table2_15000013 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + 
referencing_table2_ref_id_fkey_15000014 | fkey_to_reference_shard_rebalance.referencing_table2_15000014 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000015 | fkey_to_reference_shard_rebalance.referencing_table2_15000015 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table2_ref_id_fkey_15000016 | fkey_to_reference_shard_rebalance.referencing_table2_15000016 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000001 | fkey_to_reference_shard_rebalance.referencing_table_15000001 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000002 | fkey_to_reference_shard_rebalance.referencing_table_15000002 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000003 | fkey_to_reference_shard_rebalance.referencing_table_15000003 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000004 | fkey_to_reference_shard_rebalance.referencing_table_15000004 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000005 | fkey_to_reference_shard_rebalance.referencing_table_15000005 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000006 | fkey_to_reference_shard_rebalance.referencing_table_15000006 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000007 | fkey_to_reference_shard_rebalance.referencing_table_15000007 | fkey_to_reference_shard_rebalance.referenced_table_15000000 + referencing_table_id_fkey_15000008 | fkey_to_reference_shard_rebalance.referencing_table_15000008 | fkey_to_reference_shard_rebalance.referenced_table_15000000 +(24 rows) + +-- create a function to show the +CREATE FUNCTION get_foreign_key_to_reference_table_commands(Oid) + RETURNS SETOF text + LANGUAGE C STABLE STRICT + AS 'citus', $$get_foreign_key_to_reference_table_commands$$; +CREATE TABLE reference_table_commands (id int UNIQUE); +CREATE TABLE referenceing_dist_table (id int, col1 int, col2 int, col3 int); +SELECT create_reference_table('reference_table_commands'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('referenceing_dist_table', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +ALTER TABLE referenceing_dist_table ADD CONSTRAINT c1 FOREIGN KEY (col1) REFERENCES reference_table_commands(id) ON UPDATE CASCADE; +ALTER TABLE referenceing_dist_table ADD CONSTRAINT c2 FOREIGN KEY (col2) REFERENCES reference_table_commands(id) ON UPDATE CASCADE NOT VALID; +ALTER TABLE referenceing_dist_table ADD CONSTRAINT very_very_very_very_very_very_very_very_very_very_very_very_very_long FOREIGN KEY (col3) REFERENCES reference_table_commands(id) ON UPDATE CASCADE; +NOTICE: identifier "very_very_very_very_very_very_very_very_very_very_very_very_very_long" will be truncated to "very_very_very_very_very_very_very_very_very_very_very_very_ver" +SELECT * FROM get_foreign_key_to_reference_table_commands('referenceing_dist_table'::regclass); + get_foreign_key_to_reference_table_commands +--------------------------------------------------------------------- + SELECT worker_apply_inter_shard_ddl_command (15000018, 'fkey_to_reference_shard_rebalance', 15000017, 'fkey_to_reference_shard_rebalance', 'ALTER TABLE 
fkey_to_reference_shard_rebalance.referenceing_dist_table ADD CONSTRAINT c1 FOREIGN KEY (col1) REFERENCES fkey_to_reference_shard_rebalance.reference_table_commands(id) ON UPDATE CASCADE NOT VALID') + UPDATE pg_constraint SET convalidated = true WHERE conrelid = 'fkey_to_reference_shard_rebalance.referenceing_dist_table_15000018'::regclass AND conname = 'c1_15000018' + SELECT worker_apply_inter_shard_ddl_command (15000018, 'fkey_to_reference_shard_rebalance', 15000017, 'fkey_to_reference_shard_rebalance', 'ALTER TABLE fkey_to_reference_shard_rebalance.referenceing_dist_table ADD CONSTRAINT c2 FOREIGN KEY (col2) REFERENCES fkey_to_reference_shard_rebalance.reference_table_commands(id) ON UPDATE CASCADE NOT VALID') + SELECT worker_apply_inter_shard_ddl_command (15000018, 'fkey_to_reference_shard_rebalance', 15000017, 'fkey_to_reference_shard_rebalance', 'ALTER TABLE fkey_to_reference_shard_rebalance.referenceing_dist_table ADD CONSTRAINT very_very_very_very_very_very_very_very_very_very_very_very_ver FOREIGN KEY (col3) REFERENCES fkey_to_reference_shard_rebalance.reference_table_commands(id) ON UPDATE CASCADE NOT VALID') + UPDATE pg_constraint SET convalidated = true WHERE conrelid = 'fkey_to_reference_shard_rebalance.referenceing_dist_table_15000018'::regclass AND conname = 'very_very_very_very_very_very_very_very_very__754e8716_15000018' +(5 rows) + +-- and show that rebalancer works fine +SELECT master_move_shard_placement(15000018, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +\c - - - :worker_2_port +SELECT conname, contype, convalidated FROM pg_constraint WHERE conrelid = 'fkey_to_reference_shard_rebalance.referenceing_dist_table_15000018'::regclass ORDER BY 1; + conname | contype | convalidated +--------------------------------------------------------------------- + c1_15000018 | f | t + c2_15000018 | f | f + very_very_very_very_very_very_very_very_very__754e8716_15000018 | f | t +(3 rows) + +\c - - - :master_port +DROP SCHEMA fkey_to_reference_shard_rebalance CASCADE; +NOTICE: drop cascades to 8 other objects +DETAIL: drop cascades to type fkey_to_reference_shard_rebalance.foreign_details +drop cascades to view fkey_to_reference_shard_rebalance.table_fkeys_in_workers +drop cascades to table fkey_to_reference_shard_rebalance.referenced_table +drop cascades to table fkey_to_reference_shard_rebalance.referencing_table +drop cascades to table fkey_to_reference_shard_rebalance.referencing_table2 +drop cascades to function fkey_to_reference_shard_rebalance.get_foreign_key_to_reference_table_commands(oid) +drop cascades to table fkey_to_reference_shard_rebalance.reference_table_commands +drop cascades to table fkey_to_reference_shard_rebalance.referenceing_dist_table diff --git a/src/test/regress/expected/isolation_blocking_move_multi_shard_commands.out b/src/test/regress/expected/isolation_blocking_move_multi_shard_commands.out new file mode 100644 index 000000000..4e09e34ad --- /dev/null +++ b/src/test/regress/expected/isolation_blocking_move_multi_shard_commands.out @@ -0,0 +1,305 @@ +Parsed test spec with 2 sessions + +starting permutation: s1-begin s2-begin s2-insert s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-move-placement: + SELECT 
master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 15 +172 172 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-begin s2-begin s2-upsert s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-upsert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172) ON CONFLICT (x) DO UPDATE SET y = logical_replicate_placement.y + 1; + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 16 +172 173 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-update s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-update: + UPDATE logical_replicate_placement SET y = y + 1; + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 16 +172 173 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-delete s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-delete: + DELETE FROM logical_replicate_placement; + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-select s1-move-placement s2-end s1-end s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-select: + SELECT * FROM logical_replicate_placement ORDER BY y; + +x y + +15 15 +172 172 +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-begin s2-begin s2-copy s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-copy: + COPY logical_replicate_placement FROM PROGRAM 'echo "1,1\n2,2\n3,3\n4,4\n5,5\n15,30"' WITH CSV; + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +1 1 +2 2 +3 3 +4 4 +5 5 +15 30 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-truncate s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-truncate: + TRUNCATE logical_replicate_placement; + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-begin s2-begin s2-alter-table s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-alter-table: + ALTER TABLE logical_replicate_placement ADD COLUMN z INT; + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y z + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 diff --git a/src/test/regress/expected/isolation_blocking_move_multi_shard_commands_on_mx.out b/src/test/regress/expected/isolation_blocking_move_multi_shard_commands_on_mx.out new file mode 100644 index 000000000..68cb2c1fb --- /dev/null +++ b/src/test/regress/expected/isolation_blocking_move_multi_shard_commands_on_mx.out @@ -0,0 +1,245 @@ +Parsed test spec with 2 sessions + +starting permutation: s1-begin s2-start-session-level-connection s2-begin-on-worker s2-insert s1-move-placement s2-commit-worker s1-commit s1-select s1-get-shard-distribution s2-stop-connection +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-insert: + SELECT run_commands_on_session_level_connection_to_node('INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172)'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 15 +172 172 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-update s1-move-placement s2-commit-worker s1-commit s1-select s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-update: + SELECT run_commands_on_session_level_connection_to_node('UPDATE logical_replicate_placement SET y = y + 1'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 16 +172 173 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-delete s1-move-placement s2-commit-worker s1-commit s1-select s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-delete: + SELECT run_commands_on_session_level_connection_to_node('DELETE FROM logical_replicate_placement'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-select s1-move-placement s2-commit-worker s1-commit s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-select: + SELECT run_commands_on_session_level_connection_to_node('SELECT * FROM logical_replicate_placement ORDER BY y'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + diff --git a/src/test/regress/expected/isolation_blocking_move_single_shard_commands.out b/src/test/regress/expected/isolation_blocking_move_single_shard_commands.out new file mode 100644 index 000000000..800e41aca --- /dev/null +++ b/src/test/regress/expected/isolation_blocking_move_single_shard_commands.out @@ -0,0 +1,223 @@ +Parsed test spec with 2 sessions + +starting permutation: s1-begin s2-begin s2-insert s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-end: + COMMIT; + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 15 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-begin s2-begin s2-upsert s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-upsert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + INSERT INTO logical_replicate_placement VALUES (15, 15) ON CONFLICT (x) DO UPDATE SET y = logical_replicate_placement.y + 1; + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 16 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-update s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-update: + UPDATE logical_replicate_placement SET y = y + 1 WHERE x = 15; + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 16 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-delete s1-move-placement s2-end s1-end s1-select s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-delete: + DELETE FROM logical_replicate_placement WHERE x = 15; + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-end: + COMMIT; + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-select s1-move-placement s2-end s1-end s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-select: + SELECT * FROM logical_replicate_placement ORDER BY y; + +x y + +15 15 +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-end: + COMMIT; + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 + +starting permutation: s1-insert s1-begin s2-begin s2-select-for-update s1-move-placement s2-end s1-end s1-get-shard-distribution +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-begin: + BEGIN; + +step s2-select-for-update: + SELECT * FROM logical_replicate_placement WHERE x=15 FOR UPDATE; + +x y + +15 15 +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-end: + COMMIT; + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-end: + COMMIT; + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 diff --git a/src/test/regress/expected/isolation_blocking_move_single_shard_commands_on_mx.out b/src/test/regress/expected/isolation_blocking_move_single_shard_commands_on_mx.out new file mode 100644 index 000000000..209275253 --- /dev/null +++ b/src/test/regress/expected/isolation_blocking_move_single_shard_commands_on_mx.out @@ -0,0 +1,300 @@ +Parsed test spec with 2 sessions + +starting permutation: s1-begin s2-start-session-level-connection s2-begin-on-worker s2-insert s1-move-placement s2-commit-worker s1-commit s1-select s1-get-shard-distribution s2-stop-connection +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-insert: + SELECT run_commands_on_session_level_connection_to_node('INSERT INTO logical_replicate_placement VALUES (15, 15)'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 15 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-update s1-move-placement s2-commit-worker s1-commit s1-select s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-update: + SELECT run_commands_on_session_level_connection_to_node('UPDATE logical_replicate_placement SET y = y + 1 WHERE x = 15'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +15 16 +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-delete s1-move-placement s2-commit-worker s1-commit s1-select s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-delete: + SELECT run_commands_on_session_level_connection_to_node('DELETE FROM logical_replicate_placement WHERE x = 15'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-select: + SELECT * FROM logical_replicate_placement order by y; + +x y + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-select s1-move-placement s2-commit-worker s1-commit s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-select: + SELECT run_commands_on_session_level_connection_to_node('SELECT * FROM logical_replicate_placement ORDER BY y'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... 
completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + + +starting permutation: s1-insert s1-begin s2-start-session-level-connection s2-begin-on-worker s2-select-for-update s1-move-placement s2-commit-worker s1-commit s1-get-shard-distribution s2-stop-connection +step s1-insert: + INSERT INTO logical_replicate_placement VALUES (15, 15); + +step s1-begin: + BEGIN; + +step s2-start-session-level-connection: + SELECT start_session_level_connection_to_node('localhost', 57638); + +start_session_level_connection_to_node + + +step s2-begin-on-worker: + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); + +run_commands_on_session_level_connection_to_node + + +step s2-select-for-update: + SELECT run_commands_on_session_level_connection_to_node('SELECT * FROM logical_replicate_placement WHERE x=15 FOR UPDATE'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); + +step s2-commit-worker: + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); + +run_commands_on_session_level_connection_to_node + + +step s1-move-placement: <... completed> +master_move_shard_placement + + +step s1-commit: + COMMIT; + +step s1-get-shard-distribution: + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; + +nodeport + +57638 +step s2-stop-connection: + SELECT stop_session_level_connection_to_node(); + +stop_session_level_connection_to_node + + +restore_isolation_tester_func + + diff --git a/src/test/regress/expected/isolation_shard_rebalancer.out b/src/test/regress/expected/isolation_shard_rebalancer.out new file mode 100644 index 000000000..4bb980230 --- /dev/null +++ b/src/test/regress/expected/isolation_shard_rebalancer.out @@ -0,0 +1,449 @@ +Parsed test spec with 2 sessions + +starting permutation: s1-rebalance-nc s2-rebalance-nc s1-commit +create_distributed_table + + +step s1-rebalance-nc: + BEGIN; + select rebalance_table_shards('non_colocated'); + +rebalance_table_shards + + +step s2-rebalance-nc: + select rebalance_table_shards('non_colocated'); + +ERROR: could not acquire the lock required to rebalance public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-nc s2-replicate-nc s1-commit +create_distributed_table + + +step s1-rebalance-nc: + BEGIN; + select rebalance_table_shards('non_colocated'); + +rebalance_table_shards + + +step s2-replicate-nc: + select replicate_table_shards('non_colocated'); + +ERROR: could not acquire the lock required to replicate public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-nc s2-rebalance-nc s1-commit +create_distributed_table + + +step s1-replicate-nc: + BEGIN; + select replicate_table_shards('non_colocated'); + +replicate_table_shards + + +step s2-rebalance-nc: + select rebalance_table_shards('non_colocated'); + 
+ERROR: could not acquire the lock required to rebalance public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-nc s2-replicate-nc s1-commit +create_distributed_table + + +step s1-replicate-nc: + BEGIN; + select replicate_table_shards('non_colocated'); + +replicate_table_shards + + +step s2-replicate-nc: + select replicate_table_shards('non_colocated'); + +ERROR: could not acquire the lock required to replicate public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-c1 s2-rebalance-c2 s1-commit +create_distributed_table + + +step s1-rebalance-c1: + BEGIN; + select rebalance_table_shards('colocated1'); + +rebalance_table_shards + + +step s2-rebalance-c2: + select rebalance_table_shards('colocated2'); + +ERROR: could not acquire the lock required to rebalance public.colocated2 +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-c1 s2-replicate-c2 s1-commit +create_distributed_table + + +step s1-rebalance-c1: + BEGIN; + select rebalance_table_shards('colocated1'); + +rebalance_table_shards + + +step s2-replicate-c2: + select replicate_table_shards('colocated2'); + +ERROR: could not acquire the lock required to replicate public.colocated2 +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-c1 s2-rebalance-c2 s1-commit +create_distributed_table + + +step s1-replicate-c1: + BEGIN; + select replicate_table_shards('colocated1'); + +replicate_table_shards + + +step s2-rebalance-c2: + select rebalance_table_shards('colocated2'); + +ERROR: could not acquire the lock required to rebalance public.colocated2 +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-c1 s2-replicate-c2 s1-commit +create_distributed_table + + +step s1-replicate-c1: + BEGIN; + select replicate_table_shards('colocated1'); + +replicate_table_shards + + +step s2-replicate-c2: + select replicate_table_shards('colocated2'); + +ERROR: could not acquire the lock required to replicate public.colocated2 +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-c1 s2-rebalance-nc s1-commit +create_distributed_table + + +step s1-rebalance-c1: + BEGIN; + select rebalance_table_shards('colocated1'); + +rebalance_table_shards + + +step s2-rebalance-nc: + select rebalance_table_shards('non_colocated'); + +rebalance_table_shards + + +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-c1 s2-replicate-nc s1-commit +create_distributed_table + + +step s1-rebalance-c1: + BEGIN; + select rebalance_table_shards('colocated1'); + +rebalance_table_shards + + +step s2-replicate-nc: + select replicate_table_shards('non_colocated'); + +replicate_table_shards + + +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-c1 s2-rebalance-nc s1-commit +create_distributed_table + + +step s1-replicate-c1: + BEGIN; + select replicate_table_shards('colocated1'); + +replicate_table_shards + + +step s2-rebalance-nc: + select rebalance_table_shards('non_colocated'); + +rebalance_table_shards + + +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-c1 s2-replicate-nc s1-commit +create_distributed_table + + +step s1-replicate-c1: + BEGIN; + select replicate_table_shards('colocated1'); + +replicate_table_shards + + +step s2-replicate-nc: + select 
replicate_table_shards('non_colocated'); + +replicate_table_shards + + +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-c1 s2-rebalance-all s1-commit +create_distributed_table + + +step s1-rebalance-c1: + BEGIN; + select rebalance_table_shards('colocated1'); + +rebalance_table_shards + + +step s2-rebalance-all: + select rebalance_table_shards(); + +ERROR: could not acquire the lock required to rebalance public.distributed_transaction_id_table +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-c1 s2-rebalance-all s1-commit +create_distributed_table + + +step s1-replicate-c1: + BEGIN; + select replicate_table_shards('colocated1'); + +replicate_table_shards + + +step s2-rebalance-all: + select rebalance_table_shards(); + +ERROR: could not acquire the lock required to rebalance public.distributed_transaction_id_table +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-nc s2-rebalance-all s1-commit +create_distributed_table + + +step s1-rebalance-nc: + BEGIN; + select rebalance_table_shards('non_colocated'); + +rebalance_table_shards + + +step s2-rebalance-all: + select rebalance_table_shards(); + +ERROR: could not acquire the lock required to rebalance public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-nc s2-rebalance-all s1-commit +create_distributed_table + + +step s1-replicate-nc: + BEGIN; + select replicate_table_shards('non_colocated'); + +replicate_table_shards + + +step s2-rebalance-all: + select rebalance_table_shards(); + +ERROR: could not acquire the lock required to rebalance public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-c1 s2-drain s1-commit +create_distributed_table + + +step s1-rebalance-c1: + BEGIN; + select rebalance_table_shards('colocated1'); + +rebalance_table_shards + + +step s2-drain: + select master_drain_node('localhost', 57638); + +ERROR: could not acquire the lock required to move public.distributed_transaction_id_table +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-c1 s2-drain s1-commit +create_distributed_table + + +step s1-replicate-c1: + BEGIN; + select replicate_table_shards('colocated1'); + +replicate_table_shards + + +step s2-drain: + select master_drain_node('localhost', 57638); + +ERROR: could not acquire the lock required to move public.distributed_transaction_id_table +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-rebalance-nc s2-drain s1-commit +create_distributed_table + + +step s1-rebalance-nc: + BEGIN; + select rebalance_table_shards('non_colocated'); + +rebalance_table_shards + + +step s2-drain: + select master_drain_node('localhost', 57638); + +ERROR: could not acquire the lock required to move public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + + +starting permutation: s1-replicate-nc s2-drain s1-commit +create_distributed_table + + +step s1-replicate-nc: + BEGIN; + select replicate_table_shards('non_colocated'); + +replicate_table_shards + + +step s2-drain: + select master_drain_node('localhost', 57638); + +ERROR: could not acquire the lock required to move public.non_colocated +step s1-commit: + COMMIT; + +master_set_node_property + + diff --git a/src/test/regress/expected/master_copy_shard_placement.out 
b/src/test/regress/expected/master_copy_shard_placement.out index f8e37fa39..65af97264 100644 --- a/src/test/regress/expected/master_copy_shard_placement.out +++ b/src/test/regress/expected/master_copy_shard_placement.out @@ -45,7 +45,8 @@ SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('data', 'key-1'), 'localhost', :worker_1_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); ERROR: could not find placement matching "localhost:xxxxx" HINT: Confirm the placement still exists and try again. -- verify we error out if source and destination are the same @@ -53,14 +54,16 @@ SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('data', 'key-1'), 'localhost', :worker_2_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); ERROR: shard xxxxx already exists in the target node -- verify we error out if target already contains a healthy placement SELECT master_copy_shard_placement( (SELECT shardid FROM pg_dist_shard WHERE logicalrelid='ref_table'::regclass::oid), 'localhost', :worker_1_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); ERROR: shard xxxxx already exists in the target node -- verify we error out if table has foreign key constraints INSERT INTO ref_table SELECT 1, value FROM data; @@ -70,16 +73,15 @@ SELECT master_copy_shard_placement( 'localhost', :worker_2_port, 'localhost', :worker_1_port, do_repair := false); -ERROR: cannot create foreign key constraint -DETAIL: This shard has foreign constraints on it. Citus currently supports foreign key constraints only for "citus.shard_replication_factor = 1". -HINT: Please change "citus.shard_replication_factor to 1". To learn more about using foreign keys with other replication factors, please contact us at https://citusdata.com/about/contact_us. +ERROR: cannot replicate shards with foreign keys ALTER TABLE data DROP CONSTRAINT distfk; -- replicate shard that contains key-1 SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('data', 'key-1'), 'localhost', :worker_2_port, 'localhost', :worker_1_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); master_copy_shard_placement --------------------------------------------------------------------- @@ -123,7 +125,8 @@ SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('mx_table', '1'), 'localhost', :worker_1_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); ERROR: Table 'mx_table' is streaming replicated. 
Shards of streaming replicated tables cannot be copied SELECT stop_metadata_sync_to_node('localhost', :worker_1_port); stop_metadata_sync_to_node diff --git a/src/test/regress/expected/multi_colocated_shard_rebalance.out b/src/test/regress/expected/multi_colocated_shard_rebalance.out new file mode 100644 index 000000000..70c4d8f20 --- /dev/null +++ b/src/test/regress/expected/multi_colocated_shard_rebalance.out @@ -0,0 +1,639 @@ +-- +-- MULTI_COLOCATED_SHARD_REBALANCE +-- +ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 13000000; +SET citus.shard_count TO 6; +SET citus.shard_replication_factor TO 1; +-- create distributed tables +CREATE TABLE table1_group1 ( id int PRIMARY KEY); +SELECT create_distributed_table('table1_group1', 'id', 'hash'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE table2_group1 ( id int ); +SELECT create_distributed_table('table2_group1', 'id', 'hash'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SET citus.shard_count TO 8; +CREATE TABLE table5_groupX ( id int ); +SELECT create_distributed_table('table5_groupX', 'id', 'hash'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE table6_append ( id int ); +SELECT master_create_distributed_table('table6_append', 'id', 'append'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT master_create_empty_shard('table6_append'); + master_create_empty_shard +--------------------------------------------------------------------- + 13000020 +(1 row) + +SELECT master_create_empty_shard('table6_append'); + master_create_empty_shard +--------------------------------------------------------------------- + 13000021 +(1 row) + +-- Mark tables as non-mx tables, in order to be able to test master_copy_shard_placement +UPDATE pg_dist_partition SET repmodel='c' WHERE logicalrelid IN + ('table1_group1'::regclass, 'table2_group1'::regclass, 'table5_groupX'::regclass); +-- test copy +-- test copying colocated shards +-- status before shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000000 | table1_group1 | 57637 + 13000001 | table1_group1 | 57638 + 13000002 | table1_group1 | 57637 + 13000003 | table1_group1 | 57638 + 13000004 | table1_group1 | 57637 + 13000005 | table1_group1 | 57638 + 13000006 | table2_group1 | 57637 + 13000007 | table2_group1 | 57638 + 13000008 | table2_group1 | 57637 + 13000009 | table2_group1 | 57638 + 13000010 | table2_group1 | 57637 + 13000011 | table2_group1 | 57638 +(12 rows) + +-- copy colocated shards +SELECT master_copy_shard_placement(13000000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false); + master_copy_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- status after shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = 
s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000000 | table1_group1 | 57637 + 13000000 | table1_group1 | 57638 + 13000001 | table1_group1 | 57638 + 13000002 | table1_group1 | 57637 + 13000003 | table1_group1 | 57638 + 13000004 | table1_group1 | 57637 + 13000005 | table1_group1 | 57638 + 13000006 | table2_group1 | 57637 + 13000006 | table2_group1 | 57638 + 13000007 | table2_group1 | 57638 + 13000008 | table2_group1 | 57637 + 13000009 | table2_group1 | 57638 + 13000010 | table2_group1 | 57637 + 13000011 | table2_group1 | 57638 +(14 rows) + +-- also connect worker to verify we successfully copied given shard (and other colocated shards) +\c - - - :worker_2_port +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table1_group1_13000000'::regclass; + Column | Type | Modifiers +--------------------------------------------------------------------- + id | integer | not null +(1 row) + +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table2_group1_13000006'::regclass; + Column | Type | Modifiers +--------------------------------------------------------------------- + id | integer | +(1 row) + +\c - - - :master_port +-- copy colocated shards again to see error message +SELECT master_copy_shard_placement(13000000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false, 'force_logical'); +ERROR: the force_logical transfer mode is currently unsupported +-- test copying NOT colocated shard +-- status before shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000012 | table5_groupx | 57637 + 13000013 | table5_groupx | 57638 + 13000014 | table5_groupx | 57637 + 13000015 | table5_groupx | 57638 + 13000016 | table5_groupx | 57637 + 13000017 | table5_groupx | 57638 + 13000018 | table5_groupx | 57637 + 13000019 | table5_groupx | 57638 +(8 rows) + +-- copy NOT colocated shard +SELECT master_copy_shard_placement(13000012, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false); + master_copy_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- status after shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000012 | table5_groupx | 57637 + 13000012 | table5_groupx | 57638 + 13000013 | table5_groupx | 57638 + 13000014 | table5_groupx | 57637 + 13000015 | table5_groupx | 57638 + 13000016 | table5_groupx | 57637 + 13000017 | table5_groupx | 57638 + 13000018 | table5_groupx | 57637 + 13000019 | table5_groupx | 57638 +(9 rows) + +-- test copying shard in append distributed table +-- status before shard copy +SELECT s.shardid, s.logicalrelid::regclass, 
sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000020 | table6_append | 57638 + 13000021 | table6_append | 57637 +(2 rows) + +-- copy shard in append distributed table +SELECT master_copy_shard_placement(13000020, 'localhost', :worker_2_port, 'localhost', :worker_1_port, false, 'force_logical'); +ERROR: the force_logical transfer mode is currently unsupported +-- status after shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000020 | table6_append | 57638 + 13000021 | table6_append | 57637 +(2 rows) + +-- test move +-- test moving colocated shards +-- status before shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000000 | table1_group1 | 57637 + 13000000 | table1_group1 | 57638 + 13000001 | table1_group1 | 57638 + 13000002 | table1_group1 | 57637 + 13000003 | table1_group1 | 57638 + 13000004 | table1_group1 | 57637 + 13000005 | table1_group1 | 57638 + 13000006 | table2_group1 | 57637 + 13000006 | table2_group1 | 57638 + 13000007 | table2_group1 | 57638 + 13000008 | table2_group1 | 57637 + 13000009 | table2_group1 | 57638 + 13000010 | table2_group1 | 57637 + 13000011 | table2_group1 | 57638 +(14 rows) + +-- try force_logical +SELECT master_move_shard_placement(13000001, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'force_logical'); +ERROR: the force_logical transfer mode is currently unsupported +-- move colocated shards +SELECT master_move_shard_placement(13000001, 'localhost', :worker_2_port, 'localhost', :worker_1_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- status after shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000000 | table1_group1 | 57637 + 13000000 | table1_group1 | 57638 + 13000001 | table1_group1 | 57637 + 13000002 | table1_group1 | 57637 + 13000003 | table1_group1 | 57638 + 13000004 | table1_group1 | 57637 + 13000005 | table1_group1 | 57638 + 13000006 | table2_group1 | 57637 + 13000006 | table2_group1 | 57638 + 13000007 | table2_group1 | 57637 + 13000008 | table2_group1 | 57637 + 13000009 | table2_group1 | 57638 + 
13000010 | table2_group1 | 57637 + 13000011 | table2_group1 | 57638 +(14 rows) + +-- also connect worker to verify we successfully moved given shard (and other colocated shards) +\c - - - :worker_1_port +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table1_group1_13000001'::regclass; + Column | Type | Modifiers +--------------------------------------------------------------------- + id | integer | not null +(1 row) + +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table2_group1_13000007'::regclass; + Column | Type | Modifiers +--------------------------------------------------------------------- + id | integer | +(1 row) + +\c - - - :master_port +-- test moving NOT colocated shard +-- status before shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000012 | table5_groupx | 57637 + 13000012 | table5_groupx | 57638 + 13000013 | table5_groupx | 57638 + 13000014 | table5_groupx | 57637 + 13000015 | table5_groupx | 57638 + 13000016 | table5_groupx | 57637 + 13000017 | table5_groupx | 57638 + 13000018 | table5_groupx | 57637 + 13000019 | table5_groupx | 57638 +(9 rows) + +-- move NOT colocated shard +SELECT master_move_shard_placement(13000013, 'localhost', :worker_2_port, 'localhost', :worker_1_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- status after shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000012 | table5_groupx | 57637 + 13000012 | table5_groupx | 57638 + 13000013 | table5_groupx | 57637 + 13000014 | table5_groupx | 57637 + 13000015 | table5_groupx | 57638 + 13000016 | table5_groupx | 57637 + 13000017 | table5_groupx | 57638 + 13000018 | table5_groupx | 57637 + 13000019 | table5_groupx | 57638 +(9 rows) + +-- test moving shard in append distributed table +-- status before shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000020 | table6_append | 57638 + 13000021 | table6_append | 57637 +(2 rows) + +-- move shard in append distributed table +SELECT master_move_shard_placement(13000021, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- status after shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 
'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000020 | table6_append | 57638 + 13000021 | table6_append | 57638 +(2 rows) + +-- try to move shard from wrong node +SELECT master_move_shard_placement(13000021, 'localhost', :worker_1_port, 'localhost', :worker_2_port); +ERROR: could not find placement matching "localhost:xxxxx" +HINT: Confirm the placement still exists and try again. +-- test shard move with foreign constraints +DROP TABLE IF EXISTS table1_group1, table2_group1; +SET citus.shard_count TO 6; +SET citus.shard_replication_factor TO 1; +-- create distributed tables +CREATE TABLE table1_group1 ( id int PRIMARY KEY); +SELECT create_distributed_table('table1_group1', 'id', 'hash'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE table2_group1 ( id int, table1_id int, FOREIGN KEY(table1_id) REFERENCES table1_group1(id)); +SELECT create_distributed_table('table2_group1', 'table1_id', 'hash'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Mark the tables as non-mx tables +UPDATE pg_dist_partition SET repmodel='c' WHERE logicalrelid IN + ('table1_group1'::regclass, 'table2_group1'::regclass); +-- status before shard rebalance +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000022 | table1_group1 | 57637 + 13000023 | table1_group1 | 57638 + 13000024 | table1_group1 | 57637 + 13000025 | table1_group1 | 57638 + 13000026 | table1_group1 | 57637 + 13000027 | table1_group1 | 57638 + 13000028 | table2_group1 | 57637 + 13000029 | table2_group1 | 57638 + 13000030 | table2_group1 | 57637 + 13000031 | table2_group1 | 57638 + 13000032 | table2_group1 | 57637 + 13000033 | table2_group1 | 57638 +(12 rows) + +SELECT master_move_shard_placement(13000022, 'localhost', :worker_1_port, 'localhost', :worker_2_port, 'block_writes'); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- status after shard rebalance +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + shardid | logicalrelid | nodeport +--------------------------------------------------------------------- + 13000022 | table1_group1 | 57638 + 13000023 | table1_group1 | 57638 + 13000024 | table1_group1 | 57637 + 13000025 | table1_group1 | 57638 + 13000026 | table1_group1 | 57637 + 13000027 | table1_group1 | 57638 + 13000028 | table2_group1 | 57638 + 13000029 | table2_group1 | 57638 + 13000030 | table2_group1 | 57637 + 13000031 | table2_group1 | 57638 + 13000032 | table2_group1 | 57637 + 13000033 | table2_group1 | 57638 +(12 rows) + +-- also connect worker to verify we successfully moved given shard (and other colocated 
shards) +\c - - - :worker_2_port +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table1_group1_13000022'::regclass; + Column | Type | Modifiers +--------------------------------------------------------------------- + id | integer | not null +(1 row) + +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table2_group1_13000028'::regclass; + Column | Type | Modifiers +--------------------------------------------------------------------- + id | integer | + table1_id | integer | +(2 rows) + +-- make sure that we've created the foreign keys +SELECT "Constraint", "Definition" FROM table_fkeys + WHERE "Constraint" LIKE 'table2_group%' OR "Constraint" LIKE 'table1_group%'; + Constraint | Definition +--------------------------------------------------------------------- + table2_group1_table1_id_fkey_13000028 | FOREIGN KEY (table1_id) REFERENCES table1_group1_13000022(id) + table2_group1_table1_id_fkey_13000029 | FOREIGN KEY (table1_id) REFERENCES table1_group1_13000023(id) + table2_group1_table1_id_fkey_13000031 | FOREIGN KEY (table1_id) REFERENCES table1_group1_13000025(id) + table2_group1_table1_id_fkey_13000033 | FOREIGN KEY (table1_id) REFERENCES table1_group1_13000027(id) +(4 rows) + +\c - - - :master_port +-- test shard copy with foreign constraints +-- we expect it to error out because we do not support foreign constraints with replication factor > 1 +SELECT master_copy_shard_placement(13000022, 'localhost', :worker_2_port, 'localhost', :worker_1_port, false); +ERROR: cannot replicate shards with foreign keys +-- lets also test that master_move_shard_placement doesn't break serials +CREATE TABLE serial_move_test (key int, other_val serial); +SET citus.shard_replication_factor TO 1; +SELECT create_distributed_table('serial_move_test', 'key'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- key 15 goes to shard xxxxx +INSERT INTO serial_move_test (key) VALUES (15) RETURNING *; + key | other_val +--------------------------------------------------------------------- + 15 | 1 +(1 row) + +INSERT INTO serial_move_test (key) VALUES (15) RETURNING *; + key | other_val +--------------------------------------------------------------------- + 15 | 2 +(1 row) + +-- confirm the shard id +SELECT * FROM run_command_on_placements('serial_move_test', 'SELECT DISTINCT key FROM %s WHERE key = 15') WHERE result = '15' AND shardid = 13000034; + nodename | nodeport | shardid | success | result +--------------------------------------------------------------------- + localhost | 57637 | 13000034 | t | 15 +(1 row) + +SELECT master_move_shard_placement(13000034, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- confirm the successfull move +SELECT * FROM run_command_on_placements('serial_move_test', 'SELECT DISTINCT key FROM %s WHERE key = 15') WHERE result = '15' AND shardid = 13000034; + nodename | nodeport | shardid | success | result +--------------------------------------------------------------------- + localhost | 57638 | 13000034 | t | 15 +(1 row) + +-- finally show that serials work fine afterwards +INSERT INTO serial_move_test (key) VALUES (15) RETURNING *; + key | other_val +--------------------------------------------------------------------- + 15 | 3 +(1 row) + +INSERT INTO serial_move_test (key) VALUES (15) RETURNING *; + key | other_val 
+--------------------------------------------------------------------- + 15 | 4 +(1 row) + +-- we should be able to move shard placements of partitioend tables +CREATE SCHEMA move_partitions; +CREATE TABLE move_partitions.events ( + id serial, + t timestamptz default now(), + payload text +) +PARTITION BY RANGE(t); +SET citus.shard_count TO 6; +SELECT create_distributed_table('move_partitions.events', 'id', colocate_with := 'none'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE move_partitions.events_1 PARTITION OF move_partitions.events +FOR VALUES FROM ('2015-01-01') TO ('2016-01-01'); +INSERT INTO move_partitions.events (t, payload) +SELECT '2015-01-01'::date + (interval '1 day' * s), s FROM generate_series(1, 100) s; +SELECT count(*) FROM move_partitions.events; + count +--------------------------------------------------------------------- + 100 +(1 row) + +-- try to move automatically +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port) +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'move_partitions.events'::regclass AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM move_partitions.events; + count +--------------------------------------------------------------------- + 100 +(1 row) + +-- add a primary key to the partition +ALTER TABLE move_partitions.events_1 ADD CONSTRAINT e_1_pk PRIMARY KEY (id); +-- should be able to move automatically now +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port) +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'move_partitions.events'::regclass AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM move_partitions.events; + count +--------------------------------------------------------------------- + 100 +(1 row) + +-- should also be able to move with block writes +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'block_writes') +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'move_partitions.events'::regclass AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM move_partitions.events; + count +--------------------------------------------------------------------- + 100 +(1 row) + +-- should have moved all shards to node 1 (2*6 = 12) +SELECT count(*) +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid::text LIKE 'move_partitions.events%' AND nodeport = :worker_1_port; + count +--------------------------------------------------------------------- + 12 +(1 row) + +DROP TABLE move_partitions.events; diff --git a/src/test/regress/expected/multi_move_mx.out b/src/test/regress/expected/multi_move_mx.out new file mode 100644 index 000000000..3d67c97de --- /dev/null +++ b/src/test/regress/expected/multi_move_mx.out @@ -0,0 +1,235 @@ +-- +-- MULTI_MOVE_MX +-- +ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1550000; +SELECT start_metadata_sync_to_node('localhost', 
:worker_2_port); + start_metadata_sync_to_node +--------------------------------------------------------------------- + +(1 row) + +-- Create mx test tables +SET citus.shard_count TO 4; +SET citus.shard_replication_factor TO 1; +SET citus.replication_model TO 'streaming'; +CREATE TABLE mx_table_1 (a int); +SELECT create_distributed_table('mx_table_1', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE mx_table_2 (a int); +SELECT create_distributed_table('mx_table_2', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE mx_table_3 (a text); +SELECT create_distributed_table('mx_table_3', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Check that the first two tables are colocated +SELECT + logicalrelid, repmodel +FROM + pg_dist_partition +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid; + logicalrelid | repmodel +--------------------------------------------------------------------- + mx_table_1 | s + mx_table_2 | s + mx_table_3 | s +(3 rows) + +-- Check the list of shards +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + logicalrelid | shardid | nodename | nodeport +--------------------------------------------------------------------- + mx_table_1 | 1550000 | localhost | 57637 + mx_table_1 | 1550001 | localhost | 57638 + mx_table_1 | 1550002 | localhost | 57637 + mx_table_1 | 1550003 | localhost | 57638 + mx_table_2 | 1550004 | localhost | 57637 + mx_table_2 | 1550005 | localhost | 57638 + mx_table_2 | 1550006 | localhost | 57637 + mx_table_2 | 1550007 | localhost | 57638 + mx_table_3 | 1550008 | localhost | 57637 + mx_table_3 | 1550009 | localhost | 57638 + mx_table_3 | 1550010 | localhost | 57637 + mx_table_3 | 1550011 | localhost | 57638 +(12 rows) + +-- Check the data on the worker +\c - - - :worker_2_port +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + logicalrelid | shardid | nodename | nodeport +--------------------------------------------------------------------- + mx_table_1 | 1550000 | localhost | 57637 + mx_table_1 | 1550001 | localhost | 57638 + mx_table_1 | 1550002 | localhost | 57637 + mx_table_1 | 1550003 | localhost | 57638 + mx_table_2 | 1550004 | localhost | 57637 + mx_table_2 | 1550005 | localhost | 57638 + mx_table_2 | 1550006 | localhost | 57637 + mx_table_2 | 1550007 | localhost | 57638 + mx_table_3 | 1550008 | localhost | 57637 + mx_table_3 | 1550009 | localhost | 57638 + mx_table_3 | 1550010 | localhost | 57637 + mx_table_3 | 1550011 | localhost | 57638 +(12 rows) + +\c - - - :master_port +-- Check that master_copy_shard_placement cannot be run with MX tables +SELECT + master_copy_shard_placement(shardid, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false, 'force_logical') +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 
'mx_table_1'::regclass + AND nodeport = :worker_1_port +ORDER BY + shardid +LIMIT 1; +ERROR: the force_logical transfer mode is currently unsupported +-- Move a shard from worker 1 to worker 2 +SELECT + master_move_shard_placement(shardid, 'localhost', :worker_1_port, 'localhost', :worker_2_port) +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_1_port +ORDER BY + shardid +LIMIT 1; + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- Check that the shard and its colocated shard is moved, but not the other shards +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + logicalrelid | shardid | nodename | nodeport +--------------------------------------------------------------------- + mx_table_1 | 1550000 | localhost | 57638 + mx_table_1 | 1550001 | localhost | 57638 + mx_table_1 | 1550002 | localhost | 57637 + mx_table_1 | 1550003 | localhost | 57638 + mx_table_2 | 1550004 | localhost | 57638 + mx_table_2 | 1550005 | localhost | 57638 + mx_table_2 | 1550006 | localhost | 57637 + mx_table_2 | 1550007 | localhost | 57638 + mx_table_3 | 1550008 | localhost | 57637 + mx_table_3 | 1550009 | localhost | 57638 + mx_table_3 | 1550010 | localhost | 57637 + mx_table_3 | 1550011 | localhost | 57638 +(12 rows) + +-- Check that the changes are made in the worker as well +\c - - - :worker_2_port +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + logicalrelid | shardid | nodename | nodeport +--------------------------------------------------------------------- + mx_table_1 | 1550000 | localhost | 57638 + mx_table_1 | 1550001 | localhost | 57638 + mx_table_1 | 1550002 | localhost | 57637 + mx_table_1 | 1550003 | localhost | 57638 + mx_table_2 | 1550004 | localhost | 57638 + mx_table_2 | 1550005 | localhost | 57638 + mx_table_2 | 1550006 | localhost | 57637 + mx_table_2 | 1550007 | localhost | 57638 + mx_table_3 | 1550008 | localhost | 57637 + mx_table_3 | 1550009 | localhost | 57638 + mx_table_3 | 1550010 | localhost | 57637 + mx_table_3 | 1550011 | localhost | 57638 +(12 rows) + +-- Check that the UDFs cannot be called from the workers +SELECT + master_copy_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, false, 'force_logical') +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_2_port +ORDER BY + shardid +LIMIT 1 OFFSET 1; +ERROR: operation is not allowed on this node +HINT: Connect to the coordinator and run it again. +SELECT + master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'force_logical') +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_2_port +ORDER BY + shardid +LIMIT 1 OFFSET 1; +ERROR: operation is not allowed on this node +HINT: Connect to the coordinator and run it again. 
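The two errors above show the transfer UDFs being rejected on a worker node; for contrast, here is a minimal sketch (not part of the expected output file, and assuming the usual regression ports 57637/57638 and the mx_table_1 table created above) of issuing the same kind of move from the coordinator, where it is permitted:
\c - - - :master_port
-- pick one placement of mx_table_1 currently on worker 2 and move it to worker 1
SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port,
                                   'localhost', :worker_1_port)
FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement
WHERE logicalrelid = 'mx_table_1'::regclass AND nodeport = :worker_2_port
ORDER BY shardid
LIMIT 1;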
+-- Cleanup +\c - - - :master_port +DROP TABLE mx_table_1; +DROP TABLE mx_table_2; +DROP TABLE mx_table_3; +SELECT stop_metadata_sync_to_node('localhost', :worker_2_port); + stop_metadata_sync_to_node +--------------------------------------------------------------------- + +(1 row) + +\c - - - :worker_2_port +DELETE FROM pg_dist_node; +DELETE FROM pg_dist_partition; +DELETE FROM pg_dist_shard; +DELETE FROM pg_dist_shard_placement; +\c - - - :master_port +RESET citus.replication_model; diff --git a/src/test/regress/expected/multi_test_helpers_superuser.out b/src/test/regress/expected/multi_test_helpers_superuser.out index b631814f8..eca214309 100644 --- a/src/test/regress/expected/multi_test_helpers_superuser.out +++ b/src/test/regress/expected/multi_test_helpers_superuser.out @@ -1,3 +1,9 @@ +CREATE OR REPLACE FUNCTION master_defer_delete_shards() + RETURNS int + LANGUAGE C STRICT + AS 'citus', $$master_defer_delete_shards$$; +COMMENT ON FUNCTION master_defer_delete_shards() + IS 'remove orphaned shards'; CREATE OR REPLACE FUNCTION wait_until_metadata_sync(timeout INTEGER DEFAULT 15000) RETURNS void LANGUAGE C STRICT diff --git a/src/test/regress/expected/multi_utility_warnings.out b/src/test/regress/expected/multi_utility_warnings.out index 6a417ef96..76d5a8325 100644 --- a/src/test/regress/expected/multi_utility_warnings.out +++ b/src/test/regress/expected/multi_utility_warnings.out @@ -25,7 +25,3 @@ ERROR: cannot write to pg_dist_poolinfo DETAIL: Citus Community Edition does not support the use of pooler options. HINT: To learn more about using advanced pooling schemes with Citus, please contact us at https://citusdata.com/about/contact_us ROLLBACK; -INSERT INTO pg_dist_rebalance_strategy VALUES ('should fail', false, 'citus_shard_cost_1', 'citus_node_capacity_1', 'citus_shard_allowed_on_node_true', 0, 0); -ERROR: cannot write to pg_dist_rebalance_strategy -DETAIL: Citus Community Edition does not support the use of custom rebalance strategies. 
-HINT: To learn more about using advanced rebalancing schemes with Citus, please contact us at https://citusdata.com/about/contact_us diff --git a/src/test/regress/expected/shard_move_deferred_delete.out b/src/test/regress/expected/shard_move_deferred_delete.out new file mode 100644 index 000000000..ddb2be9d8 --- /dev/null +++ b/src/test/regress/expected/shard_move_deferred_delete.out @@ -0,0 +1,111 @@ +-- +-- SHARD_MOVE_DEFERRED_DELETE +-- +SET citus.next_shard_id TO 20000000; +SET citus.shard_count TO 6; +SET citus.shard_replication_factor TO 1; +SET citus.defer_drop_after_shard_move TO on; +CREATE SCHEMA shard_move_deferred_delete; +SET search_path TO shard_move_deferred_delete; +CREATE TABLE t1 ( id int PRIMARY KEY); +SELECT create_distributed_table('t1', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- by counting how ofter we see the specific shard on all workers we can verify is the shard is there +SELECT run_command_on_workers($cmd$ + SELECT count(*) FROM pg_class WHERE relname = 't1_20000000'; +$cmd$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,1) + (localhost,57638,t,0) +(2 rows) + +-- move shard +SELECT master_move_shard_placement(20000000, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- we expect the shard to be on both workers now +SELECT run_command_on_workers($cmd$ + SELECT count(*) FROM pg_class WHERE relname = 't1_20000000'; +$cmd$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,1) + (localhost,57638,t,1) +(2 rows) + +-- execute delayed removal +SELECT public.master_defer_delete_shards(); + master_defer_delete_shards +--------------------------------------------------------------------- + 1 +(1 row) + +-- we expect the shard to be on only the second worker +SELECT run_command_on_workers($cmd$ + SELECT count(*) FROM pg_class WHERE relname = 't1_20000000'; +$cmd$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,0) + (localhost,57638,t,1) +(2 rows) + +SELECT master_move_shard_placement(20000000, 'localhost', :worker_2_port, 'localhost', :worker_1_port); + master_move_shard_placement +--------------------------------------------------------------------- + +(1 row) + +-- we expect the shard to be on both workers now +SELECT run_command_on_workers($cmd$ + SELECT count(*) FROM pg_class WHERE relname = 't1_20000000'; +$cmd$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,1) + (localhost,57638,t,1) +(2 rows) + +-- enable auto delete +ALTER SYSTEM SET citus.defer_shard_delete_interval TO 10; +SELECT pg_reload_conf(); + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + +-- Sleep 1 second to give Valgrind enough time to clear transactions +SELECT pg_sleep(1); + pg_sleep +--------------------------------------------------------------------- + +(1 row) + +-- we expect the shard to be on only the first worker +SELECT run_command_on_workers($cmd$ + SELECT count(*) FROM pg_class WHERE relname = 't1_20000000'; +$cmd$); + run_command_on_workers +--------------------------------------------------------------------- + (localhost,57637,t,1) + (localhost,57638,t,0) 
+(2 rows) + +-- reset test suite +ALTER SYSTEM SET citus.defer_shard_delete_interval TO -1; +SELECT pg_reload_conf(); + pg_reload_conf +--------------------------------------------------------------------- + t +(1 row) + +DROP SCHEMA shard_move_deferred_delete CASCADE; +NOTICE: drop cascades to table t1 diff --git a/src/test/regress/expected/shard_rebalancer.out b/src/test/regress/expected/shard_rebalancer.out new file mode 100644 index 000000000..b86713993 --- /dev/null +++ b/src/test/regress/expected/shard_rebalancer.out @@ -0,0 +1,2116 @@ +-- +-- MUTLI_SHARD_REBALANCER +-- +CREATE TABLE dist_table_test(a int primary key); +SELECT create_distributed_table('dist_table_test', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE ref_table_test(a int primary key); +SELECT create_reference_table('ref_table_test'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +-- make sure that all rebalance operations works fine when +-- reference tables are replicated to the coordinator +SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0); +NOTICE: Replicating reference table "ref_table_test" to the node localhost:xxxxx + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +-- should just be noops even if we add the coordinator to the pg_dist_node +SELECT rebalance_table_shards('dist_table_test'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT rebalance_table_shards(); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- test that calling rebalance_table_shards without specifying relation +-- wouldn't move shard of the citus local table. 
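A citus local table keeps its single shard on the coordinator (group 0), which is why the rebalancer calls below are expected to leave it untouched. As an illustrative sketch only (not part of the regression output), the placement of that shard can be read from the metadata catalogs once the table below exists:

-- where does the citus local table's shard live? (the coordinator has groupid 0)
SELECT s.shardid, n.nodename, n.nodeport, n.groupid
FROM pg_dist_shard s
JOIN pg_dist_placement p USING (shardid)
JOIN pg_dist_node n ON n.groupid = p.groupid
WHERE s.logicalrelid = 'citus_local_table'::regclass;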
+CREATE TABLE citus_local_table(a int, b int); +SELECT create_citus_local_table('citus_local_table'); + create_citus_local_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO citus_local_table VALUES (1, 2); +SELECT rebalance_table_shards(); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- show that citus local table shard is still on the coordinator +SELECT tablename FROM pg_catalog.pg_tables where tablename like 'citus_local_table_%'; + tablename +--------------------------------------------------------------------- + citus_local_table_102047 +(1 row) + +-- also check that we still can access shard relation, not the shell table +SELECT count(*) FROM citus_local_table; + count +--------------------------------------------------------------------- + 1 +(1 row) + +SELECT master_drain_node('localhost', :master_port); + master_drain_node +--------------------------------------------------------------------- + +(1 row) + +-- show that citus local table shard is still on the coordinator +SELECT tablename FROM pg_catalog.pg_tables where tablename like 'citus_local_table_%'; + tablename +--------------------------------------------------------------------- + citus_local_table_102047 +(1 row) + +-- also check that we still can access shard relation, not the shell table +SELECT count(*) FROM citus_local_table; + count +--------------------------------------------------------------------- + 1 +(1 row) + +-- show that we do not create a shard rebalancing plan for citus local table +SELECT get_rebalance_table_shards_plan(); + get_rebalance_table_shards_plan +--------------------------------------------------------------------- +(0 rows) + +DROP TABLE citus_local_table; +CREATE TABLE dist_table_test_2(a int); +SET citus.shard_count TO 4; +SET citus.shard_replication_factor TO 1; +SET citus.replication_model TO "statement"; +SELECT create_distributed_table('dist_table_test_2', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- replicate reference table should ignore the coordinator +SET citus.shard_replication_factor TO 2; +SELECT replicate_table_shards('dist_table_test_2', max_shard_copies := 4, shard_transfer_mode:='block_writes'); +NOTICE: Copying shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Copying shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Copying shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Copying shard xxxxx from localhost:xxxxx to localhost:xxxxx ... + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +DROP TABLE dist_table_test, dist_table_test_2, ref_table_test; +RESET citus.shard_count; +RESET citus.shard_replication_factor; +RESET citus.replication_model; +-- Create a user to test multiuser usage of rebalancer functions +CREATE USER testrole; +NOTICE: not propagating CREATE ROLE/USER commands to worker nodes +HINT: Connect to worker nodes directly to manually create all necessary users and roles. 
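CREATE ROLE/USER is not propagated to the workers here, so the GRANT that follows fails on them; the test relies on that, since testrole is later used under SET ROLE to check the behaviour when the role cannot connect to the workers. Purely as a sketch of what the HINT suggests (the test deliberately does not do this), the role could be created on the workers through the existing helper:

SELECT run_command_on_workers($cmd$ CREATE USER testrole; $cmd$);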
+GRANT ALL ON SCHEMA public TO testrole; +ERROR: role "testrole" does not exist +CONTEXT: while executing command on localhost:xxxxx +CREATE OR REPLACE FUNCTION shard_placement_rebalance_array( + worker_node_list json[], + shard_placement_list json[], + threshold float4 DEFAULT 0, + max_shard_moves int DEFAULT 1000000, + drain_only bool DEFAULT false +) +RETURNS json[] +AS 'citus' +LANGUAGE C STRICT VOLATILE; +CREATE FUNCTION shard_placement_replication_array(worker_node_list json[], + shard_placement_list json[], + shard_replication_factor int) +RETURNS json[] +AS 'citus' +LANGUAGE C STRICT VOLATILE; +CREATE FUNCTION worker_node_responsive(worker_node_name text, worker_node_port int) +RETURNS boolean +AS 'citus' +LANGUAGE C STRICT VOLATILE; +SET citus.next_shard_id TO 123000; +SELECT worker_node_responsive(node_name, node_port::int) + FROM master_get_active_worker_nodes() + ORDER BY node_name, node_port ASC; + worker_node_responsive +--------------------------------------------------------------------- + t + t +(2 rows) + +-- Check that worker_node_responsive returns false for dead nodes +-- Note that PostgreSQL tries all possible resolutions of localhost on failing +-- connections. This causes different error details to be printed on different +-- environments. Therefore, we first set verbosity to terse. +\set VERBOSITY terse +SELECT worker_node_responsive('localhost', 1); + worker_node_responsive +--------------------------------------------------------------------- + f +(1 row) + +\set VERBOSITY default +-- Check that with threshold=0.0 shard_placement_rebalance_array returns enough +-- moves to make the cluster completely balanced. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[], + 0.0 +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(2 rows) + +-- Check that with two nodes and threshold=1.0 shard_placement_rebalance_array +-- doesn't return any moves, even if it is completely unbalanced. 
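The threshold can be read as the allowed relative deviation from the average node utilization: with three placements spread over two nodes the average is 1.5, and a threshold of 1.0 tolerates up to twice that, i.e. three placements on a single node, so no moves are produced. A trivial arithmetic sketch of that bound (illustrative only):

SELECT 3 / 2.0             AS average_utilization,
       3 / 2.0 * (1 + 1.0) AS allowed_upper_bound; -- a node holding all 3 placements stays within the bound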
+SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 1.0 +)); + unnest +--------------------------------------------------------------------- +(0 rows) + +-- Check that with three nodes and threshold=1.0 +-- shard_placement_rebalance_array returns moves when it is completely unbalanced +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 1.0 +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(1 row) + +-- Check that with with three nodes and threshold=2.0 +-- shard_placement_rebalance_array doesn't return any moves, even if it is +-- completely unbalanced. (with three nodes) +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 2.0 +)); + unnest +--------------------------------------------------------------------- +(0 rows) + +-- Check that with threshold=0.0 shard_placement_rebalance_array doesn't return +-- any moves if the cluster is already balanced. 
+SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[], + 0.0 +)); + unnest +--------------------------------------------------------------------- +(0 rows) + +-- Check that shard_placement_replication_array returns a shard copy operation +-- for each of the shards in an inactive node. +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":4, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}']::json[], + 2 +)); + unnest +--------------------------------------------------------------------- + {"updatetype":2,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":2,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(2 rows) + +-- Check that shard_placement_replication_array returns a shard copy operation +-- for each of the inactive shards. +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":3, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":1, "shardstate":3, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[], + 2 +)); + unnest +--------------------------------------------------------------------- + {"updatetype":2,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":2,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(2 rows) + +-- Check that shard_placement_replication_array errors out if all placements of +-- a shard are placed on inactive nodes. 
+SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}']::json[], + 2 +)); +ERROR: could not find a source for shard xxxxx +-- Check that shard_placement_replication_array errors out if replication factor +-- is more than number of active nodes. +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 2 +)); +ERROR: could not find a target for shard xxxxx +-- Ensure that shard_replication_factor is 2 during replicate_table_shards +-- and rebalance_table_shards tests +SET citus.shard_replication_factor TO 2; +-- Turn off NOTICE messages +SET client_min_messages TO WARNING; +-- Create a single-row test data for shard rebalancer test shards +CREATE TABLE shard_rebalancer_test_data AS SELECT 1::int as int_column; +-- Test replicate_table_shards, which will in turn test update_shard_placement +-- in copy mode. +CREATE TABLE replication_test_table(int_column int); +SELECT master_create_distributed_table('replication_test_table', 'int_column', 'append'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE VIEW replication_test_table_placements_per_node AS + SELECT count(*) FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass + GROUP BY nodename, nodeport + ORDER BY nodename, nodeport; +-- Create four shards with replication factor 2, and delete the placements +-- with smaller port number to simulate under-replicated shards. 
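Once those placements are deleted, each affected shard is left with fewer copies than the desired factor of 2. A small catalog query in the same style as the views used in this file can list such under-replicated shards (a sketch, not part of the test itself):

-- shards of replication_test_table with fewer than 2 placements
SELECT shardid, count(*) AS placement_count
FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement
WHERE logicalrelid = 'replication_test_table'::regclass
GROUP BY shardid
HAVING count(*) < 2;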
+SELECT count(master_create_empty_shard('replication_test_table')) + FROM generate_series(1, 4); + count +--------------------------------------------------------------------- + 4 +(1 row) + +DELETE FROM pg_dist_shard_placement WHERE placementid in ( + SELECT pg_dist_shard_placement.placementid + FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass + AND (nodename, nodeport) = (SELECT nodename, nodeport FROM pg_dist_shard_placement + ORDER BY nodename, nodeport limit 1) +); +-- Upload the test data to the shards +SELECT count(master_append_table_to_shard(shardid, 'shard_rebalancer_test_data', + host(inet_server_addr()), inet_server_port())) + FROM pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass; + count +--------------------------------------------------------------------- + 4 +(1 row) + +-- Verify that there is one node with all placements +SELECT * FROM replication_test_table_placements_per_node; + count +--------------------------------------------------------------------- + 4 +(1 row) + +-- Check excluded_shard_list by excluding three shards with smaller ids +SELECT replicate_table_shards('replication_test_table', + excluded_shard_list := excluded_shard_list, + shard_transfer_mode:='block_writes') + FROM ( + SELECT (array_agg(DISTINCT shardid ORDER BY shardid))[1:3] AS excluded_shard_list + FROM pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass + ) T; + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM replication_test_table_placements_per_node; + count +--------------------------------------------------------------------- + 1 + 4 +(2 rows) + +-- Check that with shard_replication_factor=1 we don't do any copies +SELECT replicate_table_shards('replication_test_table', + shard_replication_factor := 1, + shard_transfer_mode:='block_writes'); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM replication_test_table_placements_per_node; + count +--------------------------------------------------------------------- + 1 + 4 +(2 rows) + +-- Check that max_shard_copies limits number of copy operations +SELECT replicate_table_shards('replication_test_table', + max_shard_copies := 2, + shard_transfer_mode:='block_writes'); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM replication_test_table_placements_per_node; + count +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +-- Replicate the remaining under-replicated shards +SELECT replicate_table_shards('replication_test_table'); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM replication_test_table_placements_per_node; + count +--------------------------------------------------------------------- + 4 + 4 +(2 rows) + +-- Check that querying the table doesn't error out +SELECT count(*) FROM replication_test_table; + count +--------------------------------------------------------------------- + 4 +(1 row) + +DROP TABLE public.replication_test_table CASCADE; +-- Test rebalance_table_shards, which will in turn test update_shard_placement +-- in move mode. 
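Move mode is driven by the same planner whose output is inspectable up front, so the pending moves can be previewed before they are executed. A minimal sketch of that preview-then-execute pattern, assuming the rebalance_test_table created just below:

SELECT * FROM get_rebalance_table_shards_plan('rebalance_test_table', threshold := 0);
SELECT rebalance_table_shards('rebalance_test_table', threshold := 0,
                              shard_transfer_mode := 'block_writes');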
+CREATE TABLE rebalance_test_table(int_column int); +SELECT master_create_distributed_table('rebalance_test_table', 'int_column', 'append'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE VIEW table_placements_per_node AS +SELECT nodeport, logicalrelid::regclass, count(*) +FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard +GROUP BY logicalrelid::regclass, nodename, nodeport +ORDER BY logicalrelid::regclass, nodename, nodeport; +-- Create six shards with replication factor 1 and move them to the same +-- node to create an unbalanced cluster. +CREATE PROCEDURE create_unbalanced_shards(rel text) +LANGUAGE SQL +AS $$ + SET citus.shard_replication_factor TO 1; + + SELECT count(master_create_empty_shard(rel)) + FROM generate_series(1, 6); + + SELECT count(master_move_shard_placement(shardid, + src.nodename, src.nodeport::int, + dst.nodename, dst.nodeport::int, + shard_transfer_mode:='block_writes')) + FROM pg_dist_shard s JOIN + pg_dist_shard_placement src USING (shardid), + (SELECT nodename, nodeport FROM pg_dist_shard_placement ORDER BY nodeport DESC LIMIT 1) dst + WHERE src.nodeport < dst.nodeport AND s.logicalrelid = rel::regclass; +$$; +CALL create_unbalanced_shards('rebalance_test_table'); +SET citus.shard_replication_factor TO 2; +-- Upload the test data to the shards +SELECT count(master_append_table_to_shard(shardid, 'shard_rebalancer_test_data', + host(inet_server_addr()), inet_server_port())) +FROM pg_dist_shard +WHERE logicalrelid = 'rebalance_test_table'::regclass; + count +--------------------------------------------------------------------- + 6 +(1 row) + +-- Verify that there is one node with all placements +SELECT * FROM table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57638 | rebalance_test_table | 6 +(1 row) + +-- Check excluded_shard_list by excluding four shards with smaller ids +SELECT rebalance_table_shards('rebalance_test_table', + excluded_shard_list := excluded_shard_list, + threshold := 0, + shard_transfer_mode:='block_writes') +FROM ( + SELECT (array_agg(DISTINCT shardid ORDER BY shardid))[1:4] AS excluded_shard_list + FROM pg_dist_shard + WHERE logicalrelid = 'rebalance_test_table'::regclass +) T; + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | rebalance_test_table | 1 + 57638 | rebalance_test_table | 5 +(2 rows) + +-- Check that max_shard_moves limits number of move operations +-- First check that we error if not table owner +SET ROLE testrole; +SELECT rebalance_table_shards('rebalance_test_table', + threshold := 0, max_shard_moves := 1, + shard_transfer_mode:='block_writes'); +WARNING: localhost:xxxxx is not responsive + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +RESET ROLE; +SELECT rebalance_table_shards('rebalance_test_table', + threshold := 0, max_shard_moves := 1, + shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | rebalance_test_table | 2 + 57638 | 
rebalance_test_table | 4 +(2 rows) + +-- Check that threshold=1 doesn't move any shards +SELECT rebalance_table_shards('rebalance_test_table', threshold := 1, shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | rebalance_test_table | 2 + 57638 | rebalance_test_table | 4 +(2 rows) + +-- Move the remaining shards using threshold=0 +SELECT rebalance_table_shards('rebalance_test_table', threshold := 0); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | rebalance_test_table | 3 + 57638 | rebalance_test_table | 3 +(2 rows) + +-- Check that shard is completely balanced and rebalancing again doesn't have +-- any effects. +SELECT rebalance_table_shards('rebalance_test_table', threshold := 0, shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | rebalance_test_table | 3 + 57638 | rebalance_test_table | 3 +(2 rows) + +-- Check that querying the table doesn't error out +SELECT count(*) FROM rebalance_test_table; + count +--------------------------------------------------------------------- + 6 +(1 row) + +DROP TABLE rebalance_test_table; +-- Test schema support +CREATE SCHEMA test_schema_support; +SELECT COUNT(*) FROM pg_dist_shard_placement; + count +--------------------------------------------------------------------- + 0 +(1 row) + +CREATE TABLE test_schema_support.nation_hash ( + n_nationkey integer not null, + n_name char(25) not null, + n_regionkey integer not null, + n_comment varchar(152) +); +SELECT master_create_distributed_table('test_schema_support.nation_hash', 'n_nationkey', 'hash'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT master_create_worker_shards('test_schema_support.nation_hash', 4, 1); + master_create_worker_shards +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE test_schema_support.nation_hash2 ( + n_nationkey integer not null, + n_name char(25) not null, + n_regionkey integer not null, + n_comment varchar(152) +); +SELECT master_create_distributed_table('test_schema_support.nation_hash2', 'n_nationkey', 'hash'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT master_create_worker_shards('test_schema_support.nation_hash2', 4, 1); + master_create_worker_shards +--------------------------------------------------------------------- + +(1 row) + +-- Shard count before replication +SELECT COUNT(*) FROM pg_dist_shard_placement; + count +--------------------------------------------------------------------- + 8 +(1 row) + +SET search_path TO public; +SELECT replicate_table_shards('test_schema_support.nation_hash', shard_transfer_mode:='block_writes'); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- Confirm 
replication +SELECT COUNT(*) FROM pg_dist_shard_placement; + count +--------------------------------------------------------------------- + 12 +(1 row) + +-- Test with search_path is set +SET search_path TO test_schema_support; +SELECT replicate_table_shards('nation_hash2', shard_transfer_mode:='block_writes'); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- Confirm replication +SELECT COUNT(*) FROM pg_dist_shard_placement; + count +--------------------------------------------------------------------- + 16 +(1 row) + +DROP TABLE test_schema_support.nation_hash; +DROP TABLE test_schema_support.nation_hash2; +-- Test rebalancer with schema +-- Next few operations is to create imbalanced distributed table +CREATE TABLE test_schema_support.imbalanced_table_local ( + id integer not null +); +INSERT INTO test_schema_support.imbalanced_table_local VALUES(1); +INSERT INTO test_schema_support.imbalanced_table_local VALUES(2); +INSERT INTO test_schema_support.imbalanced_table_local VALUES(3); +INSERT INTO test_schema_support.imbalanced_table_local VALUES(4); +CREATE TABLE test_schema_support.imbalanced_table ( + id integer not null +); +SELECT master_create_distributed_table('test_schema_support.imbalanced_table', 'id', 'append'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SET citus.shard_replication_factor TO 1; +SELECT * from master_create_empty_shard('test_schema_support.imbalanced_table'); + master_create_empty_shard +--------------------------------------------------------------------- + 123018 +(1 row) + +SELECT master_append_table_to_shard(123018, 'test_schema_support.imbalanced_table_local', 'localhost', :master_port); + master_append_table_to_shard +--------------------------------------------------------------------- + 0.00533333 +(1 row) + +SET citus.shard_replication_factor TO 2; +SELECT * from master_create_empty_shard('test_schema_support.imbalanced_table'); + master_create_empty_shard +--------------------------------------------------------------------- + 123019 +(1 row) + +SELECT master_append_table_to_shard(123019, 'test_schema_support.imbalanced_table_local', 'localhost', :master_port); + master_append_table_to_shard +--------------------------------------------------------------------- + 0.00533333 +(1 row) + +SET citus.shard_replication_factor TO 1; +SELECT * from master_create_empty_shard('test_schema_support.imbalanced_table'); + master_create_empty_shard +--------------------------------------------------------------------- + 123020 +(1 row) + +SELECT master_append_table_to_shard(123020, 'test_schema_support.imbalanced_table_local', 'localhost', :master_port); + master_append_table_to_shard +--------------------------------------------------------------------- + 0.00533333 +(1 row) + +-- imbalanced_table is now imbalanced +-- Shard counts in each node before rebalance +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | imbalanced_table | 1 + 57638 | imbalanced_table | 3 +(2 rows) + +-- Row count in imbalanced table before rebalance +SELECT COUNT(*) FROM imbalanced_table; + count +--------------------------------------------------------------------- + 12 +(1 row) + +-- Try force_logical +SELECT rebalance_table_shards('imbalanced_table', threshold:=0, shard_transfer_mode:='force_logical'); +ERROR: the force_logical transfer mode 
is currently unsupported +-- Test rebalance operation +SELECT rebalance_table_shards('imbalanced_table', threshold:=0, shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- Confirm rebalance +-- Shard counts in each node after rebalance +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | imbalanced_table | 2 + 57638 | imbalanced_table | 2 +(2 rows) + +-- Row count in imbalanced table after rebalance +SELECT COUNT(*) FROM imbalanced_table; + count +--------------------------------------------------------------------- + 12 +(1 row) + +DROP TABLE public.shard_rebalancer_test_data; +DROP TABLE test_schema_support.imbalanced_table; +DROP TABLE test_schema_support.imbalanced_table_local; +SET citus.shard_replication_factor TO 1; +CREATE TABLE colocated_rebalance_test(id integer); +CREATE TABLE colocated_rebalance_test2(id integer); +SELECT create_distributed_table('colocated_rebalance_test', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Move all shards to worker1 +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'block_writes') +FROM pg_dist_shard_placement +WHERE nodeport = :worker_2_port; + master_move_shard_placement +--------------------------------------------------------------------- + + +(2 rows) + +SELECT create_distributed_table('colocated_rebalance_test2', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Confirm all shards for both tables are on worker1 +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 +(2 rows) + +-- Confirm that the plan for drain_only doesn't show any moves +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', threshold := 0, drain_only := true); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- +(0 rows) + +-- Running with drain_only shouldn't do anything +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes', drain_only := true); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- Confirm that nothing changed +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 +(2 rows) + +-- Confirm that the plan shows 2 shards of both tables moving back to worker2 +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', threshold := 0); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + colocated_rebalance_test | 123021 | 0 | localhost | 57637 | localhost | 57638 + colocated_rebalance_test2 | 123025 | 0 | localhost | 57637 | localhost | 57638 + colocated_rebalance_test | 123022 | 0 | localhost | 57637 | localhost | 57638 + 
colocated_rebalance_test2 | 123026 | 0 | localhost | 57637 | localhost | 57638 +(4 rows) + +-- Confirm that this also happens when using rebalancing by disk size even if the tables are empty +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', rebalance_strategy := 'by_disk_size'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + colocated_rebalance_test | 123021 | 0 | localhost | 57637 | localhost | 57638 + colocated_rebalance_test2 | 123025 | 0 | localhost | 57637 | localhost | 57638 + colocated_rebalance_test | 123022 | 0 | localhost | 57637 | localhost | 57638 + colocated_rebalance_test2 | 123026 | 0 | localhost | 57637 | localhost | 57638 +(4 rows) + +-- Check that we can call this function +SELECT * FROM get_rebalance_progress(); + sessionid | table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport | progress +--------------------------------------------------------------------- +(0 rows) + +-- Actually do the rebalance +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- Check that we can call this function without a crash +SELECT * FROM get_rebalance_progress(); + sessionid | table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport | progress +--------------------------------------------------------------------- +(0 rows) + +-- Confirm that the nodes are now there +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 +(4 rows) + +CREATE TABLE non_colocated_rebalance_test(id integer); +SELECT create_distributed_table('non_colocated_rebalance_test', 'id', colocate_with := 'none'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- confirm that both colocation groups are balanced +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 + 57637 | non_colocated_rebalance_test | 2 + 57638 | non_colocated_rebalance_test | 2 +(6 rows) + +-- testing behaviour when setting isdatanode to 'marked for draining' +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', false); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', threshold := 0); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + colocated_rebalance_test | 123021 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test2 | 123025 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test | 123022 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test2 | 123026 | 0 | localhost | 57638 | 
localhost | 57637 +(4 rows) + +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 + 57637 | non_colocated_rebalance_test | 2 + 57638 | non_colocated_rebalance_test | 2 +(4 rows) + +SELECT * FROM get_rebalance_table_shards_plan('non_colocated_rebalance_test', threshold := 0); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + non_colocated_rebalance_test | 123030 | 0 | localhost | 57638 | localhost | 57637 + non_colocated_rebalance_test | 123032 | 0 | localhost | 57638 | localhost | 57637 +(2 rows) + +SELECT * FROM rebalance_table_shards('non_colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 + 57637 | non_colocated_rebalance_test | 4 +(3 rows) + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 + 57637 | non_colocated_rebalance_test | 4 +(5 rows) + +SELECT * FROM rebalance_table_shards('non_colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 + 57637 | non_colocated_rebalance_test | 2 + 57638 | non_colocated_rebalance_test | 2 +(6 rows) + +-- testing behaviour when setting shouldhaveshards to false and rebalancing all +-- colocation groups with drain_only=true +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', false); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan(threshold := 0, drain_only := true); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport 
+--------------------------------------------------------------------- + colocated_rebalance_test | 123021 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test2 | 123025 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test | 123022 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test2 | 123026 | 0 | localhost | 57638 | localhost | 57637 + non_colocated_rebalance_test | 123029 | 0 | localhost | 57638 | localhost | 57637 + non_colocated_rebalance_test | 123030 | 0 | localhost | 57638 | localhost | 57637 +(6 rows) + +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes', drain_only := true); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 + 57637 | non_colocated_rebalance_test | 4 +(3 rows) + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 + 57637 | non_colocated_rebalance_test | 2 + 57638 | non_colocated_rebalance_test | 2 +(6 rows) + +-- testing behaviour when setting shouldhaveshards to false and rebalancing all +-- colocation groups with drain_only=false +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', false); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan(threshold := 0); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + colocated_rebalance_test | 123021 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test2 | 123025 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test | 123022 | 0 | localhost | 57638 | localhost | 57637 + colocated_rebalance_test2 | 123026 | 0 | localhost | 57638 | localhost | 57637 + non_colocated_rebalance_test | 123029 | 0 | localhost | 57638 | localhost | 57637 + non_colocated_rebalance_test | 123030 | 0 | localhost | 57638 | localhost | 57637 +(6 rows) + +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 + 57637 | non_colocated_rebalance_test | 4 +(3 rows) + +-- Put shards back +SELECT * from master_set_node_property('localhost', 
:worker_2_port, 'shouldhaveshards', true); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 + 57637 | non_colocated_rebalance_test | 2 + 57638 | non_colocated_rebalance_test | 2 +(6 rows) + +-- Make it a data node again +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +-- testing behaviour of master_drain_node +SELECT * from master_drain_node('localhost', :worker_2_port, shard_transfer_mode := 'block_writes'); + master_drain_node +--------------------------------------------------------------------- + +(1 row) + +select shouldhaveshards from pg_dist_node where nodeport = :worker_2_port; + shouldhaveshards +--------------------------------------------------------------------- + f +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 4 + 57637 | colocated_rebalance_test2 | 4 + 57637 | non_colocated_rebalance_test | 4 +(3 rows) + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | colocated_rebalance_test | 2 + 57638 | colocated_rebalance_test | 2 + 57637 | colocated_rebalance_test2 | 2 + 57638 | colocated_rebalance_test2 | 2 + 57637 | non_colocated_rebalance_test | 2 + 57638 | non_colocated_rebalance_test | 2 +(6 rows) + +-- Drop some tables for clear consistent error +DROP TABLE test_schema_support.colocated_rebalance_test2; +-- Leave no trace on workers +RESET search_path; +\set VERBOSITY terse +DROP SCHEMA test_schema_support CASCADE; +\set VERBOSITY default +REVOKE ALL ON SCHEMA public FROM testrole; +ERROR: role "testrole" does not exist +CONTEXT: while executing command on localhost:xxxxx +DROP USER testrole; +-- Test costs +set citus.shard_count = 4; +CREATE TABLE tab (x int); +SELECT create_distributed_table('tab','x'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- The following numbers are chosen such that they are placed on different +-- shards. 
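The distribution values 1, 2, 3 and 6 hash into four distinct shards of tab. Which shard a given value lands on can be checked directly (a sketch using the table created above):

SELECT get_shard_id_for_distribution_column('tab', 1);
SELECT get_shard_id_for_distribution_column('tab', 6);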
+INSERT INTO tab SELECT 1 from generate_series(1, 30000); +INSERT INTO tab SELECT 2 from generate_series(1, 10000); +INSERT INTO tab SELECT 3 from generate_series(1, 10000); +INSERT INTO tab SELECT 6 from generate_series(1, 10000); +ANALYZE tab; +\c - - - :worker_1_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + table_schema | table_name | row_estimate | total_bytes +--------------------------------------------------------------------- + public | tab_123033 | 30000 | 1114112 + public | tab_123035 | 10000 | 393216 +(2 rows) + +\c - - - :worker_2_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + table_schema | table_name | row_estimate | total_bytes +--------------------------------------------------------------------- + public | tab_123034 | 10000 | 393216 + public | tab_123036 | 10000 | 393216 +(2 rows) + +\c - - - :master_port +SELECT * FROM get_rebalance_table_shards_plan('tab'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- +(0 rows) + +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'by_disk_size'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + tab | 123035 | 0 | localhost | 57637 | localhost | 57638 +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'by_disk_size', threshold := 0); +WARNING: the given threshold is lower than the minimum threshold allowed by the rebalance strategy, using the minimum allowed threshold instead +DETAIL: Using threshold of 0.01 + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + tab | 123035 | 0 | localhost | 57637 | localhost | 57638 +(1 row) + +SELECT * FROM rebalance_table_shards('tab', shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | tab | 2 + 57638 | tab | 2 +(2 rows) + +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'by_disk_size', shard_transfer_mode:='block_writes'); +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... 
+ rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | tab | 1 + 57638 | tab | 3 +(2 rows) + +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'by_disk_size', shard_transfer_mode:='block_writes', threshold := 0); +WARNING: the given threshold is lower than the minimum threshold allowed by the rebalance strategy, using the minimum allowed threshold instead +DETAIL: Using threshold of 0.01 + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | tab | 1 + 57638 | tab | 3 +(2 rows) + +-- Check that sizes of colocated tables are added together for rebalances +set citus.shard_count = 4; +SET citus.next_shard_id TO 123050; +CREATE TABLE tab2 (x int); +SELECT create_distributed_table('tab2','x', colocate_with := 'tab'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO tab2 SELECT 1 from generate_series(1, 0); +INSERT INTO tab2 SELECT 2 from generate_series(1, 60000); +INSERT INTO tab2 SELECT 3 from generate_series(1, 10000); +INSERT INTO tab2 SELECT 6 from generate_series(1, 10000); +ANALYZE tab, tab2; +\c - - - :worker_1_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + table_schema | table_name | row_estimate | total_bytes +--------------------------------------------------------------------- + public | tab2_123050 | 0 | 0 + public | tab_123033 | 30000 | 1114112 +(2 rows) + +\c - - - :worker_2_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + table_schema | table_name | row_estimate | total_bytes +--------------------------------------------------------------------- + public | tab2_123051 | 10000 | 393216 + public | tab2_123052 | 10000 | 393216 + public | tab2_123053 | 60000 | 2203648 + public | tab_123034 | 10000 | 393216 + public | tab_123035 | 10000 | 368640 + public | tab_123036 | 10000 | 393216 +(6 rows) + +\c - - - :master_port +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'by_disk_size'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + tab 
| 123036 | 0 | localhost | 57638 | localhost | 57637 + tab2 | 123053 | 0 | localhost | 57638 | localhost | 57637 + tab | 123033 | 0 | localhost | 57637 | localhost | 57638 + tab2 | 123050 | 0 | localhost | 57637 | localhost | 57638 +(4 rows) + +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'by_disk_size', shard_transfer_mode:='block_writes'); +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | tab | 1 + 57638 | tab | 3 + 57637 | tab2 | 1 + 57638 | tab2 | 3 +(4 rows) + +ANALYZE tab, tab2; +\c - - - :worker_1_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + table_schema | table_name | row_estimate | total_bytes +--------------------------------------------------------------------- + public | tab2_123053 | 60000 | 2179072 + public | tab_123036 | 10000 | 368640 +(2 rows) + +\c - - - :worker_2_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + table_schema | table_name | row_estimate | total_bytes +--------------------------------------------------------------------- + public | tab2_123050 | 0 | 0 + public | tab2_123051 | 10000 | 393216 + public | tab2_123052 | 10000 | 393216 + public | tab_123033 | 30000 | 1089536 + public | tab_123034 | 10000 | 393216 + public | tab_123035 | 10000 | 368640 +(6 rows) + +\c - - - :master_port +DROP TABLE tab2; +CREATE OR REPLACE FUNCTION capacity_high_worker_1(nodeidarg int) + RETURNS real AS $$ + SELECT + (CASE WHEN nodeport = 57637 THEN 1000 ELSE 1 END)::real + FROM pg_dist_node where nodeid = nodeidarg + $$ LANGUAGE sql; +SELECT citus_add_rebalance_strategy( + 'capacity_high_worker_1', + 'citus_shard_cost_1', + 'capacity_high_worker_1', + 'citus_shard_allowed_on_node_true', + 0 + ); + citus_add_rebalance_strategy +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'capacity_high_worker_1'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + tab | 123033 | 0 | localhost | 57638 | localhost | 57637 + tab | 123034 | 0 | localhost | 57638 | localhost | 57637 + tab | 123035 | 0 | 
localhost | 57638 | localhost | 57637 +(3 rows) + +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'capacity_high_worker_1', shard_transfer_mode:='block_writes'); +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | tab | 4 +(1 row) + +SELECT citus_set_default_rebalance_strategy('capacity_high_worker_1'); + citus_set_default_rebalance_strategy +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan('tab'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- +(0 rows) + +SELECT * FROM rebalance_table_shards('tab', shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57637 | tab | 4 +(1 row) + +CREATE FUNCTION only_worker_2(shardid bigint, nodeidarg int) + RETURNS boolean AS $$ + SELECT + (CASE WHEN nodeport = 57638 THEN TRUE ELSE FALSE END) + FROM pg_dist_node where nodeid = nodeidarg + $$ LANGUAGE sql; +SELECT citus_add_rebalance_strategy( + 'only_worker_2', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'only_worker_2', + 0 + ); + citus_add_rebalance_strategy +--------------------------------------------------------------------- + +(1 row) + +SELECT citus_set_default_rebalance_strategy('only_worker_2'); + citus_set_default_rebalance_strategy +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan('tab'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + tab | 123033 | 0 | localhost | 57637 | localhost | 57638 + tab | 123034 | 0 | localhost | 57637 | localhost | 57638 + tab | 123035 | 0 | localhost | 57637 | localhost | 57638 + tab | 123036 | 0 | localhost | 57637 | localhost | 57638 +(4 rows) + +SELECT * FROM rebalance_table_shards('tab', shard_transfer_mode:='block_writes'); +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... +NOTICE: Moving shard xxxxx from localhost:xxxxx to localhost:xxxxx ... 
+ rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM public.table_placements_per_node; + nodeport | logicalrelid | count +--------------------------------------------------------------------- + 57638 | tab | 4 +(1 row) + +SELECT citus_set_default_rebalance_strategy('by_shard_count'); + citus_set_default_rebalance_strategy +--------------------------------------------------------------------- + +(1 row) + +SELECT * FROM get_rebalance_table_shards_plan('tab'); + table_name | shardid | shard_size | sourcename | sourceport | targetname | targetport +--------------------------------------------------------------------- + tab | 123033 | 0 | localhost | 57638 | localhost | 57637 + tab | 123034 | 0 | localhost | 57638 | localhost | 57637 +(2 rows) + +-- Check all the error handling cases +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'non_existing'); +ERROR: could not find rebalance strategy with name non_existing +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'non_existing'); +ERROR: could not find rebalance strategy with name non_existing +SELECT * FROM master_drain_node('localhost', :worker_2_port, rebalance_strategy := 'non_existing'); +ERROR: could not find rebalance strategy with name non_existing +SELECT citus_set_default_rebalance_strategy('non_existing'); +ERROR: strategy with specified name does not exist +CONTEXT: PL/pgSQL function citus_set_default_rebalance_strategy(text) line 5 at RAISE +UPDATE pg_dist_rebalance_strategy SET default_strategy=false; +SELECT * FROM get_rebalance_table_shards_plan('tab'); +ERROR: no rebalance_strategy was provided, but there is also no default strategy set +SELECT * FROM rebalance_table_shards('tab'); +ERROR: no rebalance_strategy was provided, but there is also no default strategy set +SELECT * FROM master_drain_node('localhost', :worker_2_port); +ERROR: no rebalance_strategy was provided, but there is also no default strategy set +UPDATE pg_dist_rebalance_strategy SET default_strategy=true WHERE name='by_shard_count'; +CREATE OR REPLACE FUNCTION shard_cost_no_arguments() + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION shard_cost_bad_arg_type(text) + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION shard_cost_bad_return_type(bigint) + RETURNS int AS $$ SELECT 1 $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION node_capacity_no_arguments() + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION node_capacity_bad_arg_type(text) + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION node_capacity_bad_return_type(int) + RETURNS int AS $$ SELECT 1 $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION shard_allowed_on_node_no_arguments() + RETURNS boolean AS $$ SELECT true $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION shard_allowed_on_node_bad_arg1(text, int) + RETURNS boolean AS $$ SELECT true $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION shard_allowed_on_node_bad_arg2(bigint, text) + RETURNS boolean AS $$ SELECT true $$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION shard_allowed_on_node_bad_return_type(bigint, int) + RETURNS int AS $$ SELECT 1 $$ LANGUAGE sql; +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'shard_cost_no_arguments', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for shard_cost_function is incorrect +DETAIL: number of arguments of shard_cost_no_arguments should 
be 1, not 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'shard_cost_bad_arg_type', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for shard_cost_function is incorrect +DETAIL: argument type of shard_cost_bad_arg_type should be bigint +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'shard_cost_bad_return_type', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for shard_cost_function is incorrect +DETAIL: return type of shard_cost_bad_return_type should be real +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 0, + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: cache lookup failed for shard_cost_function with oid 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'node_capacity_no_arguments', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for node_capacity_function is incorrect +DETAIL: number of arguments of node_capacity_no_arguments should be 1, not 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'node_capacity_bad_arg_type', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for node_capacity_function is incorrect +DETAIL: argument type of node_capacity_bad_arg_type should be int +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'node_capacity_bad_return_type', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for node_capacity_function is incorrect 
+DETAIL: return type of node_capacity_bad_return_type should be real +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 0, + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: cache lookup failed for node_capacity_function with oid 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_no_arguments', + 0 + ); +ERROR: signature for shard_allowed_on_node_function is incorrect +DETAIL: number of arguments of shard_allowed_on_node_no_arguments should be 2, not 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_bad_arg1', + 0 + ); +ERROR: signature for shard_allowed_on_node_function is incorrect +DETAIL: type of first argument of shard_allowed_on_node_bad_arg1 should be bigint +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_bad_arg2', + 0 + ); +ERROR: signature for shard_allowed_on_node_function is incorrect +DETAIL: type of second argument of shard_allowed_on_node_bad_arg2 should be int +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_bad_return_type', + 0 + ); +ERROR: signature for shard_allowed_on_node_function is incorrect +DETAIL: return type of shard_allowed_on_node_bad_return_type should be boolean +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 
'citus_node_capacity_1', + 0, + 0 + ); +ERROR: cache lookup failed for shard_allowed_on_node_function with oid 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +SQL function "citus_add_rebalance_strategy" statement 1 +-- Confirm that manual insert/update has the same checks +INSERT INTO + pg_catalog.pg_dist_rebalance_strategy( + name, + shard_cost_function, + node_capacity_function, + shard_allowed_on_node_function, + default_threshold + ) VALUES ( + 'shard_cost_no_arguments', + 'shard_cost_no_arguments', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: signature for shard_cost_function is incorrect +DETAIL: number of arguments of shard_cost_no_arguments should be 1, not 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +UPDATE pg_dist_rebalance_strategy SET shard_cost_function='shard_cost_no_arguments' WHERE name='by_disk_size'; +ERROR: signature for shard_cost_function is incorrect +DETAIL: number of arguments of shard_cost_no_arguments should be 1, not 0 +CONTEXT: SQL statement "SELECT citus_validate_rebalance_strategy_functions( + NEW.shard_cost_function, + NEW.node_capacity_function, + NEW.shard_allowed_on_node_function)" +PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 5 at PERFORM +-- Confirm that only a single default strategy can exist +INSERT INTO + pg_catalog.pg_dist_rebalance_strategy( + name, + default_strategy, + shard_cost_function, + node_capacity_function, + shard_allowed_on_node_function, + default_threshold + ) VALUES ( + 'second_default', + true, + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +ERROR: there cannot be two default strategies +CONTEXT: PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 19 at RAISE +UPDATE pg_dist_rebalance_strategy SET default_strategy=true WHERE name='by_disk_size'; +ERROR: there cannot be two default strategies +CONTEXT: PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 19 at RAISE +-- ensure the trigger allows updating the default strategy +UPDATE pg_dist_rebalance_strategy SET default_strategy=true WHERE name='by_shard_count'; +-- Confirm that default strategy should be higher than minimum strategy +SELECT citus_add_rebalance_strategy( + 'default_threshold_too_low', + 'citus_shard_cost_1', + 'capacity_high_worker_1', + 'citus_shard_allowed_on_node_true', + 0, + 0.1 + ); +ERROR: default_threshold cannot be smaller than minimum_threshold +CONTEXT: PL/pgSQL function citus_internal.pg_dist_rebalance_strategy_trigger_func() line 10 at RAISE +SQL function "citus_add_rebalance_strategy" statement 1 +-- Make it a data node again +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + master_set_node_property +--------------------------------------------------------------------- + +(1 row) + +DROP TABLE tab; +-- we don't need the coordinator on pg_dist_node anymore +SELECT 1 FROM master_remove_node('localhost', :master_port); + ?column? 
+--------------------------------------------------------------------- + 1 +(1 row) + +-- +-- Make sure that rebalance_table_shards() and replicate_table_shards() replicate +-- reference tables to the coordinator when replicate_reference_tables_on_activate +-- is off. +-- +SET citus.replicate_reference_tables_on_activate TO off; +SET client_min_messages TO WARNING; +CREATE TABLE dist_table_test_3(a int); +SET citus.shard_count TO 4; +SET citus.shard_replication_factor TO 1; +SET citus.replication_model TO "statement"; +SELECT create_distributed_table('dist_table_test_3', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE ref_table(a int); +SELECT create_reference_table('ref_table'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass; + count +--------------------------------------------------------------------- + 2 +(1 row) + +SET citus.shard_replication_factor TO 2; +SELECT replicate_table_shards('dist_table_test_3', max_shard_copies := 4, shard_transfer_mode:='block_writes'); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass; + count +--------------------------------------------------------------------- + 3 +(1 row) + +SELECT 1 FROM master_remove_node('localhost', :master_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +CREATE TABLE rebalance_test_table(int_column int); +SELECT master_create_distributed_table('rebalance_test_table', 'int_column', 'append'); + master_create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +CALL create_unbalanced_shards('rebalance_test_table'); +SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass; + count +--------------------------------------------------------------------- + 2 +(1 row) + +SELECT rebalance_table_shards('rebalance_test_table', shard_transfer_mode:='block_writes'); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass; + count +--------------------------------------------------------------------- + 3 +(1 row) + +DROP TABLE dist_table_test_3, rebalance_test_table, ref_table; +SELECT 1 FROM master_remove_node('localhost', :master_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +-- reference table 2 will not have a replica identity, causing the rebalancer to not work +-- when ran in the default mode. Instead we need to change the shard transfer mode to make +-- it work. This verifies the shard transfer mode used in the rebalancer is used for the +-- ensurance of reference table existence. 
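The comment above leans on r2 having neither a primary key nor an explicit REPLICA IDENTITY, which is the property the rebalancer's shard transfer mode has to cope with here. As a side note (a sketch, not part of this patch, assuming only the standard pg_class / pg_index / pg_dist_partition catalogs), tables in that state could be listed with a query along these lines:

-- tables registered with Citus that have no usable replica identity:
-- either REPLICA IDENTITY NOTHING, or the default identity with no primary key
SELECT p.logicalrelid::regclass AS table_name
FROM pg_dist_partition p
JOIN pg_class c ON c.oid = p.logicalrelid
WHERE c.relreplident = 'n'
   OR (c.relreplident = 'd'
       AND NOT EXISTS (SELECT 1 FROM pg_index i
                       WHERE i.indrelid = c.oid AND i.indisprimary));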
+CREATE TABLE t1 (a int PRIMARY KEY, b int); +CREATE TABLE r1 (a int PRIMARY KEY, b int); +CREATE TABLE r2 (a int, b int); +-- we remove worker 2 before creating the tables, this will allow us to have an active +-- node without the reference tables +SELECT 1 from master_remove_node('localhost', :worker_2_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +SELECT create_distributed_table('t1','a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_reference_table('r1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_reference_table('r2'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +-- add data so to actually copy data when forcing logical replication for reference tables +INSERT INTO r1 VALUES (1,2), (3,4); +INSERT INTO r2 VALUES (1,2), (3,4); +SELECT 1 from master_add_node('localhost', :worker_2_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +SELECT rebalance_table_shards(); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +DROP TABLE t1, r1, r2; +-- verify there are no distributed tables before we perform the following tests. Preceding +-- test suites should clean up their distributed tables. +SELECT count(*) FROM pg_dist_partition; + count +--------------------------------------------------------------------- + 0 +(1 row) + +-- verify a system having only reference tables will copy the reference tables when +-- executing the rebalancer +SELECT 1 from master_remove_node('localhost', :worker_2_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +CREATE TABLE r1 (a int PRIMARY KEY, b int); +SELECT create_reference_table('r1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT 1 from master_add_node('localhost', :worker_2_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +-- count the number of placements for the reference table to verify it is not available on +-- all nodes +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + count +--------------------------------------------------------------------- + 1 +(1 row) + +-- rebalance with _only_ a reference table, this should trigger the copy +SELECT rebalance_table_shards(); + rebalance_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- verify the reference table is on all nodes after the rebalance +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + count +--------------------------------------------------------------------- + 2 +(1 row) + +-- cleanup tables +DROP TABLE r1; +-- lastly we need to verify that reference tables are copied before the replication factor +-- of other tables is increased. Without the copy of reference tables the replication might +-- fail. +SELECT 1 from master_remove_node('localhost', :worker_2_port); + ?column? 
+--------------------------------------------------------------------- + 1 +(1 row) + +CREATE TABLE t1 (a int PRIMARY KEY, b int); +CREATE TABLE r1 (a int PRIMARY KEY, b int); +SELECT create_distributed_table('t1', 'a'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_reference_table('r1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT 1 from master_add_node('localhost', :worker_2_port); + ?column? +--------------------------------------------------------------------- + 1 +(1 row) + +-- count the number of placements for the reference table to verify it is not available on +-- all nodes +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + count +--------------------------------------------------------------------- + 1 +(1 row) + +SELECT replicate_table_shards('t1', shard_replication_factor := 2); + replicate_table_shards +--------------------------------------------------------------------- + +(1 row) + +-- verify the reference table is on all nodes after replicate_table_shards +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + count +--------------------------------------------------------------------- + 2 +(1 row) + +DROP TABLE t1, r1; diff --git a/src/test/regress/expected/shard_rebalancer_unit.out b/src/test/regress/expected/shard_rebalancer_unit.out new file mode 100644 index 000000000..3308dfc3e --- /dev/null +++ b/src/test/regress/expected/shard_rebalancer_unit.out @@ -0,0 +1,502 @@ +CREATE OR REPLACE FUNCTION shard_placement_rebalance_array( + worker_node_list json[], + shard_placement_list json[], + threshold float4 DEFAULT 0, + max_shard_moves int DEFAULT 1000000, + drain_only bool DEFAULT false +) +RETURNS json[] +AS 'citus' +LANGUAGE C STRICT VOLATILE; +-- Check that even with threshold=0.0 shard_placement_rebalance_array returns +-- something when there's no completely balanced solution. 
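For readers skimming the unit tests that follow, the JSON fields they exercise appear to be (an informal summary inferred from the calls themselves, not an authoritative API reference): node entries take node_name and node_port, plus optional capacity (relative size, defaulting to 1) and disallowed_shards (a comma-separated list of shard ids); placement entries take placementid, shardid, shardstate, shardlength, nodename and nodeport, plus optional cost (relative weight, defaulting to 1) and next_colocation (which appears to start a new colocation group). A minimal illustrative call in the same shape, with made-up host names and its output omitted:

SELECT unnest(shard_placement_rebalance_array(
    -- nodeA can hold twice as much as nodeB; shard 2 may not live on nodeB
    ARRAY['{"node_name": "nodeA", "node_port": 5432, "capacity": 2}',
          '{"node_name": "nodeB", "node_port": 5432, "disallowed_shards": "2"}']::json[],
    -- shard 1 is twice as expensive as shard 2
    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"nodeB", "nodeport":5432, "cost": 2}',
          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"nodeB", "nodeport":5432}']::json[]
));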
+SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(1 row) + +-- Check that a node can be drained in a balanced cluster +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4"}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(2 rows) + +-- Check that an already drained node won't be filled again after a second +-- rebalance +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4"}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + unnest +--------------------------------------------------------------------- +(0 rows) + +-- Check that even when shards are already balanced, but shard xxxxx is on a node +-- where it is not allowed it will be moved and there will be rebalancing later +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,5,6"}', + '{"node_name": "hostname2", "node_port": 5432, "disallowed_shards": "4"}', + '{"node_name": "hostname3", "node_port": 5432, "disallowed_shards": "4"}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, 
"shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}' + ]::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":4,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(3 rows) + +-- Check that even when shards are already balanced, disallowed shards will be +-- moved away from hostname1 and the only shard that is allowed there will be +-- moved there +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,5,6"}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}' + ]::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":4,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(3 rows) + +-- Check that an error is returned when a shard is not allowed anywhere +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "2,4"}', + '{"node_name": "hostname2", "node_port": 5432, "disallowed_shards": "1,4"}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); +WARNING: Not allowed to move shard xxxxx anywhere from hostname2:5432 + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(2 rows) + +-- Check that cost is taken into account when rebalancing +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": 
"hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 3}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":4,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(1 row) + +-- Check that cost is taken into account when rebalancing disallowed placements +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4"}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 3}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":4,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} +(4 rows) + +-- Check that node capacacity is taken into account. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(3 rows) + +-- Check that shards are not moved when target utilization stays the same and +-- the source utilization goes below the original target utilization. hostname1 +-- has utilization of 1, after move hostname2 would have a utilization of 1 as +-- well. hostname1 would have utilization of 1 while hostname2 has utilization +-- of 2/3 now. 
Since load is spread more fairly with utilization 2/3 than 0 it +-- should choose that distribution. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- +(0 rows) + +-- Check that shards are moved even when target utilization stays the same, but +-- source utilization goes below the original target utilization. hostname2 +-- has utilization of 1, after move hostname1 would have a utilization of 1 as +-- well. hostname2 would have utilization of 2/3 while hostname1 now has +-- utilization of 0 now. Since load is spread more fairly with utilization 2/3 +-- than 0 it should choose that distribution. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(1 row) + +-- Check that shards are moved even when target utilization stays the same, but +-- source utilization goes below the original target utilization. hostname2 +-- has utilization of 2, after move hostname1 would have a utilization of 2 as +-- well. hostname2 would have utilization of 1.5 while hostname1 now has +-- utilization of 1. Since load is spread more fairly with utilization 1.5 than +-- 1 it should choose that distribution. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 2}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(1 row) + +-- Check that shards are moved even when target utilization stays the same, but +-- source utilization goes below the original target utilization. hostname1 +-- has utilization of 2, after move hostname2 would have a utilization of 2 as +-- well. 
hostname1 would have utilization of 1 while hostname2 now has +-- utilization of 1.5. Since load is spread more fairly with utilization 1.5 +-- than 1 it should choose that distribution. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 2}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- +(0 rows) + +-- Check that all shards will be moved to 1 node if its capacity is big enough +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 4}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(3 rows) + +-- Check that shards will be moved to a smaller node node if utilization improves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(1 row) + +-- Check that node capacity works with different shard costs +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432, "cost": 3}']::json[] +)); + unnest 
+--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname1","targetport":5432} +(1 row) + +-- Check that node capacity works with different shard costs again +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 2}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":3,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(2 rows) + +-- Check that max_shard_moves works and that we get a NOTICE that it is hit +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 2}']::json[], + max_shard_moves := 1 +)); +NOTICE: Stopped searching before we were out of moves. Please rerun the rebalancer after it's finished for a more optimal placement. + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":3,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(1 row) + +-- Check that node capacity works with different shard costs and disallowed_shards +-- NOTE: these moves are not optimal, once we implement merging of updates this +-- output should change. 
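To make the NOTE above concrete: in the plan returned below, shard 1 takes two hops (hostname3 to hostname1, then hostname1 to hostname2). Merging updates would presumably collapse those into a single hostname3-to-hostname2 move, shrinking the plan from three moves to two.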
+SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 5}', + '{"node_name": "hostname3", "node_port": 5432, "disallowed_shards": "1,2"}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432, "cost": 2}']::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":2,"sourcename":"hostname3","sourceport":5432,"targetname":"hostname2","targetport":5432} + {"updatetype":1,"shardid":1,"sourcename":"hostname3","sourceport":5432,"targetname":"hostname1","targetport":5432} + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname2","targetport":5432} +(3 rows) + +-- Check that draining + rebalancing nodes works +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} +(3 rows) + +-- Check that draining nodes with drain only works +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + drain_only := true +)); + unnest +--------------------------------------------------------------------- + 
{"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} +(1 row) + +-- Check that draining nodes has priority over max_shard_moves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 0 +)); +NOTICE: Stopped searching before we were out of moves. Please rerun the rebalancer after it's finished for a more optimal placement. + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} +(1 row) + +-- Check that drained moves are counted towards shard moves and thus use up the +-- limit when doing normal rebalance moves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 2 +)); +NOTICE: Stopped searching before we were out of moves. Please rerun the rebalancer after it's finished for a more optimal placement. 
+ unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} +(2 rows) + +-- Check that draining for all colocation groups is done before rebalancing +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6,7,8,9,10,11,12", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":7, "shardid":7, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "next_colocation": true}', + '{"placementid":8, "shardid":8, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":9, "shardid":9, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":10, "shardid":10, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":11, "shardid":11, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":12, "shardid":12, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":7,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":8,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":9,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} +(6 rows) + +-- Check that max_shard_moves warning is only shown once even if more than one +-- colocation group its placement updates are ignored because of it +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6,7,8,9,10,11,12", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, 
"shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":7, "shardid":7, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "next_colocation": true}', + '{"placementid":8, "shardid":8, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":9, "shardid":9, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":10, "shardid":10, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":11, "shardid":11, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":12, "shardid":12, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 1 +)); +NOTICE: Stopped searching before we were out of moves. Please rerun the rebalancer after it's finished for a more optimal placement. + unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":7,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} +(2 rows) + +-- Check that moves for different colocation groups are added together when +-- taking into account max_shard_moves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6,7,8,9,10,11,12", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":7, "shardid":7, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "next_colocation": true}', + '{"placementid":8, "shardid":8, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":9, "shardid":9, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":10, "shardid":10, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":11, "shardid":11, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":12, "shardid":12, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 5 +)); +NOTICE: Stopped searching before we were out of moves. Please rerun the rebalancer after it's finished for a more optimal placement. 
+ unnest +--------------------------------------------------------------------- + {"updatetype":1,"shardid":1,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":7,"sourcename":"hostname1","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":2,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":3,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} + {"updatetype":1,"shardid":8,"sourcename":"hostname2","sourceport":5432,"targetname":"hostname3","targetport":5432} +(5 rows) + diff --git a/src/test/regress/isolation_schedule b/src/test/regress/isolation_schedule index e734870ee..39ce4402f 100644 --- a/src/test/regress/isolation_schedule +++ b/src/test/regress/isolation_schedule @@ -66,6 +66,13 @@ test: shared_connection_waits test: isolation_cancellation test: isolation_undistribute_table +# Rebalancer +test: isolation_blocking_move_single_shard_commands +test: isolation_blocking_move_multi_shard_commands +test: isolation_blocking_move_single_shard_commands_on_mx +test: isolation_blocking_move_multi_shard_commands_on_mx +test: isolation_shard_rebalancer + # MX tests test: isolation_reference_on_mx test: isolation_ref2ref_foreign_keys_on_mx diff --git a/src/test/regress/operations_schedule b/src/test/regress/operations_schedule new file mode 100644 index 000000000..4e526e19d --- /dev/null +++ b/src/test/regress/operations_schedule @@ -0,0 +1,9 @@ +test: multi_cluster_management +test: multi_test_helpers multi_test_helpers_superuser +test: multi_test_catalog_views +test: shard_rebalancer_unit +test: shard_rebalancer +test: foreign_key_to_reference_shard_rebalance +test: multi_move_mx +test: shard_move_deferred_delete +test: multi_colocated_shard_rebalance diff --git a/src/test/regress/spec/isolation_blocking_move_multi_shard_commands.spec b/src/test/regress/spec/isolation_blocking_move_multi_shard_commands.spec new file mode 100644 index 000000000..ba534046b --- /dev/null +++ b/src/test/regress/spec/isolation_blocking_move_multi_shard_commands.spec @@ -0,0 +1,121 @@ +// we use 15 as partition key values through out the test +// so setting the corresponding shard here is useful + +setup +{ + SELECT citus_internal.replace_isolation_tester_func(); + SELECT citus_internal.refresh_isolation_tester_prepared_statement(); + + SET citus.shard_count TO 8; + SET citus.shard_replication_factor TO 1; + CREATE TABLE logical_replicate_placement (x int PRIMARY KEY, y int); + SELECT create_distributed_table('logical_replicate_placement', 'x'); + + SELECT get_shard_id_for_distribution_column('logical_replicate_placement', 15) INTO selected_shard; + +} + +teardown +{ + SELECT citus_internal.restore_isolation_tester_func(); + + DROP TABLE selected_shard; + DROP TABLE logical_replicate_placement; +} + + +session "s1" + +step "s1-begin" +{ + BEGIN; +} + +step "s1-move-placement" +{ + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; +} + +step "s1-end" +{ + COMMIT; +} + +step "s1-select" +{ + SELECT * FROM logical_replicate_placement order by y; +} + +step "s1-insert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); +} + +step "s1-get-shard-distribution" +{ + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where 
shardid in (SELECT * FROM selected_shard) order by nodeport; +} + +session "s2" + +step "s2-begin" +{ + BEGIN; +} + +step "s2-select" +{ + SELECT * FROM logical_replicate_placement ORDER BY y; +} + +step "s2-insert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); +} + +step "s2-delete" +{ + DELETE FROM logical_replicate_placement; +} + +step "s2-update" +{ + UPDATE logical_replicate_placement SET y = y + 1; +} + +step "s2-upsert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); + + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172) ON CONFLICT (x) DO UPDATE SET y = logical_replicate_placement.y + 1; +} + +step "s2-copy" +{ + COPY logical_replicate_placement FROM PROGRAM 'echo "1,1\n2,2\n3,3\n4,4\n5,5\n15,30"' WITH CSV; +} + +step "s2-truncate" +{ + TRUNCATE logical_replicate_placement; +} + +step "s2-alter-table" +{ + ALTER TABLE logical_replicate_placement ADD COLUMN z INT; +} + +step "s2-end" +{ + COMMIT; +} + +permutation "s1-begin" "s2-begin" "s2-insert" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-begin" "s2-begin" "s2-upsert" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-update" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-delete" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-select" "s1-move-placement" "s2-end" "s1-end" "s1-get-shard-distribution" +permutation "s1-begin" "s2-begin" "s2-copy" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-truncate" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-begin" "s2-begin" "s2-alter-table" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" + diff --git a/src/test/regress/spec/isolation_blocking_move_multi_shard_commands_on_mx.spec b/src/test/regress/spec/isolation_blocking_move_multi_shard_commands_on_mx.spec new file mode 100644 index 000000000..ac26a5f2c --- /dev/null +++ b/src/test/regress/spec/isolation_blocking_move_multi_shard_commands_on_mx.spec @@ -0,0 +1,132 @@ +// we use 15 as partition key values through out the test +// so setting the corresponding shard here is useful + +setup +{ + CREATE OR REPLACE FUNCTION start_session_level_connection_to_node(text, integer) + RETURNS void + LANGUAGE C STRICT VOLATILE + AS 'citus', $$start_session_level_connection_to_node$$; + + CREATE OR REPLACE FUNCTION run_commands_on_session_level_connection_to_node(text) + RETURNS void + LANGUAGE C STRICT VOLATILE + AS 'citus', $$run_commands_on_session_level_connection_to_node$$; + + CREATE OR REPLACE FUNCTION stop_session_level_connection_to_node() + RETURNS void + LANGUAGE C STRICT VOLATILE + AS 'citus', $$stop_session_level_connection_to_node$$; + + SELECT citus_internal.replace_isolation_tester_func(); + SELECT citus_internal.refresh_isolation_tester_prepared_statement(); + + -- start_metadata_sync_to_node can not be run inside a transaction block + -- following is a workaround to overcome that + -- port numbers are hard coded at the moment + SELECT master_run_on_worker( + ARRAY['localhost']::text[], + ARRAY[57636]::int[], + ARRAY[format('SELECT start_metadata_sync_to_node(''%s'', %s)', nodename, nodeport)]::text[], + false) + FROM 
pg_dist_node; + + SET citus.replication_model to streaming; + SET citus.shard_replication_factor TO 1; + + SET citus.shard_count TO 8; + SET citus.shard_replication_factor TO 1; + CREATE TABLE logical_replicate_placement (x int PRIMARY KEY, y int); + SELECT create_distributed_table('logical_replicate_placement', 'x'); + + SELECT get_shard_id_for_distribution_column('logical_replicate_placement', 15) INTO selected_shard; +} + +teardown +{ + DROP TABLE selected_shard; + DROP TABLE logical_replicate_placement; + + SELECT citus_internal.restore_isolation_tester_func(); +} + + +session "s1" + +step "s1-begin" +{ + BEGIN; +} + +step "s1-move-placement" +{ + SELECT master_move_shard_placement(get_shard_id_for_distribution_column, 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes') FROM selected_shard; +} + +step "s1-commit" +{ + COMMIT; +} + +step "s1-select" +{ + SELECT * FROM logical_replicate_placement order by y; +} + +step "s1-insert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172); +} + +step "s1-get-shard-distribution" +{ + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; +} + +session "s2" + +step "s2-start-session-level-connection" +{ + SELECT start_session_level_connection_to_node('localhost', 57638); +} + +step "s2-begin-on-worker" +{ + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); +} + +step "s2-select" +{ + SELECT run_commands_on_session_level_connection_to_node('SELECT * FROM logical_replicate_placement ORDER BY y'); +} + +step "s2-insert" +{ + SELECT run_commands_on_session_level_connection_to_node('INSERT INTO logical_replicate_placement VALUES (15, 15), (172, 172)'); +} + +step "s2-delete" +{ + SELECT run_commands_on_session_level_connection_to_node('DELETE FROM logical_replicate_placement'); +} + +step "s2-update" +{ + SELECT run_commands_on_session_level_connection_to_node('UPDATE logical_replicate_placement SET y = y + 1'); +} + +step "s2-commit-worker" +{ + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); +} + +step "s2-stop-connection" +{ + SELECT stop_session_level_connection_to_node(); +} + +permutation "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-insert" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-select" "s1-get-shard-distribution" "s2-stop-connection" +permutation "s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-update" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-select" "s1-get-shard-distribution" "s2-stop-connection" +permutation "s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-delete" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-select" "s1-get-shard-distribution" "s2-stop-connection" +permutation "s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-select" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-get-shard-distribution" "s2-stop-connection" + diff --git a/src/test/regress/spec/isolation_blocking_move_single_shard_commands.spec b/src/test/regress/spec/isolation_blocking_move_single_shard_commands.spec new file mode 100644 index 000000000..f1250010f --- /dev/null +++ b/src/test/regress/spec/isolation_blocking_move_single_shard_commands.spec @@ -0,0 +1,107 @@ +// we use 15 as the partition key value through out the test +// so setting the corresponding shard here is useful +setup 
+{ + SELECT citus_internal.replace_isolation_tester_func(); + SELECT citus_internal.refresh_isolation_tester_prepared_statement(); + + SET citus.shard_count TO 8; + SET citus.shard_replication_factor TO 1; + CREATE TABLE logical_replicate_placement (x int PRIMARY KEY, y int); + SELECT create_distributed_table('logical_replicate_placement', 'x'); + + SELECT get_shard_id_for_distribution_column('logical_replicate_placement', 15) INTO selected_shard; +} + +teardown +{ + SELECT citus_internal.restore_isolation_tester_func(); + + DROP TABLE selected_shard; + DROP TABLE logical_replicate_placement; +} + + +session "s1" + +step "s1-begin" +{ + BEGIN; +} + +step "s1-move-placement" +{ + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); +} + +step "s1-end" +{ + COMMIT; +} + +step "s1-select" +{ + SELECT * FROM logical_replicate_placement order by y; +} + +step "s1-insert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15); +} + +step "s1-get-shard-distribution" +{ + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; +} + +session "s2" + +step "s2-begin" +{ + BEGIN; +} + +step "s2-select" +{ + SELECT * FROM logical_replicate_placement ORDER BY y; +} + +step "s2-insert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15); +} + +step "s2-select-for-update" +{ + SELECT * FROM logical_replicate_placement WHERE x=15 FOR UPDATE; +} + +step "s2-delete" +{ + DELETE FROM logical_replicate_placement WHERE x = 15; +} + +step "s2-update" +{ + UPDATE logical_replicate_placement SET y = y + 1 WHERE x = 15; +} + +step "s2-upsert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15); + + INSERT INTO logical_replicate_placement VALUES (15, 15) ON CONFLICT (x) DO UPDATE SET y = logical_replicate_placement.y + 1; +} + +step "s2-end" +{ + COMMIT; +} + +permutation "s1-begin" "s2-begin" "s2-insert" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-begin" "s2-begin" "s2-upsert" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-update" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-delete" "s1-move-placement" "s2-end" "s1-end" "s1-select" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-select" "s1-move-placement" "s2-end" "s1-end" "s1-get-shard-distribution" +permutation "s1-insert" "s1-begin" "s2-begin" "s2-select-for-update" "s1-move-placement" "s2-end" "s1-end" "s1-get-shard-distribution" + diff --git a/src/test/regress/spec/isolation_blocking_move_single_shard_commands_on_mx.spec b/src/test/regress/spec/isolation_blocking_move_single_shard_commands_on_mx.spec new file mode 100644 index 000000000..d0a3f323f --- /dev/null +++ b/src/test/regress/spec/isolation_blocking_move_single_shard_commands_on_mx.spec @@ -0,0 +1,136 @@ +// 15 as the partition key value through out the test +// so setting the corresponding shard here is useful +setup +{ + CREATE OR REPLACE FUNCTION start_session_level_connection_to_node(text, integer) + RETURNS void + LANGUAGE C STRICT VOLATILE + AS 'citus', $$start_session_level_connection_to_node$$; + + CREATE OR REPLACE FUNCTION run_commands_on_session_level_connection_to_node(text) + RETURNS void + LANGUAGE C 
STRICT VOLATILE + AS 'citus', $$run_commands_on_session_level_connection_to_node$$; + + CREATE OR REPLACE FUNCTION stop_session_level_connection_to_node() + RETURNS void + LANGUAGE C STRICT VOLATILE + AS 'citus', $$stop_session_level_connection_to_node$$; + + SELECT citus_internal.replace_isolation_tester_func(); + SELECT citus_internal.refresh_isolation_tester_prepared_statement(); + + -- start_metadata_sync_to_node can not be run inside a transaction block + -- following is a workaround to overcome that + -- port numbers are hard coded at the moment + SELECT master_run_on_worker( + ARRAY['localhost']::text[], + ARRAY[57636]::int[], + ARRAY[format('SELECT start_metadata_sync_to_node(''%s'', %s)', nodename, nodeport)]::text[], + false) + FROM pg_dist_node; + + SET citus.replication_model to streaming; + SET citus.shard_replication_factor TO 1; + + SET citus.shard_count TO 8; + CREATE TABLE logical_replicate_placement (x int PRIMARY KEY, y int); + SELECT create_distributed_table('logical_replicate_placement', 'x'); + + SELECT get_shard_id_for_distribution_column('logical_replicate_placement', 15) INTO selected_shard; +} + +teardown +{ + DROP TABLE selected_shard; + DROP TABLE logical_replicate_placement; + + SELECT citus_internal.restore_isolation_tester_func(); +} + + +session "s1" + +step "s1-begin" +{ + BEGIN; +} + +step "s1-move-placement" +{ + SELECT master_move_shard_placement((SELECT * FROM selected_shard), 'localhost', 57637, 'localhost', 57638, shard_transfer_mode:='block_writes'); +} + +step "s1-commit" +{ + COMMIT; +} + +step "s1-select" +{ + SELECT * FROM logical_replicate_placement order by y; +} + +step "s1-insert" +{ + INSERT INTO logical_replicate_placement VALUES (15, 15); +} + +step "s1-get-shard-distribution" +{ + select nodeport from pg_dist_placement inner join pg_dist_node on(pg_dist_placement.groupid = pg_dist_node.groupid) where shardid in (SELECT * FROM selected_shard) order by nodeport; +} + +session "s2" + +step "s2-start-session-level-connection" +{ + SELECT start_session_level_connection_to_node('localhost', 57638); +} + +step "s2-begin-on-worker" +{ + SELECT run_commands_on_session_level_connection_to_node('BEGIN'); +} + +step "s2-select" +{ + SELECT run_commands_on_session_level_connection_to_node('SELECT * FROM logical_replicate_placement ORDER BY y'); +} + +step "s2-insert" +{ + SELECT run_commands_on_session_level_connection_to_node('INSERT INTO logical_replicate_placement VALUES (15, 15)'); +} + +step "s2-select-for-update" +{ + SELECT run_commands_on_session_level_connection_to_node('SELECT * FROM logical_replicate_placement WHERE x=15 FOR UPDATE'); +} + +step "s2-delete" +{ + SELECT run_commands_on_session_level_connection_to_node('DELETE FROM logical_replicate_placement WHERE x = 15'); +} + +step "s2-update" +{ + SELECT run_commands_on_session_level_connection_to_node('UPDATE logical_replicate_placement SET y = y + 1 WHERE x = 15'); +} + +step "s2-commit-worker" +{ + SELECT run_commands_on_session_level_connection_to_node('COMMIT'); +} + +step "s2-stop-connection" +{ + SELECT stop_session_level_connection_to_node(); +} + +permutation "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-insert" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-select" "s1-get-shard-distribution" "s2-stop-connection" +permutation "s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-update" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-select" "s1-get-shard-distribution" "s2-stop-connection" +permutation 
"s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-delete" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-select" "s1-get-shard-distribution" "s2-stop-connection" +permutation "s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-select" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-get-shard-distribution" "s2-stop-connection" +permutation "s1-insert" "s1-begin" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-select-for-update" "s1-move-placement" "s2-commit-worker" "s1-commit" "s1-get-shard-distribution" "s2-stop-connection" + diff --git a/src/test/regress/spec/isolation_shard_rebalancer.spec b/src/test/regress/spec/isolation_shard_rebalancer.spec new file mode 100644 index 000000000..ab3e0e6fe --- /dev/null +++ b/src/test/regress/spec/isolation_shard_rebalancer.spec @@ -0,0 +1,114 @@ +setup +{ + SELECT 1 FROM master_add_node('localhost', 57637); + SELECT 1 FROM master_add_node('localhost', 57638); + CREATE TABLE colocated1 (test_id integer NOT NULL, data text); + SELECT create_distributed_table('colocated1', 'test_id', 'hash'); + CREATE TABLE colocated2 (test_id integer NOT NULL, data text); + SELECT create_distributed_table('colocated2', 'test_id', 'hash'); + CREATE TABLE non_colocated (test_id integer NOT NULL, data text); + SELECT create_distributed_table('non_colocated', 'test_id', 'hash', 'none'); +} + +teardown +{ + DROP TABLE non_colocated; + DROP TABLE colocated2; + DROP TABLE colocated1; + SELECT master_set_node_property('localhost', 57638, 'shouldhaveshards', true); +} + +session "s1" + +step "s1-rebalance-c1" +{ + BEGIN; + select rebalance_table_shards('colocated1'); +} + +step "s1-replicate-c1" +{ + BEGIN; + select replicate_table_shards('colocated1'); +} + +step "s1-rebalance-nc" +{ + BEGIN; + select rebalance_table_shards('non_colocated'); +} + +step "s1-replicate-nc" +{ + BEGIN; + select replicate_table_shards('non_colocated'); +} + +step "s1-commit" +{ + COMMIT; +} + +session "s2" + + +step "s2-rebalance-c2" +{ + select rebalance_table_shards('colocated2'); +} + +step "s2-replicate-c2" +{ + select replicate_table_shards('colocated2'); +} + +step "s2-rebalance-nc" +{ + select rebalance_table_shards('non_colocated'); +} + +step "s2-replicate-nc" +{ + select replicate_table_shards('non_colocated'); +} + +step "s2-rebalance-all" +{ + select rebalance_table_shards(); +} + +step "s2-drain" +{ + select master_drain_node('localhost', 57638); +} + + +// disallowed because it's the same table +permutation "s1-rebalance-nc" "s2-rebalance-nc" "s1-commit" +permutation "s1-rebalance-nc" "s2-replicate-nc" "s1-commit" +permutation "s1-replicate-nc" "s2-rebalance-nc" "s1-commit" +permutation "s1-replicate-nc" "s2-replicate-nc" "s1-commit" + +// disallowed because it's the same colocation group +permutation "s1-rebalance-c1" "s2-rebalance-c2" "s1-commit" +permutation "s1-rebalance-c1" "s2-replicate-c2" "s1-commit" +permutation "s1-replicate-c1" "s2-rebalance-c2" "s1-commit" +permutation "s1-replicate-c1" "s2-replicate-c2" "s1-commit" + +// allowed because it's a different colocation group +permutation "s1-rebalance-c1" "s2-rebalance-nc" "s1-commit" +permutation "s1-rebalance-c1" "s2-replicate-nc" "s1-commit" +permutation "s1-replicate-c1" "s2-rebalance-nc" "s1-commit" +permutation "s1-replicate-c1" "s2-replicate-nc" "s1-commit" + +// disallowed because we because colocated1 is part of all +permutation "s1-rebalance-c1" "s2-rebalance-all" "s1-commit" +permutation "s1-replicate-c1" "s2-rebalance-all" 
"s1-commit" +permutation "s1-rebalance-nc" "s2-rebalance-all" "s1-commit" +permutation "s1-replicate-nc" "s2-rebalance-all" "s1-commit" + +// disallowed because we because draining is rebalancing +permutation "s1-rebalance-c1" "s2-drain" "s1-commit" +permutation "s1-replicate-c1" "s2-drain" "s1-commit" +permutation "s1-rebalance-nc" "s2-drain" "s1-commit" +permutation "s1-replicate-nc" "s2-drain" "s1-commit" diff --git a/src/test/regress/sql/foreign_key_to_reference_shard_rebalance.sql b/src/test/regress/sql/foreign_key_to_reference_shard_rebalance.sql new file mode 100644 index 000000000..390ad7357 --- /dev/null +++ b/src/test/regress/sql/foreign_key_to_reference_shard_rebalance.sql @@ -0,0 +1,79 @@ +-- +-- FOREIGN_KEY_TO_REFERENCE_SHARD_REBALANCE +-- + +SET citus.next_shard_id TO 15000000; +CREATE SCHEMA fkey_to_reference_shard_rebalance; +SET search_path to fkey_to_reference_shard_rebalance; +SET citus.shard_replication_factor TO 1; +SET citus.shard_count to 8; + +CREATE TYPE foreign_details AS (name text, relid text, refd_relid text); + +CREATE VIEW table_fkeys_in_workers AS +SELECT +(json_populate_record(NULL::foreign_details, + json_array_elements_text((run_command_on_workers( $$ + SELECT + COALESCE(json_agg(row_to_json(d)), '[]'::json) + FROM + ( + SELECT + distinct name, + relid::regclass::text, + refd_relid::regclass::text + FROM + table_fkey_cols + ) + d $$ )).RESULT::json )::json )).* ; + +-- check if master_move_shard_placement with logical replication creates the +-- foreign constraints properly after moving the shard +CREATE TABLE referenced_table(test_column int, test_column2 int UNIQUE, PRIMARY KEY(test_column)); +CREATE TABLE referencing_table(id int PRIMARY KEY, ref_id int, FOREIGN KEY (id) REFERENCES referenced_table(test_column) ON DELETE CASCADE); +CREATE TABLE referencing_table2(id int, ref_id int, FOREIGN KEY (ref_id) REFERENCES referenced_table(test_column2) ON DELETE CASCADE, FOREIGN KEY (id) REFERENCES referencing_table(id) ON DELETE CASCADE); +SELECT create_reference_table('referenced_table'); +SELECT create_distributed_table('referencing_table', 'id'); +SELECT create_distributed_table('referencing_table2', 'id'); + +INSERT INTO referenced_table SELECT i,i FROM generate_series (0, 100) i; +INSERT INTO referencing_table SELECT i,i FROM generate_series (0, 100) i; +INSERT INTO referencing_table2 SELECT i,i FROM generate_series (0, 100) i; + +SELECT master_move_shard_placement(15000009, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + +SELECT count(*) FROM referencing_table2; + +SELECT * FROM table_fkeys_in_workers WHERE relid LIKE 'fkey_to_reference_shard_rebalance.%' AND refd_relid LIKE 'fkey_to_reference_shard_rebalance.%' ORDER BY 1,2,3; + +SELECT master_move_shard_placement(15000009, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'block_writes'); + +SELECT count(*) FROM referencing_table2; + +SELECT * FROM table_fkeys_in_workers WHERE relid LIKE 'fkey_to_reference_shard_rebalance.%' AND refd_relid LIKE 'fkey_to_reference_shard_rebalance.%' ORDER BY 1,2,3; + +-- create a function to show the +CREATE FUNCTION get_foreign_key_to_reference_table_commands(Oid) + RETURNS SETOF text + LANGUAGE C STABLE STRICT + AS 'citus', $$get_foreign_key_to_reference_table_commands$$; + +CREATE TABLE reference_table_commands (id int UNIQUE); +CREATE TABLE referenceing_dist_table (id int, col1 int, col2 int, col3 int); +SELECT create_reference_table('reference_table_commands'); +SELECT create_distributed_table('referenceing_dist_table', 'id'); +ALTER TABLE 
referenceing_dist_table ADD CONSTRAINT c1 FOREIGN KEY (col1) REFERENCES reference_table_commands(id) ON UPDATE CASCADE; +ALTER TABLE referenceing_dist_table ADD CONSTRAINT c2 FOREIGN KEY (col2) REFERENCES reference_table_commands(id) ON UPDATE CASCADE NOT VALID; +ALTER TABLE referenceing_dist_table ADD CONSTRAINT very_very_very_very_very_very_very_very_very_very_very_very_very_long FOREIGN KEY (col3) REFERENCES reference_table_commands(id) ON UPDATE CASCADE; +SELECT * FROM get_foreign_key_to_reference_table_commands('referenceing_dist_table'::regclass); + +-- and show that rebalancer works fine +SELECT master_move_shard_placement(15000018, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + +\c - - - :worker_2_port + +SELECT conname, contype, convalidated FROM pg_constraint WHERE conrelid = 'fkey_to_reference_shard_rebalance.referenceing_dist_table_15000018'::regclass ORDER BY 1; + +\c - - - :master_port + +DROP SCHEMA fkey_to_reference_shard_rebalance CASCADE; diff --git a/src/test/regress/sql/master_copy_shard_placement.sql b/src/test/regress/sql/master_copy_shard_placement.sql index 6d0f2234c..37396cdf6 100644 --- a/src/test/regress/sql/master_copy_shard_placement.sql +++ b/src/test/regress/sql/master_copy_shard_placement.sql @@ -36,21 +36,24 @@ SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('data', 'key-1'), 'localhost', :worker_1_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); -- verify we error out if source and destination are the same SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('data', 'key-1'), 'localhost', :worker_2_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); -- verify we error out if target already contains a healthy placement SELECT master_copy_shard_placement( (SELECT shardid FROM pg_dist_shard WHERE logicalrelid='ref_table'::regclass::oid), 'localhost', :worker_1_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); -- verify we error out if table has foreign key constraints INSERT INTO ref_table SELECT 1, value FROM data; @@ -69,7 +72,8 @@ SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('data', 'key-1'), 'localhost', :worker_2_port, 'localhost', :worker_1_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); -- forcefully mark the old replica as inactive UPDATE pg_dist_shard_placement SET shardstate = 3 @@ -95,7 +99,8 @@ SELECT master_copy_shard_placement( get_shard_id_for_distribution_column('mx_table', '1'), 'localhost', :worker_1_port, 'localhost', :worker_2_port, - do_repair := false); + do_repair := false, + transfer_mode := 'block_writes'); SELECT stop_metadata_sync_to_node('localhost', :worker_1_port); diff --git a/src/test/regress/sql/multi_colocated_shard_rebalance.sql b/src/test/regress/sql/multi_colocated_shard_rebalance.sql new file mode 100644 index 000000000..46de57776 --- /dev/null +++ b/src/test/regress/sql/multi_colocated_shard_rebalance.sql @@ -0,0 +1,336 @@ +-- +-- MULTI_COLOCATED_SHARD_REBALANCE +-- + +ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 13000000; + +SET citus.shard_count TO 6; +SET citus.shard_replication_factor TO 1; + +-- create distributed tables +CREATE TABLE table1_group1 ( id int PRIMARY KEY); +SELECT create_distributed_table('table1_group1', 'id', 'hash'); + +CREATE TABLE table2_group1 ( id int ); +SELECT 
create_distributed_table('table2_group1', 'id', 'hash'); + +SET citus.shard_count TO 8; +CREATE TABLE table5_groupX ( id int ); +SELECT create_distributed_table('table5_groupX', 'id', 'hash'); + +CREATE TABLE table6_append ( id int ); +SELECT master_create_distributed_table('table6_append', 'id', 'append'); +SELECT master_create_empty_shard('table6_append'); +SELECT master_create_empty_shard('table6_append'); + +-- Mark tables as non-mx tables, in order to be able to test master_copy_shard_placement +UPDATE pg_dist_partition SET repmodel='c' WHERE logicalrelid IN + ('table1_group1'::regclass, 'table2_group1'::regclass, 'table5_groupX'::regclass); + +-- test copy + +-- test copying colocated shards +-- status before shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + +-- copy colocated shards +SELECT master_copy_shard_placement(13000000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false); + +-- status after shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + +-- also connect worker to verify we successfully copied given shard (and other colocated shards) +\c - - - :worker_2_port +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table1_group1_13000000'::regclass; +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table2_group1_13000006'::regclass; +\c - - - :master_port + +-- copy colocated shards again to see error message +SELECT master_copy_shard_placement(13000000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false, 'force_logical'); + + +-- test copying NOT colocated shard +-- status before shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + +-- copy NOT colocated shard +SELECT master_copy_shard_placement(13000012, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false); + +-- status after shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + + +-- test copying shard in append distributed table +-- status before shard copy +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + +-- copy shard in append distributed table +SELECT master_copy_shard_placement(13000020, 'localhost', :worker_2_port, 'localhost', :worker_1_port, false, 'force_logical'); + +-- status after shard copy +SELECT s.shardid, 
s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + + +-- test move + +-- test moving colocated shards +-- status before shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + +-- try force_logical +SELECT master_move_shard_placement(13000001, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'force_logical'); + +-- move colocated shards +SELECT master_move_shard_placement(13000001, 'localhost', :worker_2_port, 'localhost', :worker_1_port); + +-- status after shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass) +ORDER BY s.shardid, sp.nodeport; + +-- also connect worker to verify we successfully moved given shard (and other colocated shards) +\c - - - :worker_1_port +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table1_group1_13000001'::regclass; +SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table2_group1_13000007'::regclass; +\c - - - :master_port + + +-- test moving NOT colocated shard +-- status before shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + +-- move NOT colocated shard +SELECT master_move_shard_placement(13000013, 'localhost', :worker_2_port, 'localhost', :worker_1_port); + +-- status after shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table5_groupX'::regclass +ORDER BY s.shardid, sp.nodeport; + + +-- test moving shard in append distributed table +-- status before shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + +-- move shard in append distributed table +SELECT master_move_shard_placement(13000021, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + +-- status after shard move +SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport +FROM + pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp +WHERE + p.logicalrelid = s.logicalrelid AND + s.shardid = sp.shardid AND + p.logicalrelid = 'table6_append'::regclass +ORDER BY s.shardid, sp.nodeport; + + +-- try to move shard from wrong node +SELECT master_move_shard_placement(13000021, 'localhost', :worker_1_port, 'localhost', :worker_2_port); + + +-- test shard move with foreign constraints +DROP TABLE IF 
EXISTS table1_group1, table2_group1;
+
+SET citus.shard_count TO 6;
+SET citus.shard_replication_factor TO 1;
+
+-- create distributed tables
+CREATE TABLE table1_group1 ( id int PRIMARY KEY);
+SELECT create_distributed_table('table1_group1', 'id', 'hash');
+
+CREATE TABLE table2_group1 ( id int, table1_id int, FOREIGN KEY(table1_id) REFERENCES table1_group1(id));
+SELECT create_distributed_table('table2_group1', 'table1_id', 'hash');
+
+-- Mark the tables as non-mx tables
+UPDATE pg_dist_partition SET repmodel='c' WHERE logicalrelid IN
+    ('table1_group1'::regclass, 'table2_group1'::regclass);
+
+-- status before shard rebalance
+SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport
+FROM
+    pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp
+WHERE
+    p.logicalrelid = s.logicalrelid AND
+    s.shardid = sp.shardid AND
+    colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass)
+ORDER BY s.shardid, sp.nodeport;
+
+SELECT master_move_shard_placement(13000022, 'localhost', :worker_1_port, 'localhost', :worker_2_port, 'block_writes');
+
+-- status after shard rebalance
+SELECT s.shardid, s.logicalrelid::regclass, sp.nodeport
+FROM
+    pg_dist_partition p, pg_dist_shard s, pg_dist_shard_placement sp
+WHERE
+    p.logicalrelid = s.logicalrelid AND
+    s.shardid = sp.shardid AND
+    colocationid = (SELECT colocationid FROM pg_dist_partition WHERE logicalrelid = 'table1_group1'::regclass)
+ORDER BY s.shardid, sp.nodeport;
+
+-- also connect worker to verify we successfully moved given shard (and other colocated shards)
+\c - - - :worker_2_port
+SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table1_group1_13000022'::regclass;
+SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='public.table2_group1_13000028'::regclass;
+
+-- make sure that we've created the foreign keys
+SELECT "Constraint", "Definition" FROM table_fkeys
+    WHERE "Constraint" LIKE 'table2_group%' OR "Constraint" LIKE 'table1_group%';
+
+\c - - - :master_port
+
+
+-- test shard copy with foreign constraints
+-- we expect it to error out because we do not support foreign constraints with replication factor > 1
+SELECT master_copy_shard_placement(13000022, 'localhost', :worker_2_port, 'localhost', :worker_1_port, false);
+
+
+-- let's also test that master_move_shard_placement doesn't break serials
+CREATE TABLE serial_move_test (key int, other_val serial);
+SET citus.shard_replication_factor TO 1;
+
+SELECT create_distributed_table('serial_move_test', 'key');
+
+-- key 15 goes to shard 13000035
+INSERT INTO serial_move_test (key) VALUES (15) RETURNING *;
+INSERT INTO serial_move_test (key) VALUES (15) RETURNING *;
+
+-- confirm the shard id
+SELECT * FROM run_command_on_placements('serial_move_test', 'SELECT DISTINCT key FROM %s WHERE key = 15') WHERE result = '15' AND shardid = 13000034;
+
+SELECT master_move_shard_placement(13000034, 'localhost', :worker_1_port, 'localhost', :worker_2_port);
+
+-- confirm the successful move
+SELECT * FROM run_command_on_placements('serial_move_test', 'SELECT DISTINCT key FROM %s WHERE key = 15') WHERE result = '15' AND shardid = 13000034;
+
+-- finally show that serials work fine afterwards
+INSERT INTO serial_move_test (key) VALUES (15) RETURNING *;
+INSERT INTO serial_move_test (key) VALUES (15) RETURNING *;
+
+-- we should be able to move shard placements of partitioned tables
+CREATE SCHEMA move_partitions;
+CREATE TABLE move_partitions.events (
+    id serial,
+    t timestamptz default now(),
+    payload text
+) +PARTITION BY RANGE(t); + +SET citus.shard_count TO 6; +SELECT create_distributed_table('move_partitions.events', 'id', colocate_with := 'none'); + +CREATE TABLE move_partitions.events_1 PARTITION OF move_partitions.events +FOR VALUES FROM ('2015-01-01') TO ('2016-01-01'); + +INSERT INTO move_partitions.events (t, payload) +SELECT '2015-01-01'::date + (interval '1 day' * s), s FROM generate_series(1, 100) s; + +SELECT count(*) FROM move_partitions.events; + +-- try to move automatically +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port) +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'move_partitions.events'::regclass AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + +SELECT count(*) FROM move_partitions.events; + +-- add a primary key to the partition +ALTER TABLE move_partitions.events_1 ADD CONSTRAINT e_1_pk PRIMARY KEY (id); + +-- should be able to move automatically now +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port) +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'move_partitions.events'::regclass AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + +SELECT count(*) FROM move_partitions.events; + +-- should also be able to move with block writes +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'block_writes') +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'move_partitions.events'::regclass AND nodeport = :worker_2_port +ORDER BY shardid LIMIT 1; + +SELECT count(*) FROM move_partitions.events; + +-- should have moved all shards to node 1 (2*6 = 12) +SELECT count(*) +FROM pg_dist_shard JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid::text LIKE 'move_partitions.events%' AND nodeport = :worker_1_port; + +DROP TABLE move_partitions.events; diff --git a/src/test/regress/sql/multi_move_mx.sql b/src/test/regress/sql/multi_move_mx.sql new file mode 100644 index 000000000..c317a08d7 --- /dev/null +++ b/src/test/regress/sql/multi_move_mx.sql @@ -0,0 +1,144 @@ +-- +-- MULTI_MOVE_MX +-- +ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 1550000; + +SELECT start_metadata_sync_to_node('localhost', :worker_2_port); + +-- Create mx test tables +SET citus.shard_count TO 4; +SET citus.shard_replication_factor TO 1; +SET citus.replication_model TO 'streaming'; + +CREATE TABLE mx_table_1 (a int); +SELECT create_distributed_table('mx_table_1', 'a'); + +CREATE TABLE mx_table_2 (a int); +SELECT create_distributed_table('mx_table_2', 'a'); + +CREATE TABLE mx_table_3 (a text); +SELECT create_distributed_table('mx_table_3', 'a'); + +-- Check that the first two tables are colocated +SELECT + logicalrelid, repmodel +FROM + pg_dist_partition +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid; + +-- Check the list of shards +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + +-- Check the data on the worker +\c - - - :worker_2_port +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 
'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + +\c - - - :master_port +-- Check that master_copy_shard_placement cannot be run with MX tables +SELECT + master_copy_shard_placement(shardid, 'localhost', :worker_1_port, 'localhost', :worker_2_port, false, 'force_logical') +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_1_port +ORDER BY + shardid +LIMIT 1; + +-- Move a shard from worker 1 to worker 2 +SELECT + master_move_shard_placement(shardid, 'localhost', :worker_1_port, 'localhost', :worker_2_port) +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_1_port +ORDER BY + shardid +LIMIT 1; + +-- Check that the shard and its colocated shard is moved, but not the other shards +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + +-- Check that the changes are made in the worker as well +\c - - - :worker_2_port +SELECT + logicalrelid, shardid, nodename, nodeport +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + OR logicalrelid = 'mx_table_2'::regclass + OR logicalrelid = 'mx_table_3'::regclass +ORDER BY + logicalrelid, shardid; + +-- Check that the UDFs cannot be called from the workers +SELECT + master_copy_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, false, 'force_logical') +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_2_port +ORDER BY + shardid +LIMIT 1 OFFSET 1; + +SELECT + master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'force_logical') +FROM + pg_dist_shard NATURAL JOIN pg_dist_shard_placement +WHERE + logicalrelid = 'mx_table_1'::regclass + AND nodeport = :worker_2_port +ORDER BY + shardid +LIMIT 1 OFFSET 1; + +-- Cleanup +\c - - - :master_port +DROP TABLE mx_table_1; +DROP TABLE mx_table_2; +DROP TABLE mx_table_3; +SELECT stop_metadata_sync_to_node('localhost', :worker_2_port); +\c - - - :worker_2_port +DELETE FROM pg_dist_node; +DELETE FROM pg_dist_partition; +DELETE FROM pg_dist_shard; +DELETE FROM pg_dist_shard_placement; +\c - - - :master_port +RESET citus.replication_model; diff --git a/src/test/regress/sql/multi_test_helpers_superuser.sql b/src/test/regress/sql/multi_test_helpers_superuser.sql index aa7b3ee66..a50d1d3cd 100644 --- a/src/test/regress/sql/multi_test_helpers_superuser.sql +++ b/src/test/regress/sql/multi_test_helpers_superuser.sql @@ -1,3 +1,10 @@ +CREATE OR REPLACE FUNCTION master_defer_delete_shards() + RETURNS int + LANGUAGE C STRICT + AS 'citus', $$master_defer_delete_shards$$; +COMMENT ON FUNCTION master_defer_delete_shards() + IS 'remove orphaned shards'; + CREATE OR REPLACE FUNCTION wait_until_metadata_sync(timeout INTEGER DEFAULT 15000) RETURNS void LANGUAGE C STRICT diff --git a/src/test/regress/sql/multi_utility_warnings.sql b/src/test/regress/sql/multi_utility_warnings.sql index 296e4f3c2..3a7b0a910 100644 --- a/src/test/regress/sql/multi_utility_warnings.sql +++ b/src/test/regress/sql/multi_utility_warnings.sql @@ -21,4 +21,3 @@ BEGIN; INSERT INTO 
pg_dist_node VALUES (1234567890, 1234567890, 'localhost', 5432);
 INSERT INTO pg_dist_poolinfo VALUES (1234567890, 'port=1234');
 ROLLBACK;
-INSERT INTO pg_dist_rebalance_strategy VALUES ('should fail', false, 'citus_shard_cost_1', 'citus_node_capacity_1', 'citus_shard_allowed_on_node_true', 0, 0);
diff --git a/src/test/regress/sql/shard_move_deferred_delete.sql b/src/test/regress/sql/shard_move_deferred_delete.sql
new file mode 100644
index 000000000..1d5d38ffa
--- /dev/null
+++ b/src/test/regress/sql/shard_move_deferred_delete.sql
@@ -0,0 +1,61 @@
+--
+-- SHARD_MOVE_DEFERRED_DELETE
+--
+
+SET citus.next_shard_id TO 20000000;
+
+SET citus.shard_count TO 6;
+SET citus.shard_replication_factor TO 1;
+SET citus.defer_drop_after_shard_move TO on;
+
+CREATE SCHEMA shard_move_deferred_delete;
+SET search_path TO shard_move_deferred_delete;
+
+CREATE TABLE t1 ( id int PRIMARY KEY);
+SELECT create_distributed_table('t1', 'id');
+
+-- by counting how often we see the specific shard on all workers we can verify whether the shard is there
+SELECT run_command_on_workers($cmd$
+    SELECT count(*) FROM pg_class WHERE relname = 't1_20000000';
+$cmd$);
+
+-- move shard
+SELECT master_move_shard_placement(20000000, 'localhost', :worker_1_port, 'localhost', :worker_2_port);
+
+-- we expect the shard to be on both workers now
+SELECT run_command_on_workers($cmd$
+    SELECT count(*) FROM pg_class WHERE relname = 't1_20000000';
+$cmd$);
+
+-- execute delayed removal
+SELECT public.master_defer_delete_shards();
+
+-- we expect the shard to be on only the second worker
+SELECT run_command_on_workers($cmd$
+    SELECT count(*) FROM pg_class WHERE relname = 't1_20000000';
+$cmd$);
+
+SELECT master_move_shard_placement(20000000, 'localhost', :worker_2_port, 'localhost', :worker_1_port);
+
+-- we expect the shard to be on both workers now
+SELECT run_command_on_workers($cmd$
+    SELECT count(*) FROM pg_class WHERE relname = 't1_20000000';
+$cmd$);
+
+-- enable auto delete
+ALTER SYSTEM SET citus.defer_shard_delete_interval TO 10;
+SELECT pg_reload_conf();
+
+-- Sleep 1 second to give Valgrind enough time to clear transactions
+SELECT pg_sleep(1);
+
+-- we expect the shard to be on only the first worker
+SELECT run_command_on_workers($cmd$
+    SELECT count(*) FROM pg_class WHERE relname = 't1_20000000';
+$cmd$);
+
+-- reset test suite
+ALTER SYSTEM SET citus.defer_shard_delete_interval TO -1;
+SELECT pg_reload_conf();
+
+DROP SCHEMA shard_move_deferred_delete CASCADE;
diff --git a/src/test/regress/sql/shard_rebalancer.sql b/src/test/regress/sql/shard_rebalancer.sql
new file mode 100644
index 000000000..02c56b103
--- /dev/null
+++ b/src/test/regress/sql/shard_rebalancer.sql
@@ -0,0 +1,1148 @@
+--
+-- MULTI_SHARD_REBALANCER
+--
+
+CREATE TABLE dist_table_test(a int primary key);
+SELECT create_distributed_table('dist_table_test', 'a');
+CREATE TABLE ref_table_test(a int primary key);
+SELECT create_reference_table('ref_table_test');
+
+-- make sure that all rebalance operations work fine when
+-- reference tables are replicated to the coordinator
+SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0);
+
+-- should just be no-ops even if we add the coordinator to pg_dist_node
+SELECT rebalance_table_shards('dist_table_test');
+SELECT rebalance_table_shards();
+
+-- test that calling rebalance_table_shards without specifying relation
+-- wouldn't move the shard of the citus local table.
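+-- (citus local tables have a single shard that lives on the coordinator, so both the
+-- rebalancer and master_drain_node are expected to leave it in place, as checked below)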
+CREATE TABLE citus_local_table(a int, b int); +SELECT create_citus_local_table('citus_local_table'); +INSERT INTO citus_local_table VALUES (1, 2); + +SELECT rebalance_table_shards(); + +-- show that citus local table shard is still on the coordinator +SELECT tablename FROM pg_catalog.pg_tables where tablename like 'citus_local_table_%'; +-- also check that we still can access shard relation, not the shell table +SELECT count(*) FROM citus_local_table; + +SELECT master_drain_node('localhost', :master_port); + +-- show that citus local table shard is still on the coordinator +SELECT tablename FROM pg_catalog.pg_tables where tablename like 'citus_local_table_%'; +-- also check that we still can access shard relation, not the shell table +SELECT count(*) FROM citus_local_table; + +-- show that we do not create a shard rebalancing plan for citus local table +SELECT get_rebalance_table_shards_plan(); + +DROP TABLE citus_local_table; + +CREATE TABLE dist_table_test_2(a int); +SET citus.shard_count TO 4; + +SET citus.shard_replication_factor TO 1; +SET citus.replication_model TO "statement"; +SELECT create_distributed_table('dist_table_test_2', 'a'); + +-- replicate reference table should ignore the coordinator +SET citus.shard_replication_factor TO 2; +SELECT replicate_table_shards('dist_table_test_2', max_shard_copies := 4, shard_transfer_mode:='block_writes'); + +DROP TABLE dist_table_test, dist_table_test_2, ref_table_test; +RESET citus.shard_count; +RESET citus.shard_replication_factor; +RESET citus.replication_model; + +-- Create a user to test multiuser usage of rebalancer functions +CREATE USER testrole; +GRANT ALL ON SCHEMA public TO testrole; + +CREATE OR REPLACE FUNCTION shard_placement_rebalance_array( + worker_node_list json[], + shard_placement_list json[], + threshold float4 DEFAULT 0, + max_shard_moves int DEFAULT 1000000, + drain_only bool DEFAULT false +) +RETURNS json[] +AS 'citus' +LANGUAGE C STRICT VOLATILE; + + +CREATE FUNCTION shard_placement_replication_array(worker_node_list json[], + shard_placement_list json[], + shard_replication_factor int) +RETURNS json[] +AS 'citus' +LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION worker_node_responsive(worker_node_name text, worker_node_port int) +RETURNS boolean +AS 'citus' +LANGUAGE C STRICT VOLATILE; + +SET citus.next_shard_id TO 123000; + +SELECT worker_node_responsive(node_name, node_port::int) + FROM master_get_active_worker_nodes() + ORDER BY node_name, node_port ASC; + +-- Check that worker_node_responsive returns false for dead nodes +-- Note that PostgreSQL tries all possible resolutions of localhost on failing +-- connections. This causes different error details to be printed on different +-- environments. Therefore, we first set verbosity to terse. + +\set VERBOSITY terse + +SELECT worker_node_responsive('localhost', 1); + +\set VERBOSITY default + +-- Check that with threshold=0.0 shard_placement_rebalance_array returns enough +-- moves to make the cluster completely balanced. 
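+-- (the threshold argument controls how much node imbalance is tolerated: the checks
+-- below show that 0.0 rebalances completely, while 1.0 and 2.0 can leave even a
+-- completely unbalanced cluster untouched)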
+ +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[], + 0.0 +)); + +-- Check that with two nodes and threshold=1.0 shard_placement_rebalance_array +-- doesn't return any moves, even if it is completely unbalanced. + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 1.0 +)); + +-- Check that with three nodes and threshold=1.0 +-- shard_placement_rebalance_array returns moves when it is completely unbalanced +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 1.0 +)); + + +-- Check that with with three nodes and threshold=2.0 +-- shard_placement_rebalance_array doesn't return any moves, even if it is +-- completely unbalanced. (with three nodes) + + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 2.0 +)); + +-- Check that with threshold=0.0 shard_placement_rebalance_array doesn't return +-- any moves if the cluster is already balanced. 
+ +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[], + 0.0 +)); + +-- Check that shard_placement_replication_array returns a shard copy operation +-- for each of the shards in an inactive node. + +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":4, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}']::json[], + 2 +)); + +-- Check that shard_placement_replication_array returns a shard copy operation +-- for each of the inactive shards. + +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":3, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":1, "shardstate":3, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[], + 2 +)); + +-- Check that shard_placement_replication_array errors out if all placements of +-- a shard are placed on inactive nodes. + +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}']::json[], + 2 +)); + +-- Check that shard_placement_replication_array errors out if replication factor +-- is more than number of active nodes. 
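+-- (this is expected to fail because each replica of a shard must be placed on a
+-- distinct active node)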
+ +SELECT unnest(shard_placement_replication_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[], + 2 +)); + +-- Ensure that shard_replication_factor is 2 during replicate_table_shards +-- and rebalance_table_shards tests + +SET citus.shard_replication_factor TO 2; + +-- Turn off NOTICE messages + +SET client_min_messages TO WARNING; + +-- Create a single-row test data for shard rebalancer test shards + +CREATE TABLE shard_rebalancer_test_data AS SELECT 1::int as int_column; + +-- Test replicate_table_shards, which will in turn test update_shard_placement +-- in copy mode. + +CREATE TABLE replication_test_table(int_column int); +SELECT master_create_distributed_table('replication_test_table', 'int_column', 'append'); + +CREATE VIEW replication_test_table_placements_per_node AS + SELECT count(*) FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass + GROUP BY nodename, nodeport + ORDER BY nodename, nodeport; + +-- Create four shards with replication factor 2, and delete the placements +-- with smaller port number to simulate under-replicated shards. + +SELECT count(master_create_empty_shard('replication_test_table')) + FROM generate_series(1, 4); + +DELETE FROM pg_dist_shard_placement WHERE placementid in ( + SELECT pg_dist_shard_placement.placementid + FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass + AND (nodename, nodeport) = (SELECT nodename, nodeport FROM pg_dist_shard_placement + ORDER BY nodename, nodeport limit 1) +); + +-- Upload the test data to the shards + +SELECT count(master_append_table_to_shard(shardid, 'shard_rebalancer_test_data', + host(inet_server_addr()), inet_server_port())) + FROM pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass; + +-- Verify that there is one node with all placements + +SELECT * FROM replication_test_table_placements_per_node; + +-- Check excluded_shard_list by excluding three shards with smaller ids + +SELECT replicate_table_shards('replication_test_table', + excluded_shard_list := excluded_shard_list, + shard_transfer_mode:='block_writes') + FROM ( + SELECT (array_agg(DISTINCT shardid ORDER BY shardid))[1:3] AS excluded_shard_list + FROM pg_dist_shard + WHERE logicalrelid = 'replication_test_table'::regclass + ) T; + +SELECT * FROM replication_test_table_placements_per_node; + +-- Check that with shard_replication_factor=1 we don't do any copies + +SELECT replicate_table_shards('replication_test_table', + shard_replication_factor := 1, + shard_transfer_mode:='block_writes'); + +SELECT * FROM replication_test_table_placements_per_node; + +-- Check that max_shard_copies limits number of copy operations + +SELECT replicate_table_shards('replication_test_table', + max_shard_copies := 2, + shard_transfer_mode:='block_writes'); + +SELECT * FROM replication_test_table_placements_per_node; + +-- Replicate the remaining under-replicated shards + +SELECT replicate_table_shards('replication_test_table'); + +SELECT * FROM replication_test_table_placements_per_node; + +-- Check that querying the table doesn't error out + +SELECT count(*) FROM replication_test_table; + +DROP TABLE public.replication_test_table CASCADE; + +-- Test rebalance_table_shards, which will in turn test update_shard_placement +-- in move mode. 
+ +CREATE TABLE rebalance_test_table(int_column int); +SELECT master_create_distributed_table('rebalance_test_table', 'int_column', 'append'); + +CREATE VIEW table_placements_per_node AS +SELECT nodeport, logicalrelid::regclass, count(*) +FROM pg_dist_shard_placement NATURAL JOIN pg_dist_shard +GROUP BY logicalrelid::regclass, nodename, nodeport +ORDER BY logicalrelid::regclass, nodename, nodeport; + +-- Create six shards with replication factor 1 and move them to the same +-- node to create an unbalanced cluster. + +CREATE PROCEDURE create_unbalanced_shards(rel text) +LANGUAGE SQL +AS $$ + SET citus.shard_replication_factor TO 1; + + SELECT count(master_create_empty_shard(rel)) + FROM generate_series(1, 6); + + SELECT count(master_move_shard_placement(shardid, + src.nodename, src.nodeport::int, + dst.nodename, dst.nodeport::int, + shard_transfer_mode:='block_writes')) + FROM pg_dist_shard s JOIN + pg_dist_shard_placement src USING (shardid), + (SELECT nodename, nodeport FROM pg_dist_shard_placement ORDER BY nodeport DESC LIMIT 1) dst + WHERE src.nodeport < dst.nodeport AND s.logicalrelid = rel::regclass; +$$; + +CALL create_unbalanced_shards('rebalance_test_table'); + +SET citus.shard_replication_factor TO 2; + +-- Upload the test data to the shards + +SELECT count(master_append_table_to_shard(shardid, 'shard_rebalancer_test_data', + host(inet_server_addr()), inet_server_port())) +FROM pg_dist_shard +WHERE logicalrelid = 'rebalance_test_table'::regclass; + +-- Verify that there is one node with all placements + +SELECT * FROM table_placements_per_node; + +-- Check excluded_shard_list by excluding four shards with smaller ids + +SELECT rebalance_table_shards('rebalance_test_table', + excluded_shard_list := excluded_shard_list, + threshold := 0, + shard_transfer_mode:='block_writes') +FROM ( + SELECT (array_agg(DISTINCT shardid ORDER BY shardid))[1:4] AS excluded_shard_list + FROM pg_dist_shard + WHERE logicalrelid = 'rebalance_test_table'::regclass +) T; + +SELECT * FROM table_placements_per_node; + +-- Check that max_shard_moves limits number of move operations + +-- First check that we error if not table owner +SET ROLE testrole; +SELECT rebalance_table_shards('rebalance_test_table', + threshold := 0, max_shard_moves := 1, + shard_transfer_mode:='block_writes'); +RESET ROLE; + +SELECT rebalance_table_shards('rebalance_test_table', + threshold := 0, max_shard_moves := 1, + shard_transfer_mode:='block_writes'); + +SELECT * FROM table_placements_per_node; + +-- Check that threshold=1 doesn't move any shards + +SELECT rebalance_table_shards('rebalance_test_table', threshold := 1, shard_transfer_mode:='block_writes'); + +SELECT * FROM table_placements_per_node; + +-- Move the remaining shards using threshold=0 + +SELECT rebalance_table_shards('rebalance_test_table', threshold := 0); + +SELECT * FROM table_placements_per_node; + +-- Check that shard is completely balanced and rebalancing again doesn't have +-- any effects. 
+
+SELECT rebalance_table_shards('rebalance_test_table', threshold := 0, shard_transfer_mode:='block_writes');
+
+SELECT * FROM table_placements_per_node;
+
+-- Check that querying the table doesn't error out
+
+SELECT count(*) FROM rebalance_test_table;
+
+DROP TABLE rebalance_test_table;
+
+-- Test schema support
+
+
+CREATE SCHEMA test_schema_support;
+
+SELECT COUNT(*) FROM pg_dist_shard_placement;
+
+CREATE TABLE test_schema_support.nation_hash (
+    n_nationkey integer not null,
+    n_name char(25) not null,
+    n_regionkey integer not null,
+    n_comment varchar(152)
+);
+
+SELECT master_create_distributed_table('test_schema_support.nation_hash', 'n_nationkey', 'hash');
+SELECT master_create_worker_shards('test_schema_support.nation_hash', 4, 1);
+
+CREATE TABLE test_schema_support.nation_hash2 (
+    n_nationkey integer not null,
+    n_name char(25) not null,
+    n_regionkey integer not null,
+    n_comment varchar(152)
+);
+
+SELECT master_create_distributed_table('test_schema_support.nation_hash2', 'n_nationkey', 'hash');
+SELECT master_create_worker_shards('test_schema_support.nation_hash2', 4, 1);
+
+-- Shard count before replication
+SELECT COUNT(*) FROM pg_dist_shard_placement;
+
+SET search_path TO public;
+SELECT replicate_table_shards('test_schema_support.nation_hash', shard_transfer_mode:='block_writes');
+
+-- Confirm replication
+SELECT COUNT(*) FROM pg_dist_shard_placement;
+
+-- Test with search_path set
+SET search_path TO test_schema_support;
+SELECT replicate_table_shards('nation_hash2', shard_transfer_mode:='block_writes');
+
+-- Confirm replication
+SELECT COUNT(*) FROM pg_dist_shard_placement;
+
+DROP TABLE test_schema_support.nation_hash;
+DROP TABLE test_schema_support.nation_hash2;
+
+-- Test rebalancer with schema
+-- The next few operations create an imbalanced distributed table
+
+CREATE TABLE test_schema_support.imbalanced_table_local (
+    id integer not null
+);
+INSERT INTO test_schema_support.imbalanced_table_local VALUES(1);
+INSERT INTO test_schema_support.imbalanced_table_local VALUES(2);
+INSERT INTO test_schema_support.imbalanced_table_local VALUES(3);
+INSERT INTO test_schema_support.imbalanced_table_local VALUES(4);
+
+CREATE TABLE test_schema_support.imbalanced_table (
+    id integer not null
+);
+
+SELECT master_create_distributed_table('test_schema_support.imbalanced_table', 'id', 'append');
+
+SET citus.shard_replication_factor TO 1;
+SELECT * from master_create_empty_shard('test_schema_support.imbalanced_table');
+SELECT master_append_table_to_shard(123018, 'test_schema_support.imbalanced_table_local', 'localhost', :master_port);
+
+SET citus.shard_replication_factor TO 2;
+SELECT * from master_create_empty_shard('test_schema_support.imbalanced_table');
+SELECT master_append_table_to_shard(123019, 'test_schema_support.imbalanced_table_local', 'localhost', :master_port);
+
+SET citus.shard_replication_factor TO 1;
+SELECT * from master_create_empty_shard('test_schema_support.imbalanced_table');
+SELECT master_append_table_to_shard(123020, 'test_schema_support.imbalanced_table_local', 'localhost', :master_port);
+
+-- imbalanced_table is now imbalanced
+
+-- Shard counts in each node before rebalance
+SELECT * FROM public.table_placements_per_node;
+
+-- Row count in imbalanced table before rebalance
+SELECT COUNT(*) FROM imbalanced_table;
+
+-- Try force_logical
+SELECT rebalance_table_shards('imbalanced_table', threshold:=0, shard_transfer_mode:='force_logical');
+
+-- Test rebalance operation
+SELECT rebalance_table_shards('imbalanced_table', threshold:=0, 
shard_transfer_mode:='block_writes'); + +-- Confirm rebalance +-- Shard counts in each node after rebalance +SELECT * FROM public.table_placements_per_node; + +-- Row count in imbalanced table after rebalance +SELECT COUNT(*) FROM imbalanced_table; + +DROP TABLE public.shard_rebalancer_test_data; +DROP TABLE test_schema_support.imbalanced_table; +DROP TABLE test_schema_support.imbalanced_table_local; + +SET citus.shard_replication_factor TO 1; + +CREATE TABLE colocated_rebalance_test(id integer); +CREATE TABLE colocated_rebalance_test2(id integer); +SELECT create_distributed_table('colocated_rebalance_test', 'id'); + +-- Move all shards to worker1 +SELECT master_move_shard_placement(shardid, 'localhost', :worker_2_port, 'localhost', :worker_1_port, 'block_writes') +FROM pg_dist_shard_placement +WHERE nodeport = :worker_2_port; + + +SELECT create_distributed_table('colocated_rebalance_test2', 'id'); + +-- Confirm all shards for both tables are on worker1 +SELECT * FROM public.table_placements_per_node; + +-- Confirm that the plan for drain_only doesn't show any moves +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', threshold := 0, drain_only := true); +-- Running with drain_only shouldn't do anything +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes', drain_only := true); + +-- Confirm that nothing changed +SELECT * FROM public.table_placements_per_node; + +-- Confirm that the plan shows 2 shards of both tables moving back to worker2 +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', threshold := 0); +-- Confirm that this also happens when using rebalancing by disk size even if the tables are empty +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', rebalance_strategy := 'by_disk_size'); +-- Check that we can call this function +SELECT * FROM get_rebalance_progress(); +-- Actually do the rebalance +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); +-- Check that we can call this function without a crash +SELECT * FROM get_rebalance_progress(); + +-- Confirm that the nodes are now there +SELECT * FROM public.table_placements_per_node; + + +CREATE TABLE non_colocated_rebalance_test(id integer); +SELECT create_distributed_table('non_colocated_rebalance_test', 'id', colocate_with := 'none'); +-- confirm that both colocation groups are balanced +SELECT * FROM public.table_placements_per_node; + +-- testing behaviour when setting isdatanode to 'marked for draining' +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', false); + +SELECT * FROM get_rebalance_table_shards_plan('colocated_rebalance_test', threshold := 0); +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + +SELECT * FROM get_rebalance_table_shards_plan('non_colocated_rebalance_test', threshold := 0); +SELECT * FROM rebalance_table_shards('non_colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + +SELECT * FROM rebalance_table_shards('colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; +SELECT * FROM 
rebalance_table_shards('non_colocated_rebalance_test', threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + +-- testing behaviour when setting shouldhaveshards to false and rebalancing all +-- colocation groups with drain_only=true +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', false); +SELECT * FROM get_rebalance_table_shards_plan(threshold := 0, drain_only := true); +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes', drain_only := true); +SELECT * FROM public.table_placements_per_node; + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + +-- testing behaviour when setting shouldhaveshards to false and rebalancing all +-- colocation groups with drain_only=false +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', false); +SELECT * FROM get_rebalance_table_shards_plan(threshold := 0); +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + +-- Make it a data node again +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); + +-- testing behaviour of master_drain_node +SELECT * from master_drain_node('localhost', :worker_2_port, shard_transfer_mode := 'block_writes'); +select shouldhaveshards from pg_dist_node where nodeport = :worker_2_port; +SELECT * FROM public.table_placements_per_node; + +-- Put shards back +SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true); +SELECT * FROM rebalance_table_shards(threshold := 0, shard_transfer_mode := 'block_writes'); +SELECT * FROM public.table_placements_per_node; + + +-- Drop some tables for clear consistent error +DROP TABLE test_schema_support.colocated_rebalance_test2; + +-- Leave no trace on workers +RESET search_path; + +\set VERBOSITY terse +DROP SCHEMA test_schema_support CASCADE; +\set VERBOSITY default + +REVOKE ALL ON SCHEMA public FROM testrole; +DROP USER testrole; + +-- Test costs +set citus.shard_count = 4; +CREATE TABLE tab (x int); +SELECT create_distributed_table('tab','x'); +-- The following numbers are chosen such that they are placed on different +-- shards. 
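+-- (One way to double-check that, shown only as an illustration and kept
+-- commented out so the expected test output stays unchanged:
+--   SELECT x, get_shard_id_for_distribution_column('tab', x)
+--   FROM (VALUES (1), (2), (3), (6)) AS v(x);
+-- each of the four values should map to a different shard id.)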
+INSERT INTO tab SELECT 1 from generate_series(1, 30000); +INSERT INTO tab SELECT 2 from generate_series(1, 10000); +INSERT INTO tab SELECT 3 from generate_series(1, 10000); +INSERT INTO tab SELECT 6 from generate_series(1, 10000); +ANALYZE tab; + +\c - - - :worker_1_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; +\c - - - :worker_2_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + +\c - - - :master_port + +SELECT * FROM get_rebalance_table_shards_plan('tab'); +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'by_disk_size'); +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'by_disk_size', threshold := 0); + +SELECT * FROM rebalance_table_shards('tab', shard_transfer_mode:='block_writes'); +SELECT * FROM public.table_placements_per_node; + +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'by_disk_size', shard_transfer_mode:='block_writes'); +SELECT * FROM public.table_placements_per_node; + +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'by_disk_size', shard_transfer_mode:='block_writes', threshold := 0); +SELECT * FROM public.table_placements_per_node; + +-- Check that sizes of colocated tables are added together for rebalances +set citus.shard_count = 4; +SET citus.next_shard_id TO 123050; +CREATE TABLE tab2 (x int); +SELECT create_distributed_table('tab2','x', colocate_with := 'tab'); +INSERT INTO tab2 SELECT 1 from generate_series(1, 0); +INSERT INTO tab2 SELECT 2 from generate_series(1, 60000); +INSERT INTO tab2 SELECT 3 from generate_series(1, 10000); +INSERT INTO tab2 SELECT 6 from generate_series(1, 10000); +ANALYZE tab, tab2; + +\c - - - :worker_1_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; +\c - - - :worker_2_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS 
total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; + +\c - - - :master_port +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'by_disk_size'); +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'by_disk_size', shard_transfer_mode:='block_writes'); +SELECT * FROM public.table_placements_per_node; +ANALYZE tab, tab2; + +\c - - - :worker_1_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; +\c - - - :worker_2_port +SELECT table_schema, table_name, row_estimate, total_bytes + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +WHERE table_schema = 'public' +) a ORDER BY table_name; +\c - - - :master_port + +DROP TABLE tab2; + +CREATE OR REPLACE FUNCTION capacity_high_worker_1(nodeidarg int) + RETURNS real AS $$ + SELECT + (CASE WHEN nodeport = 57637 THEN 1000 ELSE 1 END)::real + FROM pg_dist_node where nodeid = nodeidarg + $$ LANGUAGE sql; + +SELECT citus_add_rebalance_strategy( + 'capacity_high_worker_1', + 'citus_shard_cost_1', + 'capacity_high_worker_1', + 'citus_shard_allowed_on_node_true', + 0 + ); + +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 'capacity_high_worker_1'); +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'capacity_high_worker_1', shard_transfer_mode:='block_writes'); +SELECT * FROM public.table_placements_per_node; + +SELECT citus_set_default_rebalance_strategy('capacity_high_worker_1'); +SELECT * FROM get_rebalance_table_shards_plan('tab'); +SELECT * FROM rebalance_table_shards('tab', shard_transfer_mode:='block_writes'); +SELECT * FROM public.table_placements_per_node; + +CREATE FUNCTION only_worker_2(shardid bigint, nodeidarg int) + RETURNS boolean AS $$ + SELECT + (CASE WHEN nodeport = 57638 THEN TRUE ELSE FALSE END) + FROM pg_dist_node where nodeid = nodeidarg + $$ LANGUAGE sql; + +SELECT citus_add_rebalance_strategy( + 'only_worker_2', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'only_worker_2', + 0 + ); + +SELECT citus_set_default_rebalance_strategy('only_worker_2'); +SELECT * FROM get_rebalance_table_shards_plan('tab'); +SELECT * FROM rebalance_table_shards('tab', shard_transfer_mode:='block_writes'); +SELECT * FROM public.table_placements_per_node; + +SELECT citus_set_default_rebalance_strategy('by_shard_count'); +SELECT * FROM get_rebalance_table_shards_plan('tab'); + +-- Check all the error handling cases +SELECT * FROM get_rebalance_table_shards_plan('tab', rebalance_strategy := 
'non_existing'); +SELECT * FROM rebalance_table_shards('tab', rebalance_strategy := 'non_existing'); +SELECT * FROM master_drain_node('localhost', :worker_2_port, rebalance_strategy := 'non_existing'); +SELECT citus_set_default_rebalance_strategy('non_existing'); + + +UPDATE pg_dist_rebalance_strategy SET default_strategy=false; +SELECT * FROM get_rebalance_table_shards_plan('tab'); +SELECT * FROM rebalance_table_shards('tab'); +SELECT * FROM master_drain_node('localhost', :worker_2_port); +UPDATE pg_dist_rebalance_strategy SET default_strategy=true WHERE name='by_shard_count'; + +CREATE OR REPLACE FUNCTION shard_cost_no_arguments() + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION shard_cost_bad_arg_type(text) + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION shard_cost_bad_return_type(bigint) + RETURNS int AS $$ SELECT 1 $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION node_capacity_no_arguments() + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION node_capacity_bad_arg_type(text) + RETURNS real AS $$ SELECT 1.0::real $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION node_capacity_bad_return_type(int) + RETURNS int AS $$ SELECT 1 $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION shard_allowed_on_node_no_arguments() + RETURNS boolean AS $$ SELECT true $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION shard_allowed_on_node_bad_arg1(text, int) + RETURNS boolean AS $$ SELECT true $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION shard_allowed_on_node_bad_arg2(bigint, text) + RETURNS boolean AS $$ SELECT true $$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION shard_allowed_on_node_bad_return_type(bigint, int) + RETURNS int AS $$ SELECT 1 $$ LANGUAGE sql; + +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'shard_cost_no_arguments', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'shard_cost_bad_arg_type', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'shard_cost_bad_return_type', + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 0, + 'citus_node_capacity_1', + 'citus_shard_allowed_on_node_true', + 0 + ); + +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'node_capacity_no_arguments', + 'citus_shard_allowed_on_node_true', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'node_capacity_bad_arg_type', + 'citus_shard_allowed_on_node_true', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'node_capacity_bad_return_type', + 'citus_shard_allowed_on_node_true', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 0, + 'citus_shard_allowed_on_node_true', + 0 + ); + +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_no_arguments', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_bad_arg1', + 0 + ); +SELECT citus_add_rebalance_strategy( + 'insert_should_fail', + 'citus_shard_cost_1', + 'citus_node_capacity_1', + 'shard_allowed_on_node_bad_arg2', + 0 + ); +SELECT 
citus_add_rebalance_strategy(
+    'insert_should_fail',
+    'citus_shard_cost_1',
+    'citus_node_capacity_1',
+    'shard_allowed_on_node_bad_return_type',
+    0
+    );
+SELECT citus_add_rebalance_strategy(
+    'insert_should_fail',
+    'citus_shard_cost_1',
+    'citus_node_capacity_1',
+    0,
+    0
+    );
+
+
+-- Confirm that manual insert/update has the same checks
+INSERT INTO
+    pg_catalog.pg_dist_rebalance_strategy(
+        name,
+        shard_cost_function,
+        node_capacity_function,
+        shard_allowed_on_node_function,
+        default_threshold
+    ) VALUES (
+        'shard_cost_no_arguments',
+        'shard_cost_no_arguments',
+        'citus_node_capacity_1',
+        'citus_shard_allowed_on_node_true',
+        0
+    );
+UPDATE pg_dist_rebalance_strategy SET shard_cost_function='shard_cost_no_arguments' WHERE name='by_disk_size';
+
+-- Confirm that only a single default strategy can exist
+INSERT INTO
+    pg_catalog.pg_dist_rebalance_strategy(
+        name,
+        default_strategy,
+        shard_cost_function,
+        node_capacity_function,
+        shard_allowed_on_node_function,
+        default_threshold
+    ) VALUES (
+        'second_default',
+        true,
+        'citus_shard_cost_1',
+        'citus_node_capacity_1',
+        'citus_shard_allowed_on_node_true',
+        0
+    );
+UPDATE pg_dist_rebalance_strategy SET default_strategy=true WHERE name='by_disk_size';
+-- ensure the trigger allows updating the default strategy
+UPDATE pg_dist_rebalance_strategy SET default_strategy=true WHERE name='by_shard_count';
+
+-- Confirm that the default threshold cannot be lower than the minimum threshold
+SELECT citus_add_rebalance_strategy(
+    'default_threshold_too_low',
+    'citus_shard_cost_1',
+    'capacity_high_worker_1',
+    'citus_shard_allowed_on_node_true',
+    0,
+    0.1
+    );
+
+-- Make it a data node again
+SELECT * from master_set_node_property('localhost', :worker_2_port, 'shouldhaveshards', true);
+DROP TABLE tab;
+
+
+-- we don't need the coordinator on pg_dist_node anymore
+SELECT 1 FROM master_remove_node('localhost', :master_port);
+
+
+--
+-- Make sure that rebalance_table_shards() and replicate_table_shards() replicate
+-- reference tables to the coordinator when replicate_reference_tables_on_activate
+-- is off.
+
+--
+
+SET citus.replicate_reference_tables_on_activate TO off;
+SET client_min_messages TO WARNING;
+
+CREATE TABLE dist_table_test_3(a int);
+SET citus.shard_count TO 4;
+
+SET citus.shard_replication_factor TO 1;
+SET citus.replication_model TO "statement";
+SELECT create_distributed_table('dist_table_test_3', 'a');
+
+CREATE TABLE ref_table(a int);
+SELECT create_reference_table('ref_table');
+
+SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0);
+
+SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass;
+
+SET citus.shard_replication_factor TO 2;
+SELECT replicate_table_shards('dist_table_test_3', max_shard_copies := 4, shard_transfer_mode:='block_writes');
+
+SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass;
+
+SELECT 1 FROM master_remove_node('localhost', :master_port);
+
+CREATE TABLE rebalance_test_table(int_column int);
+SELECT master_create_distributed_table('rebalance_test_table', 'int_column', 'append');
+
+CALL create_unbalanced_shards('rebalance_test_table');
+
+SELECT 1 FROM master_add_node('localhost', :master_port, groupId=>0);
+
+SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass;
+
+SELECT rebalance_table_shards('rebalance_test_table', shard_transfer_mode:='block_writes');
+
+SELECT count(*) FROM pg_dist_shard NATURAL JOIN pg_dist_shard_placement WHERE logicalrelid = 'ref_table'::regclass;
+
+DROP TABLE dist_table_test_3, rebalance_test_table, ref_table;
+
+SELECT 1 FROM master_remove_node('localhost', :master_port);
+
+-- reference table r2 will not have a replica identity, causing the rebalancer to not work
+-- when run in the default mode. Instead we need to change the shard transfer mode to make
+-- it work. This verifies that the shard transfer mode passed to the rebalancer is also used
+-- when ensuring that reference tables exist on all nodes.
+
+CREATE TABLE t1 (a int PRIMARY KEY, b int);
+CREATE TABLE r1 (a int PRIMARY KEY, b int);
+CREATE TABLE r2 (a int, b int);
+
+-- we remove worker 2 before creating the tables; this allows us to have an active
+-- node without the reference tables
+
+SELECT 1 from master_remove_node('localhost', :worker_2_port);
+
+SELECT create_distributed_table('t1','a');
+SELECT create_reference_table('r1');
+SELECT create_reference_table('r2');
+
+-- add data so that we actually copy data when forcing logical replication for reference tables
+INSERT INTO r1 VALUES (1,2), (3,4);
+INSERT INTO r2 VALUES (1,2), (3,4);
+
+SELECT 1 from master_add_node('localhost', :worker_2_port);
+
+SELECT rebalance_table_shards();
+
+DROP TABLE t1, r1, r2;
+
+-- verify there are no distributed tables before we perform the following tests. Preceding
+-- test suites should clean up their distributed tables.
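+-- (pg_dist_partition has one row per distributed or reference table, so a count
+-- of 0 below means earlier test suites left no tables behind.)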
+SELECT count(*) FROM pg_dist_partition; + +-- verify a system having only reference tables will copy the reference tables when +-- executing the rebalancer + +SELECT 1 from master_remove_node('localhost', :worker_2_port); + +CREATE TABLE r1 (a int PRIMARY KEY, b int); +SELECT create_reference_table('r1'); + +SELECT 1 from master_add_node('localhost', :worker_2_port); + +-- count the number of placements for the reference table to verify it is not available on +-- all nodes +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + +-- rebalance with _only_ a reference table, this should trigger the copy +SELECT rebalance_table_shards(); + +-- verify the reference table is on all nodes after the rebalance +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + +-- cleanup tables +DROP TABLE r1; + + +-- lastly we need to verify that reference tables are copied before the replication factor +-- of other tables is increased. Without the copy of reference tables the replication might +-- fail. + +SELECT 1 from master_remove_node('localhost', :worker_2_port); + +CREATE TABLE t1 (a int PRIMARY KEY, b int); +CREATE TABLE r1 (a int PRIMARY KEY, b int); +SELECT create_distributed_table('t1', 'a'); +SELECT create_reference_table('r1'); + +SELECT 1 from master_add_node('localhost', :worker_2_port); + +-- count the number of placements for the reference table to verify it is not available on +-- all nodes +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + +SELECT replicate_table_shards('t1', shard_replication_factor := 2); + +-- verify the reference table is on all nodes after replicate_table_shards +SELECT count(*) +FROM pg_dist_shard +JOIN pg_dist_shard_placement USING (shardid) +WHERE logicalrelid = 'r1'::regclass; + +DROP TABLE t1, r1; diff --git a/src/test/regress/sql/shard_rebalancer_unit.sql b/src/test/regress/sql/shard_rebalancer_unit.sql new file mode 100644 index 000000000..d6159cbd2 --- /dev/null +++ b/src/test/regress/sql/shard_rebalancer_unit.sql @@ -0,0 +1,383 @@ +CREATE OR REPLACE FUNCTION shard_placement_rebalance_array( + worker_node_list json[], + shard_placement_list json[], + threshold float4 DEFAULT 0, + max_shard_moves int DEFAULT 1000000, + drain_only bool DEFAULT false +) +RETURNS json[] +AS 'citus' +LANGUAGE C STRICT VOLATILE; + +-- Check that even with threshold=0.0 shard_placement_rebalance_array returns +-- something when there's no completely balanced solution. 
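+-- (With 3 shards and 2 nodes a perfect split is impossible; the best achievable
+-- outcome is presumably a 2/1 split, so a single move away from hostname1 is
+-- still expected despite threshold=0.0.)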
+ + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[] +)); + +-- Check that a node can be drained in a balanced cluster + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4"}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + +-- Check that an already drained node won't be filled again after a second +-- rebalance + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4"}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + + +-- Check that even when shards are already balanced, but shard 4 is on a node +-- where it is not allowed it will be moved and there will be rebalancing later + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,5,6"}', + '{"node_name": "hostname2", "node_port": 5432, "disallowed_shards": "4"}', + '{"node_name": "hostname3", "node_port": 5432, "disallowed_shards": "4"}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}' + ]::json[] +)); + +-- Check that even when shards are already balanced, disallowed shards will be +-- moved away from hostname1 and the only shard that is allowed there will be +-- moved there + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,5,6"}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", 
"node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}' + ]::json[] +)); + +-- Check that an error is returned when a shard is not allowed anywhere + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "2,4"}', + '{"node_name": "hostname2", "node_port": 5432, "disallowed_shards": "1,4"}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + +-- Check that cost is taken into account when rebalancing + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 3}']::json[] +)); + + +-- Check that cost is taken into account when rebalancing disallowed placements + +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4"}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 3}']::json[] +)); + + +-- Check that node capacacity is taken into account. 
+
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[]
+));
+
+-- Check that shards are not moved when a move would keep the maximum
+-- utilization the same but lower the minimum utilization. hostname1 has
+-- utilization of 1 and hostname2 has utilization of 2/3 now; after a move
+-- hostname2 would have utilization of 1 while hostname1 would drop to 0.
+-- Since load is spread more fairly with utilization 2/3 than with 0, the
+-- current distribution should be kept.
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[]
+));
+
+
+-- Check that shards are moved when a move keeps the maximum utilization the
+-- same but improves the minimum utilization. hostname2 has utilization of 1
+-- and hostname1 has utilization of 0 now; after a move hostname1 would have
+-- utilization of 1 while hostname2 would drop to 2/3. Since load is spread
+-- more fairly with utilization 2/3 than with 0, the new distribution should
+-- be chosen.
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[]
+));
+
+-- Check that shards are moved when a move keeps the maximum utilization the
+-- same but improves the minimum utilization. hostname2 has utilization of 2
+-- and hostname1 has utilization of 1 now; after a move hostname1 would have
+-- utilization of 2 while hostname2 would drop to 1.5. Since load is spread
+-- more fairly with utilization 1.5 than with 1, the new distribution should
+-- be chosen.
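+-- (The utilizations above follow from the capacities in the placements below:
+-- hostname1 holds 1 shard at capacity 1 (1/1 = 1) and hostname2 holds 4 shards
+-- at capacity 2 (4/2 = 2); after one move they become 2/1 = 2 and 3/2 = 1.5.)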
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 2}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[]
+));
+
+-- Check that shards are not moved when a move would keep the maximum
+-- utilization the same but lower the minimum utilization. hostname1 has
+-- utilization of 2 and hostname2 has utilization of 1.5 now; after a move
+-- hostname2 would have utilization of 2 while hostname1 would drop to 1.
+-- Since load is spread more fairly with utilization 1.5 than with 1, the
+-- current distribution should be kept.
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 2}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[]
+));
+
+
+-- Check that all shards will be moved to 1 node if its capacity is big enough
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 4}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}']::json[]
+));
+
+-- Check that shards will be moved to a smaller node if utilization improves
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[],
+    ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}',
+          '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}']::json[]
+));
+
+-- Check that node capacity works with different shard costs
+SELECT unnest(shard_placement_rebalance_array(
+    ARRAY['{"node_name": "hostname1", "node_port": 5432}',
+          '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[],
+    ARRAY['{"placementid":1, 
"shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432, "cost": 3}']::json[] +)); + +-- Check that node capacity works with different shard costs again +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 2}']::json[] +)); + +-- Check that max_shard_moves works and that we get a NOTICE that it is hit +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 3}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "cost": 2}']::json[], + max_shard_moves := 1 +)); + + +-- Check that node capacity works with different shard costs and disallowed_shards +-- NOTE: these moves are not optimal, once we implement merging of updates this +-- output should change. +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432}', + '{"node_name": "hostname2", "node_port": 5432, "capacity": 5}', + '{"node_name": "hostname3", "node_port": 5432, "disallowed_shards": "1,2"}']::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname3", "nodeport":5432, "cost": 2}']::json[] +)); + +-- Check that draining + rebalancing nodes works +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + + +-- Check that draining nodes with drain only works +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, 
"shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + drain_only := true +)); + +-- Check that draining nodes has priority over max_shard_moves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 0 +)); + +-- Check that drained moves are counted towards shard moves and thus use up the +-- limit when doing normal rebalance moves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 2 +)); + +-- Check that draining for all colocation groups is done before rebalancing +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6,7,8,9,10,11,12", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, 
"shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":7, "shardid":7, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "next_colocation": true}', + '{"placementid":8, "shardid":8, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":9, "shardid":9, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":10, "shardid":10, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":11, "shardid":11, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":12, "shardid":12, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[] +)); + +-- Check that max_shard_moves warning is only shown once even if more than one +-- colocation group its placement updates are ignored because of it +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6,7,8,9,10,11,12", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":7, "shardid":7, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "next_colocation": true}', + '{"placementid":8, "shardid":8, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":9, "shardid":9, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":10, "shardid":10, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":11, "shardid":11, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":12, "shardid":12, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 1 +)); + +-- Check that moves for different colocation groups are added together when +-- taking into account max_shard_moves +SELECT unnest(shard_placement_rebalance_array( + ARRAY['{"node_name": "hostname1", "node_port": 5432, "disallowed_shards": "1,2,3,4,5,6,7,8,9,10,11,12", "capacity": 0}', + '{"node_name": "hostname2", "node_port": 5432}', + '{"node_name": "hostname3", "node_port": 5432}' + ]::json[], + ARRAY['{"placementid":1, "shardid":1, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432}', + '{"placementid":2, "shardid":2, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":3, "shardid":3, "shardstate":1, "shardlength":1, 
"nodename":"hostname2", "nodeport":5432}', + '{"placementid":4, "shardid":4, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":5, "shardid":5, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":6, "shardid":6, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":7, "shardid":7, "shardstate":1, "shardlength":1, "nodename":"hostname1", "nodeport":5432, "next_colocation": true}', + '{"placementid":8, "shardid":8, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":9, "shardid":9, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":10, "shardid":10, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":11, "shardid":11, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}', + '{"placementid":12, "shardid":12, "shardstate":1, "shardlength":1, "nodename":"hostname2", "nodeport":5432}' + ]::json[], + max_shard_moves := 5 +));