mirror of https://github.com/citusdata/citus.git
Snapshot-Based Node Split – Foundation and Core Implementation (#8122)
**DESCRIPTION:**
This pull request introduces the foundation and core logic for the
snapshot-based node split feature in Citus. This feature enables
promoting a streaming replica (referred to as a clone in this feature
and UI) to a primary node and rebalancing shards between the original
and the newly promoted node without requiring a full data copy.
This significantly reduces rebalance times for scale-out operations
where the new node already contains a full copy of the data via
streaming replication.
Key Highlights:
**1. Replica (Clone) Registration & Management Infrastructure**
Introduces a new set of UDFs to register and manage clone nodes:
- citus_add_clone_node()
- citus_add_clone_node_with_nodeid()
- citus_remove_clone_node()
- citus_remove_clone_node_with_nodeid()
These functions allow administrators to register a streaming replica of
an existing worker node as a clone, making it eligible for later
promotion via snapshot-based split.
**2. Snapshot-Based Node Split (Core Implementation)**
New core UDF:
- citus_promote_clone_and_rebalance()
This function implements the full workflow to promote a clone and
rebalance shards between the old and new primaries. Steps include:
1. Ensuring Exclusivity – Blocks any concurrent placement-changing
operations.
2. Blocking Writes – Temporarily blocks writes on the primary to ensure
consistency.
3. Replica Catch-up – Waits for the replica to fully catch up with the
primary's WAL position.
4. Promotion – Promotes the replica to a primary using pg_promote() (see the
sketch after this list).
5. Metadata Update – Updates metadata to reflect the newly promoted
primary node.
6. Shard Rebalancing – Redistributes shards between the old and new
primary nodes.
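Steps 3–4 correspond to standard PostgreSQL replication checks and promotion. A rough, illustrative sketch of what the UDF automates internally (not something that needs to be run manually):
```
-- on the original primary: replication lag of the clone, in bytes
SELECT application_name,
       pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lag_bytes
FROM pg_stat_replication;

-- on the clone, once the lag is (close to) zero: promote it
SELECT pg_promote(wait := true);
```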
**3. Split Plan Preview**
A new helper UDF, get_snapshot_based_node_split_plan(), previews the post-split
shard distribution without executing the promotion. It takes the host and port
of the primary and of the clone, as in the example below.
**Example:**
```
select * from pg_catalog.get_snapshot_based_node_split_plan('127.0.0.1',5433,'127.0.0.1',5453);
table_name | shardid | shard_size | placement_node
--------------+---------+------------+----------------
companies | 102008 | 0 | Primary Node
campaigns | 102010 | 0 | Primary Node
ads | 102012 | 0 | Primary Node
mscompanies | 102014 | 0 | Primary Node
mscampaigns | 102016 | 0 | Primary Node
msads | 102018 | 0 | Primary Node
mscompanies2 | 102020 | 0 | Primary Node
mscampaigns2 | 102022 | 0 | Primary Node
msads2 | 102024 | 0 | Primary Node
companies | 102009 | 0 | Clone Node
campaigns | 102011 | 0 | Clone Node
ads | 102013 | 0 | Clone Node
mscompanies | 102015 | 0 | Clone Node
mscampaigns | 102017 | 0 | Clone Node
msads | 102019 | 0 | Clone Node
mscompanies2 | 102021 | 0 | Clone Node
mscampaigns2 | 102023 | 0 | Clone Node
msads2 | 102025 | 0 | Clone Node
(18 rows)
```
**4. Test Infrastructure Enhancements**
- Added a new test case scheduler for snapshot-based split scenarios.
- Enhanced pg_regress_multi.pl to support creating node backups with
slightly modified options to simulate real-world backup-based clone
creation.
**5. Usage Guide**
The snapshot-based node split can be performed using the following
workflow:
**- Take a Backup of the Worker Node**
Run pg_basebackup (or an equivalent tool) against the existing worker
node to create a physical backup.
`pg_basebackup -h <primary_worker_host> -p <port> -D /path/to/replica/data --write-recovery-conf`
**- Start the Replica Node**
Start PostgreSQL on the replica using the backup data directory,
ensuring it is configured as a streaming replica of the original worker
node.
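Before registering it, it is worth confirming that the replica is actually streaming; these are plain PostgreSQL checks, independent of Citus:
```
-- on the replica: should be in recovery and have an active WAL receiver
SELECT pg_is_in_recovery();                                   -- expected: t
SELECT status, sender_host, sender_port FROM pg_stat_wal_receiver;

-- on the original worker: the replica should show up as a streaming standby
SELECT application_name, state, sync_state FROM pg_stat_replication;
```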
**- Register the Backup Node as a Clone**
Register the replica as a clone of its original worker node:
`SELECT * FROM citus_add_clone_node('<clone_host>', <clone_port>, '<primary_host>', <primary_port>);`
**- Promote and Rebalance the Clone**
Promote the clone to a primary and rebalance shards between it and the
original worker, passing the clone node id returned by citus_add_clone_node():
`SELECT * FROM citus_promote_clone_and_rebalance(<clone_node_id>);`
**- Drop Any Replication Slots from the Original Worker**
After promotion, clean up any unused replication slots from the original
worker:
`SELECT pg_drop_replication_slot('<slot_name>');`
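To see which slots are left over, the standard pg_replication_slots view on the original worker lists them:
```
SELECT slot_name, slot_type, active FROM pg_replication_slots;
```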
pull/8136/head
parent f743b35fc2
commit be6668e440
@ -153,6 +153,7 @@ jobs:
|
|||
- check-isolation
|
||||
- check-operations
|
||||
- check-follower-cluster
|
||||
- check-add-backup-node
|
||||
- check-columnar
|
||||
- check-columnar-isolation
|
||||
- check-enterprise
|
||||
|
|
@ -494,10 +495,14 @@ jobs:
|
|||
tests=${detected_changes}
|
||||
|
||||
# split the tests to be skipped --today we only skip upgrade tests
|
||||
# and snapshot based node addition tests.
|
||||
# snapshot based node addition tests are not flaky, as they promote
|
||||
# the streaming replica (clone) to a PostgreSQL primary node that is one way
|
||||
# operation
|
||||
skipped_tests=""
|
||||
not_skipped_tests=""
|
||||
for test in $tests; do
|
||||
if [[ $test =~ ^src/test/regress/sql/upgrade_ ]]; then
|
||||
if [[ $test =~ ^src/test/regress/sql/upgrade_ ]] || [[ $test =~ ^src/test/regress/sql/multi_add_node_from_backup ]]; then
|
||||
skipped_tests="$skipped_tests $test"
|
||||
else
|
||||
not_skipped_tests="$not_skipped_tests $test"
|
||||
|
|
|
|||
|
|
@ -221,6 +221,7 @@ typedef struct MetadataCacheData
|
|||
Oid textCopyFormatId;
|
||||
Oid primaryNodeRoleId;
|
||||
Oid secondaryNodeRoleId;
|
||||
Oid unavailableNodeRoleId;
|
||||
Oid pgTableIsVisibleFuncId;
|
||||
Oid citusTableIsVisibleFuncId;
|
||||
Oid distAuthinfoRelationId;
|
||||
|
|
@ -320,8 +321,9 @@ static void CachedRelationNamespaceLookup(const char *relationName, Oid relnames
|
|||
static void CachedRelationNamespaceLookupExtended(const char *relationName,
|
||||
Oid renamespace, Oid *cachedOid,
|
||||
bool missing_ok);
|
||||
static ShardPlacement * ResolveGroupShardPlacement(
|
||||
GroupShardPlacement *groupShardPlacement, CitusTableCacheEntry *tableEntry,
|
||||
static ShardPlacement * ResolveGroupShardPlacement(GroupShardPlacement *
|
||||
groupShardPlacement,
|
||||
CitusTableCacheEntry *tableEntry,
|
||||
int shardIndex);
|
||||
static Oid LookupEnumValueId(Oid typeId, char *valueName);
|
||||
static void InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot);
|
||||
|
|
@ -3600,6 +3602,20 @@ SecondaryNodeRoleId(void)
|
|||
}
|
||||
|
||||
|
||||
/* return the Oid of the 'unavailable' nodeRole enum value */
|
||||
Oid
|
||||
UnavailableNodeRoleId(void)
|
||||
{
|
||||
if (!MetadataCache.unavailableNodeRoleId)
|
||||
{
|
||||
MetadataCache.unavailableNodeRoleId = LookupStringEnumValueId("noderole",
|
||||
"unavailable");
|
||||
}
|
||||
|
||||
return MetadataCache.unavailableNodeRoleId;
|
||||
}
|
||||
|
||||
|
||||
Oid
|
||||
CitusJobStatusScheduledId(void)
|
||||
{
|
||||
|
|
@ -4417,6 +4433,8 @@ InitializeWorkerNodeCache(void)
|
|||
workerNode->isActive = currentNode->isActive;
|
||||
workerNode->nodeRole = currentNode->nodeRole;
|
||||
workerNode->shouldHaveShards = currentNode->shouldHaveShards;
|
||||
workerNode->nodeprimarynodeid = currentNode->nodeprimarynodeid;
|
||||
workerNode->nodeisclone = currentNode->nodeisclone;
|
||||
strlcpy(workerNode->nodeCluster, currentNode->nodeCluster, NAMEDATALEN);
|
||||
|
||||
newWorkerNodeArray[workerNodeIndex++] = workerNode;
|
||||
|
|
|
|||
|
|
@ -819,7 +819,7 @@ NodeListInsertCommand(List *workerNodeList)
|
|||
appendStringInfo(nodeListInsertCommand,
|
||||
"INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, "
|
||||
"noderack, hasmetadata, metadatasynced, isactive, noderole, "
|
||||
"nodecluster, shouldhaveshards) VALUES ");
|
||||
"nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES ");
|
||||
|
||||
/* iterate over the worker nodes, add the values */
|
||||
WorkerNode *workerNode = NULL;
|
||||
|
|
@ -829,13 +829,14 @@ NodeListInsertCommand(List *workerNodeList)
|
|||
char *metadataSyncedString = workerNode->metadataSynced ? "TRUE" : "FALSE";
|
||||
char *isActiveString = workerNode->isActive ? "TRUE" : "FALSE";
|
||||
char *shouldHaveShards = workerNode->shouldHaveShards ? "TRUE" : "FALSE";
|
||||
char *nodeiscloneString = workerNode->nodeisclone ? "TRUE" : "FALSE";
|
||||
|
||||
Datum nodeRoleOidDatum = ObjectIdGetDatum(workerNode->nodeRole);
|
||||
Datum nodeRoleStringDatum = DirectFunctionCall1(enum_out, nodeRoleOidDatum);
|
||||
char *nodeRoleString = DatumGetCString(nodeRoleStringDatum);
|
||||
|
||||
appendStringInfo(nodeListInsertCommand,
|
||||
"(%d, %d, %s, %d, %s, %s, %s, %s, '%s'::noderole, %s, %s)",
|
||||
"(%d, %d, %s, %d, %s, %s, %s, %s, '%s'::noderole, %s, %s, %s, %d)",
|
||||
workerNode->nodeId,
|
||||
workerNode->groupId,
|
||||
quote_literal_cstr(workerNode->workerName),
|
||||
|
|
@ -846,7 +847,9 @@ NodeListInsertCommand(List *workerNodeList)
|
|||
isActiveString,
|
||||
nodeRoleString,
|
||||
quote_literal_cstr(workerNode->nodeCluster),
|
||||
shouldHaveShards);
|
||||
shouldHaveShards,
|
||||
nodeiscloneString,
|
||||
workerNode->nodeprimarynodeid);
|
||||
|
||||
processedWorkerNodeCount++;
|
||||
if (processedWorkerNodeCount != workerCount)
|
||||
|
|
@ -880,9 +883,11 @@ NodeListIdempotentInsertCommand(List *workerNodeList)
|
|||
"hasmetadata = EXCLUDED.hasmetadata, "
|
||||
"isactive = EXCLUDED.isactive, "
|
||||
"noderole = EXCLUDED.noderole, "
|
||||
"nodecluster = EXCLUDED.nodecluster ,"
|
||||
"nodecluster = EXCLUDED.nodecluster, "
|
||||
"metadatasynced = EXCLUDED.metadatasynced, "
|
||||
"shouldhaveshards = EXCLUDED.shouldhaveshards";
|
||||
"shouldhaveshards = EXCLUDED.shouldhaveshards, "
|
||||
"nodeisclone = EXCLUDED.nodeisclone, "
|
||||
"nodeprimarynodeid = EXCLUDED.nodeprimarynodeid";
|
||||
appendStringInfoString(nodeInsertIdempotentCommand, onConflictStr);
|
||||
return nodeInsertIdempotentCommand->data;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@
|
|||
|
||||
#include "distributed/citus_acquire_lock.h"
|
||||
#include "distributed/citus_safe_lib.h"
|
||||
#include "distributed/clonenode_utils.h"
|
||||
#include "distributed/colocation_utils.h"
|
||||
#include "distributed/commands.h"
|
||||
#include "distributed/commands/utility_hook.h"
|
||||
|
|
@ -84,6 +85,8 @@ typedef struct NodeMetadata
|
|||
bool isActive;
|
||||
Oid nodeRole;
|
||||
bool shouldHaveShards;
|
||||
uint32 nodeprimarynodeid;
|
||||
bool nodeisclone;
|
||||
char *nodeCluster;
|
||||
} NodeMetadata;
|
||||
|
||||
|
|
@ -106,7 +109,8 @@ static void InsertNodeRow(int nodeid, char *nodename, int32 nodeport,
|
|||
NodeMetadata *nodeMetadata);
|
||||
static void DeleteNodeRow(char *nodename, int32 nodeport);
|
||||
static void BlockDistributedQueriesOnMetadataNodes(void);
|
||||
static WorkerNode * TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple);
|
||||
static WorkerNode * TupleToWorkerNode(Relation pgDistNode, TupleDesc tupleDescriptor,
|
||||
HeapTuple heapTuple);
|
||||
static bool NodeIsLocal(WorkerNode *worker);
|
||||
static void SetLockTimeoutLocally(int32 lock_cooldown);
|
||||
static void UpdateNodeLocation(int32 nodeId, char *newNodeName, int32 newNodePort,
|
||||
|
|
@ -120,11 +124,10 @@ static char * NodeMetadataSyncedUpdateCommand(uint32 nodeId, bool metadataSynced
|
|||
static void ErrorIfCoordinatorMetadataSetFalse(WorkerNode *workerNode, Datum value,
|
||||
char *field);
|
||||
static WorkerNode * SetShouldHaveShards(WorkerNode *workerNode, bool shouldHaveShards);
|
||||
static WorkerNode * FindNodeAnyClusterByNodeId(uint32 nodeId);
|
||||
static void ErrorIfAnyNodeNotExist(List *nodeList);
|
||||
static void UpdateLocalGroupIdsViaMetadataContext(MetadataSyncContext *context);
|
||||
static void SendDeletionCommandsForReplicatedTablePlacements(
|
||||
MetadataSyncContext *context);
|
||||
static void SendDeletionCommandsForReplicatedTablePlacements(MetadataSyncContext *context)
|
||||
;
|
||||
static void SyncNodeMetadata(MetadataSyncContext *context);
|
||||
static void SetNodeStateViaMetadataContext(MetadataSyncContext *context,
|
||||
WorkerNode *workerNode,
|
||||
|
|
@ -134,12 +137,15 @@ static void MarkNodesNotSyncedInLoopBackConnection(MetadataSyncContext *context,
|
|||
static void EnsureParentSessionHasExclusiveLockOnPgDistNode(pid_t parentSessionPid);
|
||||
static void SetNodeMetadata(MetadataSyncContext *context, bool localOnly);
|
||||
static void EnsureTransactionalMetadataSyncMode(void);
|
||||
static void LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE
|
||||
lockMode);
|
||||
static BackgroundWorkerHandle * CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown);
|
||||
static BackgroundWorkerHandle * LockPlacementsWithBackgroundWorkersInPrimaryNode(
|
||||
WorkerNode *workerNode, bool force, int32 lock_cooldown);
|
||||
|
||||
|
||||
static int32 CitusAddCloneNode(WorkerNode *primaryWorkerNode,
|
||||
char *cloneHostname, int32 clonePort);
|
||||
static void RemoveCloneNode(WorkerNode *cloneNode);
|
||||
|
||||
/* Function definitions go here */
|
||||
|
||||
/* declarations for dynamic loading */
|
||||
|
|
@ -168,6 +174,10 @@ PG_FUNCTION_INFO_V1(citus_coordinator_nodeid);
|
|||
PG_FUNCTION_INFO_V1(citus_is_coordinator);
|
||||
PG_FUNCTION_INFO_V1(citus_internal_mark_node_not_synced);
|
||||
PG_FUNCTION_INFO_V1(citus_is_primary_node);
|
||||
PG_FUNCTION_INFO_V1(citus_add_clone_node);
|
||||
PG_FUNCTION_INFO_V1(citus_add_clone_node_with_nodeid);
|
||||
PG_FUNCTION_INFO_V1(citus_remove_clone_node);
|
||||
PG_FUNCTION_INFO_V1(citus_remove_clone_node_with_nodeid);
|
||||
|
||||
/*
|
||||
* DefaultNodeMetadata creates a NodeMetadata struct with the fields set to
|
||||
|
|
@ -183,6 +193,8 @@ DefaultNodeMetadata()
|
|||
nodeMetadata.nodeRack = WORKER_DEFAULT_RACK;
|
||||
nodeMetadata.shouldHaveShards = true;
|
||||
nodeMetadata.groupId = INVALID_GROUP_ID;
|
||||
nodeMetadata.nodeisclone = false;
|
||||
nodeMetadata.nodeprimarynodeid = 0; /* 0 typically means InvalidNodeId */
|
||||
|
||||
return nodeMetadata;
|
||||
}
|
||||
|
|
@ -1177,6 +1189,33 @@ ActivateNodeList(MetadataSyncContext *context)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* ActivateCloneNodeAsPrimary sets the given worker node as primary and active
|
||||
* in the pg_dist_node catalog and make the clone node as first class citizen.
|
||||
*/
|
||||
void
|
||||
ActivateCloneNodeAsPrimary(WorkerNode *workerNode)
|
||||
{
|
||||
/*
|
||||
* Set the node as primary and active.
|
||||
*/
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_noderole,
|
||||
ObjectIdGetDatum(PrimaryNodeRoleId()));
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_isactive,
|
||||
BoolGetDatum(true));
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_nodeisclone,
|
||||
BoolGetDatum(false));
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_nodeprimarynodeid,
|
||||
Int32GetDatum(0));
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_hasmetadata,
|
||||
BoolGetDatum(true));
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_metadatasynced,
|
||||
BoolGetDatum(true));
|
||||
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_shouldhaveshards,
|
||||
BoolGetDatum(true));
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Acquires shard metadata locks on all shards residing in the given worker node
|
||||
*
|
||||
|
|
@ -1200,7 +1239,8 @@ BackgroundWorkerHandle *
|
|||
CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown)
|
||||
{
|
||||
BackgroundWorkerHandle *handle = StartLockAcquireHelperBackgroundWorker(MyProcPid,
|
||||
lock_cooldown);
|
||||
lock_cooldown)
|
||||
;
|
||||
if (!handle)
|
||||
{
|
||||
/*
|
||||
|
|
@ -1422,6 +1462,305 @@ master_update_node(PG_FUNCTION_ARGS)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* citus_add_clone_node adds a new node as a clone of an existing primary node.
|
||||
*/
|
||||
Datum
|
||||
citus_add_clone_node(PG_FUNCTION_ARGS)
|
||||
{
|
||||
CheckCitusVersion(ERROR);
|
||||
EnsureSuperUser();
|
||||
EnsureCoordinator();
|
||||
|
||||
text *cloneHostnameText = PG_GETARG_TEXT_P(0);
|
||||
int32 clonePort = PG_GETARG_INT32(1);
|
||||
text *primaryHostnameText = PG_GETARG_TEXT_P(2);
|
||||
int32 primaryPort = PG_GETARG_INT32(3);
|
||||
|
||||
char *cloneHostname = text_to_cstring(cloneHostnameText);
|
||||
char *primaryHostname = text_to_cstring(primaryHostnameText);
|
||||
|
||||
WorkerNode *primaryWorker = FindWorkerNodeAnyCluster(primaryHostname, primaryPort);
|
||||
|
||||
if (primaryWorker == NULL)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("primary node %s:%d not found in pg_dist_node",
|
||||
primaryHostname, primaryPort)));
|
||||
}
|
||||
|
||||
int32 cloneNodeId = CitusAddCloneNode(primaryWorker, cloneHostname, clonePort);
|
||||
|
||||
PG_RETURN_INT32(cloneNodeId);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* citus_add_clone_node_with_nodeid adds a new node as a clone of an existing primary node
|
||||
* using the primary node's ID. It records the clone's hostname, port, and links it to the
|
||||
* primary node's ID.
|
||||
*
|
||||
* This function is useful when you already know the primary node's ID and want to add a clone
|
||||
* without needing to look it up by hostname and port.
|
||||
*/
|
||||
Datum
|
||||
citus_add_clone_node_with_nodeid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
CheckCitusVersion(ERROR);
|
||||
EnsureSuperUser();
|
||||
EnsureCoordinator();
|
||||
|
||||
text *cloneHostnameText = PG_GETARG_TEXT_P(0);
|
||||
int32 clonePort = PG_GETARG_INT32(1);
|
||||
int32 primaryNodeId = PG_GETARG_INT32(2);
|
||||
|
||||
char *cloneHostname = text_to_cstring(cloneHostnameText);
|
||||
|
||||
bool missingOk = false;
|
||||
WorkerNode *primaryWorkerNode = FindNodeWithNodeId(primaryNodeId, missingOk);
|
||||
|
||||
if (primaryWorkerNode == NULL)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("primary node with ID %d does not exist", primaryNodeId)));
|
||||
}
|
||||
|
||||
int32 cloneNodeId = CitusAddCloneNode(primaryWorkerNode, cloneHostname, clonePort);
|
||||
|
||||
PG_RETURN_INT32(cloneNodeId);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CitusAddCloneNode function adds a new node as a clone of an existing primary node.
|
||||
* It records the clone's hostname, port, and links it to the primary node's ID.
|
||||
* The clone is initially marked as inactive and not having shards.
|
||||
*/
|
||||
static int32
|
||||
CitusAddCloneNode(WorkerNode *primaryWorkerNode,
|
||||
char *cloneHostname, int32 clonePort)
|
||||
{
|
||||
Assert(primaryWorkerNode != NULL);
|
||||
|
||||
/* Future-proofing: Ideally, a primary node should not itself be a clone.
|
||||
* This check might be more relevant once replica promotion logic exists.
|
||||
* For now, pg_dist_node.nodeisclone defaults to false for existing nodes.
|
||||
*/
|
||||
if (primaryWorkerNode->nodeisclone)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"primary node %s:%d is itself a clone and cannot have clones",
|
||||
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||
workerPort)));
|
||||
}
|
||||
|
||||
if (!primaryWorkerNode->shouldHaveShards)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"primary node %s:%d does not have shards, node without shards cannot have clones",
|
||||
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||
workerPort)));
|
||||
}
|
||||
|
||||
WorkerNode *existingCloneNode = FindWorkerNodeAnyCluster(cloneHostname, clonePort);
|
||||
if (existingCloneNode != NULL)
|
||||
{
|
||||
/*
|
||||
* Idempotency check: If the node already exists, is it already correctly
|
||||
* registered as a clone for THIS primary?
|
||||
*/
|
||||
if (existingCloneNode->nodeisclone &&
|
||||
existingCloneNode->nodeprimarynodeid == primaryWorkerNode->nodeId)
|
||||
{
|
||||
ereport(NOTICE, (errmsg(
|
||||
"node %s:%d is already registered as a clone for primary %s:%d (nodeid %d)",
|
||||
cloneHostname, clonePort,
|
||||
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||
workerPort, primaryWorkerNode->nodeId)));
|
||||
PG_RETURN_INT32(existingCloneNode->nodeId);
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"a different node %s:%d (nodeid %d) already exists or is a clone for a different primary",
|
||||
cloneHostname, clonePort, existingCloneNode->nodeId)));
|
||||
}
|
||||
}
|
||||
EnsureValidStreamingReplica(primaryWorkerNode, cloneHostname, clonePort);
|
||||
|
||||
char *operation = "add";
|
||||
EnsureValidCloneMode(primaryWorkerNode, cloneHostname, clonePort, operation);
|
||||
|
||||
NodeMetadata nodeMetadata = DefaultNodeMetadata();
|
||||
|
||||
nodeMetadata.nodeisclone = true;
|
||||
nodeMetadata.nodeprimarynodeid = primaryWorkerNode->nodeId;
|
||||
nodeMetadata.isActive = false; /* Replicas start as inactive */
|
||||
nodeMetadata.shouldHaveShards = false; /* Replicas do not directly own primary shards */
|
||||
nodeMetadata.groupId = INVALID_GROUP_ID; /* Replicas get a new group ID and do not belong to any existing group */
|
||||
nodeMetadata.nodeRole = UnavailableNodeRoleId(); /* The node role is set to 'unavailable' */
|
||||
nodeMetadata.nodeCluster = primaryWorkerNode->nodeCluster; /* Same cluster as primary */
|
||||
|
||||
/* Other fields like hasMetadata, metadataSynced will take defaults from DefaultNodeMetadata
|
||||
* (typically true, true for hasMetadata and metadataSynced if it's a new node,
|
||||
* or might need adjustment based on replica strategy)
|
||||
* For now, let's assume DefaultNodeMetadata provides suitable defaults for these
|
||||
* or they will be set by AddNodeMetadata/ActivateNodeList if needed.
|
||||
* Specifically, hasMetadata is often true, and metadataSynced true after activation.
|
||||
* Since this replica is inactive, metadata sync status might be less critical initially.
|
||||
*/
|
||||
|
||||
bool nodeAlreadyExists = false;
|
||||
bool localOnly = false; /* Propagate change to other workers with metadata */
|
||||
|
||||
/*
|
||||
* AddNodeMetadata will take an ExclusiveLock on pg_dist_node.
|
||||
* It also checks again if the node already exists after acquiring the lock.
|
||||
*/
|
||||
int cloneNodeId = AddNodeMetadata(cloneHostname, clonePort, &nodeMetadata,
|
||||
&nodeAlreadyExists, localOnly);
|
||||
|
||||
if (nodeAlreadyExists)
|
||||
{
|
||||
/* This case should ideally be caught by the FindWorkerNodeAnyCluster check above,
|
||||
* but AddNodeMetadata does its own check after locking.
|
||||
* If it already exists and is correctly configured, we might have returned NOTICE above.
|
||||
* If it exists but is NOT correctly configured as our replica, an ERROR would be more appropriate.
|
||||
* AddNodeMetadata returns the existing node's ID if it finds one.
|
||||
* We need to ensure it is the *correct* replica.
|
||||
*/
|
||||
WorkerNode *fetchedExistingNode = FindNodeAnyClusterByNodeId(cloneNodeId);
|
||||
if (fetchedExistingNode != NULL && fetchedExistingNode->nodeisclone &&
|
||||
fetchedExistingNode->nodeprimarynodeid == primaryWorkerNode->nodeId)
|
||||
{
|
||||
ereport(NOTICE, (errmsg(
|
||||
"node %s:%d was already correctly registered as a clone for primary %s:%d (nodeid %d)",
|
||||
cloneHostname, clonePort,
|
||||
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||
workerPort, primaryWorkerNode->nodeId)));
|
||||
|
||||
/* Intentional fall-through to return cloneNodeId */
|
||||
}
|
||||
else
|
||||
{
|
||||
/* This state is less expected if our initial check passed or errored. */
|
||||
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
|
||||
errmsg(
|
||||
"node %s:%d already exists but is not correctly configured as a clone for primary %s:%d",
|
||||
cloneHostname, clonePort, primaryWorkerNode->workerName,
|
||||
primaryWorkerNode->workerPort)));
|
||||
}
|
||||
}
|
||||
|
||||
TransactionModifiedNodeMetadata = true;
|
||||
|
||||
/*
|
||||
* Note: Clones added this way are inactive.
|
||||
* A separate UDF citus_promote_clone_and_rebalance
|
||||
* would be needed to activate them.
|
||||
*/
|
||||
|
||||
return cloneNodeId;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* citus_remove_clone_node removes an inactive streaming clone node from Citus metadata.
|
||||
*/
|
||||
Datum
|
||||
citus_remove_clone_node(PG_FUNCTION_ARGS)
|
||||
{
|
||||
CheckCitusVersion(ERROR);
|
||||
EnsureSuperUser();
|
||||
EnsureCoordinator();
|
||||
|
||||
text *nodeNameText = PG_GETARG_TEXT_P(0);
|
||||
int32 nodePort = PG_GETARG_INT32(1);
|
||||
char *nodeName = text_to_cstring(nodeNameText);
|
||||
|
||||
WorkerNode *workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
|
||||
|
||||
if (workerNode == NULL)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("node \"%s:%d\" does not exist", nodeName, nodePort)));
|
||||
}
|
||||
|
||||
RemoveCloneNode(workerNode);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* citus_remove_clone_node_with_nodeid removes an inactive clone node from Citus metadata
|
||||
* using the node's ID.
|
||||
*/
|
||||
Datum
|
||||
citus_remove_clone_node_with_nodeid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
CheckCitusVersion(ERROR);
|
||||
EnsureSuperUser();
|
||||
EnsureCoordinator();
|
||||
|
||||
uint32 replicaNodeId = PG_GETARG_INT32(0);
|
||||
|
||||
WorkerNode *replicaNode = FindNodeAnyClusterByNodeId(replicaNodeId);
|
||||
|
||||
if (replicaNode == NULL)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Clone node with ID %d does not exist", replicaNodeId)));
|
||||
}
|
||||
RemoveCloneNode(replicaNode);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
RemoveCloneNode(WorkerNode *cloneNode)
|
||||
{
|
||||
Assert(cloneNode != NULL);
|
||||
|
||||
if (!cloneNode->nodeisclone)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Node %s:%d (ID %d) is not a clone node. "
|
||||
"Use citus_remove_node() to remove primary or already promoted nodes.",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
}
|
||||
|
||||
if (cloneNode->isActive)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"Clone node %s:%d (ID %d) is marked as active and cannot be removed with this function. "
|
||||
"This might indicate a promoted clone. Consider using citus_remove_node() if you are sure, "
|
||||
"or ensure it's properly deactivated if it's an unpromoted clone in an unexpected state.",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
}
|
||||
|
||||
/*
|
||||
* All checks passed, proceed with removal.
|
||||
* RemoveNodeFromCluster handles locking, catalog changes, connection closing, and metadata sync.
|
||||
*/
|
||||
ereport(NOTICE, (errmsg("Removing inactive clone node %s:%d (ID %d)",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
|
||||
RemoveNodeFromCluster(cloneNode->workerName, cloneNode->workerPort);
|
||||
|
||||
/* RemoveNodeFromCluster might set this, but setting it here ensures it's marked for this UDF's transaction. */
|
||||
TransactionModifiedNodeMetadata = true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* SetLockTimeoutLocally sets the lock_timeout to the given value.
|
||||
* This setting is local.
|
||||
|
|
@ -1859,7 +2198,7 @@ FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort)
|
|||
HeapTuple heapTuple = GetNodeTuple(nodeName, nodePort);
|
||||
if (heapTuple != NULL)
|
||||
{
|
||||
workerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
|
||||
workerNode = TupleToWorkerNode(pgDistNode, tupleDescriptor, heapTuple);
|
||||
}
|
||||
|
||||
table_close(pgDistNode, NoLock);
|
||||
|
|
@ -1871,7 +2210,7 @@ FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort)
|
|||
* FindNodeAnyClusterByNodeId searches pg_dist_node and returns the node with
|
||||
* the nodeId. If the node can't be found returns NULL.
|
||||
*/
|
||||
static WorkerNode *
|
||||
WorkerNode *
|
||||
FindNodeAnyClusterByNodeId(uint32 nodeId)
|
||||
{
|
||||
bool includeNodesFromOtherClusters = true;
|
||||
|
|
@ -1966,7 +2305,8 @@ ReadDistNode(bool includeNodesFromOtherClusters)
|
|||
HeapTuple heapTuple = systable_getnext(scanDescriptor);
|
||||
while (HeapTupleIsValid(heapTuple))
|
||||
{
|
||||
WorkerNode *workerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
|
||||
WorkerNode *workerNode = TupleToWorkerNode(pgDistNode, tupleDescriptor, heapTuple)
|
||||
;
|
||||
|
||||
if (includeNodesFromOtherClusters ||
|
||||
strncmp(workerNode->nodeCluster, CurrentCluster, WORKER_LENGTH) == 0)
|
||||
|
|
@ -2513,7 +2853,7 @@ SetWorkerColumnLocalOnly(WorkerNode *workerNode, int columnIndex, Datum value)
|
|||
CitusInvalidateRelcacheByRelid(DistNodeRelationId());
|
||||
CommandCounterIncrement();
|
||||
|
||||
WorkerNode *newWorkerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
|
||||
WorkerNode *newWorkerNode = TupleToWorkerNode(pgDistNode, tupleDescriptor, heapTuple);
|
||||
|
||||
table_close(pgDistNode, NoLock);
|
||||
|
||||
|
|
@ -2924,6 +3264,10 @@ InsertNodeRow(int nodeid, char *nodeName, int32 nodePort, NodeMetadata *nodeMeta
|
|||
values[Anum_pg_dist_node_nodecluster - 1] = nodeClusterNameDatum;
|
||||
values[Anum_pg_dist_node_shouldhaveshards - 1] = BoolGetDatum(
|
||||
nodeMetadata->shouldHaveShards);
|
||||
values[Anum_pg_dist_node_nodeisclone - 1] = BoolGetDatum(
|
||||
nodeMetadata->nodeisclone);
|
||||
values[Anum_pg_dist_node_nodeprimarynodeid - 1] = Int32GetDatum(
|
||||
nodeMetadata->nodeprimarynodeid);
|
||||
|
||||
Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);
|
||||
|
||||
|
|
@ -3015,19 +3359,18 @@ DeleteNodeRow(char *nodeName, int32 nodePort)
|
|||
* the caller already has locks on the tuple, and doesn't perform any locking.
|
||||
*/
|
||||
static WorkerNode *
|
||||
TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple)
|
||||
TupleToWorkerNode(Relation pgDistNode, TupleDesc tupleDescriptor, HeapTuple heapTuple)
|
||||
{
|
||||
Datum datumArray[Natts_pg_dist_node];
|
||||
bool isNullArray[Natts_pg_dist_node];
|
||||
|
||||
Assert(!HeapTupleHasNulls(heapTuple));
|
||||
|
||||
/*
|
||||
* This function can be called before "ALTER TABLE ... ADD COLUMN nodecluster ...",
|
||||
* therefore heap_deform_tuple() won't set the isNullArray for this column. We
|
||||
* initialize it true to be safe in that case.
|
||||
/* we add remove columns from pg_dist_node during extension upgrade and
|
||||
* and downgrads. Now the issue here is PostgreSQL never reuses the old
|
||||
* attnum. Dropped columns leave “holes” (attributes with attisdropped = true),
|
||||
* and a re-added column with the same name gets a new attnum at the end. So
|
||||
* we cannot use the deined Natts_pg_dist_node to allocate memory and also
|
||||
* we need to cater for the holes when fetching the column values
|
||||
*/
|
||||
memset(isNullArray, true, sizeof(isNullArray));
|
||||
int nAtts = tupleDescriptor->natts;
|
||||
Datum *datumArray = palloc0(sizeof(Datum) * nAtts);
|
||||
bool *isNullArray = palloc0(sizeof(bool) * nAtts);
|
||||
|
||||
/*
|
||||
* We use heap_deform_tuple() instead of heap_getattr() to expand tuple
|
||||
|
|
@ -3054,18 +3397,48 @@ TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple)
|
|||
1]);
|
||||
|
||||
/*
|
||||
* nodecluster column can be missing. In the case of extension creation/upgrade,
|
||||
* master_initialize_node_metadata function is called before the nodecluster
|
||||
* column is added to pg_dist_node table.
|
||||
* Attributes above this line are guaranteed to be present at the
|
||||
* exact defined attribute number. Atleast till now. If you are droping or
|
||||
* adding any of the above columns consider adjusting the code above
|
||||
*/
|
||||
if (!isNullArray[Anum_pg_dist_node_nodecluster - 1])
|
||||
Oid pgDistNodeRelId = RelationGetRelid(pgDistNode);
|
||||
|
||||
AttrNumber nodeClusterAttno = get_attnum(pgDistNodeRelId, "nodecluster");
|
||||
|
||||
if (nodeClusterAttno > 0 &&
|
||||
!TupleDescAttr(tupleDescriptor, nodeClusterAttno - 1)->attisdropped &&
|
||||
!isNullArray[nodeClusterAttno - 1])
|
||||
{
|
||||
Name nodeClusterName =
|
||||
DatumGetName(datumArray[Anum_pg_dist_node_nodecluster - 1]);
|
||||
DatumGetName(datumArray[nodeClusterAttno - 1]);
|
||||
char *nodeClusterString = NameStr(*nodeClusterName);
|
||||
strlcpy(workerNode->nodeCluster, nodeClusterString, NAMEDATALEN);
|
||||
}
|
||||
|
||||
if (nAtts > Anum_pg_dist_node_nodeisclone)
|
||||
{
|
||||
AttrNumber nodeIsCloneAttno = get_attnum(pgDistNodeRelId, "nodeisclone");
|
||||
if (nodeIsCloneAttno > 0 &&
|
||||
!TupleDescAttr(tupleDescriptor, nodeIsCloneAttno - 1)->attisdropped &&
|
||||
!isNullArray[nodeIsCloneAttno - 1])
|
||||
{
|
||||
workerNode->nodeisclone = DatumGetBool(datumArray[nodeIsCloneAttno - 1]);
|
||||
}
|
||||
AttrNumber nodePrimaryNodeIdAttno = get_attnum(pgDistNodeRelId,
|
||||
"nodeprimarynodeid");
|
||||
if (nodePrimaryNodeIdAttno > 0 &&
|
||||
!TupleDescAttr(tupleDescriptor, nodePrimaryNodeIdAttno - 1)->attisdropped &&
|
||||
!isNullArray[nodePrimaryNodeIdAttno - 1])
|
||||
{
|
||||
workerNode->nodeprimarynodeid = DatumGetInt32(datumArray[
|
||||
nodePrimaryNodeIdAttno - 1])
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
pfree(datumArray);
|
||||
pfree(isNullArray);
|
||||
|
||||
return workerNode;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,422 @@
|
|||
#include "postgres.h"
|
||||
|
||||
#include "utils/fmgrprotos.h"
|
||||
#include "utils/pg_lsn.h"
|
||||
|
||||
#include "distributed/argutils.h"
|
||||
#include "distributed/clonenode_utils.h"
|
||||
#include "distributed/listutils.h"
|
||||
#include "distributed/metadata_cache.h"
|
||||
#include "distributed/metadata_sync.h"
|
||||
#include "distributed/remote_commands.h"
|
||||
#include "distributed/shard_rebalancer.h"
|
||||
|
||||
|
||||
static void BlockAllWritesToWorkerNode(WorkerNode *workerNode);
|
||||
static bool GetNodeIsInRecoveryStatus(WorkerNode *workerNode);
|
||||
static void PromoteCloneNode(WorkerNode *cloneWorkerNode);
|
||||
static void EnsureSingleNodePromotion(WorkerNode *primaryNode);
|
||||
|
||||
PG_FUNCTION_INFO_V1(citus_promote_clone_and_rebalance);
|
||||
|
||||
/*
|
||||
* citus_promote_clone_and_rebalance promotes an inactive clone node to become
|
||||
* the new primary node, replacing its original primary node.
|
||||
*
|
||||
* This function performs the following steps:
|
||||
* 1. Validates that the clone node exists and is properly configured
|
||||
* 2. Ensures the clone is inactive and has a valid primary node reference
|
||||
* 3. Blocks all writes to the primary node to prevent data divergence
|
||||
* 4. Waits for the clone to catch up with the primary's WAL position
|
||||
* 5. Promotes the clone node to become a standalone primary
|
||||
* 6. Updates metadata to mark the clone as active and primary
|
||||
* 7. Rebalances shards between the old primary and new primary
|
||||
* 8. Returns information about the promotion and any shard movements
|
||||
*
|
||||
* Arguments:
|
||||
* - clone_nodeid: The node ID of the clone to promote
|
||||
* - catchUpTimeoutSeconds: Maximum time to wait for clone to catch up (default: 300)
|
||||
*
|
||||
* The function ensures data consistency by blocking writes during the promotion
|
||||
* process and verifying replication lag before proceeding.
|
||||
*/
|
||||
Datum
|
||||
citus_promote_clone_and_rebalance(PG_FUNCTION_ARGS)
|
||||
{
|
||||
/* Ensure superuser and coordinator */
|
||||
EnsureSuperUser();
|
||||
EnsureCoordinator();
|
||||
|
||||
/* Get clone_nodeid argument */
|
||||
int32 cloneNodeIdArg = PG_GETARG_INT32(0);
|
||||
|
||||
/* Get catchUpTimeoutSeconds argument with default value of 300 */
|
||||
int32 catchUpTimeoutSeconds = PG_ARGISNULL(2) ? 300 : PG_GETARG_INT32(2);
|
||||
|
||||
/* Lock pg_dist_node to prevent concurrent modifications during this operation */
|
||||
LockRelationOid(DistNodeRelationId(), RowExclusiveLock);
|
||||
|
||||
WorkerNode *cloneNode = FindNodeAnyClusterByNodeId(cloneNodeIdArg);
|
||||
if (cloneNode == NULL)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Clone node with ID %d not found.", cloneNodeIdArg)));
|
||||
}
|
||||
|
||||
if (!cloneNode->nodeisclone || cloneNode->nodeprimarynodeid == 0)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"Node %s:%d (ID %d) is not a valid clone or its primary node ID is not set.",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
}
|
||||
|
||||
if (cloneNode->isActive)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"Clone node %s:%d (ID %d) is already active and cannot be promoted.",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
}
|
||||
|
||||
WorkerNode *primaryNode = FindNodeAnyClusterByNodeId(cloneNode->nodeprimarynodeid);
|
||||
if (primaryNode == NULL)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Primary node with ID %d (for clone %s:%d) not found.",
|
||||
cloneNode->nodeprimarynodeid, cloneNode->workerName,
|
||||
cloneNode->workerPort)));
|
||||
}
|
||||
|
||||
if (primaryNode->nodeisclone)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Primary node %s:%d (ID %d) is itself a clone.",
|
||||
primaryNode->workerName, primaryNode->workerPort,
|
||||
primaryNode->nodeId)));
|
||||
}
|
||||
|
||||
if (!primaryNode->isActive)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Primary node %s:%d (ID %d) is not active.",
|
||||
primaryNode->workerName, primaryNode->workerPort,
|
||||
primaryNode->nodeId)));
|
||||
}
|
||||
|
||||
/* Ensure the primary node is related to the clone node */
|
||||
if (primaryNode->nodeId != cloneNode->nodeprimarynodeid)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"Clone node %s:%d (ID %d) is not a clone of the primary node %s:%d (ID %d).",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId,
|
||||
primaryNode->workerName, primaryNode->workerPort,
|
||||
primaryNode->nodeId)));
|
||||
}
|
||||
|
||||
EnsureSingleNodePromotion(primaryNode);
|
||||
ereport(NOTICE, (errmsg(
|
||||
"Starting promotion process for clone node %s:%d (ID %d), original primary %s:%d (ID %d)",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId,
|
||||
primaryNode->workerName, primaryNode->workerPort, primaryNode
|
||||
->nodeId)));
|
||||
|
||||
/* Step 0: Check if clone is replica of provided primary node and is not synchronous */
|
||||
char *operation = "promote";
|
||||
EnsureValidCloneMode(primaryNode, cloneNode->workerName, cloneNode->workerPort,
|
||||
operation);
|
||||
|
||||
/* Step 1: Block Writes on Original Primary's Shards */
|
||||
ereport(NOTICE, (errmsg(
|
||||
"Blocking writes on shards of original primary node %s:%d (group %d)",
|
||||
primaryNode->workerName, primaryNode->workerPort, primaryNode
|
||||
->groupId)));
|
||||
|
||||
BlockAllWritesToWorkerNode(primaryNode);
|
||||
|
||||
/* Step 2: Wait for Clone to Catch Up */
|
||||
ereport(NOTICE, (errmsg(
|
||||
"Waiting for clone %s:%d to catch up with primary %s:%d (timeout: %d seconds)",
|
||||
cloneNode->workerName, cloneNode->workerPort,
|
||||
primaryNode->workerName, primaryNode->workerPort,
|
||||
catchUpTimeoutSeconds)));
|
||||
|
||||
bool caughtUp = false;
|
||||
const int sleepIntervalSeconds = 5;
|
||||
int elapsedTimeSeconds = 0;
|
||||
|
||||
while (elapsedTimeSeconds < catchUpTimeoutSeconds)
|
||||
{
|
||||
uint64 repLag = GetReplicationLag(primaryNode, cloneNode);
|
||||
if (repLag <= 0)
|
||||
{
|
||||
caughtUp = true;
|
||||
break;
|
||||
}
|
||||
pg_usleep(sleepIntervalSeconds * 1000000L);
|
||||
elapsedTimeSeconds += sleepIntervalSeconds;
|
||||
}
|
||||
|
||||
if (!caughtUp)
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"Clone %s:%d failed to catch up with primary %s:%d within %d seconds.",
|
||||
cloneNode->workerName, cloneNode->workerPort,
|
||||
primaryNode->workerName, primaryNode->workerPort,
|
||||
catchUpTimeoutSeconds)));
|
||||
}
|
||||
|
||||
ereport(NOTICE, (errmsg("Clone %s:%d is now caught up with primary %s:%d.",
|
||||
cloneNode->workerName, cloneNode->workerPort,
|
||||
primaryNode->workerName, primaryNode->workerPort)));
|
||||
|
||||
|
||||
/* Step 3: PostgreSQL Clone Promotion */
|
||||
ereport(NOTICE, (errmsg("Attempting to promote clone %s:%d via pg_promote().",
|
||||
cloneNode->workerName, cloneNode->workerPort)));
|
||||
|
||||
PromoteCloneNode(cloneNode);
|
||||
|
||||
/* Step 4: Update Clone Metadata in pg_dist_node on Coordinator */
|
||||
|
||||
ereport(NOTICE, (errmsg("Updating metadata for promoted clone %s:%d (ID %d)",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
ActivateCloneNodeAsPrimary(cloneNode);
|
||||
|
||||
/* We need to sync metadata changes to all nodes before rebalancing shards
|
||||
* since the rebalancing algorithm depends on the latest metadata.
|
||||
*/
|
||||
SyncNodeMetadataToNodes();
|
||||
|
||||
/* Step 5: Split Shards Between Primary and Clone */
|
||||
SplitShardsBetweenPrimaryAndClone(primaryNode, cloneNode, PG_GETARG_NAME_OR_NULL(1))
|
||||
;
|
||||
|
||||
|
||||
TransactionModifiedNodeMetadata = true; /* Inform Citus about metadata change */
|
||||
TriggerNodeMetadataSyncOnCommit(); /* Ensure changes are propagated */
|
||||
|
||||
|
||||
ereport(NOTICE, (errmsg(
|
||||
"Clone node %s:%d (ID %d) metadata updated. It is now a primary",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
|
||||
|
||||
/* Step 6: Unblock Writes (should be handled by transaction commit) */
|
||||
ereport(NOTICE, (errmsg(
|
||||
"Clone node %s:%d (ID %d) successfully registered as a worker node",
|
||||
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||
nodeId)));
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* PromoteCloneNode promotes a clone node to a primary node using PostgreSQL's
|
||||
* pg_promote() function.
|
||||
*
|
||||
* This function performs the following steps:
|
||||
* 1. Connects to the clone node
|
||||
* 2. Executes pg_promote(wait := true) to promote the clone to primary
|
||||
* 3. Reconnects to verify the promotion was successful
|
||||
* 4. Checks if the node is still in recovery mode (which would indicate failure)
|
||||
*
|
||||
* The function throws an ERROR if:
|
||||
* - Connection to the clone node fails
|
||||
* - The pg_promote() command fails
|
||||
* - The clone is still in recovery mode after promotion attempt
|
||||
*
|
||||
* On success, it logs a NOTICE message confirming the promotion.
|
||||
*
|
||||
* Note: This function assumes the clone has already been validated for promotion
|
||||
* (e.g., replication lag is acceptable, clone is not synchronous, etc.)
|
||||
*/
|
||||
static void
|
||||
PromoteCloneNode(WorkerNode *cloneWorkerNode)
|
||||
{
|
||||
/* Step 1: Connect to the clone node */
|
||||
int connectionFlag = 0;
|
||||
MultiConnection *cloneConnection = GetNodeConnection(connectionFlag,
|
||||
cloneWorkerNode->workerName,
|
||||
cloneWorkerNode->workerPort);
|
||||
|
||||
if (PQstatus(cloneConnection->pgConn) != CONNECTION_OK)
|
||||
{
|
||||
ReportConnectionError(cloneConnection, ERROR);
|
||||
}
|
||||
|
||||
/* Step 2: Execute pg_promote() to promote the clone to primary */
|
||||
const char *promoteQuery = "SELECT pg_promote(wait := true);";
|
||||
int resultCode = SendRemoteCommand(cloneConnection, promoteQuery);
|
||||
if (resultCode == 0)
|
||||
{
|
||||
ReportConnectionError(cloneConnection, ERROR);
|
||||
}
|
||||
ForgetResults(cloneConnection);
|
||||
CloseConnection(cloneConnection);
|
||||
|
||||
/* Step 3: Reconnect and verify the promotion was successful */
|
||||
if (GetNodeIsInRecoveryStatus(cloneWorkerNode))
|
||||
{
|
||||
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg(
|
||||
"Failed to promote clone %s:%d (ID %d). It is still in recovery.",
|
||||
cloneWorkerNode->workerName, cloneWorkerNode->workerPort,
|
||||
cloneWorkerNode->nodeId)));
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(NOTICE, (errmsg(
|
||||
"Clone node %s:%d (ID %d) has been successfully promoted.",
|
||||
cloneWorkerNode->workerName, cloneWorkerNode->workerPort,
|
||||
cloneWorkerNode->nodeId)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
BlockAllWritesToWorkerNode(WorkerNode *workerNode)
|
||||
{
|
||||
ereport(NOTICE, (errmsg("Blocking all writes to worker node %s:%d (ID %d)",
|
||||
workerNode->workerName, workerNode->workerPort, workerNode->
|
||||
nodeId)));
|
||||
|
||||
LockShardsInWorkerPlacementList(workerNode, AccessExclusiveLock);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* GetNodeIsInRecoveryStatus checks if a PostgreSQL node is currently in recovery mode.
|
||||
*
|
||||
* This function connects to the specified worker node and executes pg_is_in_recovery()
|
||||
* to determine if the node is still acting as a replica (in recovery) or has been
|
||||
* promoted to a primary (not in recovery).
|
||||
*
|
||||
* Arguments:
|
||||
* - workerNode: The WorkerNode to check recovery status for
|
||||
*
|
||||
* Returns:
|
||||
* - true if the node is in recovery mode (acting as a replica)
|
||||
* - false if the node is not in recovery mode (acting as a primary)
|
||||
*
|
||||
* The function will ERROR if:
|
||||
* - Cannot establish connection to the node
|
||||
* - The remote query fails
|
||||
* - The query result cannot be parsed
|
||||
*
|
||||
* This is used after promoting a clone node to verify that the
|
||||
* promotion was successful and the node is no longer in recovery mode.
|
||||
*/
|
||||
static bool
|
||||
GetNodeIsInRecoveryStatus(WorkerNode *workerNode)
|
||||
{
|
||||
int connectionFlag = 0;
|
||||
MultiConnection *nodeConnection = GetNodeConnection(connectionFlag,
|
||||
workerNode->workerName,
|
||||
workerNode->workerPort);
|
||||
|
||||
if (PQstatus(nodeConnection->pgConn) != CONNECTION_OK)
|
||||
{
|
||||
ReportConnectionError(nodeConnection, ERROR);
|
||||
}
|
||||
|
||||
const char *recoveryQuery = "SELECT pg_is_in_recovery();";
|
||||
int resultCode = SendRemoteCommand(nodeConnection, recoveryQuery);
|
||||
if (resultCode == 0)
|
||||
{
|
||||
ReportConnectionError(nodeConnection, ERROR);
|
||||
}
|
||||
|
||||
PGresult *result = GetRemoteCommandResult(nodeConnection, true);
|
||||
if (!IsResponseOK(result))
|
||||
{
|
||||
ReportResultError(nodeConnection, result, ERROR);
|
||||
}
|
||||
|
||||
List *recoveryStatusList = ReadFirstColumnAsText(result);
|
||||
if (list_length(recoveryStatusList) != 1)
|
||||
{
|
||||
PQclear(result);
|
||||
ClearResults(nodeConnection, true);
|
||||
CloseConnection(nodeConnection);
|
||||
|
||||
ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
|
||||
errmsg("cannot parse recovery status result from %s:%d",
|
||||
workerNode->workerName,
|
||||
workerNode->workerPort)));
|
||||
}
|
||||
|
||||
StringInfo recoveryStatusInfo = (StringInfo) linitial(recoveryStatusList);
|
||||
bool isInRecovery = (strcmp(recoveryStatusInfo->data, "t") == 0) || (strcmp(
|
||||
recoveryStatusInfo
|
||||
->data,
|
||||
"true") == 0)
|
||||
;
|
||||
|
||||
PQclear(result);
|
||||
ForgetResults(nodeConnection);
|
||||
CloseConnection(nodeConnection);
|
||||
|
||||
return isInRecovery;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* EnsureSingleNodePromotion ensures that only one node promotion operation
|
||||
* can proceed at a time by acquiring necessary locks and checking for
|
||||
* conflicting operations.
|
||||
*
|
||||
* This function performs the following safety checks:
|
||||
* 1. Verifies no rebalance operations are currently running, as they would
|
||||
* conflict with the shard redistribution that occurs during promotion
|
||||
* 2. Acquires exclusive placement colocation locks on all shards residing
|
||||
* on the primary node's group to prevent concurrent shard operations
|
||||
*
|
||||
* The locks are acquired in shard ID order to prevent deadlocks when
|
||||
* multiple operations attempt to lock the same set of shards.
|
||||
*
|
||||
* Arguments:
|
||||
* - primaryNode: The primary node whose shards need to be locked
|
||||
*
|
||||
* Throws ERROR if:
|
||||
* - A rebalance operation is already running
|
||||
* - Unable to acquire necessary locks
|
||||
*/
|
||||
static void
|
||||
EnsureSingleNodePromotion(WorkerNode *primaryNode)
|
||||
{
|
||||
/* Error out if some rebalancer is running */
|
||||
int64 jobId = 0;
|
||||
if (HasNonTerminalJobOfType("rebalance", &jobId))
|
||||
{
|
||||
ereport(ERROR, (
|
||||
errmsg("A rebalance operation is already running as job %ld", jobId),
|
||||
errdetail("A rebalance was already scheduled as background job"),
|
||||
errhint("To monitor progress, run: SELECT * FROM "
|
||||
"citus_rebalance_status();")));
|
||||
}
|
||||
List *placementList = AllShardPlacementsOnNodeGroup(primaryNode->groupId);
|
||||
|
||||
/* lock shards in order of shard id to prevent deadlock */
|
||||
placementList = SortList(placementList, CompareShardPlacementsByShardId);
|
||||
|
||||
GroupShardPlacement *placement = NULL;
|
||||
foreach_declared_ptr(placement, placementList)
|
||||
{
|
||||
int64 shardId = placement->shardId;
|
||||
ShardInterval *shardInterval = LoadShardInterval(shardId);
|
||||
Oid distributedTableId = shardInterval->relationId;
|
||||
|
||||
AcquirePlacementColocationLock(distributedTableId, ExclusiveLock, "promote clone")
|
||||
;
|
||||
}
|
||||
}
|
||||
|
|
@ -81,8 +81,29 @@ typedef struct RebalanceOptions
|
|||
Form_pg_dist_rebalance_strategy rebalanceStrategy;
|
||||
const char *operationName;
|
||||
WorkerNode *workerNode;
|
||||
List *involvedWorkerNodeList;
|
||||
} RebalanceOptions;
|
||||
|
||||
typedef struct SplitPrimaryCloneShards
|
||||
{
|
||||
/*
|
||||
* primaryShardPlacementList contains the placements that
|
||||
* should stay on primary worker node.
|
||||
*/
|
||||
List *primaryShardIdList;
|
||||
|
||||
/*
|
||||
* cloneShardPlacementList contains the placements that should stay on
|
||||
* clone worker node.
|
||||
*/
|
||||
List *cloneShardIdList;
|
||||
} SplitPrimaryCloneShards;
|
||||
|
||||
|
||||
static SplitPrimaryCloneShards * GetPrimaryCloneSplitRebalanceSteps(RebalanceOptions
|
||||
*options,
|
||||
WorkerNode
|
||||
*cloneNode);
|
||||
|
||||
/*
|
||||
* RebalanceState is used to keep the internal state of the rebalance
|
||||
|
|
@ -324,6 +345,7 @@ PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check);
|
|||
PG_FUNCTION_INFO_V1(citus_rebalance_start);
|
||||
PG_FUNCTION_INFO_V1(citus_rebalance_stop);
|
||||
PG_FUNCTION_INFO_V1(citus_rebalance_wait);
|
||||
PG_FUNCTION_INFO_V1(get_snapshot_based_node_split_plan);
|
||||
|
||||
bool RunningUnderCitusTestSuite = false;
|
||||
int MaxRebalancerLoggedIgnoredMoves = 5;
|
||||
|
|
@ -523,8 +545,17 @@ GetRebalanceSteps(RebalanceOptions *options)
|
|||
.context = &context,
|
||||
};
|
||||
|
||||
if (options->involvedWorkerNodeList == NULL)
|
||||
{
|
||||
/*
|
||||
* If the user did not specify a list of worker nodes, we use all the
|
||||
* active worker nodes.
|
||||
*/
|
||||
options->involvedWorkerNodeList = SortedActiveWorkers();
|
||||
}
|
||||
|
||||
/* sort the lists to make the function more deterministic */
|
||||
List *activeWorkerList = SortedActiveWorkers();
|
||||
List *activeWorkerList = options->involvedWorkerNodeList; /*SortedActiveWorkers(); */
|
||||
int shardAllowedNodeCount = 0;
|
||||
WorkerNode *workerNode = NULL;
|
||||
foreach_declared_ptr(workerNode, activeWorkerList)
|
||||
|
|
@ -987,6 +1018,7 @@ rebalance_table_shards(PG_FUNCTION_ARGS)
|
|||
.excludedShardArray = PG_GETARG_ARRAYTYPE_P(3),
|
||||
.drainOnly = PG_GETARG_BOOL(5),
|
||||
.rebalanceStrategy = strategy,
|
||||
.involvedWorkerNodeList = NULL,
|
||||
.improvementThreshold = strategy->improvementThreshold,
|
||||
};
|
||||
Oid shardTransferModeOid = PG_GETARG_OID(4);
|
||||
|
|
@ -3607,6 +3639,352 @@ EnsureShardCostUDF(Oid functionOid)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* SplitShardsBetweenPrimaryAndClone splits the shards in shardPlacementList
|
||||
* between the primary and clone nodes, adding them to the respective lists.
|
||||
*/
|
||||
void
|
||||
SplitShardsBetweenPrimaryAndClone(WorkerNode *primaryNode,
|
||||
WorkerNode *cloneNode,
|
||||
Name strategyName)
|
||||
{
|
||||
CheckCitusVersion(ERROR);
|
||||
|
||||
List *relationIdList = NonColocatedDistRelationIdList();
|
||||
|
||||
Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy(strategyName);/* We use default strategy for now */
|
||||
|
||||
RebalanceOptions options = {
|
||||
.relationIdList = relationIdList,
|
||||
.threshold = 0, /* Threshold is not strictly needed for two nodes */
|
||||
.maxShardMoves = -1, /* No limit on moves between these two nodes */
|
||||
.excludedShardArray = construct_empty_array(INT8OID),
|
||||
.drainOnly = false, /* Not a drain operation */
|
||||
.rebalanceStrategy = strategy,
|
||||
.improvementThreshold = 0, /* Consider all beneficial moves */
|
||||
.workerNode = primaryNode /* indicate Primary node as a source node */
|
||||
};
|
||||
|
||||
SplitPrimaryCloneShards *splitShards = GetPrimaryCloneSplitRebalanceSteps(&options
|
||||
,
|
||||
cloneNode);
|
||||
AdjustShardsForPrimaryCloneNodeSplit(primaryNode, cloneNode,
|
||||
splitShards->primaryShardIdList, splitShards->
|
||||
cloneShardIdList);
|
||||
}
|
||||
|
||||
|
||||
/*
 * GetPrimaryCloneSplitRebalanceSteps returns a SplitPrimaryCloneShards struct
 * holding the shard IDs that should stay on the primary node and the shard IDs
 * that should move to the clone node in order to balance the two nodes.
 */
static SplitPrimaryCloneShards *
GetPrimaryCloneSplitRebalanceSteps(RebalanceOptions *options, WorkerNode *cloneNode)
{
	WorkerNode *sourceNode = options->workerNode;
	WorkerNode *targetNode = cloneNode;

	/* Initialize rebalance plan functions and context */
	EnsureShardCostUDF(options->rebalanceStrategy->shardCostFunction);
	EnsureNodeCapacityUDF(options->rebalanceStrategy->nodeCapacityFunction);
	EnsureShardAllowedOnNodeUDF(options->rebalanceStrategy->shardAllowedOnNodeFunction);

	RebalanceContext context;
	memset(&context, 0, sizeof(RebalanceContext));
	fmgr_info(options->rebalanceStrategy->shardCostFunction, &context.shardCostUDF);
	fmgr_info(options->rebalanceStrategy->nodeCapacityFunction, &context.nodeCapacityUDF);
	fmgr_info(options->rebalanceStrategy->shardAllowedOnNodeFunction,
			  &context.shardAllowedOnNodeUDF);

	RebalancePlanFunctions rebalancePlanFunctions = {
		.shardAllowedOnNode = ShardAllowedOnNode,
		.nodeCapacity = NodeCapacity,
		.shardCost = GetShardCost,
		.context = &context,
	};

	/*
	 * Collect all active shard placements on the source node for the given relations.
	 * Unlike the main rebalancer, we build a single list of all relevant source
	 * placements across all specified relations (or all relations if none specified).
	 */
	List *allSourcePlacements = NIL;
	Oid relationIdItr = InvalidOid;
	foreach_declared_oid(relationIdItr, options->relationIdList)
	{
		List *shardPlacementList = FullShardPlacementList(relationIdItr,
														  options->excludedShardArray);
		List *activeShardPlacementsForRelation =
			FilterShardPlacementList(shardPlacementList, IsActiveShardPlacement);

		ShardPlacement *placement = NULL;
		foreach_declared_ptr(placement, activeShardPlacementsForRelation)
		{
			if (placement->nodeId == sourceNode->nodeId)
			{
				/* skip duplicate shardIds in case a shard is somehow listed under multiple relations */
				bool alreadyAdded = false;
				ShardPlacement *existingPlacement = NULL;
				foreach_declared_ptr(existingPlacement, allSourcePlacements)
				{
					if (existingPlacement->shardId == placement->shardId)
					{
						alreadyAdded = true;
						break;
					}
				}
				if (!alreadyAdded)
				{
					allSourcePlacements = lappend(allSourcePlacements, placement);
				}
			}
		}
	}

	List *activeWorkerList = list_make2(options->workerNode, cloneNode);
	SplitPrimaryCloneShards *splitShards = palloc0(sizeof(SplitPrimaryCloneShards));
	splitShards->primaryShardIdList = NIL;
	splitShards->cloneShardIdList = NIL;

	if (list_length(allSourcePlacements) > 0)
	{
		/*
		 * Initialize RebalanceState considering only the source node's shards
		 * and the two active workers (source and target).
		 */
		RebalanceState *state = InitRebalanceState(activeWorkerList, allSourcePlacements,
												   &rebalancePlanFunctions);

		NodeFillState *sourceFillState = NULL;
		NodeFillState *targetFillState = NULL;
		ListCell *fsc = NULL;

		/* Identify the fill states for our specific source and target nodes */
		foreach(fsc, state->fillStateListAsc) /* fillStateListDesc would work too, order doesn't matter here */
		{
			NodeFillState *fs = (NodeFillState *) lfirst(fsc);
			if (fs->node->nodeId == sourceNode->nodeId)
			{
				sourceFillState = fs;
			}
			else if (fs->node->nodeId == targetNode->nodeId)
			{
				targetFillState = fs;
			}
		}

		if (sourceFillState != NULL && targetFillState != NULL)
		{
			/*
			 * The goal is to move roughly half of the total cost from source to target.
			 * The target node is assumed to be empty; any existing load on it is not
			 * considered for this two-node balancing plan. Costs are calculated based
			 * only on the shards currently on the source node.
			 *
			 * We simulate the balancing process between these two nodes: all shards
			 * start on sourceFillState, targetFillState is initially empty (in terms
			 * of these specific shards), and shards are moved from source to target
			 * until their costs are as balanced as possible.
			 */
			float4 sourceCurrentCost = sourceFillState->totalCost;
			float4 targetCurrentCost = 0; /* cost on target from these source shards */

			/* Sort shards on the source node by cost (descending), a common heuristic */
			sourceFillState->shardCostListDesc = SortList(sourceFillState->shardCostListDesc,
														  CompareShardCostDesc);

			List *potentialMoves = NIL;
			ListCell *lc_shardcost = NULL;

			/*
			 * Iterate over each shard on the source node and decide whether moving it
			 * to the target node improves the balance. A move is taken only if it
			 * reduces the absolute cost difference between the two nodes:
			 *   difference before the move: abs(sourceCurrentCost - targetCurrentCost)
			 *   difference after the move:  abs((sourceCurrentCost - cost) - (targetCurrentCost + cost))
			 * For example, with shard costs 4, 3, 2 and 1 on the source, this keeps
			 * costs 3 and 2 on the primary and moves 4 and 1 to the clone.
			 */
			foreach(lc_shardcost, sourceFillState->shardCostListDesc)
			{
				ShardCost *shardToConsider = (ShardCost *) lfirst(lc_shardcost);

				float4 costOfShard = shardToConsider->cost;
				float4 diffBefore = fabsf(sourceCurrentCost - targetCurrentCost);
				float4 diffAfter = fabsf((sourceCurrentCost - costOfShard) -
										 (targetCurrentCost + costOfShard));

				if (diffAfter < diffBefore)
				{
					PlacementUpdateEvent *update = palloc0(sizeof(PlacementUpdateEvent));
					update->shardId = shardToConsider->shardId;
					update->sourceNode = sourceNode;
					update->targetNode = targetNode;
					update->updateType = PLACEMENT_UPDATE_MOVE;
					potentialMoves = lappend(potentialMoves, update);
					splitShards->cloneShardIdList = lappend_int(splitShards->cloneShardIdList,
																shardToConsider->shardId);

					/* Update simulated costs for the next iteration */
					sourceCurrentCost -= costOfShard;
					targetCurrentCost += costOfShard;
				}
				else
				{
					splitShards->primaryShardIdList = lappend_int(splitShards->primaryShardIdList,
																  shardToConsider->shardId);
				}
			}
		}

		/* RebalanceState lives in the current memory context and is cleaned up with it */
	}
	return splitShards;
}

/*
 * get_snapshot_based_node_split_plan outputs the shard placement plan for a
 * snapshot-based (primary/clone) node split, without executing the promotion.
 *
 * SQL signature:
 * get_snapshot_based_node_split_plan(
 *     primary_node_name text,
 *     primary_node_port integer,
 *     replica_node_name text,
 *     replica_node_port integer,
 *     rebalance_strategy name DEFAULT NULL
 * )
 */
Datum
get_snapshot_based_node_split_plan(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	text *primaryNodeNameText = PG_GETARG_TEXT_P(0);
	int32 primaryNodePort = PG_GETARG_INT32(1);
	text *cloneNodeNameText = PG_GETARG_TEXT_P(2);
	int32 cloneNodePort = PG_GETARG_INT32(3);

	char *primaryNodeName = text_to_cstring(primaryNodeNameText);
	char *cloneNodeName = text_to_cstring(cloneNodeNameText);

	WorkerNode *primaryNode = FindWorkerNodeOrError(primaryNodeName, primaryNodePort);
	WorkerNode *cloneNode = FindWorkerNodeOrError(cloneNodeName, cloneNodePort);

	if (!cloneNode->nodeisclone || cloneNode->nodeprimarynodeid == 0)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("Node %s:%d (ID %d) is not a valid clone or its primary node ID is not set.",
							   cloneNode->workerName, cloneNode->workerPort,
							   cloneNode->nodeId)));
	}
	if (primaryNode->nodeisclone)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("Primary node %s:%d (ID %d) is itself a clone.",
							   primaryNode->workerName, primaryNode->workerPort,
							   primaryNode->nodeId)));
	}

	/* Ensure the clone node actually belongs to the given primary node */
	if (primaryNode->nodeId != cloneNode->nodeprimarynodeid)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("Clone node %s:%d (ID %d) is not a clone of the primary node %s:%d (ID %d).",
							   cloneNode->workerName, cloneNode->workerPort,
							   cloneNode->nodeId,
							   primaryNode->workerName, primaryNode->workerPort,
							   primaryNode->nodeId)));
	}

	List *relationIdList = NonColocatedDistRelationIdList();

	Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy(
		PG_GETARG_NAME_OR_NULL(4));

	RebalanceOptions options = {
		.relationIdList = relationIdList,
		.threshold = 0, /* Threshold is not strictly needed for two nodes */
		.maxShardMoves = -1, /* No limit on moves between these two nodes */
		.excludedShardArray = construct_empty_array(INT8OID),
		.drainOnly = false, /* Not a drain operation */
		.rebalanceStrategy = strategy,
		.improvementThreshold = 0, /* Consider all beneficial moves */
		.workerNode = primaryNode /* the primary node acts as the source node */
	};

	SplitPrimaryCloneShards *splitShards = GetPrimaryCloneSplitRebalanceSteps(&options,
																			  cloneNode);

	int shardId = 0;
	TupleDesc tupdesc;
	Tuplestorestate *tupstore = SetupTuplestore(fcinfo, &tupdesc);
	Datum values[4];
	bool nulls[4];

	foreach_declared_int(shardId, splitShards->primaryShardIdList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
		ListCell *colocatedShardCell = NULL;
		foreach(colocatedShardCell, colocatedShardList)
		{
			ShardInterval *colocatedShard = lfirst(colocatedShardCell);
			int colocatedShardId = colocatedShard->shardId;
			memset(values, 0, sizeof(values));
			memset(nulls, 0, sizeof(nulls));

			values[0] = ObjectIdGetDatum(RelationIdForShard(colocatedShardId));
			values[1] = UInt64GetDatum(colocatedShardId);
			values[2] = UInt64GetDatum(ShardLength(colocatedShardId));
			values[3] = PointerGetDatum(cstring_to_text("Primary Node"));
			tuplestore_putvalues(tupstore, tupdesc, values, nulls);
		}
	}

	foreach_declared_int(shardId, splitShards->cloneShardIdList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
		ListCell *colocatedShardCell = NULL;
		foreach(colocatedShardCell, colocatedShardList)
		{
			ShardInterval *colocatedShard = lfirst(colocatedShardCell);
			int colocatedShardId = colocatedShard->shardId;
			memset(values, 0, sizeof(values));
			memset(nulls, 0, sizeof(nulls));

			values[0] = ObjectIdGetDatum(RelationIdForShard(colocatedShardId));
			values[1] = UInt64GetDatum(colocatedShardId);
			values[2] = UInt64GetDatum(ShardLength(colocatedShardId));
			values[3] = PointerGetDatum(cstring_to_text("Clone Node"));
			tuplestore_putvalues(tupstore, tupdesc, values, nulls);
		}
	}

	return (Datum) 0;
}

/*
 * EnsureNodeCapacityUDF checks that the UDF matching the oid has the correct
 * signature to be used as a NodeCapacity function. The expected signature is:

@@ -759,6 +759,205 @@ TransferShards(int64 shardId, char *sourceNodeName,
}

/*
 * AdjustShardsForPrimaryCloneNodeSplit is called when a primary-clone node split
 * occurs. It adjusts the shard placements between the primary and clone nodes based
 * on the provided shard lists. Since the clone is an exact replica of the primary
 * but the metadata is not aware of this replication, this function updates the
 * metadata to reflect the new shard distribution.
 *
 * The function handles three types of shards:
 *
 * 1. Shards moving to the clone node (cloneShardList):
 *    - Updates shard placement metadata to move placements from primary to clone
 *    - No data movement is needed since the clone already has the data
 *    - Adds cleanup records to remove the shard data from primary at transaction commit
 *
 * 2. Shards staying on the primary node (primaryShardList):
 *    - Metadata already correctly reflects these shards on primary
 *    - Adds cleanup records to remove the shard data from the clone node
 *
 * 3. Reference tables:
 *    - Inserts new placement records on the clone node
 *    - Data is already present on the clone, so only a metadata update is needed
 *
 * This function does not perform any actual data movement; it only updates the
 * shard placement metadata and schedules cleanup operations for later execution.
 */
void
AdjustShardsForPrimaryCloneNodeSplit(WorkerNode *primaryNode,
									 WorkerNode *cloneNode,
									 List *primaryShardList,
									 List *cloneShardList)
{
	/* Input validation */
	if (primaryNode == NULL || cloneNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary or clone worker node is NULL")));
	}

	if (primaryNode->nodeId == cloneNode->nodeId)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary and clone nodes must be different")));
	}

	ereport(NOTICE, (errmsg("adjusting shard placements for primary %s:%d and clone %s:%d",
							primaryNode->workerName, primaryNode->workerPort,
							cloneNode->workerName, cloneNode->workerPort)));

	RegisterOperationNeedingCleanup();

	/*
	 * Process shards that will stay on the primary node.
	 * For these shards, we need to remove their data from the clone node
	 * since the metadata already correctly reflects them on primary.
	 */
	uint64 shardId = 0;
	uint32 primaryGroupId = GroupForNode(primaryNode->workerName, primaryNode->workerPort);
	uint32 cloneGroupId = GroupForNode(cloneNode->workerName, cloneNode->workerPort);

	ereport(NOTICE, (errmsg("processing %d shards for primary node GroupID %d",
							list_length(primaryShardList), primaryGroupId)));

	/*
	 * For each shard staying on primary, insert cleanup records to remove
	 * the shard data from the clone node. The metadata already correctly
	 * reflects these shards on primary, so no metadata changes are needed.
	 */
	foreach_declared_int(shardId, primaryShardList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);

		char *qualifiedShardName = ConstructQualifiedShardName(shardInterval);
		ereport(LOG, (errmsg("inserting DELETE shard record for shard %s from clone node GroupID %d",
							 qualifiedShardName, cloneGroupId)));

		InsertCleanupRecordsForShardPlacementsOnNode(colocatedShardList, cloneGroupId);
	}

	/*
	 * Process shards that will move to the clone node.
	 * For these shards, we need to:
	 * 1. Update metadata to move placements from primary to clone
	 * 2. Remove the shard data from primary (via cleanup records)
	 * 3. Skip any data movement, since the clone already has the data
	 */
	ereport(NOTICE, (errmsg("processing %d shards for clone node GroupID %d",
							list_length(cloneShardList), cloneGroupId)));

	foreach_declared_int(shardId, cloneShardList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);

		/*
		 * Create new shard placement records on the clone node for all
		 * colocated shards. This moves the shard placements from primary
		 * to clone in the metadata.
		 */
		foreach_declared_ptr(shardInterval, colocatedShardList)
		{
			uint64 colocatedShardId = shardInterval->shardId;

			uint64 placementId = GetNextPlacementId();
			InsertShardPlacementRow(colocatedShardId, placementId,
									ShardLength(colocatedShardId),
									cloneGroupId);
		}

		/*
		 * Update the metadata on worker nodes to reflect the new shard
		 * placement distribution between primary and clone nodes.
		 */
		UpdateColocatedShardPlacementMetadataOnWorkers(shardId,
													   primaryNode->workerName,
													   primaryNode->workerPort,
													   cloneNode->workerName,
													   cloneNode->workerPort);

		/*
		 * Remove the shard placement records from primary node metadata
		 * since these shards are now served from the clone node.
		 */
		DropShardPlacementsFromMetadata(colocatedShardList,
										primaryNode->workerName, primaryNode->workerPort);

		char *qualifiedShardName = ConstructQualifiedShardName(shardInterval);
		ereport(LOG, (errmsg("inserting DELETE shard record for shard %s from primary node GroupID %d",
							 qualifiedShardName, primaryGroupId)));

		/*
		 * Insert cleanup records to remove the shard data from the primary node
		 * at transaction commit. This frees up space on the primary node
		 * since the data is now served from the clone node.
		 */
		InsertCleanupRecordsForShardPlacementsOnNode(colocatedShardList, primaryGroupId);
	}

	/*
	 * Handle reference tables - these need to be available on both
	 * primary and clone nodes. Since the clone already has the data,
	 * we just need to insert placement records for the clone node.
	 */
	int colocationId = GetReferenceTableColocationId();

	if (colocationId == INVALID_COLOCATION_ID)
	{
		/* we have no reference tables yet */
		return;
	}
	ShardInterval *shardInterval = NULL;
	List *referenceTableIdList = CitusTableTypeIdList(REFERENCE_TABLE);
	Oid referenceTableId = linitial_oid(referenceTableIdList);
	List *shardIntervalList = LoadShardIntervalList(referenceTableId);
	foreach_declared_ptr(shardInterval, shardIntervalList)
	{
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
		ShardInterval *colocatedShardInterval = NULL;

		/*
		 * For each reference table shard, create placement records on the
		 * clone node. The data is already present on the clone, so we only
		 * need to update the metadata to make the clone aware of these shards.
		 */
		foreach_declared_ptr(colocatedShardInterval, colocatedShardList)
		{
			uint64 colocatedShardId = colocatedShardInterval->shardId;

			/*
			 * Insert the shard placement record for the clone node and
			 * propagate the metadata change to worker nodes.
			 */
			uint64 placementId = GetNextPlacementId();
			InsertShardPlacementRow(colocatedShardId, placementId,
									ShardLength(colocatedShardId),
									cloneGroupId);

			char *placementCommand = PlacementUpsertCommand(colocatedShardId, placementId,
															0, cloneGroupId);

			SendCommandToWorkersWithMetadata(placementCommand);
		}
	}

	ereport(NOTICE, (errmsg("shard placement adjustment complete for primary %s:%d and clone %s:%d",
							primaryNode->workerName, primaryNode->workerPort,
							cloneNode->workerName, cloneNode->workerPort)));
}

/*
 * Insert deferred cleanup records.
 * The shards will be dropped by the background cleaner later.

@@ -2269,6 +2468,7 @@ UpdateColocatedShardPlacementMetadataOnWorkers(int64 shardId,
							 "SELECT citus_internal.update_placement_metadata(%ld, %d, %d)",
							 colocatedShard->shardId,
							 sourceGroupId, targetGroupId);

		SendCommandToWorkersWithMetadata(updateCommand->data);
	}
}

@@ -0,0 +1,7 @@
-- Add clone (replica) information columns to pg_dist_node
ALTER TABLE pg_catalog.pg_dist_node ADD COLUMN nodeisclone BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE pg_catalog.pg_dist_node ADD COLUMN nodeprimarynodeid INT4 NOT NULL DEFAULT 0;

-- Add comments to the columns for clarity in \d output
COMMENT ON COLUMN pg_catalog.pg_dist_node.nodeisclone IS 'Indicates if this node is a replica of another node.';
COMMENT ON COLUMN pg_catalog.pg_dist_node.nodeprimarynodeid IS 'If nodeisclone is true, this stores the nodeid of its primary node.';
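
As a quick sanity check after this migration, the new columns can be inspected directly; the query below is only an illustration and simply lists registered clones alongside the nodeid of the primary they replicate:

```
-- Illustrative: list registered clone nodes and their primary's nodeid.
SELECT nodeid, nodename, nodeport, nodeprimarynodeid
FROM pg_catalog.pg_dist_node
WHERE nodeisclone;
```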
@@ -0,0 +1,3 @@
-- Remove clone information columns from pg_dist_node
ALTER TABLE pg_catalog.pg_dist_node DROP COLUMN IF EXISTS nodeisclone;
ALTER TABLE pg_catalog.pg_dist_node DROP COLUMN IF EXISTS nodeprimarynodeid;
@@ -3,6 +3,12 @@
-- bump version to 13.2-1
#include "udfs/worker_last_saved_explain_analyze/13.2-1.sql"

#include "cat_upgrades/add_clone_info_to_pg_dist_node.sql"
#include "udfs/citus_add_clone_node/13.2-1.sql"
#include "udfs/citus_remove_clone_node/13.2-1.sql"
#include "udfs/citus_promote_clone_and_rebalance/13.2-1.sql"
#include "udfs/get_snapshot_based_node_split_plan/13.2-1.sql"

#include "udfs/citus_rebalance_start/13.2-1.sql"
#include "udfs/citus_internal_copy_single_shard_placement/13.2-1.sql"

@@ -8,6 +8,16 @@ DROP FUNCTION IF EXISTS pg_catalog.citus_rebalance_start(name, boolean, citus.sh
DROP FUNCTION IF EXISTS pg_catalog.worker_last_saved_explain_analyze();
#include "../udfs/worker_last_saved_explain_analyze/9.4-1.sql"

DROP FUNCTION IF EXISTS pg_catalog.citus_add_clone_node(text, integer, text, integer);
DROP FUNCTION IF EXISTS pg_catalog.citus_add_clone_node_with_nodeid(text, integer, integer);

DROP FUNCTION IF EXISTS pg_catalog.citus_remove_clone_node(text, integer);
DROP FUNCTION IF EXISTS pg_catalog.citus_remove_clone_node_with_nodeid(integer);

DROP FUNCTION IF EXISTS pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer);
DROP FUNCTION IF EXISTS pg_catalog.get_snapshot_based_node_split_plan(text, integer, text, integer, name);

#include "../cat_upgrades/remove_clone_info_to_pg_dist_node.sql"
#include "../udfs/citus_finish_pg_upgrade/13.1-1.sql"

-- Note that we intentionally don't add the old columnar objects back to the "citus"

@@ -0,0 +1,26 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_add_clone_node(
    replica_hostname text,
    replica_port integer,
    primary_hostname text,
    primary_port integer)
RETURNS INTEGER
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_add_clone_node$$;

COMMENT ON FUNCTION pg_catalog.citus_add_clone_node(text, integer, text, integer) IS
'Adds a new node as a clone of an existing primary node. The clone is initially inactive. Returns the nodeid of the new clone node.';

REVOKE ALL ON FUNCTION pg_catalog.citus_add_clone_node(text, int, text, int) FROM PUBLIC;

CREATE OR REPLACE FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(
    replica_hostname text,
    replica_port integer,
    primary_nodeid integer)
RETURNS INTEGER
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_add_clone_node_with_nodeid$$;

COMMENT ON FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(text, integer, integer) IS
'Adds a new node as a clone of an existing primary node using the primary node''s ID. The clone is initially inactive. Returns the nodeid of the new clone node.';

REVOKE ALL ON FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(text, int, int) FROM PUBLIC;
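
A usage sketch for the UDFs above; the hostnames, ports, and nodeid are illustrative values, not taken from this PR's tests:

```
-- Register a streaming replica of an existing worker as a clone;
-- returns the nodeid assigned to the clone.
SELECT citus_add_clone_node('replica-host', 5433, 'worker-1-host', 5432);

-- Same operation when the primary's nodeid (assumed to be 2 here) is already known.
SELECT citus_add_clone_node_with_nodeid('replica-host', 5433, 2);
```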
@@ -0,0 +1,26 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_add_clone_node(
    replica_hostname text,
    replica_port integer,
    primary_hostname text,
    primary_port integer)
RETURNS INTEGER
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_add_clone_node$$;

COMMENT ON FUNCTION pg_catalog.citus_add_clone_node(text, integer, text, integer) IS
'Adds a new node as a clone of an existing primary node. The clone is initially inactive. Returns the nodeid of the new clone node.';

REVOKE ALL ON FUNCTION pg_catalog.citus_add_clone_node(text, int, text, int) FROM PUBLIC;

CREATE OR REPLACE FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(
    replica_hostname text,
    replica_port integer,
    primary_nodeid integer)
RETURNS INTEGER
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_add_clone_node_with_nodeid$$;

COMMENT ON FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(text, integer, integer) IS
'Adds a new node as a clone of an existing primary node using the primary node''s ID. The clone is initially inactive. Returns the nodeid of the new clone node.';

REVOKE ALL ON FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(text, int, int) FROM PUBLIC;
src/backend/distributed/sql/udfs/citus_promote_clone_and_rebalance/13.2-1.sql (new generated file, 13 lines)

@@ -0,0 +1,13 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_promote_clone_and_rebalance(
    clone_nodeid integer,
    rebalance_strategy name DEFAULT NULL,
    catchup_timeout_seconds integer DEFAULT 300
)
RETURNS VOID
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;

COMMENT ON FUNCTION pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer) IS
'Promotes a registered clone node to a primary, performs the necessary metadata updates, and rebalances a portion of shards from its original primary to the newly promoted node. The catchup_timeout_seconds parameter controls how long to wait for the clone to catch up with the primary (default: 300 seconds).';

REVOKE ALL ON FUNCTION pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer) FROM PUBLIC;
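
A usage sketch, assuming the clone was registered with nodeid 5 (an illustrative value) and a longer catch-up window is wanted:

```
-- Promote clone node 5 and rebalance shards between it and its original primary,
-- waiting up to 600 seconds for replication catch-up.
SELECT citus_promote_clone_and_rebalance(
    clone_nodeid => 5,
    catchup_timeout_seconds => 600);
```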
@@ -0,0 +1,13 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_promote_clone_and_rebalance(
    clone_nodeid integer,
    rebalance_strategy name DEFAULT NULL,
    catchup_timeout_seconds integer DEFAULT 300
)
RETURNS VOID
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;

COMMENT ON FUNCTION pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer) IS
'Promotes a registered clone node to a primary, performs the necessary metadata updates, and rebalances a portion of shards from its original primary to the newly promoted node. The catchup_timeout_seconds parameter controls how long to wait for the clone to catch up with the primary (default: 300 seconds).';

REVOKE ALL ON FUNCTION pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer) FROM PUBLIC;
@@ -0,0 +1,24 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_remove_clone_node(
    nodename text,
    nodeport integer
)
RETURNS VOID
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_remove_clone_node$$;

COMMENT ON FUNCTION pg_catalog.citus_remove_clone_node(text, integer)
IS 'Removes an inactive streaming clone node from Citus metadata. Errors if the node is not found, not registered as a clone, or is currently marked active.';

REVOKE ALL ON FUNCTION pg_catalog.citus_remove_clone_node(text, integer) FROM PUBLIC;

CREATE OR REPLACE FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(
    nodeid integer
)
RETURNS VOID
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_remove_clone_node_with_nodeid$$;

COMMENT ON FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(integer)
IS 'Removes an inactive streaming clone node from Citus metadata using its node ID. Errors if the node is not found, not registered as a clone, or is currently marked active.';

REVOKE ALL ON FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(integer) FROM PUBLIC;
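
If a registered clone should be deregistered instead of promoted, the removal UDFs apply; the hostname, port, and nodeid below are illustrative:

```
-- Deregister an inactive clone by host and port.
SELECT citus_remove_clone_node('replica-host', 5433);

-- Or by nodeid, assuming the clone was registered as node 5.
SELECT citus_remove_clone_node_with_nodeid(5);
```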
@@ -0,0 +1,24 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_remove_clone_node(
    nodename text,
    nodeport integer
)
RETURNS VOID
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_remove_clone_node$$;

COMMENT ON FUNCTION pg_catalog.citus_remove_clone_node(text, integer)
IS 'Removes an inactive streaming clone node from Citus metadata. Errors if the node is not found, not registered as a clone, or is currently marked active.';

REVOKE ALL ON FUNCTION pg_catalog.citus_remove_clone_node(text, integer) FROM PUBLIC;

CREATE OR REPLACE FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(
    nodeid integer
)
RETURNS VOID
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_remove_clone_node_with_nodeid$$;

COMMENT ON FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(integer)
IS 'Removes an inactive streaming clone node from Citus metadata using its node ID. Errors if the node is not found, not registered as a clone, or is currently marked active.';

REVOKE ALL ON FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(integer) FROM PUBLIC;
src/backend/distributed/sql/udfs/get_snapshot_based_node_split_plan/13.2-1.sql (new generated file, 18 lines)

@@ -0,0 +1,18 @@
CREATE OR REPLACE FUNCTION pg_catalog.get_snapshot_based_node_split_plan(
    primary_node_name text,
    primary_node_port integer,
    replica_node_name text,
    replica_node_port integer,
    rebalance_strategy name DEFAULT NULL
)
RETURNS TABLE (table_name regclass,
               shardid bigint,
               shard_size bigint,
               placement_node text)
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;

COMMENT ON FUNCTION pg_catalog.get_snapshot_based_node_split_plan(text, int, text, int, name)
IS 'shows the shard placements to balance shards between primary and replica worker nodes';

REVOKE ALL ON FUNCTION pg_catalog.get_snapshot_based_node_split_plan(text, int, text, int, name) FROM PUBLIC;
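
Since rebalance_strategy defaults to NULL, the preview can also be pinned to a specific strategy; the call below is a sketch using the built-in by_disk_size strategy with illustrative host names and ports:

```
SELECT *
FROM pg_catalog.get_snapshot_based_node_split_plan(
         'worker-1-host', 5432,
         'replica-host', 5433,
         rebalance_strategy => 'by_disk_size');
```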
@@ -0,0 +1,18 @@
CREATE OR REPLACE FUNCTION pg_catalog.get_snapshot_based_node_split_plan(
    primary_node_name text,
    primary_node_port integer,
    replica_node_name text,
    replica_node_port integer,
    rebalance_strategy name DEFAULT NULL
)
RETURNS TABLE (table_name regclass,
               shardid bigint,
               shard_size bigint,
               placement_node text)
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;

COMMENT ON FUNCTION pg_catalog.get_snapshot_based_node_split_plan(text, int, text, int, name)
IS 'shows the shard placements to balance shards between primary and replica worker nodes';

REVOKE ALL ON FUNCTION pg_catalog.get_snapshot_based_node_split_plan(text, int, text, int, name) FROM PUBLIC;
@@ -0,0 +1,525 @@
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>

#include "postgres.h"

#include "utils/fmgrprotos.h"
#include "utils/pg_lsn.h"

#include "distributed/argutils.h"
#include "distributed/clonenode_utils.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_sync.h"
#include "distributed/remote_commands.h"
#include "distributed/shard_rebalancer.h"

/*
 * GetReplicationLag calculates the replication lag between the primary and replica nodes.
 * It returns the lag in bytes.
 */
int64
GetReplicationLag(WorkerNode *primaryWorkerNode, WorkerNode *replicaWorkerNode)
{
	/* Input validation */
	if (primaryWorkerNode == NULL || replicaWorkerNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary or replica worker node is NULL")));
	}

#if PG_VERSION_NUM >= 100000
	const char *primary_lsn_query = "SELECT pg_current_wal_lsn()";
	const char *replica_lsn_query = "SELECT pg_last_wal_replay_lsn()";
#else
	const char *primary_lsn_query = "SELECT pg_current_xlog_location()";
	const char *replica_lsn_query = "SELECT pg_last_xlog_replay_location()";
#endif

	int connectionFlag = 0;
	MultiConnection *primaryConnection = GetNodeConnection(connectionFlag,
															primaryWorkerNode->workerName,
															primaryWorkerNode->workerPort);
	if (PQstatus(primaryConnection->pgConn) != CONNECTION_OK)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot connect to primary node %s:%d to fetch replication status",
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort)));
	}
	MultiConnection *replicaConnection = GetNodeConnection(connectionFlag,
															replicaWorkerNode->workerName,
															replicaWorkerNode->workerPort);

	if (PQstatus(replicaConnection->pgConn) != CONNECTION_OK)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot connect to clone node %s:%d to fetch replication status",
							   replicaWorkerNode->workerName,
							   replicaWorkerNode->workerPort)));
	}

	int primaryResultCode = SendRemoteCommand(primaryConnection, primary_lsn_query);
	if (primaryResultCode == 0)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	PGresult *primaryResult = GetRemoteCommandResult(primaryConnection, true);
	if (!IsResponseOK(primaryResult))
	{
		ReportResultError(primaryConnection, primaryResult, ERROR);
	}

	int replicaResultCode = SendRemoteCommand(replicaConnection, replica_lsn_query);
	if (replicaResultCode == 0)
	{
		ReportConnectionError(replicaConnection, ERROR);
	}
	PGresult *replicaResult = GetRemoteCommandResult(replicaConnection, true);
	if (!IsResponseOK(replicaResult))
	{
		ReportResultError(replicaConnection, replicaResult, ERROR);
	}

	List *primaryLsnList = ReadFirstColumnAsText(primaryResult);
	if (list_length(primaryLsnList) != 1)
	{
		PQclear(primaryResult);
		ClearResults(primaryConnection, true);
		CloseConnection(primaryConnection);
		PQclear(replicaResult);
		ClearResults(replicaConnection, true);
		CloseConnection(replicaConnection);

		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse primary LSN result from %s:%d",
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort),
						errdetail("Expected exactly one row with LSN value")));
	}
	StringInfo primaryLsnQueryResInfo = (StringInfo) linitial(primaryLsnList);
	char *primary_lsn_str = primaryLsnQueryResInfo->data;

	List *replicaLsnList = ReadFirstColumnAsText(replicaResult);
	if (list_length(replicaLsnList) != 1)
	{
		PQclear(primaryResult);
		ClearResults(primaryConnection, true);
		CloseConnection(primaryConnection);
		PQclear(replicaResult);
		ClearResults(replicaConnection, true);
		CloseConnection(replicaConnection);

		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse clone LSN result from %s:%d",
							   replicaWorkerNode->workerName,
							   replicaWorkerNode->workerPort),
						errdetail("Expected exactly one row with LSN value")));
	}
	StringInfo replicaLsnQueryResInfo = (StringInfo) linitial(replicaLsnList);
	char *replica_lsn_str = replicaLsnQueryResInfo->data;

	int64 primary_lsn = DatumGetLSN(DirectFunctionCall1(pg_lsn_in,
														CStringGetDatum(primary_lsn_str)));
	int64 replica_lsn = DatumGetLSN(DirectFunctionCall1(pg_lsn_in,
														CStringGetDatum(replica_lsn_str)));

	int64 lag_bytes = primary_lsn - replica_lsn;

	PQclear(primaryResult);
	ForgetResults(primaryConnection);
	CloseConnection(primaryConnection);

	PQclear(replicaResult);
	ForgetResults(replicaConnection);
	CloseConnection(replicaConnection);

	ereport(DEBUG1, (errmsg("successfully measured replication lag: primary LSN %s, clone LSN %s",
							primary_lsn_str, replica_lsn_str)));
	ereport(NOTICE, (errmsg("replication lag between %s:%d and %s:%d is %ld bytes",
							primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
							replicaWorkerNode->workerName, replicaWorkerNode->workerPort,
							lag_bytes)));
	return lag_bytes;
}

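The lag computed by GetReplicationLag is the byte distance between the primary's current WAL LSN and the clone's replay LSN. As a rough manual equivalent (a sketch, run on the primary worker), standard PostgreSQL functions report a similar number per connected standby:

```
SELECT application_name,
       pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS lag_bytes
FROM pg_stat_replication;
```
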
/*
 * EnsureValidCloneMode verifies that a clone node has a valid replication
 * relationship with the specified primary node.
 *
 * This function performs several critical checks:
 * 1. Validates that the clone is actually connected to and replicating from
 *    the specified primary node
 * 2. Ensures the clone is not configured as a synchronous replica, which
 *    would block 2PC commits on the primary when the clone gets promoted
 * 3. Verifies the replication connection is active and healthy
 *
 * The function connects to the primary node and queries pg_stat_replication
 * to find the clone's replication connection. It resolves hostnames to IP
 * addresses for robust matching since PostgreSQL may report different address
 * formats.
 *
 * Parameters:
 *   primaryWorkerNode - The primary node that should be sending replication data
 *   cloneHostname - Hostname/IP of the clone node to verify
 *   clonePort - Port of the clone node to verify
 *   operation - Description of the operation being performed (for error messages)
 *
 * Throws ERROR if:
 *   - Primary or clone parameters are invalid
 *   - Cannot connect to the primary node
 *   - Clone is not found among the primary's replication connections
 *   - Clone is configured as a synchronous replica
 *   - Replication connection is not active
 */
void
EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
					 char *cloneHostname, int clonePort, char *operation)
{
	Assert(operation != NULL);

	if (primaryWorkerNode == NULL || cloneHostname == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary or clone worker node is NULL")));
	}

	ereport(NOTICE, (errmsg("checking replication relationship between primary %s:%d and clone %s:%d",
							primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
							cloneHostname, clonePort)));

	/* Connect to the primary node to check replication status */
	int connectionFlag = 0;
	MultiConnection *primaryConnection = GetNodeConnection(connectionFlag,
															primaryWorkerNode->workerName,
															primaryWorkerNode->workerPort);
	if (PQstatus(primaryConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	/* Build a query that checks whether the clone is connected and returns its sync state */
	StringInfo replicationCheckQuery = makeStringInfo();

	/* First, try to resolve the hostname to an IP address for more robust matching */
	char *resolvedIP = NULL;
	struct addrinfo hints, *result, *rp;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = AF_UNSPEC;     /* Allow IPv4 or IPv6 */
	hints.ai_socktype = SOCK_STREAM; /* TCP socket */
	hints.ai_flags = AI_PASSIVE;     /* For wildcard IP address */

	int getaddrinfo_result = getaddrinfo(cloneHostname, NULL, &hints, &result);
	if (getaddrinfo_result == 0)
	{
		/* Use the first resolved IP address */
		for (rp = result; rp != NULL; rp = rp->ai_next)
		{
			if (rp->ai_family == AF_INET)
			{
				/* IPv4 */
				struct sockaddr_in *addr_in = (struct sockaddr_in *) rp->ai_addr;
				resolvedIP = palloc(INET_ADDRSTRLEN);
				inet_ntop(AF_INET, &(addr_in->sin_addr), resolvedIP, INET_ADDRSTRLEN);
				break;
			}
			else if (rp->ai_family == AF_INET6)
			{
				/* IPv6 */
				struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *) rp->ai_addr;
				resolvedIP = palloc(INET6_ADDRSTRLEN);
				inet_ntop(AF_INET6, &(addr_in6->sin6_addr), resolvedIP, INET6_ADDRSTRLEN);
				break;
			}
		}
		freeaddrinfo(result);
	}

	ereport(NOTICE, (errmsg("checking replication for node %s (resolved IP: %s)",
							cloneHostname,
							resolvedIP ? resolvedIP : "unresolved")));

	/*
	 * We check multiple fields to handle different scenarios:
	 * 1. application_name - if it is set to the node name
	 * 2. client_hostname - if it is the hostname
	 * 3. client_addr - if it is the IP address (most reliable)
	 */
	if (resolvedIP != NULL)
	{
		appendStringInfo(replicationCheckQuery,
						 "SELECT sync_state, state FROM pg_stat_replication WHERE "
						 "application_name = '%s' OR "
						 "client_hostname = '%s' OR "
						 "client_addr = '%s'",
						 cloneHostname,
						 cloneHostname,
						 resolvedIP);
		pfree(resolvedIP);
	}
	else
	{
		/* Fall back to a hostname-only check if IP resolution fails */
		appendStringInfo(replicationCheckQuery,
						 "SELECT sync_state, state FROM pg_stat_replication WHERE "
						 "application_name = '%s' OR "
						 "client_hostname = '%s'",
						 cloneHostname,
						 cloneHostname);
	}

	int replicationCheckResultCode = SendRemoteCommand(primaryConnection,
													   replicationCheckQuery->data);
	if (replicationCheckResultCode == 0)
	{
		pfree(replicationCheckQuery->data);
		pfree(replicationCheckQuery);
		CloseConnection(primaryConnection);
		ReportConnectionError(primaryConnection, ERROR);
	}

	PGresult *replicationCheckResult = GetRemoteCommandResult(primaryConnection, true);
	if (!IsResponseOK(replicationCheckResult))
	{
		ReportResultError(primaryConnection, replicationCheckResult, ERROR);
	}

	List *replicationStateList = ReadFirstColumnAsText(replicationCheckResult);

	/* Check that the clone is connected to this primary */
	if (list_length(replicationStateList) == 0)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("clone %s:%d is not connected to primary %s:%d",
							   cloneHostname, clonePort,
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort),
						errdetail("The clone must be actively replicating from the specified primary node. "
								  "Check that the clone is running and properly configured for replication.")));
	}

	/* Check whether the clone is a synchronous replica */
	if (list_length(replicationStateList) > 0)
	{
		StringInfo syncStateInfo = (StringInfo) linitial(replicationStateList);
		if (syncStateInfo && syncStateInfo->data &&
			(strcmp(syncStateInfo->data, "sync") == 0 ||
			 strcmp(syncStateInfo->data, "quorum") == 0))
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("cannot %s clone %s:%d as it is configured as a synchronous replica",
								   operation, cloneHostname, clonePort),
							errdetail("Promoting a synchronous clone can cause data consistency issues. "
									  "Please configure it as an asynchronous replica first.")));
		}
	}

	/* Clean up resources */
	bool raiseErrors = false;
	PQclear(replicationCheckResult);
	ClearResults(primaryConnection, raiseErrors);
	pfree(replicationCheckQuery->data);
	pfree(replicationCheckQuery);
	CloseConnection(primaryConnection);

	ereport(NOTICE, (errmsg("clone %s:%d is properly connected to primary %s:%d and is not synchronous",
							cloneHostname, clonePort,
							primaryWorkerNode->workerName,
							primaryWorkerNode->workerPort)));
}

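When this validation fails, the underlying check can be reproduced by hand on the primary; the query below is a sketch in which 'replica-host' and the resolved IP stand in for the clone's identity:

```
-- An empty result means the clone is not streaming from this primary;
-- sync_state 'sync' or 'quorum' means promotion would be refused.
SELECT application_name, client_addr, state, sync_state
FROM pg_stat_replication
WHERE application_name = 'replica-host'
   OR client_hostname = 'replica-host'
   OR client_addr = '10.0.0.42';  -- hypothetical resolved IP of the clone
```
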
/*
 * EnsureValidStreamingReplica verifies that a node is a valid streaming replica
 * of the specified primary node.
 *
 * This function performs comprehensive validation to ensure the replica:
 * 1. Is currently in recovery mode (acting as a replica, not a primary)
 * 2. Has the same system identifier as the primary (ensuring they're part of
 *    the same PostgreSQL cluster/timeline)
 *
 * The function connects to both the replica and primary nodes to perform these
 * checks. This validation is critical before performing operations like promotion
 * or failover to ensure data consistency and prevent split-brain scenarios.
 *
 * Parameters:
 *   primaryWorkerNode - The primary node that should be the source of replication
 *   replicaHostname - Hostname/IP of the replica node to validate
 *   replicaPort - Port of the replica node to validate
 *
 * Throws ERROR if:
 *   - Cannot connect to the replica or primary node
 *   - Replica is not in recovery mode (indicating it's not acting as a replica)
 *   - System identifiers don't match between primary and replica
 *   - Any database queries fail during validation
 */
void
EnsureValidStreamingReplica(WorkerNode *primaryWorkerNode, char *replicaHostname,
							int replicaPort)
{
	int connectionFlag = FORCE_NEW_CONNECTION;
	MultiConnection *replicaConnection = GetNodeConnection(connectionFlag,
															replicaHostname,
															replicaPort);

	if (PQstatus(replicaConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(replicaConnection, ERROR);
	}

	/* Step 1: Verify the replica is in recovery mode */
	const char *replica_recovery_query = "SELECT pg_is_in_recovery()";

	int resultCode = SendRemoteCommand(replicaConnection, replica_recovery_query);

	if (resultCode == 0)
	{
		ereport(DEBUG2, (errmsg("cannot connect to %s:%d to check if it is in recovery mode",
								replicaHostname, replicaPort)));
		ReportConnectionError(replicaConnection, ERROR);
	}

	bool raiseInterrupts = true;
	PGresult *result = GetRemoteCommandResult(replicaConnection, raiseInterrupts);

	if (!IsResponseOK(result))
	{
		ereport(DEBUG2, (errmsg("failed to execute pg_is_in_recovery")));
		ReportResultError(replicaConnection, result, ERROR);
	}

	List *recoveryResultList = ReadFirstColumnAsText(result);
	if (list_length(recoveryResultList) != 1)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse pg_is_in_recovery() result from %s:%d",
							   replicaHostname, replicaPort)));
	}

	StringInfo isInRecoveryQueryResInfo = (StringInfo) linitial(recoveryResultList);
	char *isInRecoveryQueryResStr = isInRecoveryQueryResInfo->data;

	if (strcmp(isInRecoveryQueryResStr, "t") != 0 &&
		strcmp(isInRecoveryQueryResStr, "true") != 0)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("node %s:%d is not in recovery mode",
							   replicaHostname, replicaPort)));
	}

	PQclear(result);
	ForgetResults(replicaConnection);

	/* Step 2: Get the system identifier from the replica */
	const char *sysidQuery =
		"SELECT system_identifier FROM pg_control_system()";

	resultCode = SendRemoteCommand(replicaConnection, sysidQuery);

	if (resultCode == 0)
	{
		ereport(DEBUG2, (errmsg("cannot connect to %s:%d to get system identifier",
								replicaHostname, replicaPort)));
		ReportConnectionError(replicaConnection, ERROR);
	}

	result = GetRemoteCommandResult(replicaConnection, raiseInterrupts);
	if (!IsResponseOK(result))
	{
		ereport(DEBUG2, (errmsg("failed to fetch the system identifier")));
		ReportResultError(replicaConnection, result, ERROR);
	}

	List *sysidList = ReadFirstColumnAsText(result);
	if (list_length(sysidList) != 1)
	{
		CloseConnection(replicaConnection);
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse system identifier result from %s:%d",
							   replicaHostname, replicaPort)));
	}

	StringInfo sysidQueryResInfo = (StringInfo) linitial(sysidList);
	char *sysidQueryResStr = sysidQueryResInfo->data;

	ereport(DEBUG2, (errmsg("system identifier of %s:%d is %s",
							replicaHostname, replicaPort, sysidQueryResStr)));

	/* We do not need the replica connection anymore */
	PQclear(result);
	ForgetResults(replicaConnection);
	CloseConnection(replicaConnection);

	/* Step 3: Get the system identifier from the primary */
	ereport(DEBUG2, (errmsg("getting system identifier from primary %s:%d",
							primaryWorkerNode->workerName,
							primaryWorkerNode->workerPort)));

	int primaryConnectionFlag = 0;
	MultiConnection *primaryConnection = GetNodeConnection(primaryConnectionFlag,
															primaryWorkerNode->workerName,
															primaryWorkerNode->workerPort);

	if (PQstatus(primaryConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	int primaryResultCode = SendRemoteCommand(primaryConnection, sysidQuery);
	if (primaryResultCode == 0)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	PGresult *primaryResult = GetRemoteCommandResult(primaryConnection, raiseInterrupts);
	if (!IsResponseOK(primaryResult))
	{
		ereport(DEBUG2, (errmsg("failed to fetch the system identifier")));
		ReportResultError(primaryConnection, primaryResult, ERROR);
	}
	List *primarySysidList = ReadFirstColumnAsText(primaryResult);
	if (list_length(primarySysidList) != 1)
	{
		CloseConnection(primaryConnection);
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse system identifier result from %s:%d",
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort)));
	}
	StringInfo primarySysidQueryResInfo = (StringInfo) linitial(primarySysidList);
	char *primarySysidQueryResStr = primarySysidQueryResInfo->data;

	ereport(DEBUG2, (errmsg("system identifier of %s:%d is %s",
							primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
							primarySysidQueryResStr)));

	/* Verify that both identifiers match */
	if (strcmp(sysidQueryResStr, primarySysidQueryResStr) != 0)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("system identifiers do not match: %s (clone) vs %s (primary)",
							   sysidQueryResStr, primarySysidQueryResStr)));
	}
	PQclear(primaryResult);
	ClearResults(primaryConnection, true);
	CloseConnection(primaryConnection);
}

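The two checks above map directly to SQL that can be run by hand when a promotion is rejected: the first statement must return true on the clone, and the second must return the same value on both nodes.

```
-- On the clone: must be in recovery (i.e., still acting as a replica).
SELECT pg_is_in_recovery();

-- On both primary and clone: identical system identifiers prove the clone
-- was created from a physical copy of the primary.
SELECT system_identifier FROM pg_control_system();
```
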
@@ -0,0 +1,13 @@
#ifndef CLONENODE_UTILS_H
#define CLONENODE_UTILS_H

#include "distributed/metadata_cache.h"

extern int64 GetReplicationLag(WorkerNode *primaryWorkerNode,
							   WorkerNode *replicaWorkerNode);
extern void EnsureValidStreamingReplica(WorkerNode *primaryWorkerNode,
										char *replicaHostname, int replicaPort);
extern void EnsureValidCloneMode(WorkerNode *primaryWorkerNode, char *cloneHostname,
								 int clonePort, char *operation);

#endif /* CLONENODE_UTILS_H */

@@ -299,6 +299,7 @@ extern Oid CitusDependentObjectFuncId(void);
/* enum oids */
extern Oid PrimaryNodeRoleId(void);
extern Oid SecondaryNodeRoleId(void);
extern Oid UnavailableNodeRoleId(void);
extern Oid CitusCopyFormatTypeId(void);
extern Oid TextCopyFormatId(void);
extern Oid BinaryCopyFormatId(void);

@@ -345,7 +345,8 @@ extern bool IsDummyPlacement(ShardPlacement *taskPlacement);
extern StringInfo GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList,
														Oid indexId,
														SizeQueryType sizeQueryType,
														bool optimizePartitionCalculations);
														bool optimizePartitionCalculations
														);
extern List * RemoveCoordinatorPlacementIfNotSingleNode(List *placementList);

/* Function declarations to modify shard and shard placement data */

@@ -467,4 +468,8 @@ extern bool IsBackgroundTaskStatusTerminal(BackgroundTaskStatus status);
extern Oid BackgroundJobStatusOid(BackgroundJobStatus status);
extern Oid BackgroundTaskStatusOid(BackgroundTaskStatus status);
extern int GetAutoConvertedAttrIndexInPgDistPartition(TupleDesc tupleDEsc);

/* from node_metadata.c */
extern void LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE lockMode);
extern void ActivateCloneNodeAsPrimary(WorkerNode *workerNode);
#endif /* METADATA_UTILITY_H */

@@ -20,7 +20,7 @@
 * in particular their OUT parameters) must be changed whenever the definition of
 * pg_dist_node changes.
 */
#define Natts_pg_dist_node 11
#define Natts_pg_dist_node 13
#define Anum_pg_dist_node_nodeid 1
#define Anum_pg_dist_node_groupid 2
#define Anum_pg_dist_node_nodename 3

@@ -32,6 +32,8 @@
#define Anum_pg_dist_node_nodecluster 9
#define Anum_pg_dist_node_metadatasynced 10
#define Anum_pg_dist_node_shouldhaveshards 11
#define Anum_pg_dist_node_nodeisclone 12
#define Anum_pg_dist_node_nodeprimarynodeid 13

#define GROUPID_SEQUENCE_NAME "pg_dist_groupid_seq"
#define NODEID_SEQUENCE_NAME "pg_dist_node_nodeid_seq"

@@ -222,4 +222,7 @@ extern void SetupRebalanceMonitor(List *placementUpdateList,
								  uint64 initialProgressState,
								  PlacementUpdateStatus initialStatus);

extern void SplitShardsBetweenPrimaryAndClone(WorkerNode *primaryNode,
											  WorkerNode *cloneNode,
											  Name strategyName);
#endif /* SHARD_REBALANCER_H */

@@ -85,3 +85,8 @@ extern void UpdatePlacementUpdateStatusForShardIntervalList(List *shardIntervalL
extern void InsertDeferredDropCleanupRecordsForShards(List *shardIntervalList);
extern void InsertCleanupRecordsForShardPlacementsOnNode(List *shardIntervalList,
														 int32 groupId);

extern void AdjustShardsForPrimaryCloneNodeSplit(WorkerNode *primaryNode,
												 WorkerNode *cloneNode,
												 List *primaryShardList,
												 List *cloneShardList);

@@ -54,6 +54,8 @@ typedef struct WorkerNode
	char nodeCluster[NAMEDATALEN]; /* the cluster the node is a part of */
	bool metadataSynced; /* node has the most recent metadata */
	bool shouldHaveShards; /* if the node should have distributed table shards on it or not */
	bool nodeisclone; /* whether this node is a replica */
	int32 nodeprimarynodeid; /* nodeid of the primary for this replica */
} WorkerNode;

@@ -84,6 +86,7 @@ extern WorkerNode * FindWorkerNode(const char *nodeName, int32 nodePort);
extern WorkerNode * FindWorkerNodeOrError(const char *nodeName, int32 nodePort);
extern WorkerNode * FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort);
extern WorkerNode * FindNodeWithNodeId(int nodeId, bool missingOk);
extern WorkerNode * FindNodeAnyClusterByNodeId(uint32 nodeId);
extern WorkerNode * ModifiableWorkerNode(const char *nodeName, int32 nodePort);
extern List * ReadDistNode(bool includeNodesFromOtherClusters);
extern void EnsureCoordinator(void);

@@ -45,7 +45,7 @@ vanilla_diffs_file = $(citus_abs_srcdir)/pg_vanilla_outputs/$(MAJORVERSION)/regr
# intermediate, for muscle memory backward compatibility.
check: check-full check-enterprise-full
# check-full triggers all tests that ought to be run routinely
check-full: check-multi check-multi-mx check-multi-1 check-operations check-follower-cluster check-isolation check-failure check-split check-vanilla check-columnar check-columnar-isolation check-pg-upgrade check-arbitrary-configs check-citus-upgrade check-citus-upgrade-mixed check-citus-upgrade-local check-citus-upgrade-mixed-local check-pytest check-query-generator
check-full: check-multi check-multi-mx check-multi-1 check-operations check-add-backup-node check-follower-cluster check-isolation check-failure check-split check-vanilla check-columnar check-columnar-isolation check-pg-upgrade check-arbitrary-configs check-citus-upgrade check-citus-upgrade-mixed check-citus-upgrade-local check-citus-upgrade-mixed-local check-pytest check-query-generator
# check-enterprise-full triggers all enterprise specific tests
check-enterprise-full: check-enterprise check-enterprise-isolation check-enterprise-failure check-enterprise-isolation-logicalrep-1 check-enterprise-isolation-logicalrep-2 check-enterprise-isolation-logicalrep-3

@@ -217,6 +217,10 @@ check-follower-cluster: all
	$(pg_regress_multi_check) --load-extension=citus --follower-cluster \
	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/multi_follower_schedule $(EXTRA_TESTS)

check-add-backup-node: all
	$(pg_regress_multi_check) --load-extension=citus --follower-cluster --backupnodetest --worker-count=6 \
	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/multi_add_backup_node_schedule $(EXTRA_TESTS)

check-operations: all
	$(pg_regress_multi_check) --load-extension=citus --worker-count=6 \
	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/operations_schedule $(EXTRA_TESTS)
@@ -113,6 +113,13 @@ DEPS = {
    ),
    "create_role_propagation": TestDeps(None, ["multi_cluster_management"]),
    "single_node_enterprise": TestDeps(None),
    "multi_add_node_from_backup": TestDeps(None, repeatable=False, worker_count=5),
    "multi_add_node_from_backup_negative": TestDeps(
        None, ["multi_add_node_from_backup"], worker_count=5, repeatable=False
    ),
    "multi_add_node_from_backup_sync_replica": TestDeps(
        None, repeatable=False, worker_count=5
    ),
    "single_node": TestDeps(None, ["multi_test_helpers"]),
    "single_node_truncate": TestDeps(None),
    "multi_explain": TestDeps(
@@ -906,11 +906,11 @@ SELECT citus_activate_node('localhost', :worker_2_proxy_port);
|
|||
ERROR: connection not open
|
||||
-- Show node metadata info on coordinator after failures
|
||||
SELECT * FROM pg_dist_node ORDER BY nodeport;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
2 | 2 | localhost | 9060 | default | f | t | primary | default | f | t
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
|
||||
2 | 2 | localhost | 9060 | default | f | t | primary | default | f | t | f | 0
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
-- Show that we can still query the node from coordinator
|
||||
|
|
@@ -968,20 +968,20 @@ SELECT citus_activate_node('localhost', :worker_2_proxy_port);
|
|||
-- Show node metadata info on worker2 and coordinator after success
|
||||
\c - - - :worker_2_port
|
||||
SELECT * FROM pg_dist_node ORDER BY nodeport;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
|
||||
2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
\c - - - :master_port
|
||||
SELECT * FROM pg_dist_node ORDER BY nodeport;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
|
||||
2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
SELECT citus.mitmproxy('conn.allow()');
@@ -0,0 +1,515 @@
|
|||
--
|
||||
-- Test for adding a worker node from a backup
|
||||
--
|
||||
-- setup cluster
|
||||
SELECT 1 FROM master_add_node('localhost', :worker_1_port);
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
1
|
||||
(1 row)
|
||||
|
||||
SELECT 1 FROM master_add_node('localhost', :worker_2_port);
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
1
|
||||
(1 row)
|
||||
|
||||
SELECT * from pg_dist_node;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
(2 rows)
|
||||
|
||||
-- create a distributed table and load data
|
||||
CREATE TABLE backup_test(id int, value text);
|
||||
SELECT create_distributed_table('backup_test', 'id', 'hash');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;
|
||||
-- Colocation group 1: create two tables, table1_colg1 and table2_colg1, in the same colocation group
|
||||
CREATE TABLE table1_colg1 (a int PRIMARY KEY);
|
||||
SELECT create_distributed_table('table1_colg1', 'a', shard_count => 4, colocate_with => 'none');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
CREATE TABLE table2_colg1 (b int PRIMARY KEY);
|
||||
SELECT create_distributed_table('table2_colg1', 'b', colocate_with => 'table1_colg1');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- Colocation group 2: create two tables, table1_colg2 and table2_colg2, in the same colocation group
|
||||
CREATE TABLE table1_colg2 (a int PRIMARY KEY);
|
||||
SELECT create_distributed_table('table1_colg2', 'a', shard_count => 4, colocate_with => 'none');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
CREATE TABLE table2_colg2 (b int primary key);
|
||||
SELECT create_distributed_table('table2_colg2', 'b', colocate_with => 'table1_colg2');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- Colocation group 3: create two tables, table1_colg3 and table2_colg3, in the same colocation group
|
||||
CREATE TABLE table1_colg3 (a int PRIMARY KEY);
|
||||
SELECT create_distributed_table('table1_colg3', 'a', shard_count => 4, colocate_with => 'none');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
CREATE TABLE table2_colg3 (b int primary key);
|
||||
SELECT create_distributed_table('table2_colg3', 'b', colocate_with => 'table1_colg3');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- Create reference tables with primary-foreign key relationships
|
||||
CREATE TABLE customers (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
email TEXT UNIQUE NOT NULL );
|
||||
CREATE TABLE orders (
|
||||
id SERIAL PRIMARY KEY,
|
||||
customer_id INTEGER NOT NULL REFERENCES customers(id),
|
||||
order_date DATE NOT NULL DEFAULT CURRENT_DATE);
|
||||
CREATE TABLE order_items (
|
||||
id SERIAL PRIMARY KEY,
|
||||
order_id INTEGER NOT NULL REFERENCES orders(id),
|
||||
product_name TEXT NOT NULL,
|
||||
quantity INTEGER NOT NULL,
|
||||
price NUMERIC(10, 2) NOT NULL
|
||||
);
|
||||
SELECT create_reference_table('customers');
|
||||
create_reference_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT create_reference_table('orders');
|
||||
create_reference_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT create_reference_table('order_items');
|
||||
create_reference_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- INSERT SOME DATA
|
||||
-- Insert 10 customers
|
||||
INSERT INTO customers (name, email)
|
||||
SELECT
|
||||
'Customer ' || i,
|
||||
'customer' || i || '@example.com'
|
||||
FROM generate_series(1, 10) AS i;
|
||||
-- Insert 30 orders: each customer gets 3 orders
|
||||
INSERT INTO orders (customer_id, order_date)
|
||||
SELECT
|
||||
(i % 10) + 1, -- customer_id between 1 and 10
|
||||
CURRENT_DATE - (i % 7)
|
||||
FROM generate_series(1, 30) AS i;
|
||||
-- Insert 90 order_items: each order has 3 items
|
||||
INSERT INTO order_items (order_id, product_name, quantity, price)
|
||||
SELECT
|
||||
(i % 30) + 1, -- order_id between 1 and 30
|
||||
'Product ' || (i % 5 + 1),
|
||||
(i % 10) + 1,
|
||||
round((random() * 100 + 10)::numeric, 2)
|
||||
FROM generate_series(1, 90) AS i;
|
||||
SELECT count(*) from customers;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from orders;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
30
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from order_items;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
90
|
||||
(1 row)
|
||||
|
||||
-- verify initial shard placement
|
||||
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
|
||||
nodename | nodeport | count
|
||||
---------------------------------------------------------------------
|
||||
localhost | 57637 | 17
|
||||
localhost | 57638 | 17
|
||||
(2 rows)
|
||||
|
||||
-- wait for the new node to be ready
|
||||
SELECT pg_sleep(5);
|
||||
pg_sleep
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- register the new node as a clone
|
||||
-- the function returns the new node id
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | f | f | unavailable | default | f | f | t | 1
|
||||
(3 rows)
|
||||
|
||||
SELECT :clone_node_id ;
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
3
|
||||
(1 row)
|
||||
|
||||
SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
|
||||
shardid | nodename | node_type
|
||||
---------------------------------------------------------------------
|
||||
102008 | localhost | PRIMARY
|
||||
102010 | localhost | PRIMARY
|
||||
102012 | localhost | PRIMARY
|
||||
102014 | localhost | PRIMARY
|
||||
102016 | localhost | PRIMARY
|
||||
102018 | localhost | PRIMARY
|
||||
102020 | localhost | PRIMARY
|
||||
102022 | localhost | PRIMARY
|
||||
102024 | localhost | PRIMARY
|
||||
102026 | localhost | PRIMARY
|
||||
102028 | localhost | PRIMARY
|
||||
102030 | localhost | PRIMARY
|
||||
102032 | localhost | PRIMARY
|
||||
102034 | localhost | PRIMARY
|
||||
102036 | localhost | PRIMARY
|
||||
102037 | localhost | PRIMARY
|
||||
102038 | localhost | PRIMARY
|
||||
(17 rows)
|
||||
|
||||
SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;
|
||||
shardid | nodename | node_type
|
||||
---------------------------------------------------------------------
|
||||
(0 rows)
|
||||
|
||||
SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_1_port, 'localhost', :follower_worker_1_port);
|
||||
table_name | shardid | shard_size | placement_node
|
||||
---------------------------------------------------------------------
|
||||
table1_colg2 | 102020 | 0 | Primary Node
|
||||
table2_colg2 | 102024 | 0 | Primary Node
|
||||
table1_colg2 | 102022 | 0 | Primary Node
|
||||
table2_colg2 | 102026 | 0 | Primary Node
|
||||
table1_colg3 | 102028 | 0 | Primary Node
|
||||
table2_colg3 | 102032 | 0 | Primary Node
|
||||
table1_colg3 | 102030 | 0 | Primary Node
|
||||
table2_colg3 | 102034 | 0 | Primary Node
|
||||
backup_test | 102008 | 0 | Clone Node
|
||||
backup_test | 102010 | 0 | Clone Node
|
||||
table1_colg1 | 102012 | 0 | Clone Node
|
||||
table2_colg1 | 102016 | 0 | Clone Node
|
||||
table1_colg1 | 102014 | 0 | Clone Node
|
||||
table2_colg1 | 102018 | 0 | Clone Node
|
||||
(14 rows)
|
||||
|
||||
-- promote the clone and rebalance the shards
|
||||
SET client_min_messages to 'LOG';
|
||||
SELECT citus_promote_clone_and_rebalance(:clone_node_id);
|
||||
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 3), original primary localhost:xxxxx (ID 1)
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
|
||||
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 1)
|
||||
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 1)
|
||||
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
|
||||
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
|
||||
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
|
||||
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
|
||||
NOTICE: Clone node localhost:xxxxx (ID 3) has been successfully promoted.
|
||||
NOTICE: Updating metadata for promoted clone localhost:xxxxx (ID 3)
|
||||
NOTICE: adjusting shard placements for primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: processing 4 shards for primary node GroupID 1
|
||||
LOG: inserting DELETE shard record for shard public.table1_colg2_102020 from clone node GroupID 3
|
||||
LOG: inserting DELETE shard record for shard public.table1_colg2_102022 from clone node GroupID 3
|
||||
LOG: inserting DELETE shard record for shard public.table1_colg3_102028 from clone node GroupID 3
|
||||
LOG: inserting DELETE shard record for shard public.table1_colg3_102030 from clone node GroupID 3
|
||||
NOTICE: processing 4 shards for clone node GroupID 3
|
||||
LOG: inserting DELETE shard record for shard public.backup_test_102008 from primary node GroupID 1
|
||||
LOG: inserting DELETE shard record for shard public.backup_test_102010 from primary node GroupID 1
|
||||
LOG: inserting DELETE shard record for shard public.table2_colg1_102016 from primary node GroupID 1
|
||||
LOG: inserting DELETE shard record for shard public.table2_colg1_102018 from primary node GroupID 1
|
||||
NOTICE: shard placement adjustment complete for primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: Clone node localhost:xxxxx (ID 3) metadata updated. It is now a primary
|
||||
NOTICE: Clone node localhost:xxxxx (ID 3) successfully registered as a worker node
|
||||
citus_promote_clone_and_rebalance
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SET client_min_messages to DEFAULT;
|
||||
SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
|
||||
shardid | nodename | node_type
|
||||
---------------------------------------------------------------------
|
||||
102020 | localhost | PRIMARY
|
||||
102022 | localhost | PRIMARY
|
||||
102024 | localhost | PRIMARY
|
||||
102026 | localhost | PRIMARY
|
||||
102028 | localhost | PRIMARY
|
||||
102030 | localhost | PRIMARY
|
||||
102032 | localhost | PRIMARY
|
||||
102034 | localhost | PRIMARY
|
||||
102036 | localhost | PRIMARY
|
||||
102037 | localhost | PRIMARY
|
||||
102038 | localhost | PRIMARY
|
||||
(11 rows)
|
||||
|
||||
SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;
|
||||
shardid | nodename | node_type
|
||||
---------------------------------------------------------------------
|
||||
102008 | localhost | CLONE
|
||||
102010 | localhost | CLONE
|
||||
102012 | localhost | CLONE
|
||||
102014 | localhost | CLONE
|
||||
102016 | localhost | CLONE
|
||||
102018 | localhost | CLONE
|
||||
102036 | localhost | CLONE
|
||||
102037 | localhost | CLONE
|
||||
102038 | localhost | CLONE
|
||||
(9 rows)
|
||||
|
||||
\c - - - :worker_1_port
|
||||
SELECT 'WORKER' as node_type,* from pg_dist_node;
|
||||
node_type | nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
WORKER | 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
WORKER | 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
WORKER | 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
SELECT 'WORKER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
|
||||
node_type | nodename | nodeport | count
|
||||
---------------------------------------------------------------------
|
||||
WORKER | localhost | 9071 | 9
|
||||
WORKER | localhost | 57637 | 11
|
||||
WORKER | localhost | 57638 | 17
|
||||
(3 rows)
|
||||
|
||||
SELECT * from citus_tables;
|
||||
table_name | citus_table_type | distribution_column | colocation_id | table_size | shard_count | table_owner | access_method
|
||||
---------------------------------------------------------------------
|
||||
backup_test | distributed | id | 1 | 64 kB | 4 | postgres | heap
|
||||
customers | reference | <none> | 5 | 144 kB | 1 | postgres | heap
|
||||
order_items | reference | <none> | 5 | 96 kB | 1 | postgres | heap
|
||||
orders | reference | <none> | 5 | 72 kB | 1 | postgres | heap
|
||||
table1_colg1 | distributed | a | 2 | 32 kB | 4 | postgres | heap
|
||||
table1_colg2 | distributed | a | 3 | 32 kB | 4 | postgres | heap
|
||||
table1_colg3 | distributed | a | 4 | 32 kB | 4 | postgres | heap
|
||||
table2_colg1 | distributed | b | 2 | 32 kB | 4 | postgres | heap
|
||||
table2_colg2 | distributed | b | 3 | 32 kB | 4 | postgres | heap
|
||||
table2_colg3 | distributed | b | 4 | 32 kB | 4 | postgres | heap
|
||||
(10 rows)
|
||||
|
||||
SELECT id, value FROM backup_test ORDER BY id;
|
||||
id | value
|
||||
---------------------------------------------------------------------
|
||||
1 | test1
|
||||
2 | test2
|
||||
3 | test3
|
||||
4 | test4
|
||||
5 | test5
|
||||
6 | test6
|
||||
7 | test7
|
||||
8 | test8
|
||||
9 | test9
|
||||
10 | test10
|
||||
(10 rows)
|
||||
|
||||
SELECT count(*) from customers;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from orders;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
30
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from order_items;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
90
|
||||
(1 row)
|
||||
|
||||
\c - - - :follower_worker_1_port
|
||||
SELECT 'CLONE' as node_type ,* from pg_dist_node;
|
||||
node_type | nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
CLONE | 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
CLONE | 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
CLONE | 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
SELECT 'CLONE' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
|
||||
node_type | nodename | nodeport | count
|
||||
---------------------------------------------------------------------
|
||||
CLONE | localhost | 9071 | 9
|
||||
CLONE | localhost | 57637 | 11
|
||||
CLONE | localhost | 57638 | 17
|
||||
(3 rows)
|
||||
|
||||
SELECT * from citus_tables;
|
||||
table_name | citus_table_type | distribution_column | colocation_id | table_size | shard_count | table_owner | access_method
|
||||
---------------------------------------------------------------------
|
||||
backup_test | distributed | id | 1 | 64 kB | 4 | postgres | heap
|
||||
customers | reference | <none> | 5 | 144 kB | 1 | postgres | heap
|
||||
order_items | reference | <none> | 5 | 96 kB | 1 | postgres | heap
|
||||
orders | reference | <none> | 5 | 72 kB | 1 | postgres | heap
|
||||
table1_colg1 | distributed | a | 2 | 32 kB | 4 | postgres | heap
|
||||
table1_colg2 | distributed | a | 3 | 32 kB | 4 | postgres | heap
|
||||
table1_colg3 | distributed | a | 4 | 32 kB | 4 | postgres | heap
|
||||
table2_colg1 | distributed | b | 2 | 32 kB | 4 | postgres | heap
|
||||
table2_colg2 | distributed | b | 3 | 32 kB | 4 | postgres | heap
|
||||
table2_colg3 | distributed | b | 4 | 32 kB | 4 | postgres | heap
|
||||
(10 rows)
|
||||
|
||||
SELECT id, value FROM backup_test ORDER BY id;
|
||||
id | value
|
||||
---------------------------------------------------------------------
|
||||
1 | test1
|
||||
2 | test2
|
||||
3 | test3
|
||||
4 | test4
|
||||
5 | test5
|
||||
6 | test6
|
||||
7 | test7
|
||||
8 | test8
|
||||
9 | test9
|
||||
10 | test10
|
||||
(10 rows)
|
||||
|
||||
SELECT count(*) from customers;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from orders;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
30
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from order_items;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
90
|
||||
(1 row)
|
||||
|
||||
\c - - - :master_port
|
||||
SELECT 'MASTER' as node_type, * from pg_dist_node;
|
||||
node_type | nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
MASTER | 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
MASTER | 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
MASTER | 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
SELECT 'MASTER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
|
||||
node_type | nodename | nodeport | count
|
||||
---------------------------------------------------------------------
|
||||
MASTER | localhost | 9071 | 9
|
||||
MASTER | localhost | 57637 | 11
|
||||
MASTER | localhost | 57638 | 17
|
||||
(3 rows)
|
||||
|
||||
SELECT * from citus_tables;
|
||||
table_name | citus_table_type | distribution_column | colocation_id | table_size | shard_count | table_owner | access_method
|
||||
---------------------------------------------------------------------
|
||||
backup_test | distributed | id | 1 | 64 kB | 4 | postgres | heap
|
||||
customers | reference | <none> | 5 | 144 kB | 1 | postgres | heap
|
||||
order_items | reference | <none> | 5 | 96 kB | 1 | postgres | heap
|
||||
orders | reference | <none> | 5 | 72 kB | 1 | postgres | heap
|
||||
table1_colg1 | distributed | a | 2 | 32 kB | 4 | postgres | heap
|
||||
table1_colg2 | distributed | a | 3 | 32 kB | 4 | postgres | heap
|
||||
table1_colg3 | distributed | a | 4 | 32 kB | 4 | postgres | heap
|
||||
table2_colg1 | distributed | b | 2 | 32 kB | 4 | postgres | heap
|
||||
table2_colg2 | distributed | b | 3 | 32 kB | 4 | postgres | heap
|
||||
table2_colg3 | distributed | b | 4 | 32 kB | 4 | postgres | heap
|
||||
(10 rows)
|
||||
|
||||
SELECT id, value FROM backup_test ORDER BY id;
|
||||
id | value
|
||||
---------------------------------------------------------------------
|
||||
1 | test1
|
||||
2 | test2
|
||||
3 | test3
|
||||
4 | test4
|
||||
5 | test5
|
||||
6 | test6
|
||||
7 | test7
|
||||
8 | test8
|
||||
9 | test9
|
||||
10 | test10
|
||||
(10 rows)
|
||||
|
||||
SELECT count(*) from customers;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from orders;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
30
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from order_items;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
90
|
||||
(1 row)
|
||||
|
||||
-- verify data
|
||||
SELECT count(*) FROM backup_test;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT id, value FROM backup_test ORDER BY id;
|
||||
id | value
|
||||
---------------------------------------------------------------------
|
||||
1 | test1
|
||||
2 | test2
|
||||
3 | test3
|
||||
4 | test4
|
||||
5 | test5
|
||||
6 | test6
|
||||
7 | test7
|
||||
8 | test8
|
||||
9 | test9
|
||||
10 | test10
|
||||
(10 rows)
|
||||
|
||||
-- cleanup
|
||||
DROP TABLE backup_test;
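Condensed to the user-facing calls, the happy path exercised by the expected output above boils down to the following sketch; the hosts and ports (10.0.0.1 as the existing primary, 10.0.0.2 as its streaming replica) are placeholders, not the test's actual values:

```sql
-- illustrative summary of the flow exercised above
SELECT citus_add_clone_node('10.0.0.2', 5432,   -- clone host/port
                            '10.0.0.1', 5432)   -- its primary host/port
       AS clone_node_id \gset

-- preview which shards stay on the primary and which move to the clone after promotion
SELECT * FROM get_snapshot_based_node_split_plan('10.0.0.1', 5432, '10.0.0.2', 5432);

-- promote the clone and split shard placements between the two nodes
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
```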
@@ -0,0 +1,306 @@
|
|||
--
|
||||
-- Test for negative scenarios in clone promotion functionality
|
||||
--
|
||||
-- try to add follower_worker_1 as a clone of worker_1 to the cluster
-- this should fail because the previous test already promoted follower_worker_1 to a primary node
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||
ERROR: a different node localhost:xxxxx (nodeid 3) already exists or is a clone for a different primary
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
-- try to add follower_worker_2 as a clone of worker_1
-- this should fail as follower_worker_2 is not a replica of worker_1
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
|
||||
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
--try add
|
||||
-- create a distributed table and load data
|
||||
CREATE TABLE backup_test(id int, value text);
|
||||
SELECT create_distributed_table('backup_test', 'id', 'hash');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;
|
||||
-- Create reference table
|
||||
CREATE TABLE ref_table(id int PRIMARY KEY);
|
||||
SELECT create_reference_table('ref_table');
|
||||
create_reference_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO ref_table SELECT i FROM generate_series(1, 5) i;
|
||||
SELECT COUNT(*) from backup_test;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT COUNT(*) from ref_table;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
-- verify initial shard placement
|
||||
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
|
||||
nodename | nodeport | count
|
||||
---------------------------------------------------------------------
|
||||
localhost | 9071 | 10
|
||||
localhost | 57637 | 12
|
||||
localhost | 57638 | 18
|
||||
(3 rows)
|
||||
|
||||
-- Try to add replica of worker_node2 as a clone of worker_node1
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
|
||||
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
|
||||
-- Test 1: Try to promote a non-existent clone node
|
||||
SELECT citus_promote_clone_and_rebalance(clone_nodeid =>99999);
|
||||
ERROR: Clone node with ID 99999 not found.
|
||||
-- Test 2: Try to promote a regular worker node (not a clone)
|
||||
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1);
|
||||
ERROR: Node localhost:xxxxx (ID 1) is not a valid clone or its primary node ID is not set.
|
||||
-- Test 3: Try to promote with invalid timeout (negative)
|
||||
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1,
|
||||
catchup_timeout_seconds => -100);
|
||||
ERROR: Node localhost:xxxxx (ID 1) is not a valid clone or its primary node ID is not set.
|
||||
-- register the new node as a clone; this should pass
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
|
||||
(4 rows)
|
||||
|
||||
SELECT :clone_node_id;
|
||||
?column?
|
||||
---------------------------------------------------------------------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
-- Test 4: Try to promote clone with invalid strategy name
|
||||
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id, rebalance_strategy => 'invalid_strategy');
|
||||
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
|
||||
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 2)
|
||||
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 2)
|
||||
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
|
||||
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
|
||||
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
|
||||
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
|
||||
NOTICE: Clone node localhost:xxxxx (ID 4) has been successfully promoted.
|
||||
NOTICE: Updating metadata for promoted clone localhost:xxxxx (ID 4)
|
||||
ERROR: could not find rebalance strategy with name invalid_strategy
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
|
||||
(4 rows)
|
||||
|
||||
-- Test 5: Roll back the citus_promote_clone_and_rebalance transaction
|
||||
BEGIN;
|
||||
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
|
||||
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
|
||||
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
|
||||
ROLLBACK;
|
||||
-- Verify no data is lost after rolling back the transaction
|
||||
SELECT COUNT(*) from backup_test;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT COUNT(*) from ref_table;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
|
||||
(4 rows)
|
||||
|
||||
-- Test 6: Try to add and promote a proper replica after rollback
|
||||
SELECT master_add_node('localhost', :worker_3_port) AS nodeid_3 \gset
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_3_port, 'localhost', :worker_3_port) AS clone_node_id_3 \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
|
||||
set citus.shard_count = 100;
|
||||
CREATE TABLE backup_test2(id int, value text);
|
||||
SELECT create_distributed_table('backup_test2', 'id', 'hash');
|
||||
create_distributed_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO backup_test2 SELECT g, 'test' || g FROM generate_series(1, 10) g;
|
||||
-- Create reference table
|
||||
CREATE TABLE ref_table2(id int PRIMARY KEY);
|
||||
SELECT create_reference_table('ref_table2');
|
||||
create_reference_table
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
INSERT INTO ref_table2 SELECT i FROM generate_series(1, 5) i;
|
||||
SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_3_port, 'localhost', :follower_worker_3_port);
|
||||
table_name | shardid | shard_size | placement_node
|
||||
---------------------------------------------------------------------
|
||||
backup_test2 | 102091 | 0 | Primary Node
|
||||
backup_test2 | 102095 | 0 | Primary Node
|
||||
backup_test2 | 102099 | 0 | Primary Node
|
||||
backup_test2 | 102103 | 0 | Primary Node
|
||||
backup_test2 | 102111 | 0 | Primary Node
|
||||
backup_test2 | 102115 | 0 | Primary Node
|
||||
backup_test2 | 102119 | 0 | Primary Node
|
||||
backup_test2 | 102123 | 0 | Primary Node
|
||||
backup_test2 | 102127 | 0 | Primary Node
|
||||
backup_test2 | 102131 | 0 | Primary Node
|
||||
backup_test2 | 102135 | 0 | Primary Node
|
||||
backup_test2 | 102139 | 0 | Primary Node
|
||||
backup_test2 | 102143 | 0 | Primary Node
|
||||
backup_test2 | 102063 | 0 | Clone Node
|
||||
backup_test2 | 102071 | 0 | Clone Node
|
||||
backup_test2 | 102107 | 0 | Clone Node
|
||||
backup_test2 | 102047 | 0 | Clone Node
|
||||
backup_test2 | 102051 | 0 | Clone Node
|
||||
backup_test2 | 102055 | 0 | Clone Node
|
||||
backup_test2 | 102059 | 0 | Clone Node
|
||||
backup_test2 | 102067 | 0 | Clone Node
|
||||
backup_test2 | 102075 | 0 | Clone Node
|
||||
backup_test2 | 102079 | 0 | Clone Node
|
||||
backup_test2 | 102083 | 0 | Clone Node
|
||||
backup_test2 | 102087 | 0 | Clone Node
|
||||
(25 rows)
|
||||
|
||||
SET client_min_messages to 'LOG';
|
||||
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id_3);
|
||||
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 6), original primary localhost:xxxxx (ID 5)
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
|
||||
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 5)
|
||||
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 5)
|
||||
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
|
||||
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
|
||||
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
|
||||
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
|
||||
NOTICE: Clone node localhost:xxxxx (ID 6) has been successfully promoted.
|
||||
NOTICE: Updating metadata for promoted clone localhost:xxxxx (ID 6)
|
||||
NOTICE: adjusting shard placements for primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: processing 13 shards for primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102091 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102095 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102099 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102103 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102111 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102115 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102119 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102123 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102127 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102131 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102135 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102139 from clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102143 from clone node GroupID 6
|
||||
NOTICE: processing 12 shards for clone node GroupID 6
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102063 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102071 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102107 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102047 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102051 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102055 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102059 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102067 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102075 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102079 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102083 from primary node GroupID 5
|
||||
LOG: inserting DELETE shard record for shard public.backup_test2_102087 from primary node GroupID 5
|
||||
NOTICE: shard placement adjustment complete for primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: Clone node localhost:xxxxx (ID 6) metadata updated. It is now a primary
|
||||
NOTICE: Clone node localhost:xxxxx (ID 6) successfully registered as a worker node
|
||||
citus_promote_clone_and_rebalance
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SET client_min_messages to DEFAULT;
|
||||
SELECT COUNT(*) from backup_test;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
10
|
||||
(1 row)
|
||||
|
||||
SELECT COUNT(*) from ref_table;
|
||||
count
|
||||
---------------------------------------------------------------------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||
4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
|
||||
5 | 5 | localhost | 57639 | default | t | t | primary | default | t | t | f | 0
|
||||
6 | 6 | localhost | 9073 | default | t | t | primary | default | t | t | f | 0
|
||||
(6 rows)
|
||||
|
||||
-- check the shard placement
|
||||
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
|
||||
nodename | nodeport | count
|
||||
---------------------------------------------------------------------
|
||||
localhost | 9071 | 36
|
||||
localhost | 9073 | 17
|
||||
localhost | 57637 | 38
|
||||
localhost | 57638 | 44
|
||||
localhost | 57639 | 18
|
||||
(5 rows)
|
||||
|
||||
set citus.shard_count to default;
|
||||
-- cleanup
|
||||
DROP TABLE backup_test;
|
||||
DROP TABLE ref_table;
|
||||
DROP TABLE backup_test2;
|
||||
DROP TABLE ref_table2;
@@ -0,0 +1,45 @@
|
|||
--
|
||||
-- Test for negative scenarios in clone promotion functionality
|
||||
-- We do not allow synchronous replicas to be added as clones
|
||||
-- this test is to ensure that we do not allow this
|
||||
--
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
3 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
4 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
5 | 1 | localhost | 9071 | default | f | t | secondary | second-cluster | f | t | f | 0
|
||||
6 | 2 | localhost | 9072 | default | f | t | secondary | second-cluster | f | t | f | 0
|
||||
(4 rows)
|
||||
|
||||
SELECT master_remove_node('localhost', :follower_worker_1_port);
|
||||
master_remove_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT master_remove_node('localhost', :follower_worker_2_port);
|
||||
master_remove_node
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- this should fail because the replica is configured as a synchronous replica, which is not allowed
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
|
||||
DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
|
||||
-- this should fail because the replica is configured as a synchronous replica, which is not allowed
|
||||
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
|
||||
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||
ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
|
||||
DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
|
||||
SELECT * from pg_dist_node ORDER by nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
3 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
4 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
(2 rows)
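Before registering a clone, an operator can confirm on the primary worker that the standby streams asynchronously, since citus_add_clone_node rejects synchronous replicas as the errors above show. A sketch of that check using the standard pg_stat_replication view (the check itself is operational advice, not part of this test):

```sql
-- illustrative only: run on the primary worker; sync_state should be 'async'
-- for the standby you intend to register with citus_add_clone_node()
SELECT application_name, client_addr, state, sync_state
FROM pg_stat_replication;
```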
@@ -691,11 +691,11 @@ SELECT
|
|||
(1 row)
|
||||
|
||||
SELECT * FROM pg_dist_node ORDER BY nodeid;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
|
||||
11 | 9 | localhost | 57637 | default | t | t | primary | default | t | t
|
||||
12 | 10 | localhost | 57638 | default | t | t | primary | default | t | t
|
||||
3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
|
||||
11 | 9 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||
12 | 10 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||
(3 rows)
|
||||
|
||||
-- check that mixed add/remove node commands work fine inside transaction
|
||||
|
|
@@ -928,11 +928,11 @@ CONTEXT: PL/pgSQL function citus_internal.pg_dist_node_trigger_func() line XX a
|
|||
INSERT INTO pg_dist_node (nodename, nodeport, groupid, noderole, nodecluster)
|
||||
VALUES ('localhost', 5000, 1000, 'primary', 'olap');
|
||||
ERROR: new row for relation "pg_dist_node" violates check constraint "primaries_are_only_allowed_in_the_default_cluster"
|
||||
DETAIL: Failing row contains (25, 1000, localhost, 5000, default, f, t, primary, olap, f, t).
|
||||
DETAIL: Failing row contains (25, 1000, localhost, 5000, default, f, t, primary, olap, f, t, f, 0).
|
||||
UPDATE pg_dist_node SET nodecluster = 'olap'
|
||||
WHERE nodeport = :worker_1_port;
|
||||
ERROR: new row for relation "pg_dist_node" violates check constraint "primaries_are_only_allowed_in_the_default_cluster"
|
||||
DETAIL: Failing row contains (17, 14, localhost, 57637, default, f, t, primary, olap, f, t).
|
||||
DETAIL: Failing row contains (17, 14, localhost, 57637, default, f, t, primary, olap, f, t, f, 0).
|
||||
-- check that you /can/ add a secondary node to a non-default cluster
|
||||
SELECT groupid AS worker_2_group FROM pg_dist_node WHERE nodeport = :worker_2_port \gset
|
||||
SELECT master_add_node('localhost', 8888, groupid => :worker_1_group, noderole => 'secondary', nodecluster=> 'olap');
|
||||
|
|
@@ -955,9 +955,9 @@ SELECT master_add_node('localhost', 8887, groupid => :worker_1_group, noderole =
|
|||
(1 row)
|
||||
|
||||
SELECT * FROM pg_dist_node WHERE nodeport=8887;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
27 | 14 | localhost | 8887 | default | f | t | secondary | thisisasixtyfourcharacterstringrepeatedfourtimestomake256chars. | f | t
|
||||
27 | 14 | localhost | 8887 | default | f | t | secondary | thisisasixtyfourcharacterstringrepeatedfourtimestomake256chars. | f | t | f | 0
|
||||
(1 row)
|
||||
|
||||
-- don't remove the secondary and unavailable nodes, check that no commands are sent to
|
||||
|
|
@@ -1036,9 +1036,9 @@ SELECT master_update_node(:worker_1_node, 'somehost', 9000);
|
|||
(1 row)
|
||||
|
||||
SELECT * FROM pg_dist_node WHERE nodeid = :worker_1_node;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
17 | 14 | somehost | 9000 | default | f | t | primary | default | f | t
|
||||
17 | 14 | somehost | 9000 | default | f | t | primary | default | f | t | f | 0
|
||||
(1 row)
|
||||
|
||||
-- cleanup
|
||||
|
|
@@ -1049,9 +1049,9 @@ SELECT master_update_node(:worker_1_node, 'localhost', :worker_1_port);
|
|||
(1 row)
|
||||
|
||||
SELECT * FROM pg_dist_node WHERE nodeid = :worker_1_node;
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
|
||||
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||
---------------------------------------------------------------------
|
||||
17 | 14 | localhost | 57637 | default | f | t | primary | default | f | t
|
||||
17 | 14 | localhost | 57637 | default | f | t | primary | default | f | t | f | 0
|
||||
(1 row)
|
||||
|
||||
SET client_min_messages TO ERROR;
@@ -1640,10 +1640,16 @@ SELECT * FROM multi_extension.print_extension_changes();
|
|||
---------------------------------------------------------------------
|
||||
function citus_rebalance_start(name,boolean,citus.shard_transfer_mode) bigint |
|
||||
function worker_last_saved_explain_analyze() TABLE(explain_analyze_output text, execution_duration double precision) |
|
||||
| function citus_add_clone_node(text,integer,text,integer) integer
|
||||
| function citus_add_clone_node_with_nodeid(text,integer,integer) integer
|
||||
| function citus_internal.citus_internal_copy_single_shard_placement(bigint,integer,integer,integer,citus.shard_transfer_mode) void
|
||||
| function citus_promote_clone_and_rebalance(integer,name,integer) void
|
||||
| function citus_rebalance_start(name,boolean,citus.shard_transfer_mode,boolean,boolean) bigint
|
||||
| function citus_remove_clone_node(text,integer) void
|
||||
| function citus_remove_clone_node_with_nodeid(integer) void
|
||||
| function get_snapshot_based_node_split_plan(text,integer,text,integer,name) TABLE(table_name regclass, shardid bigint, shard_size bigint, placement_node text)
|
||||
| function worker_last_saved_explain_analyze() TABLE(explain_analyze_output text, execution_duration double precision, execution_ntuples double precision, execution_nloops double precision)
|
||||
(5 rows)
|
||||
(11 rows)
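The clone-management UDFs listed above come in address-based and nodeid-based pairs. A sketch of dropping a clone registration, under the assumption that removal applies to clones that have not yet been promoted (host, port, and nodeid values are placeholders):

```sql
-- illustrative only: drop a clone registration by address (placeholder values)
SELECT citus_remove_clone_node('10.0.0.2', 5432);
-- or by the nodeid returned by citus_add_clone_node() (placeholder nodeid)
SELECT citus_remove_clone_node_with_nodeid(3);
```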
|
||||
|
||||
DROP TABLE multi_extension.prev_objects, multi_extension.extension_diff;
|
||||
-- show running version
|
||||
|
|
|
|||
|
|
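The expected-output hunks above reflect two new pg_dist_node columns, nodeisclone and nodeprimarynodeid. As a reference sketch (not part of this diff), registered clones and the primaries they track could be listed as follows, assuming nodeprimarynodeid holds the nodeid of the tracked primary, as its name and the 0 shown for regular nodes suggest:

```
-- Sketch: list each registered clone with the primary node it tracks.
SELECT clone.nodeid   AS clone_nodeid,
       clone.nodename AS clone_host,
       clone.nodeport AS clone_port,
       prim.nodename  AS primary_host,
       prim.nodeport  AS primary_port
FROM pg_dist_node AS clone
JOIN pg_dist_node AS prim ON prim.nodeid = clone.nodeprimarynodeid
WHERE clone.nodeisclone;
```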
@ -75,7 +75,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@ -148,7 +148,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@ -216,7 +216,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@ -277,7 +277,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@ -345,7 +345,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@ -406,7 +406,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@ -511,13 +511,13 @@ SELECT * FROM pg_dist_local_group;
(1 row)

SELECT * FROM pg_dist_node ORDER BY nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t
5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t
6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t
1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t | f | 0
5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t | f | 0
6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t | f | 0
(5 rows)

SELECT * FROM pg_dist_partition WHERE logicalrelid::text LIKE 'mx_testing_schema%' ORDER BY logicalrelid::text;

@ -650,13 +650,13 @@ SELECT * FROM pg_dist_local_group;
(1 row)

SELECT * FROM pg_dist_node ORDER BY nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t
5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t
6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t
1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t | f | 0
5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t | f | 0
6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t | f | 0
(5 rows)

SELECT * FROM pg_dist_partition WHERE logicalrelid::text LIKE 'mx_testing_schema%' ORDER BY logicalrelid::text;

@ -1982,7 +1982,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (5, 1, 'localhost', 8888, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'default', TRUE),(6, 1, 'localhost', 8889, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'second-cluster', TRUE),(1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE),(8, 5, 'localhost', 57638, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE)
INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (5, 1, 'localhost', 8888, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'default', TRUE, FALSE, 0),(6, 1, 'localhost', 8889, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'second-cluster', TRUE, FALSE, 0),(1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(8, 5, 'localhost', 57638, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')
@ -86,10 +86,10 @@ FROM test.maintenance_worker();

SELECT *
FROM pg_dist_node;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(2 rows)

CREATE DATABASE db2;

@ -147,10 +147,10 @@ FROM test.maintenance_worker();

SELECT *
FROM pg_dist_node;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(2 rows)

SELECT groupid AS worker_1_group_id

@ -1758,9 +1758,9 @@ BEGIN
INSERT INTO test (x) VALUES ($1);
END;$$;
SELECT * FROM pg_dist_node;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
5 | 0 | localhost | 57636 | default | t | t | primary | default | t | t
5 | 0 | localhost | 57636 | default | t | t | primary | default | t | t | f | 0
(1 row)

SELECT create_distributed_function('call_delegation(int)', '$1', 'test');
@ -43,6 +43,8 @@ ORDER BY 1;
function broadcast_intermediate_result(text,text)
function check_distributed_deadlocks()
function citus_activate_node(text,integer)
function citus_add_clone_node(text,integer,text,integer)
function citus_add_clone_node_with_nodeid(text,integer,integer)
function citus_add_inactive_node(text,integer,integer,noderole,name)
function citus_add_local_table_to_metadata(regclass,boolean)
function citus_add_node(text,integer,integer,noderole,name)

@ -156,6 +158,7 @@ ORDER BY 1;
function citus_pause_node_within_txn(integer,boolean,integer)
function citus_pid_for_gpid(bigint)
function citus_prepare_pg_upgrade()
function citus_promote_clone_and_rebalance(integer,name,integer)
function citus_query_stats()
function citus_rebalance_start(name,boolean,citus.shard_transfer_mode,boolean,boolean)
function citus_rebalance_status(boolean)

@ -163,6 +166,8 @@ ORDER BY 1;
function citus_rebalance_wait()
function citus_relation_size(regclass)
function citus_remote_connection_stats()
function citus_remove_clone_node(text,integer)
function citus_remove_clone_node_with_nodeid(integer)
function citus_remove_node(text,integer)
function citus_run_local_command(text)
function citus_schema_distribute(regnamespace)

@ -244,6 +249,7 @@ ORDER BY 1;
function get_rebalance_progress()
function get_rebalance_table_shards_plan(regclass,real,integer,bigint[],boolean,name,real)
function get_shard_id_for_distribution_column(regclass,"any")
function get_snapshot_based_node_split_plan(text,integer,text,integer,name)
function isolate_tenant_to_new_shard(regclass,"any",text,citus.shard_transfer_mode)
function json_cat_agg(json)
function jsonb_cat_agg(jsonb)

@ -395,6 +401,6 @@ ORDER BY 1;
view citus_tables
view pg_dist_shard_placement
view time_partitions
(363 rows)
(369 rows)

DROP TABLE extension_basic_types;
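The object list above shows get_snapshot_based_node_split_plan() taking a trailing name argument that the example calls omit. Below is a minimal sketch of an explicit call, assuming that argument is a rebalance strategy name (as it is for citus_promote_clone_and_rebalance()) and using the built-in 'by_disk_size' strategy; the clone port is a placeholder.

```
-- Sketch: preview the post-split shard distribution with an explicit strategy.
SELECT *
FROM get_snapshot_based_node_split_plan(
         'localhost', 57637,   -- current primary worker
         'localhost', 58637,   -- registered clone (placeholder port)
         'by_disk_size');      -- rebalance strategy name (assumed meaning of the last argument)
```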
@ -0,0 +1,2 @@
test: multi_add_node_from_backup
test: multi_add_node_from_backup_negative

@ -3,6 +3,7 @@ test: follower_single_node
test: multi_follower_select_statements
test: multi_follower_dml
test: multi_follower_configure_followers
test: multi_add_node_from_backup_sync_replica

# test that no tests leaked intermediate results. This should always be last
test: ensure_no_intermediate_data_leak
@ -62,6 +62,7 @@ my $MASTER_FOLLOWERDIR = 'master-follower';
my $isolationtester = 0;
my $vanillatest = 0;
my $followercluster = 0;
my $backupnodetest = 0;
my $bindir = "";
my $libdir = undef;
my $pgxsdir = "";

@ -100,6 +101,7 @@ GetOptions(
'isolationtester' => \$isolationtester,
'vanillatest' => \$vanillatest,
'follower-cluster' => \$followercluster,
'backupnodetest' => \$backupnodetest,
'bindir=s' => \$bindir,
'libdir=s' => \$libdir,
'pgxsdir=s' => \$pgxsdir,

@ -483,7 +485,14 @@ push(@pgOptions, "citus.max_adaptive_executor_pool_size=4");
push(@pgOptions, "citus.defer_shard_delete_interval=-1");
push(@pgOptions, "citus.repartition_join_bucket_count_per_node=2");
push(@pgOptions, "citus.sort_returning='on'");
push(@pgOptions, "citus.shard_replication_factor=2");
if ($backupnodetest)
{
push(@pgOptions, "citus.shard_replication_factor=1");
}
else
{
push(@pgOptions, "citus.shard_replication_factor=2");
}
push(@pgOptions, "citus.node_connection_timeout=${connectionTimeout}");
push(@pgOptions, "citus.explain_analyze_sort_method='taskId'");
push(@pgOptions, "citus.enable_manual_changes_to_shards=on");

@ -885,7 +894,7 @@ if ($valgrind)
$serversAreShutdown = "FALSE";

# enable synchronous replication if needed
if ($followercluster)
if ($followercluster && $backupnodetest == 0)
{
$synchronousReplication = "-c synchronous_standby_names='FIRST 1 (*)' -c synchronous_commit=remote_apply";
}
@ -0,0 +1,156 @@
--
-- Test for adding a worker node from a backup
--

-- setup cluster
SELECT 1 FROM master_add_node('localhost', :worker_1_port);
SELECT 1 FROM master_add_node('localhost', :worker_2_port);

SELECT * from pg_dist_node;

-- create a distributed table and load data
CREATE TABLE backup_test(id int, value text);
SELECT create_distributed_table('backup_test', 'id', 'hash');
INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;

-- Colocation group 1: create two tables, table1_colg1 and table2_colg1, in the same colocation group
CREATE TABLE table1_colg1 (a int PRIMARY KEY);
SELECT create_distributed_table('table1_colg1', 'a', shard_count => 4, colocate_with => 'none');

CREATE TABLE table2_colg1 (b int PRIMARY KEY);

SELECT create_distributed_table('table2_colg1', 'b', colocate_with => 'table1_colg1');

-- Colocation group 2: create two tables, table1_colg2 and table2_colg2, in the same colocation group
CREATE TABLE table1_colg2 (a int PRIMARY KEY);

SELECT create_distributed_table('table1_colg2', 'a', shard_count => 4, colocate_with => 'none');

CREATE TABLE table2_colg2 (b int primary key);

SELECT create_distributed_table('table2_colg2', 'b', colocate_with => 'table1_colg2');

-- Colocation group 3: create two tables, table1_colg3 and table2_colg3, in the same colocation group
CREATE TABLE table1_colg3 (a int PRIMARY KEY);

SELECT create_distributed_table('table1_colg3', 'a', shard_count => 4, colocate_with => 'none');

CREATE TABLE table2_colg3 (b int primary key);

SELECT create_distributed_table('table2_colg3', 'b', colocate_with => 'table1_colg3');

-- Create reference tables with primary-foreign key relationships

CREATE TABLE customers (
id SERIAL PRIMARY KEY,
name TEXT NOT NULL,
email TEXT UNIQUE NOT NULL );

CREATE TABLE orders (
id SERIAL PRIMARY KEY,
customer_id INTEGER NOT NULL REFERENCES customers(id),
order_date DATE NOT NULL DEFAULT CURRENT_DATE);

CREATE TABLE order_items (
id SERIAL PRIMARY KEY,
order_id INTEGER NOT NULL REFERENCES orders(id),
product_name TEXT NOT NULL,
quantity INTEGER NOT NULL,
price NUMERIC(10, 2) NOT NULL
);

SELECT create_reference_table('customers');
SELECT create_reference_table('orders');
SELECT create_reference_table('order_items');

-- Insert some data
-- Insert 10 customers
INSERT INTO customers (name, email)
SELECT
'Customer ' || i,
'customer' || i || '@example.com'
FROM generate_series(1, 10) AS i;

-- Insert 30 orders: each customer gets 3 orders
INSERT INTO orders (customer_id, order_date)
SELECT
(i % 10) + 1, -- customer_id between 1 and 10
CURRENT_DATE - (i % 7)
FROM generate_series(1, 30) AS i;

-- Insert 90 order_items: each order has 3 items
INSERT INTO order_items (order_id, product_name, quantity, price)
SELECT
(i % 30) + 1, -- order_id between 1 and 30
'Product ' || (i % 5 + 1),
(i % 10) + 1,
round((random() * 100 + 10)::numeric, 2)
FROM generate_series(1, 90) AS i;

SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;

-- verify initial shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
-- wait for the new node to be ready
SELECT pg_sleep(5);

-- register the new node as a clone
-- the function returns the new node id
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

SELECT :clone_node_id;

SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;

SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_1_port, 'localhost', :follower_worker_1_port);

-- promote the clone and rebalance the shards
SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(:clone_node_id);
SET client_min_messages to DEFAULT;

SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;

\c - - - :worker_1_port
SELECT 'WORKER' as node_type, * from pg_dist_node;
SELECT 'WORKER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
SELECT * from citus_tables;
SELECT id, value FROM backup_test ORDER BY id;
SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;

\c - - - :follower_worker_1_port
SELECT 'CLONE' as node_type, * from pg_dist_node;
SELECT 'CLONE' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
SELECT * from citus_tables;
SELECT id, value FROM backup_test ORDER BY id;
SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;

\c - - - :master_port
SELECT 'MASTER' as node_type, * from pg_dist_node;
SELECT 'MASTER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
SELECT * from citus_tables;
SELECT id, value FROM backup_test ORDER BY id;
SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;

-- verify data
SELECT count(*) FROM backup_test;
SELECT id, value FROM backup_test ORDER BY id;

-- cleanup
DROP TABLE backup_test;
@ -0,0 +1,105 @@
--
-- Test for negative scenarios in clone promotion functionality
--

-- try to add follower_worker_1 as a clone of worker_1 to the cluster
-- this should fail because the previous test already promoted follower_worker_1 to a primary node
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

-- try to add worker_2's replica (follower_worker_2) as a clone of worker_1
-- this should fail as it is not a valid replica of worker_1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

-- load some data before trying further clone operations
-- create a distributed table and load data
CREATE TABLE backup_test(id int, value text);
SELECT create_distributed_table('backup_test', 'id', 'hash');
INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;

-- Create reference table
CREATE TABLE ref_table(id int PRIMARY KEY);
SELECT create_reference_table('ref_table');
INSERT INTO ref_table SELECT i FROM generate_series(1, 5) i;

SELECT COUNT(*) from backup_test;
SELECT COUNT(*) from ref_table;

-- verify initial shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;

-- Try to add replica of worker_2 as a clone of worker_1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset

-- Test 1: Try to promote a non-existent clone node
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 99999);

-- Test 2: Try to promote a regular worker node (not a clone)
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1);

-- Test 3: Try to promote with an invalid (negative) timeout
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1,
catchup_timeout_seconds => -100);

-- register the new node as a clone; this should pass
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

SELECT :clone_node_id;

-- Test 4: Try to promote the clone with an invalid strategy name
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id, rebalance_strategy => 'invalid_strategy');

SELECT * from pg_dist_node ORDER by nodeid;

-- Test 5: Roll back the citus_promote_clone_and_rebalance transaction
BEGIN;
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
ROLLBACK;

-- Verify no data is lost after rolling back the transaction
SELECT COUNT(*) from backup_test;
SELECT COUNT(*) from ref_table;

SELECT * from pg_dist_node ORDER by nodeid;

-- Test 6: Add and promote a proper replica after the rollback
SELECT master_add_node('localhost', :worker_3_port) AS nodeid_3 \gset
SELECT citus_add_clone_node('localhost', :follower_worker_3_port, 'localhost', :worker_3_port) AS clone_node_id_3 \gset

SET citus.shard_count = 100;
CREATE TABLE backup_test2(id int, value text);
SELECT create_distributed_table('backup_test2', 'id', 'hash');
INSERT INTO backup_test2 SELECT g, 'test' || g FROM generate_series(1, 10) g;

-- Create reference table
CREATE TABLE ref_table2(id int PRIMARY KEY);
SELECT create_reference_table('ref_table2');
INSERT INTO ref_table2 SELECT i FROM generate_series(1, 5) i;

SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_3_port, 'localhost', :follower_worker_3_port);

SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id_3);
SET client_min_messages to DEFAULT;

SELECT COUNT(*) from backup_test;
SELECT COUNT(*) from ref_table;

SELECT * from pg_dist_node ORDER by nodeid;

-- check the shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;

SET citus.shard_count TO DEFAULT;

-- cleanup
DROP TABLE backup_test;
DROP TABLE ref_table;
DROP TABLE backup_test2;
DROP TABLE ref_table2;
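The tests above call citus_promote_clone_and_rebalance() through its named parameters: clone_nodeid, rebalance_strategy, and catchup_timeout_seconds. For reference, a fully spelled-out call might look like the sketch below; the strategy and timeout values are only illustrative, and the parameter defaults are not visible in this diff.

```
-- Sketch: promote a registered clone with an explicit strategy and catch-up timeout.
SELECT citus_promote_clone_and_rebalance(
           clone_nodeid            => :clone_node_id,
           rebalance_strategy      => 'by_disk_size',  -- any strategy from pg_dist_rebalance_strategy
           catchup_timeout_seconds => 300);            -- illustrative timeout in seconds
```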
@ -0,0 +1,18 @@
--
-- Test for negative scenarios in clone promotion functionality
-- Synchronous replicas must not be added as clones; this test ensures
-- that they are rejected
--

SELECT * from pg_dist_node ORDER by nodeid;

SELECT master_remove_node('localhost', :follower_worker_1_port);
SELECT master_remove_node('localhost', :follower_worker_2_port);

-- this should fail because the replica is a synchronous replica, which is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset

-- this should fail because the replica is a synchronous replica, which is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;
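Because this test expects synchronous replicas to be rejected, it can be useful to confirm a standby's replication mode on the primary worker before registering it as a clone. Here is a small sketch using the standard pg_stat_replication view; the exact check performed by citus_add_clone_node() is not shown in this diff.

```
-- Run on the primary worker: an eligible clone is expected to report sync_state = 'async'.
SELECT application_name, client_addr, state, sync_state
FROM pg_stat_replication;
```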