Snapshot-Based Node Split – Foundation and Core Implementation (#8122)
**DESCRIPTION:**
This pull request introduces the foundation and core logic for the
snapshot-based node split feature in Citus. The feature enables
promoting a streaming replica (referred to as a clone in this feature
and its UI) to a primary node and rebalancing shards between the
original node and the newly promoted one, without requiring a full data
copy. This significantly reduces rebalance times for scale-out
operations where the new node already holds a full copy of the data via
streaming replication.
Key Highlights:
**1. Replica (Clone) Registration & Management Infrastructure**
Introduces a new set of UDFs to register and manage clone nodes:
- citus_add_clone_node()
- citus_add_clone_node_with_nodeid()
- citus_remove_clone_node()
- citus_remove_clone_node_with_nodeid()
These functions allow administrators to register a streaming replica of
an existing worker node as a clone, making it eligible for later
promotion via snapshot-based split.
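As a minimal sketch of registration and cleanup (hostnames, ports, and the node id 3 below are placeholder values; the argument order of citus_add_clone_node follows the usage guide at the end of this description, and the with_nodeid variant takes the primary's node id as its third argument):
```
-- Register 10.0.0.12:5432 as a clone of the existing worker 10.0.0.11:5432.
SELECT citus_add_clone_node('10.0.0.12', 5432, '10.0.0.11', 5432);

-- Or, when the primary's node id is already known (assumed to be 3 here):
SELECT citus_add_clone_node_with_nodeid('10.0.0.12', 5432, 3);

-- Deregister an unpromoted (still inactive) clone if it is no longer needed.
SELECT citus_remove_clone_node('10.0.0.12', 5432);
```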
**2. Snapshot-Based Node Split (Core Implementation)**
New core UDF:
- citus_promote_clone_and_rebalance()
This function implements the full workflow to promote a clone and
rebalance shards between the old and new primaries. Steps include:
1. Ensuring Exclusivity – Blocks any concurrent placement-changing
operations.
2. Blocking Writes – Temporarily blocks writes on the primary to ensure
consistency.
3. Replica Catch-up – Waits for the replica to be fully in sync.
4. Promotion – Promotes the replica to a primary using pg_promote.
5. Metadata Update – Updates metadata to reflect the newly promoted
primary node.
6. Shard Rebalancing – Redistributes shards between the old and new
primary nodes.
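As a sketch of how the pieces fit together (node id 5 is a placeholder; the clone must have been registered beforehand):
```
-- Registered clones appear in pg_dist_node as inactive rows with
-- nodeisclone = true and nodeprimarynodeid pointing at their primary.
SELECT nodeid, nodename, nodeport, isactive, nodeisclone, nodeprimarynodeid
FROM pg_dist_node
WHERE nodeisclone;

-- Promote clone node 5 and rebalance shards between it and its original
-- primary. The patch also accepts an optional catch-up timeout argument
-- (300 seconds by default) after which the promotion is aborted.
SELECT citus_promote_clone_and_rebalance(5);
```
The call must run on the coordinator and, per step 1 above, blocks concurrent placement-changing operations for its duration.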
**3. Split Plan Preview**
A new helper UDF get_snapshot_based_node_split_plan() provides a preview
of the shard distribution post-split, without executing the promotion.
**Example:**
```
reb 63796> select * from pg_catalog.get_snapshot_based_node_split_plan('127.0.0.1',5433,'127.0.0.1',5453);
table_name | shardid | shard_size | placement_node
--------------+---------+------------+----------------
companies | 102008 | 0 | Primary Node
campaigns | 102010 | 0 | Primary Node
ads | 102012 | 0 | Primary Node
mscompanies | 102014 | 0 | Primary Node
mscampaigns | 102016 | 0 | Primary Node
msads | 102018 | 0 | Primary Node
mscompanies2 | 102020 | 0 | Primary Node
mscampaigns2 | 102022 | 0 | Primary Node
msads2 | 102024 | 0 | Primary Node
companies | 102009 | 0 | Clone Node
campaigns | 102011 | 0 | Clone Node
ads | 102013 | 0 | Clone Node
mscompanies | 102015 | 0 | Clone Node
mscampaigns | 102017 | 0 | Clone Node
msads | 102019 | 0 | Clone Node
mscompanies2 | 102021 | 0 | Clone Node
mscampaigns2 | 102023 | 0 | Clone Node
msads2 | 102025 | 0 | Clone Node
(18 rows)
```
**4. Test Infrastructure Enhancements**
- Added a new regression test schedule for snapshot-based split scenarios.
- Enhanced pg_regress_multi.pl to support creating node backups with
slightly modified options, simulating real-world backup-based clone
creation.
**5. Usage Guide**
The snapshot-based node split can be performed using the following
workflow:
**- Take a Backup of the Worker Node**
Run pg_basebackup (or an equivalent tool) against the existing worker
node to create a physical backup.
```
pg_basebackup -h <primary_worker_host> -p <port> -D /path/to/replica/data --write-recovery-conf
```
**- Start the Replica Node**
Start PostgreSQL on the replica using the backup data directory,
ensuring it is configured as a streaming replica of the original worker
node.
**- Register the Backup Node as a Clone**
Register the replica as a clone of its original worker node:
```
SELECT * FROM citus_add_clone_node('<clone_host>', <clone_port>, '<primary_host>', <primary_port>);
```
**- Promote and Rebalance the Clone**
Promote the clone to a primary and rebalance shards between it and the
original worker:
```
SELECT * FROM citus_promote_clone_and_rebalance(<clone_node_id>);
```
**- Drop Any Replication Slots from the Original Worker**
After promotion, clean up any unused replication slots from the original
worker:
```
SELECT pg_drop_replication_slot('<slot_name>');
```
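To verify the end state, one option is a quick check of the node metadata on the coordinator and of leftover replication slots on the original worker (column names follow this patch; the slot name is a placeholder):
```
-- On the coordinator: the promoted clone should now be an active primary
-- with nodeisclone = false and no primary-node reference.
SELECT nodeid, nodename, nodeport, noderole, isactive, nodeisclone, nodeprimarynodeid
FROM pg_dist_node
ORDER BY nodeid;

-- On the original worker: list and drop any leftover physical replication slots.
SELECT slot_name, active FROM pg_replication_slots;
SELECT pg_drop_replication_slot('<slot_name>');
```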
Branch: pull/8136/head, commit be6668e440 (parent f743b35fc2)
**Changes by Area**

**CI workflow (job matrix and changed-test detection)**
- Adds a new regression target, `check-add-backup-node`, to the job matrix after `check-follower-cluster`.
- Extends the changed-test skip logic so that, in addition to `upgrade_` tests, tests matching `src/test/regress/sql/multi_add_node_from_backup` go into `skipped_tests`:

      if [[ $test =~ ^src/test/regress/sql/upgrade_ ]] || [[ $test =~ ^src/test/regress/sql/multi_add_node_from_backup ]]; then
          skipped_tests="$skipped_tests $test"
      else
          not_skipped_tests="$not_skipped_tests $test"

  The accompanying comment notes that snapshot-based node addition tests are not flaky-checked because they promote the streaming replica (clone) to a PostgreSQL primary, which is a one-way operation.
**Citus metadata cache (metadata_cache.c)**
- `MetadataCacheData` gains a cached `Oid unavailableNodeRoleId` alongside the primary and secondary node role OIDs; a new accessor `UnavailableNodeRoleId()` lazily resolves it via `LookupStringEnumValueId("noderole", "unavailable")` and returns the OID of the `'unavailable'` `noderole` enum value.
- `InitializeWorkerNodeCache()` now also copies the new per-node fields into the cached entry: `workerNode->nodeprimarynodeid` and `workerNode->nodeisclone`.
- The remaining hunk only re-wraps the `ResolveGroupShardPlacement()` prototype.
**Node metadata sync (metadata_sync.c)**
- `NodeListInsertCommand()` includes the new `pg_dist_node` columns in the generated INSERT: the column list gains `nodeisclone` and `nodeprimarynodeid`, the per-row format string gains `, %s, %d`, and each row appends `nodeiscloneString` (TRUE/FALSE) and `workerNode->nodeprimarynodeid`.
- `NodeListIdempotentInsertCommand()` extends its ON CONFLICT clause with `nodeisclone = EXCLUDED.nodeisclone` and `nodeprimarynodeid = EXCLUDED.nodeprimarynodeid`.
@ -35,6 +35,7 @@
|
||||||
|
|
||||||
#include "distributed/citus_acquire_lock.h"
|
#include "distributed/citus_acquire_lock.h"
|
||||||
#include "distributed/citus_safe_lib.h"
|
#include "distributed/citus_safe_lib.h"
|
||||||
|
#include "distributed/clonenode_utils.h"
|
||||||
#include "distributed/colocation_utils.h"
|
#include "distributed/colocation_utils.h"
|
||||||
#include "distributed/commands.h"
|
#include "distributed/commands.h"
|
||||||
#include "distributed/commands/utility_hook.h"
|
#include "distributed/commands/utility_hook.h"
|
||||||
|
|
@ -84,6 +85,8 @@ typedef struct NodeMetadata
|
||||||
bool isActive;
|
bool isActive;
|
||||||
Oid nodeRole;
|
Oid nodeRole;
|
||||||
bool shouldHaveShards;
|
bool shouldHaveShards;
|
||||||
|
uint32 nodeprimarynodeid;
|
||||||
|
bool nodeisclone;
|
||||||
char *nodeCluster;
|
char *nodeCluster;
|
||||||
} NodeMetadata;
|
} NodeMetadata;
|
||||||
|
|
||||||
|
|
@ -106,7 +109,8 @@ static void InsertNodeRow(int nodeid, char *nodename, int32 nodeport,
|
||||||
NodeMetadata *nodeMetadata);
|
NodeMetadata *nodeMetadata);
|
||||||
static void DeleteNodeRow(char *nodename, int32 nodeport);
|
static void DeleteNodeRow(char *nodename, int32 nodeport);
|
||||||
static void BlockDistributedQueriesOnMetadataNodes(void);
|
static void BlockDistributedQueriesOnMetadataNodes(void);
|
||||||
static WorkerNode * TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple);
|
static WorkerNode * TupleToWorkerNode(Relation pgDistNode, TupleDesc tupleDescriptor,
|
||||||
|
HeapTuple heapTuple);
|
||||||
static bool NodeIsLocal(WorkerNode *worker);
|
static bool NodeIsLocal(WorkerNode *worker);
|
||||||
static void SetLockTimeoutLocally(int32 lock_cooldown);
|
static void SetLockTimeoutLocally(int32 lock_cooldown);
|
||||||
static void UpdateNodeLocation(int32 nodeId, char *newNodeName, int32 newNodePort,
|
static void UpdateNodeLocation(int32 nodeId, char *newNodeName, int32 newNodePort,
|
||||||
|
|
@ -120,11 +124,10 @@ static char * NodeMetadataSyncedUpdateCommand(uint32 nodeId, bool metadataSynced
|
||||||
static void ErrorIfCoordinatorMetadataSetFalse(WorkerNode *workerNode, Datum value,
|
static void ErrorIfCoordinatorMetadataSetFalse(WorkerNode *workerNode, Datum value,
|
||||||
char *field);
|
char *field);
|
||||||
static WorkerNode * SetShouldHaveShards(WorkerNode *workerNode, bool shouldHaveShards);
|
static WorkerNode * SetShouldHaveShards(WorkerNode *workerNode, bool shouldHaveShards);
|
||||||
static WorkerNode * FindNodeAnyClusterByNodeId(uint32 nodeId);
|
|
||||||
static void ErrorIfAnyNodeNotExist(List *nodeList);
|
static void ErrorIfAnyNodeNotExist(List *nodeList);
|
||||||
static void UpdateLocalGroupIdsViaMetadataContext(MetadataSyncContext *context);
|
static void UpdateLocalGroupIdsViaMetadataContext(MetadataSyncContext *context);
|
||||||
static void SendDeletionCommandsForReplicatedTablePlacements(
|
static void SendDeletionCommandsForReplicatedTablePlacements(MetadataSyncContext *context)
|
||||||
MetadataSyncContext *context);
|
;
|
||||||
static void SyncNodeMetadata(MetadataSyncContext *context);
|
static void SyncNodeMetadata(MetadataSyncContext *context);
|
||||||
static void SetNodeStateViaMetadataContext(MetadataSyncContext *context,
|
static void SetNodeStateViaMetadataContext(MetadataSyncContext *context,
|
||||||
WorkerNode *workerNode,
|
WorkerNode *workerNode,
|
||||||
|
|
@ -134,12 +137,15 @@ static void MarkNodesNotSyncedInLoopBackConnection(MetadataSyncContext *context,
|
||||||
static void EnsureParentSessionHasExclusiveLockOnPgDistNode(pid_t parentSessionPid);
|
static void EnsureParentSessionHasExclusiveLockOnPgDistNode(pid_t parentSessionPid);
|
||||||
static void SetNodeMetadata(MetadataSyncContext *context, bool localOnly);
|
static void SetNodeMetadata(MetadataSyncContext *context, bool localOnly);
|
||||||
static void EnsureTransactionalMetadataSyncMode(void);
|
static void EnsureTransactionalMetadataSyncMode(void);
|
||||||
static void LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE
|
|
||||||
lockMode);
|
|
||||||
static BackgroundWorkerHandle * CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown);
|
static BackgroundWorkerHandle * CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown);
|
||||||
static BackgroundWorkerHandle * LockPlacementsWithBackgroundWorkersInPrimaryNode(
|
static BackgroundWorkerHandle * LockPlacementsWithBackgroundWorkersInPrimaryNode(
|
||||||
WorkerNode *workerNode, bool force, int32 lock_cooldown);
|
WorkerNode *workerNode, bool force, int32 lock_cooldown);
|
||||||
|
|
||||||
|
|
||||||
|
static int32 CitusAddCloneNode(WorkerNode *primaryWorkerNode,
|
||||||
|
char *cloneHostname, int32 clonePort);
|
||||||
|
static void RemoveCloneNode(WorkerNode *cloneNode);
|
||||||
|
|
||||||
/* Function definitions go here */
|
/* Function definitions go here */
|
||||||
|
|
||||||
/* declarations for dynamic loading */
|
/* declarations for dynamic loading */
|
||||||
|
|
@ -168,6 +174,10 @@ PG_FUNCTION_INFO_V1(citus_coordinator_nodeid);
|
||||||
PG_FUNCTION_INFO_V1(citus_is_coordinator);
|
PG_FUNCTION_INFO_V1(citus_is_coordinator);
|
||||||
PG_FUNCTION_INFO_V1(citus_internal_mark_node_not_synced);
|
PG_FUNCTION_INFO_V1(citus_internal_mark_node_not_synced);
|
||||||
PG_FUNCTION_INFO_V1(citus_is_primary_node);
|
PG_FUNCTION_INFO_V1(citus_is_primary_node);
|
||||||
|
PG_FUNCTION_INFO_V1(citus_add_clone_node);
|
||||||
|
PG_FUNCTION_INFO_V1(citus_add_clone_node_with_nodeid);
|
||||||
|
PG_FUNCTION_INFO_V1(citus_remove_clone_node);
|
||||||
|
PG_FUNCTION_INFO_V1(citus_remove_clone_node_with_nodeid);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* DefaultNodeMetadata creates a NodeMetadata struct with the fields set to
|
* DefaultNodeMetadata creates a NodeMetadata struct with the fields set to
|
||||||
|
|
@ -183,6 +193,8 @@ DefaultNodeMetadata()
|
||||||
nodeMetadata.nodeRack = WORKER_DEFAULT_RACK;
|
nodeMetadata.nodeRack = WORKER_DEFAULT_RACK;
|
||||||
nodeMetadata.shouldHaveShards = true;
|
nodeMetadata.shouldHaveShards = true;
|
||||||
nodeMetadata.groupId = INVALID_GROUP_ID;
|
nodeMetadata.groupId = INVALID_GROUP_ID;
|
||||||
|
nodeMetadata.nodeisclone = false;
|
||||||
|
nodeMetadata.nodeprimarynodeid = 0; /* 0 typically means InvalidNodeId */
|
||||||
|
|
||||||
return nodeMetadata;
|
return nodeMetadata;
|
||||||
}
|
}
|
||||||
|
|
@ -1177,6 +1189,33 @@ ActivateNodeList(MetadataSyncContext *context)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ActivateCloneNodeAsPrimary sets the given worker node as primary and active
|
||||||
|
* in the pg_dist_node catalog and make the clone node as first class citizen.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
ActivateCloneNodeAsPrimary(WorkerNode *workerNode)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Set the node as primary and active.
|
||||||
|
*/
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_noderole,
|
||||||
|
ObjectIdGetDatum(PrimaryNodeRoleId()));
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_isactive,
|
||||||
|
BoolGetDatum(true));
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_nodeisclone,
|
||||||
|
BoolGetDatum(false));
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_nodeprimarynodeid,
|
||||||
|
Int32GetDatum(0));
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_hasmetadata,
|
||||||
|
BoolGetDatum(true));
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_metadatasynced,
|
||||||
|
BoolGetDatum(true));
|
||||||
|
SetWorkerColumnLocalOnly(workerNode, Anum_pg_dist_node_shouldhaveshards,
|
||||||
|
BoolGetDatum(true));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Acquires shard metadata locks on all shards residing in the given worker node
|
* Acquires shard metadata locks on all shards residing in the given worker node
|
||||||
*
|
*
|
||||||
|
|
@ -1200,7 +1239,8 @@ BackgroundWorkerHandle *
|
||||||
CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown)
|
CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown)
|
||||||
{
|
{
|
||||||
BackgroundWorkerHandle *handle = StartLockAcquireHelperBackgroundWorker(MyProcPid,
|
BackgroundWorkerHandle *handle = StartLockAcquireHelperBackgroundWorker(MyProcPid,
|
||||||
lock_cooldown);
|
lock_cooldown)
|
||||||
|
;
|
||||||
if (!handle)
|
if (!handle)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
|
@ -1422,6 +1462,305 @@ master_update_node(PG_FUNCTION_ARGS)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* citus_add_clone_node adds a new node as a clone of an existing primary node.
|
||||||
|
*/
|
||||||
|
Datum
|
||||||
|
citus_add_clone_node(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
CheckCitusVersion(ERROR);
|
||||||
|
EnsureSuperUser();
|
||||||
|
EnsureCoordinator();
|
||||||
|
|
||||||
|
text *cloneHostnameText = PG_GETARG_TEXT_P(0);
|
||||||
|
int32 clonePort = PG_GETARG_INT32(1);
|
||||||
|
text *primaryHostnameText = PG_GETARG_TEXT_P(2);
|
||||||
|
int32 primaryPort = PG_GETARG_INT32(3);
|
||||||
|
|
||||||
|
char *cloneHostname = text_to_cstring(cloneHostnameText);
|
||||||
|
char *primaryHostname = text_to_cstring(primaryHostnameText);
|
||||||
|
|
||||||
|
WorkerNode *primaryWorker = FindWorkerNodeAnyCluster(primaryHostname, primaryPort);
|
||||||
|
|
||||||
|
if (primaryWorker == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("primary node %s:%d not found in pg_dist_node",
|
||||||
|
primaryHostname, primaryPort)));
|
||||||
|
}
|
||||||
|
|
||||||
|
int32 cloneNodeId = CitusAddCloneNode(primaryWorker, cloneHostname, clonePort);
|
||||||
|
|
||||||
|
PG_RETURN_INT32(cloneNodeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* citus_add_clone_node_with_nodeid adds a new node as a clone of an existing primary node
|
||||||
|
* using the primary node's ID. It records the clone's hostname, port, and links it to the
|
||||||
|
* primary node's ID.
|
||||||
|
*
|
||||||
|
* This function is useful when you already know the primary node's ID and want to add a clone
|
||||||
|
* without needing to look it up by hostname and port.
|
||||||
|
*/
|
||||||
|
Datum
|
||||||
|
citus_add_clone_node_with_nodeid(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
CheckCitusVersion(ERROR);
|
||||||
|
EnsureSuperUser();
|
||||||
|
EnsureCoordinator();
|
||||||
|
|
||||||
|
text *cloneHostnameText = PG_GETARG_TEXT_P(0);
|
||||||
|
int32 clonePort = PG_GETARG_INT32(1);
|
||||||
|
int32 primaryNodeId = PG_GETARG_INT32(2);
|
||||||
|
|
||||||
|
char *cloneHostname = text_to_cstring(cloneHostnameText);
|
||||||
|
|
||||||
|
bool missingOk = false;
|
||||||
|
WorkerNode *primaryWorkerNode = FindNodeWithNodeId(primaryNodeId, missingOk);
|
||||||
|
|
||||||
|
if (primaryWorkerNode == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("primary node with ID %d does not exist", primaryNodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
int32 cloneNodeId = CitusAddCloneNode(primaryWorkerNode, cloneHostname, clonePort);
|
||||||
|
|
||||||
|
PG_RETURN_INT32(cloneNodeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CitusAddCloneNode function adds a new node as a clone of an existing primary node.
|
||||||
|
* It records the clone's hostname, port, and links it to the primary node's ID.
|
||||||
|
* The clone is initially marked as inactive and not having shards.
|
||||||
|
*/
|
||||||
|
static int32
|
||||||
|
CitusAddCloneNode(WorkerNode *primaryWorkerNode,
|
||||||
|
char *cloneHostname, int32 clonePort)
|
||||||
|
{
|
||||||
|
Assert(primaryWorkerNode != NULL);
|
||||||
|
|
||||||
|
/* Future-proofing: Ideally, a primary node should not itself be a clone.
|
||||||
|
* This check might be more relevant once replica promotion logic exists.
|
||||||
|
* For now, pg_dist_node.nodeisclone defaults to false for existing nodes.
|
||||||
|
*/
|
||||||
|
if (primaryWorkerNode->nodeisclone)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"primary node %s:%d is itself a clone and cannot have clones",
|
||||||
|
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||||
|
workerPort)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!primaryWorkerNode->shouldHaveShards)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"primary node %s:%d does not have shards, node without shards cannot have clones",
|
||||||
|
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||||
|
workerPort)));
|
||||||
|
}
|
||||||
|
|
||||||
|
WorkerNode *existingCloneNode = FindWorkerNodeAnyCluster(cloneHostname, clonePort);
|
||||||
|
if (existingCloneNode != NULL)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Idempotency check: If the node already exists, is it already correctly
|
||||||
|
* registered as a clone for THIS primary?
|
||||||
|
*/
|
||||||
|
if (existingCloneNode->nodeisclone &&
|
||||||
|
existingCloneNode->nodeprimarynodeid == primaryWorkerNode->nodeId)
|
||||||
|
{
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"node %s:%d is already registered as a clone for primary %s:%d (nodeid %d)",
|
||||||
|
cloneHostname, clonePort,
|
||||||
|
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||||
|
workerPort, primaryWorkerNode->nodeId)));
|
||||||
|
PG_RETURN_INT32(existingCloneNode->nodeId);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"a different node %s:%d (nodeid %d) already exists or is a clone for a different primary",
|
||||||
|
cloneHostname, clonePort, existingCloneNode->nodeId)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EnsureValidStreamingReplica(primaryWorkerNode, cloneHostname, clonePort);
|
||||||
|
|
||||||
|
char *operation = "add";
|
||||||
|
EnsureValidCloneMode(primaryWorkerNode, cloneHostname, clonePort, operation);
|
||||||
|
|
||||||
|
NodeMetadata nodeMetadata = DefaultNodeMetadata();
|
||||||
|
|
||||||
|
nodeMetadata.nodeisclone = true;
|
||||||
|
nodeMetadata.nodeprimarynodeid = primaryWorkerNode->nodeId;
|
||||||
|
nodeMetadata.isActive = false; /* Replicas start as inactive */
|
||||||
|
nodeMetadata.shouldHaveShards = false; /* Replicas do not directly own primary shards */
|
||||||
|
nodeMetadata.groupId = INVALID_GROUP_ID; /* Replicas get a new group ID and do not belong to any existing group */
|
||||||
|
nodeMetadata.nodeRole = UnavailableNodeRoleId(); /* The node role is set to 'unavailable' */
|
||||||
|
nodeMetadata.nodeCluster = primaryWorkerNode->nodeCluster; /* Same cluster as primary */
|
||||||
|
|
||||||
|
/* Other fields like hasMetadata, metadataSynced will take defaults from DefaultNodeMetadata
|
||||||
|
* (typically true, true for hasMetadata and metadataSynced if it's a new node,
|
||||||
|
* or might need adjustment based on replica strategy)
|
||||||
|
* For now, let's assume DefaultNodeMetadata provides suitable defaults for these
|
||||||
|
* or they will be set by AddNodeMetadata/ActivateNodeList if needed.
|
||||||
|
* Specifically, hasMetadata is often true, and metadataSynced true after activation.
|
||||||
|
* Since this replica is inactive, metadata sync status might be less critical initially.
|
||||||
|
*/
|
||||||
|
|
||||||
|
bool nodeAlreadyExists = false;
|
||||||
|
bool localOnly = false; /* Propagate change to other workers with metadata */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* AddNodeMetadata will take an ExclusiveLock on pg_dist_node.
|
||||||
|
* It also checks again if the node already exists after acquiring the lock.
|
||||||
|
*/
|
||||||
|
int cloneNodeId = AddNodeMetadata(cloneHostname, clonePort, &nodeMetadata,
|
||||||
|
&nodeAlreadyExists, localOnly);
|
||||||
|
|
||||||
|
if (nodeAlreadyExists)
|
||||||
|
{
|
||||||
|
/* This case should ideally be caught by the FindWorkerNodeAnyCluster check above,
|
||||||
|
* but AddNodeMetadata does its own check after locking.
|
||||||
|
* If it already exists and is correctly configured, we might have returned NOTICE above.
|
||||||
|
* If it exists but is NOT correctly configured as our replica, an ERROR would be more appropriate.
|
||||||
|
* AddNodeMetadata returns the existing node's ID if it finds one.
|
||||||
|
* We need to ensure it is the *correct* replica.
|
||||||
|
*/
|
||||||
|
WorkerNode *fetchedExistingNode = FindNodeAnyClusterByNodeId(cloneNodeId);
|
||||||
|
if (fetchedExistingNode != NULL && fetchedExistingNode->nodeisclone &&
|
||||||
|
fetchedExistingNode->nodeprimarynodeid == primaryWorkerNode->nodeId)
|
||||||
|
{
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"node %s:%d was already correctly registered as a clone for primary %s:%d (nodeid %d)",
|
||||||
|
cloneHostname, clonePort,
|
||||||
|
primaryWorkerNode->workerName, primaryWorkerNode->
|
||||||
|
workerPort, primaryWorkerNode->nodeId)));
|
||||||
|
|
||||||
|
/* Intentional fall-through to return cloneNodeId */
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* This state is less expected if our initial check passed or errored. */
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
|
||||||
|
errmsg(
|
||||||
|
"node %s:%d already exists but is not correctly configured as a clone for primary %s:%d",
|
||||||
|
cloneHostname, clonePort, primaryWorkerNode->workerName,
|
||||||
|
primaryWorkerNode->workerPort)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TransactionModifiedNodeMetadata = true;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: Clones added this way are inactive.
|
||||||
|
* A separate UDF citus_promote_clone_and_rebalance
|
||||||
|
* would be needed to activate them.
|
||||||
|
*/
|
||||||
|
|
||||||
|
return cloneNodeId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* citus_remove_clone_node removes an inactive streaming clone node from Citus metadata.
|
||||||
|
*/
|
||||||
|
Datum
|
||||||
|
citus_remove_clone_node(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
CheckCitusVersion(ERROR);
|
||||||
|
EnsureSuperUser();
|
||||||
|
EnsureCoordinator();
|
||||||
|
|
||||||
|
text *nodeNameText = PG_GETARG_TEXT_P(0);
|
||||||
|
int32 nodePort = PG_GETARG_INT32(1);
|
||||||
|
char *nodeName = text_to_cstring(nodeNameText);
|
||||||
|
|
||||||
|
WorkerNode *workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
|
||||||
|
|
||||||
|
if (workerNode == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("node \"%s:%d\" does not exist", nodeName, nodePort)));
|
||||||
|
}
|
||||||
|
|
||||||
|
RemoveCloneNode(workerNode);
|
||||||
|
|
||||||
|
PG_RETURN_VOID();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* citus_remove_clone_node_with_nodeid removes an inactive clone node from Citus metadata
|
||||||
|
* using the node's ID.
|
||||||
|
*/
|
||||||
|
Datum
|
||||||
|
citus_remove_clone_node_with_nodeid(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
CheckCitusVersion(ERROR);
|
||||||
|
EnsureSuperUser();
|
||||||
|
EnsureCoordinator();
|
||||||
|
|
||||||
|
uint32 replicaNodeId = PG_GETARG_INT32(0);
|
||||||
|
|
||||||
|
WorkerNode *replicaNode = FindNodeAnyClusterByNodeId(replicaNodeId);
|
||||||
|
|
||||||
|
if (replicaNode == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("Clone node with ID %d does not exist", replicaNodeId)));
|
||||||
|
}
|
||||||
|
RemoveCloneNode(replicaNode);
|
||||||
|
|
||||||
|
PG_RETURN_VOID();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
RemoveCloneNode(WorkerNode *cloneNode)
|
||||||
|
{
|
||||||
|
Assert(cloneNode != NULL);
|
||||||
|
|
||||||
|
if (!cloneNode->nodeisclone)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("Node %s:%d (ID %d) is not a clone node. "
|
||||||
|
"Use citus_remove_node() to remove primary or already promoted nodes.",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cloneNode->isActive)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"Clone node %s:%d (ID %d) is marked as active and cannot be removed with this function. "
|
||||||
|
"This might indicate a promoted clone. Consider using citus_remove_node() if you are sure, "
|
||||||
|
"or ensure it's properly deactivated if it's an unpromoted clone in an unexpected state.",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* All checks passed, proceed with removal.
|
||||||
|
* RemoveNodeFromCluster handles locking, catalog changes, connection closing, and metadata sync.
|
||||||
|
*/
|
||||||
|
ereport(NOTICE, (errmsg("Removing inactive clone node %s:%d (ID %d)",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
|
||||||
|
RemoveNodeFromCluster(cloneNode->workerName, cloneNode->workerPort);
|
||||||
|
|
||||||
|
/* RemoveNodeFromCluster might set this, but setting it here ensures it's marked for this UDF's transaction. */
|
||||||
|
TransactionModifiedNodeMetadata = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* SetLockTimeoutLocally sets the lock_timeout to the given value.
|
* SetLockTimeoutLocally sets the lock_timeout to the given value.
|
||||||
* This setting is local.
|
* This setting is local.
|
||||||
|
|
@ -1859,7 +2198,7 @@ FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort)
|
||||||
HeapTuple heapTuple = GetNodeTuple(nodeName, nodePort);
|
HeapTuple heapTuple = GetNodeTuple(nodeName, nodePort);
|
||||||
if (heapTuple != NULL)
|
if (heapTuple != NULL)
|
||||||
{
|
{
|
||||||
workerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
|
workerNode = TupleToWorkerNode(pgDistNode, tupleDescriptor, heapTuple);
|
||||||
}
|
}
|
||||||
|
|
||||||
table_close(pgDistNode, NoLock);
|
table_close(pgDistNode, NoLock);
|
||||||
|
|
@ -1871,7 +2210,7 @@ FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort)
|
||||||
* FindNodeAnyClusterByNodeId searches pg_dist_node and returns the node with
|
* FindNodeAnyClusterByNodeId searches pg_dist_node and returns the node with
|
||||||
* the nodeId. If the node can't be found returns NULL.
|
* the nodeId. If the node can't be found returns NULL.
|
||||||
*/
|
*/
|
||||||
static WorkerNode *
|
WorkerNode *
|
||||||
FindNodeAnyClusterByNodeId(uint32 nodeId)
|
FindNodeAnyClusterByNodeId(uint32 nodeId)
|
||||||
{
|
{
|
||||||
bool includeNodesFromOtherClusters = true;
|
bool includeNodesFromOtherClusters = true;
|
||||||
|
|
@ -1966,7 +2305,8 @@ ReadDistNode(bool includeNodesFromOtherClusters)
|
||||||
HeapTuple heapTuple = systable_getnext(scanDescriptor);
|
HeapTuple heapTuple = systable_getnext(scanDescriptor);
|
||||||
while (HeapTupleIsValid(heapTuple))
|
while (HeapTupleIsValid(heapTuple))
|
||||||
{
|
{
|
||||||
WorkerNode *workerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
|
WorkerNode *workerNode = TupleToWorkerNode(pgDistNode, tupleDescriptor, heapTuple)
|
||||||
|
;
|
||||||
|
|
||||||
if (includeNodesFromOtherClusters ||
|
if (includeNodesFromOtherClusters ||
|
||||||
strncmp(workerNode->nodeCluster, CurrentCluster, WORKER_LENGTH) == 0)
|
strncmp(workerNode->nodeCluster, CurrentCluster, WORKER_LENGTH) == 0)
|
||||||
|
|
@ -2513,7 +2853,7 @@ SetWorkerColumnLocalOnly(WorkerNode *workerNode, int columnIndex, Datum value)
|
||||||
CitusInvalidateRelcacheByRelid(DistNodeRelationId());
|
CitusInvalidateRelcacheByRelid(DistNodeRelationId());
|
||||||
CommandCounterIncrement();
|
CommandCounterIncrement();
|
||||||
|
|
||||||
WorkerNode *newWorkerNode = TupleToWorkerNode(tupleDescriptor, heapTuple);
|
WorkerNode *newWorkerNode = TupleToWorkerNode(pgDistNode, tupleDescriptor, heapTuple);
|
||||||
|
|
||||||
table_close(pgDistNode, NoLock);
|
table_close(pgDistNode, NoLock);
|
||||||
|
|
||||||
|
|
@ -2924,6 +3264,10 @@ InsertNodeRow(int nodeid, char *nodeName, int32 nodePort, NodeMetadata *nodeMeta
|
||||||
values[Anum_pg_dist_node_nodecluster - 1] = nodeClusterNameDatum;
|
values[Anum_pg_dist_node_nodecluster - 1] = nodeClusterNameDatum;
|
||||||
values[Anum_pg_dist_node_shouldhaveshards - 1] = BoolGetDatum(
|
values[Anum_pg_dist_node_shouldhaveshards - 1] = BoolGetDatum(
|
||||||
nodeMetadata->shouldHaveShards);
|
nodeMetadata->shouldHaveShards);
|
||||||
|
values[Anum_pg_dist_node_nodeisclone - 1] = BoolGetDatum(
|
||||||
|
nodeMetadata->nodeisclone);
|
||||||
|
values[Anum_pg_dist_node_nodeprimarynodeid - 1] = Int32GetDatum(
|
||||||
|
nodeMetadata->nodeprimarynodeid);
|
||||||
|
|
||||||
Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);
|
Relation pgDistNode = table_open(DistNodeRelationId(), RowExclusiveLock);
|
||||||
|
|
||||||
|
|
@ -3015,19 +3359,18 @@ DeleteNodeRow(char *nodeName, int32 nodePort)
|
||||||
* the caller already has locks on the tuple, and doesn't perform any locking.
|
* the caller already has locks on the tuple, and doesn't perform any locking.
|
||||||
*/
|
*/
|
||||||
static WorkerNode *
|
static WorkerNode *
|
||||||
TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple)
|
TupleToWorkerNode(Relation pgDistNode, TupleDesc tupleDescriptor, HeapTuple heapTuple)
|
||||||
{
|
{
|
||||||
Datum datumArray[Natts_pg_dist_node];
|
/* we add remove columns from pg_dist_node during extension upgrade and
|
||||||
bool isNullArray[Natts_pg_dist_node];
|
* and downgrads. Now the issue here is PostgreSQL never reuses the old
|
||||||
|
* attnum. Dropped columns leave “holes” (attributes with attisdropped = true),
|
||||||
Assert(!HeapTupleHasNulls(heapTuple));
|
* and a re-added column with the same name gets a new attnum at the end. So
|
||||||
|
* we cannot use the deined Natts_pg_dist_node to allocate memory and also
|
||||||
/*
|
* we need to cater for the holes when fetching the column values
|
||||||
* This function can be called before "ALTER TABLE ... ADD COLUMN nodecluster ...",
|
|
||||||
* therefore heap_deform_tuple() won't set the isNullArray for this column. We
|
|
||||||
* initialize it true to be safe in that case.
|
|
||||||
*/
|
*/
|
||||||
memset(isNullArray, true, sizeof(isNullArray));
|
int nAtts = tupleDescriptor->natts;
|
||||||
|
Datum *datumArray = palloc0(sizeof(Datum) * nAtts);
|
||||||
|
bool *isNullArray = palloc0(sizeof(bool) * nAtts);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We use heap_deform_tuple() instead of heap_getattr() to expand tuple
|
* We use heap_deform_tuple() instead of heap_getattr() to expand tuple
|
||||||
|
|
@ -3054,18 +3397,48 @@ TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple)
|
||||||
1]);
|
1]);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* nodecluster column can be missing. In the case of extension creation/upgrade,
|
* Attributes above this line are guaranteed to be present at the
|
||||||
* master_initialize_node_metadata function is called before the nodecluster
|
* exact defined attribute number. Atleast till now. If you are droping or
|
||||||
* column is added to pg_dist_node table.
|
* adding any of the above columns consider adjusting the code above
|
||||||
*/
|
*/
|
||||||
if (!isNullArray[Anum_pg_dist_node_nodecluster - 1])
|
Oid pgDistNodeRelId = RelationGetRelid(pgDistNode);
|
||||||
|
|
||||||
|
AttrNumber nodeClusterAttno = get_attnum(pgDistNodeRelId, "nodecluster");
|
||||||
|
|
||||||
|
if (nodeClusterAttno > 0 &&
|
||||||
|
!TupleDescAttr(tupleDescriptor, nodeClusterAttno - 1)->attisdropped &&
|
||||||
|
!isNullArray[nodeClusterAttno - 1])
|
||||||
{
|
{
|
||||||
Name nodeClusterName =
|
Name nodeClusterName =
|
||||||
DatumGetName(datumArray[Anum_pg_dist_node_nodecluster - 1]);
|
DatumGetName(datumArray[nodeClusterAttno - 1]);
|
||||||
char *nodeClusterString = NameStr(*nodeClusterName);
|
char *nodeClusterString = NameStr(*nodeClusterName);
|
||||||
strlcpy(workerNode->nodeCluster, nodeClusterString, NAMEDATALEN);
|
strlcpy(workerNode->nodeCluster, nodeClusterString, NAMEDATALEN);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (nAtts > Anum_pg_dist_node_nodeisclone)
|
||||||
|
{
|
||||||
|
AttrNumber nodeIsCloneAttno = get_attnum(pgDistNodeRelId, "nodeisclone");
|
||||||
|
if (nodeIsCloneAttno > 0 &&
|
||||||
|
!TupleDescAttr(tupleDescriptor, nodeIsCloneAttno - 1)->attisdropped &&
|
||||||
|
!isNullArray[nodeIsCloneAttno - 1])
|
||||||
|
{
|
||||||
|
workerNode->nodeisclone = DatumGetBool(datumArray[nodeIsCloneAttno - 1]);
|
||||||
|
}
|
||||||
|
AttrNumber nodePrimaryNodeIdAttno = get_attnum(pgDistNodeRelId,
|
||||||
|
"nodeprimarynodeid");
|
||||||
|
if (nodePrimaryNodeIdAttno > 0 &&
|
||||||
|
!TupleDescAttr(tupleDescriptor, nodePrimaryNodeIdAttno - 1)->attisdropped &&
|
||||||
|
!isNullArray[nodePrimaryNodeIdAttno - 1])
|
||||||
|
{
|
||||||
|
workerNode->nodeprimarynodeid = DatumGetInt32(datumArray[
|
||||||
|
nodePrimaryNodeIdAttno - 1])
|
||||||
|
;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pfree(datumArray);
|
||||||
|
pfree(isNullArray);
|
||||||
|
|
||||||
return workerNode;
|
return workerNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,422 @@
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include "utils/fmgrprotos.h"
|
||||||
|
#include "utils/pg_lsn.h"
|
||||||
|
|
||||||
|
#include "distributed/argutils.h"
|
||||||
|
#include "distributed/clonenode_utils.h"
|
||||||
|
#include "distributed/listutils.h"
|
||||||
|
#include "distributed/metadata_cache.h"
|
||||||
|
#include "distributed/metadata_sync.h"
|
||||||
|
#include "distributed/remote_commands.h"
|
||||||
|
#include "distributed/shard_rebalancer.h"
|
||||||
|
|
||||||
|
|
||||||
|
static void BlockAllWritesToWorkerNode(WorkerNode *workerNode);
|
||||||
|
static bool GetNodeIsInRecoveryStatus(WorkerNode *workerNode);
|
||||||
|
static void PromoteCloneNode(WorkerNode *cloneWorkerNode);
|
||||||
|
static void EnsureSingleNodePromotion(WorkerNode *primaryNode);
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(citus_promote_clone_and_rebalance);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* citus_promote_clone_and_rebalance promotes an inactive clone node to become
|
||||||
|
* the new primary node, replacing its original primary node.
|
||||||
|
*
|
||||||
|
* This function performs the following steps:
|
||||||
|
* 1. Validates that the clone node exists and is properly configured
|
||||||
|
* 2. Ensures the clone is inactive and has a valid primary node reference
|
||||||
|
* 3. Blocks all writes to the primary node to prevent data divergence
|
||||||
|
* 4. Waits for the clone to catch up with the primary's WAL position
|
||||||
|
* 5. Promotes the clone node to become a standalone primary
|
||||||
|
* 6. Updates metadata to mark the clone as active and primary
|
||||||
|
* 7. Rebalances shards between the old primary and new primary
|
||||||
|
* 8. Returns information about the promotion and any shard movements
|
||||||
|
*
|
||||||
|
* Arguments:
|
||||||
|
* - clone_nodeid: The node ID of the clone to promote
|
||||||
|
* - catchUpTimeoutSeconds: Maximum time to wait for clone to catch up (default: 300)
|
||||||
|
*
|
||||||
|
* The function ensures data consistency by blocking writes during the promotion
|
||||||
|
* process and verifying replication lag before proceeding.
|
||||||
|
*/
|
||||||
|
Datum
|
||||||
|
citus_promote_clone_and_rebalance(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
/* Ensure superuser and coordinator */
|
||||||
|
EnsureSuperUser();
|
||||||
|
EnsureCoordinator();
|
||||||
|
|
||||||
|
/* Get clone_nodeid argument */
|
||||||
|
int32 cloneNodeIdArg = PG_GETARG_INT32(0);
|
||||||
|
|
||||||
|
/* Get catchUpTimeoutSeconds argument with default value of 300 */
|
||||||
|
int32 catchUpTimeoutSeconds = PG_ARGISNULL(2) ? 300 : PG_GETARG_INT32(2);
|
||||||
|
|
||||||
|
/* Lock pg_dist_node to prevent concurrent modifications during this operation */
|
||||||
|
LockRelationOid(DistNodeRelationId(), RowExclusiveLock);
|
||||||
|
|
||||||
|
WorkerNode *cloneNode = FindNodeAnyClusterByNodeId(cloneNodeIdArg);
|
||||||
|
if (cloneNode == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("Clone node with ID %d not found.", cloneNodeIdArg)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!cloneNode->nodeisclone || cloneNode->nodeprimarynodeid == 0)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"Node %s:%d (ID %d) is not a valid clone or its primary node ID is not set.",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cloneNode->isActive)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"Clone node %s:%d (ID %d) is already active and cannot be promoted.",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
WorkerNode *primaryNode = FindNodeAnyClusterByNodeId(cloneNode->nodeprimarynodeid);
|
||||||
|
if (primaryNode == NULL)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("Primary node with ID %d (for clone %s:%d) not found.",
|
||||||
|
cloneNode->nodeprimarynodeid, cloneNode->workerName,
|
||||||
|
cloneNode->workerPort)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (primaryNode->nodeisclone)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("Primary node %s:%d (ID %d) is itself a clone.",
|
||||||
|
primaryNode->workerName, primaryNode->workerPort,
|
||||||
|
primaryNode->nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!primaryNode->isActive)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("Primary node %s:%d (ID %d) is not active.",
|
||||||
|
primaryNode->workerName, primaryNode->workerPort,
|
||||||
|
primaryNode->nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ensure the primary node is related to the clone node */
|
||||||
|
if (primaryNode->nodeId != cloneNode->nodeprimarynodeid)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"Clone node %s:%d (ID %d) is not a clone of the primary node %s:%d (ID %d).",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId,
|
||||||
|
primaryNode->workerName, primaryNode->workerPort,
|
||||||
|
primaryNode->nodeId)));
|
||||||
|
}
|
||||||
|
|
||||||
|
EnsureSingleNodePromotion(primaryNode);
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"Starting promotion process for clone node %s:%d (ID %d), original primary %s:%d (ID %d)",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId,
|
||||||
|
primaryNode->workerName, primaryNode->workerPort, primaryNode
|
||||||
|
->nodeId)));
|
||||||
|
|
||||||
|
/* Step 0: Check if clone is replica of provided primary node and is not synchronous */
|
||||||
|
char *operation = "promote";
|
||||||
|
EnsureValidCloneMode(primaryNode, cloneNode->workerName, cloneNode->workerPort,
|
||||||
|
operation);
|
||||||
|
|
||||||
|
/* Step 1: Block Writes on Original Primary's Shards */
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"Blocking writes on shards of original primary node %s:%d (group %d)",
|
||||||
|
primaryNode->workerName, primaryNode->workerPort, primaryNode
|
||||||
|
->groupId)));
|
||||||
|
|
||||||
|
BlockAllWritesToWorkerNode(primaryNode);
|
||||||
|
|
||||||
|
/* Step 2: Wait for Clone to Catch Up */
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"Waiting for clone %s:%d to catch up with primary %s:%d (timeout: %d seconds)",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort,
|
||||||
|
primaryNode->workerName, primaryNode->workerPort,
|
||||||
|
catchUpTimeoutSeconds)));
|
||||||
|
|
||||||
|
bool caughtUp = false;
|
||||||
|
const int sleepIntervalSeconds = 5;
|
||||||
|
int elapsedTimeSeconds = 0;
|
||||||
|
|
||||||
|
while (elapsedTimeSeconds < catchUpTimeoutSeconds)
|
||||||
|
{
|
||||||
|
uint64 repLag = GetReplicationLag(primaryNode, cloneNode);
|
||||||
|
if (repLag <= 0)
|
||||||
|
{
|
||||||
|
caughtUp = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
pg_usleep(sleepIntervalSeconds * 1000000L);
|
||||||
|
elapsedTimeSeconds += sleepIntervalSeconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!caughtUp)
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"Clone %s:%d failed to catch up with primary %s:%d within %d seconds.",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort,
|
||||||
|
primaryNode->workerName, primaryNode->workerPort,
|
||||||
|
catchUpTimeoutSeconds)));
|
||||||
|
}
|
||||||
|
|
||||||
|
ereport(NOTICE, (errmsg("Clone %s:%d is now caught up with primary %s:%d.",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort,
|
||||||
|
primaryNode->workerName, primaryNode->workerPort)));
|
||||||
|
|
||||||
|
|
||||||
|
/* Step 3: PostgreSQL Clone Promotion */
|
||||||
|
ereport(NOTICE, (errmsg("Attempting to promote clone %s:%d via pg_promote().",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort)));
|
||||||
|
|
||||||
|
PromoteCloneNode(cloneNode);
|
||||||
|
|
||||||
|
/* Step 4: Update Clone Metadata in pg_dist_node on Coordinator */
|
||||||
|
|
||||||
|
ereport(NOTICE, (errmsg("Updating metadata for promoted clone %s:%d (ID %d)",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
ActivateCloneNodeAsPrimary(cloneNode);
|
||||||
|
|
||||||
|
/* We need to sync metadata changes to all nodes before rebalancing shards
|
||||||
|
* since the rebalancing algorithm depends on the latest metadata.
|
||||||
|
*/
|
||||||
|
SyncNodeMetadataToNodes();
|
||||||
|
|
||||||
|
/* Step 5: Split Shards Between Primary and Clone */
|
||||||
|
SplitShardsBetweenPrimaryAndClone(primaryNode, cloneNode, PG_GETARG_NAME_OR_NULL(1))
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
|
TransactionModifiedNodeMetadata = true; /* Inform Citus about metadata change */
|
||||||
|
TriggerNodeMetadataSyncOnCommit(); /* Ensure changes are propagated */
|
||||||
|
|
||||||
|
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"Clone node %s:%d (ID %d) metadata updated. It is now a primary",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
|
||||||
|
|
||||||
|
/* Step 6: Unblock Writes (should be handled by transaction commit) */
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"Clone node %s:%d (ID %d) successfully registered as a worker node",
|
||||||
|
cloneNode->workerName, cloneNode->workerPort, cloneNode->
|
||||||
|
nodeId)));
|
||||||
|
|
||||||
|
PG_RETURN_VOID();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* PromoteCloneNode promotes a clone node to a primary node using PostgreSQL's
|
||||||
|
* pg_promote() function.
|
||||||
|
*
|
||||||
|
* This function performs the following steps:
|
||||||
|
* 1. Connects to the clone node
|
||||||
|
* 2. Executes pg_promote(wait := true) to promote the clone to primary
|
||||||
|
* 3. Reconnects to verify the promotion was successful
|
||||||
|
* 4. Checks if the node is still in recovery mode (which would indicate failure)
|
||||||
|
*
|
||||||
|
* The function throws an ERROR if:
|
||||||
|
* - Connection to the clone node fails
|
||||||
|
* - The pg_promote() command fails
|
||||||
|
* - The clone is still in recovery mode after promotion attempt
|
||||||
|
*
|
||||||
|
* On success, it logs a NOTICE message confirming the promotion.
|
||||||
|
*
|
||||||
|
* Note: This function assumes the clone has already been validated for promotion
|
||||||
|
* (e.g., replication lag is acceptable, clone is not synchronous, etc.)
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
PromoteCloneNode(WorkerNode *cloneWorkerNode)
|
||||||
|
{
|
||||||
|
/* Step 1: Connect to the clone node */
|
||||||
|
int connectionFlag = 0;
|
||||||
|
MultiConnection *cloneConnection = GetNodeConnection(connectionFlag,
|
||||||
|
cloneWorkerNode->workerName,
|
||||||
|
cloneWorkerNode->workerPort);
|
||||||
|
|
||||||
|
if (PQstatus(cloneConnection->pgConn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
ReportConnectionError(cloneConnection, ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Step 2: Execute pg_promote() to promote the clone to primary */
|
||||||
|
const char *promoteQuery = "SELECT pg_promote(wait := true);";
|
||||||
|
int resultCode = SendRemoteCommand(cloneConnection, promoteQuery);
|
||||||
|
if (resultCode == 0)
|
||||||
|
{
|
||||||
|
ReportConnectionError(cloneConnection, ERROR);
|
||||||
|
}
|
||||||
|
ForgetResults(cloneConnection);
|
||||||
|
CloseConnection(cloneConnection);
|
||||||
|
|
||||||
|
/* Step 3: Reconnect and verify the promotion was successful */
|
||||||
|
if (GetNodeIsInRecoveryStatus(cloneWorkerNode))
|
||||||
|
{
|
||||||
|
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg(
|
||||||
|
"Failed to promote clone %s:%d (ID %d). It is still in recovery.",
|
||||||
|
cloneWorkerNode->workerName, cloneWorkerNode->workerPort,
|
||||||
|
cloneWorkerNode->nodeId)));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ereport(NOTICE, (errmsg(
|
||||||
|
"Clone node %s:%d (ID %d) has been successfully promoted.",
|
||||||
|
cloneWorkerNode->workerName, cloneWorkerNode->workerPort,
|
||||||
|
cloneWorkerNode->nodeId)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
BlockAllWritesToWorkerNode(WorkerNode *workerNode)
|
||||||
|
{
|
||||||
|
ereport(NOTICE, (errmsg("Blocking all writes to worker node %s:%d (ID %d)",
|
||||||
|
workerNode->workerName, workerNode->workerPort, workerNode->
|
||||||
|
nodeId)));
|
||||||
|
|
||||||
|
LockShardsInWorkerPlacementList(workerNode, AccessExclusiveLock);
|
||||||
|
}

/*
 * GetNodeIsInRecoveryStatus checks if a PostgreSQL node is currently in recovery mode.
 *
 * This function connects to the specified worker node and executes pg_is_in_recovery()
 * to determine if the node is still acting as a replica (in recovery) or has been
 * promoted to a primary (not in recovery).
 *
 * Arguments:
 * - workerNode: The WorkerNode to check recovery status for
 *
 * Returns:
 * - true if the node is in recovery mode (acting as a replica)
 * - false if the node is not in recovery mode (acting as a primary)
 *
 * The function will ERROR if:
 * - Cannot establish a connection to the node
 * - The remote query fails
 * - The query result cannot be parsed
 *
 * This is used after promoting a clone node to verify that the
 * promotion was successful and the node is no longer in recovery mode.
 */
static bool
GetNodeIsInRecoveryStatus(WorkerNode *workerNode)
{
	int connectionFlag = 0;
	MultiConnection *nodeConnection = GetNodeConnection(connectionFlag,
														workerNode->workerName,
														workerNode->workerPort);

	if (PQstatus(nodeConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(nodeConnection, ERROR);
	}

	const char *recoveryQuery = "SELECT pg_is_in_recovery();";
	int resultCode = SendRemoteCommand(nodeConnection, recoveryQuery);
	if (resultCode == 0)
	{
		ReportConnectionError(nodeConnection, ERROR);
	}

	PGresult *result = GetRemoteCommandResult(nodeConnection, true);
	if (!IsResponseOK(result))
	{
		ReportResultError(nodeConnection, result, ERROR);
	}

	List *recoveryStatusList = ReadFirstColumnAsText(result);
	if (list_length(recoveryStatusList) != 1)
	{
		PQclear(result);
		ClearResults(nodeConnection, true);
		CloseConnection(nodeConnection);

		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse recovery status result from %s:%d",
							   workerNode->workerName, workerNode->workerPort)));
	}

	StringInfo recoveryStatusInfo = (StringInfo) linitial(recoveryStatusList);
	bool isInRecovery = (strcmp(recoveryStatusInfo->data, "t") == 0) ||
						(strcmp(recoveryStatusInfo->data, "true") == 0);

	PQclear(result);
	ForgetResults(nodeConnection);
	CloseConnection(nodeConnection);

	return isInRecovery;
}
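
The same verification can be done by hand against the clone after promotion; `pg_is_in_recovery()` returning false is exactly what the check above treats as a successful promotion:

```
-- Run against the promoted clone; 'f' means it is no longer in recovery.
SELECT pg_is_in_recovery();
```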

/*
 * EnsureSingleNodePromotion ensures that only one node promotion operation
 * can proceed at a time by acquiring necessary locks and checking for
 * conflicting operations.
 *
 * This function performs the following safety checks:
 * 1. Verifies no rebalance operations are currently running, as they would
 *    conflict with the shard redistribution that occurs during promotion
 * 2. Acquires exclusive placement colocation locks on all shards residing
 *    on the primary node's group to prevent concurrent shard operations
 *
 * The locks are acquired in shard ID order to prevent deadlocks when
 * multiple operations attempt to lock the same set of shards.
 *
 * Arguments:
 * - primaryNode: The primary node whose shards need to be locked
 *
 * Throws ERROR if:
 * - A rebalance operation is already running
 * - Unable to acquire necessary locks
 */
static void
EnsureSingleNodePromotion(WorkerNode *primaryNode)
{
	/* Error out if a rebalance is already running */
	int64 jobId = 0;
	if (HasNonTerminalJobOfType("rebalance", &jobId))
	{
		ereport(ERROR, (
					errmsg("A rebalance operation is already running as job %ld", jobId),
					errdetail("A rebalance was already scheduled as a background job"),
					errhint("To monitor progress, run: SELECT * FROM "
							"citus_rebalance_status();")));
	}

	List *placementList = AllShardPlacementsOnNodeGroup(primaryNode->groupId);

	/* lock shards in order of shard id to prevent deadlock */
	placementList = SortList(placementList, CompareShardPlacementsByShardId);

	GroupShardPlacement *placement = NULL;
	foreach_declared_ptr(placement, placementList)
	{
		int64 shardId = placement->shardId;
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		Oid distributedTableId = shardInterval->relationId;

		AcquirePlacementColocationLock(distributedTableId, ExclusiveLock, "promote clone");
	}
}
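
Before calling the promotion UDF, an operator can look for a conflicting background rebalance the same way the error hint suggests; if this shows a job that is still running, the promotion will refuse to proceed:

```
-- Any non-terminal rebalance job here will block clone promotion.
SELECT * FROM citus_rebalance_status();
```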

@@ -81,8 +81,29 @@ typedef struct RebalanceOptions
	Form_pg_dist_rebalance_strategy rebalanceStrategy;
	const char *operationName;
	WorkerNode *workerNode;
	List *involvedWorkerNodeList;
} RebalanceOptions;


typedef struct SplitPrimaryCloneShards
{
	/*
	 * primaryShardIdList contains the IDs of the shards that
	 * should stay on the primary worker node.
	 */
	List *primaryShardIdList;

	/*
	 * cloneShardIdList contains the IDs of the shards that should stay on
	 * the clone worker node.
	 */
	List *cloneShardIdList;
} SplitPrimaryCloneShards;


static SplitPrimaryCloneShards * GetPrimaryCloneSplitRebalanceSteps(RebalanceOptions *options,
																	WorkerNode *cloneNode);

/*
 * RebalanceState is used to keep the internal state of the rebalance
@@ -324,6 +345,7 @@ PG_FUNCTION_INFO_V1(pg_dist_rebalance_strategy_enterprise_check);
PG_FUNCTION_INFO_V1(citus_rebalance_start);
PG_FUNCTION_INFO_V1(citus_rebalance_stop);
PG_FUNCTION_INFO_V1(citus_rebalance_wait);
PG_FUNCTION_INFO_V1(get_snapshot_based_node_split_plan);

bool RunningUnderCitusTestSuite = false;
int MaxRebalancerLoggedIgnoredMoves = 5;
@@ -523,8 +545,17 @@ GetRebalanceSteps(RebalanceOptions *options)
		.context = &context,
	};

	if (options->involvedWorkerNodeList == NULL)
	{
		/*
		 * If the user did not specify a list of worker nodes, we use all the
		 * active worker nodes.
		 */
		options->involvedWorkerNodeList = SortedActiveWorkers();
	}

	/* sort the lists to make the function more deterministic */
	List *activeWorkerList = options->involvedWorkerNodeList; /* SortedActiveWorkers(); */
	int shardAllowedNodeCount = 0;
	WorkerNode *workerNode = NULL;
	foreach_declared_ptr(workerNode, activeWorkerList)
@@ -987,6 +1018,7 @@ rebalance_table_shards(PG_FUNCTION_ARGS)
		.excludedShardArray = PG_GETARG_ARRAYTYPE_P(3),
		.drainOnly = PG_GETARG_BOOL(5),
		.rebalanceStrategy = strategy,
		.involvedWorkerNodeList = NULL,
		.improvementThreshold = strategy->improvementThreshold,
	};
	Oid shardTransferModeOid = PG_GETARG_OID(4);
@@ -3607,6 +3639,352 @@ EnsureShardCostUDF(Oid functionOid)
}


/*
 * SplitShardsBetweenPrimaryAndClone splits the shards currently placed on the
 * primary node between the primary and clone nodes, adding them to the
 * respective shard ID lists and adjusting the metadata accordingly.
 */
void
SplitShardsBetweenPrimaryAndClone(WorkerNode *primaryNode,
								  WorkerNode *cloneNode,
								  Name strategyName)
{
	CheckCitusVersion(ERROR);

	List *relationIdList = NonColocatedDistRelationIdList();

	/* we use the default strategy for now */
	Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy(strategyName);

	RebalanceOptions options = {
		.relationIdList = relationIdList,
		.threshold = 0, /* Threshold is not strictly needed for two nodes */
		.maxShardMoves = -1, /* No limit on moves between these two nodes */
		.excludedShardArray = construct_empty_array(INT8OID),
		.drainOnly = false, /* Not a drain operation */
		.rebalanceStrategy = strategy,
		.improvementThreshold = 0, /* Consider all beneficial moves */
		.workerNode = primaryNode /* indicate Primary node as the source node */
	};

	SplitPrimaryCloneShards *splitShards =
		GetPrimaryCloneSplitRebalanceSteps(&options, cloneNode);
	AdjustShardsForPrimaryCloneNodeSplit(primaryNode, cloneNode,
										 splitShards->primaryShardIdList,
										 splitShards->cloneShardIdList);
}

/*
 * GetPrimaryCloneSplitRebalanceSteps decides, for every shard placement on the
 * primary (source) node, whether the shard should stay on the primary or move
 * to the clone node, and returns the two shard ID lists in a
 * SplitPrimaryCloneShards struct.
 */
static SplitPrimaryCloneShards *
GetPrimaryCloneSplitRebalanceSteps(RebalanceOptions *options, WorkerNode *cloneNode)
{
	WorkerNode *sourceNode = options->workerNode;
	WorkerNode *targetNode = cloneNode;

	/* Initialize rebalance plan functions and context */
	EnsureShardCostUDF(options->rebalanceStrategy->shardCostFunction);
	EnsureNodeCapacityUDF(options->rebalanceStrategy->nodeCapacityFunction);
	EnsureShardAllowedOnNodeUDF(options->rebalanceStrategy->shardAllowedOnNodeFunction);

	RebalanceContext context;
	memset(&context, 0, sizeof(RebalanceContext));
	fmgr_info(options->rebalanceStrategy->shardCostFunction, &context.shardCostUDF);
	fmgr_info(options->rebalanceStrategy->nodeCapacityFunction, &context.nodeCapacityUDF);
	fmgr_info(options->rebalanceStrategy->shardAllowedOnNodeFunction,
			  &context.shardAllowedOnNodeUDF);

	RebalancePlanFunctions rebalancePlanFunctions = {
		.shardAllowedOnNode = ShardAllowedOnNode,
		.nodeCapacity = NodeCapacity,
		.shardCost = GetShardCost,
		.context = &context,
	};

	/*
	 * Collect all active shard placements on the source node for the given relations.
	 * Unlike the main rebalancer, we build a single list of all relevant source placements
	 * across all specified relations (or all relations if none specified).
	 */
	List *allSourcePlacements = NIL;
	Oid relationIdItr = InvalidOid;
	foreach_declared_oid(relationIdItr, options->relationIdList)
	{
		List *shardPlacementList = FullShardPlacementList(relationIdItr,
														  options->excludedShardArray);
		List *activeShardPlacementsForRelation =
			FilterShardPlacementList(shardPlacementList, IsActiveShardPlacement);

		ShardPlacement *placement = NULL;
		foreach_declared_ptr(placement, activeShardPlacementsForRelation)
		{
			if (placement->nodeId == sourceNode->nodeId)
			{
				/* Ensure we don't add a duplicate shardId if it's somehow listed under multiple relations */
				bool alreadyAdded = false;
				ShardPlacement *existingPlacement = NULL;
				foreach_declared_ptr(existingPlacement, allSourcePlacements)
				{
					if (existingPlacement->shardId == placement->shardId)
					{
						alreadyAdded = true;
						break;
					}
				}
				if (!alreadyAdded)
				{
					allSourcePlacements = lappend(allSourcePlacements, placement);
				}
			}
		}
	}

	List *activeWorkerList = list_make2(options->workerNode, cloneNode);
	SplitPrimaryCloneShards *splitShards = palloc0(sizeof(SplitPrimaryCloneShards));
	splitShards->primaryShardIdList = NIL;
	splitShards->cloneShardIdList = NIL;

	if (list_length(allSourcePlacements) > 0)
	{
		/*
		 * Initialize RebalanceState considering only the source node's shards
		 * and the two active workers (source and target).
		 */
		RebalanceState *state = InitRebalanceState(activeWorkerList, allSourcePlacements,
												   &rebalancePlanFunctions);

		NodeFillState *sourceFillState = NULL;
		NodeFillState *targetFillState = NULL;
		ListCell *fsc = NULL;

		/* Identify the fill states for our specific source and target nodes */
		foreach(fsc, state->fillStateListAsc) /* Could be fillStateListDesc too, order doesn't matter here */
		{
			NodeFillState *fs = (NodeFillState *) lfirst(fsc);
			if (fs->node->nodeId == sourceNode->nodeId)
			{
				sourceFillState = fs;
			}
			else if (fs->node->nodeId == targetNode->nodeId)
			{
				targetFillState = fs;
			}
		}

		if (sourceFillState != NULL && targetFillState != NULL)
		{
			/*
			 * The goal is to move roughly half the total cost from source to target.
			 * The target node is assumed to be empty; its existing load is not
			 * considered for this two-node balancing plan. We calculate costs based
			 * *only* on the shards currently on the source node.
			 *
			 * The core idea is to simulate the balancing process between these two
			 * nodes: all shards start on sourceFillState, targetFillState starts
			 * empty (in terms of these shards), and we move shards from source to
			 * target until their costs are as balanced as possible.
			 */
			float4 sourceCurrentCost = sourceFillState->totalCost;
			float4 targetCurrentCost = 0; /* Representing cost on target from these source shards */

			/* Sort shards on the source node by cost (descending). This is a common heuristic. */
			sourceFillState->shardCostListDesc = SortList(sourceFillState->shardCostListDesc,
														  CompareShardCostDesc);

			List *potentialMoves = NIL;
			ListCell *lc_shardcost = NULL;

			/*
			 * Iterate through each shard on the source node and greedily decide
			 * whether moving it to the target reduces the cost difference between
			 * the two nodes. Current difference:
			 *   abs(sourceCurrentCost - targetCurrentCost)
			 * Difference after the move:
			 *   abs((sourceCurrentCost - shardCost) - (targetCurrentCost + shardCost))
			 * The shard is moved only if the new difference is smaller.
			 */
			foreach(lc_shardcost, sourceFillState->shardCostListDesc)
			{
				ShardCost *shardToConsider = (ShardCost *) lfirst(lc_shardcost);

				float4 costOfShard = shardToConsider->cost;
				float4 diffBefore = fabsf(sourceCurrentCost - targetCurrentCost);
				float4 diffAfter = fabsf((sourceCurrentCost - costOfShard) -
										 (targetCurrentCost + costOfShard));

				if (diffAfter < diffBefore)
				{
					PlacementUpdateEvent *update = palloc0(sizeof(PlacementUpdateEvent));
					update->shardId = shardToConsider->shardId;
					update->sourceNode = sourceNode;
					update->targetNode = targetNode;
					update->updateType = PLACEMENT_UPDATE_MOVE;
					potentialMoves = lappend(potentialMoves, update);
					splitShards->cloneShardIdList = lappend_int(splitShards->cloneShardIdList,
																shardToConsider->shardId);

					/* Update simulated costs for the next iteration */
					sourceCurrentCost -= costOfShard;
					targetCurrentCost += costOfShard;
				}
				else
				{
					splitShards->primaryShardIdList = lappend_int(splitShards->primaryShardIdList,
																  shardToConsider->shardId);
				}
			}
		}

		/* RebalanceState is in a memory context, will be cleaned up */
	}
	return splitShards;
}
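
As a quick worked example of the greedy test above (the numbers are purely illustrative): with a remaining source cost of 7 and a target cost of 0, moving a shard of cost 4 shrinks the imbalance from 7 to 1, so that shard is assigned to the clone list:

```
-- Illustrative arithmetic only: diff_after < diff_before, so the move is taken.
SELECT abs(7 - 0) AS diff_before,
       abs((7 - 4) - (0 + 4)) AS diff_after;
```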


/*
 * get_snapshot_based_node_split_plan outputs the shard placement plan
 * for a primary/replica based node split, without executing it.
 *
 * SQL signature:
 * get_snapshot_based_node_split_plan(
 *     primary_node_name text,
 *     primary_node_port integer,
 *     replica_node_name text,
 *     replica_node_port integer,
 *     rebalance_strategy name DEFAULT NULL
 * )
 */
Datum
get_snapshot_based_node_split_plan(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	text *primaryNodeNameText = PG_GETARG_TEXT_P(0);
	int32 primaryNodePort = PG_GETARG_INT32(1);
	text *cloneNodeNameText = PG_GETARG_TEXT_P(2);
	int32 cloneNodePort = PG_GETARG_INT32(3);

	char *primaryNodeName = text_to_cstring(primaryNodeNameText);
	char *cloneNodeName = text_to_cstring(cloneNodeNameText);

	WorkerNode *primaryNode = FindWorkerNodeOrError(primaryNodeName, primaryNodePort);
	WorkerNode *cloneNode = FindWorkerNodeOrError(cloneNodeName, cloneNodePort);

	if (!cloneNode->nodeisclone || cloneNode->nodeprimarynodeid == 0)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("Node %s:%d (ID %d) is not a valid clone or its primary node ID is not set.",
							   cloneNode->workerName, cloneNode->workerPort,
							   cloneNode->nodeId)));
	}
	if (primaryNode->nodeisclone)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("Primary node %s:%d (ID %d) is itself a replica.",
							   primaryNode->workerName, primaryNode->workerPort,
							   primaryNode->nodeId)));
	}

	/* Ensure the primary node is related to the replica node */
	if (primaryNode->nodeId != cloneNode->nodeprimarynodeid)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("Clone node %s:%d (ID %d) is not a clone of the primary node %s:%d (ID %d).",
							   cloneNode->workerName, cloneNode->workerPort,
							   cloneNode->nodeId,
							   primaryNode->workerName, primaryNode->workerPort,
							   primaryNode->nodeId)));
	}

	List *relationIdList = NonColocatedDistRelationIdList();

	Form_pg_dist_rebalance_strategy strategy = GetRebalanceStrategy(
		PG_GETARG_NAME_OR_NULL(4));

	RebalanceOptions options = {
		.relationIdList = relationIdList,
		.threshold = 0, /* Threshold is not strictly needed for two nodes */
		.maxShardMoves = -1, /* No limit on moves between these two nodes */
		.excludedShardArray = construct_empty_array(INT8OID),
		.drainOnly = false, /* Not a drain operation */
		.rebalanceStrategy = strategy,
		.improvementThreshold = 0, /* Consider all beneficial moves */
		.workerNode = primaryNode /* indicate Primary node as the source node */
	};

	SplitPrimaryCloneShards *splitShards = GetPrimaryCloneSplitRebalanceSteps(&options,
																			  cloneNode);

	int shardId = 0;
	TupleDesc tupdesc;
	Tuplestorestate *tupstore = SetupTuplestore(fcinfo, &tupdesc);
	Datum values[4];
	bool nulls[4];

	foreach_declared_int(shardId, splitShards->primaryShardIdList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
		ListCell *colocatedShardCell = NULL;
		foreach(colocatedShardCell, colocatedShardList)
		{
			ShardInterval *colocatedShard = lfirst(colocatedShardCell);
			int colocatedShardId = colocatedShard->shardId;
			memset(values, 0, sizeof(values));
			memset(nulls, 0, sizeof(nulls));

			values[0] = ObjectIdGetDatum(RelationIdForShard(colocatedShardId));
			values[1] = UInt64GetDatum(colocatedShardId);
			values[2] = UInt64GetDatum(ShardLength(colocatedShardId));
			values[3] = PointerGetDatum(cstring_to_text("Primary Node"));
			tuplestore_putvalues(tupstore, tupdesc, values, nulls);
		}
	}

	foreach_declared_int(shardId, splitShards->cloneShardIdList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
		ListCell *colocatedShardCell = NULL;
		foreach(colocatedShardCell, colocatedShardList)
		{
			ShardInterval *colocatedShard = lfirst(colocatedShardCell);
			int colocatedShardId = colocatedShard->shardId;
			memset(values, 0, sizeof(values));
			memset(nulls, 0, sizeof(nulls));

			values[0] = ObjectIdGetDatum(RelationIdForShard(colocatedShardId));
			values[1] = UInt64GetDatum(colocatedShardId);
			values[2] = UInt64GetDatum(ShardLength(colocatedShardId));
			values[3] = PointerGetDatum(cstring_to_text("Clone Node"));
			tuplestore_putvalues(tupstore, tupdesc, values, nulls);
		}
	}

	return (Datum) 0;
}

/*
 * EnsureNodeCapacityUDF checks that the UDF matching the oid has the correct
 * signature to be used as a NodeCapacity function. The expected signature is:
@@ -759,6 +759,205 @@ TransferShards(int64 shardId, char *sourceNodeName,
}


/*
 * AdjustShardsForPrimaryCloneNodeSplit is called when a primary-clone node split
 * occurs. It adjusts the shard placements between the primary and clone nodes based
 * on the provided shard lists. Since the clone is an exact replica of the primary
 * but the metadata is not aware of this replication, this function updates the
 * metadata to reflect the new shard distribution.
 *
 * The function handles three types of shards:
 *
 * 1. Shards moving to the clone node (cloneShardList):
 *    - Updates shard placement metadata to move placements from primary to clone
 *    - No data movement is needed since the clone already has the data
 *    - Adds cleanup records to remove the shard data from primary at transaction commit
 *
 * 2. Shards staying on the primary node (primaryShardList):
 *    - Metadata already correctly reflects these shards on primary
 *    - Adds cleanup records to remove the shard data from the clone node
 *
 * 3. Reference tables:
 *    - Inserts new placement records on the clone node
 *    - Data is already present on the clone, so only a metadata update is needed
 *
 * This function does not perform any actual data movement; it only updates the
 * shard placement metadata and schedules cleanup operations for later execution.
 */
void
AdjustShardsForPrimaryCloneNodeSplit(WorkerNode *primaryNode,
									 WorkerNode *cloneNode,
									 List *primaryShardList,
									 List *cloneShardList)
{
	/* Input validation */
	if (primaryNode == NULL || cloneNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary or clone worker node is NULL")));
	}

	if (primaryNode->nodeId == cloneNode->nodeId)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary and clone nodes must be different")));
	}

	ereport(NOTICE, (errmsg("adjusting shard placements for primary %s:%d and clone %s:%d",
							primaryNode->workerName, primaryNode->workerPort,
							cloneNode->workerName, cloneNode->workerPort)));

	RegisterOperationNeedingCleanup();

	uint64 shardId = 0;
	uint32 primaryGroupId = GroupForNode(primaryNode->workerName, primaryNode->workerPort);
	uint32 cloneGroupId = GroupForNode(cloneNode->workerName, cloneNode->workerPort);

	ereport(NOTICE, (errmsg("processing %d shards for primary node GroupID %d",
							list_length(primaryShardList), primaryGroupId)));

	/*
	 * Process shards that will stay on the primary node. For each of them,
	 * insert cleanup records to remove the shard data from the clone node.
	 * The metadata already correctly reflects these shards on primary, so no
	 * metadata changes are needed.
	 */
	foreach_declared_int(shardId, primaryShardList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);

		char *qualifiedShardName = ConstructQualifiedShardName(shardInterval);
		ereport(LOG, (errmsg("inserting DELETE shard record for shard %s from clone node GroupID %d",
							 qualifiedShardName, cloneGroupId)));

		InsertCleanupRecordsForShardPlacementsOnNode(colocatedShardList,
													 cloneGroupId);
	}

	/*
	 * Process shards that will move to the clone node. For these shards, we need to:
	 * 1. Update metadata to move placements from primary to clone
	 * 2. Remove the shard data from primary (via cleanup records)
	 * 3. No data movement needed since the clone already has the data
	 */
	ereport(NOTICE, (errmsg("processing %d shards for clone node GroupID %d",
							list_length(cloneShardList), cloneGroupId)));

	foreach_declared_int(shardId, cloneShardList)
	{
		ShardInterval *shardInterval = LoadShardInterval(shardId);
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);

		/*
		 * Create new shard placement records on the clone node for all
		 * colocated shards. This moves the shard placements from primary
		 * to clone in the metadata.
		 */
		foreach_declared_ptr(shardInterval, colocatedShardList)
		{
			uint64 colocatedShardId = shardInterval->shardId;

			uint64 placementId = GetNextPlacementId();
			InsertShardPlacementRow(colocatedShardId, placementId,
									ShardLength(colocatedShardId),
									cloneGroupId);
		}

		/*
		 * Update the metadata on worker nodes to reflect the new shard
		 * placement distribution between primary and clone nodes.
		 */
		UpdateColocatedShardPlacementMetadataOnWorkers(shardId,
													   primaryNode->workerName,
													   primaryNode->workerPort,
													   cloneNode->workerName,
													   cloneNode->workerPort);

		/*
		 * Remove the shard placement records from primary node metadata
		 * since these shards are now served from the clone node.
		 */
		DropShardPlacementsFromMetadata(colocatedShardList,
										primaryNode->workerName, primaryNode->workerPort);

		char *qualifiedShardName = ConstructQualifiedShardName(shardInterval);
		ereport(LOG, (errmsg("inserting DELETE shard record for shard %s from primary node GroupID %d",
							 qualifiedShardName, primaryGroupId)));

		/*
		 * Insert cleanup records to remove the shard data from the primary node
		 * at transaction commit. This frees up space on the primary node
		 * since the data is now served from the clone node.
		 */
		InsertCleanupRecordsForShardPlacementsOnNode(colocatedShardList,
													 primaryGroupId);
	}

	/*
	 * Handle reference tables - these need to be available on both
	 * primary and clone nodes. Since the clone already has the data,
	 * we just need to insert placement records for the clone node.
	 */
	int colocationId = GetReferenceTableColocationId();

	if (colocationId == INVALID_COLOCATION_ID)
	{
		/* we have no reference table yet. */
		return;
	}
	ShardInterval *shardInterval = NULL;
	List *referenceTableIdList = CitusTableTypeIdList(REFERENCE_TABLE);
	Oid referenceTableId = linitial_oid(referenceTableIdList);
	List *shardIntervalList = LoadShardIntervalList(referenceTableId);
	foreach_declared_ptr(shardInterval, shardIntervalList)
	{
		List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
		ShardInterval *colocatedShardInterval = NULL;

		/*
		 * For each reference table shard, create placement records on the
		 * clone node. The data is already present on the clone, so we only
		 * need to update the metadata to make the clone aware of these shards.
		 */
		foreach_declared_ptr(colocatedShardInterval, colocatedShardList)
		{
			uint64 colocatedShardId = colocatedShardInterval->shardId;

			/*
			 * Insert a shard placement record for the clone node and
			 * propagate the metadata change to worker nodes.
			 */
			uint64 placementId = GetNextPlacementId();
			InsertShardPlacementRow(colocatedShardId, placementId,
									ShardLength(colocatedShardId),
									cloneGroupId);

			char *placementCommand = PlacementUpsertCommand(colocatedShardId, placementId,
															0, cloneGroupId);

			SendCommandToWorkersWithMetadata(placementCommand);
		}
	}

	ereport(NOTICE, (errmsg("shard placement adjustment complete for primary %s:%d and clone %s:%d",
							primaryNode->workerName, primaryNode->workerPort,
							cloneNode->workerName, cloneNode->workerPort)));
}
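
After the metadata adjustment commits, the resulting distribution can be checked from the coordinator. One simple sanity check (among several possible) is to count placements per worker via the citus_shards view:

```
-- Shard placements per worker after the split.
SELECT nodename, nodeport, count(*) AS shard_count
FROM citus_shards
GROUP BY nodename, nodeport
ORDER BY nodename, nodeport;
```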

/*
 * Insert deferred cleanup records.
 * The shards will be dropped by the background cleaner later.
@@ -2269,6 +2468,7 @@ UpdateColocatedShardPlacementMetadataOnWorkers(int64 shardId,
						 "SELECT citus_internal.update_placement_metadata(%ld, %d, %d)",
						 colocatedShard->shardId,
						 sourceGroupId, targetGroupId);

		SendCommandToWorkersWithMetadata(updateCommand->data);
	}
}
@@ -0,0 +1,7 @@
-- Add replica information columns to pg_dist_node
ALTER TABLE pg_catalog.pg_dist_node ADD COLUMN nodeisclone BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE pg_catalog.pg_dist_node ADD COLUMN nodeprimarynodeid INT4 NOT NULL DEFAULT 0;

-- Add comments to the new columns for clarity in \d output
COMMENT ON COLUMN pg_catalog.pg_dist_node.nodeisclone IS 'Indicates if this node is a replica of another node.';
COMMENT ON COLUMN pg_catalog.pg_dist_node.nodeprimarynodeid IS 'If nodeisclone is true, this stores the nodeid of its primary node.';
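
With these columns in place, registered clones and the primary node each one replicates can be listed straight from the catalog, for example:

```
-- List clone nodes and their primary node IDs.
SELECT nodeid, nodename, nodeport, nodeprimarynodeid
FROM pg_catalog.pg_dist_node
WHERE nodeisclone;
```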
@@ -0,0 +1,3 @@
-- Remove clone information columns from pg_dist_node
ALTER TABLE pg_catalog.pg_dist_node DROP COLUMN IF EXISTS nodeisclone;
ALTER TABLE pg_catalog.pg_dist_node DROP COLUMN IF EXISTS nodeprimarynodeid;
@@ -3,6 +3,12 @@
-- bump version to 13.2-1
#include "udfs/worker_last_saved_explain_analyze/13.2-1.sql"

#include "cat_upgrades/add_clone_info_to_pg_dist_node.sql"
#include "udfs/citus_add_clone_node/13.2-1.sql"
#include "udfs/citus_remove_clone_node/13.2-1.sql"
#include "udfs/citus_promote_clone_and_rebalance/13.2-1.sql"
#include "udfs/get_snapshot_based_node_split_plan/13.2-1.sql"

#include "udfs/citus_rebalance_start/13.2-1.sql"
#include "udfs/citus_internal_copy_single_shard_placement/13.2-1.sql"
@@ -8,6 +8,16 @@ DROP FUNCTION IF EXISTS pg_catalog.citus_rebalance_start(name, boolean, citus.sh
DROP FUNCTION IF EXISTS pg_catalog.worker_last_saved_explain_analyze();
#include "../udfs/worker_last_saved_explain_analyze/9.4-1.sql"

DROP FUNCTION IF EXISTS pg_catalog.citus_add_clone_node(text, integer, text, integer);
DROP FUNCTION IF EXISTS pg_catalog.citus_add_clone_node_with_nodeid(text, integer, integer);

DROP FUNCTION IF EXISTS pg_catalog.citus_remove_clone_node(text, integer);
DROP FUNCTION IF EXISTS pg_catalog.citus_remove_clone_node_with_nodeid(integer);

DROP FUNCTION IF EXISTS pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer);
DROP FUNCTION IF EXISTS pg_catalog.get_snapshot_based_node_split_plan(text, integer, text, integer, name);

#include "../cat_upgrades/remove_clone_info_to_pg_dist_node.sql"
#include "../udfs/citus_finish_pg_upgrade/13.1-1.sql"

-- Note that we intentionally don't add the old columnar objects back to the "citus"
@@ -0,0 +1,26 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_add_clone_node(
    replica_hostname text,
    replica_port integer,
    primary_hostname text,
    primary_port integer)
RETURNS INTEGER
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_add_clone_node$$;

COMMENT ON FUNCTION pg_catalog.citus_add_clone_node(text, integer, text, integer) IS
'Adds a new node as a clone of an existing primary node. The clone is initially inactive. Returns the nodeid of the new clone node.';

REVOKE ALL ON FUNCTION pg_catalog.citus_add_clone_node(text, int, text, int) FROM PUBLIC;

CREATE OR REPLACE FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(
    replica_hostname text,
    replica_port integer,
    primary_nodeid integer)
RETURNS INTEGER
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_add_clone_node_with_nodeid$$;

COMMENT ON FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(text, integer, integer) IS
'Adds a new node as a clone of an existing primary node using the primary node''s ID. The clone is initially inactive. Returns the nodeid of the new clone node.';

REVOKE ALL ON FUNCTION pg_catalog.citus_add_clone_node_with_nodeid(text, int, int) FROM PUBLIC;
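
A minimal registration call (hostnames and ports below are illustrative):

```
-- Register 127.0.0.1:5453 as a clone of the worker at 127.0.0.1:5433;
-- returns the nodeid of the new clone node.
SELECT citus_add_clone_node('127.0.0.1', 5453, '127.0.0.1', 5433);
```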
src/backend/distributed/sql/udfs/citus_promote_clone_and_rebalance/13.2-1.sql (new generated file, 13 lines):
@@ -0,0 +1,13 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_promote_clone_and_rebalance(
    clone_nodeid integer,
    rebalance_strategy name DEFAULT NULL,
    catchup_timeout_seconds integer DEFAULT 300
)
RETURNS VOID
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;

COMMENT ON FUNCTION pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer) IS
'Promotes a registered clone node to a primary, performs necessary metadata updates, and rebalances a portion of shards from its original primary to the newly promoted node. The catchup_timeout_seconds parameter controls how long to wait for the clone to catch up with the primary (default: 300 seconds).';

REVOKE ALL ON FUNCTION pg_catalog.citus_promote_clone_and_rebalance(integer, name, integer) FROM PUBLIC;
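
A typical invocation, assuming the clone's nodeid was obtained from citus_add_clone_node (the id 3 below is illustrative):

```
-- Promote clone node 3 and rebalance shards between it and its former primary,
-- waiting up to the default 300 seconds for replication catch-up.
SELECT citus_promote_clone_and_rebalance(clone_nodeid := 3);
```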
@@ -0,0 +1,24 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_remove_clone_node(
    nodename text,
    nodeport integer
)
RETURNS VOID
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_remove_clone_node$$;

COMMENT ON FUNCTION pg_catalog.citus_remove_clone_node(text, integer)
IS 'Removes an inactive streaming clone node from Citus metadata. Errors if the node is not found, not registered as a clone, or is currently marked active.';

REVOKE ALL ON FUNCTION pg_catalog.citus_remove_clone_node(text, integer) FROM PUBLIC;

CREATE OR REPLACE FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(
    nodeid integer
)
RETURNS VOID
LANGUAGE C VOLATILE STRICT
AS 'MODULE_PATHNAME', $$citus_remove_clone_node_with_nodeid$$;

COMMENT ON FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(integer)
IS 'Removes an inactive streaming clone node from Citus metadata using its node ID. Errors if the node is not found, not registered as a clone, or is currently marked active.';

REVOKE ALL ON FUNCTION pg_catalog.citus_remove_clone_node_with_nodeid(integer) FROM PUBLIC;
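
A clone that has not been promoted can be unregistered again (hostname and port below are illustrative):

```
-- Remove the inactive clone at 127.0.0.1:5453 from the metadata.
SELECT citus_remove_clone_node('127.0.0.1', 5453);
```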
src/backend/distributed/sql/udfs/get_snapshot_based_node_split_plan/13.2-1.sql (new generated file, 18 lines):
@@ -0,0 +1,18 @@
CREATE OR REPLACE FUNCTION pg_catalog.get_snapshot_based_node_split_plan(
    primary_node_name text,
    primary_node_port integer,
    replica_node_name text,
    replica_node_port integer,
    rebalance_strategy name DEFAULT NULL
)
RETURNS TABLE (table_name regclass,
               shardid bigint,
               shard_size bigint,
               placement_node text)
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;

COMMENT ON FUNCTION pg_catalog.get_snapshot_based_node_split_plan(text, int, text, int, name)
IS 'shows the shard placements to balance shards between primary and replica worker nodes';

REVOKE ALL ON FUNCTION pg_catalog.get_snapshot_based_node_split_plan(text, int, text, int, name) FROM PUBLIC;
@@ -0,0 +1,525 @@
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>

#include "postgres.h"

#include "utils/fmgrprotos.h"
#include "utils/pg_lsn.h"

#include "distributed/argutils.h"
#include "distributed/clonenode_utils.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_sync.h"
#include "distributed/remote_commands.h"
#include "distributed/shard_rebalancer.h"

/*
 * GetReplicationLag calculates the replication lag between the primary and replica nodes.
 * It returns the lag in bytes.
 */
int64
GetReplicationLag(WorkerNode *primaryWorkerNode, WorkerNode *replicaWorkerNode)
{
	/* Input validation */
	if (primaryWorkerNode == NULL || replicaWorkerNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary or replica worker node is NULL")));
	}

#if PG_VERSION_NUM >= 100000
	const char *primary_lsn_query = "SELECT pg_current_wal_lsn()";
	const char *replica_lsn_query = "SELECT pg_last_wal_replay_lsn()";
#else
	const char *primary_lsn_query = "SELECT pg_current_xlog_location()";
	const char *replica_lsn_query = "SELECT pg_last_xlog_replay_location()";
#endif

	int connectionFlag = 0;
	MultiConnection *primaryConnection = GetNodeConnection(connectionFlag,
															primaryWorkerNode->workerName,
															primaryWorkerNode->workerPort);
	if (PQstatus(primaryConnection->pgConn) != CONNECTION_OK)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot connect to primary node %s:%d to fetch replication status",
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort)));
	}
	MultiConnection *replicaConnection = GetNodeConnection(connectionFlag,
															replicaWorkerNode->workerName,
															replicaWorkerNode->workerPort);

	if (PQstatus(replicaConnection->pgConn) != CONNECTION_OK)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot connect to clone node %s:%d to fetch replication status",
							   replicaWorkerNode->workerName,
							   replicaWorkerNode->workerPort)));
	}

	int primaryResultCode = SendRemoteCommand(primaryConnection, primary_lsn_query);
	if (primaryResultCode == 0)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	PGresult *primaryResult = GetRemoteCommandResult(primaryConnection, true);
	if (!IsResponseOK(primaryResult))
	{
		ReportResultError(primaryConnection, primaryResult, ERROR);
	}

	int replicaResultCode = SendRemoteCommand(replicaConnection, replica_lsn_query);
	if (replicaResultCode == 0)
	{
		ReportConnectionError(replicaConnection, ERROR);
	}
	PGresult *replicaResult = GetRemoteCommandResult(replicaConnection, true);
	if (!IsResponseOK(replicaResult))
	{
		ReportResultError(replicaConnection, replicaResult, ERROR);
	}

	List *primaryLsnList = ReadFirstColumnAsText(primaryResult);
	if (list_length(primaryLsnList) != 1)
	{
		PQclear(primaryResult);
		ClearResults(primaryConnection, true);
		CloseConnection(primaryConnection);
		PQclear(replicaResult);
		ClearResults(replicaConnection, true);
		CloseConnection(replicaConnection);

		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse primary LSN result from %s:%d",
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort),
						errdetail("Expected exactly one row with LSN value")));
	}
	StringInfo primaryLsnQueryResInfo = (StringInfo) linitial(primaryLsnList);
	char *primary_lsn_str = primaryLsnQueryResInfo->data;

	List *replicaLsnList = ReadFirstColumnAsText(replicaResult);
	if (list_length(replicaLsnList) != 1)
	{
		PQclear(primaryResult);
		ClearResults(primaryConnection, true);
		CloseConnection(primaryConnection);
		PQclear(replicaResult);
		ClearResults(replicaConnection, true);
		CloseConnection(replicaConnection);

		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse clone LSN result from %s:%d",
							   replicaWorkerNode->workerName,
							   replicaWorkerNode->workerPort),
						errdetail("Expected exactly one row with LSN value")));
	}
	StringInfo replicaLsnQueryResInfo = (StringInfo) linitial(replicaLsnList);
	char *replica_lsn_str = replicaLsnQueryResInfo->data;

	int64 primary_lsn = DatumGetLSN(DirectFunctionCall1(pg_lsn_in,
														CStringGetDatum(primary_lsn_str)));
	int64 replica_lsn = DatumGetLSN(DirectFunctionCall1(pg_lsn_in,
														CStringGetDatum(replica_lsn_str)));

	int64 lag_bytes = primary_lsn - replica_lsn;

	PQclear(primaryResult);
	ForgetResults(primaryConnection);
	CloseConnection(primaryConnection);

	PQclear(replicaResult);
	ForgetResults(replicaConnection);
	CloseConnection(replicaConnection);

	ereport(DEBUG1, (errmsg("successfully measured replication lag: primary LSN %s, clone LSN %s",
							primary_lsn_str, replica_lsn_str)));
	ereport(NOTICE, (errmsg("replication lag between %s:%d and %s:%d is %ld bytes",
							primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
							replicaWorkerNode->workerName, replicaWorkerNode->workerPort,
							lag_bytes)));
	return lag_bytes;
}
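
The same measurement can be reproduced by hand: read the write LSN on the primary and the replay LSN on the clone, then subtract; pg_wal_lsn_diff() is the SQL-level equivalent of the subtraction performed above (the LSN literals below are illustrative):

```
-- On the primary worker:
SELECT pg_current_wal_lsn();

-- On the clone:
SELECT pg_last_wal_replay_lsn();

-- Lag in bytes, given the two values:
SELECT pg_wal_lsn_diff('0/3000060'::pg_lsn, '0/3000000'::pg_lsn) AS lag_bytes;
```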
||||||
|
|
||||||
|
/*
|
||||||
|
* EnsureValidCloneMode verifies that a clone node has a valid replication
|
||||||
|
* relationship with the specified primary node.
|
||||||
|
*
|
||||||
|
* This function performs several critical checks:
|
||||||
|
* 1. Validates that the clone is actually connected to and replicating from
|
||||||
|
* the specified primary node
|
||||||
|
* 2. Ensures the clone is not configured as a synchronous replica, which
|
||||||
|
* would block 2PC commits on the primary when the clone gets promoted
|
||||||
|
* 3. Verifies the replication connection is active and healthy
|
||||||
|
*
|
||||||
|
* The function connects to the primary node and queries pg_stat_replication
|
||||||
|
* to find the clone's replication slot. It resolves hostnames to IP addresses
|
||||||
|
* for robust matching since PostgreSQL may report different address formats.
|
||||||
|
*
|
||||||
|
* Parameters:
|
||||||
|
* primaryWorkerNode - The primary node that should be sending replication data
|
||||||
|
* cloneHostname - Hostname/IP of the clone node to verify
|
||||||
|
* clonePort - Port of the clone node to verify
|
||||||
|
* operation - Description of the operation being performed (for error messages)
|
||||||
|
*
|
||||||
|
* Throws ERROR if:
|
||||||
|
* - Primary or clone parameters are invalid
|
||||||
|
* - Cannot connect to the primary node
|
||||||
|
* - Clone is not found in the primary's replication slots
|
||||||
|
* - Clone is configured as a synchronous replica
|
||||||
|
* - Replication connection is not active
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
|
||||||
|
char *cloneHostname, int clonePort, char *operation)
|
||||||
|
{
|
||||||
|
Assert(operation != NULL);
|
||||||
|
|
||||||
|
if (primaryWorkerNode == NULL || cloneHostname == NULL)
|
||||||
|
{
|
||||||
|
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("primary or clone worker node is NULL")));
	}

	ereport(NOTICE, (errmsg(
						 "checking replication relationship between primary %s:%d and clone %s:%d",
						 primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
						 cloneHostname, clonePort)));

	/* Connect to primary node to check replication status */
	int connectionFlag = 0;
	MultiConnection *primaryConnection = GetNodeConnection(connectionFlag,
															primaryWorkerNode->workerName,
															primaryWorkerNode->workerPort);
	if (PQstatus(primaryConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	/* Build query to check if clone is connected and get its sync state */
	StringInfo replicationCheckQuery = makeStringInfo();

	/* First, try to resolve the hostname to IP address for more robust matching */
	char *resolvedIP = NULL;
	struct addrinfo hints, *result, *rp;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = AF_UNSPEC;        /* Allow IPv4 or IPv6 */
	hints.ai_socktype = SOCK_STREAM;    /* TCP socket */
	hints.ai_flags = AI_PASSIVE;        /* For wildcard IP address */

	int getaddrinfo_result = getaddrinfo(cloneHostname, NULL, &hints, &result);
	if (getaddrinfo_result == 0)
	{
		/* Get the first resolved IP address */
		for (rp = result; rp != NULL; rp = rp->ai_next)
		{
			if (rp->ai_family == AF_INET)
			{
				/* IPv4 */
				struct sockaddr_in *addr_in = (struct sockaddr_in *) rp->ai_addr;
				resolvedIP = palloc(INET_ADDRSTRLEN);
				inet_ntop(AF_INET, &(addr_in->sin_addr), resolvedIP, INET_ADDRSTRLEN);
				break;
			}
			else if (rp->ai_family == AF_INET6)
			{
				/* IPv6 */
				struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *) rp->ai_addr;
				resolvedIP = palloc(INET6_ADDRSTRLEN);
				inet_ntop(AF_INET6, &(addr_in6->sin6_addr), resolvedIP, INET6_ADDRSTRLEN);
				break;
			}
		}
		freeaddrinfo(result);
	}

	ereport(NOTICE, (errmsg("checking replication for node %s (resolved IP: %s)",
							cloneHostname,
							resolvedIP ? resolvedIP : "unresolved")));

	/* Build query to check if clone is connected and get its sync state */

	/* We check multiple fields to handle different scenarios:
	 * 1. application_name - if it's set to the node name
	 * 2. client_hostname - if it's the hostname
	 * 3. client_addr - if it's the IP address (most reliable)
	 */
	if (resolvedIP != NULL)
	{
		appendStringInfo(replicationCheckQuery,
						 "SELECT sync_state, state FROM pg_stat_replication WHERE "
						 "application_name = '%s' OR "
						 "client_hostname = '%s' OR "
						 "client_addr = '%s'",
						 cloneHostname,
						 cloneHostname,
						 resolvedIP);
		pfree(resolvedIP);
	}
	else
	{
		/* Fallback to hostname-only check if IP resolution fails */
		appendStringInfo(replicationCheckQuery,
						 "SELECT sync_state, state FROM pg_stat_replication WHERE "
						 "application_name = '%s' OR "
						 "client_hostname = '%s'",
						 cloneHostname,
						 cloneHostname);
	}

	int replicationCheckResultCode = SendRemoteCommand(primaryConnection,
													   replicationCheckQuery->data);
	if (replicationCheckResultCode == 0)
	{
		pfree(replicationCheckQuery->data);
		pfree(replicationCheckQuery);
		CloseConnection(primaryConnection);
		ReportConnectionError(primaryConnection, ERROR);
	}

	PGresult *replicationCheckResult = GetRemoteCommandResult(primaryConnection, true);
	if (!IsResponseOK(replicationCheckResult))
	{
		ReportResultError(primaryConnection, replicationCheckResult, ERROR);
	}

	List *replicationStateList = ReadFirstColumnAsText(replicationCheckResult);

	/* Check if clone is connected to this primary */
	if (list_length(replicationStateList) == 0)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("clone %s:%d is not connected to primary %s:%d",
							   cloneHostname, clonePort,
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort),
						errdetail(
							"The clone must be actively replicating from the specified primary node. "
							"Check that the clone is running and properly configured for replication.")));
	}

	/* Check if clone is synchronous */
	if (list_length(replicationStateList) > 0)
	{
		StringInfo syncStateInfo = (StringInfo) linitial(replicationStateList);
		if (syncStateInfo && syncStateInfo->data &&
			(strcmp(syncStateInfo->data, "sync") == 0 ||
			 strcmp(syncStateInfo->data, "quorum") == 0))
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg(
								"cannot %s clone %s:%d as it is configured as a synchronous replica",
								operation, cloneHostname, clonePort),
							errdetail(
								"Promoting a synchronous clone can cause data consistency issues. "
								"Please configure it as an asynchronous replica first.")));
		}
	}

	/* Cleanup resources */
	bool raiseErrors = false;
	PQclear(replicationCheckResult);
	ClearResults(primaryConnection, raiseErrors);
	pfree(replicationCheckQuery->data);
	pfree(replicationCheckQuery);
	CloseConnection(primaryConnection);

	ereport(NOTICE, (errmsg(
						 "clone %s:%d is properly connected to primary %s:%d and is not synchronous",
						 cloneHostname, clonePort,
						 primaryWorkerNode->workerName, primaryWorkerNode->workerPort)));
}
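
For reference, the check above boils down to a membership test against `pg_stat_replication` on the primary. A rough, manually runnable equivalent is sketched below; the `localhost` / `127.0.0.1` literals stand in for the clone's host name and resolved IP and are illustrative only, not part of the patch:

```
-- Run on the primary worker. A row must exist for the clone, and its
-- sync_state must not be 'sync' or 'quorum' for registration/promotion to proceed.
SELECT application_name, client_addr, state, sync_state
FROM pg_stat_replication
WHERE application_name = 'localhost'
   OR client_hostname = 'localhost'
   OR client_addr = '127.0.0.1';
```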


/*
 * EnsureValidStreamingReplica verifies that a node is a valid streaming replica
 * of the specified primary node.
 *
 * This function performs comprehensive validation to ensure the replica is:
 * 1. Currently in recovery mode (acting as a replica, not a primary)
 * 2. Has the same system identifier as the primary (ensuring they're part of
 *    the same PostgreSQL cluster/timeline)
 *
 * The function connects to both the replica and primary nodes to perform these
 * checks. This validation is critical before performing operations like promotion
 * or failover to ensure data consistency and prevent split-brain scenarios.
 *
 * Parameters:
 *   primaryWorkerNode - The primary node that should be the source of replication
 *   replicaHostname - Hostname/IP of the replica node to validate
 *   replicaPort - Port of the replica node to validate
 *
 * Throws ERROR if:
 * - Cannot connect to the replica or primary node
 * - Replica is not in recovery mode (indicating it's not acting as a replica)
 * - System identifiers don't match between primary and replica
 * - Any database queries fail during validation
 */
void
EnsureValidStreamingReplica(WorkerNode *primaryWorkerNode, char *replicaHostname,
							int replicaPort)
{
	int connectionFlag = FORCE_NEW_CONNECTION;
	MultiConnection *replicaConnection = GetNodeConnection(connectionFlag,
															replicaHostname,
															replicaPort);

	if (PQstatus(replicaConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(replicaConnection, ERROR);
	}

	/* Step 1: Verify the replica is still in recovery mode */
	const char *replica_recovery_query = "SELECT pg_is_in_recovery()";

	int resultCode = SendRemoteCommand(replicaConnection, replica_recovery_query);
	if (resultCode == 0)
	{
		ereport(DEBUG2, (errmsg(
							 "cannot connect to %s:%d to check if it is in recovery mode",
							 replicaHostname, replicaPort)));
		ReportConnectionError(replicaConnection, ERROR);
	}

	bool raiseInterrupts = true;
	PGresult *result = GetRemoteCommandResult(replicaConnection, raiseInterrupts);
	if (!IsResponseOK(result))
	{
		ereport(DEBUG2, (errmsg("failed to execute pg_is_in_recovery")));
		ReportResultError(replicaConnection, result, ERROR);
	}

	List *sizeList = ReadFirstColumnAsText(result);
	if (list_length(sizeList) != 1)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse pg_is_in_recovery() result from %s:%d",
							   replicaHostname, replicaPort)));
	}

	StringInfo isInRecoveryQueryResInfo = (StringInfo) linitial(sizeList);
	char *isInRecoveryQueryResStr = isInRecoveryQueryResInfo->data;

	if (strcmp(isInRecoveryQueryResStr, "t") != 0 &&
		strcmp(isInRecoveryQueryResStr, "true") != 0)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("node %s:%d is not in recovery mode",
							   replicaHostname, replicaPort)));
	}

	PQclear(result);
	ForgetResults(replicaConnection);

	/* Step 2: Get the system identifier from the replica */
	const char *sysidQuery = "SELECT system_identifier FROM pg_control_system()";

	resultCode = SendRemoteCommand(replicaConnection, sysidQuery);
	if (resultCode == 0)
	{
		ereport(DEBUG2, (errmsg("cannot connect to %s:%d to get system identifier",
								replicaHostname, replicaPort)));
		ReportConnectionError(replicaConnection, ERROR);
	}

	result = GetRemoteCommandResult(replicaConnection, raiseInterrupts);
	if (!IsResponseOK(result))
	{
		ereport(DEBUG2, (errmsg("failed to execute get system identifier")));
		ReportResultError(replicaConnection, result, ERROR);
	}

	List *sysidList = ReadFirstColumnAsText(result);
	if (list_length(sysidList) != 1)
	{
		CloseConnection(replicaConnection);
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse get system identifier result from %s:%d",
							   replicaHostname, replicaPort)));
	}

	StringInfo sysidQueryResInfo = (StringInfo) linitial(sysidList);
	char *sysidQueryResStr = sysidQueryResInfo->data;

	ereport(DEBUG2, (errmsg("system identifier of %s:%d is %s",
							replicaHostname, replicaPort, sysidQueryResStr)));

	/* We do not need the connection anymore */
	PQclear(result);
	ForgetResults(replicaConnection);
	CloseConnection(replicaConnection);

	/* Step 3: Get the system identifier from the primary */
	ereport(DEBUG2, (errmsg("getting system identifier from primary %s:%d",
							primaryWorkerNode->workerName,
							primaryWorkerNode->workerPort)));

	int primaryConnectionFlag = 0;
	MultiConnection *primaryConnection = GetNodeConnection(primaryConnectionFlag,
															primaryWorkerNode->workerName,
															primaryWorkerNode->workerPort);

	if (PQstatus(primaryConnection->pgConn) != CONNECTION_OK)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	int primaryResultCode = SendRemoteCommand(primaryConnection, sysidQuery);
	if (primaryResultCode == 0)
	{
		ReportConnectionError(primaryConnection, ERROR);
	}

	PGresult *primaryResult = GetRemoteCommandResult(primaryConnection, raiseInterrupts);
	if (!IsResponseOK(primaryResult))
	{
		ereport(DEBUG2, (errmsg("failed to execute get system identifier")));
		ReportResultError(primaryConnection, primaryResult, ERROR);
	}

	List *primarySizeList = ReadFirstColumnAsText(primaryResult);
	if (list_length(primarySizeList) != 1)
	{
		CloseConnection(primaryConnection);
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("cannot parse get system identifier result from %s:%d",
							   primaryWorkerNode->workerName,
							   primaryWorkerNode->workerPort)));
	}

	StringInfo primarySysidQueryResInfo = (StringInfo) linitial(primarySizeList);
	char *primarySysidQueryResStr = primarySysidQueryResInfo->data;

	ereport(DEBUG2, (errmsg("system identifier of %s:%d is %s",
							primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
							primarySysidQueryResStr)));

	/* Verify that both identifiers match */
	if (strcmp(sysidQueryResStr, primarySysidQueryResStr) != 0)
	{
		ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg(
							"system identifiers do not match: %s (clone) vs %s (primary)",
							sysidQueryResStr, primarySysidQueryResStr)));
	}

	PQclear(primaryResult);
	ClearResults(primaryConnection, true);
	CloseConnection(primaryConnection);
}
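
The same validation can be reproduced by hand when debugging a clone that refuses to register. A minimal sketch, assuming psql access to both nodes:

```
-- On the clone: must return 't', i.e. the node is still in recovery.
SELECT pg_is_in_recovery();

-- On both the clone and the primary: the two values must be identical,
-- otherwise the clone was not created from this primary's data directory.
SELECT system_identifier FROM pg_control_system();
```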

@@ -0,0 +1,13 @@
#ifndef CLONENODE_UTILS_H
#define CLONENODE_UTILS_H

#include "distributed/metadata_cache.h"

extern int64 GetReplicationLag(WorkerNode *primaryWorkerNode,
							   WorkerNode *replicaWorkerNode);
extern void EnsureValidStreamingReplica(WorkerNode *primaryWorkerNode,
										char *replicaHostname, int replicaPort);
extern void EnsureValidCloneMode(WorkerNode *primaryWorkerNode, char *cloneHostname,
								 int clonePort, char *operation);

#endif /* CLONENODE_UTILS_H */

@@ -299,6 +299,7 @@ extern Oid CitusDependentObjectFuncId(void);
 /* enum oids */
 extern Oid PrimaryNodeRoleId(void);
 extern Oid SecondaryNodeRoleId(void);
+extern Oid UnavailableNodeRoleId(void);
 extern Oid CitusCopyFormatTypeId(void);
 extern Oid TextCopyFormatId(void);
 extern Oid BinaryCopyFormatId(void);

@@ -345,7 +345,8 @@ extern bool IsDummyPlacement(ShardPlacement *taskPlacement);
 extern StringInfo GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList,
                                                         Oid indexId,
                                                         SizeQueryType sizeQueryType,
-                                                        bool optimizePartitionCalculations);
+                                                        bool optimizePartitionCalculations
+                                                        );
 extern List * RemoveCoordinatorPlacementIfNotSingleNode(List *placementList);

 /* Function declarations to modify shard and shard placement data */

@@ -467,4 +468,8 @@ extern bool IsBackgroundTaskStatusTerminal(BackgroundTaskStatus status);
 extern Oid BackgroundJobStatusOid(BackgroundJobStatus status);
 extern Oid BackgroundTaskStatusOid(BackgroundTaskStatus status);
 extern int GetAutoConvertedAttrIndexInPgDistPartition(TupleDesc tupleDEsc);
+
+/* from node_metadata.c */
+extern void LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE lockMode);
+extern void ActivateCloneNodeAsPrimary(WorkerNode *workerNode);
 #endif /* METADATA_UTILITY_H */

@@ -20,7 +20,7 @@
  * in particular their OUT parameters) must be changed whenever the definition of
  * pg_dist_node changes.
  */
-#define Natts_pg_dist_node 11
+#define Natts_pg_dist_node 13
 #define Anum_pg_dist_node_nodeid 1
 #define Anum_pg_dist_node_groupid 2
 #define Anum_pg_dist_node_nodename 3

@@ -32,6 +32,8 @@
 #define Anum_pg_dist_node_nodecluster 9
 #define Anum_pg_dist_node_metadatasynced 10
 #define Anum_pg_dist_node_shouldhaveshards 11
+#define Anum_pg_dist_node_nodeisclone 12
+#define Anum_pg_dist_node_nodeprimarynodeid 13

 #define GROUPID_SEQUENCE_NAME "pg_dist_groupid_seq"
 #define NODEID_SEQUENCE_NAME "pg_dist_node_nodeid_seq"

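With the two new pg_dist_node attributes in place, clone registration becomes visible directly in the catalog. A small illustrative query (a sketch built only from the column names defined above, not part of the patch):

```
-- List registered clones together with the primary they replicate from.
SELECT clone.nodeid, clone.nodename, clone.nodeport,
       prim.nodename AS primary_name, prim.nodeport AS primary_port
FROM pg_dist_node clone
JOIN pg_dist_node prim ON prim.nodeid = clone.nodeprimarynodeid
WHERE clone.nodeisclone;
```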
@@ -222,4 +222,7 @@ extern void SetupRebalanceMonitor(List *placementUpdateList,
                                   uint64 initialProgressState,
                                   PlacementUpdateStatus initialStatus);

+extern void SplitShardsBetweenPrimaryAndClone(WorkerNode *primaryNode,
+                                              WorkerNode *cloneNode,
+                                              Name strategyName);
 #endif /* SHARD_REBALANCER_H */

@@ -85,3 +85,8 @@ extern void UpdatePlacementUpdateStatusForShardIntervalList(List *shardIntervalL
 extern void InsertDeferredDropCleanupRecordsForShards(List *shardIntervalList);
 extern void InsertCleanupRecordsForShardPlacementsOnNode(List *shardIntervalList,
                                                          int32 groupId);
+
+extern void AdjustShardsForPrimaryCloneNodeSplit(WorkerNode *primaryNode,
+                                                 WorkerNode *cloneNode,
+                                                 List *primaryShardList,
+                                                 List *cloneShardList);

@@ -54,6 +54,8 @@ typedef struct WorkerNode
 	char nodeCluster[NAMEDATALEN]; /* the cluster the node is a part of */
 	bool metadataSynced;     /* node has the most recent metadata */
 	bool shouldHaveShards;   /* if the node should have distributed table shards on it or not */
+	bool nodeisclone;        /* whether this node is a replica */
+	int32 nodeprimarynodeid; /* nodeid of the primary for this replica */
 } WorkerNode;

@@ -84,6 +86,7 @@ extern WorkerNode * FindWorkerNode(const char *nodeName, int32 nodePort);
 extern WorkerNode * FindWorkerNodeOrError(const char *nodeName, int32 nodePort);
 extern WorkerNode * FindWorkerNodeAnyCluster(const char *nodeName, int32 nodePort);
 extern WorkerNode * FindNodeWithNodeId(int nodeId, bool missingOk);
+extern WorkerNode * FindNodeAnyClusterByNodeId(uint32 nodeId);
 extern WorkerNode * ModifiableWorkerNode(const char *nodeName, int32 nodePort);
 extern List * ReadDistNode(bool includeNodesFromOtherClusters);
 extern void EnsureCoordinator(void);

@@ -45,7 +45,7 @@ vanilla_diffs_file = $(citus_abs_srcdir)/pg_vanilla_outputs/$(MAJORVERSION)/regr
 # intermediate, for muscle memory backward compatibility.
 check: check-full check-enterprise-full
 # check-full triggers all tests that ought to be run routinely
-check-full: check-multi check-multi-mx check-multi-1 check-operations check-follower-cluster check-isolation check-failure check-split check-vanilla check-columnar check-columnar-isolation check-pg-upgrade check-arbitrary-configs check-citus-upgrade check-citus-upgrade-mixed check-citus-upgrade-local check-citus-upgrade-mixed-local check-pytest check-query-generator
+check-full: check-multi check-multi-mx check-multi-1 check-operations check-add-backup-node check-follower-cluster check-isolation check-failure check-split check-vanilla check-columnar check-columnar-isolation check-pg-upgrade check-arbitrary-configs check-citus-upgrade check-citus-upgrade-mixed check-citus-upgrade-local check-citus-upgrade-mixed-local check-pytest check-query-generator
 # check-enterprise-full triggers all enterprise specific tests
 check-enterprise-full: check-enterprise check-enterprise-isolation check-enterprise-failure check-enterprise-isolation-logicalrep-1 check-enterprise-isolation-logicalrep-2 check-enterprise-isolation-logicalrep-3

@@ -217,6 +217,10 @@ check-follower-cluster: all
 	$(pg_regress_multi_check) --load-extension=citus --follower-cluster \
 	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/multi_follower_schedule $(EXTRA_TESTS)

+check-add-backup-node: all
+	$(pg_regress_multi_check) --load-extension=citus --follower-cluster --backupnodetest --worker-count=6 \
+	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/multi_add_backup_node_schedule $(EXTRA_TESTS)
+
 check-operations: all
 	$(pg_regress_multi_check) --load-extension=citus --worker-count=6 \
 	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/operations_schedule $(EXTRA_TESTS)

@@ -113,6 +113,13 @@ DEPS = {
     ),
     "create_role_propagation": TestDeps(None, ["multi_cluster_management"]),
     "single_node_enterprise": TestDeps(None),
+    "multi_add_node_from_backup": TestDeps(None, repeatable=False, worker_count=5),
+    "multi_add_node_from_backup_negative": TestDeps(
+        None, ["multi_add_node_from_backup"], worker_count=5, repeatable=False
+    ),
+    "multi_add_node_from_backup_sync_replica": TestDeps(
+        None, repeatable=False, worker_count=5
+    ),
     "single_node": TestDeps(None, ["multi_test_helpers"]),
     "single_node_truncate": TestDeps(None),
     "multi_explain": TestDeps(

@@ -906,11 +906,11 @@ SELECT citus_activate_node('localhost', :worker_2_proxy_port);
 ERROR: connection not open
 -- Show node metadata info on coordinator after failures
 SELECT * FROM pg_dist_node ORDER BY nodeport;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
 ---------------------------------------------------------------------
- 2 | 2 | localhost | 9060 | default | f | t | primary | default | f | t
+ 2 | 2 | localhost | 9060 | default | f | t | primary | default | f | t | f | 0
- 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
+ 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
- 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 (3 rows)

 -- Show that we can still query the node from coordinator

@@ -968,20 +968,20 @@ SELECT citus_activate_node('localhost', :worker_2_proxy_port);
 -- Show node metadata info on worker2 and coordinator after success
 \c - - - :worker_2_port
 SELECT * FROM pg_dist_node ORDER BY nodeport;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
 ---------------------------------------------------------------------
- 2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t
+ 2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t | f | 0
- 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
+ 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
- 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 (3 rows)

 \c - - - :master_port
 SELECT * FROM pg_dist_node ORDER BY nodeport;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
 ---------------------------------------------------------------------
- 2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t
+ 2 | 2 | localhost | 9060 | default | t | t | primary | default | t | t | f | 0
- 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
+ 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
- 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 (3 rows)

 SELECT citus.mitmproxy('conn.allow()');

@@ -0,0 +1,515 @@
--
-- Test for adding a worker node from a backup
--
-- setup cluster
SELECT 1 FROM master_add_node('localhost', :worker_1_port);
 ?column?
---------------------------------------------------------------------
 1
(1 row)

SELECT 1 FROM master_add_node('localhost', :worker_2_port);
 ?column?
---------------------------------------------------------------------
 1
(1 row)

SELECT * from pg_dist_node;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(2 rows)

-- create a distributed table and load data
CREATE TABLE backup_test(id int, value text);
SELECT create_distributed_table('backup_test', 'id', 'hash');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;
-- Colocation group 1: create two tables table1_colg1, table2_colg1 and in a colocation group
CREATE TABLE table1_colg1 (a int PRIMARY KEY);
SELECT create_distributed_table('table1_colg1', 'a', shard_count => 4, colocate_with => 'none');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

CREATE TABLE table2_colg1 (b int PRIMARY KEY);
SELECT create_distributed_table('table2_colg1', 'b', colocate_with => 'table1_colg1');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

-- Colocation group 2: create two tables table1_colg2, table2_colg2 and in a colocation group
CREATE TABLE table1_colg2 (a int PRIMARY KEY);
SELECT create_distributed_table('table1_colg2', 'a', shard_count => 4, colocate_with => 'none');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

CREATE TABLE table2_colg2 (b int primary key);
SELECT create_distributed_table('table2_colg2', 'b', colocate_with => 'table1_colg2');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

-- Colocation group 3: create two tables table1_colg3, table2_colg3 and in a colocation group
CREATE TABLE table1_colg3 (a int PRIMARY KEY);
SELECT create_distributed_table('table1_colg3', 'a', shard_count => 4, colocate_with => 'none');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

CREATE TABLE table2_colg3 (b int primary key);
SELECT create_distributed_table('table2_colg3', 'b', colocate_with => 'table1_colg3');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

-- Create reference tables with primary-foreign key relationships
CREATE TABLE customers (
    id SERIAL PRIMARY KEY,
    name TEXT NOT NULL,
    email TEXT UNIQUE NOT NULL );
CREATE TABLE orders (
    id SERIAL PRIMARY KEY,
    customer_id INTEGER NOT NULL REFERENCES customers(id),
    order_date DATE NOT NULL DEFAULT CURRENT_DATE);
CREATE TABLE order_items (
    id SERIAL PRIMARY KEY,
    order_id INTEGER NOT NULL REFERENCES orders(id),
    product_name TEXT NOT NULL,
    quantity INTEGER NOT NULL,
    price NUMERIC(10, 2) NOT NULL
);
SELECT create_reference_table('customers');
 create_reference_table
---------------------------------------------------------------------

(1 row)

SELECT create_reference_table('orders');
 create_reference_table
---------------------------------------------------------------------

(1 row)

SELECT create_reference_table('order_items');
 create_reference_table
---------------------------------------------------------------------

(1 row)

-- INSERT SOME DATA
-- Insert 10 customers
INSERT INTO customers (name, email)
SELECT
    'Customer ' || i,
    'customer' || i || '@example.com'
FROM generate_series(1, 10) AS i;
-- Insert 30 orders: each customer gets 3 orders
INSERT INTO orders (customer_id, order_date)
SELECT
    (i % 10) + 1, -- customer_id between 1 and 10
    CURRENT_DATE - (i % 7)
FROM generate_series(1, 30) AS i;
-- Insert 90 order_items: each order has 3 items
INSERT INTO order_items (order_id, product_name, quantity, price)
SELECT
    (i % 30) + 1, -- order_id between 1 and 30
    'Product ' || (i % 5 + 1),
    (i % 10) + 1,
    round((random() * 100 + 10)::numeric, 2)
FROM generate_series(1, 90) AS i;
SELECT count(*) from customers;
 count
---------------------------------------------------------------------
 10
(1 row)

SELECT count(*) from orders;
 count
---------------------------------------------------------------------
 30
(1 row)

SELECT count(*) from order_items;
 count
---------------------------------------------------------------------
 90
(1 row)

-- verify initial shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
 nodename | nodeport | count
---------------------------------------------------------------------
 localhost | 57637 | 17
 localhost | 57638 | 17
(2 rows)

-- wait for the new node to be ready
SELECT pg_sleep(5);
 pg_sleep
---------------------------------------------------------------------

(1 row)

-- register the new node as a clone
-- the function returns the new node id
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 3 | 3 | localhost | 9071 | default | f | f | unavailable | default | f | f | t | 1
(3 rows)

SELECT :clone_node_id ;
 ?column?
---------------------------------------------------------------------
 3
(1 row)

SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
 shardid | nodename | node_type
---------------------------------------------------------------------
 102008 | localhost | PRIMARY
 102010 | localhost | PRIMARY
 102012 | localhost | PRIMARY
 102014 | localhost | PRIMARY
 102016 | localhost | PRIMARY
 102018 | localhost | PRIMARY
 102020 | localhost | PRIMARY
 102022 | localhost | PRIMARY
 102024 | localhost | PRIMARY
 102026 | localhost | PRIMARY
 102028 | localhost | PRIMARY
 102030 | localhost | PRIMARY
 102032 | localhost | PRIMARY
 102034 | localhost | PRIMARY
 102036 | localhost | PRIMARY
 102037 | localhost | PRIMARY
 102038 | localhost | PRIMARY
(17 rows)

SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;
 shardid | nodename | node_type
---------------------------------------------------------------------
(0 rows)

SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_1_port, 'localhost', :follower_worker_1_port);
 table_name | shardid | shard_size | placement_node
---------------------------------------------------------------------
 table1_colg2 | 102020 | 0 | Primary Node
 table2_colg2 | 102024 | 0 | Primary Node
 table1_colg2 | 102022 | 0 | Primary Node
 table2_colg2 | 102026 | 0 | Primary Node
 table1_colg3 | 102028 | 0 | Primary Node
 table2_colg3 | 102032 | 0 | Primary Node
 table1_colg3 | 102030 | 0 | Primary Node
 table2_colg3 | 102034 | 0 | Primary Node
 backup_test | 102008 | 0 | Clone Node
 backup_test | 102010 | 0 | Clone Node
 table1_colg1 | 102012 | 0 | Clone Node
 table2_colg1 | 102016 | 0 | Clone Node
 table1_colg1 | 102014 | 0 | Clone Node
 table2_colg1 | 102018 | 0 | Clone Node
(14 rows)

-- promote the clone and rebalance the shards
SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(:clone_node_id);
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 3), original primary localhost:xxxxx (ID 1)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 1)
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 1)
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
NOTICE: Clone node localhost:xxxxx (ID 3) has been successfully promoted.
NOTICE: Updating metadata for promoted clone localhost:xxxxx (ID 3)
NOTICE: adjusting shard placements for primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: processing 4 shards for primary node GroupID 1
LOG: inserting DELETE shard record for shard public.table1_colg2_102020 from clone node GroupID 3
LOG: inserting DELETE shard record for shard public.table1_colg2_102022 from clone node GroupID 3
LOG: inserting DELETE shard record for shard public.table1_colg3_102028 from clone node GroupID 3
LOG: inserting DELETE shard record for shard public.table1_colg3_102030 from clone node GroupID 3
NOTICE: processing 4 shards for clone node GroupID 3
LOG: inserting DELETE shard record for shard public.backup_test_102008 from primary node GroupID 1
LOG: inserting DELETE shard record for shard public.backup_test_102010 from primary node GroupID 1
LOG: inserting DELETE shard record for shard public.table2_colg1_102016 from primary node GroupID 1
LOG: inserting DELETE shard record for shard public.table2_colg1_102018 from primary node GroupID 1
NOTICE: shard placement adjustment complete for primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: Clone node localhost:xxxxx (ID 3) metadata updated. It is now a primary
NOTICE: Clone node localhost:xxxxx (ID 3) successfully registered as a worker node
 citus_promote_clone_and_rebalance
---------------------------------------------------------------------

(1 row)

SET client_min_messages to DEFAULT;
SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
 shardid | nodename | node_type
---------------------------------------------------------------------
 102020 | localhost | PRIMARY
 102022 | localhost | PRIMARY
 102024 | localhost | PRIMARY
 102026 | localhost | PRIMARY
 102028 | localhost | PRIMARY
 102030 | localhost | PRIMARY
 102032 | localhost | PRIMARY
 102034 | localhost | PRIMARY
 102036 | localhost | PRIMARY
 102037 | localhost | PRIMARY
 102038 | localhost | PRIMARY
(11 rows)

SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;
 shardid | nodename | node_type
---------------------------------------------------------------------
 102008 | localhost | CLONE
 102010 | localhost | CLONE
 102012 | localhost | CLONE
 102014 | localhost | CLONE
 102016 | localhost | CLONE
 102018 | localhost | CLONE
 102036 | localhost | CLONE
 102037 | localhost | CLONE
 102038 | localhost | CLONE
(9 rows)

\c - - - :worker_1_port
SELECT 'WORKER' as node_type,* from pg_dist_node;
 node_type | nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 WORKER | 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
 WORKER | 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 WORKER | 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(3 rows)

SELECT 'WORKER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
 node_type | nodename | nodeport | count
---------------------------------------------------------------------
 WORKER | localhost | 9071 | 9
 WORKER | localhost | 57637 | 11
 WORKER | localhost | 57638 | 17
(3 rows)

SELECT * from citus_tables;
 table_name | citus_table_type | distribution_column | colocation_id | table_size | shard_count | table_owner | access_method
---------------------------------------------------------------------
 backup_test | distributed | id | 1 | 64 kB | 4 | postgres | heap
 customers | reference | <none> | 5 | 144 kB | 1 | postgres | heap
 order_items | reference | <none> | 5 | 96 kB | 1 | postgres | heap
 orders | reference | <none> | 5 | 72 kB | 1 | postgres | heap
 table1_colg1 | distributed | a | 2 | 32 kB | 4 | postgres | heap
 table1_colg2 | distributed | a | 3 | 32 kB | 4 | postgres | heap
 table1_colg3 | distributed | a | 4 | 32 kB | 4 | postgres | heap
 table2_colg1 | distributed | b | 2 | 32 kB | 4 | postgres | heap
 table2_colg2 | distributed | b | 3 | 32 kB | 4 | postgres | heap
 table2_colg3 | distributed | b | 4 | 32 kB | 4 | postgres | heap
(10 rows)

SELECT id, value FROM backup_test ORDER BY id;
 id | value
---------------------------------------------------------------------
 1 | test1
 2 | test2
 3 | test3
 4 | test4
 5 | test5
 6 | test6
 7 | test7
 8 | test8
 9 | test9
 10 | test10
(10 rows)

SELECT count(*) from customers;
 count
---------------------------------------------------------------------
 10
(1 row)

SELECT count(*) from orders;
 count
---------------------------------------------------------------------
 30
(1 row)

SELECT count(*) from order_items;
 count
---------------------------------------------------------------------
 90
(1 row)

\c - - - :follower_worker_1_port
SELECT 'CLONE' as node_type ,* from pg_dist_node;
 node_type | nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 CLONE | 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
 CLONE | 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 CLONE | 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(3 rows)

SELECT 'CLONE' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
 node_type | nodename | nodeport | count
---------------------------------------------------------------------
 CLONE | localhost | 9071 | 9
 CLONE | localhost | 57637 | 11
 CLONE | localhost | 57638 | 17
(3 rows)

SELECT * from citus_tables;
 table_name | citus_table_type | distribution_column | colocation_id | table_size | shard_count | table_owner | access_method
---------------------------------------------------------------------
 backup_test | distributed | id | 1 | 64 kB | 4 | postgres | heap
 customers | reference | <none> | 5 | 144 kB | 1 | postgres | heap
 order_items | reference | <none> | 5 | 96 kB | 1 | postgres | heap
 orders | reference | <none> | 5 | 72 kB | 1 | postgres | heap
 table1_colg1 | distributed | a | 2 | 32 kB | 4 | postgres | heap
 table1_colg2 | distributed | a | 3 | 32 kB | 4 | postgres | heap
 table1_colg3 | distributed | a | 4 | 32 kB | 4 | postgres | heap
 table2_colg1 | distributed | b | 2 | 32 kB | 4 | postgres | heap
 table2_colg2 | distributed | b | 3 | 32 kB | 4 | postgres | heap
 table2_colg3 | distributed | b | 4 | 32 kB | 4 | postgres | heap
(10 rows)

SELECT id, value FROM backup_test ORDER BY id;
 id | value
---------------------------------------------------------------------
 1 | test1
 2 | test2
 3 | test3
 4 | test4
 5 | test5
 6 | test6
 7 | test7
 8 | test8
 9 | test9
 10 | test10
(10 rows)

SELECT count(*) from customers;
 count
---------------------------------------------------------------------
 10
(1 row)

SELECT count(*) from orders;
 count
---------------------------------------------------------------------
 30
(1 row)

SELECT count(*) from order_items;
 count
---------------------------------------------------------------------
 90
(1 row)

\c - - - :master_port
SELECT 'MASTER' as node_type, * from pg_dist_node;
 node_type | nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 MASTER | 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 MASTER | 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 MASTER | 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
(3 rows)

SELECT 'MASTER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
 node_type | nodename | nodeport | count
---------------------------------------------------------------------
 MASTER | localhost | 9071 | 9
 MASTER | localhost | 57637 | 11
 MASTER | localhost | 57638 | 17
(3 rows)

SELECT * from citus_tables;
 table_name | citus_table_type | distribution_column | colocation_id | table_size | shard_count | table_owner | access_method
---------------------------------------------------------------------
 backup_test | distributed | id | 1 | 64 kB | 4 | postgres | heap
 customers | reference | <none> | 5 | 144 kB | 1 | postgres | heap
 order_items | reference | <none> | 5 | 96 kB | 1 | postgres | heap
 orders | reference | <none> | 5 | 72 kB | 1 | postgres | heap
 table1_colg1 | distributed | a | 2 | 32 kB | 4 | postgres | heap
 table1_colg2 | distributed | a | 3 | 32 kB | 4 | postgres | heap
 table1_colg3 | distributed | a | 4 | 32 kB | 4 | postgres | heap
 table2_colg1 | distributed | b | 2 | 32 kB | 4 | postgres | heap
 table2_colg2 | distributed | b | 3 | 32 kB | 4 | postgres | heap
 table2_colg3 | distributed | b | 4 | 32 kB | 4 | postgres | heap
(10 rows)

SELECT id, value FROM backup_test ORDER BY id;
 id | value
---------------------------------------------------------------------
 1 | test1
 2 | test2
 3 | test3
 4 | test4
 5 | test5
 6 | test6
 7 | test7
 8 | test8
 9 | test9
 10 | test10
(10 rows)

SELECT count(*) from customers;
 count
---------------------------------------------------------------------
 10
(1 row)

SELECT count(*) from orders;
 count
---------------------------------------------------------------------
 30
(1 row)

SELECT count(*) from order_items;
 count
---------------------------------------------------------------------
 90
(1 row)

-- verify data
SELECT count(*) FROM backup_test;
 count
---------------------------------------------------------------------
 10
(1 row)

SELECT id, value FROM backup_test ORDER BY id;
 id | value
---------------------------------------------------------------------
 1 | test1
 2 | test2
 3 | test3
 4 | test4
 5 | test5
 6 | test6
 7 | test7
 8 | test8
 9 | test9
 10 | test10
(10 rows)

-- cleanup
DROP TABLE backup_test;

@ -0,0 +1,306 @@
|
||||||
|
--
|
||||||
|
-- Test for negative scenarios in clone promotion functionality
|
||||||
|
--
|
||||||
|
--try to add follower_worker_1 as a clone of worker_1 to the cluster
|
||||||
|
-- this should fail as previous test has already promoted worker_1 to a primary node
|
||||||
|
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||||
|
ERROR: a different node localhost:xxxxx (nodeid 3) already exists or is a clone for a different primary
|
||||||
|
SELECT * from pg_dist_node ORDER by nodeid;
|
||||||
|
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||||
|
---------------------------------------------------------------------
|
||||||
|
1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
|
||||||
|
2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
|
||||||
|
3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
|
||||||
|
(3 rows)
|
||||||
|
|
||||||
|
--try to add worker_node2 as a clone of worker_node1
|
||||||
|
-- this should fail as it is not a valid replica of worker_1
|
||||||
|
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
|
||||||
|
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
|
||||||
|
NOTICE: checking replication for node localhost (resolved IP: ::1)
|
||||||
|
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
|
||||||
|
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
|
||||||
|
SELECT * from pg_dist_node ORDER by nodeid;
|
||||||
|
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
|
||||||
|
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
(3 rows)

--try add
-- create a distributed table and load data
CREATE TABLE backup_test(id int, value text);
SELECT create_distributed_table('backup_test', 'id', 'hash');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;
-- Create reference table
CREATE TABLE ref_table(id int PRIMARY KEY);
SELECT create_reference_table('ref_table');
 create_reference_table
---------------------------------------------------------------------

(1 row)

INSERT INTO ref_table SELECT i FROM generate_series(1, 5) i;
SELECT COUNT(*) from backup_test;
 count
---------------------------------------------------------------------
    10
(1 row)

SELECT COUNT(*) from ref_table;
 count
---------------------------------------------------------------------
     5
(1 row)

-- verify initial shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
 nodename  | nodeport | count
---------------------------------------------------------------------
 localhost |     9071 |    10
 localhost |    57637 |    12
 localhost |    57638 |    18
(3 rows)

-- Try to add replica of worker_node2 as a clone of worker_node1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
-- Test 1: Try to promote a non-existent clone node
SELECT citus_promote_clone_and_rebalance(clone_nodeid =>99999);
ERROR: Clone node with ID 99999 not found.
-- Test 2: Try to promote a regular worker node (not a clone)
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1);
ERROR: Node localhost:xxxxx (ID 1) is not a valid clone or its primary node ID is not set.
-- Test 3: Try to promote with invalid timeout (negative)
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1,
catchup_timeout_seconds => -100);
ERROR: Node localhost:xxxxx (ID 1) is not a valid clone or its primary node ID is not set.
-- register the new node as a clone; this should pass
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
 4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
(4 rows)

SELECT :clone_node_id;
 ?column?
---------------------------------------------------------------------
        4
(1 row)

-- Test 4: Try to promote clone with invalid strategy name
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id, rebalance_strategy => 'invalid_strategy');
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 2)
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 2)
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
NOTICE: Clone node localhost:xxxxx (ID 4) has been successfully promoted.
NOTICE: Updating metadata for promoted clone localhost:xxxxx (ID 4)
ERROR: could not find rebalance strategy with name invalid_strategy
SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
 4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
(4 rows)
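
The catch-up step in the test above waits until the clone's replication lag reaches zero before pg_promote() is invoked. Outside the test suite, the same condition can be sanity-checked with standard PostgreSQL monitoring views on the worker the clone streams from; this is an illustrative sketch and not part of the new Citus UDFs:

```
-- run on the original worker node (the clone's upstream), not on the coordinator
SELECT application_name, state, sync_state,
       pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lag_bytes
FROM pg_stat_replication;
```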

-- Test 9: Rollback the citus_promote_clone_and_rebalance transaction
BEGIN;
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
ROLLBACK;
-- Verify no data is lost after rolling back the transaction
SELECT COUNT(*) from backup_test;
 count
---------------------------------------------------------------------
    10
(1 row)

SELECT COUNT(*) from ref_table;
 count
---------------------------------------------------------------------
     5
(1 row)

SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
 4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
(4 rows)

-- Test 5: Try to add and promote a proper replica after rollback
SELECT master_add_node('localhost', :worker_3_port) AS nodeid_3 \gset
SELECT citus_add_clone_node('localhost', :follower_worker_3_port, 'localhost', :worker_3_port) AS clone_node_id_3 \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
set citus.shard_count = 100;
CREATE TABLE backup_test2(id int, value text);
SELECT create_distributed_table('backup_test2', 'id', 'hash');
 create_distributed_table
---------------------------------------------------------------------

(1 row)

INSERT INTO backup_test2 SELECT g, 'test' || g FROM generate_series(1, 10) g;
-- Create reference table
CREATE TABLE ref_table2(id int PRIMARY KEY);
SELECT create_reference_table('ref_table2');
 create_reference_table
---------------------------------------------------------------------

(1 row)

INSERT INTO ref_table2 SELECT i FROM generate_series(1, 5) i;
SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_3_port, 'localhost', :follower_worker_3_port);
  table_name  | shardid | shard_size | placement_node
---------------------------------------------------------------------
 backup_test2 |  102091 |          0 | Primary Node
 backup_test2 |  102095 |          0 | Primary Node
 backup_test2 |  102099 |          0 | Primary Node
 backup_test2 |  102103 |          0 | Primary Node
 backup_test2 |  102111 |          0 | Primary Node
 backup_test2 |  102115 |          0 | Primary Node
 backup_test2 |  102119 |          0 | Primary Node
 backup_test2 |  102123 |          0 | Primary Node
 backup_test2 |  102127 |          0 | Primary Node
 backup_test2 |  102131 |          0 | Primary Node
 backup_test2 |  102135 |          0 | Primary Node
 backup_test2 |  102139 |          0 | Primary Node
 backup_test2 |  102143 |          0 | Primary Node
 backup_test2 |  102063 |          0 | Clone Node
 backup_test2 |  102071 |          0 | Clone Node
 backup_test2 |  102107 |          0 | Clone Node
 backup_test2 |  102047 |          0 | Clone Node
 backup_test2 |  102051 |          0 | Clone Node
 backup_test2 |  102055 |          0 | Clone Node
 backup_test2 |  102059 |          0 | Clone Node
 backup_test2 |  102067 |          0 | Clone Node
 backup_test2 |  102075 |          0 | Clone Node
 backup_test2 |  102079 |          0 | Clone Node
 backup_test2 |  102083 |          0 | Clone Node
 backup_test2 |  102087 |          0 | Clone Node
(25 rows)

SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id_3);
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 6), original primary localhost:xxxxx (ID 5)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 5)
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 5)
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
NOTICE: Clone node localhost:xxxxx (ID 6) has been successfully promoted.
NOTICE: Updating metadata for promoted clone localhost:xxxxx (ID 6)
NOTICE: adjusting shard placements for primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: processing 13 shards for primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102091 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102095 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102099 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102103 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102111 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102115 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102119 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102123 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102127 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102131 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102135 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102139 from clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102143 from clone node GroupID 6
NOTICE: processing 12 shards for clone node GroupID 6
LOG: inserting DELETE shard record for shard public.backup_test2_102063 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102071 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102107 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102047 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102051 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102055 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102059 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102067 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102075 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102079 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102083 from primary node GroupID 5
LOG: inserting DELETE shard record for shard public.backup_test2_102087 from primary node GroupID 5
NOTICE: shard placement adjustment complete for primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: Clone node localhost:xxxxx (ID 6) metadata updated. It is now a primary
NOTICE: Clone node localhost:xxxxx (ID 6) successfully registered as a worker node
 citus_promote_clone_and_rebalance
---------------------------------------------------------------------

(1 row)
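
The LOG lines above show that placements ending up on the "wrong" side of the split are not dropped immediately; a deletion record is inserted for each of them instead. Assuming these records go through Citus' usual deferred-cleanup machinery (the pg_dist_cleanup catalog), the pending deletions could be inspected with a query along these lines; this is a sketch, not output captured by the test:

```
-- hypothetical follow-up query on the coordinator
SELECT object_name, node_group_id, policy_type
FROM pg_dist_cleanup
ORDER BY record_id;
```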

SET client_min_messages to DEFAULT;
SELECT COUNT(*) from backup_test;
 count
---------------------------------------------------------------------
    10
(1 row)

SELECT COUNT(*) from ref_table;
 count
---------------------------------------------------------------------
     5
(1 row)

SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 3 | 3 | localhost | 9071 | default | t | t | primary | default | t | t | f | 0
 4 | 4 | localhost | 9072 | default | f | f | unavailable | default | f | f | t | 2
 5 | 5 | localhost | 57639 | default | t | t | primary | default | t | t | f | 0
 6 | 6 | localhost | 9073 | default | t | t | primary | default | t | t | f | 0
(6 rows)

-- check the shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
 nodename  | nodeport | count
---------------------------------------------------------------------
 localhost |     9071 |    36
 localhost |     9073 |    17
 localhost |    57637 |    38
 localhost |    57638 |    44
 localhost |    57639 |    18
(5 rows)

set citus.shard_count to default;
-- cleanup
DROP TABLE backup_test;
DROP TABLE ref_table;
DROP TABLE backup_test2;
DROP TABLE ref_table2;
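
Condensing the test above, the coordinator-side part of a snapshot-based split comes down to three calls. The sketch below uses hypothetical host names, ports, and a hypothetical node ID; the rebalance_strategy and catchup_timeout_seconds arguments are optional and are shown only to illustrate the (integer, name, integer) signature, with a built-in strategy name assumed:

```
-- register an already-streaming replica of worker 10.0.0.2:5432 as a clone
SELECT citus_add_clone_node('10.0.0.3', 5432, '10.0.0.2', 5432);

-- optional: preview how shards would be divided between the pair
SELECT * FROM get_snapshot_based_node_split_plan('10.0.0.2', 5432, '10.0.0.3', 5432);

-- promote the clone (use the node ID returned by citus_add_clone_node) and rebalance
SELECT citus_promote_clone_and_rebalance(
    clone_nodeid => 4,
    rebalance_strategy => 'by_shard_count',
    catchup_timeout_seconds => 300);
```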
@@ -0,0 +1,45 @@
--
-- Test for negative scenarios in clone promotion functionality
-- We do not allow synchronous replicas to be added as clones
-- this test is to ensure that we do not allow this
--
SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 3 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 4 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
 5 | 1 | localhost | 9071 | default | f | t | secondary | second-cluster | f | t | f | 0
 6 | 2 | localhost | 9072 | default | f | t | secondary | second-cluster | f | t | f | 0
(4 rows)

SELECT master_remove_node('localhost', :follower_worker_1_port);
 master_remove_node
---------------------------------------------------------------------

(1 row)

SELECT master_remove_node('localhost', :follower_worker_2_port);
 master_remove_node
---------------------------------------------------------------------

(1 row)

-- this should fail as the replica is a synchronous replica that is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
-- this should fail as the replica is a synchronous replica that is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
SELECT * from pg_dist_node ORDER by nodeid;
 nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
 3 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
 4 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(2 rows)
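
The DETAIL hint above refers to the replication configuration on the worker the replica streams from, not to anything on the coordinator. A minimal way to turn the standby into an asynchronous replica, assuming it is the only entry in synchronous_standby_names, is to clear that setting on the upstream worker and reload; note that clearing the list affects every standby of that worker:

```
-- on the upstream worker node, not on the coordinator
ALTER SYSTEM SET synchronous_standby_names = '';
SELECT pg_reload_conf();
```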
@@ -691,11 +691,11 @@ SELECT
(1 row)

SELECT * FROM pg_dist_node ORDER BY nodeid;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
+ 3 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
- 11 | 9 | localhost | 57637 | default | t | t | primary | default | t | t
+ 11 | 9 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
- 12 | 10 | localhost | 57638 | default | t | t | primary | default | t | t
+ 12 | 10 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(3 rows)

-- check that mixed add/remove node commands work fine inside transaction

@@ -928,11 +928,11 @@ CONTEXT: PL/pgSQL function citus_internal.pg_dist_node_trigger_func() line XX a
INSERT INTO pg_dist_node (nodename, nodeport, groupid, noderole, nodecluster)
VALUES ('localhost', 5000, 1000, 'primary', 'olap');
ERROR: new row for relation "pg_dist_node" violates check constraint "primaries_are_only_allowed_in_the_default_cluster"
-DETAIL: Failing row contains (25, 1000, localhost, 5000, default, f, t, primary, olap, f, t).
+DETAIL: Failing row contains (25, 1000, localhost, 5000, default, f, t, primary, olap, f, t, f, 0).
UPDATE pg_dist_node SET nodecluster = 'olap'
WHERE nodeport = :worker_1_port;
ERROR: new row for relation "pg_dist_node" violates check constraint "primaries_are_only_allowed_in_the_default_cluster"
-DETAIL: Failing row contains (17, 14, localhost, 57637, default, f, t, primary, olap, f, t).
+DETAIL: Failing row contains (17, 14, localhost, 57637, default, f, t, primary, olap, f, t, f, 0).
-- check that you /can/ add a secondary node to a non-default cluster
SELECT groupid AS worker_2_group FROM pg_dist_node WHERE nodeport = :worker_2_port \gset
SELECT master_add_node('localhost', 8888, groupid => :worker_1_group, noderole => 'secondary', nodecluster=> 'olap');

@@ -955,9 +955,9 @@ SELECT master_add_node('localhost', 8887, groupid => :worker_1_group, noderole =
(1 row)

SELECT * FROM pg_dist_node WHERE nodeport=8887;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 27 | 14 | localhost | 8887 | default | f | t | secondary | thisisasixtyfourcharacterstringrepeatedfourtimestomake256chars. | f | t
+ 27 | 14 | localhost | 8887 | default | f | t | secondary | thisisasixtyfourcharacterstringrepeatedfourtimestomake256chars. | f | t | f | 0
(1 row)

-- don't remove the secondary and unavailable nodes, check that no commands are sent to

@@ -1036,9 +1036,9 @@ SELECT master_update_node(:worker_1_node, 'somehost', 9000);
(1 row)

SELECT * FROM pg_dist_node WHERE nodeid = :worker_1_node;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 17 | 14 | somehost | 9000 | default | f | t | primary | default | f | t
+ 17 | 14 | somehost | 9000 | default | f | t | primary | default | f | t | f | 0
(1 row)

-- cleanup

@@ -1049,9 +1049,9 @@ SELECT master_update_node(:worker_1_node, 'localhost', :worker_1_port);
(1 row)

SELECT * FROM pg_dist_node WHERE nodeid = :worker_1_node;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 17 | 14 | localhost | 57637 | default | f | t | primary | default | f | t
+ 17 | 14 | localhost | 57637 | default | f | t | primary | default | f | t | f | 0
(1 row)

SET client_min_messages TO ERROR;
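
The hunks above show the two new pg_dist_node columns, nodeisclone and nodeprimarynodeid, flowing through existing test output. On a live cluster, the registered-but-not-yet-promoted clones can be listed directly from the catalog with a query like this (sketch):

```
SELECT nodeid, nodename, nodeport, noderole, nodeisclone, nodeprimarynodeid
FROM pg_dist_node
WHERE nodeisclone;
```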
@@ -1640,10 +1640,16 @@ SELECT * FROM multi_extension.print_extension_changes();
---------------------------------------------------------------------
 function citus_rebalance_start(name,boolean,citus.shard_transfer_mode) bigint |
 function worker_last_saved_explain_analyze() TABLE(explain_analyze_output text, execution_duration double precision) |
+ | function citus_add_clone_node(text,integer,text,integer) integer
+ | function citus_add_clone_node_with_nodeid(text,integer,integer) integer
 | function citus_internal.citus_internal_copy_single_shard_placement(bigint,integer,integer,integer,citus.shard_transfer_mode) void
+ | function citus_promote_clone_and_rebalance(integer,name,integer) void
 | function citus_rebalance_start(name,boolean,citus.shard_transfer_mode,boolean,boolean) bigint
+ | function citus_remove_clone_node(text,integer) void
+ | function citus_remove_clone_node_with_nodeid(integer) void
+ | function get_snapshot_based_node_split_plan(text,integer,text,integer,name) TABLE(table_name regclass, shardid bigint, shard_size bigint, placement_node text)
 | function worker_last_saved_explain_analyze() TABLE(explain_analyze_output text, execution_duration double precision, execution_ntuples double precision, execution_nloops double precision)
-(5 rows)
+(11 rows)

DROP TABLE multi_extension.prev_objects, multi_extension.extension_diff;
-- show running version
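
print_extension_changes() now reports six new UDFs. After an upgrade, their installed argument lists can be double-checked with a catalog query along these lines (assuming the functions are installed in pg_catalog):

```
SELECT p.proname, pg_get_function_identity_arguments(p.oid) AS arguments
FROM pg_proc p
JOIN pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'pg_catalog'
  AND p.proname IN ('citus_add_clone_node',
                    'citus_add_clone_node_with_nodeid',
                    'citus_promote_clone_and_rebalance',
                    'citus_remove_clone_node',
                    'citus_remove_clone_node_with_nodeid',
                    'get_snapshot_based_node_split_plan')
ORDER BY p.proname;
```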
@@ -75,7 +75,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@@ -148,7 +148,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@@ -216,7 +216,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@@ -277,7 +277,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@@ -345,7 +345,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')

@@ -406,7 +406,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(3, 2, 'localhost', 57638, 'default', FALSE, FALSE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')
@@ -511,13 +511,13 @@ SELECT * FROM pg_dist_local_group;
(1 row)

SELECT * FROM pg_dist_node ORDER BY nodeid;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
+ 1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
- 2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
- 3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t
+ 3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t | f | 0
- 5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t
+ 5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t | f | 0
- 6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t
+ 6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t | f | 0
(5 rows)

SELECT * FROM pg_dist_partition WHERE logicalrelid::text LIKE 'mx_testing_schema%' ORDER BY logicalrelid::text;

@@ -650,13 +650,13 @@ SELECT * FROM pg_dist_local_group;
(1 row)

SELECT * FROM pg_dist_node ORDER BY nodeid;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f
+ 1 | 0 | localhost | 57636 | default | t | t | primary | default | t | f | f | 0
- 2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 2 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
- 3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t
+ 3 | 2 | localhost | 57638 | default | f | t | primary | default | f | t | f | 0
- 5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t
+ 5 | 1 | localhost | 8888 | default | f | t | secondary | default | f | t | f | 0
- 6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t
+ 6 | 1 | localhost | 8889 | default | f | t | secondary | second-cluster | f | t | f | 0
(5 rows)

SELECT * FROM pg_dist_partition WHERE logicalrelid::text LIKE 'mx_testing_schema%' ORDER BY logicalrelid::text;

@@ -1982,7 +1982,7 @@ SELECT unnest(activate_node_snapshot()) order by 1;
GRANT CREATE ON SCHEMA public TO pg_database_owner;
GRANT USAGE ON SCHEMA public TO PUBLIC;
GRANT USAGE ON SCHEMA public TO pg_database_owner;
-INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards) VALUES (5, 1, 'localhost', 8888, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'default', TRUE),(6, 1, 'localhost', 8889, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'second-cluster', TRUE),(1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE),(2, 1, 'localhost', 57637, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE),(8, 5, 'localhost', 57638, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE)
+INSERT INTO pg_dist_node (nodeid, groupid, nodename, nodeport, noderack, hasmetadata, metadatasynced, isactive, noderole, nodecluster, shouldhaveshards, nodeisclone, nodeprimarynodeid) VALUES (5, 1, 'localhost', 8888, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'default', TRUE, FALSE, 0),(6, 1, 'localhost', 8889, 'default', FALSE, FALSE, TRUE, 'secondary'::noderole, 'second-cluster', TRUE, FALSE, 0),(1, 0, 'localhost', 57636, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', FALSE, FALSE, 0),(2, 1, 'localhost', 57637, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0),(8, 5, 'localhost', 57638, 'default', TRUE, TRUE, TRUE, 'primary'::noderole, 'default', TRUE, FALSE, 0)
RESET ROLE
RESET ROLE
SELECT alter_role_if_exists('postgres', 'ALTER ROLE postgres SET lc_messages = ''C''')
@@ -86,10 +86,10 @@ FROM test.maintenance_worker();

SELECT *
FROM pg_dist_node;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
- 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t
+ 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(2 rows)

CREATE DATABASE db2;

@@ -147,10 +147,10 @@ FROM test.maintenance_worker();

SELECT *
FROM pg_dist_node;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t
+ 1 | 1 | localhost | 57637 | default | t | t | primary | default | t | t | f | 0
- 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t
+ 2 | 2 | localhost | 57638 | default | t | t | primary | default | t | t | f | 0
(2 rows)

SELECT groupid AS worker_1_group_id

@@ -1758,9 +1758,9 @@ BEGIN
INSERT INTO test (x) VALUES ($1);
END;$$;
SELECT * FROM pg_dist_node;
- nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+ nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
- 5 | 0 | localhost | 57636 | default | t | t | primary | default | t | t
+ 5 | 0 | localhost | 57636 | default | t | t | primary | default | t | t | f | 0
(1 row)

SELECT create_distributed_function('call_delegation(int)', '$1', 'test');
@@ -43,6 +43,8 @@ ORDER BY 1;
 function broadcast_intermediate_result(text,text)
 function check_distributed_deadlocks()
 function citus_activate_node(text,integer)
+function citus_add_clone_node(text,integer,text,integer)
+function citus_add_clone_node_with_nodeid(text,integer,integer)
 function citus_add_inactive_node(text,integer,integer,noderole,name)
 function citus_add_local_table_to_metadata(regclass,boolean)
 function citus_add_node(text,integer,integer,noderole,name)

@@ -156,6 +158,7 @@ ORDER BY 1;
 function citus_pause_node_within_txn(integer,boolean,integer)
 function citus_pid_for_gpid(bigint)
 function citus_prepare_pg_upgrade()
+function citus_promote_clone_and_rebalance(integer,name,integer)
 function citus_query_stats()
 function citus_rebalance_start(name,boolean,citus.shard_transfer_mode,boolean,boolean)
 function citus_rebalance_status(boolean)

@@ -163,6 +166,8 @@ ORDER BY 1;
 function citus_rebalance_wait()
 function citus_relation_size(regclass)
 function citus_remote_connection_stats()
+function citus_remove_clone_node(text,integer)
+function citus_remove_clone_node_with_nodeid(integer)
 function citus_remove_node(text,integer)
 function citus_run_local_command(text)
 function citus_schema_distribute(regnamespace)

@@ -244,6 +249,7 @@ ORDER BY 1;
 function get_rebalance_progress()
 function get_rebalance_table_shards_plan(regclass,real,integer,bigint[],boolean,name,real)
 function get_shard_id_for_distribution_column(regclass,"any")
+function get_snapshot_based_node_split_plan(text,integer,text,integer,name)
 function isolate_tenant_to_new_shard(regclass,"any",text,citus.shard_transfer_mode)
 function json_cat_agg(json)
 function jsonb_cat_agg(jsonb)

@@ -395,6 +401,6 @@ ORDER BY 1;
 view citus_tables
 view pg_dist_shard_placement
 view time_partitions
-(363 rows)
+(369 rows)

DROP TABLE extension_basic_types;
@@ -0,0 +1,2 @@
test: multi_add_node_from_backup
test: multi_add_node_from_backup_negative

@@ -3,6 +3,7 @@ test: follower_single_node
 test: multi_follower_select_statements
 test: multi_follower_dml
 test: multi_follower_configure_followers
+test: multi_add_node_from_backup_sync_replica

 # test that no tests leaked intermediate results. This should always be last
 test: ensure_no_intermediate_data_leak
@@ -62,6 +62,7 @@ my $MASTER_FOLLOWERDIR = 'master-follower';
 my $isolationtester = 0;
 my $vanillatest = 0;
 my $followercluster = 0;
+my $backupnodetest = 0;
 my $bindir = "";
 my $libdir = undef;
 my $pgxsdir = "";

@@ -100,6 +101,7 @@ GetOptions(
 'isolationtester' => \$isolationtester,
 'vanillatest' => \$vanillatest,
 'follower-cluster' => \$followercluster,
+'backupnodetest' => \$backupnodetest,
 'bindir=s' => \$bindir,
 'libdir=s' => \$libdir,
 'pgxsdir=s' => \$pgxsdir,

@@ -483,7 +485,14 @@ push(@pgOptions, "citus.max_adaptive_executor_pool_size=4");
 push(@pgOptions, "citus.defer_shard_delete_interval=-1");
 push(@pgOptions, "citus.repartition_join_bucket_count_per_node=2");
 push(@pgOptions, "citus.sort_returning='on'");
+if ($backupnodetest)
+{
+push(@pgOptions, "citus.shard_replication_factor=1");
+}
+else
+{
 push(@pgOptions, "citus.shard_replication_factor=2");
+}
 push(@pgOptions, "citus.node_connection_timeout=${connectionTimeout}");
 push(@pgOptions, "citus.explain_analyze_sort_method='taskId'");
 push(@pgOptions, "citus.enable_manual_changes_to_shards=on");

@@ -885,7 +894,7 @@ if ($valgrind)
 $serversAreShutdown = "FALSE";

 # enable synchronous replication if needed
-if ($followercluster)
+if ($followercluster && $backupnodetest == 0)
 {
 $synchronousReplication = "-c synchronous_standby_names='FIRST 1 (*)' -c synchronous_commit=remote_apply";
 }
@ -0,0 +1,156 @@
|
||||||
|
--
|
||||||
|
-- Test for adding a worker node from a backup
|
||||||
|
--
|
||||||
|
|
||||||
|
-- setup cluster
|
||||||
|
SELECT 1 FROM master_add_node('localhost', :worker_1_port);
|
||||||
|
SELECT 1 FROM master_add_node('localhost', :worker_2_port);
|
||||||
|
|
||||||
|
SELECT * from pg_dist_node;
|
||||||
|
|
||||||
|
|
||||||
|
-- create a distributed table and load data
|
||||||
|
CREATE TABLE backup_test(id int, value text);
|
||||||
|
SELECT create_distributed_table('backup_test', 'id', 'hash');
|
||||||
|
INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;
|
||||||
|
|
||||||
|
-- Colocation group 1: create two tables table1_colg1, table2_colg1 and in a colocation group
|
||||||
|
CREATE TABLE table1_colg1 (a int PRIMARY KEY);
|
||||||
|
SELECT create_distributed_table('table1_colg1', 'a', shard_count => 4, colocate_with => 'none');
|
||||||
|
|
||||||
|
CREATE TABLE table2_colg1 (b int PRIMARY KEY);
|
||||||
|
|
||||||
|
SELECT create_distributed_table('table2_colg1', 'b', colocate_with => 'table1_colg1');
|
||||||
|
|
||||||
|
-- Colocation group 2: create two tables table1_colg2, table2_colg2 and in a colocation group
|
||||||
|
CREATE TABLE table1_colg2 (a int PRIMARY KEY);
|
||||||
|
|
||||||
|
SELECT create_distributed_table('table1_colg2', 'a', shard_count => 4, colocate_with => 'none');
|
||||||
|
|
||||||
|
CREATE TABLE table2_colg2 (b int primary key);
|
||||||
|
|
||||||
|
SELECT create_distributed_table('table2_colg2', 'b', colocate_with => 'table1_colg2');
|
||||||
|
|
||||||
|
-- Colocation group 3: create two tables table1_colg3, table2_colg3 and in a colocation group
|
||||||
|
CREATE TABLE table1_colg3 (a int PRIMARY KEY);
|
||||||
|
|
||||||
|
SELECT create_distributed_table('table1_colg3', 'a', shard_count => 4, colocate_with => 'none');
|
||||||
|
|
||||||
|
CREATE TABLE table2_colg3 (b int primary key);
|
||||||
|
|
||||||
|
SELECT create_distributed_table('table2_colg3', 'b', colocate_with => 'table1_colg3');
|
||||||
|
|
||||||
|
-- Create reference tables with primary-foreign key relationships
|
||||||
|
|
||||||
|
CREATE TABLE customers (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
email TEXT UNIQUE NOT NULL );
|
||||||
|
|
||||||
|
CREATE TABLE orders (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
customer_id INTEGER NOT NULL REFERENCES customers(id),
|
||||||
|
order_date DATE NOT NULL DEFAULT CURRENT_DATE);
|
||||||
|
|
||||||
|
CREATE TABLE order_items (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
order_id INTEGER NOT NULL REFERENCES orders(id),
|
||||||
|
product_name TEXT NOT NULL,
|
||||||
|
quantity INTEGER NOT NULL,
|
||||||
|
price NUMERIC(10, 2) NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
SELECT create_reference_table('customers');
|
||||||
|
SELECT create_reference_table('orders');
|
||||||
|
SELECT create_reference_table('order_items');
|
||||||
|
|
||||||
|
-- INSERT SOME DATA
|
||||||
|
-- Insert 10 customers
|
||||||
|
INSERT INTO customers (name, email)
|
||||||
|
SELECT
|
||||||
|
'Customer ' || i,
|
||||||
|
'customer' || i || '@example.com'
|
||||||
|
FROM generate_series(1, 10) AS i;
|
||||||
|
|
||||||
|
-- Insert 30 orders: each customer gets 3 orders
|
||||||
|
INSERT INTO orders (customer_id, order_date)
|
||||||
|
SELECT
|
||||||
|
(i % 10) + 1, -- customer_id between 1 and 10
|
||||||
|
CURRENT_DATE - (i % 7)
|
||||||
|
FROM generate_series(1, 30) AS i;
|
||||||
|
|
||||||
|
-- Insert 90 order_items: each order has 3 items
|
||||||
|
INSERT INTO order_items (order_id, product_name, quantity, price)
|
||||||
|
SELECT
|
||||||
|
(i % 30) + 1, -- order_id between 1 and 30
|
||||||
|
'Product ' || (i % 5 + 1),
|
||||||
|
(i % 10) + 1,
|
||||||
|
round((random() * 100 + 10)::numeric, 2)
|
||||||
|
FROM generate_series(1, 90) AS i;
|
||||||
|
|
||||||
|
SELECT count(*) from customers;
|
||||||
|
SELECT count(*) from orders;
|
||||||
|
SELECT count(*) from order_items;
|
||||||
|
|
||||||
|
-- verify initial shard placement
|
||||||
|
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;

-- wait for the new node to be ready
SELECT pg_sleep(5);
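-- Illustrative sketch, not part of the original test: instead of a fixed sleep, the
-- catch-up could be observed from the primary worker via the stock PostgreSQL view
-- pg_stat_replication, for example:
--   SELECT application_name, state, replay_lsn FROM pg_stat_replication;
-- where state is expected to read 'streaming' once the clone has caught up.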

-- register the new node as a clone
-- the function returns the new node id
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

SELECT :clone_node_id;

SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;

SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_1_port, 'localhost', :follower_worker_1_port);

-- promote the clone and rebalance the shards
SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(:clone_node_id);
SET client_min_messages to DEFAULT;

SELECT shardid, nodename, 'PRIMARY' as node_type FROM pg_dist_shard_placement WHERE nodeport = :worker_1_port ORDER BY shardid;
SELECT shardid, nodename, 'CLONE' as node_type FROM pg_dist_shard_placement WHERE nodeport = :follower_worker_1_port ORDER BY shardid;
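-- Note added for readability (an assumption, not spelled out at this point in the test):
-- citus_promote_clone_and_rebalance can also be called with named arguments such as
-- clone_nodeid, catchup_timeout_seconds and rebalance_strategy; the negative-scenario
-- tests further below exercise those parameters explicitly.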

\c - - - :worker_1_port
SELECT 'WORKER' as node_type, * from pg_dist_node;
SELECT 'WORKER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
SELECT * from citus_tables;
SELECT id, value FROM backup_test ORDER BY id;
SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;

\c - - - :follower_worker_1_port
SELECT 'CLONE' as node_type, * from pg_dist_node;
SELECT 'CLONE' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
SELECT * from citus_tables;
SELECT id, value FROM backup_test ORDER BY id;
SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;
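-- Illustrative check, not part of the original test: while still connected to the
-- promoted clone, the promotion can also be confirmed at the PostgreSQL level with
-- the stock function pg_is_in_recovery(), which should now return false:
--   SELECT pg_is_in_recovery();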

\c - - - :master_port
SELECT 'MASTER' as node_type, * from pg_dist_node;
SELECT 'MASTER' as node_type, nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;
SELECT * from citus_tables;
SELECT id, value FROM backup_test ORDER BY id;
SELECT count(*) from customers;
SELECT count(*) from orders;
SELECT count(*) from order_items;

-- verify data
SELECT count(*) FROM backup_test;
SELECT id, value FROM backup_test ORDER BY id;

-- cleanup
DROP TABLE backup_test;

@@ -0,0 +1,105 @@
--
-- Test for negative scenarios in clone promotion functionality
--

-- try to add follower_worker_1 as a clone of worker_1 to the cluster
-- this should fail: the previous test already promoted follower_worker_1, so it is now a primary node
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

-- try to add follower_worker_2 as a clone of worker_1
-- this should fail as it is not a valid replica of worker_1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

-- create a distributed table and load data
CREATE TABLE backup_test(id int, value text);
SELECT create_distributed_table('backup_test', 'id', 'hash');
INSERT INTO backup_test SELECT g, 'test' || g FROM generate_series(1, 10) g;

-- Create reference table
CREATE TABLE ref_table(id int PRIMARY KEY);
SELECT create_reference_table('ref_table');
INSERT INTO ref_table SELECT i FROM generate_series(1, 5) i;

SELECT COUNT(*) from backup_test;
SELECT COUNT(*) from ref_table;

-- verify initial shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;

-- Try to add a replica of worker_2 as a clone of worker_1; this should fail as it is not a replica of worker_1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset

-- Test 1: Try to promote a non-existent clone node
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 99999);

-- Test 2: Try to promote a regular worker node (not a clone)
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1);

-- Test 3: Try to promote with an invalid (negative) timeout
SELECT citus_promote_clone_and_rebalance(clone_nodeid => 1,
                                         catchup_timeout_seconds => -100);

-- register the new node as a clone; this should pass
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;

SELECT :clone_node_id;

-- Test 4: Try to promote the clone with an invalid rebalance strategy name
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id, rebalance_strategy => 'invalid_strategy');

SELECT * from pg_dist_node ORDER by nodeid;

-- Test 5: Roll back the citus_promote_clone_and_rebalance transaction

BEGIN;
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
ROLLBACK;

-- Verify no data is lost after rolling back the transaction
SELECT COUNT(*) from backup_test;
SELECT COUNT(*) from ref_table;

SELECT * from pg_dist_node ORDER by nodeid;

-- Test 6: Try to add and promote a proper replica after rollback
SELECT master_add_node('localhost', :worker_3_port) AS nodeid_3 \gset
SELECT citus_add_clone_node('localhost', :follower_worker_3_port, 'localhost', :worker_3_port) AS clone_node_id_3 \gset

SET citus.shard_count = 100;
CREATE TABLE backup_test2(id int, value text);
SELECT create_distributed_table('backup_test2', 'id', 'hash');
INSERT INTO backup_test2 SELECT g, 'test' || g FROM generate_series(1, 10) g;

-- Create reference table
CREATE TABLE ref_table2(id int PRIMARY KEY);
SELECT create_reference_table('ref_table2');
INSERT INTO ref_table2 SELECT i FROM generate_series(1, 5) i;

SELECT * from get_snapshot_based_node_split_plan('localhost', :worker_3_port, 'localhost', :follower_worker_3_port);

SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id_3);
SET client_min_messages to DEFAULT;

SELECT COUNT(*) from backup_test;
SELECT COUNT(*) from ref_table;

SELECT * from pg_dist_node ORDER by nodeid;

-- check the shard placement
SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY nodename, nodeport ORDER BY nodename, nodeport;

SET citus.shard_count TO DEFAULT;

-- cleanup
DROP TABLE backup_test;
DROP TABLE ref_table;
DROP TABLE backup_test2;
DROP TABLE ref_table2;

@@ -0,0 +1,18 @@
--
-- Test for negative scenarios in clone promotion functionality
-- We do not allow synchronous replicas to be added as clones;
-- this test ensures such registrations are rejected
--
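-- Background note (an assumption about this schedule's setup, not stated in the test):
-- the followers are presumed to be listed in synchronous_standby_names on their
-- primaries, which is the stock PostgreSQL setting that makes a standby synchronous
-- and is the condition citus_add_clone_node is expected to reject below.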

SELECT * from pg_dist_node ORDER by nodeid;

SELECT master_remove_node('localhost', :follower_worker_1_port);
SELECT master_remove_node('localhost', :follower_worker_2_port);

-- this should fail because the replica is a synchronous replica, which is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset

-- this should fail because the replica is a synchronous replica, which is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset

SELECT * from pg_dist_node ORDER by nodeid;