Enhance clone node replication status messages (#8152)

- Downgrade replication lag reporting from NOTICE to DEBUG to reduce
noise and improve regression test stability.
- Add hints to certain replication status messages for better clarity.
- Update expected output files accordingly.
pull/8155/head^2
Muhammad Usama 2025-08-26 21:48:07 +03:00 committed by GitHub
parent ce7ddc0d3d
commit 62e5fcfe09
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 36 additions and 28 deletions

View File

@ -140,10 +140,10 @@ GetReplicationLag(WorkerNode *primaryWorkerNode, WorkerNode *replicaWorkerNode)
ForgetResults(replicaConnection);
CloseConnection(replicaConnection);
ereport(DEBUG1, (errmsg(
ereport(DEBUG2, (errmsg(
"successfully measured replication lag: primary LSN %s, clone LSN %s",
primary_lsn_str, replica_lsn_str)));
ereport(NOTICE, (errmsg("replication lag between %s:%d and %s:%d is %ld bytes",
ereport(DEBUG1, (errmsg("replication lag between %s:%d and %s:%d is %ld bytes",
primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
replicaWorkerNode->workerName, replicaWorkerNode->workerPort,
lag_bytes)));
@ -244,9 +244,9 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
freeaddrinfo(result);
}
ereport(NOTICE, (errmsg("checking replication for node %s (resolved IP: %s)",
ereport(NOTICE, (errmsg("checking replication status of clone node %s:%d",
cloneHostname,
resolvedIP ? resolvedIP : "unresolved")));
clonePort)));
/* Build query to check if clone is connected and get its sync state */
@ -278,6 +278,11 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
cloneHostname);
}
ereport(DEBUG2, (errmsg("sending replication status check query: %s to primary %s:%d",
replicationCheckQuery->data,
primaryWorkerNode->workerName,
primaryWorkerNode->workerPort)));
int replicationCheckResultCode = SendRemoteCommand(primaryConnection,
replicationCheckQuery->data);
if (replicationCheckResultCode == 0)
@ -305,8 +310,9 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
primaryWorkerNode->workerName, primaryWorkerNode->
workerPort),
errdetail(
"The clone must be actively replicating from the specified primary node. "
"Check that the clone is running and properly configured for replication.")));
"The clone must be actively replicating from the specified primary node"),
errhint(
"Verify the clone is running and properly configured for replication")));
}
/* Check if clone is synchronous */
@ -322,9 +328,9 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
"cannot %s clone %s:%d as it is configured as a synchronous replica",
operation, cloneHostname, clonePort),
errdetail(
"Promoting a synchronous clone can cause data consistency issues. "
"Please configure it as an asynchronous replica first.")))
;
"Promoting a synchronous clone can cause data consistency issues"),
errhint(
"Configure clone as an asynchronous replica")));
}
}

View File

@ -167,7 +167,7 @@ SELECT pg_sleep(5);
-- the function returns the new node id
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
SELECT * from pg_dist_node ORDER by nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
@ -234,12 +234,11 @@ SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(:clone_node_id);
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 3), original primary localhost:xxxxx (ID 1)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 1)
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 1)
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
NOTICE: Clone node localhost:xxxxx (ID 3) has been successfully promoted.

View File

@ -17,9 +17,10 @@ SELECT * from pg_dist_node ORDER by nodeid;
-- this should fail as it is not a valid replica of worker_1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
DETAIL: The clone must be actively replicating from the specified primary node
HINT: Verify the clone is running and properly configured for replication
SELECT * from pg_dist_node ORDER by nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
@ -71,9 +72,10 @@ SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY
-- Try to add replica of worker_node2 as a clone of worker_node1
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
DETAIL: The clone must be actively replicating from the specified primary node
HINT: Verify the clone is running and properly configured for replication
-- Test 1: Try to promote a non-existent clone node
SELECT citus_promote_clone_and_rebalance(clone_nodeid =>99999);
ERROR: Clone node with ID 99999 not found.
@ -87,7 +89,7 @@ ERROR: Node localhost:xxxxx (ID 1) is not a valid clone or its primary node ID
-- register the new node as a clone, This should pass
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
SELECT * from pg_dist_node ORDER by nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
@ -108,12 +110,11 @@ SELECT :clone_node_id;
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id, rebalance_strategy => 'invalid_strategy');
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 2)
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 2)
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
NOTICE: Clone node localhost:xxxxx (ID 4) has been successfully promoted.
@ -133,9 +134,10 @@ BEGIN;
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
DETAIL: The clone must be actively replicating from the specified primary node
HINT: Verify the clone is running and properly configured for replication
ROLLBACK;
-- Verify no data is lost after rooling back the transaction
SELECT COUNT(*) from backup_test;
@ -163,7 +165,7 @@ SELECT * from pg_dist_node ORDER by nodeid;
SELECT master_add_node('localhost', :worker_3_port) AS nodeid_3 \gset
SELECT citus_add_clone_node('localhost', :follower_worker_3_port, 'localhost', :worker_3_port) AS clone_node_id_3 \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
set citus.shard_count = 100;
CREATE TABLE backup_test2(id int, value text);
@ -217,12 +219,11 @@ SET client_min_messages to 'LOG';
SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id_3);
NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 6), original primary localhost:xxxxx (ID 5)
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 5)
NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 5)
NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
NOTICE: Clone node localhost:xxxxx (ID 6) has been successfully promoted.

View File

@ -27,15 +27,17 @@ SELECT master_remove_node('localhost', :follower_worker_2_port);
-- this should fail as the replica is a synchronous replica that is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
DETAIL: Promoting a synchronous clone can cause data consistency issues
HINT: Configure clone as an asynchronous replica
-- this should fail as the replica is a synchronous replica that is not allowed
SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
NOTICE: checking replication for node localhost (resolved IP: ::1)
NOTICE: checking replication status of clone node localhost:xxxxx
ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
DETAIL: Promoting a synchronous clone can cause data consistency issues
HINT: Configure clone as an asynchronous replica
SELECT * from pg_dist_node ORDER by nodeid;
nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------