From 0b9acbeb3d3df5ea73789d13380c0c4ed14e3d5a Mon Sep 17 00:00:00 2001
From: Muhammad Usama
Date: Tue, 26 Aug 2025 21:48:07 +0300
Subject: [PATCH] Enhance clone node replication status messages (#8152)

- Downgrade replication lag reporting from NOTICE to DEBUG1 to reduce noise
  and improve regression test stability.
- Add hints to replication status error messages for better clarity.
- Update expected output files accordingly.
---
 .../distributed/utils/clonenode_utils.c       | 24 +++++++++++-------
 .../expected/multi_add_node_from_backup.out   |  5 ++--
 .../multi_add_node_from_backup_negative.out   | 25 ++++++++++---------
 ...ulti_add_node_from_backup_sync_replica.out | 10 +++++---
 4 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/src/backend/distributed/utils/clonenode_utils.c b/src/backend/distributed/utils/clonenode_utils.c
index 899358e3f..2d037068a 100644
--- a/src/backend/distributed/utils/clonenode_utils.c
+++ b/src/backend/distributed/utils/clonenode_utils.c
@@ -140,10 +140,10 @@ GetReplicationLag(WorkerNode *primaryWorkerNode, WorkerNode *replicaWorkerNode)
 	ForgetResults(replicaConnection);
 	CloseConnection(replicaConnection);
 
-	ereport(DEBUG1, (errmsg(
+	ereport(DEBUG2, (errmsg(
 				"successfully measured replication lag: primary LSN %s, clone LSN %s",
 				primary_lsn_str, replica_lsn_str)));
-	ereport(NOTICE, (errmsg("replication lag between %s:%d and %s:%d is %ld bytes",
+	ereport(DEBUG1, (errmsg("replication lag between %s:%d and %s:%d is %ld bytes",
 					primaryWorkerNode->workerName, primaryWorkerNode->workerPort,
 					replicaWorkerNode->workerName, replicaWorkerNode->workerPort,
 					lag_bytes)));
@@ -244,9 +244,9 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
 		freeaddrinfo(result);
 	}
 
-	ereport(NOTICE, (errmsg("checking replication for node %s (resolved IP: %s)",
+	ereport(NOTICE, (errmsg("checking replication status of clone node %s:%d",
 					cloneHostname,
-					resolvedIP ? resolvedIP : "unresolved")));
+					clonePort)));
 
 
 	/* Build query to check if clone is connected and get its sync state */
@@ -278,6 +278,11 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
 						cloneHostname);
 	}
 
+	ereport(DEBUG2, (errmsg("sending replication status check query: %s to primary %s:%d",
+					replicationCheckQuery->data,
+					primaryWorkerNode->workerName,
+					primaryWorkerNode->workerPort)));
+
 	int replicationCheckResultCode = SendRemoteCommand(primaryConnection,
 													   replicationCheckQuery->data);
 	if (replicationCheckResultCode == 0)
@@ -305,8 +310,9 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
 					primaryWorkerNode->workerName, primaryWorkerNode->
 					workerPort),
 				errdetail(
-					"The clone must be actively replicating from the specified primary node. "
-					"Check that the clone is running and properly configured for replication.")));
+					"The clone must be actively replicating from the specified primary node"),
+				errhint(
+					"Verify the clone is running and properly configured for replication")));
 	}
 
 	/* Check if clone is synchronous */
@@ -322,9 +328,9 @@ EnsureValidCloneMode(WorkerNode *primaryWorkerNode,
 				"cannot %s clone %s:%d as it is configured as a synchronous replica",
 				operation, cloneHostname, clonePort),
 			errdetail(
-				"Promoting a synchronous clone can cause data consistency issues. "
" - "Please configure it as an asynchronous replica first."))) - ; + "Promoting a synchronous clone can cause data consistency issues"), + errhint( + "Configure clone as an asynchronous replica"))); } } diff --git a/src/test/regress/expected/multi_add_node_from_backup.out b/src/test/regress/expected/multi_add_node_from_backup.out index 02627eaa6..1adeec830 100644 --- a/src/test/regress/expected/multi_add_node_from_backup.out +++ b/src/test/regress/expected/multi_add_node_from_backup.out @@ -167,7 +167,7 @@ SELECT pg_sleep(5); -- the function returns the new node id SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx -NOTICE: checking replication for node localhost (resolved IP: ::1) +NOTICE: checking replication status of clone node localhost:xxxxx NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous SELECT * from pg_dist_node ORDER by nodeid; nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid @@ -234,12 +234,11 @@ SET client_min_messages to 'LOG'; SELECT citus_promote_clone_and_rebalance(:clone_node_id); NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 3), original primary localhost:xxxxx (ID 1) NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx -NOTICE: checking replication for node localhost (resolved IP: ::1) +NOTICE: checking replication status of clone node localhost:xxxxx NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 1) NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 1) NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds) -NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx. NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote(). NOTICE: Clone node localhost:xxxxx (ID 3) has been successfully promoted. diff --git a/src/test/regress/expected/multi_add_node_from_backup_negative.out b/src/test/regress/expected/multi_add_node_from_backup_negative.out index d3916ecce..102e2131b 100644 --- a/src/test/regress/expected/multi_add_node_from_backup_negative.out +++ b/src/test/regress/expected/multi_add_node_from_backup_negative.out @@ -17,9 +17,10 @@ SELECT * from pg_dist_node ORDER by nodeid; -- this should fail as it is not a valid replica of worker_1 SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx -NOTICE: checking replication for node localhost (resolved IP: ::1) +NOTICE: checking replication status of clone node localhost:xxxxx ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx -DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication. 
+DETAIL: The clone must be actively replicating from the specified primary node
+HINT: Verify the clone is running and properly configured for replication
 SELECT * from pg_dist_node ORDER by nodeid;
  nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
@@ -71,9 +72,10 @@ SELECT nodename, nodeport, count(shardid) FROM pg_dist_shard_placement GROUP BY
 -- Try to add replica of worker_node2 as a clone of worker_node1
 SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_1_port) AS clone_node_id \gset
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
-DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
+DETAIL: The clone must be actively replicating from the specified primary node
+HINT: Verify the clone is running and properly configured for replication
 -- Test 1: Try to promote a non-existent clone node
 SELECT citus_promote_clone_and_rebalance(clone_nodeid =>99999);
 ERROR: Clone node with ID 99999 not found.
@@ -87,7 +89,7 @@ ERROR: Node localhost:xxxxx (ID 1) is not a valid clone or its primary node ID
 -- register the new node as a clone, this should pass
 SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
 SELECT * from pg_dist_node ORDER by nodeid;
  nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------
@@ -108,12 +110,11 @@ SELECT :clone_node_id;
 SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id, rebalance_strategy => 'invalid_strategy');
 NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
 NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 2)
 NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 2)
 NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
-NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
 NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
 NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
 NOTICE: Clone node localhost:xxxxx (ID 4) has been successfully promoted.
@@ -133,9 +134,10 @@ BEGIN;
 SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id);
 NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 4), original primary localhost:xxxxx (ID 2)
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 ERROR: clone localhost:xxxxx is not connected to primary localhost:xxxxx
-DETAIL: The clone must be actively replicating from the specified primary node. Check that the clone is running and properly configured for replication.
+DETAIL: The clone must be actively replicating from the specified primary node
+HINT: Verify the clone is running and properly configured for replication
 ROLLBACK;
 -- Verify no data is lost after rolling back the transaction
 SELECT COUNT(*) from backup_test;
@@ -163,7 +165,7 @@ SELECT * from pg_dist_node ORDER by nodeid;
 SELECT master_add_node('localhost', :worker_3_port) AS nodeid_3 \gset
 SELECT citus_add_clone_node('localhost', :follower_worker_3_port, 'localhost', :worker_3_port) AS clone_node_id_3 \gset
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
 set citus.shard_count = 100;
 CREATE TABLE backup_test2(id int, value text);
@@ -217,12 +219,11 @@ SET client_min_messages to 'LOG';
 SELECT citus_promote_clone_and_rebalance(clone_nodeid => :clone_node_id_3);
 NOTICE: Starting promotion process for clone node localhost:xxxxx (ID 6), original primary localhost:xxxxx (ID 5)
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 NOTICE: clone localhost:xxxxx is properly connected to primary localhost:xxxxx and is not synchronous
 NOTICE: Blocking writes on shards of original primary node localhost:xxxxx (group 5)
 NOTICE: Blocking all writes to worker node localhost:xxxxx (ID 5)
 NOTICE: Waiting for clone localhost:xxxxx to catch up with primary localhost:xxxxx (timeout: 300 seconds)
-NOTICE: replication lag between localhost:xxxxx and localhost:xxxxx is 0 bytes
 NOTICE: Clone localhost:xxxxx is now caught up with primary localhost:xxxxx.
 NOTICE: Attempting to promote clone localhost:xxxxx via pg_promote().
 NOTICE: Clone node localhost:xxxxx (ID 6) has been successfully promoted.
diff --git a/src/test/regress/expected/multi_add_node_from_backup_sync_replica.out b/src/test/regress/expected/multi_add_node_from_backup_sync_replica.out
index 1237b0454..4103f8f48 100644
--- a/src/test/regress/expected/multi_add_node_from_backup_sync_replica.out
+++ b/src/test/regress/expected/multi_add_node_from_backup_sync_replica.out
@@ -27,15 +27,17 @@ SELECT master_remove_node('localhost', :follower_worker_2_port);
 -- this should fail as the replica is a synchronous replica that is not allowed
 SELECT citus_add_clone_node('localhost', :follower_worker_1_port, 'localhost', :worker_1_port) AS clone_node_id \gset
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
-DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
+DETAIL: Promoting a synchronous clone can cause data consistency issues
+HINT: Configure clone as an asynchronous replica
 -- this should fail as the replica is a synchronous replica that is not allowed
 SELECT citus_add_clone_node('localhost', :follower_worker_2_port, 'localhost', :worker_2_port) AS clone_node_id \gset
 NOTICE: checking replication relationship between primary localhost:xxxxx and clone localhost:xxxxx
-NOTICE: checking replication for node localhost (resolved IP: ::1)
+NOTICE: checking replication status of clone node localhost:xxxxx
 ERROR: cannot add clone localhost:xxxxx as it is configured as a synchronous replica
-DETAIL: Promoting a synchronous clone can cause data consistency issues. Please configure it as an asynchronous replica first.
+DETAIL: Promoting a synchronous clone can cause data consistency issues
+HINT: Configure clone as an asynchronous replica
 SELECT * from pg_dist_node ORDER by nodeid;
  nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards | nodeisclone | nodeprimarynodeid
---------------------------------------------------------------------