Merge pull request #6057 from citusdata/fix_read_rep_error

Fix errors while promoting read-replicas to primary
Önder Kalacı 2022-07-13 15:14:21 +02:00 committed by GitHub
commit beebbfc9ff
8 changed files with 72 additions and 6 deletions
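The underlying issue: when a read replica of a Citus cluster is forked into a new primary, clients may still connect with citus.use_secondary_nodes=always even though the node is no longer in recovery. Several code paths counted only active primary nodes, so basic helper UDFs failed on such a node. The hunks below switch those paths to their readable-node variants and extend the follower regression tests accordingly. A minimal sketch of the scenario, using the connection settings from the tests in this commit (the port is test-specific):

-- connect to the promoted fork; it still runs with the read-replica settings:
--   \c "port=57636 dbname=regression options='-c citus.use_secondary_nodes=always -c citus.cluster_name=second-cluster'"
SELECT pg_is_in_recovery();    -- f: the fork is a primary, not a standby
SELECT citus_is_coordinator(); -- errored before this fix; returns t after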

@@ -2505,7 +2505,7 @@ SchemaOwnerName(Oid objectId)
 static bool
 HasMetadataWorkers(void)
 {
-	List *workerNodeList = ActivePrimaryNonCoordinatorNodeList(NoLock);
+	List *workerNodeList = ActiveReadableNonCoordinatorNodeList();
 	WorkerNode *workerNode = NULL;

 	foreach_ptr(workerNode, workerNodeList)
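HasMetadataWorkers() now iterates over all readable non-coordinator nodes rather than only primaries, so the check also succeeds when the visible nodes are secondaries. For context, whether a node carries synced metadata is tracked per row in pg_dist_node; a hedged way to inspect it (standard catalog columns, group 0 being the coordinator):

-- which non-coordinator nodes have the Citus metadata synced to them
SELECT nodename, nodeport, noderole, hasmetadata
FROM pg_dist_node
WHERE groupid != 0;  -- group 0 is the coordinator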

@@ -1687,7 +1687,7 @@ citus_is_coordinator(PG_FUNCTION_ARGS)
 	bool isCoordinator = false;

 	if (GetLocalGroupId() == COORDINATOR_GROUP_ID &&
-		ActivePrimaryNodeCount() > 0)
+		ActiveReadableNodeCount() > 0)
 	{
 		isCoordinator = true;
 	}
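With this change, citus_is_coordinator() returns true as soon as the local node is in the coordinator group and any readable node is known, instead of requiring an active primary. This is exactly what the follower tests below exercise:

-- on the promoted node, with citus.use_secondary_nodes=always still in effect
SELECT citus_is_coordinator();  -- t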

@@ -94,12 +94,12 @@ ActivePrimaryNonCoordinatorNodeCount(void)
 /*
- * ActivePrimaryNodeCount returns the number of groups with a primary in the cluster.
+ * ActiveReadableNodeCount returns the number of nodes in the cluster.
  */
 uint32
-ActivePrimaryNodeCount(void)
+ActiveReadableNodeCount(void)
 {
-	List *nodeList = ActivePrimaryNodeList(NoLock);
+	List *nodeList = ActiveReadableNodeList();
 	return list_length(nodeList);
 }
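The renamed counter includes secondaries, not just groups that have a primary. A hedged illustration of the distinction at the SQL level (assuming the standard pg_dist_node columns):

-- readable nodes = active primaries plus secondaries in the current cluster;
-- ActiveReadableNodeCount() is the C-level analogue of counting these
SELECT noderole, count(*)
FROM pg_dist_node
WHERE isactive
GROUP BY noderole;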

@@ -72,7 +72,7 @@ extern WorkerNode * WorkerGetRoundRobinCandidateNode(List *workerNodeList,
 													 uint64 shardId,
 													 uint32 placementIndex);
 extern uint32 ActivePrimaryNonCoordinatorNodeCount(void);
-extern uint32 ActivePrimaryNodeCount(void);
+extern uint32 ActiveReadableNodeCount(void);
 extern List * ActivePrimaryNonCoordinatorNodeList(LOCKMODE lockMode);
 extern List * ActivePrimaryNodeList(LOCKMODE lockMode);
 extern List * ActivePrimaryRemoteNodeList(LOCKMODE lockMode);

@@ -354,6 +354,33 @@ ERROR: writing to worker nodes is not currently allowed
 DETAIL: citus.use_secondary_nodes is set to 'always'
 SELECT * FROM citus_local_table ORDER BY a;
 ERROR: there is a shard placement in node group 0 but there are no nodes in that group
+\c "port=57636 dbname=regression options='-c\ citus.use_secondary_nodes=always\ -c\ citus.cluster_name=second-cluster'"
+-- when an existing read-replica is forked to become
+-- another primary node, we sometimes have to use citus.use_secondary_nodes=always
+-- even if the node is not in recovery mode. In those cases, allow LOCK
+-- commands on local / metadata tables, and also certain UDFs
+SHOW citus.use_secondary_nodes;
+ citus.use_secondary_nodes
+---------------------------------------------------------------------
+ always
+(1 row)
+
+SELECT pg_is_in_recovery();
+ pg_is_in_recovery
+---------------------------------------------------------------------
+ f
+(1 row)
+
+SELECT citus_is_coordinator();
+ citus_is_coordinator
+---------------------------------------------------------------------
+ t
+(1 row)
+
+BEGIN;
+LOCK TABLE pg_dist_node IN SHARE ROW EXCLUSIVE MODE;
+LOCK TABLE local IN SHARE ROW EXCLUSIVE MODE;
+COMMIT;
 \c -reuse-previous=off regression - - :master_port
 DROP TABLE the_table;
 DROP TABLE reference_table;

@@ -141,6 +141,25 @@ ORDER BY
  localhost | 9072
 (2 rows)

+-- basic helper utilities should work fine
+SELECT citus_is_coordinator();
+ citus_is_coordinator
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT count(*) FROM citus_lock_waits;
+ count
+---------------------------------------------------------------------
+     0
+(1 row)
+
+SELECT count(*) FROM citus_dist_stat_activity WHERE global_pid = citus_backend_gpid();
+ count
+---------------------------------------------------------------------
+     1
+(1 row)
+
 -- okay, now let's play with nodecluster. If we change the cluster of our follower node
 -- queries should start failing again, since there are no worker nodes in the new cluster
 \c "port=9070 dbname=regression options='-c\ citus.use_secondary_nodes=always\ -c\ citus.cluster_name=second-cluster'"

@@ -163,6 +163,20 @@ SELECT * FROM reference_table ORDER BY a;
 INSERT INTO citus_local_table (a, b, z) VALUES (1, 2, 3);
 SELECT * FROM citus_local_table ORDER BY a;

+\c "port=57636 dbname=regression options='-c\ citus.use_secondary_nodes=always\ -c\ citus.cluster_name=second-cluster'"
+
+-- when an existing read-replica is forked to become
+-- another primary node, we sometimes have to use citus.use_secondary_nodes=always
+-- even if the node is not in recovery mode. In those cases, allow LOCK
+-- commands on local / metadata tables, and also certain UDFs
+SHOW citus.use_secondary_nodes;
+SELECT pg_is_in_recovery();
+SELECT citus_is_coordinator();
+
+BEGIN;
+LOCK TABLE pg_dist_node IN SHARE ROW EXCLUSIVE MODE;
+LOCK TABLE local IN SHARE ROW EXCLUSIVE MODE;
+COMMIT;
 \c -reuse-previous=off regression - - :master_port
 DROP TABLE the_table;
 DROP TABLE reference_table;

@@ -89,6 +89,12 @@ FROM
 ORDER BY
 	node_name, node_port;

+-- basic helper utilities should work fine
+SELECT citus_is_coordinator();
+SELECT count(*) FROM citus_lock_waits;
+SELECT count(*) FROM citus_dist_stat_activity WHERE global_pid = citus_backend_gpid();
+
 -- okay, now let's play with nodecluster. If we change the cluster of our follower node
 -- queries should start failing again, since there are no worker nodes in the new cluster