mirror of https://github.com/citusdata/citus.git
201 lines
6.1 KiB
C
201 lines
6.1 KiB
C
/*-------------------------------------------------------------------------
 *
 * health_check.c
 *
 * UDFs to run health check operations by coordinating simple queries to test connectivity
 * between connection pairs in the cluster.
 *
 *
 * Copyright (c) Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "distributed/argutils.h"
|
|
#include "distributed/listutils.h"
|
|
#include "distributed/lock_graph.h"
|
|
#include "distributed/metadata_cache.h"
|
|
#include "distributed/remote_commands.h"
|
|
#include "distributed/tuplestore.h"
|
|
#include "distributed/worker_manager.h"
|
|
#include "utils/builtins.h"
|
|
|
|
/* trivial query that is executed on a node to verify the connection to it works */
#define CONNECTIVITY_CHECK_QUERY "SELECT 1"

/* size of the values / isNulls arrays used for each row of the health check result */
#define CONNECTIVITY_CHECK_COLUMNS 5
|
|
|
|
/* SQL-callable functions defined in this file */
PG_FUNCTION_INFO_V1(citus_check_cluster_node_health);
PG_FUNCTION_INFO_V1(citus_check_connection_to_node);
|
|
|
|
static bool CheckConnectionToNode(char *nodeName, uint32 nodePort);
|
|
|
|
static void StoreAllConnectivityChecks(Tuplestorestate *tupleStore,
|
|
TupleDesc tupleDescriptor);
|
|
static char * GetConnectivityCheckCommand(const char *nodeName, const uint32 nodePort);
|
|
|
|
|
|
/*
|
|
* citus_check_connection_to_node sends a simple query from a worker node to another
|
|
* node, and returns success status.
|
|
*/
|
|
Datum
|
|
citus_check_connection_to_node(PG_FUNCTION_ARGS)
|
|
{
|
|
CheckCitusVersion(ERROR);
|
|
|
|
char *nodeName = PG_GETARG_TEXT_TO_CSTRING(0);
|
|
uint32 nodePort = PG_GETARG_UINT32(1);
|
|
|
|
bool success = CheckConnectionToNode(nodeName, nodePort);
|
|
PG_RETURN_BOOL(success);
|
|
}
|
|
|
|
|
|
/*
|
|
* CheckConnectionToNode sends a simple query to a node and returns success status
|
|
*/
|
|
static bool
|
|
CheckConnectionToNode(char *nodeName, uint32 nodePort)
|
|
{
|
|
int connectionFlags = 0;
|
|
MultiConnection *connection = GetNodeConnection(connectionFlags, nodeName, nodePort);
|
|
int responseStatus = ExecuteOptionalRemoteCommand(connection,
|
|
CONNECTIVITY_CHECK_QUERY, NULL);
|
|
|
|
return responseStatus == RESPONSE_OKAY;
|
|
}
|
|
|
|
|
|
/*
|
|
* citus_check_cluster_node_health UDF performs connectivity checks from all the nodes to
|
|
* all the nodes, and report success status
|
|
*/
|
|
Datum
|
|
citus_check_cluster_node_health(PG_FUNCTION_ARGS)
|
|
{
|
|
CheckCitusVersion(ERROR);
|
|
|
|
TupleDesc tupleDescriptor = NULL;
|
|
Tuplestorestate *tupleStore = SetupTuplestore(fcinfo, &tupleDescriptor);
|
|
|
|
StoreAllConnectivityChecks(tupleStore, tupleDescriptor);
|
|
|
|
PG_RETURN_VOID();
|
|
}
|
|
|
|
|
|
/*
|
|
* StoreAllConnectivityChecks performs connectivity checks from all the nodes to all the
|
|
* nodes, and report success status.
|
|
*
|
|
* Algorithm is:
|
|
* for sourceNode in activeReadableNodeList:
|
|
* c = connectToNode(sourceNode)
|
|
* for targetNode in activeReadableNodeList:
|
|
* result = c.execute("SELECT citus_check_connection_to_node(targetNode.name, targetNode.port")
|
|
* emit sourceNode.name, sourceNode.port, targetNode.name, targetNode.port, result
|
|
*
|
|
* -- result -> true -> connection attempt from source to target succeeded
|
|
* -- result -> false -> connection attempt from source to target failed
|
|
* -- result -> NULL -> connection attempt from the current node to source node failed
|
|
*/
|
|
static void
|
|
StoreAllConnectivityChecks(Tuplestorestate *tupleStore, TupleDesc tupleDescriptor)
|
|
{
|
|
Datum values[CONNECTIVITY_CHECK_COLUMNS];
|
|
bool isNulls[CONNECTIVITY_CHECK_COLUMNS];
|
|
|
|
/*
|
|
* Get all the readable node list so that we will check connectivity to followers in
|
|
* the cluster as well.
|
|
*/
|
|
List *workerNodeList = ActiveReadableNodeList();
|
|
|
|
/* we want to check for connectivity in a deterministic order */
|
|
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
|
|
|
|
/*
|
|
* We iterate over the workerNodeList twice, for source and target worker nodes. This
|
|
* operation is safe for foreach_ptr macro, as long as we use different variables for
|
|
* each iteration.
|
|
*/
|
|
WorkerNode *sourceWorkerNode = NULL;
|
|
foreach_ptr(sourceWorkerNode, workerNodeList)
|
|
{
|
|
const char *sourceNodeName = sourceWorkerNode->workerName;
|
|
const int sourceNodePort = sourceWorkerNode->workerPort;
|
|
int32 connectionFlags = 0;
|
|
|
|
/* open a connection to the source node using the synchronous api */
|
|
MultiConnection *connectionToSourceNode =
|
|
GetNodeConnection(connectionFlags, sourceNodeName, sourceNodePort);
|
|
|
|
/* the second iteration over workerNodeList for the target worker nodes. */
|
|
WorkerNode *targetWorkerNode = NULL;
|
|
foreach_ptr(targetWorkerNode, workerNodeList)
|
|
{
|
|
const char *targetNodeName = targetWorkerNode->workerName;
|
|
const int targetNodePort = targetWorkerNode->workerPort;
|
|
|
|
char *connectivityCheckCommandToTargetNode =
|
|
GetConnectivityCheckCommand(targetNodeName, targetNodePort);
|
|
|
|
PGresult *result = NULL;
|
|
int executionResult =
|
|
ExecuteOptionalRemoteCommand(connectionToSourceNode,
|
|
connectivityCheckCommandToTargetNode,
|
|
&result);
|
|
|
|
/* get ready for the next tuple */
|
|
memset(values, 0, sizeof(values));
|
|
memset(isNulls, false, sizeof(isNulls));
|
|
|
|
values[0] = PointerGetDatum(cstring_to_text(sourceNodeName));
|
|
values[1] = Int32GetDatum(sourceNodePort);
|
|
values[2] = PointerGetDatum(cstring_to_text(targetNodeName));
|
|
values[3] = Int32GetDatum(targetNodePort);
|
|
|
|
/*
|
|
* If we could not send the query or the result was not ok, set success field
|
|
* to NULL. This may indicate connection errors to a worker node, however that
|
|
* node can potentially connect to other nodes.
|
|
*
|
|
* Therefore, we mark the success as NULL to indicate that the connectivity
|
|
* status is unknown.
|
|
*/
|
|
if (executionResult != RESPONSE_OKAY)
|
|
{
|
|
isNulls[4] = true;
|
|
}
|
|
else
|
|
{
|
|
int rowIndex = 0;
|
|
int columnIndex = 0;
|
|
values[4] = BoolGetDatum(ParseBoolField(result, rowIndex, columnIndex));
|
|
}
|
|
|
|
tuplestore_putvalues(tupleStore, tupleDescriptor, values, isNulls);
|
|
|
|
PQclear(result);
|
|
ForgetResults(connectionToSourceNode);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* GetConnectivityCheckCommand returns the command to check connections to a node
|
|
*/
|
|
static char *
|
|
GetConnectivityCheckCommand(const char *nodeName, const uint32 nodePort)
|
|
{
|
|
StringInfo connectivityCheckCommand = makeStringInfo();
|
|
appendStringInfo(connectivityCheckCommand,
|
|
"SELECT citus_check_connection_to_node('%s', %d)",
|
|
nodeName, nodePort);
|
|
|
|
return connectivityCheckCommand->data;
|
|
}
|