citus/src/backend/distributed/utils/node_metadata.c

550 lines
15 KiB
C

/*
* node_metadata.c
* Functions that operate on pg_dist_node
*
* Copyright (c) 2012-2016, Citus Data, Inc.
*/
#include "postgres.h"
#include "miscadmin.h"
#include "funcapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/htup_details.h"
#include "access/skey.h"
#if (PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600)
#include "access/stratnum.h"
#else
#include "access/skey.h"
#endif
#include "access/tupmacs.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "commands/sequence.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/pg_dist_node.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_transaction.h"
#include "lib/stringinfo.h"
#include "storage/lock.h"
#include "storage/fd.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/relcache.h"
/*
 * GroupSize is the number of nodes NextGroupId() allows in the highest
 * numbered group before it asks for a new group id. Defaults to 1.
 */
int GroupSize = 1;
/* forward declarations of the file-local helpers defined further below */
static Datum GenerateNodeTuple(WorkerNode *workerNode);
static WorkerNode * FindWorkerNode(char *nodeName, int32 nodePort);
static uint32 NextGroupId(void);
static uint32 GetMaxGroupId(void);
static uint64 GetNodeCountInGroup(uint32 groupId);
static char * InsertNodeCommand(uint32 nodeid, char *nodename, int nodeport,
uint32 groupId);
static List * ParseWorkerNodeFile(const char *workerNodeFilename);
/* function info records for the Datum-returning entry points of this file */
PG_FUNCTION_INFO_V1(cluster_add_node);
PG_FUNCTION_INFO_V1(cluster_read_worker_file);
PG_FUNCTION_INFO_V1(master_get_new_nodeid);
PG_FUNCTION_INFO_V1(master_get_next_groupid);
/*
* cluster_add_node function adds a new node to the cluster. If the node already
* exists, the function returns with the information about the node. If not, the
* following prodecure is followed while adding a node.
* If the groupId is not explicitly given by the user, the function picks the
* group that the new node should be in with respect to GroupSize. Then, the
* new node is inserted into the local pg_dist_node.
*
* TODO: The following will be added in the near future.
* Lastly, the new node is inserted to all other nodes' pg_dist_node table.
*/
Datum
cluster_add_node(PG_FUNCTION_ARGS)
{
text *nodeName = PG_GETARG_TEXT_P(0);
int32 nodePort = PG_GETARG_INT32(1);
int32 groupId = PG_GETARG_INT32(2);
char *nodeNameString = text_to_cstring(nodeName);
Relation pgDistNode = NULL;
Datum nextNodeId = 0;
int nextNodeIdInt = 0;
char *insertCommand = NULL;
Datum returnData = 0;
WorkerNode *workerNode = NULL;
/* acquire a lock so that no one can do this concurrently */
pgDistNode = heap_open(DistNodeRelationId(), AccessExclusiveLock);
/* check if the node already exists in the cluster */
workerNode = FindWorkerNode(nodeNameString, nodePort);
if (workerNode != NULL)
{
/* fill return data and return */
returnData = GenerateNodeTuple(workerNode);
/* close the heap */
heap_close(pgDistNode, AccessExclusiveLock);
PG_RETURN_DATUM(returnData);
}
/* user lets Citus to decide on the group that the newly added node should be in */
if (groupId == 0)
{
groupId = NextGroupId();
}
else
{
uint maxGroupId = GetMaxGroupId();
if (groupId > maxGroupId)
{
ereport(ERROR, (errmsg("you cannot add a node to a non-existing group")));
}
}
/* generate the new node id from the sequence */
nextNodeId = master_get_new_nodeid(NULL);
nextNodeIdInt = DatumGetUInt32(nextNodeId);
InsertNodeRow(nextNodeIdInt, nodeNameString, nodePort, groupId);
insertCommand = InsertNodeCommand(nextNodeIdInt, nodeNameString, nodePort, groupId);
/* TODO: enable this once we have fully metadata sync */
/* SendCommandToWorkersInParallel(insertCommand); */
heap_close(pgDistNode, AccessExclusiveLock);
/* fetch the worker node, and generate the output */
workerNode = FindWorkerNode(nodeNameString, nodePort);
returnData = GenerateNodeTuple(workerNode);
PG_RETURN_CSTRING(returnData);
}
/*
 * cluster_read_worker_file parses the worker node file at the given path
 * (text argument) and adds every node listed in it through cluster_add_node,
 * letting Citus choose the group of each node.
 *
 * Returns true once all parsed nodes have been handed to cluster_add_node. If
 * ParseWorkerNodeFile yields an empty list, no node is added and the function
 * still returns true.
 */
Datum
cluster_read_worker_file(PG_FUNCTION_ARGS)
{
	text *filePath = PG_GETARG_TEXT_P(0);
	char *filePathCStr = text_to_cstring(filePath);
	List *workerNodes = ParseWorkerNodeFile(filePathCStr);
	ListCell *workerNodeCell = NULL;

	foreach(workerNodeCell, workerNodes)
	{
		WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);
		Datum workerNameDatum = PointerGetDatum(cstring_to_text(workerNode->workerName));

		/*
		 * cluster_add_node reads its second and third argument as int4, so
		 * build them as int4 datums; group id 0 means "pick a group for me".
		 */
		DirectFunctionCall3(cluster_add_node, workerNameDatum,
							Int32GetDatum(workerNode->workerPort),
							Int32GetDatum(0));
	}

	PG_RETURN_BOOL(true);
}
/*
 * GenerateNodeTuple forms a tuple shaped like a pg_dist_node row from the
 * node id, group id, name and port of the given worker node, and returns
 * that tuple as a composite datum.
 */
static Datum
GenerateNodeTuple(WorkerNode *workerNode)
{
	Datum columnValues[Natts_pg_dist_node];
	bool columnIsNull[Natts_pg_dist_node];
	Relation nodeRelation = NULL;
	HeapTuple nodeTuple = NULL;
	Datum result = 0;

	/* start from all-zero values with no column marked as NULL */
	memset(columnValues, 0, sizeof(columnValues));
	memset(columnIsNull, false, sizeof(columnIsNull));

	columnValues[Anum_pg_dist_node_nodeid - 1] = UInt32GetDatum(workerNode->nodeId);
	columnValues[Anum_pg_dist_node_groupid - 1] = UInt32GetDatum(workerNode->groupId);
	columnValues[Anum_pg_dist_node_nodename - 1] =
		CStringGetTextDatum(workerNode->workerName);
	columnValues[Anum_pg_dist_node_nodeport - 1] =
		UInt32GetDatum(workerNode->workerPort);

	/* pg_dist_node is opened only to obtain its tuple descriptor */
	nodeRelation = heap_open(DistNodeRelationId(), AccessShareLock);

	nodeTuple = heap_form_tuple(RelationGetDescr(nodeRelation), columnValues,
								columnIsNull);
	result = HeapTupleGetDatum(nodeTuple);

	heap_close(nodeRelation, AccessShareLock);

	return result;
}
/*
 * FindWorkerNode scans the worker node hash for an entry whose port equals
 * nodePort and whose name matches nodeName, compared case-insensitively over
 * at most WORKER_LENGTH characters. Returns the matching entry, or NULL when
 * the scan finishes without a match.
 */
static WorkerNode *
FindWorkerNode(char *nodeName, int32 nodePort)
{
	HASH_SEQ_STATUS scanState;
	HTAB *nodeHash = GetWorkerNodeHash();
	WorkerNode *candidate = NULL;

	hash_seq_init(&scanState, nodeHash);

	while ((candidate = hash_seq_search(&scanState)) != NULL)
	{
		if (strncasecmp(nodeName, candidate->workerName, WORKER_LENGTH) != 0)
		{
			continue;
		}

		if (nodePort != candidate->workerPort)
		{
			continue;
		}

		/* the scan is abandoned before its end, so terminate it explicitly */
		hash_seq_term(&scanState);

		return candidate;
	}

	return NULL;
}
/*
 * NextGroupId returns the group id to assign to a newly added node. When the
 * group with the highest id has at least one node and fewer than GroupSize
 * nodes, that id is returned. Otherwise (no nodes in that group, or it is
 * full) a new group id is obtained from master_get_next_groupid.
 */
static uint32
NextGroupId(void)
{
	uint32 currentMaxGroupId = GetMaxGroupId();
	uint64 memberCount = GetNodeCountInGroup(currentMaxGroupId);

	/* the highest group exists and still has room: reuse it */
	if (memberCount != 0 && memberCount < GroupSize)
	{
		return currentMaxGroupId;
	}

	return DatumGetUInt32(master_get_next_groupid(NULL));
}
/*
 * GetMaxGroupId walks every entry of the worker node hash and returns the
 * largest group id it sees, or 0 when the hash has no entries.
 */
static uint32
GetMaxGroupId(void)
{
	HASH_SEQ_STATUS scanState;
	HTAB *nodeHash = GetWorkerNodeHash();
	WorkerNode *node = NULL;
	uint32 highestGroupId = 0;

	hash_seq_init(&scanState, nodeHash);

	for (node = hash_seq_search(&scanState);
		 node != NULL;
		 node = hash_seq_search(&scanState))
	{
		uint32 groupIdOfNode = node->groupId;

		highestGroupId = (groupIdOfNode > highestGroupId) ? groupIdOfNode
						 : highestGroupId;
	}

	return highestGroupId;
}
/*
 * GetNodeCountInGroup walks every entry of the worker node hash and returns
 * how many of them carry the given group id.
 */
static uint64
GetNodeCountInGroup(uint32 groupId)
{
	HASH_SEQ_STATUS scanState;
	HTAB *nodeHash = GetWorkerNodeHash();
	WorkerNode *node = NULL;
	uint64 matchingNodes = 0;

	hash_seq_init(&scanState, nodeHash);

	while ((node = hash_seq_search(&scanState)) != NULL)
	{
		uint32 groupIdOfNode = node->groupId;

		if (groupIdOfNode != groupId)
		{
			continue;
		}

		matchingNodes++;
	}

	return matchingNodes;
}
/*
 * InsertNodeCommand builds the SQL command that inserts a row with the given
 * node id, node name, node port and group id into pg_dist_node. The node name
 * is emitted as a quoted SQL literal so that names containing quotes cannot
 * break out of the statement.
 *
 * Returns the command text held by a freshly made StringInfo.
 */
static char *
InsertNodeCommand(uint32 nodeid, char *nodename, int nodeport, uint32 groupId)
{
	StringInfo insertNodeCommand = makeStringInfo();
	char *quotedNodeName = quote_literal_cstr(nodename);

	/* one conversion per listed column, in the same order as the columns */
	appendStringInfo(insertNodeCommand,
					 "INSERT INTO pg_dist_node " /* TODO: add an ON CONFLICT clause */
					 "(nodeid, nodename, nodeport, groupid) "
					 "VALUES "
					 "(%u, %s, %d, %u);",
					 nodeid,
					 quotedNodeName,
					 nodeport,
					 groupId);

	return insertNodeCommand->data;
}
/*
* ParseWorkerNodeFile opens and parses the node name and node port from the
* specified configuration file.
* Note that this function is deprecated. Do not use this function for any new
* features.
*/
static List *
ParseWorkerNodeFile(const char *workerNodeFilename)
{
FILE *workerFileStream = NULL;
List *workerNodeList = NIL;
char workerNodeLine[MAXPGPATH];
char *workerFilePath = make_absolute_path(workerNodeFilename);
char *workerPatternTemplate = "%%%u[^# \t]%%*[ \t]%%%u[^# \t]%%*[ \t]%%%u[^# \t]";
char workerLinePattern[1024];
const int workerNameIndex = 0;
const int workerPortIndex = 1;
memset(workerLinePattern, '\0', sizeof(workerLinePattern));
workerFileStream = AllocateFile(workerFilePath, PG_BINARY_R);
if (workerFileStream == NULL)
{
if (errno == ENOENT)
{
ereport(DEBUG1, (errmsg("worker list file located at \"%s\" is not present",
workerFilePath)));
}
else
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open worker list file \"%s\": %m",
workerFilePath)));
}
return NIL;
}
/* build pattern to contain node name length limit */
snprintf(workerLinePattern, sizeof(workerLinePattern), workerPatternTemplate,
WORKER_LENGTH, MAX_PORT_LENGTH, WORKER_LENGTH);
while (fgets(workerNodeLine, sizeof(workerNodeLine), workerFileStream) != NULL)
{
const int workerLineLength = strnlen(workerNodeLine, MAXPGPATH);
WorkerNode *workerNode = NULL;
char *linePointer = NULL;
int32 nodePort = 5432; /* default port number */
int fieldCount = 0;
bool lineIsInvalid = false;
char nodeName[WORKER_LENGTH + 1];
char nodeRack[WORKER_LENGTH + 1];
char nodePortString[MAX_PORT_LENGTH + 1];
memset(nodeName, '\0', sizeof(nodeName));
//strlcpy(nodeRack, WORKER_DEFAULT_RACK, sizeof(nodeRack));
memset(nodePortString, '\0', sizeof(nodePortString));
if (workerLineLength == MAXPGPATH - 1)
{
ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("worker node list file line exceeds the maximum "
"length of %d", MAXPGPATH)));
}
/* trim trailing newlines preserved by fgets, if any */
linePointer = workerNodeLine + workerLineLength - 1;
while (linePointer >= workerNodeLine &&
(*linePointer == '\n' || *linePointer == '\r'))
{
*linePointer-- = '\0';
}
/* skip leading whitespace */
for (linePointer = workerNodeLine; *linePointer; linePointer++)
{
if (!isspace((unsigned char) *linePointer))
{
break;
}
}
/* if the entire line is whitespace or a comment, skip it */
if (*linePointer == '\0' || *linePointer == '#')
{
continue;
}
/* parse line; node name is required, but port and rack are optional */
fieldCount = sscanf(linePointer, workerLinePattern,
nodeName, nodePortString, nodeRack);
/* adjust field count for zero based indexes */
fieldCount--;
/* raise error if no fields were assigned */
if (fieldCount < workerNameIndex)
{
lineIsInvalid = true;
}
/* no special treatment for nodeName: already parsed by sscanf */
/* if a second token was specified, convert to integer port */
if (fieldCount >= workerPortIndex)
{
char *nodePortEnd = NULL;
errno = 0;
nodePort = strtol(nodePortString, &nodePortEnd, 10);
if (errno != 0 || (*nodePortEnd) != '\0' || nodePort <= 0)
{
lineIsInvalid = true;
}
}
if (lineIsInvalid)
{
ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not parse worker node line: %s",
workerNodeLine),
errhint("Lines in the worker node file must contain a valid "
"node name and, optionally, a positive port number. "
"Comments begin with a '#' character and extend to "
"the end of their line.")));
}
/* allocate worker node structure and set fields */
workerNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
strlcpy(workerNode->workerName, nodeName, WORKER_LENGTH);
//strlcpy(workerNode->workerRack, nodeRack, WORKER_LENGTH);
workerNode->workerPort = nodePort;
workerNodeList = lappend(workerNodeList, workerNode);
}
FreeFile(workerFileStream);
free(workerFilePath);
return workerNodeList;
}
/*
 * master_get_new_nodeid obtains the next value of the sequence named by
 * NODEID_SEQUENCE_NAME and returns it as the id for a node that is about to
 * be added.
 *
 * The sequence is advanced after switching the current user to the one
 * returned by CitusExtensionOwner(); the caller's user id and security
 * context are saved first and restored afterwards. The function performs no
 * permission check of its own on the caller.
 *
 * Please note that the caller is still responsible for recording the node
 * together with the returned id.
 */
Datum
master_get_new_nodeid(PG_FUNCTION_ARGS)
{
	Oid callerUserId = InvalidOid;
	int callerSecurityContext = 0;
	Datum newNodeIdDatum = 0;
	text *sequenceNameText = cstring_to_text(NODEID_SEQUENCE_NAME);
	Oid nodeIdSequenceOid = ResolveRelationId(sequenceNameText);

	/* advance the sequence as the extension owner, then switch back */
	GetUserIdAndSecContext(&callerUserId, &callerSecurityContext);
	SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);

	newNodeIdDatum = DirectFunctionCall1(nextval_oid,
										 ObjectIdGetDatum(nodeIdSequenceOid));

	SetUserIdAndSecContext(callerUserId, callerSecurityContext);

	PG_RETURN_DATUM(newNodeIdDatum);
}
/*
 * master_get_next_groupid obtains the next value of the sequence named by
 * GROUPID_SEQUENCE_NAME and returns it as the id for a group that is about to
 * be created.
 *
 * The sequence is advanced after switching the current user to the one
 * returned by CitusExtensionOwner(); the caller's user id and security
 * context are saved first and restored afterwards. The function performs no
 * permission check of its own on the caller.
 *
 * Please note that the caller is still responsible for recording nodes under
 * the returned group id.
 */
Datum
master_get_next_groupid(PG_FUNCTION_ARGS)
{
	Oid callerUserId = InvalidOid;
	int callerSecurityContext = 0;
	Datum newGroupIdDatum = 0;
	text *sequenceNameText = cstring_to_text(GROUPID_SEQUENCE_NAME);
	Oid groupIdSequenceOid = ResolveRelationId(sequenceNameText);

	/* advance the sequence as the extension owner, then switch back */
	GetUserIdAndSecContext(&callerUserId, &callerSecurityContext);
	SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);

	newGroupIdDatum = DirectFunctionCall1(nextval_oid,
										  ObjectIdGetDatum(groupIdSequenceOid));

	SetUserIdAndSecContext(callerUserId, callerSecurityContext);

	PG_RETURN_DATUM(newGroupIdDatum);
}