/*
 * node_metadata.c
 *    Functions that operate on pg_dist_node
 *
 * Copyright (c) 2012-2016, Citus Data, Inc.
 */
#include "postgres.h"
#include "miscadmin.h"
#include "funcapi.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/htup_details.h"
#include "access/skey.h"
#if (PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600)
#include "access/stratnum.h"
#else
#include "access/skey.h"
#endif
#include "access/tupmacs.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "commands/sequence.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/pg_dist_node.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_transaction.h"
#include "lib/stringinfo.h"
#include "storage/lock.h"
#include "storage/fd.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/relcache.h"


/* default group size */
int GroupSize = 1;


/* local function forward declarations */
static Datum GenerateNodeTuple(WorkerNode *workerNode);
static WorkerNode * FindWorkerNode(char *nodeName, int32 nodePort);
static uint32 NextGroupId(void);
static uint32 GetMaxGroupId(void);
static uint64 GetNodeCountInGroup(uint32 groupId);
static char * InsertNodeCommand(uint32 nodeid, char *nodename, int nodeport,
								uint32 groupId);
static List * ParseWorkerNodeFile(const char *workerNodeFilename);

/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(cluster_add_node);
PG_FUNCTION_INFO_V1(cluster_read_worker_file);
PG_FUNCTION_INFO_V1(master_get_new_nodeid);
PG_FUNCTION_INFO_V1(master_get_next_groupid);


/*
 * cluster_add_node adds a new node to the cluster. If the node already
 * exists, the function returns the information about the existing node.
 * Otherwise, the following procedure is followed while adding a node:
 * if the groupId is not explicitly given by the user, the function picks
 * the group that the new node should be in with respect to GroupSize.
 * Then, the new node is inserted into the local pg_dist_node.
 *
 * TODO: The following will be added in the near future.
 * Lastly, the new node is inserted into all other nodes' pg_dist_node tables.
 */
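/*
 * For illustration only, a sketch of calling the UDF from SQL, assuming it
 * is exposed with the signature (nodename text, nodeport integer, groupid
 * integer); the host name below is hypothetical, and groupid = 0 lets
 * Citus pick the group:
 *
 *   SELECT * FROM cluster_add_node('worker-1.example.com', 5432, 0);
 */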
Datum
cluster_add_node(PG_FUNCTION_ARGS)
{
	text *nodeName = PG_GETARG_TEXT_P(0);
	int32 nodePort = PG_GETARG_INT32(1);
	int32 groupId = PG_GETARG_INT32(2);
	char *nodeNameString = text_to_cstring(nodeName);

	Relation pgDistNode = NULL;
	Datum nextNodeId = 0;
	int nextNodeIdInt = 0;
	char *insertCommand = NULL;
	Datum returnData = 0;
	WorkerNode *workerNode = NULL;

	/* acquire a lock so that no one can do this concurrently */
	pgDistNode = heap_open(DistNodeRelationId(), AccessExclusiveLock);

	/* check if the node already exists in the cluster */
	workerNode = FindWorkerNode(nodeNameString, nodePort);
	if (workerNode != NULL)
	{
		/* fill return data and return */
		returnData = GenerateNodeTuple(workerNode);

		/* close the heap */
		heap_close(pgDistNode, AccessExclusiveLock);

		PG_RETURN_DATUM(returnData);
	}

	/* the user lets Citus decide which group the newly added node joins */
	if (groupId == 0)
	{
		groupId = NextGroupId();
	}
	else
	{
		uint32 maxGroupId = GetMaxGroupId();

		if ((uint32) groupId > maxGroupId)
		{
			ereport(ERROR, (errmsg("you cannot add a node to a non-existing group")));
		}
	}

	/* generate the new node id from the sequence */
	nextNodeId = master_get_new_nodeid(NULL);
	nextNodeIdInt = DatumGetUInt32(nextNodeId);

	InsertNodeRow(nextNodeIdInt, nodeNameString, nodePort, groupId);

	insertCommand = InsertNodeCommand(nextNodeIdInt, nodeNameString, nodePort, groupId);

	/* TODO: enable this once we have full metadata sync */
	/* SendCommandToWorkersInParallel(insertCommand); */

	heap_close(pgDistNode, AccessExclusiveLock);

	/* fetch the worker node, and generate the output */
	workerNode = FindWorkerNode(nodeNameString, nodePort);
	returnData = GenerateNodeTuple(workerNode);

	PG_RETURN_DATUM(returnData);
}


/*
 * cluster_read_worker_file parses the worker nodes from the file at the
 * given path and adds each of them to the cluster by calling
 * cluster_add_node.
 */
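/*
 * For illustration only, a hypothetical invocation from SQL; the file path
 * below is made up:
 *
 *   SELECT cluster_read_worker_file('/path/to/worker_list');
 */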
Datum
cluster_read_worker_file(PG_FUNCTION_ARGS)
{
	text *filePath = PG_GETARG_TEXT_P(0);
	char *filePathCStr = text_to_cstring(filePath);

	ListCell *workerNodeCell = NULL;
	List *workerNodes = ParseWorkerNodeFile(filePathCStr);

	foreach(workerNodeCell, workerNodes)
	{
		WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);
		Datum workerNameDatum = PointerGetDatum(cstring_to_text(workerNode->workerName));

		/* groupId 0 makes cluster_add_node pick the group itself */
		DirectFunctionCall3(cluster_add_node, workerNameDatum,
							Int32GetDatum(workerNode->workerPort),
							Int32GetDatum(0));
	}

	PG_RETURN_BOOL(true);
}


/*
 * GenerateNodeTuple takes a worker node and returns a heap tuple for the
 * given worker node, as a Datum.
 */
static Datum
GenerateNodeTuple(WorkerNode *workerNode)
{
	Relation pgDistNode = NULL;
	TupleDesc tupleDescriptor = NULL;
	HeapTuple heapTuple = NULL;
	Datum nodeDatum = 0;
	Datum values[Natts_pg_dist_node];
	bool isNulls[Natts_pg_dist_node];

	/* form the new node tuple */
	memset(values, 0, sizeof(values));
	memset(isNulls, false, sizeof(isNulls));

	values[Anum_pg_dist_node_nodeid - 1] = UInt32GetDatum(workerNode->nodeId);
	values[Anum_pg_dist_node_groupid - 1] = UInt32GetDatum(workerNode->groupId);
	values[Anum_pg_dist_node_nodename - 1] = CStringGetTextDatum(workerNode->workerName);
	values[Anum_pg_dist_node_nodeport - 1] = UInt32GetDatum(workerNode->workerPort);

	/* open pg_dist_node to read its tuple descriptor */
	pgDistNode = heap_open(DistNodeRelationId(), AccessShareLock);

	/* generate the tuple */
	tupleDescriptor = RelationGetDescr(pgDistNode);
	heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);

	nodeDatum = HeapTupleGetDatum(heapTuple);

	/* close the relation */
	heap_close(pgDistNode, AccessShareLock);

	return nodeDatum;
}


/*
 * FindWorkerNode iterates over the worker nodes and returns the worker node
 * if it already exists. Otherwise, the function returns NULL.
 */
static WorkerNode *
FindWorkerNode(char *nodeName, int32 nodePort)
{
	WorkerNode *workerNode = NULL;
	HTAB *workerNodeHash = GetWorkerNodeHash();
	HASH_SEQ_STATUS status;

	hash_seq_init(&status, workerNodeHash);

	while ((workerNode = hash_seq_search(&status)) != NULL)
	{
		if (strncasecmp(nodeName, workerNode->workerName, WORKER_LENGTH) == 0 &&
			nodePort == workerNode->workerPort)
		{
			/* we need to terminate the scan since we break */
			hash_seq_term(&status);
			break;
		}
	}

	return workerNode;
}


/*
 * NextGroupId returns the next group id that can be assigned to a node. If
 * the current maximum group is full (i.e., it has at least GroupSize nodes),
 * a new group id is generated and returned. Otherwise, the current maximum
 * group id is returned.
 */
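/*
 * For example, with GroupSize = 2: while the maximum group holds a single
 * node, its group id is handed out again; once that group holds two nodes,
 * a fresh group id is drawn from the groupid sequence instead.
 */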
static uint32
NextGroupId(void)
{
	uint32 nextGroupId = 0;
	uint32 maxGroupId = GetMaxGroupId();
	uint64 nodeCountInMaxGroup = GetNodeCountInGroup(maxGroupId);

	if (nodeCountInMaxGroup == 0 || nodeCountInMaxGroup >= (uint64) GroupSize)
	{
		Datum nextGroupIdDatum = master_get_next_groupid(NULL);

		nextGroupId = DatumGetUInt32(nextGroupIdDatum);
	}
	else
	{
		nextGroupId = maxGroupId;
	}

	return nextGroupId;
}


/*
 * GetMaxGroupId iterates over the worker node hash and returns the maximum
 * group id found in it.
 */
static uint32
GetMaxGroupId(void)
{
	uint32 maxGroupId = 0;
	WorkerNode *workerNode = NULL;
	HTAB *workerNodeHash = GetWorkerNodeHash();
	HASH_SEQ_STATUS status;

	hash_seq_init(&status, workerNodeHash);

	while ((workerNode = hash_seq_search(&status)) != NULL)
	{
		uint32 workerNodeGroupId = workerNode->groupId;

		if (workerNodeGroupId > maxGroupId)
		{
			maxGroupId = workerNodeGroupId;
		}
	}

	return maxGroupId;
}


/*
 * GetNodeCountInGroup iterates over the worker node hash and returns the
 * number of nodes that have the given groupId.
 */
static uint64
GetNodeCountInGroup(uint32 groupId)
{
	uint64 elementCountInGroup = 0;
	WorkerNode *workerNode = NULL;
	HTAB *workerNodeHash = GetWorkerNodeHash();
	HASH_SEQ_STATUS status;

	hash_seq_init(&status, workerNodeHash);

	while ((workerNode = hash_seq_search(&status)) != NULL)
	{
		uint32 workerNodeGroupId = workerNode->groupId;

		if (workerNodeGroupId == groupId)
		{
			elementCountInGroup += 1;
		}
	}

	return elementCountInGroup;
}


/*
 * InsertNodeCommand generates the command that can be executed to insert the
 * given node's metadata into another node's pg_dist_node table.
 */
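/*
 * For example, with illustrative values, InsertNodeCommand(4, "worker-1",
 * 5432, 2) produces:
 *
 *   INSERT INTO pg_dist_node (nodeid, nodename, nodeport, groupid)
 *   VALUES (4, 'worker-1', 5432, 2);
 */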
static char *
InsertNodeCommand(uint32 nodeid, char *nodename, int nodeport, uint32 groupId)
{
	StringInfo insertNodeCommand = makeStringInfo();

	appendStringInfo(insertNodeCommand,
					 "INSERT INTO pg_dist_node " /* TODO: add an ON CONFLICT clause */
					 "(nodeid, nodename, nodeport, groupid) "
					 "VALUES "
					 "(%u, '%s', %d, %u);",
					 nodeid,
					 nodename,
					 nodeport,
					 groupId);

	return insertNodeCommand->data;
}


/*
 * ParseWorkerNodeFile opens and parses the node name and node port from the
 * specified configuration file.
 * Note that this function is deprecated. Do not use this function for any
 * new features.
 */
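/*
 * A sample worker node file (illustrative): each line carries a node name,
 * an optional port (5432 by default), and an optional rack; a '#' starts a
 * comment that extends to the end of the line.
 *
 *   # hostname   port   rack
 *   worker-1     5432   rack-1
 *   worker-2            # port defaults to 5432
 */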
static List *
ParseWorkerNodeFile(const char *workerNodeFilename)
{
	FILE *workerFileStream = NULL;
	List *workerNodeList = NIL;
	char workerNodeLine[MAXPGPATH];
	char *workerFilePath = make_absolute_path(workerNodeFilename);
	char *workerPatternTemplate = "%%%u[^# \t]%%*[ \t]%%%u[^# \t]%%*[ \t]%%%u[^# \t]";
	char workerLinePattern[1024];
	const int workerNameIndex = 0;
	const int workerPortIndex = 1;

	memset(workerLinePattern, '\0', sizeof(workerLinePattern));

	workerFileStream = AllocateFile(workerFilePath, PG_BINARY_R);
	if (workerFileStream == NULL)
	{
		if (errno == ENOENT)
		{
			ereport(DEBUG1, (errmsg("worker list file located at \"%s\" is not present",
									workerFilePath)));
		}
		else
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not open worker list file \"%s\": %m",
								   workerFilePath)));
		}

		return NIL;
	}

	/* build pattern to contain node name length limit */
	snprintf(workerLinePattern, sizeof(workerLinePattern), workerPatternTemplate,
			 WORKER_LENGTH, MAX_PORT_LENGTH, WORKER_LENGTH);

	while (fgets(workerNodeLine, sizeof(workerNodeLine), workerFileStream) != NULL)
	{
		const int workerLineLength = strnlen(workerNodeLine, MAXPGPATH);
		WorkerNode *workerNode = NULL;
		char *linePointer = NULL;
		int32 nodePort = 5432; /* default port number */
		int fieldCount = 0;
		bool lineIsInvalid = false;
		char nodeName[WORKER_LENGTH + 1];
		char nodeRack[WORKER_LENGTH + 1];
		char nodePortString[MAX_PORT_LENGTH + 1];

		memset(nodeName, '\0', sizeof(nodeName));
		memset(nodeRack, '\0', sizeof(nodeRack));
		/* strlcpy(nodeRack, WORKER_DEFAULT_RACK, sizeof(nodeRack)); */
		memset(nodePortString, '\0', sizeof(nodePortString));

		if (workerLineLength == MAXPGPATH - 1)
		{
			ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
							errmsg("worker node list file line exceeds the maximum "
								   "length of %d", MAXPGPATH)));
		}

		/* trim trailing newlines preserved by fgets, if any */
		linePointer = workerNodeLine + workerLineLength - 1;
		while (linePointer >= workerNodeLine &&
			   (*linePointer == '\n' || *linePointer == '\r'))
		{
			*linePointer-- = '\0';
		}

		/* skip leading whitespace */
		for (linePointer = workerNodeLine; *linePointer; linePointer++)
		{
			if (!isspace((unsigned char) *linePointer))
			{
				break;
			}
		}

		/* if the entire line is whitespace or a comment, skip it */
		if (*linePointer == '\0' || *linePointer == '#')
		{
			continue;
		}

		/* parse line; node name is required, but port and rack are optional */
		fieldCount = sscanf(linePointer, workerLinePattern,
							nodeName, nodePortString, nodeRack);

		/* adjust field count for zero-based indexes */
		fieldCount--;

		/* mark the line invalid if no fields were assigned */
		if (fieldCount < workerNameIndex)
		{
			lineIsInvalid = true;
		}

		/* no special treatment for nodeName: already parsed by sscanf */

		/* if a second token was specified, convert it to an integer port */
		if (fieldCount >= workerPortIndex)
		{
			char *nodePortEnd = NULL;

			errno = 0;
			nodePort = (int32) strtol(nodePortString, &nodePortEnd, 10);

			if (errno != 0 || (*nodePortEnd) != '\0' || nodePort <= 0)
			{
				lineIsInvalid = true;
			}
		}

		if (lineIsInvalid)
		{
			ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
							errmsg("could not parse worker node line: %s",
								   workerNodeLine),
							errhint("Lines in the worker node file must contain a valid "
									"node name and, optionally, a positive port number. "
									"Comments begin with a '#' character and extend to "
									"the end of their line.")));
		}

		/* allocate worker node structure and set fields */
		workerNode = (WorkerNode *) palloc0(sizeof(WorkerNode));

		strlcpy(workerNode->workerName, nodeName, WORKER_LENGTH);
		/* strlcpy(workerNode->workerRack, nodeRack, WORKER_LENGTH); */
		workerNode->workerPort = nodePort;

		workerNodeList = lappend(workerNodeList, workerNode);
	}

	FreeFile(workerFileStream);
	free(workerFilePath);

	return workerNodeList;
}


/*
 * master_get_new_nodeid allocates and returns a unique nodeId for the node
 * to be added. This allocation occurs both in shared memory and in write
 * ahead logs; writing to logs avoids the risk of having nodeId collisions.
 *
 * Please note that the caller is still responsible for finalizing node data
 * and the nodeId with the master node. Further note that this function relies
 * on an internal sequence created in initdb to generate unique identifiers.
 *
 * NB: This can be called by any user; for now we have decided that that's
 * ok. We might want to restrict this to users part of a specific role or such
 * at some later point.
 */
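/*
 * For illustration only, a hypothetical call from SQL:
 *
 *   SELECT master_get_new_nodeid();
 */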
Datum
master_get_new_nodeid(PG_FUNCTION_ARGS)
{
	text *sequenceName = cstring_to_text(NODEID_SEQUENCE_NAME);
	Oid sequenceId = ResolveRelationId(sequenceName);
	Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
	Oid savedUserId = InvalidOid;
	int savedSecurityContext = 0;
	Datum nodeIdDatum = 0;

	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
	SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);

	/* generate new and unique nodeId from sequence */
	nodeIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);

	SetUserIdAndSecContext(savedUserId, savedSecurityContext);

	PG_RETURN_DATUM(nodeIdDatum);
}


/*
 * master_get_next_groupid allocates and returns a unique groupId for the
 * group to be created. This allocation occurs both in shared memory and in
 * write ahead logs; writing to logs avoids the risk of having groupId
 * collisions.
 *
 * Please note that the caller is still responsible for finalizing node data
 * and the groupId with the master node. Further note that this function
 * relies on an internal sequence created in initdb to generate unique
 * identifiers.
 *
 * NB: This can be called by any user; for now we have decided that that's
 * ok. We might want to restrict this to users part of a specific role or such
 * at some later point.
 */
Datum
master_get_next_groupid(PG_FUNCTION_ARGS)
{
	text *sequenceName = cstring_to_text(GROUPID_SEQUENCE_NAME);
	Oid sequenceId = ResolveRelationId(sequenceName);
	Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
	Oid savedUserId = InvalidOid;
	int savedSecurityContext = 0;
	Datum groupIdDatum = 0;

	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
	SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);

	/* generate new and unique groupId from sequence */
	groupIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);

	SetUserIdAndSecContext(savedUserId, savedSecurityContext);

	PG_RETURN_DATUM(groupIdDatum);
}