/*-------------------------------------------------------------------------
*
* master_node_protocol.c
* Routines for requesting information from the master node for creating or
* updating shards.
*
* Copyright (c) 2012-2016, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/

#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "access/htup_details.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_index.h"
#include "catalog/pg_type.h"
#include "commands/sequence.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/listutils.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/worker_manager.h"
#include "foreign/foreign.h"
#include "libpq/ip.h"
#include "libpq/libpq-be.h"
#include "nodes/pg_list.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#if (PG_VERSION_NUM >= 90500)
#include "utils/ruleutils.h"
#endif
#include "utils/syscache.h"
#include "utils/tqual.h"

/* Shard related configuration */
int ShardReplicationFactor = 2; /* desired replication factor for shards */
int ShardMaxSize = 1048576; /* maximum size in KB one shard can grow to */
int ShardPlacementPolicy = SHARD_PLACEMENT_ROUND_ROBIN;

static Datum WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor);

/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_get_table_metadata);
PG_FUNCTION_INFO_V1(master_get_table_ddl_events);
PG_FUNCTION_INFO_V1(master_get_new_shardid);
PG_FUNCTION_INFO_V1(master_get_local_first_candidate_nodes);
PG_FUNCTION_INFO_V1(master_get_round_robin_candidate_nodes);
PG_FUNCTION_INFO_V1(master_get_active_worker_nodes);

/*
* master_get_table_metadata takes in a relation name, and returns partition
* related metadata for the relation. These metadata are grouped and returned in
* a tuple, and are used by the caller when creating new shards. The function
* errors if the given relation does not exist or is not partitioned.
*/
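/*
* Usage sketch (added for illustration; the table name below is hypothetical,
* not taken from this file):
*
*   SELECT * FROM master_get_table_metadata('events');
*/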
Datum
master_get_table_metadata(PG_FUNCTION_ARGS)
{
text *relationName = PG_GETARG_TEXT_P(0);
Oid relationId = ResolveRelationId(relationName);
DistTableCacheEntry *partitionEntry = NULL;
TypeFuncClass resultTypeClass = 0;
Datum partitionKeyExpr = 0;
Datum partitionKey = 0;
Datum metadataDatum = 0;
HeapTuple metadataTuple = NULL;
TupleDesc metadataDescriptor = NULL;
uint64 shardMaxSizeInBytes = 0;
char relationType = 0;
char storageType = 0;
Datum values[TABLE_METADATA_FIELDS];
bool isNulls[TABLE_METADATA_FIELDS];
/* find partition tuple for partitioned relation */
partitionEntry = DistributedTableCacheEntry(relationId);
/* create tuple descriptor for return value */
resultTypeClass = get_call_result_type(fcinfo, NULL, &metadataDescriptor);
if (resultTypeClass != TYPEFUNC_COMPOSITE)
{
ereport(ERROR, (errmsg("return type must be a row type")));
}
/* get decompiled expression tree for partition key */
partitionKeyExpr =
PointerGetDatum(cstring_to_text(partitionEntry->partitionKeyString));
partitionKey = DirectFunctionCall2(pg_get_expr, partitionKeyExpr,
ObjectIdGetDatum(relationId));
/* form heap tuple for table metadata */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
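/* convert the configured maximum shard size from kilobytes to bytes */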
shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;
/* get storage type */
relationType = get_rel_relkind(relationId);
if (relationType == RELKIND_RELATION)
{
storageType = SHARD_STORAGE_TABLE;
}
else if (relationType == RELKIND_FOREIGN_TABLE)
{
bool cstoreTable = CStoreTable(relationId);
if (cstoreTable)
{
storageType = SHARD_STORAGE_COLUMNAR;
}
else
{
storageType = SHARD_STORAGE_FOREIGN;
}
}
values[0] = ObjectIdGetDatum(relationId);
values[1] = CharGetDatum(storageType);
values[2] = CharGetDatum(partitionEntry->partitionMethod);
values[3] = partitionKey;
values[4] = Int32GetDatum(ShardReplicationFactor);
values[5] = Int64GetDatum(shardMaxSizeInBytes);
values[6] = Int32GetDatum(ShardPlacementPolicy);
metadataTuple = heap_form_tuple(metadataDescriptor, values, isNulls);
metadataDatum = HeapTupleGetDatum(metadataTuple);
PG_RETURN_DATUM(metadataDatum);
}

/*
* CStoreTable returns true if the given relationId belongs to a foreign cstore
* table, otherwise it returns false.
*/
bool
CStoreTable(Oid relationId)
{
bool cstoreTable = false;
char relationKind = get_rel_relkind(relationId);
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ForeignTable *foreignTable = GetForeignTable(relationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
if (strncmp(foreignDataWrapper->fdwname, CSTORE_FDW_NAME, NAMEDATALEN) == 0)
{
cstoreTable = true;
}
}
return cstoreTable;
}

/*
* master_get_table_ddl_events takes in a relation name, and returns the set of
* DDL commands needed to reconstruct the relation. The returned DDL commands
* are similar in flavor to schema definitions that pg_dump returns. The function
* errors if the given relation does not exist.
*/
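/*
* Usage sketch (added for illustration; the table name is hypothetical):
*
*   SELECT master_get_table_ddl_events('events');
*/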
Datum
master_get_table_ddl_events(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
ListCell *tableDDLEventCell = NULL;
/*
* On the very first call to this function, we first use the given relation
* name to get to the relation. We then recreate the list of DDL statements
* issued for this relation, and save the first statement's position in the
* function context.
*/
if (SRF_IS_FIRSTCALL())
{
text *relationName = PG_GETARG_TEXT_P(0);
Oid relationId = ResolveRelationId(relationName);
MemoryContext oldContext = NULL;
List *tableDDLEventList = NIL;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
/* allocate DDL statements, and then save position in DDL statements */
tableDDLEventList = GetTableDDLEvents(relationId);
tableDDLEventCell = list_head(tableDDLEventList);
functionContext->user_fctx = tableDDLEventCell;
MemoryContextSwitchTo(oldContext);
}
/*
* On every call to this function, we get the current position in the
* statement list. We then iterate to the next position in the list and
* return the current statement, if we have not yet reached the end of
* list.
*/
functionContext = SRF_PERCALL_SETUP();
tableDDLEventCell = (ListCell *) functionContext->user_fctx;
if (tableDDLEventCell != NULL)
{
char *ddlStatement = (char *) lfirst(tableDDLEventCell);
text *ddlStatementText = cstring_to_text(ddlStatement);
functionContext->user_fctx = lnext(tableDDLEventCell);
SRF_RETURN_NEXT(functionContext, PointerGetDatum(ddlStatementText));
}
else
{
SRF_RETURN_DONE(functionContext);
}
}

/*
* master_get_new_shardid allocates and returns a unique shardId for the shard
* to be created. This allocation occurs both in shared memory and in write
* ahead logs; writing to logs avoids the risk of having shardId collisions.
*
* Please note that the caller is still responsible for finalizing shard data
* and the shardId with the master node. Further note that this function relies
* on an internal sequence created in initdb to generate unique identifiers.
*
* NB: This can be called by any user; for now we have decided that that's
* ok. We might want to restrict this to users part of a specific role or such
* at some later point.
*/
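/*
* Usage sketch (added for illustration; takes no arguments):
*
*   SELECT master_get_new_shardid();
*/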
Datum
master_get_new_shardid(PG_FUNCTION_ARGS)
{
text *sequenceName = cstring_to_text(SHARDID_SEQUENCE_NAME);
Oid sequenceId = ResolveRelationId(sequenceName);
Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
Oid savedUserId = InvalidOid;
int savedSecurityContext = 0;
Datum shardIdDatum = 0;
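/* temporarily run as the extension owner, so that any caller may draw a shardId */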
GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);
/* generate new and unique shardId from sequence */
shardIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);
SetUserIdAndSecContext(savedUserId, savedSecurityContext);
PG_RETURN_DATUM(shardIdDatum);
}

/*
* master_get_local_first_candidate_nodes returns a set of candidate host names
* and port numbers on which to place new shards. The function makes sure to
* always allocate the first candidate node as the node the caller is connecting
* from, and allocates additional nodes until the shard replication factor is
* met. The function errors if the caller's remote node name is not found in the
* membership list, or if the number of available nodes falls short of the
* replication factor.
*/
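/*
* Usage sketch (added for illustration; returns node_name/node_port rows):
*
*   SELECT * FROM master_get_local_first_candidate_nodes();
*/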
Datum
master_get_local_first_candidate_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 desiredNodeCount = 0;
uint32 currentNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
TupleDesc tupleDescriptor = NULL;
uint32 liveNodeCount = 0;
bool hasOid = false;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
functionContext->user_fctx = NIL;
functionContext->max_calls = ShardReplicationFactor;
/* if there are enough live nodes, return an extra candidate node as backup */
liveNodeCount = WorkerGetLiveNodeCount();
if (liveNodeCount > ShardReplicationFactor)
{
functionContext->max_calls = ShardReplicationFactor + 1;
}
/*
* This tuple descriptor must match the output parameters declared for
* the function in pg_proc.
*/
tupleDescriptor = CreateTemplateTupleDesc(CANDIDATE_NODE_FIELDS, hasOid);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
TEXTOID, -1, 0);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
INT8OID, -1, 0);
functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
desiredNodeCount = functionContext->max_calls;
currentNodeCount = functionContext->call_cntr;
if (currentNodeCount < desiredNodeCount)
{
MemoryContext oldContext = NULL;
List *currentNodeList = NIL;
WorkerNode *candidateNode = NULL;
Datum candidateDatum = 0;
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
currentNodeList = functionContext->user_fctx;
candidateNode = WorkerGetLocalFirstCandidateNode(currentNodeList);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u required nodes",
currentNodeCount, desiredNodeCount)));
}
currentNodeList = lappend(currentNodeList, candidateNode);
functionContext->user_fctx = currentNodeList;
MemoryContextSwitchTo(oldContext);
candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, candidateDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}

/*
* master_get_round_robin_candidate_nodes returns a set of candidate host names
* and port numbers on which to place new shards. The function uses the round
* robin policy to choose the nodes and tries to ensure that there is an even
* distribution of shards across the worker nodes. This function errors out if
* the number of available nodes falls short of the replication factor.
*/
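/*
* Usage sketch (added for illustration; the shardId below is hypothetical):
*
*   SELECT * FROM master_get_round_robin_candidate_nodes(102008);
*/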
Datum
master_get_round_robin_candidate_nodes(PG_FUNCTION_ARGS)
{
uint64 shardId = PG_GETARG_INT64(0);
FuncCallContext *functionContext = NULL;
uint32 desiredNodeCount = 0;
uint32 currentNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
TupleDesc tupleDescriptor = NULL;
List *workerNodeList = NIL;
TypeFuncClass resultTypeClass = 0;
uint32 workerNodeCount = 0;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
/* get the worker node list and sort it for determinism */
workerNodeList = WorkerNodeList();
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
functionContext->user_fctx = workerNodeList;
functionContext->max_calls = ShardReplicationFactor;
/* if we have enough live nodes, return an extra candidate node as backup */
workerNodeCount = (uint32) list_length(workerNodeList);
if (workerNodeCount > ShardReplicationFactor)
{
functionContext->max_calls = ShardReplicationFactor + 1;
}
/* create tuple descriptor for return value */
resultTypeClass = get_call_result_type(fcinfo, NULL, &tupleDescriptor);
if (resultTypeClass != TYPEFUNC_COMPOSITE)
{
ereport(ERROR, (errmsg("return type must be a row type")));
}
functionContext->tuple_desc = tupleDescriptor;
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
desiredNodeCount = functionContext->max_calls;
currentNodeCount = functionContext->call_cntr;
if (currentNodeCount < desiredNodeCount)
{
List *workerNodeList = functionContext->user_fctx;
WorkerNode *candidateNode = NULL;
Datum candidateDatum = 0;
candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
currentNodeCount);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u required nodes",
currentNodeCount, desiredNodeCount)));
}
candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, candidateDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}

/*
* master_get_active_worker_nodes returns a set of active worker host names and
* port numbers in deterministic order. Currently we assume that all worker
* nodes in pg_worker_list.conf are active.
*/
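/*
* Usage sketch (added for illustration; returns one row per worker node):
*
*   SELECT * FROM master_get_active_worker_nodes();
*/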
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 workerNodeIndex = 0;
uint32 workerNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
List *workerNodeList = NIL;
uint32 workerNodeCount = 0;
TupleDesc tupleDescriptor = NULL;
bool hasOid = false;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
workerNodeList = WorkerNodeList();
workerNodeCount = (uint32) list_length(workerNodeList);
functionContext->user_fctx = workerNodeList;
functionContext->max_calls = workerNodeCount;
/*
* This tuple descriptor must match the output parameters declared for
* the function in pg_proc.
*/
tupleDescriptor = CreateTemplateTupleDesc(WORKER_NODE_FIELDS, hasOid);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
TEXTOID, -1, 0);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
INT8OID, -1, 0);
functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
workerNodeIndex = functionContext->call_cntr;
workerNodeCount = functionContext->max_calls;
if (workerNodeIndex < workerNodeCount)
{
List *workerNodeList = functionContext->user_fctx;
WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex);
Datum workerNodeDatum = WorkerNodeGetDatum(workerNode,
functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, workerNodeDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}

/* Finds the relationId from a potentially qualified relation name. */
Oid
ResolveRelationId(text *relationName)
{
List *relationNameList = NIL;
RangeVar *relation = NULL;
Oid relationId = InvalidOid;
bool failOK = false; /* error if relation cannot be found */
/* resolve relationId from passed in schema and relation name */
relationNameList = textToQualifiedNameList(relationName);
relation = makeRangeVarFromNameList(relationNameList);
relationId = RangeVarGetRelid(relation, NoLock, failOK);
return relationId;
}

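/*
* For illustration (hypothetical relation names; both calls resolve the same
* relation when it lives in the public schema):
*
*   ResolveRelationId(cstring_to_text("events"));
*   ResolveRelationId(cstring_to_text("public.events"));
*/
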
/*
* GetTableDDLEvents takes in a relationId, and returns the list of DDL commands
* needed to reconstruct the relation. These DDL commands are all palloc'd, and
* include the table's schema definition, optional column storage and statistics
* definitions, and index and constraint definitions.
*/
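/*
* A caller might consume the result like this (sketch for illustration; the
* local variable names are hypothetical):
*
*   List *ddlEventList = GetTableDDLEvents(relationId);
*   ListCell *ddlEventCell = NULL;
*
*   foreach(ddlEventCell, ddlEventList)
*   {
*       char *ddlCommand = (char *) lfirst(ddlEventCell);
*       ...
*   }
*/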
List *
GetTableDDLEvents(Oid relationId)
{
List *tableDDLEventList = NIL;
char tableType = 0;
char *tableSchemaDef = NULL;
char *tableColumnOptionsDef = NULL;
char *schemaName = NULL;
Oid schemaId = InvalidOid;
Relation pgIndex = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
/* if foreign table, fetch extension and server definitions */
tableType = get_rel_relkind(relationId);
if (tableType == RELKIND_FOREIGN_TABLE)
{
char *extensionDef = pg_get_extensiondef_string(relationId);
char *serverDef = pg_get_serverdef_string(relationId);
if (extensionDef != NULL)
{
tableDDLEventList = lappend(tableDDLEventList, extensionDef);
}
tableDDLEventList = lappend(tableDDLEventList, serverDef);
}
/* create schema if the table is not in the default namespace (public) */
schemaId = get_rel_namespace(relationId);
schemaName = get_namespace_name(schemaId);
if (strncmp(schemaName, "public", NAMEDATALEN) != 0)
{
StringInfo schemaNameDef = makeStringInfo();
appendStringInfo(schemaNameDef, CREATE_SCHEMA_COMMAND, schemaName);
tableDDLEventList = lappend(tableDDLEventList, schemaNameDef->data);
}
/* fetch table schema and column option definitions */
tableSchemaDef = pg_get_tableschemadef_string(relationId);
tableColumnOptionsDef = pg_get_tablecolumnoptionsdef_string(relationId);
tableDDLEventList = lappend(tableDDLEventList, tableSchemaDef);
if (tableColumnOptionsDef != NULL)
{
tableDDLEventList = lappend(tableDDLEventList, tableColumnOptionsDef);
}
/* open system catalog and scan all indexes that belong to this table */
pgIndex = heap_open(IndexRelationId, AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_index_indrelid,
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relationId));
scanDescriptor = systable_beginscan(pgIndex,
IndexIndrelidIndexId, true, /* indexOK */
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_index indexForm = (Form_pg_index) GETSTRUCT(heapTuple);
Oid indexId = indexForm->indexrelid;
bool isConstraint = false;
char *statementDef = NULL;
/*
* A primary key index is always created by a constraint statement.
* A unique key index is created by a constraint if and only if the
* index has a corresponding constraint entry in pg_depend. Any other
* index form is never associated with a constraint.
*/
if (indexForm->indisprimary)
{
isConstraint = true;
}
else if (indexForm->indisunique)
{
Oid constraintId = get_index_constraint(indexId);
isConstraint = OidIsValid(constraintId);
}
else
{
isConstraint = false;
}
/* get the corresponding constraint or index statement */
if (isConstraint)
{
Oid constraintId = get_index_constraint(indexId);
Assert(constraintId != InvalidOid);
#if (PG_VERSION_NUM >= 90500)
statementDef = pg_get_constraintdef_command(constraintId);
#else
statementDef = pg_get_constraintdef_string(constraintId);
#endif
}
else
{
statementDef = pg_get_indexdef_string(indexId);
}
/* append found constraint or index definition to the list */
tableDDLEventList = lappend(tableDDLEventList, statementDef);
/* if table is clustered on this index, append definition to the list */
if (indexForm->indisclustered)
{
char *clusteredDef = pg_get_indexclusterdef_string(indexId);
Assert(clusteredDef != NULL);
tableDDLEventList = lappend(tableDDLEventList, clusteredDef);
}
heapTuple = systable_getnext(scanDescriptor);
}
/* clean up scan and close system catalog */
systable_endscan(scanDescriptor);
heap_close(pgIndex, AccessShareLock);
return tableDDLEventList;
}

/*
* WorkerNodeGetDatum converts the worker node passed to it into its datum
* representation. To do this, the function first creates the heap tuple from
* the worker node name and port. Then, the function converts the heap tuple
* into a datum and returns it.
*/
static Datum
WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor)
{
Datum values[WORKER_NODE_FIELDS];
bool isNulls[WORKER_NODE_FIELDS];
HeapTuple workerNodeTuple = NULL;
Datum workerNodeDatum = 0;
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[0] = CStringGetTextDatum(workerNode->workerName);
values[1] = Int64GetDatum((int64) workerNode->workerPort);
workerNodeTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
workerNodeDatum = HeapTupleGetDatum(workerNodeTuple);
return workerNodeDatum;
}